fix(core): ensure compaction is more reliable, add reserve token buffer to ensure that input window has enough room to compact (#12924)

Co-authored-by: James Lal <james@littlebearlabs.io>
2026-02-10 19:55:22 -06:00
parent 60bdb6e9ba
commit 0fd6f365be
16 changed files with 262 additions and 189 deletions
--- a/packages/opencode/test/provider/transform.test.ts
+++ b/packages/opencode/test/provider/transform.test.ts
@@ -175,100 +175,6 @@ describe("ProviderTransform.options - gpt-5 textVerbosity", () => {
  })
 })

-describe("ProviderTransform.maxOutputTokens", () => {
-  test("returns 32k when modelLimit > 32k", () => {
-    const modelLimit = 100000
-    const result = ProviderTransform.maxOutputTokens("@ai-sdk/openai", {}, modelLimit, OUTPUT_TOKEN_MAX)
-    expect(result).toBe(OUTPUT_TOKEN_MAX)
-  })
-
-  test("returns modelLimit when modelLimit < 32k", () => {
-    const modelLimit = 16000
-    const result = ProviderTransform.maxOutputTokens("@ai-sdk/openai", {}, modelLimit, OUTPUT_TOKEN_MAX)
-    expect(result).toBe(16000)
-  })
-
-  describe("azure", () => {
-    test("returns 32k when modelLimit > 32k", () => {
-      const modelLimit = 100000
-      const result = ProviderTransform.maxOutputTokens("@ai-sdk/azure", {}, modelLimit, OUTPUT_TOKEN_MAX)
-      expect(result).toBe(OUTPUT_TOKEN_MAX)
-    })
-
-    test("returns modelLimit when modelLimit < 32k", () => {
-      const modelLimit = 16000
-      const result = ProviderTransform.maxOutputTokens("@ai-sdk/azure", {}, modelLimit, OUTPUT_TOKEN_MAX)
-      expect(result).toBe(16000)
-    })
-  })
-
-  describe("bedrock", () => {
-    test("returns 32k when modelLimit > 32k", () => {
-      const modelLimit = 100000
-      const result = ProviderTransform.maxOutputTokens("@ai-sdk/amazon-bedrock", {}, modelLimit, OUTPUT_TOKEN_MAX)
-      expect(result).toBe(OUTPUT_TOKEN_MAX)
-    })
-
-    test("returns modelLimit when modelLimit < 32k", () => {
-      const modelLimit = 16000
-      const result = ProviderTransform.maxOutputTokens("@ai-sdk/amazon-bedrock", {}, modelLimit, OUTPUT_TOKEN_MAX)
-      expect(result).toBe(16000)
-    })
-  })
-
-  describe("anthropic without thinking options", () => {
-    test("returns 32k when modelLimit > 32k", () => {
-      const modelLimit = 100000
-      const result = ProviderTransform.maxOutputTokens("@ai-sdk/anthropic", {}, modelLimit, OUTPUT_TOKEN_MAX)
-      expect(result).toBe(OUTPUT_TOKEN_MAX)
-    })
-
-    test("returns modelLimit when modelLimit < 32k", () => {
-      const modelLimit = 16000
-      const result = ProviderTransform.maxOutputTokens("@ai-sdk/anthropic", {}, modelLimit, OUTPUT_TOKEN_MAX)
-      expect(result).toBe(16000)
-    })
-  })
-
-  describe("anthropic with thinking options", () => {
-    test("returns 32k when budgetTokens + 32k <= modelLimit", () => {
-      const modelLimit = 100000
-      const options = {
-        thinking: {
-          type: "enabled",
-          budgetTokens: 10000,
-        },
-      }
-      const result = ProviderTransform.maxOutputTokens("@ai-sdk/anthropic", options, modelLimit, OUTPUT_TOKEN_MAX)
-      expect(result).toBe(OUTPUT_TOKEN_MAX)
-    })
-
-    test("returns modelLimit - budgetTokens when budgetTokens + 32k > modelLimit", () => {
-      const modelLimit = 50000
-      const options = {
-        thinking: {
-          type: "enabled",
-          budgetTokens: 30000,
-        },
-      }
-      const result = ProviderTransform.maxOutputTokens("@ai-sdk/anthropic", options, modelLimit, OUTPUT_TOKEN_MAX)
-      expect(result).toBe(20000)
-    })
-
-    test("returns 32k when thinking type is not enabled", () => {
-      const modelLimit = 100000
-      const options = {
-        thinking: {
-          type: "disabled",
-          budgetTokens: 10000,
-        },
-      }
-      const result = ProviderTransform.maxOutputTokens("@ai-sdk/anthropic", options, modelLimit, OUTPUT_TOKEN_MAX)
-      expect(result).toBe(OUTPUT_TOKEN_MAX)
-    })
-  })
-})
-
 describe("ProviderTransform.schema - gemini array items", () => {
  test("adds missing items for array properties", () => {
    const geminiModel = {
--- a/packages/opencode/test/session/compaction.test.ts
+++ b/packages/opencode/test/session/compaction.test.ts
@@ -15,6 +15,7 @@ function createModel(opts: {
  output: number
  input?: number
  cost?: Provider.Model["cost"]
+  npm?: string
 }): Provider.Model {
  return {
    id: "test-model",
@@ -34,7 +35,7 @@ function createModel(opts: {
      input: { text: true, image: false, audio: false, video: false },
      output: { text: true, image: false, audio: false, video: false },
    },
-    api: { npm: "@ai-sdk/anthropic" },
+    api: { npm: opts.npm ?? "@ai-sdk/anthropic" },
    options: {},
  } as Provider.Model
 }
@@ -70,7 +71,7 @@ describe("session.compaction.isOverflow", () => {
      directory: tmp.path,
      fn: async () => {
        const model = createModel({ context: 100_000, output: 32_000 })
-        const tokens = { input: 50_000, output: 10_000, reasoning: 0, cache: { read: 10_000, write: 0 } }
+        const tokens = { input: 60_000, output: 10_000, reasoning: 0, cache: { read: 10_000, write: 0 } }
        expect(await SessionCompaction.isOverflow({ tokens, model })).toBe(true)
      },
    })
@@ -112,6 +113,86 @@ describe("session.compaction.isOverflow", () => {
    })
  })

+  // ─── Bug reproduction tests ───────────────────────────────────────────
+  // These tests reproduce a bug: when limit.input was set, isOverflow()
+  // did not subtract any headroom for the next model response, so compaction
+  // only triggered AFTER the full input budget had been consumed, leaving
+  // zero room for the next API call's output tokens.
+  //
+  // Compare: without limit.input, usable = context - output (reserves space).
+  // With limit.input, usable was limit.input (reserving nothing). The
+  // assertions below state the desired behavior, i.e. they pass once fixed.
+  // Related issues: #10634, #8089, #11086, #12621
+  // Open PRs: #6875, #12924
+
+  test("BUG: no headroom when limit.input is set — compaction should trigger near boundary but does not", async () => {
+    await using tmp = await tmpdir()
+    await Instance.provide({
+      directory: tmp.path,
+      fn: async () => {
+        // Simulate Claude with prompt caching: context = input limit = 200K, output limit = 32K
+        const model = createModel({ context: 200_000, input: 200_000, output: 32_000 })
+
+        // We've used 198K tokens total. Only 2K under the input limit.
+        // On the next turn, the full conversation (198K) becomes input,
+        // plus the model needs room to generate output — this WILL overflow.
+        const tokens = { input: 180_000, output: 15_000, reasoning: 0, cache: { read: 3_000, write: 0 } }
+        // count = input + cache.read + output = 180K + 3K + 15K = 198K
+        // Buggy check: usable = limit.input = 200K (no output subtracted!)
+        //   198K > 200K = false → no compaction triggered
+
+        // For comparison, WITHOUT limit.input: usable = 200K - 32K = 168K, and 198K > 168K = true ✓
+        // whereas WITH limit.input (buggy path): usable = 200K, and 198K > 200K = false ✗
+
+        // With 198K used and only 2K headroom, the next turn will overflow,
+        // so this assertion requires compaction to trigger here.
+        expect(await SessionCompaction.isOverflow({ tokens, model })).toBe(true)
+      },
+    })
+  })
+
+  test("BUG: without limit.input, same token count correctly triggers compaction", async () => {
+    await using tmp = await tmpdir()
+    await Instance.provide({
+      directory: tmp.path,
+      fn: async () => {
+        // Control case: same context/output limits, but limit.input is left unset — uses context - output instead
+        const model = createModel({ context: 200_000, output: 32_000 })
+
+        // Same token usage as the limit.input reproduction test (180K input, 15K output, 3K cache read)
+        const tokens = { input: 180_000, output: 15_000, reasoning: 0, cache: { read: 3_000, write: 0 } }
+        // count = 180K + 3K + 15K = 198K
+        // usable = context - output = 200K - 32K = 168K
+        // 198K > 168K = true → compaction correctly triggered
+
+        const result = await SessionCompaction.isOverflow({ tokens, model })
+        expect(result).toBe(true) // ← Correct: headroom for the output is reserved
+      },
+    })
+  })
+
+  test("BUG: asymmetry — limit.input model allows 30K more usage before compaction than equivalent model without it", async () => {
+    await using tmp = await tmpdir()
+    await Instance.provide({
+      directory: tmp.path,
+      fn: async () => {
+        // Two models with identical context/output limits, differing only in limit.input
+        const withInputLimit = createModel({ context: 200_000, input: 200_000, output: 32_000 })
+        const withoutInputLimit = createModel({ context: 200_000, output: 32_000 })
+
+        // 181K total tokens (166K + 5K + 10K) — well above context-output (168K) but below input limit (200K)
+        const tokens = { input: 166_000, output: 10_000, reasoning: 0, cache: { read: 5_000, write: 0 } }
+
+        const withLimit = await SessionCompaction.isOverflow({ tokens, model: withInputLimit })
+        const withoutLimit = await SessionCompaction.isOverflow({ tokens, model: withoutInputLimit })
+
+        // Both models have identical real capacity — they should agree:
+        expect(withLimit).toBe(true) // should compact (181K leaves no room for 32K output)
+        expect(withoutLimit).toBe(true) // correctly compacts (181K > 168K)
+      },
+    })
+  })
+
  test("returns false when model context limit is 0", async () => {
    await using tmp = await tmpdir()
    await Instance.provide({
@@ -290,4 +371,53 @@ describe("session.getUsage", () => {

    expect(result.cost).toBe(3 + 1.5)
  })
+
+  test.each(["@ai-sdk/anthropic", "@ai-sdk/amazon-bedrock", "@ai-sdk/google-vertex/anthropic"])(
+    "computes total from components for %s models",
+    (npm) => {
+      const model = createModel({ context: 100_000, output: 32_000, npm })
+      const usage = {
+        inputTokens: 1000,
+        outputTokens: 500,
+        // These providers typically report total as input + output only,
+        // excluding cache read/write — hence 1500 here versus the 2000 asserted below.
+        totalTokens: 1500,
+        cachedInputTokens: 200,
+      }
+      if (npm === "@ai-sdk/amazon-bedrock") { // Bedrock case: cache-write count is supplied under metadata.bedrock.usage
+        const result = Session.getUsage({
+          model,
+          usage,
+          metadata: {
+            bedrock: {
+              usage: {
+                cacheWriteInputTokens: 300,
+              },
+            },
+          },
+        })
+
+        expect(result.tokens.input).toBe(1000)
+        expect(result.tokens.cache.read).toBe(200)
+        expect(result.tokens.cache.write).toBe(300)
+        expect(result.tokens.total).toBe(2000) // 1000 input + 500 output + 200 cache read + 300 cache write
+        return
+      }
+
+      const result = Session.getUsage({
+        model,
+        usage,
+        metadata: {
+          anthropic: { // remaining cases: cache-write count is supplied as metadata.anthropic.cacheCreationInputTokens
+            cacheCreationInputTokens: 300,
+          },
+        },
+      })
+
+      expect(result.tokens.input).toBe(1000)
+      expect(result.tokens.cache.read).toBe(200)
+      expect(result.tokens.cache.write).toBe(300)
+      expect(result.tokens.total).toBe(2000) // 1000 input + 500 output + 200 cache read + 300 cache write
+    },
+  )
 })
--- a/packages/opencode/test/session/llm.test.ts
+++ b/packages/opencode/test/session/llm.test.ts
@@ -314,12 +314,7 @@ describe("session.llm.stream", () => {
        expect(body.stream).toBe(true)

        const maxTokens = (body.max_tokens as number | undefined) ?? (body.max_output_tokens as number | undefined)
-        const expectedMaxTokens = ProviderTransform.maxOutputTokens(
-          resolved.api.npm,
-          ProviderTransform.options({ model: resolved, sessionID }),
-          resolved.limit.output,
-          LLM.OUTPUT_TOKEN_MAX,
-        )
+        const expectedMaxTokens = ProviderTransform.maxOutputTokens(resolved)
        expect(maxTokens).toBe(expectedMaxTokens)

        const reasoning = (body.reasoningEffort as string | undefined) ?? (body.reasoning_effort as string | undefined)
@@ -442,12 +437,7 @@ describe("session.llm.stream", () => {
        expect((body.reasoning as { effort?: string } | undefined)?.effort).toBe("high")

        const maxTokens = body.max_output_tokens as number | undefined
-        const expectedMaxTokens = ProviderTransform.maxOutputTokens(
-          resolved.api.npm,
-          ProviderTransform.options({ model: resolved, sessionID }),
-          resolved.limit.output,
-          LLM.OUTPUT_TOKEN_MAX,
-        )
+        const expectedMaxTokens = ProviderTransform.maxOutputTokens(resolved)
        expect(maxTokens).toBe(expectedMaxTokens)
      },
    })
@@ -565,14 +555,7 @@ describe("session.llm.stream", () => {

        expect(capture.url.pathname.endsWith("/messages")).toBe(true)
        expect(body.model).toBe(resolved.api.id)
-        expect(body.max_tokens).toBe(
-          ProviderTransform.maxOutputTokens(
-            resolved.api.npm,
-            ProviderTransform.options({ model: resolved, sessionID }),
-            resolved.limit.output,
-            LLM.OUTPUT_TOKEN_MAX,
-          ),
-        )
+        expect(body.max_tokens).toBe(ProviderTransform.maxOutputTokens(resolved))
        expect(body.temperature).toBe(0.4)
        expect(body.top_p).toBe(0.9)
      },
@@ -677,14 +660,7 @@ describe("session.llm.stream", () => {
        expect(capture.url.pathname).toBe(pathSuffix)
        expect(config?.temperature).toBe(0.3)
        expect(config?.topP).toBe(0.8)
-        expect(config?.maxOutputTokens).toBe(
-          ProviderTransform.maxOutputTokens(
-            resolved.api.npm,
-            ProviderTransform.options({ model: resolved, sessionID }),
-            resolved.limit.output,
-            LLM.OUTPUT_TOKEN_MAX,
-          ),
-        )
+        expect(config?.maxOutputTokens).toBe(ProviderTransform.maxOutputTokens(resolved))
      },
    })
  })
--- a/packages/opencode/test/session/retry.test.ts
+++ b/packages/opencode/test/session/retry.test.ts
@@ -112,6 +112,15 @@ describe("session.retry.retryable", () => {
    const error = wrap("not-json")
    expect(SessionRetry.retryable(error)).toBeUndefined()
  })
+
+  test("does not retry context overflow errors", () => {
+    const error = new MessageV2.ContextOverflowError({
+      message: "Input exceeds context window of this model",
+      responseBody: '{"error":{"code":"context_length_exceeded"}}',
+    }).toObject() as ReturnType<NamedError["toObject"]> // plain-object form of the error is what gets passed to retryable()
+
+    expect(SessionRetry.retryable(error)).toBeUndefined() // an undefined result is what this test treats as "not retried"
+  })
 })

 describe("session.message-v2.fromError", () => {