From 527a069ba9eff6717ccd4bb1e839674edebffceb Mon Sep 17 00:00:00 2001
From: Mathieu Virbel <mat@meltingrocks.com>
Date: Fri, 20 Feb 2026 12:07:34 -0600
Subject: [PATCH] fix: remove max_tokens cap to support thinking models
 (Kimi-K2.5) (#869)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix: remove max_tokens cap to support thinking models (Kimi-K2.5)

Thinking/reasoning models like Kimi-K2.5 spend output tokens on internal
chain-of-thought before generating the visible response. When max_tokens
was capped (an explicit 500 or the 2048 default), the reasoning consumed
the whole budget, leaving an empty response — causing TreeSummarize to
return '' and crashing the topic detection retry workflow.

Set max_tokens default to None so the model controls its own output
budget, allowing thinking models to complete both reasoning and response.

Also fix the process.py CLI tool to import the Celery worker app before
dispatching tasks, ensuring the Redis broker config is used instead of
Celery's default AMQP transport.

* fix: remove max_tokens=200 cap from final title processor

Same thinking model issue — 200 tokens is especially tight and would be
entirely consumed by chain-of-thought reasoning, producing an empty title.

* Update server/reflector/tools/process.py

Co-authored-by: pr-agent-monadical[bot] <198624643+pr-agent-monadical[bot]@users.noreply.github.com>

* fix: remove max_tokens=500 cap from topic detector processor

Same thinking model fix — this is the original callsite that was failing
with Kimi-K2.5, producing empty TreeSummarize responses.

---------

Co-authored-by: pr-agent-monadical[bot] <198624643+pr-agent-monadical[bot]@users.noreply.github.com>
---
 server/reflector/hatchet/workflows/topic_chunk_processing.py | 2 +-
 server/reflector/llm.py                                      | 4 +++-
 server/reflector/processors/transcript_final_title.py        | 2 +-
 server/reflector/processors/transcript_topic_detector.py     | 2 +-
 server/reflector/tools/process.py                            | 3 +++
 5 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/server/reflector/hatchet/workflows/topic_chunk_processing.py b/server/reflector/hatchet/workflows/topic_chunk_processing.py
index b545b082..82b68569 100644
--- a/server/reflector/hatchet/workflows/topic_chunk_processing.py
+++ b/server/reflector/hatchet/workflows/topic_chunk_processing.py
@@ -71,7 +71,7 @@ async def detect_chunk_topic(input: TopicChunkInput, ctx: Context) -> TopicChunk
     from reflector.settings import settings  # noqa: PLC0415
     from reflector.utils.text import clean_title  # noqa: PLC0415
 
-    llm = LLM(settings=settings, temperature=0.9, max_tokens=500)
+    llm = LLM(settings=settings, temperature=0.9)
 
     prompt = TOPIC_PROMPT.format(text=input.chunk_text)
     response = await llm.get_structured_response(
diff --git a/server/reflector/llm.py b/server/reflector/llm.py
index 4723b8be..8b6f8524 100644
--- a/server/reflector/llm.py
+++ b/server/reflector/llm.py
@@ -202,7 +202,9 @@ class StructuredOutputWorkflow(Workflow, Generic[OutputT]):
 
 
 class LLM:
-    def __init__(self, settings, temperature: float = 0.4, max_tokens: int = 2048):
+    def __init__(
+        self, settings, temperature: float = 0.4, max_tokens: int | None = None
+    ):
         self.settings_obj = settings
         self.model_name = settings.LLM_MODEL
         self.url = settings.LLM_URL
diff --git a/server/reflector/processors/transcript_final_title.py b/server/reflector/processors/transcript_final_title.py
index 75b62b5a..66362b78 100644
--- a/server/reflector/processors/transcript_final_title.py
+++ b/server/reflector/processors/transcript_final_title.py
@@ -39,7 +39,7 @@ class TranscriptFinalTitleProcessor(Processor):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self.chunks: list[TitleSummary] = []
-        self.llm = LLM(settings=settings, temperature=0.5, max_tokens=200)
+        self.llm = LLM(settings=settings, temperature=0.5)
 
     async def _push(self, data: TitleSummary):
         self.chunks.append(data)
diff --git a/server/reflector/processors/transcript_topic_detector.py b/server/reflector/processors/transcript_topic_detector.py
index 5e67e39f..e775e073 100644
--- a/server/reflector/processors/transcript_topic_detector.py
+++ b/server/reflector/processors/transcript_topic_detector.py
@@ -35,7 +35,7 @@ class TranscriptTopicDetectorProcessor(Processor):
         super().__init__(**kwargs)
         self.transcript = None
         self.min_transcript_length = min_transcript_length
-        self.llm = LLM(settings=settings, temperature=0.9, max_tokens=500)
+        self.llm = LLM(settings=settings, temperature=0.9)
 
     async def _push(self, data: Transcript):
         if self.transcript is None:
diff --git a/server/reflector/tools/process.py b/server/reflector/tools/process.py
index a3a74138..7dff46b6 100644
--- a/server/reflector/tools/process.py
+++ b/server/reflector/tools/process.py
@@ -24,6 +24,9 @@ from reflector.pipelines.main_live_pipeline import (
     pipeline_process as live_pipeline_process,
 )
 from reflector.storage import Storage
+from reflector.worker.app import (
+    app as celery_app,  # noqa: F401 - ensure Celery uses Redis broker
+)
 
 
 def validate_s3_bucket_name(bucket: str) -> None: