Mirror of https://github.com/Monadical-SAS/reflector.git (synced 2026-03-21 22:56:47 +00:00)
fix: remove max_tokens cap to support thinking models (Kimi-K2.5) (#869)
* fix: remove max_tokens cap to support thinking models (Kimi-K2.5)

  Thinking/reasoning models like Kimi-K2.5 use output tokens for internal chain-of-thought before generating the visible response. When max_tokens was set (500 or 2048), the thinking budget consumed all available tokens, leaving an empty response; TreeSummarize then returned '' and crashed the topic detection retry workflow. Set the max_tokens default to None so the model controls its own output budget, allowing thinking models to complete both the reasoning and the response.

  Also fix the process.py CLI tool to import the Celery worker app before dispatching tasks, ensuring the Redis broker config is used instead of Celery's default AMQP transport.

* fix: remove max_tokens=200 cap from final title processor

  Same thinking-model issue: 200 tokens is especially tight and would be entirely consumed by chain-of-thought reasoning, producing an empty title.

* Update server/reflector/tools/process.py

  Co-authored-by: pr-agent-monadical[bot] <198624643+pr-agent-monadical[bot]@users.noreply.github.com>

* fix: remove max_tokens=500 cap from topic detector processor

  Same thinking-model fix: this is the original callsite that was failing with Kimi-K2.5, producing empty TreeSummarize responses.

---------

Co-authored-by: pr-agent-monadical[bot] <198624643+pr-agent-monadical[bot]@users.noreply.github.com>
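To make the failure mode concrete, here is a minimal sketch against an assumed OpenAI-compatible endpoint; the base_url, model id, and prompt are illustrative, not Reflector's actual configuration. With a tight cap, a thinking model spends the budget on hidden reasoning and returns an empty visible message.

# Minimal sketch, assuming an OpenAI-compatible server; base_url, model id,
# and prompt are illustrative. Reasoning models count chain-of-thought
# tokens against max_tokens, so a tight cap can be exhausted before any
# visible text is emitted.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")

capped = client.chat.completions.create(
    model="kimi-k2.5",  # hypothetical model id
    messages=[{"role": "user", "content": "Summarize the meeting topics."}],
    max_tokens=500,  # budget consumed by hidden reasoning
)
print(capped.choices[0].finish_reason)          # "length": cut off mid-reasoning
print(repr(capped.choices[0].message.content))  # often "", the bug fixed here

uncapped = client.chat.completions.create(
    model="kimi-k2.5",
    messages=[{"role": "user", "content": "Summarize the meeting topics."}],
    # no max_tokens: the model controls its own output budget
)
print(repr(uncapped.choices[0].message.content))  # full visible answer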
@@ -71,7 +71,7 @@ async def detect_chunk_topic(input: TopicChunkInput, ctx: Context) -> TopicChunk
     from reflector.settings import settings  # noqa: PLC0415
     from reflector.utils.text import clean_title  # noqa: PLC0415
 
-    llm = LLM(settings=settings, temperature=0.9, max_tokens=500)
+    llm = LLM(settings=settings, temperature=0.9)
 
     prompt = TOPIC_PROMPT.format(text=input.chunk_text)
     response = await llm.get_structured_response(
@@ -202,7 +202,9 @@ class StructuredOutputWorkflow(Workflow, Generic[OutputT]):
 
 
 class LLM:
-    def __init__(self, settings, temperature: float = 0.4, max_tokens: int = 2048):
+    def __init__(
+        self, settings, temperature: float = 0.4, max_tokens: int | None = None
+    ):
         self.settings_obj = settings
         self.model_name = settings.LLM_MODEL
         self.url = settings.LLM_URL
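The diff does not show how the constructor's max_tokens reaches the request. A plausible forwarding pattern, assumed here rather than taken from Reflector's code (and assuming __init__ also stores temperature and max_tokens on self), is to include the cap only when it is set:

# Sketch only: an assumed forwarding pattern, not Reflector's actual code.
def _completion_kwargs(self) -> dict:
    kwargs = {"model": self.model_name, "temperature": self.temperature}
    if self.max_tokens is not None:
        # Cap output only when explicitly requested; with the new default
        # of None, thinking models keep their full reasoning budget.
        kwargs["max_tokens"] = self.max_tokens
    return kwargs

Widening the annotation to int | None keeps explicit caps available for callers that genuinely need them, while the default no longer starves reasoning output.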
@@ -39,7 +39,7 @@ class TranscriptFinalTitleProcessor(Processor):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self.chunks: list[TitleSummary] = []
-        self.llm = LLM(settings=settings, temperature=0.5, max_tokens=200)
+        self.llm = LLM(settings=settings, temperature=0.5)
 
     async def _push(self, data: TitleSummary):
         self.chunks.append(data)
@@ -35,7 +35,7 @@ class TranscriptTopicDetectorProcessor(Processor):
         super().__init__(**kwargs)
         self.transcript = None
         self.min_transcript_length = min_transcript_length
-        self.llm = LLM(settings=settings, temperature=0.9, max_tokens=500)
+        self.llm = LLM(settings=settings, temperature=0.9)
 
     async def _push(self, data: Transcript):
         if self.transcript is None:
@@ -24,6 +24,9 @@ from reflector.pipelines.main_live_pipeline import (
     pipeline_process as live_pipeline_process,
 )
 from reflector.storage import Storage
+from reflector.worker.app import (
+    app as celery_app,  # noqa: F401 - ensure Celery uses Redis broker
+)
 
 
 def validate_s3_bucket_name(bucket: str) -> None:
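Why the side-effect import fixes dispatch: Celery resolves the broker from the current app at send time, and the first app constructed in a process becomes current. If the CLI never imports reflector.worker.app, tasks dispatched via .delay() fall back to the default app's AMQP transport. A minimal sketch of the mechanism, with the broker URL and task name assumed for illustration:

# Sketch of the mechanism; the Redis URL and task name are illustrative.
# reflector/worker/app.py presumably constructs the configured app:
from celery import Celery

app = Celery("reflector", broker="redis://localhost:6379/0")
# Constructing the app registers it as the "current" app, so tasks
# dispatched afterwards in this process route through this broker.

# In the CLI, the import alone is enough:
# from reflector.worker.app import app as celery_app  # noqa: F401
# process_transcript.delay(...)  # hypothetical task, now sent via Redis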