fix: improve hatchet workflow reliability (#900)

* Increase max connections

* Classify hard and transient hatchet errors

* Fan out partial success

* Force reprocessing of error transcripts

* Stop retrying on 402 payment required

* Avoid httpx/hatchet timeout race

* Add retry wrapper to get_response for transient errors

* Add retry backoff

* Return falsy results so get_response won't retry on empty string

* Skip error status in on_workflow_failure when transcript already ended

* Fix precommit issues

* Fail step on first fan-out failure instead of skipping
This commit is contained in:
Sergey Mankovsky
2026-03-06 17:07:26 +01:00
committed by GitHub
parent a682846645
commit c155f66982
17 changed files with 717 additions and 38 deletions

View File

@@ -137,6 +137,7 @@ services:
postgres: postgres:
image: postgres:17-alpine image: postgres:17-alpine
restart: unless-stopped restart: unless-stopped
command: ["postgres", "-c", "max_connections=200"]
environment: environment:
POSTGRES_USER: reflector POSTGRES_USER: reflector
POSTGRES_PASSWORD: reflector POSTGRES_PASSWORD: reflector

View File

@@ -39,5 +39,12 @@ TIMEOUT_MEDIUM = (
300 # Single LLM calls, waveform generation (5m for slow LLM responses) 300 # Single LLM calls, waveform generation (5m for slow LLM responses)
) )
TIMEOUT_LONG = 180 # Action items (larger context LLM) TIMEOUT_LONG = 180 # Action items (larger context LLM)
TIMEOUT_AUDIO = 720 # Audio processing: padding, mixdown TIMEOUT_TITLE = 300 # generate_title (single LLM call; doc: reduce from 600s)
TIMEOUT_HEAVY = 600 # Transcription, fan-out LLM tasks TIMEOUT_AUDIO = 720 # Audio processing: padding, mixdown (Hatchet execution_timeout)
TIMEOUT_AUDIO_HTTP = (
660 # httpx timeout for pad_track — below 720 so Hatchet doesn't race
)
TIMEOUT_HEAVY = 600 # Transcription, fan-out LLM tasks (Hatchet execution_timeout)
TIMEOUT_HEAVY_HTTP = (
540 # httpx timeout for transcribe_track — below 600 so Hatchet doesn't race
)

View File

@@ -0,0 +1,74 @@
"""Classify exceptions as non-retryable for Hatchet workflows.
When a task raises NonRetryableException (or an exception classified as
non-retryable and re-raised as such), Hatchet stops immediately — no further
retries. Used by with_error_handling to avoid wasting retries on config errors,
auth failures, corrupt data, etc.
"""
# Optional dependencies: only classify if the exception type is available.
# This avoids hard dependency on openai/av/botocore for code paths that don't use them.
try:
import openai
except ImportError:
openai = None # type: ignore[assignment]
try:
import av
except ImportError:
av = None # type: ignore[assignment]
try:
from botocore.exceptions import ClientError as BotoClientError
except ImportError:
BotoClientError = None # type: ignore[misc, assignment]
from hatchet_sdk import NonRetryableException
from httpx import HTTPStatusError
from reflector.llm import LLMParseError
# HTTP status codes that won't change on retry (auth, not found, payment, payload)
NON_RETRYABLE_HTTP_STATUSES = {401, 402, 403, 404, 413}
# botocore "Error.Code" values that indicate a permanent permission or
# existence problem rather than a transient S3 outage.
NON_RETRYABLE_S3_CODES = {"AccessDenied", "NoSuchBucket", "NoSuchKey"}
def is_non_retryable(e: BaseException) -> bool:
    """Decide whether Hatchet should stop retrying after this exception.

    Returns True for hard failures (config errors, auth failures, missing
    resources, corrupt data) where a retry cannot succeed. Returns False
    for transient conditions (timeouts, 5xx, 429, connection errors),
    which remain retryable.
    """
    # Already classified upstream, or a plain config/input error.
    if isinstance(e, NonRetryableException):
        return True
    if isinstance(e, (ValueError, TypeError)):
        return True

    # HTTP: only a fixed set of status codes counts as permanent.
    if isinstance(e, HTTPStatusError):
        return e.response.status_code in NON_RETRYABLE_HTTP_STATUSES

    # OpenAI credentials are wrong — retrying cannot help.
    if openai is not None and isinstance(e, openai.AuthenticationError):
        return True

    # The LLM layer already retried parsing internally; give up here.
    if isinstance(e, LLMParseError):
        return True

    # S3: permission/existence problems are permanent.
    if BotoClientError is not None and isinstance(e, BotoClientError):
        error_code = e.response.get("Error", {}).get("Code", "")
        return error_code in NON_RETRYABLE_S3_CODES

    # Corrupt audio. The PyAV exception class moved between versions, so
    # probe av.AVError first and fall back to av.error.InvalidDataError.
    if av is not None:
        corrupt_audio_exc = getattr(av, "AVError", None) or getattr(
            getattr(av, "error", None), "InvalidDataError", None
        )
        if corrupt_audio_exc is not None and isinstance(e, corrupt_audio_exc):
            return True

    return False

View File

@@ -27,6 +27,7 @@ from hatchet_sdk import (
ConcurrencyExpression, ConcurrencyExpression,
ConcurrencyLimitStrategy, ConcurrencyLimitStrategy,
Context, Context,
NonRetryableException,
) )
from hatchet_sdk.labels import DesiredWorkerLabel from hatchet_sdk.labels import DesiredWorkerLabel
from pydantic import BaseModel from pydantic import BaseModel
@@ -43,8 +44,10 @@ from reflector.hatchet.constants import (
TIMEOUT_LONG, TIMEOUT_LONG,
TIMEOUT_MEDIUM, TIMEOUT_MEDIUM,
TIMEOUT_SHORT, TIMEOUT_SHORT,
TIMEOUT_TITLE,
TaskName, TaskName,
) )
from reflector.hatchet.error_classification import is_non_retryable
from reflector.hatchet.workflows.models import ( from reflector.hatchet.workflows.models import (
ActionItemsResult, ActionItemsResult,
ConsentResult, ConsentResult,
@@ -216,6 +219,13 @@ def make_audio_progress_logger(
R = TypeVar("R") R = TypeVar("R")
def _successful_run_results(
results: list[dict[str, Any] | BaseException],
) -> list[dict[str, Any]]:
"""Return only successful (non-exception) results from aio_run_many(return_exceptions=True)."""
return [r for r in results if not isinstance(r, BaseException)]
def with_error_handling( def with_error_handling(
step_name: TaskName, set_error_status: bool = True step_name: TaskName, set_error_status: bool = True
) -> Callable[ ) -> Callable[
@@ -243,8 +253,12 @@ def with_error_handling(
error=str(e), error=str(e),
exc_info=True, exc_info=True,
) )
if set_error_status: if is_non_retryable(e):
await set_workflow_error_status(input.transcript_id) # Hard fail: stop retries, set error status, fail workflow
if set_error_status:
await set_workflow_error_status(input.transcript_id)
raise NonRetryableException(str(e)) from e
# Transient: do not set error status — Hatchet will retry
raise raise
return wrapper # type: ignore[return-value] return wrapper # type: ignore[return-value]
@@ -253,7 +267,10 @@ def with_error_handling(
@daily_multitrack_pipeline.task( @daily_multitrack_pipeline.task(
execution_timeout=timedelta(seconds=TIMEOUT_SHORT), retries=3 execution_timeout=timedelta(seconds=TIMEOUT_SHORT),
retries=3,
backoff_factor=2.0,
backoff_max_seconds=10,
) )
@with_error_handling(TaskName.GET_RECORDING) @with_error_handling(TaskName.GET_RECORDING)
async def get_recording(input: PipelineInput, ctx: Context) -> RecordingResult: async def get_recording(input: PipelineInput, ctx: Context) -> RecordingResult:
@@ -309,6 +326,8 @@ async def get_recording(input: PipelineInput, ctx: Context) -> RecordingResult:
parents=[get_recording], parents=[get_recording],
execution_timeout=timedelta(seconds=TIMEOUT_SHORT), execution_timeout=timedelta(seconds=TIMEOUT_SHORT),
retries=3, retries=3,
backoff_factor=2.0,
backoff_max_seconds=10,
) )
@with_error_handling(TaskName.GET_PARTICIPANTS) @with_error_handling(TaskName.GET_PARTICIPANTS)
async def get_participants(input: PipelineInput, ctx: Context) -> ParticipantsResult: async def get_participants(input: PipelineInput, ctx: Context) -> ParticipantsResult:
@@ -412,6 +431,8 @@ async def get_participants(input: PipelineInput, ctx: Context) -> ParticipantsRe
parents=[get_participants], parents=[get_participants],
execution_timeout=timedelta(seconds=TIMEOUT_HEAVY), execution_timeout=timedelta(seconds=TIMEOUT_HEAVY),
retries=3, retries=3,
backoff_factor=2.0,
backoff_max_seconds=30,
) )
@with_error_handling(TaskName.PROCESS_TRACKS) @with_error_handling(TaskName.PROCESS_TRACKS)
async def process_tracks(input: PipelineInput, ctx: Context) -> ProcessTracksResult: async def process_tracks(input: PipelineInput, ctx: Context) -> ProcessTracksResult:
@@ -435,7 +456,7 @@ async def process_tracks(input: PipelineInput, ctx: Context) -> ProcessTracksRes
for i, track in enumerate(input.tracks) for i, track in enumerate(input.tracks)
] ]
results = await track_workflow.aio_run_many(bulk_runs) results = await track_workflow.aio_run_many(bulk_runs, return_exceptions=True)
target_language = participants_result.target_language target_language = participants_result.target_language
@@ -443,7 +464,18 @@ async def process_tracks(input: PipelineInput, ctx: Context) -> ProcessTracksRes
padded_tracks = [] padded_tracks = []
created_padded_files = set() created_padded_files = set()
for result in results: for i, result in enumerate(results):
if isinstance(result, BaseException):
logger.error(
"[Hatchet] process_tracks: track workflow failed, failing step",
transcript_id=input.transcript_id,
track_index=i,
error=str(result),
)
ctx.log(f"process_tracks: track {i} failed ({result}), failing step")
raise ValueError(
f"Track {i} workflow failed after retries: {result!s}"
) from result
transcribe_result = TranscribeTrackResult(**result[TaskName.TRANSCRIBE_TRACK]) transcribe_result = TranscribeTrackResult(**result[TaskName.TRANSCRIBE_TRACK])
track_words.append(transcribe_result.words) track_words.append(transcribe_result.words)
@@ -481,7 +513,9 @@ async def process_tracks(input: PipelineInput, ctx: Context) -> ProcessTracksRes
@daily_multitrack_pipeline.task( @daily_multitrack_pipeline.task(
parents=[process_tracks], parents=[process_tracks],
execution_timeout=timedelta(seconds=TIMEOUT_AUDIO), execution_timeout=timedelta(seconds=TIMEOUT_AUDIO),
retries=3, retries=2,
backoff_factor=2.0,
backoff_max_seconds=15,
desired_worker_labels={ desired_worker_labels={
"pool": DesiredWorkerLabel( "pool": DesiredWorkerLabel(
value="cpu-heavy", value="cpu-heavy",
@@ -593,6 +627,8 @@ async def mixdown_tracks(input: PipelineInput, ctx: Context) -> MixdownResult:
parents=[mixdown_tracks], parents=[mixdown_tracks],
execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM), execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM),
retries=3, retries=3,
backoff_factor=2.0,
backoff_max_seconds=10,
) )
@with_error_handling(TaskName.GENERATE_WAVEFORM) @with_error_handling(TaskName.GENERATE_WAVEFORM)
async def generate_waveform(input: PipelineInput, ctx: Context) -> WaveformResult: async def generate_waveform(input: PipelineInput, ctx: Context) -> WaveformResult:
@@ -661,6 +697,8 @@ async def generate_waveform(input: PipelineInput, ctx: Context) -> WaveformResul
parents=[process_tracks], parents=[process_tracks],
execution_timeout=timedelta(seconds=TIMEOUT_HEAVY), execution_timeout=timedelta(seconds=TIMEOUT_HEAVY),
retries=3, retries=3,
backoff_factor=2.0,
backoff_max_seconds=30,
) )
@with_error_handling(TaskName.DETECT_TOPICS) @with_error_handling(TaskName.DETECT_TOPICS)
async def detect_topics(input: PipelineInput, ctx: Context) -> TopicsResult: async def detect_topics(input: PipelineInput, ctx: Context) -> TopicsResult:
@@ -722,11 +760,22 @@ async def detect_topics(input: PipelineInput, ctx: Context) -> TopicsResult:
for chunk in chunks for chunk in chunks
] ]
results = await topic_chunk_workflow.aio_run_many(bulk_runs) results = await topic_chunk_workflow.aio_run_many(bulk_runs, return_exceptions=True)
topic_chunks = [ topic_chunks: list[TopicChunkResult] = []
TopicChunkResult(**result[TaskName.DETECT_CHUNK_TOPIC]) for result in results for i, result in enumerate(results):
] if isinstance(result, BaseException):
logger.error(
"[Hatchet] detect_topics: chunk workflow failed, failing step",
transcript_id=input.transcript_id,
chunk_index=i,
error=str(result),
)
ctx.log(f"detect_topics: chunk {i} failed ({result}), failing step")
raise ValueError(
f"Topic chunk {i} workflow failed after retries: {result!s}"
) from result
topic_chunks.append(TopicChunkResult(**result[TaskName.DETECT_CHUNK_TOPIC]))
async with fresh_db_connection(): async with fresh_db_connection():
transcript = await transcripts_controller.get_by_id(input.transcript_id) transcript = await transcripts_controller.get_by_id(input.transcript_id)
@@ -764,8 +813,10 @@ async def detect_topics(input: PipelineInput, ctx: Context) -> TopicsResult:
@daily_multitrack_pipeline.task( @daily_multitrack_pipeline.task(
parents=[detect_topics], parents=[detect_topics],
execution_timeout=timedelta(seconds=TIMEOUT_HEAVY), execution_timeout=timedelta(seconds=TIMEOUT_TITLE),
retries=3, retries=3,
backoff_factor=2.0,
backoff_max_seconds=15,
) )
@with_error_handling(TaskName.GENERATE_TITLE) @with_error_handling(TaskName.GENERATE_TITLE)
async def generate_title(input: PipelineInput, ctx: Context) -> TitleResult: async def generate_title(input: PipelineInput, ctx: Context) -> TitleResult:
@@ -830,7 +881,9 @@ async def generate_title(input: PipelineInput, ctx: Context) -> TitleResult:
@daily_multitrack_pipeline.task( @daily_multitrack_pipeline.task(
parents=[detect_topics], parents=[detect_topics],
execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM), execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM),
retries=3, retries=5,
backoff_factor=2.0,
backoff_max_seconds=30,
) )
@with_error_handling(TaskName.EXTRACT_SUBJECTS) @with_error_handling(TaskName.EXTRACT_SUBJECTS)
async def extract_subjects(input: PipelineInput, ctx: Context) -> SubjectsResult: async def extract_subjects(input: PipelineInput, ctx: Context) -> SubjectsResult:
@@ -909,6 +962,8 @@ async def extract_subjects(input: PipelineInput, ctx: Context) -> SubjectsResult
parents=[extract_subjects], parents=[extract_subjects],
execution_timeout=timedelta(seconds=TIMEOUT_HEAVY), execution_timeout=timedelta(seconds=TIMEOUT_HEAVY),
retries=3, retries=3,
backoff_factor=2.0,
backoff_max_seconds=30,
) )
@with_error_handling(TaskName.PROCESS_SUBJECTS) @with_error_handling(TaskName.PROCESS_SUBJECTS)
async def process_subjects(input: PipelineInput, ctx: Context) -> ProcessSubjectsResult: async def process_subjects(input: PipelineInput, ctx: Context) -> ProcessSubjectsResult:
@@ -935,12 +990,24 @@ async def process_subjects(input: PipelineInput, ctx: Context) -> ProcessSubject
for i, subject in enumerate(subjects) for i, subject in enumerate(subjects)
] ]
results = await subject_workflow.aio_run_many(bulk_runs) results = await subject_workflow.aio_run_many(bulk_runs, return_exceptions=True)
subject_summaries = [ subject_summaries: list[SubjectSummaryResult] = []
SubjectSummaryResult(**result[TaskName.GENERATE_DETAILED_SUMMARY]) for i, result in enumerate(results):
for result in results if isinstance(result, BaseException):
] logger.error(
"[Hatchet] process_subjects: subject workflow failed, failing step",
transcript_id=input.transcript_id,
subject_index=i,
error=str(result),
)
ctx.log(f"process_subjects: subject {i} failed ({result}), failing step")
raise ValueError(
f"Subject {i} workflow failed after retries: {result!s}"
) from result
subject_summaries.append(
SubjectSummaryResult(**result[TaskName.GENERATE_DETAILED_SUMMARY])
)
ctx.log(f"process_subjects complete: {len(subject_summaries)} summaries") ctx.log(f"process_subjects complete: {len(subject_summaries)} summaries")
@@ -951,6 +1018,8 @@ async def process_subjects(input: PipelineInput, ctx: Context) -> ProcessSubject
parents=[process_subjects], parents=[process_subjects],
execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM), execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM),
retries=3, retries=3,
backoff_factor=2.0,
backoff_max_seconds=15,
) )
@with_error_handling(TaskName.GENERATE_RECAP) @with_error_handling(TaskName.GENERATE_RECAP)
async def generate_recap(input: PipelineInput, ctx: Context) -> RecapResult: async def generate_recap(input: PipelineInput, ctx: Context) -> RecapResult:
@@ -1040,6 +1109,8 @@ async def generate_recap(input: PipelineInput, ctx: Context) -> RecapResult:
parents=[extract_subjects], parents=[extract_subjects],
execution_timeout=timedelta(seconds=TIMEOUT_LONG), execution_timeout=timedelta(seconds=TIMEOUT_LONG),
retries=3, retries=3,
backoff_factor=2.0,
backoff_max_seconds=15,
) )
@with_error_handling(TaskName.IDENTIFY_ACTION_ITEMS) @with_error_handling(TaskName.IDENTIFY_ACTION_ITEMS)
async def identify_action_items( async def identify_action_items(
@@ -1108,6 +1179,8 @@ async def identify_action_items(
parents=[process_tracks, generate_title, generate_recap, identify_action_items], parents=[process_tracks, generate_title, generate_recap, identify_action_items],
execution_timeout=timedelta(seconds=TIMEOUT_SHORT), execution_timeout=timedelta(seconds=TIMEOUT_SHORT),
retries=3, retries=3,
backoff_factor=2.0,
backoff_max_seconds=5,
) )
@with_error_handling(TaskName.FINALIZE) @with_error_handling(TaskName.FINALIZE)
async def finalize(input: PipelineInput, ctx: Context) -> FinalizeResult: async def finalize(input: PipelineInput, ctx: Context) -> FinalizeResult:
@@ -1177,7 +1250,11 @@ async def finalize(input: PipelineInput, ctx: Context) -> FinalizeResult:
@daily_multitrack_pipeline.task( @daily_multitrack_pipeline.task(
parents=[finalize], execution_timeout=timedelta(seconds=TIMEOUT_SHORT), retries=3 parents=[finalize],
execution_timeout=timedelta(seconds=TIMEOUT_SHORT),
retries=3,
backoff_factor=2.0,
backoff_max_seconds=10,
) )
@with_error_handling(TaskName.CLEANUP_CONSENT, set_error_status=False) @with_error_handling(TaskName.CLEANUP_CONSENT, set_error_status=False)
async def cleanup_consent(input: PipelineInput, ctx: Context) -> ConsentResult: async def cleanup_consent(input: PipelineInput, ctx: Context) -> ConsentResult:
@@ -1283,6 +1360,8 @@ async def cleanup_consent(input: PipelineInput, ctx: Context) -> ConsentResult:
parents=[cleanup_consent], parents=[cleanup_consent],
execution_timeout=timedelta(seconds=TIMEOUT_SHORT), execution_timeout=timedelta(seconds=TIMEOUT_SHORT),
retries=5, retries=5,
backoff_factor=2.0,
backoff_max_seconds=15,
) )
@with_error_handling(TaskName.POST_ZULIP, set_error_status=False) @with_error_handling(TaskName.POST_ZULIP, set_error_status=False)
async def post_zulip(input: PipelineInput, ctx: Context) -> ZulipResult: async def post_zulip(input: PipelineInput, ctx: Context) -> ZulipResult:
@@ -1310,6 +1389,8 @@ async def post_zulip(input: PipelineInput, ctx: Context) -> ZulipResult:
parents=[cleanup_consent], parents=[cleanup_consent],
execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM), execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM),
retries=5, retries=5,
backoff_factor=2.0,
backoff_max_seconds=15,
) )
@with_error_handling(TaskName.SEND_WEBHOOK, set_error_status=False) @with_error_handling(TaskName.SEND_WEBHOOK, set_error_status=False)
async def send_webhook(input: PipelineInput, ctx: Context) -> WebhookResult: async def send_webhook(input: PipelineInput, ctx: Context) -> WebhookResult:
@@ -1378,3 +1459,32 @@ async def send_webhook(input: PipelineInput, ctx: Context) -> WebhookResult:
except Exception as e: except Exception as e:
ctx.log(f"send_webhook unexpected error, continuing anyway: {e}") ctx.log(f"send_webhook unexpected error, continuing anyway: {e}")
return WebhookResult(webhook_sent=False) return WebhookResult(webhook_sent=False)
async def on_workflow_failure(input: PipelineInput, ctx: Context) -> None:
"""Run when the workflow is truly dead (all retries exhausted).
Sets transcript status to 'error' only if it is not already 'ended'.
Post-finalize tasks (cleanup_consent, post_zulip, send_webhook) use
set_error_status=False; if one of them fails, we must not overwrite
the 'ended' status that finalize already set.
"""
async with fresh_db_connection():
from reflector.db.transcripts import transcripts_controller # noqa: PLC0415
transcript = await transcripts_controller.get_by_id(input.transcript_id)
if transcript and transcript.status == "ended":
logger.info(
"[Hatchet] on_workflow_failure: transcript already ended, skipping error status (failure was post-finalize)",
transcript_id=input.transcript_id,
)
ctx.log(
"on_workflow_failure: transcript already ended, skipping error status"
)
return
await set_workflow_error_status(input.transcript_id)
@daily_multitrack_pipeline.on_failure_task()
async def _register_on_workflow_failure(input: PipelineInput, ctx: Context) -> None:
await on_workflow_failure(input, ctx)

View File

@@ -34,7 +34,12 @@ padding_workflow = hatchet.workflow(
) )
@padding_workflow.task(execution_timeout=timedelta(seconds=TIMEOUT_AUDIO), retries=3) @padding_workflow.task(
execution_timeout=timedelta(seconds=TIMEOUT_AUDIO),
retries=3,
backoff_factor=2.0,
backoff_max_seconds=30,
)
async def pad_track(input: PaddingInput, ctx: Context) -> PadTrackResult: async def pad_track(input: PaddingInput, ctx: Context) -> PadTrackResult:
"""Pad audio track with silence based on WebM container start_time.""" """Pad audio track with silence based on WebM container start_time."""
ctx.log(f"pad_track: track {input.track_index}, s3_key={input.s3_key}") ctx.log(f"pad_track: track {input.track_index}, s3_key={input.s3_key}")

View File

@@ -13,7 +13,7 @@ from hatchet_sdk.rate_limit import RateLimit
from pydantic import BaseModel from pydantic import BaseModel
from reflector.hatchet.client import HatchetClientManager from reflector.hatchet.client import HatchetClientManager
from reflector.hatchet.constants import LLM_RATE_LIMIT_KEY, TIMEOUT_MEDIUM from reflector.hatchet.constants import LLM_RATE_LIMIT_KEY, TIMEOUT_HEAVY
from reflector.hatchet.workflows.models import SubjectSummaryResult from reflector.hatchet.workflows.models import SubjectSummaryResult
from reflector.logger import logger from reflector.logger import logger
from reflector.processors.summary.prompts import ( from reflector.processors.summary.prompts import (
@@ -41,8 +41,10 @@ subject_workflow = hatchet.workflow(
@subject_workflow.task( @subject_workflow.task(
execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM), execution_timeout=timedelta(seconds=TIMEOUT_HEAVY),
retries=3, retries=5,
backoff_factor=2.0,
backoff_max_seconds=60,
rate_limits=[RateLimit(static_key=LLM_RATE_LIMIT_KEY, units=2)], rate_limits=[RateLimit(static_key=LLM_RATE_LIMIT_KEY, units=2)],
) )
async def generate_detailed_summary( async def generate_detailed_summary(

View File

@@ -50,7 +50,9 @@ topic_chunk_workflow = hatchet.workflow(
@topic_chunk_workflow.task( @topic_chunk_workflow.task(
execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM), execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM),
retries=3, retries=5,
backoff_factor=2.0,
backoff_max_seconds=60,
rate_limits=[RateLimit(static_key=LLM_RATE_LIMIT_KEY, units=1)], rate_limits=[RateLimit(static_key=LLM_RATE_LIMIT_KEY, units=1)],
) )
async def detect_chunk_topic(input: TopicChunkInput, ctx: Context) -> TopicChunkResult: async def detect_chunk_topic(input: TopicChunkInput, ctx: Context) -> TopicChunkResult:

View File

@@ -44,7 +44,12 @@ hatchet = HatchetClientManager.get_client()
track_workflow = hatchet.workflow(name="TrackProcessing", input_validator=TrackInput) track_workflow = hatchet.workflow(name="TrackProcessing", input_validator=TrackInput)
@track_workflow.task(execution_timeout=timedelta(seconds=TIMEOUT_AUDIO), retries=3) @track_workflow.task(
execution_timeout=timedelta(seconds=TIMEOUT_AUDIO),
retries=3,
backoff_factor=2.0,
backoff_max_seconds=30,
)
async def pad_track(input: TrackInput, ctx: Context) -> PadTrackResult: async def pad_track(input: TrackInput, ctx: Context) -> PadTrackResult:
"""Pad single audio track with silence for alignment. """Pad single audio track with silence for alignment.
@@ -137,7 +142,11 @@ async def pad_track(input: TrackInput, ctx: Context) -> PadTrackResult:
@track_workflow.task( @track_workflow.task(
parents=[pad_track], execution_timeout=timedelta(seconds=TIMEOUT_HEAVY), retries=3 parents=[pad_track],
execution_timeout=timedelta(seconds=TIMEOUT_HEAVY),
retries=3,
backoff_factor=2.0,
backoff_max_seconds=30,
) )
async def transcribe_track(input: TrackInput, ctx: Context) -> TranscribeTrackResult: async def transcribe_track(input: TrackInput, ctx: Context) -> TranscribeTrackResult:
"""Transcribe audio track using GPU (Modal.com) or local Whisper.""" """Transcribe audio track using GPU (Modal.com) or local Whisper."""

View File

@@ -65,10 +65,25 @@ class LLM:
async def get_response( async def get_response(
self, prompt: str, texts: list[str], tone_name: str | None = None self, prompt: str, texts: list[str], tone_name: str | None = None
) -> str: ) -> str:
"""Get a text response using TreeSummarize for non-function-calling models""" """Get a text response using TreeSummarize for non-function-calling models.
summarizer = TreeSummarize(verbose=False)
response = await summarizer.aget_response(prompt, texts, tone_name=tone_name) Uses the same retry() wrapper as get_structured_response for transient
return str(response).strip() network errors (connection, timeout, OSError) with exponential backoff.
"""
async def _call():
summarizer = TreeSummarize(verbose=False)
response = await summarizer.aget_response(
prompt, texts, tone_name=tone_name
)
return str(response).strip()
return await retry(_call)(
retry_attempts=3,
retry_backoff_interval=1.0,
retry_backoff_max=30.0,
retry_ignore_exc_types=(ConnectionError, TimeoutError, OSError),
)
async def get_structured_response( async def get_structured_response(
self, self,

View File

@@ -7,7 +7,7 @@ import os
import httpx import httpx
from reflector.hatchet.constants import TIMEOUT_AUDIO from reflector.hatchet.constants import TIMEOUT_AUDIO_HTTP
from reflector.logger import logger from reflector.logger import logger
from reflector.processors.audio_padding import AudioPaddingProcessor, PaddingResponse from reflector.processors.audio_padding import AudioPaddingProcessor, PaddingResponse
from reflector.processors.audio_padding_auto import AudioPaddingAutoProcessor from reflector.processors.audio_padding_auto import AudioPaddingAutoProcessor
@@ -60,7 +60,7 @@ class AudioPaddingModalProcessor(AudioPaddingProcessor):
headers["Authorization"] = f"Bearer {self.modal_api_key}" headers["Authorization"] = f"Bearer {self.modal_api_key}"
try: try:
async with httpx.AsyncClient(timeout=TIMEOUT_AUDIO) as client: async with httpx.AsyncClient(timeout=TIMEOUT_AUDIO_HTTP) as client:
response = await client.post( response = await client.post(
url, url,
headers=headers, headers=headers,

View File

@@ -55,7 +55,9 @@ class Settings(BaseSettings):
WHISPER_FILE_MODEL: str = "tiny" WHISPER_FILE_MODEL: str = "tiny"
TRANSCRIPT_URL: str | None = None TRANSCRIPT_URL: str | None = None
TRANSCRIPT_TIMEOUT: int = 90 TRANSCRIPT_TIMEOUT: int = 90
TRANSCRIPT_FILE_TIMEOUT: int = 600 TRANSCRIPT_FILE_TIMEOUT: int = (
540 # Below Hatchet TIMEOUT_HEAVY (600) to avoid timeout race
)
# Audio Transcription: modal backend # Audio Transcription: modal backend
TRANSCRIPT_MODAL_API_KEY: str | None = None TRANSCRIPT_MODAL_API_KEY: str | None = None

View File

@@ -30,6 +30,7 @@ def retry(fn):
"retry_httpx_status_stop", "retry_httpx_status_stop",
( (
401, # auth issue 401, # auth issue
402, # payment required / no credits — needs human action
404, # not found 404, # not found
413, # payload too large 413, # payload too large
418, # teapot 418, # teapot
@@ -58,8 +59,9 @@ def retry(fn):
result = await fn(*args, **kwargs) result = await fn(*args, **kwargs)
if isinstance(result, Response): if isinstance(result, Response):
result.raise_for_status() result.raise_for_status()
if result: # Return any result including falsy (e.g. "" from get_response);
return result # only retry on exception, not on empty string.
return result
except HTTPStatusError as e: except HTTPStatusError as e:
retry_logger.exception(e) retry_logger.exception(e)
status_code = e.response.status_code status_code = e.response.status_code

View File

@@ -50,5 +50,8 @@ async def transcript_process(
if isinstance(config, ProcessError): if isinstance(config, ProcessError):
raise HTTPException(status_code=500, detail=config.detail) raise HTTPException(status_code=500, detail=config.detail)
else: else:
await dispatch_transcript_processing(config) # When transcript is in error state, force a new workflow instead of replaying
# (replay would re-run from failure point with same conditions and likely fail again)
force = transcript.status == "error"
await dispatch_transcript_processing(config, force=force)
return ProcessStatus(status="ok") return ProcessStatus(status="ok")

View File

@@ -0,0 +1,303 @@
"""
Tests for Hatchet error handling: NonRetryable classification and error status.
These tests encode the desired behavior from the Hatchet Workflow Analysis doc:
- Transient exceptions: do NOT set error status (let Hatchet retry; user stays on "processing").
- Hard-fail exceptions: set error status and re-raise as NonRetryableException (stop retries).
- on_failure_task: sets error status when workflow is truly dead.
Run before the fix: some tests fail (reproducing the issues).
Run after the fix: all tests pass.
"""
from contextlib import asynccontextmanager
from unittest.mock import AsyncMock, MagicMock, patch
import httpx
import pytest
from hatchet_sdk import NonRetryableException
from reflector.hatchet.error_classification import is_non_retryable
from reflector.llm import LLMParseError
# --- Tests for is_non_retryable() (pass once error_classification exists) ---
def test_is_non_retryable_returns_true_for_value_error():
    """A ValueError (e.g. missing config) must stop retries immediately."""
    exc = ValueError("DAILY_API_KEY must be set")
    assert is_non_retryable(exc) is True
def test_is_non_retryable_returns_true_for_type_error():
    """A TypeError (bad input) must stop retries immediately."""
    exc = TypeError("expected str")
    assert is_non_retryable(exc) is True
def test_is_non_retryable_returns_true_for_http_401():
    """HTTP 401 (bad credentials) will not fix itself — stop retries."""
    response = MagicMock(status_code=401)
    exc = httpx.HTTPStatusError(
        "Unauthorized", request=MagicMock(), response=response
    )
    assert is_non_retryable(exc) is True
def test_is_non_retryable_returns_true_for_http_402():
    """HTTP 402 (no credits) needs human action — stop retries."""
    response = MagicMock(status_code=402)
    exc = httpx.HTTPStatusError(
        "Payment Required", request=MagicMock(), response=response
    )
    assert is_non_retryable(exc) is True
def test_is_non_retryable_returns_true_for_http_404():
    """HTTP 404 (missing resource) will not appear on retry — stop."""
    response = MagicMock(status_code=404)
    exc = httpx.HTTPStatusError(
        "Not Found", request=MagicMock(), response=response
    )
    assert is_non_retryable(exc) is True
def test_is_non_retryable_returns_false_for_http_503():
    """HTTP 503 is a transient outage — keep retrying."""
    response = MagicMock(status_code=503)
    exc = httpx.HTTPStatusError(
        "Service Unavailable", request=MagicMock(), response=response
    )
    assert is_non_retryable(exc) is False
def test_is_non_retryable_returns_false_for_timeout():
    """Timeouts are transient and must remain retryable."""
    timeout_exc = httpx.TimeoutException("timed out")
    assert is_non_retryable(timeout_exc) is False
def test_is_non_retryable_returns_true_for_llm_parse_error():
    """An LLMParseError (the LLM layer already retried internally) must hard-fail."""
    from pydantic import BaseModel

    class _Schema(BaseModel):
        pass

    parse_error = LLMParseError(_Schema, "Failed to parse", 3)
    assert is_non_retryable(parse_error) is True
def test_is_non_retryable_returns_true_for_non_retryable_exception():
    """An exception already wrapped as NonRetryableException stays non-retryable."""
    wrapped = NonRetryableException("custom")
    assert is_non_retryable(wrapped) is True
# --- Tests for with_error_handling (need pipeline module with patch) ---
@pytest.fixture(scope="module")
def pipeline_module():
    """Import daily_multitrack_pipeline with the Hatchet client mocked.

    Module scope: the import (and the workflow/task registration it
    triggers) runs once per test module. Both patches must be active
    *before* the import so module-level registration never touches a
    real Hatchet client.
    """
    with patch("reflector.hatchet.client.settings") as s:
        # Settings the client manager reads when building the client.
        s.HATCHET_CLIENT_TOKEN = "test-token"
        s.HATCHET_DEBUG = False
        mock_client = MagicMock()
        # workflow(...) is called at import time for each workflow definition.
        mock_client.workflow.return_value = MagicMock()
        with patch(
            "reflector.hatchet.client.HatchetClientManager.get_client",
            return_value=mock_client,
        ):
            # Deferred import: must execute while the patches above are live.
            from reflector.hatchet.workflows import daily_multitrack_pipeline
            return daily_multitrack_pipeline
@pytest.fixture
def mock_input():
    """Build the smallest PipelineInput the decorator tests need."""
    from reflector.hatchet.workflows.daily_multitrack_pipeline import PipelineInput

    fields = {
        "recording_id": "rec-1",
        "tracks": [],
        "bucket_name": "bucket",
        "transcript_id": "ts-123",
        "room_id": None,
    }
    return PipelineInput(**fields)
@pytest.fixture
def mock_ctx():
    """Stand-in for Hatchet's Context — only .log() is exercised."""
    context = MagicMock()
    context.log = MagicMock()
    return context
@pytest.mark.asyncio
async def test_with_error_handling_transient_does_not_set_error_status(
    pipeline_module, mock_input, mock_ctx
):
    """Transient exception must NOT set error status (so user stays on 'processing' during retries).

    Before fix: set_workflow_error_status is called on every exception → FAIL.
    After fix: not called for transient → PASS.
    """
    from reflector.hatchet.workflows.daily_multitrack_pipeline import (
        TaskName,
        with_error_handling,
    )

    async def raise_timeout(input, ctx):
        raise httpx.TimeoutException("timed out")

    decorated = with_error_handling(TaskName.GET_RECORDING)(raise_timeout)

    with patch(
        "reflector.hatchet.workflows.daily_multitrack_pipeline.set_workflow_error_status",
        new_callable=AsyncMock,
    ) as error_status_mock:
        with pytest.raises(httpx.TimeoutException):
            await decorated(mock_input, mock_ctx)
        # Desired: do NOT set error status for transient (Hatchet will retry)
        error_status_mock.assert_not_called()
@pytest.mark.asyncio
async def test_with_error_handling_hard_fail_raises_non_retryable_and_sets_status(
    pipeline_module, mock_input, mock_ctx
):
    """Hard-fail (e.g. ValueError) must set error status and re-raise NonRetryableException.

    Before fix: raises ValueError, set_workflow_error_status called → test would need to expect ValueError.
    After fix: raises NonRetryableException, set_workflow_error_status called → PASS.
    """
    from reflector.hatchet.workflows.daily_multitrack_pipeline import (
        TaskName,
        with_error_handling,
    )

    async def raise_hard_failure(input, ctx):
        raise ValueError("PADDING_URL must be set")

    decorated = with_error_handling(TaskName.GET_RECORDING)(raise_hard_failure)

    with patch(
        "reflector.hatchet.workflows.daily_multitrack_pipeline.set_workflow_error_status",
        new_callable=AsyncMock,
    ) as error_status_mock:
        with pytest.raises(NonRetryableException) as raised:
            await decorated(mock_input, mock_ctx)
        # Original message must survive the wrapping exception.
        assert "PADDING_URL" in str(raised.value)
        error_status_mock.assert_called_once_with("ts-123")
@pytest.mark.asyncio
async def test_with_error_handling_set_error_status_false_never_sets_status(
    pipeline_module, mock_input, mock_ctx
):
    """When set_error_status=False, we must never set error status (e.g. cleanup_consent)."""
    from reflector.hatchet.workflows.daily_multitrack_pipeline import (
        TaskName,
        with_error_handling,
    )

    async def raise_failure(input, ctx):
        raise ValueError("something went wrong")

    decorated = with_error_handling(TaskName.CLEANUP_CONSENT, set_error_status=False)(
        raise_failure
    )

    with patch(
        "reflector.hatchet.workflows.daily_multitrack_pipeline.set_workflow_error_status",
        new_callable=AsyncMock,
    ) as error_status_mock:
        # Either exception type is acceptable; only the status side effect matters.
        with pytest.raises((ValueError, NonRetryableException)):
            await decorated(mock_input, mock_ctx)
        error_status_mock.assert_not_called()
@asynccontextmanager
async def _noop_db_context():
    """Async context manager that yields without touching the DB (for unit tests).

    Used as a drop-in replacement for fresh_db_connection when patching the
    pipeline module, so on_workflow_failure can run without a database.
    """
    yield None
@pytest.mark.asyncio
async def test_on_failure_task_sets_error_status(pipeline_module, mock_input, mock_ctx):
    """When workflow fails and transcript is not yet 'ended', on_failure sets status to 'error'."""
    from reflector.hatchet.workflows.daily_multitrack_pipeline import (
        on_workflow_failure,
    )

    # Transcript still mid-pipeline: error status must be applied.
    processing_transcript = MagicMock()
    processing_transcript.status = "processing"

    with (
        patch(
            "reflector.hatchet.workflows.daily_multitrack_pipeline.fresh_db_connection",
            _noop_db_context,
        ),
        patch(
            "reflector.db.transcripts.transcripts_controller.get_by_id",
            new_callable=AsyncMock,
            return_value=processing_transcript,
        ),
        patch(
            "reflector.hatchet.workflows.daily_multitrack_pipeline.set_workflow_error_status",
            new_callable=AsyncMock,
        ) as error_status_mock,
    ):
        await on_workflow_failure(mock_input, mock_ctx)
        error_status_mock.assert_called_once_with(mock_input.transcript_id)
@pytest.mark.asyncio
async def test_on_failure_task_does_not_overwrite_ended(
    pipeline_module, mock_input, mock_ctx
):
    """When workflow fails after finalize (e.g. post_zulip), do not overwrite 'ended' with 'error'.

    cleanup_consent, post_zulip, send_webhook use set_error_status=False; if one fails,
    on_workflow_failure must not set status to 'error' when transcript is already 'ended'.
    """
    from reflector.hatchet.workflows.daily_multitrack_pipeline import (
        on_workflow_failure,
    )

    # Transcript already finalized: status must be left untouched.
    ended_transcript = MagicMock()
    ended_transcript.status = "ended"

    with (
        patch(
            "reflector.hatchet.workflows.daily_multitrack_pipeline.fresh_db_connection",
            _noop_db_context,
        ),
        patch(
            "reflector.db.transcripts.transcripts_controller.get_by_id",
            new_callable=AsyncMock,
            return_value=ended_transcript,
        ),
        patch(
            "reflector.hatchet.workflows.daily_multitrack_pipeline.set_workflow_error_status",
            new_callable=AsyncMock,
        ) as error_status_mock,
    ):
        await on_workflow_failure(mock_input, mock_ctx)
        error_status_mock.assert_not_called()
# --- Tests for fan-out helper (_successful_run_results) ---
def test_successful_run_results_filters_exceptions():
    """_successful_run_results returns only non-exception items from aio_run_many(return_exceptions=True)."""
    from reflector.hatchet.workflows.daily_multitrack_pipeline import (
        _successful_run_results,
    )

    # Mixed output as produced by aio_run_many(return_exceptions=True).
    mixed_results = [
        {"key": "ok1"},
        ValueError("child failed"),
        {"key": "ok2"},
        RuntimeError("another"),
    ]

    kept = _successful_run_results(mixed_results)

    assert len(kept) == 2
    assert kept[0] == {"key": "ok1"}
    assert kept[1] == {"key": "ok2"}

View File

@@ -1,6 +1,6 @@
"""Tests for LLM structured output with astructured_predict + reflection retry""" """Tests for LLM structured output with astructured_predict + reflection retry"""
from unittest.mock import AsyncMock, patch from unittest.mock import AsyncMock, MagicMock, patch
import pytest import pytest
from pydantic import BaseModel, Field, ValidationError from pydantic import BaseModel, Field, ValidationError
@@ -252,6 +252,63 @@ class TestNetworkErrorRetries:
assert mock_settings.llm.astructured_predict.call_count == 3 assert mock_settings.llm.astructured_predict.call_count == 3
class TestGetResponseRetries:
"""Test that get_response() uses the same retry() wrapper for transient errors."""
@pytest.mark.asyncio
async def test_get_response_retries_on_connection_error(self, test_settings):
"""Test that get_response retries on ConnectionError and returns on success."""
llm = LLM(settings=test_settings, temperature=0.4, max_tokens=100)
mock_instance = MagicMock()
mock_instance.aget_response = AsyncMock(
side_effect=[
ConnectionError("Connection refused"),
" Summary text ",
]
)
with patch("reflector.llm.TreeSummarize", return_value=mock_instance):
result = await llm.get_response("Prompt", ["text"])
assert result == "Summary text"
assert mock_instance.aget_response.call_count == 2
@pytest.mark.asyncio
async def test_get_response_exhausts_retries(self, test_settings):
"""Test that get_response raises RetryException after retry attempts exceeded."""
llm = LLM(settings=test_settings, temperature=0.4, max_tokens=100)
mock_instance = MagicMock()
mock_instance.aget_response = AsyncMock(
side_effect=ConnectionError("Connection refused")
)
with patch("reflector.llm.TreeSummarize", return_value=mock_instance):
with pytest.raises(RetryException, match="Retry attempts exceeded"):
await llm.get_response("Prompt", ["text"])
assert mock_instance.aget_response.call_count == 3
@pytest.mark.asyncio
async def test_get_response_returns_empty_string_without_retry(self, test_settings):
"""Empty or whitespace-only LLM response must return '' and not raise RetryException.
retry() must return falsy results (e.g. '' from get_response) instead of
treating them as 'no result' and retrying until RetryException.
"""
llm = LLM(settings=test_settings, temperature=0.4, max_tokens=100)
mock_instance = MagicMock()
mock_instance.aget_response = AsyncMock(return_value=" \n ") # strip() -> ""
with patch("reflector.llm.TreeSummarize", return_value=mock_instance):
result = await llm.get_response("Prompt", ["text"])
assert result == ""
assert mock_instance.aget_response.call_count == 1
class TestTextsInclusion: class TestTextsInclusion:
"""Test that texts parameter is included in the prompt sent to astructured_predict""" """Test that texts parameter is included in the prompt sent to astructured_predict"""

View File

@@ -49,6 +49,15 @@ async def test_retry_httpx(httpx_mock):
) )
@pytest.mark.asyncio
async def test_retry_402_stops_by_default(httpx_mock):
"""402 (payment required / no credits) is in default retry_httpx_status_stop — do not retry."""
httpx_mock.add_response(status_code=402, json={"error": "insufficient_credits"})
async with httpx.AsyncClient() as client:
with pytest.raises(RetryHTTPException):
await retry(client.get)("https://test_url", retry_timeout=5)
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_retry_normal(): async def test_retry_normal():
left = 3 left = 3

View File

@@ -231,3 +231,81 @@ async def test_dailyco_recording_uses_multitrack_pipeline(client):
{"s3_key": k} for k in track_keys {"s3_key": k} for k in track_keys
] ]
mock_file_pipeline.delay.assert_not_called() mock_file_pipeline.delay.assert_not_called()
@pytest.mark.usefixtures("setup_database")
@pytest.mark.asyncio
async def test_reprocess_error_transcript_passes_force(client):
    """When transcript status is 'error', reprocess passes force=True to start fresh workflow."""
    from datetime import datetime, timezone

    from reflector.db.recordings import Recording, recordings_controller
    from reflector.db.rooms import rooms_controller
    from reflector.db.transcripts import transcripts_controller

    # Room that hosted the recording.
    room = await rooms_controller.add(
        name="test-room",
        user_id="test-user",
        zulip_auto_post=False,
        zulip_stream="",
        zulip_topic="",
        is_locked=False,
        room_mode="normal",
        recording_type="cloud",
        recording_trigger="automatic-2nd-participant",
        is_shared=False,
    )
    transcript = await transcripts_controller.add(
        "",
        source_kind="room",
        source_language="en",
        target_language="en",
        user_id="test-user",
        share_mode="public",
        room_id=room.id,
    )
    recording = await recordings_controller.create(
        Recording(
            bucket_name="daily-bucket",
            object_key="recordings/test-room",
            meeting_id="test-meeting",
            track_keys=["recordings/test-room/track1.webm"],
            recorded_at=datetime.now(timezone.utc),
        )
    )
    # Simulate a previously failed workflow run on this transcript.
    await transcripts_controller.update(
        transcript,
        {
            "recording_id": recording.id,
            "status": "error",
            "workflow_run_id": "old-failed-run",
        },
    )

    with (
        patch(
            "reflector.services.transcript_process.task_is_scheduled_or_active"
        ) as celery_check_mock,
        patch(
            "reflector.services.transcript_process.HatchetClientManager"
        ) as hatchet_manager_mock,
        patch(
            "reflector.views.transcripts_process.dispatch_transcript_processing",
            new_callable=AsyncMock,
        ) as dispatch_mock,
    ):
        # No Celery task in flight; prior Hatchet run reported as FAILED.
        celery_check_mock.return_value = False

        from hatchet_sdk.clients.rest.models import V1TaskStatus

        hatchet_manager_mock.get_workflow_run_status = AsyncMock(
            return_value=V1TaskStatus.FAILED
        )

        response = await client.post(f"/transcripts/{transcript.id}/process")

        assert response.status_code == 200
        dispatch_mock.assert_called_once()
        assert dispatch_mock.call_args.kwargs["force"] is True