fix: improve hatchet workflow reliability (#900)

* Increase max connections * Classify hard and transient hatchet errors * Fan out partial success * Force reprocessing of error transcripts * Stop retrying on 402 payment required * Avoid httpx/hatchet timeout race * Add retry wrapper to get_response for transient errors * Add retry backoff * Return falsy results so get_response won't retry on empty string * Skip error status in on_workflow_failure when transcript already ended * Fix precommit issues * Fail step on first fan-out failure instead of skipping
2026-04-24 22:25:19 +00:00 · 2026-03-06 17:07:26 +01:00
parent a682846645
commit c155f66982
17 changed files with 717 additions and 38 deletions
--- a/server/reflector/hatchet/constants.py
+++ b/server/reflector/hatchet/constants.py
@@ -39,5 +39,12 @@ TIMEOUT_MEDIUM = (
    300  # Single LLM calls, waveform generation (5m for slow LLM responses)
 )
 TIMEOUT_LONG = 180  # Action items (larger context LLM)
-TIMEOUT_AUDIO = 720  # Audio processing: padding, mixdown
-TIMEOUT_HEAVY = 600  # Transcription, fan-out LLM tasks
+TIMEOUT_TITLE = 300  # generate_title: single LLM call (was TIMEOUT_HEAVY, 600s)
+TIMEOUT_AUDIO = 720  # Audio processing: padding, mixdown (Hatchet execution_timeout)
+TIMEOUT_AUDIO_HTTP = (
+    660  # httpx timeout for pad_track — fires before Hatchet's 720s limit
+)
+TIMEOUT_HEAVY = 600  # Transcription, fan-out LLM tasks (Hatchet execution_timeout)
+TIMEOUT_HEAVY_HTTP = (
+    540  # httpx timeout for transcribe_track — fires before Hatchet's 600s limit
+)
--- a/server/reflector/hatchet/error_classification.py
+++ b/server/reflector/hatchet/error_classification.py
@@ -0,0 +1,74 @@
+"""Classify exceptions as non-retryable for Hatchet workflows.
+
+When a task raises NonRetryableException (or an exception classified as
+non-retryable and re-raised as such), Hatchet stops immediately — no further
+retries. Used by with_error_handling to avoid wasting retries on config errors,
+auth failures, corrupt data, etc.
+"""
+
+# Optional dependencies: only classify if the exception type is available.
+# This avoids hard dependency on openai/av/botocore for code paths that don't use them.
+try:
+    import openai
+except ImportError:
+    openai = None  # type: ignore[assignment]
+
+try:
+    import av
+except ImportError:
+    av = None  # type: ignore[assignment]
+
+try:
+    from botocore.exceptions import ClientError as BotoClientError
+except ImportError:
+    BotoClientError = None  # type: ignore[misc, assignment]
+
+from hatchet_sdk import NonRetryableException
+from httpx import HTTPStatusError
+
+from reflector.llm import LLMParseError
+
+# HTTP status codes that won't change on retry (auth, not found, payment, payload)
+NON_RETRYABLE_HTTP_STATUSES = {401, 402, 403, 404, 413}
+NON_RETRYABLE_S3_CODES = {"AccessDenied", "NoSuchBucket", "NoSuchKey"}
+
+
+def is_non_retryable(e: BaseException) -> bool:
+    """Return True if the exception should stop Hatchet retries immediately.
+
+    Hard failures (config, auth, missing resource, corrupt data) return True.
+    Everything else (timeouts, 5xx, 429, connection, unknown types) returns False.
+    """
+    if isinstance(e, NonRetryableException):
+        return True
+
+    # Config/input errors
+    if isinstance(e, (ValueError, TypeError)):
+        return True
+
+    # HTTP: final only for NON_RETRYABLE_HTTP_STATUSES; others stay retryable
+    if isinstance(e, HTTPStatusError):
+        return e.response.status_code in NON_RETRYABLE_HTTP_STATUSES
+
+    # OpenAI auth errors
+    if openai is not None and isinstance(e, openai.AuthenticationError):
+        return True
+
+    # LLM parse failures (already retried internally)
+    if isinstance(e, LLMParseError):
+        return True
+
+    # S3: final only for NON_RETRYABLE_S3_CODES; other error codes stay retryable
+    if BotoClientError is not None and isinstance(e, BotoClientError):
+        code = e.response.get("Error", {}).get("Code", "")
+        return code in NON_RETRYABLE_S3_CODES
+
+    # Corrupt audio (PyAV): av.AVError when present, else av.error.InvalidDataError
+    if av is not None:
+        av_error = getattr(av, "AVError", None) or getattr(
+            getattr(av, "error", None), "InvalidDataError", None
+        )
+        if av_error is not None and isinstance(e, av_error):
+            return True
+
+    return False
--- a/server/reflector/hatchet/workflows/daily_multitrack_pipeline.py
+++ b/server/reflector/hatchet/workflows/daily_multitrack_pipeline.py
@@ -27,6 +27,7 @@ from hatchet_sdk import (
    ConcurrencyExpression,
    ConcurrencyLimitStrategy,
    Context,
+    NonRetryableException,
 )
 from hatchet_sdk.labels import DesiredWorkerLabel
 from pydantic import BaseModel
@@ -43,8 +44,10 @@ from reflector.hatchet.constants import (
    TIMEOUT_LONG,
    TIMEOUT_MEDIUM,
    TIMEOUT_SHORT,
+    TIMEOUT_TITLE,
    TaskName,
 )
+from reflector.hatchet.error_classification import is_non_retryable
 from reflector.hatchet.workflows.models import (
    ActionItemsResult,
    ConsentResult,
@@ -216,6 +219,13 @@ def make_audio_progress_logger(
 R = TypeVar("R")


+def _successful_run_results(
+    results: list[dict[str, Any] | BaseException],
+) -> list[dict[str, Any]]:
+    """Return only successful (non-exception) results from aio_run_many(return_exceptions=True)."""
+    return [r for r in results if not isinstance(r, BaseException)]
+
+
 def with_error_handling(
    step_name: TaskName, set_error_status: bool = True
 ) -> Callable[
@@ -243,8 +253,12 @@ def with_error_handling(
                    error=str(e),
                    exc_info=True,
                )
-                if set_error_status:
-                    await set_workflow_error_status(input.transcript_id)
+                if is_non_retryable(e):
+                    # Hard fail: stop retries, set error status, fail workflow
+                    if set_error_status:
+                        await set_workflow_error_status(input.transcript_id)
+                    raise NonRetryableException(str(e)) from e
+                # Transient: do not set error status — Hatchet will retry
                raise

        return wrapper  # type: ignore[return-value]
@@ -253,7 +267,10 @@ def with_error_handling(


@daily_multitrack_pipeline.task(
-    execution_timeout=timedelta(seconds=TIMEOUT_SHORT), retries=3
+    execution_timeout=timedelta(seconds=TIMEOUT_SHORT),
+    retries=3,
+    backoff_factor=2.0,
+    backoff_max_seconds=10,
 )
@with_error_handling(TaskName.GET_RECORDING)
 async def get_recording(input: PipelineInput, ctx: Context) -> RecordingResult:
@@ -309,6 +326,8 @@ async def get_recording(input: PipelineInput, ctx: Context) -> RecordingResult:
    parents=[get_recording],
    execution_timeout=timedelta(seconds=TIMEOUT_SHORT),
    retries=3,
+    backoff_factor=2.0,
+    backoff_max_seconds=10,
 )
@with_error_handling(TaskName.GET_PARTICIPANTS)
 async def get_participants(input: PipelineInput, ctx: Context) -> ParticipantsResult:
@@ -412,6 +431,8 @@ async def get_participants(input: PipelineInput, ctx: Context) -> ParticipantsRe
    parents=[get_participants],
    execution_timeout=timedelta(seconds=TIMEOUT_HEAVY),
    retries=3,
+    backoff_factor=2.0,
+    backoff_max_seconds=30,
 )
@with_error_handling(TaskName.PROCESS_TRACKS)
 async def process_tracks(input: PipelineInput, ctx: Context) -> ProcessTracksResult:
@@ -435,7 +456,7 @@ async def process_tracks(input: PipelineInput, ctx: Context) -> ProcessTracksRes
        for i, track in enumerate(input.tracks)
    ]

-    results = await track_workflow.aio_run_many(bulk_runs)
+    results = await track_workflow.aio_run_many(bulk_runs, return_exceptions=True)

    target_language = participants_result.target_language

@@ -443,7 +464,18 @@ async def process_tracks(input: PipelineInput, ctx: Context) -> ProcessTracksRes
    padded_tracks = []
    created_padded_files = set()

-    for result in results:
+    for i, result in enumerate(results):
+        if isinstance(result, BaseException):
+            logger.error(
+                "[Hatchet] process_tracks: track workflow failed, failing step",
+                transcript_id=input.transcript_id,
+                track_index=i,
+                error=str(result),
+            )
+            ctx.log(f"process_tracks: track {i} failed ({result}), failing step")
+            raise ValueError(
+                f"Track {i} workflow failed after retries: {result!s}"
+            ) from result
        transcribe_result = TranscribeTrackResult(**result[TaskName.TRANSCRIBE_TRACK])
        track_words.append(transcribe_result.words)

@@ -481,7 +513,9 @@ async def process_tracks(input: PipelineInput, ctx: Context) -> ProcessTracksRes
@daily_multitrack_pipeline.task(
    parents=[process_tracks],
    execution_timeout=timedelta(seconds=TIMEOUT_AUDIO),
-    retries=3,
+    retries=2,
+    backoff_factor=2.0,
+    backoff_max_seconds=15,
    desired_worker_labels={
        "pool": DesiredWorkerLabel(
            value="cpu-heavy",
@@ -593,6 +627,8 @@ async def mixdown_tracks(input: PipelineInput, ctx: Context) -> MixdownResult:
    parents=[mixdown_tracks],
    execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM),
    retries=3,
+    backoff_factor=2.0,
+    backoff_max_seconds=10,
 )
@with_error_handling(TaskName.GENERATE_WAVEFORM)
 async def generate_waveform(input: PipelineInput, ctx: Context) -> WaveformResult:
@@ -661,6 +697,8 @@ async def generate_waveform(input: PipelineInput, ctx: Context) -> WaveformResul
    parents=[process_tracks],
    execution_timeout=timedelta(seconds=TIMEOUT_HEAVY),
    retries=3,
+    backoff_factor=2.0,
+    backoff_max_seconds=30,
 )
@with_error_handling(TaskName.DETECT_TOPICS)
 async def detect_topics(input: PipelineInput, ctx: Context) -> TopicsResult:
@@ -722,11 +760,22 @@ async def detect_topics(input: PipelineInput, ctx: Context) -> TopicsResult:
        for chunk in chunks
    ]

-    results = await topic_chunk_workflow.aio_run_many(bulk_runs)
+    results = await topic_chunk_workflow.aio_run_many(bulk_runs, return_exceptions=True)

-    topic_chunks = [
-        TopicChunkResult(**result[TaskName.DETECT_CHUNK_TOPIC]) for result in results
-    ]
+    topic_chunks: list[TopicChunkResult] = []
+    for i, result in enumerate(results):
+        if isinstance(result, BaseException):
+            logger.error(
+                "[Hatchet] detect_topics: chunk workflow failed, failing step",
+                transcript_id=input.transcript_id,
+                chunk_index=i,
+                error=str(result),
+            )
+            ctx.log(f"detect_topics: chunk {i} failed ({result}), failing step")
+            raise ValueError(
+                f"Topic chunk {i} workflow failed after retries: {result!s}"
+            ) from result
+        topic_chunks.append(TopicChunkResult(**result[TaskName.DETECT_CHUNK_TOPIC]))

    async with fresh_db_connection():
        transcript = await transcripts_controller.get_by_id(input.transcript_id)
@@ -764,8 +813,10 @@ async def detect_topics(input: PipelineInput, ctx: Context) -> TopicsResult:

@daily_multitrack_pipeline.task(
    parents=[detect_topics],
-    execution_timeout=timedelta(seconds=TIMEOUT_HEAVY),
+    execution_timeout=timedelta(seconds=TIMEOUT_TITLE),
    retries=3,
+    backoff_factor=2.0,
+    backoff_max_seconds=15,
 )
@with_error_handling(TaskName.GENERATE_TITLE)
 async def generate_title(input: PipelineInput, ctx: Context) -> TitleResult:
@@ -830,7 +881,9 @@ async def generate_title(input: PipelineInput, ctx: Context) -> TitleResult:
@daily_multitrack_pipeline.task(
    parents=[detect_topics],
    execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM),
-    retries=3,
+    retries=5,
+    backoff_factor=2.0,
+    backoff_max_seconds=30,
 )
@with_error_handling(TaskName.EXTRACT_SUBJECTS)
 async def extract_subjects(input: PipelineInput, ctx: Context) -> SubjectsResult:
@@ -909,6 +962,8 @@ async def extract_subjects(input: PipelineInput, ctx: Context) -> SubjectsResult
    parents=[extract_subjects],
    execution_timeout=timedelta(seconds=TIMEOUT_HEAVY),
    retries=3,
+    backoff_factor=2.0,
+    backoff_max_seconds=30,
 )
@with_error_handling(TaskName.PROCESS_SUBJECTS)
 async def process_subjects(input: PipelineInput, ctx: Context) -> ProcessSubjectsResult:
@@ -935,12 +990,24 @@ async def process_subjects(input: PipelineInput, ctx: Context) -> ProcessSubject
        for i, subject in enumerate(subjects)
    ]

-    results = await subject_workflow.aio_run_many(bulk_runs)
+    results = await subject_workflow.aio_run_many(bulk_runs, return_exceptions=True)

-    subject_summaries = [
-        SubjectSummaryResult(**result[TaskName.GENERATE_DETAILED_SUMMARY])
-        for result in results
-    ]
+    subject_summaries: list[SubjectSummaryResult] = []
+    for i, result in enumerate(results):
+        if isinstance(result, BaseException):
+            logger.error(
+                "[Hatchet] process_subjects: subject workflow failed, failing step",
+                transcript_id=input.transcript_id,
+                subject_index=i,
+                error=str(result),
+            )
+            ctx.log(f"process_subjects: subject {i} failed ({result}), failing step")
+            raise ValueError(
+                f"Subject {i} workflow failed after retries: {result!s}"
+            ) from result
+        subject_summaries.append(
+            SubjectSummaryResult(**result[TaskName.GENERATE_DETAILED_SUMMARY])
+        )

    ctx.log(f"process_subjects complete: {len(subject_summaries)} summaries")

@@ -951,6 +1018,8 @@ async def process_subjects(input: PipelineInput, ctx: Context) -> ProcessSubject
    parents=[process_subjects],
    execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM),
    retries=3,
+    backoff_factor=2.0,
+    backoff_max_seconds=15,
 )
@with_error_handling(TaskName.GENERATE_RECAP)
 async def generate_recap(input: PipelineInput, ctx: Context) -> RecapResult:
@@ -1040,6 +1109,8 @@ async def generate_recap(input: PipelineInput, ctx: Context) -> RecapResult:
    parents=[extract_subjects],
    execution_timeout=timedelta(seconds=TIMEOUT_LONG),
    retries=3,
+    backoff_factor=2.0,
+    backoff_max_seconds=15,
 )
@with_error_handling(TaskName.IDENTIFY_ACTION_ITEMS)
 async def identify_action_items(
@@ -1108,6 +1179,8 @@ async def identify_action_items(
    parents=[process_tracks, generate_title, generate_recap, identify_action_items],
    execution_timeout=timedelta(seconds=TIMEOUT_SHORT),
    retries=3,
+    backoff_factor=2.0,
+    backoff_max_seconds=5,
 )
@with_error_handling(TaskName.FINALIZE)
 async def finalize(input: PipelineInput, ctx: Context) -> FinalizeResult:
@@ -1177,7 +1250,11 @@ async def finalize(input: PipelineInput, ctx: Context) -> FinalizeResult:


@daily_multitrack_pipeline.task(
-    parents=[finalize], execution_timeout=timedelta(seconds=TIMEOUT_SHORT), retries=3
+    parents=[finalize],
+    execution_timeout=timedelta(seconds=TIMEOUT_SHORT),
+    retries=3,
+    backoff_factor=2.0,
+    backoff_max_seconds=10,
 )
@with_error_handling(TaskName.CLEANUP_CONSENT, set_error_status=False)
 async def cleanup_consent(input: PipelineInput, ctx: Context) -> ConsentResult:
@@ -1283,6 +1360,8 @@ async def cleanup_consent(input: PipelineInput, ctx: Context) -> ConsentResult:
    parents=[cleanup_consent],
    execution_timeout=timedelta(seconds=TIMEOUT_SHORT),
    retries=5,
+    backoff_factor=2.0,
+    backoff_max_seconds=15,
 )
@with_error_handling(TaskName.POST_ZULIP, set_error_status=False)
 async def post_zulip(input: PipelineInput, ctx: Context) -> ZulipResult:
@@ -1310,6 +1389,8 @@ async def post_zulip(input: PipelineInput, ctx: Context) -> ZulipResult:
    parents=[cleanup_consent],
    execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM),
    retries=5,
+    backoff_factor=2.0,
+    backoff_max_seconds=15,
 )
@with_error_handling(TaskName.SEND_WEBHOOK, set_error_status=False)
 async def send_webhook(input: PipelineInput, ctx: Context) -> WebhookResult:
@@ -1378,3 +1459,32 @@ async def send_webhook(input: PipelineInput, ctx: Context) -> WebhookResult:
        except Exception as e:
            ctx.log(f"send_webhook unexpected error, continuing anyway: {e}")
            return WebhookResult(webhook_sent=False)
+
+
+async def on_workflow_failure(input: PipelineInput, ctx: Context) -> None:
+    """Run when the workflow is truly dead (all retries exhausted).
+
+    Sets transcript status to 'error' only if it is not already 'ended'.
+    Post-finalize tasks (cleanup_consent, post_zulip, send_webhook) use
+    set_error_status=False; if one of them fails, we must not overwrite
+    the 'ended' status that finalize already set.
+    """
+    async with fresh_db_connection():
+        from reflector.db.transcripts import transcripts_controller  # noqa: PLC0415
+
+        transcript = await transcripts_controller.get_by_id(input.transcript_id)
+        if transcript and transcript.status == "ended":
+            logger.info(
+                "[Hatchet] on_workflow_failure: transcript already ended, skipping error status (failure was post-finalize)",
+                transcript_id=input.transcript_id,
+            )
+            ctx.log(
+                "on_workflow_failure: transcript already ended, skipping error status"
+            )
+            return
+    await set_workflow_error_status(input.transcript_id)
+
+
+@daily_multitrack_pipeline.on_failure_task()
+async def _register_on_workflow_failure(input: PipelineInput, ctx: Context) -> None:
+    await on_workflow_failure(input, ctx)
--- a/server/reflector/hatchet/workflows/padding_workflow.py
+++ b/server/reflector/hatchet/workflows/padding_workflow.py
@@ -34,7 +34,12 @@ padding_workflow = hatchet.workflow(
 )


-@padding_workflow.task(execution_timeout=timedelta(seconds=TIMEOUT_AUDIO), retries=3)
+@padding_workflow.task(
+    execution_timeout=timedelta(seconds=TIMEOUT_AUDIO),
+    retries=3,
+    backoff_factor=2.0,
+    backoff_max_seconds=30,
+)
 async def pad_track(input: PaddingInput, ctx: Context) -> PadTrackResult:
    """Pad audio track with silence based on WebM container start_time."""
    ctx.log(f"pad_track: track {input.track_index}, s3_key={input.s3_key}")
--- a/server/reflector/hatchet/workflows/subject_processing.py
+++ b/server/reflector/hatchet/workflows/subject_processing.py
@@ -13,7 +13,7 @@ from hatchet_sdk.rate_limit import RateLimit
 from pydantic import BaseModel

 from reflector.hatchet.client import HatchetClientManager
-from reflector.hatchet.constants import LLM_RATE_LIMIT_KEY, TIMEOUT_MEDIUM
+from reflector.hatchet.constants import LLM_RATE_LIMIT_KEY, TIMEOUT_HEAVY
 from reflector.hatchet.workflows.models import SubjectSummaryResult
 from reflector.logger import logger
 from reflector.processors.summary.prompts import (
@@ -41,8 +41,10 @@ subject_workflow = hatchet.workflow(


@subject_workflow.task(
-    execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM),
-    retries=3,
+    execution_timeout=timedelta(seconds=TIMEOUT_HEAVY),
+    retries=5,
+    backoff_factor=2.0,
+    backoff_max_seconds=60,
    rate_limits=[RateLimit(static_key=LLM_RATE_LIMIT_KEY, units=2)],
 )
 async def generate_detailed_summary(
--- a/server/reflector/hatchet/workflows/topic_chunk_processing.py
+++ b/server/reflector/hatchet/workflows/topic_chunk_processing.py
@@ -50,7 +50,9 @@ topic_chunk_workflow = hatchet.workflow(

@topic_chunk_workflow.task(
    execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM),
-    retries=3,
+    retries=5,
+    backoff_factor=2.0,
+    backoff_max_seconds=60,
    rate_limits=[RateLimit(static_key=LLM_RATE_LIMIT_KEY, units=1)],
 )
 async def detect_chunk_topic(input: TopicChunkInput, ctx: Context) -> TopicChunkResult:
--- a/server/reflector/hatchet/workflows/track_processing.py
+++ b/server/reflector/hatchet/workflows/track_processing.py
@@ -44,7 +44,12 @@ hatchet = HatchetClientManager.get_client()
 track_workflow = hatchet.workflow(name="TrackProcessing", input_validator=TrackInput)


-@track_workflow.task(execution_timeout=timedelta(seconds=TIMEOUT_AUDIO), retries=3)
+@track_workflow.task(
+    execution_timeout=timedelta(seconds=TIMEOUT_AUDIO),
+    retries=3,
+    backoff_factor=2.0,
+    backoff_max_seconds=30,
+)
 async def pad_track(input: TrackInput, ctx: Context) -> PadTrackResult:
    """Pad single audio track with silence for alignment.

@@ -137,7 +142,11 @@ async def pad_track(input: TrackInput, ctx: Context) -> PadTrackResult:


@track_workflow.task(
-    parents=[pad_track], execution_timeout=timedelta(seconds=TIMEOUT_HEAVY), retries=3
+    parents=[pad_track],
+    execution_timeout=timedelta(seconds=TIMEOUT_HEAVY),
+    retries=3,
+    backoff_factor=2.0,
+    backoff_max_seconds=30,
 )
 async def transcribe_track(input: TrackInput, ctx: Context) -> TranscribeTrackResult:
    """Transcribe audio track using GPU (Modal.com) or local Whisper."""
--- a/server/reflector/llm.py
+++ b/server/reflector/llm.py
@@ -65,10 +65,25 @@ class LLM:
    async def get_response(
        self, prompt: str, texts: list[str], tone_name: str | None = None
    ) -> str:
-        """Get a text response using TreeSummarize for non-function-calling models"""
-        summarizer = TreeSummarize(verbose=False)
-        response = await summarizer.aget_response(prompt, texts, tone_name=tone_name)
-        return str(response).strip()
+        """Get a text response using TreeSummarize for non-function-calling models.
+
+        Uses the same retry() wrapper as get_structured_response for transient
+        network errors (connection, timeout, OSError) with exponential backoff.
+        """
+
+        async def _call():
+            summarizer = TreeSummarize(verbose=False)
+            response = await summarizer.aget_response(
+                prompt, texts, tone_name=tone_name
+            )
+            return str(response).strip()
+
+        return await retry(_call)(
+            retry_attempts=3,
+            retry_backoff_interval=1.0,
+            retry_backoff_max=30.0,
+            retry_ignore_exc_types=(ConnectionError, TimeoutError, OSError),
+        )

    async def get_structured_response(
        self,
--- a/server/reflector/processors/audio_padding_modal.py
+++ b/server/reflector/processors/audio_padding_modal.py
@@ -7,7 +7,7 @@ import os

 import httpx

-from reflector.hatchet.constants import TIMEOUT_AUDIO
+from reflector.hatchet.constants import TIMEOUT_AUDIO_HTTP
 from reflector.logger import logger
 from reflector.processors.audio_padding import AudioPaddingProcessor, PaddingResponse
 from reflector.processors.audio_padding_auto import AudioPaddingAutoProcessor
@@ -60,7 +60,7 @@ class AudioPaddingModalProcessor(AudioPaddingProcessor):
            headers["Authorization"] = f"Bearer {self.modal_api_key}"

        try:
-            async with httpx.AsyncClient(timeout=TIMEOUT_AUDIO) as client:
+            async with httpx.AsyncClient(timeout=TIMEOUT_AUDIO_HTTP) as client:
                response = await client.post(
                    url,
                    headers=headers,
--- a/server/reflector/settings.py
+++ b/server/reflector/settings.py
@@ -55,7 +55,9 @@ class Settings(BaseSettings):
    WHISPER_FILE_MODEL: str = "tiny"
    TRANSCRIPT_URL: str | None = None
    TRANSCRIPT_TIMEOUT: int = 90
-    TRANSCRIPT_FILE_TIMEOUT: int = 600
+    TRANSCRIPT_FILE_TIMEOUT: int = (
+        540  # Below Hatchet TIMEOUT_HEAVY (600) to avoid timeout race
+    )

    # Audio Transcription: modal backend
    TRANSCRIPT_MODAL_API_KEY: str | None = None
--- a/server/reflector/utils/retry.py
+++ b/server/reflector/utils/retry.py
@@ -30,6 +30,7 @@ def retry(fn):
            "retry_httpx_status_stop",
            (
                401,  # auth issue
+                402,  # payment required / no credits — needs human action
                404,  # not found
                413,  # payload too large
                418,  # teapot
@@ -58,8 +59,9 @@ def retry(fn):
                result = await fn(*args, **kwargs)
                if isinstance(result, Response):
                    result.raise_for_status()
-                if result:
-                    return result
+                # Return any result including falsy (e.g. "" from get_response);
+                # only retry on exception, not on empty string.
+                return result
            except HTTPStatusError as e:
                retry_logger.exception(e)
                status_code = e.response.status_code
--- a/server/reflector/views/transcripts_process.py
+++ b/server/reflector/views/transcripts_process.py
@@ -50,5 +50,8 @@ async def transcript_process(
    if isinstance(config, ProcessError):
        raise HTTPException(status_code=500, detail=config.detail)
    else:
-        await dispatch_transcript_processing(config)
+        # When transcript is in error state, force a new workflow instead of replaying
+        # (replay would re-run from failure point with same conditions and likely fail again)
+        force = transcript.status == "error"
+        await dispatch_transcript_processing(config, force=force)
        return ProcessStatus(status="ok")