feat: durable (#794)

* durable (no-mistakes) * hatchet no-mistake * hatchet no-mistake * hatchet no-mistake, better logging * remove conductor and add hatchet tests (no-mistakes) * self-review (no-mistakes) * hatched logs * remove shadow mode for hatchet * and add hatchet processor setting to room * . * cleanup * hatchet init db * self-review (no-mistakes) * self-review (no-mistakes) * hatchet: restore zullip report * self-review round * self-review round * self-review round * dry hatchet with celery * dry hatched with celery - 2 * self-review round * more NES instead of str * self-review wip * self-review round * self-review round * self-review round * can_replay cancelled * add forgotten file * pr autoreviewer fixes * better log webhook events * durable_started return * migration sync * latest changes feature parity * migration merge * pr review --------- Co-authored-by: Igor Loskutov <igor.loskutoff@gmail.com>
2026-02-06 18:56:48 +00:00 · 2025-12-22 12:09:20 -05:00
parent f580b996ee
commit 1dac999b56
36 changed files with 4908 additions and 2009 deletions
--- a/server/reflector/services/transcript_process.py
+++ b/server/reflector/services/transcript_process.py
@@ -11,13 +11,19 @@ from typing import Literal, Union, assert_never

 import celery
 from celery.result import AsyncResult
+from hatchet_sdk.clients.rest.exceptions import ApiException
+from hatchet_sdk.clients.rest.models import V1TaskStatus

 from reflector.db.recordings import recordings_controller
-from reflector.db.transcripts import Transcript
+from reflector.db.rooms import rooms_controller
+from reflector.db.transcripts import Transcript, transcripts_controller
+from reflector.hatchet.client import HatchetClientManager
+from reflector.logger import logger
 from reflector.pipelines.main_file_pipeline import task_pipeline_file_process
 from reflector.pipelines.main_multitrack_pipeline import (
    task_pipeline_multitrack_process,
 )
+from reflector.settings import settings
 from reflector.utils.string import NonEmptyString


@@ -37,6 +43,8 @@ class MultitrackProcessingConfig:
    transcript_id: NonEmptyString
    bucket_name: NonEmptyString
    track_keys: list[str]
+    recording_id: NonEmptyString | None = None
+    room_id: NonEmptyString | None = None
    mode: Literal["multitrack"] = "multitrack"


@@ -49,6 +57,7 @@ class ValidationOk:
    # transcript currently doesnt always have recording_id
    recording_id: NonEmptyString | None
    transcript_id: NonEmptyString
+    room_id: NonEmptyString | None = None


@dataclass
@@ -96,6 +105,7 @@ async def validate_transcript_for_processing(
    if transcript.status == "idle":
        return ValidationNotReady(detail="Recording is not ready for processing")

+    # Check Celery tasks
    if task_is_scheduled_or_active(
        "reflector.pipelines.main_file_pipeline.task_pipeline_file_process",
        transcript_id=transcript.id,
@@ -105,8 +115,25 @@ async def validate_transcript_for_processing(
    ):
        return ValidationAlreadyScheduled(detail="already running")

+    # Check Hatchet workflows (if enabled)
+    if settings.HATCHET_ENABLED and transcript.workflow_run_id:
+        try:
+            status = await HatchetClientManager.get_workflow_run_status(
+                transcript.workflow_run_id
+            )
+            # If workflow is running or queued, don't allow new processing
+            if status in (V1TaskStatus.RUNNING, V1TaskStatus.QUEUED):
+                return ValidationAlreadyScheduled(
+                    detail="Hatchet workflow already running"
+                )
+        except ApiException:
+            # Workflow might be gone (404) or API issue - allow processing
+            pass
+
    return ValidationOk(
-        recording_id=transcript.recording_id, transcript_id=transcript.id
+        recording_id=transcript.recording_id,
+        transcript_id=transcript.id,
+        room_id=transcript.room_id,
    )


@@ -116,6 +143,7 @@ async def prepare_transcript_processing(validation: ValidationOk) -> PrepareResu
    """
    bucket_name: str | None = None
    track_keys: list[str] | None = None
+    recording_id: str | None = validation.recording_id

    if validation.recording_id:
        recording = await recordings_controller.get_by_id(validation.recording_id)
@@ -137,6 +165,8 @@ async def prepare_transcript_processing(validation: ValidationOk) -> PrepareResu
            bucket_name=bucket_name,  # type: ignore (validated above)
            track_keys=track_keys,
            transcript_id=validation.transcript_id,
+            recording_id=recording_id,
+            room_id=validation.room_id,
        )

    return FileProcessingConfig(
@@ -144,8 +174,115 @@ async def prepare_transcript_processing(validation: ValidationOk) -> PrepareResu
    )


-def dispatch_transcript_processing(config: ProcessingConfig) -> AsyncResult:
+async def dispatch_transcript_processing(
+    config: ProcessingConfig, force: bool = False
+) -> AsyncResult | None:
+    """Dispatch transcript processing to appropriate backend (Hatchet or Celery).
+
+    Returns AsyncResult for Celery tasks, None for Hatchet workflows.
+    """
    if isinstance(config, MultitrackProcessingConfig):
+        # Check if room has use_hatchet=True (overrides env vars)
+        room_forces_hatchet = False
+        if config.room_id:
+            room = await rooms_controller.get_by_id(config.room_id)
+            room_forces_hatchet = room.use_hatchet if room else False
+
+        # Start durable workflow if enabled (Hatchet)
+        # or if room has use_hatchet=True
+        use_hatchet = settings.HATCHET_ENABLED or room_forces_hatchet
+
+        if room_forces_hatchet:
+            logger.info(
+                "Room forces Hatchet workflow",
+                room_id=config.room_id,
+                transcript_id=config.transcript_id,
+            )
+
+        if use_hatchet:
+            # First check if we can replay (outside transaction since it's read-only)
+            transcript = await transcripts_controller.get_by_id(config.transcript_id)
+            if transcript and transcript.workflow_run_id and not force:
+                can_replay = await HatchetClientManager.can_replay(
+                    transcript.workflow_run_id
+                )
+                if can_replay:
+                    await HatchetClientManager.replay_workflow(
+                        transcript.workflow_run_id
+                    )
+                    logger.info(
+                        "Replaying Hatchet workflow",
+                        workflow_id=transcript.workflow_run_id,
+                    )
+                    return None
+                else:
+                    # Workflow exists but can't replay (CANCELLED, COMPLETED, etc.)
+                    # Log and proceed to start new workflow
+                    status = await HatchetClientManager.get_workflow_run_status(
+                        transcript.workflow_run_id
+                    )
+                    logger.info(
+                        "Old workflow not replayable, starting new",
+                        old_workflow_id=transcript.workflow_run_id,
+                        old_status=status.value,
+                    )
+
+            # Force: cancel old workflow if exists
+            if force and transcript and transcript.workflow_run_id:
+                await HatchetClientManager.cancel_workflow(transcript.workflow_run_id)
+                logger.info(
+                    "Cancelled old workflow (--force)",
+                    workflow_id=transcript.workflow_run_id,
+                )
+                await transcripts_controller.update(
+                    transcript, {"workflow_run_id": None}
+                )
+
+            # Re-fetch and check for concurrent dispatch (optimistic approach).
+            # No database lock - worst case is duplicate dispatch, but Hatchet
+            # workflows are idempotent so this is acceptable.
+            transcript = await transcripts_controller.get_by_id(config.transcript_id)
+            if transcript and transcript.workflow_run_id:
+                # Another process started a workflow between validation and now
+                try:
+                    status = await HatchetClientManager.get_workflow_run_status(
+                        transcript.workflow_run_id
+                    )
+                    if status in (V1TaskStatus.RUNNING, V1TaskStatus.QUEUED):
+                        logger.info(
+                            "Concurrent workflow detected, skipping dispatch",
+                            workflow_id=transcript.workflow_run_id,
+                        )
+                        return None
+                except ApiException:
+                    # Workflow might be gone (404) or API issue - proceed with new workflow
+                    pass
+
+            workflow_id = await HatchetClientManager.start_workflow(
+                workflow_name="DiarizationPipeline",
+                input_data={
+                    "recording_id": config.recording_id,
+                    "tracks": [{"s3_key": k} for k in config.track_keys],
+                    "bucket_name": config.bucket_name,
+                    "transcript_id": config.transcript_id,
+                    "room_id": config.room_id,
+                },
+                additional_metadata={
+                    "transcript_id": config.transcript_id,
+                    "recording_id": config.recording_id,
+                    "daily_recording_id": config.recording_id,
+                },
+            )
+
+            if transcript:
+                await transcripts_controller.update(
+                    transcript, {"workflow_run_id": workflow_id}
+                )
+
+            logger.info("Hatchet workflow dispatched", workflow_id=workflow_id)
+            return None
+
+        # Celery pipeline (durable workflows disabled)
        return task_pipeline_multitrack_process.delay(
            transcript_id=config.transcript_id,
            bucket_name=config.bucket_name,