fix: improve hatchet workflow reliability (#900)

* Increase max connections

* Classify hard and transient hatchet errors

* Fan out partial success

* Force reprocessing of error transcripts

* Stop retrying on 402 payment required

* Avoid httpx/hatchet timeout race

* Add retry wrapper to get_response for transient errors

* Add retry backoff

* Return falsy results so get_response won't retry on empty string

* Skip error status in on_workflow_failure when transcript already ended

* Fix precommit issues

* Fail step on first fan-out failure instead of skipping
This commit is contained in:
Sergey Mankovsky
2026-03-06 17:07:26 +01:00
committed by GitHub
parent a682846645
commit c155f66982
17 changed files with 717 additions and 38 deletions

View File

@@ -231,3 +231,81 @@ async def test_dailyco_recording_uses_multitrack_pipeline(client):
{"s3_key": k} for k in track_keys
]
mock_file_pipeline.delay.assert_not_called()
@pytest.mark.usefixtures("setup_database")
@pytest.mark.asyncio
async def test_reprocess_error_transcript_passes_force(client):
    """When transcript status is 'error', reprocess passes force=True to start fresh workflow."""
    from datetime import datetime, timezone

    from reflector.db.recordings import Recording, recordings_controller
    from reflector.db.rooms import rooms_controller
    from reflector.db.transcripts import transcripts_controller

    # Room whose cloud recording backs the transcript under test.
    room = await rooms_controller.add(
        name="test-room",
        user_id="test-user",
        zulip_auto_post=False,
        zulip_stream="",
        zulip_topic="",
        is_locked=False,
        room_mode="normal",
        recording_type="cloud",
        recording_trigger="automatic-2nd-participant",
        is_shared=False,
    )
    transcript = await transcripts_controller.add(
        "",
        source_kind="room",
        source_language="en",
        target_language="en",
        user_id="test-user",
        share_mode="public",
        room_id=room.id,
    )
    recording = await recordings_controller.create(
        Recording(
            bucket_name="daily-bucket",
            object_key="recordings/test-room",
            meeting_id="test-meeting",
            track_keys=["recordings/test-room/track1.webm"],
            recorded_at=datetime.now(timezone.utc),
        )
    )
    # Drive the transcript into the 'error' state, pointing at a stale run.
    await transcripts_controller.update(
        transcript,
        {
            "recording_id": recording.id,
            "status": "error",
            "workflow_run_id": "old-failed-run",
        },
    )

    celery_patch = patch(
        "reflector.services.transcript_process.task_is_scheduled_or_active"
    )
    hatchet_patch = patch(
        "reflector.services.transcript_process.HatchetClientManager"
    )
    dispatch_patch = patch(
        "reflector.views.transcripts_process.dispatch_transcript_processing",
        new_callable=AsyncMock,
    )
    with (
        celery_patch as mock_celery,
        hatchet_patch as mock_hatchet,
        dispatch_patch as mock_dispatch,
    ):
        from hatchet_sdk.clients.rest.models import V1TaskStatus

        # No Celery task pending; the previous Hatchet run ended FAILED.
        mock_celery.return_value = False
        mock_hatchet.get_workflow_run_status = AsyncMock(
            return_value=V1TaskStatus.FAILED
        )

        response = await client.post(f"/transcripts/{transcript.id}/process")

    assert response.status_code == 200
    mock_dispatch.assert_called_once()
    # A failed run must be restarted from scratch, not resumed.
    assert mock_dispatch.call_args.kwargs["force"] is True