Files
reflector/server/tests/test_transcripts_process.py
Juan Diego García 37a1f01850 feat: migrate file and live post-processing pipelines from Celery to Hatchet workflow engine (#911)
* feat: migrate file and live post-processing pipelines from Celery to Hatchet workflow engine

* fix: always force reprocessing

* fix: ci tests with live pipelines

* fix: ci tests with live pipelines
2026-03-16 16:07:16 -05:00

331 lines
11 KiB
Python

from unittest.mock import AsyncMock, patch
import pytest
from httpx import ASGITransport, AsyncClient
from reflector.settings import settings
@pytest.fixture
async def app_lifespan():
from asgi_lifespan import LifespanManager
from reflector.app import app
async with LifespanManager(app) as manager:
yield manager.app
@pytest.fixture
async def client(app_lifespan):
yield AsyncClient(
transport=ASGITransport(app=app_lifespan),
base_url="http://test/v1",
)
@pytest.mark.usefixtures("setup_database")
@pytest.mark.asyncio
async def test_transcript_process(
tmpdir,
dummy_llm,
dummy_processors,
dummy_file_transcript,
dummy_file_diarization,
dummy_storage,
client,
monkeypatch,
mock_hatchet_client,
):
"""Test upload + process dispatch via Hatchet.
The file pipeline is now dispatched to Hatchet (fire-and-forget),
so we verify the workflow was triggered rather than polling for completion.
"""
monkeypatch.setattr(settings, "PUBLIC_MODE", True)
# create a transcript
response = await client.post("/transcripts", json={"name": "test"})
assert response.status_code == 200
assert response.json()["status"] == "idle"
tid = response.json()["id"]
# upload mp3
response = await client.post(
f"/transcripts/{tid}/record/upload?chunk_number=0&total_chunks=1",
files={
"chunk": (
"test_short.wav",
open("tests/records/test_short.wav", "rb"),
"audio/mpeg",
),
},
)
assert response.status_code == 200
assert response.json()["status"] == "ok"
# Verify Hatchet workflow was dispatched (from upload endpoint)
from reflector.hatchet.client import HatchetClientManager
HatchetClientManager.start_workflow.assert_called_once_with(
"FilePipeline",
{"transcript_id": tid},
additional_metadata={"transcript_id": tid},
)
# Verify transcript status was set to "uploaded"
resp = await client.get(f"/transcripts/{tid}")
assert resp.status_code == 200
assert resp.json()["status"] == "uploaded"
# Reset mock for reprocess test
HatchetClientManager.start_workflow.reset_mock()
# Clear workflow_run_id so /process endpoint can dispatch again
from reflector.db.transcripts import transcripts_controller
transcript = await transcripts_controller.get_by_id(tid)
await transcripts_controller.update(transcript, {"workflow_run_id": None})
# Reprocess via /process endpoint
with patch(
"reflector.services.transcript_process.task_is_scheduled_or_active",
return_value=False,
):
response = await client.post(f"/transcripts/{tid}/process")
assert response.status_code == 200
assert response.json()["status"] == "ok"
# Verify second Hatchet dispatch (from /process endpoint)
HatchetClientManager.start_workflow.assert_called_once()
call_kwargs = HatchetClientManager.start_workflow.call_args.kwargs
assert call_kwargs["workflow_name"] == "FilePipeline"
assert call_kwargs["input_data"]["transcript_id"] == tid
@pytest.mark.usefixtures("setup_database")
@pytest.mark.asyncio
async def test_whereby_recording_uses_file_pipeline(monkeypatch, client):
"""Test that Whereby recordings (bucket_name but no track_keys) use file pipeline"""
from datetime import datetime, timezone
from reflector.db.recordings import Recording, recordings_controller
from reflector.db.transcripts import transcripts_controller
from reflector.settings import settings
monkeypatch.setattr(
settings, "PUBLIC_MODE", True
) # public mode: allow anonymous transcript creation for this test
# Create transcript with Whereby recording (has bucket_name, no track_keys)
transcript = await transcripts_controller.add(
"",
source_kind="room",
source_language="en",
target_language="en",
user_id="test-user",
share_mode="public",
)
recording = await recordings_controller.create(
Recording(
bucket_name="whereby-bucket",
object_key="test-recording.mp4", # gitleaks:allow
meeting_id="test-meeting",
recorded_at=datetime.now(timezone.utc),
track_keys=None, # Whereby recordings have no track_keys
)
)
await transcripts_controller.update(
transcript, {"recording_id": recording.id, "status": "uploaded"}
)
with (
patch(
"reflector.services.transcript_process.task_is_scheduled_or_active",
return_value=False,
),
patch(
"reflector.services.transcript_process.HatchetClientManager"
) as mock_hatchet,
):
mock_hatchet.start_workflow = AsyncMock(return_value="test-workflow-id")
response = await client.post(f"/transcripts/{transcript.id}/process")
assert response.status_code == 200
assert response.json()["status"] == "ok"
# Whereby recordings should use Hatchet FilePipeline
mock_hatchet.start_workflow.assert_called_once()
call_kwargs = mock_hatchet.start_workflow.call_args.kwargs
assert call_kwargs["workflow_name"] == "FilePipeline"
assert call_kwargs["input_data"]["transcript_id"] == transcript.id
@pytest.mark.usefixtures("setup_database")
@pytest.mark.asyncio
async def test_dailyco_recording_uses_multitrack_pipeline(monkeypatch, client):
"""Test that Daily.co recordings (bucket_name + track_keys) use multitrack pipeline"""
from datetime import datetime, timezone
from reflector.db.recordings import Recording, recordings_controller
from reflector.db.rooms import rooms_controller
from reflector.db.transcripts import transcripts_controller
from reflector.settings import settings
monkeypatch.setattr(
settings, "PUBLIC_MODE", True
) # public mode: allow anonymous transcript creation for this test
room = await rooms_controller.add(
name="test-room",
user_id="test-user",
zulip_auto_post=False,
zulip_stream="",
zulip_topic="",
is_locked=False,
room_mode="normal",
recording_type="cloud",
recording_trigger="automatic-2nd-participant",
is_shared=False,
)
transcript = await transcripts_controller.add(
"",
source_kind="room",
source_language="en",
target_language="en",
user_id="test-user",
share_mode="public",
room_id=room.id,
)
track_keys = [
"recordings/test-room/track1.webm",
"recordings/test-room/track2.webm",
]
recording = await recordings_controller.create(
Recording(
bucket_name="daily-bucket",
object_key="recordings/test-room",
meeting_id="test-meeting",
track_keys=track_keys,
recorded_at=datetime.now(timezone.utc),
)
)
await transcripts_controller.update(
transcript, {"recording_id": recording.id, "status": "uploaded"}
)
with (
patch(
"reflector.services.transcript_process.task_is_scheduled_or_active",
return_value=False,
),
patch(
"reflector.services.transcript_process.HatchetClientManager"
) as mock_hatchet,
):
mock_hatchet.start_workflow = AsyncMock(return_value="test-workflow-id")
response = await client.post(f"/transcripts/{transcript.id}/process")
assert response.status_code == 200
assert response.json()["status"] == "ok"
# Daily.co multitrack recordings should use Hatchet DiarizationPipeline
mock_hatchet.start_workflow.assert_called_once()
call_kwargs = mock_hatchet.start_workflow.call_args.kwargs
assert call_kwargs["workflow_name"] == "DiarizationPipeline"
assert call_kwargs["input_data"]["transcript_id"] == transcript.id
assert call_kwargs["input_data"]["bucket_name"] == "daily-bucket"
assert call_kwargs["input_data"]["tracks"] == [
{"s3_key": k} for k in track_keys
]
@pytest.mark.usefixtures("setup_database")
@pytest.mark.asyncio
async def test_reprocess_error_transcript_passes_force(monkeypatch, client):
"""When transcript status is 'error', reprocess passes force=True to start fresh workflow."""
from datetime import datetime, timezone
from reflector.db.recordings import Recording, recordings_controller
from reflector.db.rooms import rooms_controller
from reflector.db.transcripts import transcripts_controller
from reflector.settings import settings
monkeypatch.setattr(
settings, "PUBLIC_MODE", True
) # public mode: allow anonymous transcript creation for this test
room = await rooms_controller.add(
name="test-room",
user_id="test-user",
zulip_auto_post=False,
zulip_stream="",
zulip_topic="",
is_locked=False,
room_mode="normal",
recording_type="cloud",
recording_trigger="automatic-2nd-participant",
is_shared=False,
)
transcript = await transcripts_controller.add(
"",
source_kind="room",
source_language="en",
target_language="en",
user_id="test-user",
share_mode="public",
room_id=room.id,
)
track_keys = ["recordings/test-room/track1.webm"]
recording = await recordings_controller.create(
Recording(
bucket_name="daily-bucket",
object_key="recordings/test-room",
meeting_id="test-meeting",
track_keys=track_keys,
recorded_at=datetime.now(timezone.utc),
)
)
await transcripts_controller.update(
transcript,
{
"recording_id": recording.id,
"status": "error",
"workflow_run_id": "old-failed-run",
},
)
with (
patch(
"reflector.services.transcript_process.task_is_scheduled_or_active"
) as mock_celery,
patch(
"reflector.services.transcript_process.HatchetClientManager"
) as mock_hatchet,
patch(
"reflector.views.transcripts_process.dispatch_transcript_processing",
new_callable=AsyncMock,
) as mock_dispatch,
):
mock_celery.return_value = False
from hatchet_sdk.clients.rest.models import V1TaskStatus
mock_hatchet.get_workflow_run_status = AsyncMock(
return_value=V1TaskStatus.FAILED
)
response = await client.post(f"/transcripts/{transcript.id}/process")
assert response.status_code == 200
mock_dispatch.assert_called_once()
assert mock_dispatch.call_args.kwargs["force"] is True