# reflector/server/tests/test_transcripts_process.py

from unittest.mock import AsyncMock, patch

import pytest
from httpx import ASGITransport, AsyncClient

from reflector.settings import settings


@pytest.fixture
async def app_lifespan():
    """Yield the Reflector ASGI app while its lifespan is being managed.

    The app is wrapped in ``asgi_lifespan.LifespanManager`` for the duration
    of the fixture; the manager is exited when the fixture is torn down.
    """
    from asgi_lifespan import LifespanManager

    from reflector.app import app as asgi_app

    lifespan = LifespanManager(asgi_app)
    async with lifespan as running:
        # Hand out the manager-wrapped app rather than the raw import.
        yield running.app


@pytest.fixture
async def client(app_lifespan):
    """Yield an ``httpx.AsyncClient`` bound in-process to the ASGI app.

    Requests are routed through ``ASGITransport`` (no network) and relative
    URLs are resolved against ``http://test/v1``.

    The client is used as an async context manager so that it (and its
    transport) is closed on fixture teardown instead of being leaked.
    """
    async with AsyncClient(
        transport=ASGITransport(app=app_lifespan),
        base_url="http://test/v1",
    ) as http_client:
        yield http_client


@pytest.mark.usefixtures("setup_database")
@pytest.mark.asyncio
async def test_transcript_process(
    tmpdir,
    dummy_llm,
    dummy_processors,
    dummy_file_transcript,
    dummy_file_diarization,
    dummy_storage,
    client,
    monkeypatch,
    mock_hatchet_client,
):
    """Test upload + process dispatch via Hatchet.

    The file pipeline is now dispatched to Hatchet (fire-and-forget),
    so we verify the workflow was triggered rather than polling for completion.

    Steps exercised:
      1. create a transcript and check it starts in status ``"idle"``;
      2. upload a single-chunk recording and check one ``FilePipeline``
         dispatch plus the ``"uploaded"`` status;
      3. clear ``workflow_run_id`` and call ``/process``, checking a second
         ``FilePipeline`` dispatch.

    Several fixture parameters are not referenced in the body; they are
    requested for their setup side effects (presumably stubbing processors,
    storage and the Hatchet client -- confirm against conftest).
    """
    monkeypatch.setattr(settings, "PUBLIC_MODE", True)

    # create a transcript
    response = await client.post("/transcripts", json={"name": "test"})
    assert response.status_code == 200
    assert response.json()["status"] == "idle"
    tid = response.json()["id"]

    # Upload the recording as a single chunk. The sample is a WAV file that is
    # sent with an "audio/mpeg" content type, as before. The handle is opened
    # in a ``with`` block so it is closed once the request has completed.
    with open("tests/records/test_short.wav", "rb") as audio_file:
        response = await client.post(
            f"/transcripts/{tid}/record/upload?chunk_number=0&total_chunks=1",
            files={
                "chunk": (
                    "test_short.wav",
                    audio_file,
                    "audio/mpeg",
                ),
            },
        )
    assert response.status_code == 200
    assert response.json()["status"] == "ok"

    # Verify Hatchet workflow was dispatched (from upload endpoint)
    from reflector.hatchet.client import HatchetClientManager

    HatchetClientManager.start_workflow.assert_called_once_with(
        "FilePipeline",
        {"transcript_id": tid},
        additional_metadata={"transcript_id": tid},
    )

    # Verify transcript status was set to "uploaded"
    resp = await client.get(f"/transcripts/{tid}")
    assert resp.status_code == 200
    assert resp.json()["status"] == "uploaded"

    # Reset mock so the reprocess step can assert exactly one new call
    HatchetClientManager.start_workflow.reset_mock()

    # Clear workflow_run_id so /process endpoint can dispatch again
    from reflector.db.transcripts import transcripts_controller

    transcript = await transcripts_controller.get_by_id(tid)
    await transcripts_controller.update(transcript, {"workflow_run_id": None})

    # Reprocess via /process endpoint, with the "already scheduled/active"
    # check forced to report that nothing is running.
    with patch(
        "reflector.services.transcript_process.task_is_scheduled_or_active",
        return_value=False,
    ):
        response = await client.post(f"/transcripts/{tid}/process")
        assert response.status_code == 200
        assert response.json()["status"] == "ok"

    # Verify second Hatchet dispatch (from /process endpoint). This call site
    # is checked via keyword arguments, unlike the positional upload dispatch.
    HatchetClientManager.start_workflow.assert_called_once()
    call_kwargs = HatchetClientManager.start_workflow.call_args.kwargs
    assert call_kwargs["workflow_name"] == "FilePipeline"
    assert call_kwargs["input_data"]["transcript_id"] == tid


@pytest.mark.usefixtures("setup_database")
@pytest.mark.asyncio
async def test_whereby_recording_uses_file_pipeline(monkeypatch, client):
    """A recording with a bucket_name but no track_keys (Whereby) goes to FilePipeline."""
    from datetime import datetime, timezone

    from reflector.db.recordings import Recording, recordings_controller
    from reflector.db.transcripts import transcripts_controller
    from reflector.settings import settings

    # public mode: allow anonymous transcript creation for this test
    monkeypatch.setattr(settings, "PUBLIC_MODE", True)

    # Room-sourced transcript that the recording will be attached to.
    transcript = await transcripts_controller.add(
        "",
        source_kind="room",
        source_language="en",
        target_language="en",
        user_id="test-user",
        share_mode="public",
    )

    # Whereby-style recording: bucket_name is set, track_keys is None.
    whereby_recording = Recording(
        bucket_name="whereby-bucket",
        object_key="test-recording.mp4",  # gitleaks:allow
        meeting_id="test-meeting",
        recorded_at=datetime.now(timezone.utc),
        track_keys=None,
    )
    recording = await recordings_controller.create(whereby_recording)

    linked_fields = {"recording_id": recording.id, "status": "uploaded"}
    await transcripts_controller.update(transcript, linked_fields)

    is_active_target = (
        "reflector.services.transcript_process.task_is_scheduled_or_active"
    )
    hatchet_target = "reflector.services.transcript_process.HatchetClientManager"

    with (
        patch(is_active_target, return_value=False),
        patch(hatchet_target) as hatchet_mock,
    ):
        hatchet_mock.start_workflow = AsyncMock(return_value="test-workflow-id")
        response = await client.post(f"/transcripts/{transcript.id}/process")

    assert response.status_code == 200
    assert response.json()["status"] == "ok"

    # Whereby recordings should use Hatchet FilePipeline
    start_workflow = hatchet_mock.start_workflow
    start_workflow.assert_called_once()
    dispatched = start_workflow.call_args.kwargs
    assert dispatched["workflow_name"] == "FilePipeline"
    assert dispatched["input_data"]["transcript_id"] == transcript.id


@pytest.mark.usefixtures("setup_database")
@pytest.mark.asyncio
async def test_dailyco_recording_uses_multitrack_pipeline(monkeypatch, client):
    """A recording with bucket_name and track_keys (Daily.co) goes to DiarizationPipeline."""
    from datetime import datetime, timezone

    from reflector.db.recordings import Recording, recordings_controller
    from reflector.db.rooms import rooms_controller
    from reflector.db.transcripts import transcripts_controller
    from reflector.settings import settings

    # public mode: allow anonymous transcript creation for this test
    monkeypatch.setattr(settings, "PUBLIC_MODE", True)

    room = await rooms_controller.add(
        name="test-room",
        user_id="test-user",
        zulip_auto_post=False,
        zulip_stream="",
        zulip_topic="",
        is_locked=False,
        room_mode="normal",
        recording_type="cloud",
        recording_trigger="automatic-2nd-participant",
        is_shared=False,
    )

    # Room-sourced transcript tied to the room created above.
    transcript = await transcripts_controller.add(
        "",
        source_kind="room",
        source_language="en",
        target_language="en",
        user_id="test-user",
        share_mode="public",
        room_id=room.id,
    )

    # Two per-participant tracks mark this as a multitrack recording.
    track_keys = [
        "recordings/test-room/track1.webm",
        "recordings/test-room/track2.webm",
    ]
    expected_tracks = [{"s3_key": key} for key in track_keys]

    multitrack_recording = Recording(
        bucket_name="daily-bucket",
        object_key="recordings/test-room",
        meeting_id="test-meeting",
        track_keys=track_keys,
        recorded_at=datetime.now(timezone.utc),
    )
    recording = await recordings_controller.create(multitrack_recording)

    linked_fields = {"recording_id": recording.id, "status": "uploaded"}
    await transcripts_controller.update(transcript, linked_fields)

    is_active_target = (
        "reflector.services.transcript_process.task_is_scheduled_or_active"
    )
    hatchet_target = "reflector.services.transcript_process.HatchetClientManager"

    with (
        patch(is_active_target, return_value=False),
        patch(hatchet_target) as hatchet_mock,
    ):
        hatchet_mock.start_workflow = AsyncMock(return_value="test-workflow-id")
        response = await client.post(f"/transcripts/{transcript.id}/process")

    assert response.status_code == 200
    assert response.json()["status"] == "ok"

    # Daily.co multitrack recordings should use Hatchet DiarizationPipeline
    start_workflow = hatchet_mock.start_workflow
    start_workflow.assert_called_once()
    dispatched = start_workflow.call_args.kwargs
    assert dispatched["workflow_name"] == "DiarizationPipeline"

    payload = dispatched["input_data"]
    assert payload["transcript_id"] == transcript.id
    assert payload["bucket_name"] == "daily-bucket"
    assert payload["tracks"] == expected_tracks


@pytest.mark.usefixtures("setup_database")
@pytest.mark.asyncio
async def test_reprocess_error_transcript_passes_force(monkeypatch, client):
    """Reprocessing a transcript in status 'error' calls the dispatcher with force=True."""
    from datetime import datetime, timezone

    from hatchet_sdk.clients.rest.models import V1TaskStatus

    from reflector.db.recordings import Recording, recordings_controller
    from reflector.db.rooms import rooms_controller
    from reflector.db.transcripts import transcripts_controller
    from reflector.settings import settings

    # public mode: allow anonymous transcript creation for this test
    monkeypatch.setattr(settings, "PUBLIC_MODE", True)

    room = await rooms_controller.add(
        name="test-room",
        user_id="test-user",
        zulip_auto_post=False,
        zulip_stream="",
        zulip_topic="",
        is_locked=False,
        room_mode="normal",
        recording_type="cloud",
        recording_trigger="automatic-2nd-participant",
        is_shared=False,
    )

    transcript = await transcripts_controller.add(
        "",
        source_kind="room",
        source_language="en",
        target_language="en",
        user_id="test-user",
        share_mode="public",
        room_id=room.id,
    )

    track_keys = ["recordings/test-room/track1.webm"]
    failed_recording = Recording(
        bucket_name="daily-bucket",
        object_key="recordings/test-room",
        meeting_id="test-meeting",
        track_keys=track_keys,
        recorded_at=datetime.now(timezone.utc),
    )
    recording = await recordings_controller.create(failed_recording)

    # Put the transcript in the errored state with a stale workflow run id.
    errored_fields = {
        "recording_id": recording.id,
        "status": "error",
        "workflow_run_id": "old-failed-run",
    }
    await transcripts_controller.update(transcript, errored_fields)

    is_active_target = (
        "reflector.services.transcript_process.task_is_scheduled_or_active"
    )
    hatchet_target = "reflector.services.transcript_process.HatchetClientManager"
    dispatch_target = (
        "reflector.views.transcripts_process.dispatch_transcript_processing"
    )

    with (
        patch(is_active_target, return_value=False),
        patch(hatchet_target) as hatchet_mock,
        patch(dispatch_target, new_callable=AsyncMock) as dispatch_mock,
    ):
        # The previous workflow run is reported as FAILED.
        hatchet_mock.get_workflow_run_status = AsyncMock(
            return_value=V1TaskStatus.FAILED
        )
        response = await client.post(f"/transcripts/{transcript.id}/process")

    assert response.status_code == 200
    dispatch_mock.assert_called_once()
    dispatch_kwargs = dispatch_mock.call_args.kwargs
    assert dispatch_kwargs["force"] is True