mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2026-03-21 22:56:47 +00:00
130 lines
4.8 KiB
Python
130 lines
4.8 KiB
Python
"""
|
|
Integration test: Multitrack → DailyMultitrackPipeline → full processing.
|
|
|
|
Exercises: S3 upload → DB recording setup → process endpoint →
|
|
Hatchet DiarizationPipeline → mock Daily API → whisper per-track transcription →
|
|
diarization → mixdown → LLM summarization/topics → status "ended".
|
|
"""
|
|
|
|
import json
|
|
from datetime import datetime, timezone
|
|
|
|
import pytest
|
|
from sqlalchemy import text
|
|
|
|
# Daily names each raw track object
# {recording_start_ts}-{participant_uuid}-cam-audio-{track_start_ts}; the two
# participant UUIDs below must stay in sync with mock_daily_server.py.
PARTICIPANT_A_ID = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
PARTICIPANT_B_ID = "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb"
TRACK_KEYS = [
    f"1700000000000-{participant_id}-cam-audio-1700000001000"
    for participant_id in (PARTICIPANT_A_ID, PARTICIPANT_B_ID)
]
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_multitrack_pipeline_end_to_end(
    api_client,
    s3_client,
    db_engine,
    test_records_dir,
    bucket_name,
    poll_transcript_status,
):
    """Drive a multitrack recording through the whole pipeline and check the results.

    Stages: seed two audio tracks in S3, create a transcript, wire a Recording
    row to it in the DB, hit the process endpoint, poll to completion, then
    assert the core outputs (title, summaries, topics, participants) exist.
    """
    # Step 1: push the same sample audio into Garage S3 once per track key.
    audio_path = test_records_dir / "test_short.wav"
    assert audio_path.exists(), f"Test audio file not found: {audio_path}"

    for key in TRACK_KEYS:
        s3_client.upload_file(str(audio_path), bucket_name, key)

    # Step 2: create the transcript through the public API.
    resp = await api_client.post(
        "/transcripts",
        json={"name": "integration-multitrack-test"},
    )
    assert resp.status_code == 200, f"Failed to create transcript: {resp.text}"
    transcript_id = resp.json()["id"]

    # Step 3: insert a Recording row directly in the DB (test-only setup with
    # no API equivalent) and attach it to the transcript.
    recording_id = f"rec-integration-{transcript_id[:8]}"
    recorded_at = datetime.now(timezone.utc)

    async with db_engine.begin() as conn:
        # The recording row carries all multitrack S3 keys as a JSON array.
        await conn.execute(
            text("""
                INSERT INTO recording (id, bucket_name, object_key, recorded_at, status, track_keys)
                VALUES (:id, :bucket_name, :object_key, :recorded_at, :status, CAST(:track_keys AS json))
            """),
            {
                "id": recording_id,
                "bucket_name": bucket_name,
                "object_key": TRACK_KEYS[0],
                "recorded_at": recorded_at,
                "status": "completed",
                "track_keys": json.dumps(TRACK_KEYS),
            },
        )

        # Point the transcript at the recording and mark it ready to process.
        await conn.execute(
            text("""
                UPDATE transcript
                SET recording_id = :recording_id, status = 'uploaded'
                WHERE id = :transcript_id
            """),
            {
                "recording_id": recording_id,
                "transcript_id": transcript_id,
            },
        )

    # Step 4: kick off the pipeline via the process endpoint.
    resp = await api_client.post(f"/transcripts/{transcript_id}/process")
    assert resp.status_code == 200, f"Process trigger failed: {resp.text}"

    # Step 5: wait for a terminal status. "error" is tolerated because
    # non-critical steps (e.g. action_items) can fail on LLM parsing
    # flakiness while the core results (transcript, summaries) still land.
    # The pipeline will call mock-daily for get_recording / get_participants.
    data = await poll_transcript_status(
        api_client, transcript_id, target=("ended", "error"), max_wait=300
    )

    # Step 6: core outputs must exist regardless of the terminal status.
    assert data.get("title") and len(data["title"]) > 0, "Title should be non-empty"
    assert (
        data.get("long_summary") and len(data["long_summary"]) > 0
    ), "Long summary should be non-empty"
    assert (
        data.get("short_summary") and len(data["short_summary"]) > 0
    ), "Short summary should be non-empty"

    # Topics live behind their own endpoint.
    topics_resp = await api_client.get(f"/transcripts/{transcript_id}/topics")
    assert topics_resp.status_code == 200, f"Failed to get topics: {topics_resp.text}"
    topics = topics_resp.json()
    assert len(topics) >= 1, "Should have at least 1 topic"
    for topic in topics:
        assert topic.get("title"), "Each topic should have a title"
        assert topic.get("summary"), "Each topic should have a summary"

    # Participants likewise come from a dedicated endpoint; multitrack input
    # should yield one speaker per uploaded track.
    participants_resp = await api_client.get(
        f"/transcripts/{transcript_id}/participants"
    )
    assert (
        participants_resp.status_code == 200
    ), f"Failed to get participants: {participants_resp.text}"
    participants = participants_resp.json()
    assert (
        len(participants) >= 2
    ), f"Expected at least 2 speakers for multitrack, got {len(participants)}"
|