test: full integration tests (#916)

* test: full integration tests

* fix: add env vars as secrets in CI
This commit is contained in:
Juan Diego García
2026-03-18 15:29:21 -05:00
committed by GitHub
parent a9200d35bf
commit 9a2f973a2e
16 changed files with 1098 additions and 3 deletions

View File

@@ -0,0 +1,129 @@
"""
Integration test: Multitrack → DailyMultitrackPipeline → full processing.
Exercises: S3 upload → DB recording setup → process endpoint →
Hatchet DiarizationPipeline → mock Daily API → whisper per-track transcription →
diarization → mixdown → LLM summarization/topics → status "ended".
"""
import json
from datetime import datetime, timezone
import pytest
from sqlalchemy import text
# Must match Daily's filename format: {recording_start_ts}-{participant_uuid}-cam-audio-{track_start_ts}
# These UUIDs must match mock_daily_server.py participant IDs
PARTICIPANT_A_ID = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
PARTICIPANT_B_ID = "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb"
# One S3 object key per participant audio track, in Daily's naming scheme.
TRACK_KEYS = [
    f"1700000000000-{participant_id}-cam-audio-1700000001000"
    for participant_id in (PARTICIPANT_A_ID, PARTICIPANT_B_ID)
]
@pytest.mark.asyncio
async def test_multitrack_pipeline_end_to_end(
    api_client,
    s3_client,
    db_engine,
    test_records_dir,
    bucket_name,
    poll_transcript_status,
):
    """End-to-end multitrack flow: seed S3 and the DB, trigger processing via the
    API, then verify the pipeline produced a title, summaries, topics, and at
    least two participants."""
    # Step 1: upload the same fixture audio once per participant track key.
    audio_path = test_records_dir / "test_short.wav"
    assert audio_path.exists(), f"Test audio file not found: {audio_path}"
    audio_file = str(audio_path)
    for key in TRACK_KEYS:
        s3_client.upload_file(audio_file, bucket_name, key)

    # Step 2: create the transcript through the public API.
    create_resp = await api_client.post(
        "/transcripts",
        json={"name": "integration-multitrack-test"},
    )
    assert (
        create_resp.status_code == 200
    ), f"Failed to create transcript: {create_resp.text}"
    transcript_id = create_resp.json()["id"]

    # Step 3: seed a Recording row and link it to the transcript directly in
    # the database (no API exists for this fixture setup).
    recording_id = f"rec-integration-{transcript_id[:8]}"
    now = datetime.now(timezone.utc)
    recording_row = {
        "id": recording_id,
        "bucket_name": bucket_name,
        "object_key": TRACK_KEYS[0],
        "recorded_at": now,
        "status": "completed",
        "track_keys": json.dumps(TRACK_KEYS),
    }
    link_params = {
        "recording_id": recording_id,
        "transcript_id": transcript_id,
    }
    async with db_engine.begin() as conn:
        # Insert the recording with its multitrack key list.
        await conn.execute(
            text("""
                INSERT INTO recording (id, bucket_name, object_key, recorded_at, status, track_keys)
                VALUES (:id, :bucket_name, :object_key, :recorded_at, :status, CAST(:track_keys AS json))
            """),
            recording_row,
        )
        # Attach the recording and mark the transcript ready for processing.
        await conn.execute(
            text("""
                UPDATE transcript
                SET recording_id = :recording_id, status = 'uploaded'
                WHERE id = :transcript_id
            """),
            link_params,
        )

    # Step 4: kick off processing.
    process_resp = await api_client.post(f"/transcripts/{transcript_id}/process")
    assert process_resp.status_code == 200, f"Process trigger failed: {process_resp.text}"

    # Step 5: wait for a terminal status. "error" is tolerated because
    # non-critical steps (e.g. action_items) can fail on LLM flakiness while
    # the core artifacts still get written.
    data = await poll_transcript_status(
        api_client, transcript_id, target=("ended", "error"), max_wait=300
    )

    # Step 6: core results must be present regardless of final status.
    assert data.get("title") and len(data["title"]) > 0, "Title should be non-empty"
    for field, label in (("long_summary", "Long"), ("short_summary", "Short")):
        value = data.get(field)
        assert value and len(value) > 0, f"{label} summary should be non-empty"

    # Topics live behind their own endpoint.
    topics_resp = await api_client.get(f"/transcripts/{transcript_id}/topics")
    assert topics_resp.status_code == 200, f"Failed to get topics: {topics_resp.text}"
    topics = topics_resp.json()
    assert len(topics) >= 1, "Should have at least 1 topic"
    for entry in topics:
        assert entry.get("title"), "Each topic should have a title"
        assert entry.get("summary"), "Each topic should have a summary"

    # Participants live behind their own endpoint as well; multitrack input
    # should yield one speaker per uploaded track.
    participants_resp = await api_client.get(
        f"/transcripts/{transcript_id}/participants"
    )
    assert (
        participants_resp.status_code == 200
    ), f"Failed to get participants: {participants_resp.text}"
    participants = participants_resp.json()
    assert (
        len(participants) >= 2
    ), f"Expected at least 2 speakers for multitrack, got {len(participants)}"