From e81e0cb5c3f63c67cb79c688289eff13cb7b859c Mon Sep 17 00:00:00 2001 From: Igor Loskutov Date: Tue, 16 Dec 2025 13:23:39 -0500 Subject: [PATCH] remove conductor and add hatchet tests (no-mistakes) --- CONDUCTOR_MIGRATION_REQUIREMENTS.md | 617 ------------------ TASKS.md | 125 +--- docker-compose.yml | 27 - server/CONDUCTOR_LLM_OBSERVATIONS.md | 345 ---------- ...de0e0_add_workflow_run_id_to_transcript.py | 4 +- ...26252ac554_add_workflow_id_to_recording.py | 32 - server/pyproject.toml | 3 - server/reflector/app.py | 2 - server/reflector/conductor/__init__.py | 5 - server/reflector/conductor/client.py | 40 -- server/reflector/conductor/progress.py | 103 --- server/reflector/conductor/run_workers.py | 58 -- server/reflector/conductor/shadow_compare.py | 207 ------ server/reflector/conductor/tasks/__init__.py | 6 - .../reflector/conductor/tasks/definitions.py | 161 ----- server/reflector/conductor/tasks/register.py | 60 -- .../reflector/conductor/workers/__init__.py | 37 -- .../conductor/workers/cleanup_consent.py | 126 ---- .../conductor/workers/detect_topics.py | 93 --- .../reflector/conductor/workers/finalize.py | 111 ---- .../workers/generate_dynamic_fork_tasks.py | 110 ---- .../conductor/workers/generate_summary.py | 150 ----- .../conductor/workers/generate_title.py | 111 ---- .../conductor/workers/generate_waveform.py | 106 --- .../conductor/workers/get_participants.py | 96 --- .../conductor/workers/get_recording.py | 90 --- .../conductor/workers/merge_transcripts.py | 89 --- .../conductor/workers/mixdown_tracks.py | 278 -------- .../reflector/conductor/workers/pad_track.py | 322 --------- .../reflector/conductor/workers/post_zulip.py | 101 --- .../conductor/workers/send_webhook.py | 115 ---- .../conductor/workers/transcribe_track.py | 96 --- .../workflows/diarization_pipeline.json | 205 ------ .../reflector/conductor/workflows/register.py | 74 --- server/reflector/db/recordings.py | 4 - .../reflector/services/transcript_process.py | 21 - server/reflector/settings.py | 16 +- server/reflector/views/conductor.py | 45 -- server/reflector/worker/process.py | 31 +- server/runserver.sh | 2 - server/tests/test_hatchet_client.py | 59 ++ server/tests/test_hatchet_dispatch.py | 376 +++++++++++ server/tests/test_hatchet_progress.py | 62 ++ server/uv.lock | 50 -- 44 files changed, 537 insertions(+), 4234 deletions(-) delete mode 100644 CONDUCTOR_MIGRATION_REQUIREMENTS.md delete mode 100644 server/CONDUCTOR_LLM_OBSERVATIONS.md delete mode 100644 server/migrations/versions/a326252ac554_add_workflow_id_to_recording.py delete mode 100644 server/reflector/conductor/__init__.py delete mode 100644 server/reflector/conductor/client.py delete mode 100644 server/reflector/conductor/progress.py delete mode 100644 server/reflector/conductor/run_workers.py delete mode 100644 server/reflector/conductor/shadow_compare.py delete mode 100644 server/reflector/conductor/tasks/__init__.py delete mode 100644 server/reflector/conductor/tasks/definitions.py delete mode 100644 server/reflector/conductor/tasks/register.py delete mode 100644 server/reflector/conductor/workers/__init__.py delete mode 100644 server/reflector/conductor/workers/cleanup_consent.py delete mode 100644 server/reflector/conductor/workers/detect_topics.py delete mode 100644 server/reflector/conductor/workers/finalize.py delete mode 100644 server/reflector/conductor/workers/generate_dynamic_fork_tasks.py delete mode 100644 server/reflector/conductor/workers/generate_summary.py delete mode 100644 server/reflector/conductor/workers/generate_title.py delete mode 100644 server/reflector/conductor/workers/generate_waveform.py delete mode 100644 server/reflector/conductor/workers/get_participants.py delete mode 100644 server/reflector/conductor/workers/get_recording.py delete mode 100644 server/reflector/conductor/workers/merge_transcripts.py delete mode 100644 server/reflector/conductor/workers/mixdown_tracks.py delete mode 100644 server/reflector/conductor/workers/pad_track.py delete mode 100644 server/reflector/conductor/workers/post_zulip.py delete mode 100644 server/reflector/conductor/workers/send_webhook.py delete mode 100644 server/reflector/conductor/workers/transcribe_track.py delete mode 100644 server/reflector/conductor/workflows/diarization_pipeline.json delete mode 100644 server/reflector/conductor/workflows/register.py delete mode 100644 server/reflector/views/conductor.py create mode 100644 server/tests/test_hatchet_client.py create mode 100644 server/tests/test_hatchet_dispatch.py create mode 100644 server/tests/test_hatchet_progress.py diff --git a/CONDUCTOR_MIGRATION_REQUIREMENTS.md b/CONDUCTOR_MIGRATION_REQUIREMENTS.md deleted file mode 100644 index 89fdecb4..00000000 --- a/CONDUCTOR_MIGRATION_REQUIREMENTS.md +++ /dev/null @@ -1,617 +0,0 @@ -# Conductor Migration Requirements: Daily.co Multitrack Pipeline - -## Executive Summary - -Migrate the Daily.co multitrack diarization pipeline from a monolithic Celery task to a decomposed Conductor workflow, enabling visual progress tracking, granular retries, and operational observability. - ---- - -## Business Value - -### 1. Visibility: Where Are We Now? (UX, DevEx) - -**Current State**: Users see only three states: `idle` → `processing` → `ended/error`. A 10-minute pipeline appears frozen with no feedback. - -**Target State**: Real-time visibility into which step is executing: -- "Transcribing track 2 of 3" -- "Generating summary (step 8 of 9)" -- Visual DAG in admin UI showing completed/in-progress/pending nodes - -**Business Impact**: -- Reduced support tickets ("is it stuck?") -- Engineers can instantly identify bottlenecks -- Users have confidence the system is working - -### 2. Progress Tracking: What's Left? (UX, DevEx) - -**Current State**: No indication of remaining work. A failure at step 8 gives same error as failure at step 1. - -**Target State**: -- Progress percentage based on completed steps -- Clear step enumeration (e.g., "Step 5/9: Transcription") -- Frontend receives structured progress events with step metadata - -**Business Impact**: -- Users can estimate completion time -- Frontend can render meaningful progress bars -- Error messages include context ("Failed during summary generation") - -### 3. Audit Trail & Profiling (DevEx, Ops) - -**Current State**: Logs scattered across Celery workers. No unified view of a single recording's journey. Resource consumption unknown per step. - -**Target State**: -- Single workflow ID traces entire recording lifecycle -- Per-step execution times recorded -- Resource consumption (GPU seconds, LLM tokens) attributable to specific steps -- Conductor UI provides complete audit history - -**Business Impact**: -- Debugging: "Recording X failed at step Y after Z seconds" -- Cost attribution: "Transcription costs $X, summarization costs $Y" -- Performance optimization: identify slowest steps - -### 4. Clear Event Dictionary (DevEx) - -**Current State**: Frontend receives WebSocket events (`TRANSCRIPT`, `TOPIC`, `FINAL_TITLE`, etc.) but mapping to pipeline phases is implicit. Adding new events requires tracing through Python code. - -**Target State**: -- Each Conductor task explicitly defines its output events -- Event schema documented alongside task definition -- Frontend developers can reference task→event mapping directly - -**Business Impact**: -- Faster frontend development -- Reduced miscommunication between backend/frontend teams -- Self-documenting pipeline - -### 5. Restart Without Reprocessing (UX, DevEx) - -**Current State**: Any failure restarts the entire pipeline. A timeout during summary generation re-runs transcription (wasting GPU costs). - -**Target State**: -- Failures resume from last successful step -- Completed work is checkpointed (e.g., transcription results stored before summary) -- Manual retry triggers only failed step, not entire workflow - -**Business Impact**: -- Reduced GPU/LLM costs on retries -- Faster recovery from transient failures -- Users don't wait for re-transcription on summary failures - -### 6. Per-Step Timeouts (UX, DevEx) - -**Current State**: Single task timeout for entire pipeline. A hung GPU call blocks everything. Killing the task loses all progress. - -**Target State**: -- Each step has independent timeout (e.g., transcription: 5min, LLM: 30s) -- Timeout kills only the hung step -- Pipeline can retry just that step or fail gracefully - -**Business Impact**: -- Faster detection of stuck external services -- Reduced blast radius from hung calls -- More granular SLAs per operation type - -### 7. Native Retries with Backoff (DevEx, UX) - -**Current State**: Celery retry logic is per-task, not per-external-call. Custom retry wrappers needed for each API call. - -**Target State**: -- Conductor provides native retry policies per task -- Exponential backoff configured declaratively -- Retry state visible in UI (attempt 2/5) - -**Business Impact**: -- Reduced boilerplate code -- Consistent retry behavior across all external calls -- Visibility into retry attempts for debugging - ---- - -## Current Architecture - -### Daily.co Multitrack Pipeline Flow - -``` -Daily webhook (recording.ready-to-download) Polling (every 3 min) - │ │ - ▼ ▼ - _handle_recording_ready() poll_daily_recordings() - │ │ - └──────────────┬─────────────────────────┘ - ▼ - process_multitrack_recording.delay() ← Celery task #1 - │ - ├── Daily API: GET /recordings/{id} - ├── Daily API: GET /meetings/{mtgSessionId}/participants - ├── DB: Create recording + transcript - │ - ▼ - task_pipeline_multitrack_process.delay() ← Celery task #2 (MONOLITH) - │ - │ ┌─────────────────────────────────────────────────┐ - │ │ pipeline.process() - ALL PHASES INSIDE HERE │ - │ │ │ - │ │ Phase 2: Track Padding (N tracks, sequential) │ - │ │ Phase 3: Mixdown → S3 upload │ - │ │ Phase 4: Waveform generation │ - │ │ Phase 5: Transcription (N GPU calls, serial!) │ - │ │ Phase 6: Topic Detection (C LLM calls) │ - │ │ Phase 7a: Title Generation (1 LLM call) │ - │ │ Phase 7b: Summary Generation (2+2M LLM calls) │ - │ │ Phase 8: Finalize status │ - │ └─────────────────────────────────────────────────┘ - │ - ▼ - chain(cleanup → zulip → webhook).delay() ← Celery chain (3 tasks) -``` - -### Problem: Monolithic `pipeline.process()` - -The heavy lifting happens inside a single Python function call. Celery only sees: -- Task started -- Task succeeded/failed - -It cannot see or control the 8 internal phases. - ---- - -## Target Architecture - -### Decomposed Conductor Workflow - -``` - ┌─────────────────────┐ - │ get_recording │ ← Daily API - │ get_participants │ - └──────────┬──────────┘ - │ - ┌──────────────────┼──────────────────┐ - ▼ ▼ ▼ - ┌──────────┐ ┌──────────┐ ┌──────────┐ - │ pad_tk_0 │ │ pad_tk_1 │ │ pad_tk_N │ ← FORK (parallel) - └────┬─────┘ └────┬─────┘ └────┬─────┘ - └──────────────────┼──────────────────┘ - ▼ - ┌─────────────────────┐ - │ mixdown_tracks │ ← PyAV → S3 - └──────────┬──────────┘ - │ - ┌──────────┴──────────┐ - ▼ ▼ - ┌───────────────┐ ┌───────────────┐ - │generate_wave │ │ (continue) │ ← waveform parallel with transcription setup - └───────────────┘ └───────────────┘ - │ - ┌──────────────────┼──────────────────┐ - ▼ ▼ ▼ - ┌────────────┐ ┌────────────┐ ┌────────────┐ - │transcribe_0│ │transcribe_1│ │transcribe_N│ ← FORK (parallel GPU!) - └─────┬──────┘ └─────┬──────┘ └─────┬──────┘ - └──────────────────┼──────────────────┘ - ▼ - ┌─────────────────────┐ - │ merge_transcripts │ - └──────────┬──────────┘ - │ - ┌──────────┴──────────┐ - ▼ ▼ - ┌───────────────┐ ┌───────────────┐ - │detect_topics │ │ (or) │ ← topic detection - └───────┬───────┘ └───────────────┘ - │ - ┌──────────────┴──────────────┐ - ▼ ▼ - ┌─────────────┐ ┌─────────────┐ - │generate_title│ │gen_summary │ ← FORK (parallel LLM) - └──────┬──────┘ └──────┬──────┘ - └──────────────┬─────────────┘ - ▼ - ┌─────────────────────┐ - │ finalize │ - └──────────┬──────────┘ - │ - ┌──────────────┼──────────────┐ - ▼ ▼ ▼ - ┌──────────┐ ┌──────────┐ ┌──────────┐ - │ consent │──▶│ zulip │──▶│ webhook │ ← sequential chain - └──────────┘ └──────────┘ └──────────┘ -``` - -### Key Improvements - -| Aspect | Current (Celery) | Target (Conductor) | -|--------|------------------|-------------------| -| Transcription parallelism | Serial (N × 30s) | Parallel (max 30s) | -| Failure granularity | Restart all | Retry failed step only | -| Progress visibility | None | Per-step status in UI | -| Timeout control | Entire pipeline | Per-step timeouts | -| Audit trail | Scattered logs | Unified workflow history | - ---- - -## Scope of Work - -### Module 1: Conductor Infrastructure Setup - -**Files to Create/Modify:** -- `docker-compose.yml` - Add Conductor server container -- `server/reflector/conductor/` - New module for Conductor client -- Environment configuration for Conductor URL - -**Tasks:** -- [ ] Add `conductoross/conductor-standalone:3.15.0` to docker-compose -- [ ] Create Conductor client wrapper (Python `conductor-python` SDK) -- [ ] Configure health checks and service dependencies -- [ ] Document Conductor UI access (port 8127) - -### Module 2: Task Decomposition - Worker Definitions - -**Files to Create:** -- `server/reflector/conductor/workers/` directory with: - - `get_recording.py` - Daily API recording fetch - - `get_participants.py` - Daily API participant fetch - - `pad_track.py` - Single track padding (PyAV) - - `mixdown_tracks.py` - Multi-track mixdown - - `generate_waveform.py` - Waveform generation - - `transcribe_track.py` - Single track GPU transcription - - `merge_transcripts.py` - Combine transcriptions - - `detect_topics.py` - LLM topic detection - - `generate_title.py` - LLM title generation - - `generate_summary.py` - LLM summary generation - - `finalize.py` - Status update and cleanup - - `cleanup_consent.py` - Consent check - - `post_zulip.py` - Zulip notification - - `send_webhook.py` - External webhook - - `generate_dynamic_fork_tasks.py` - Helper for FORK_JOIN_DYNAMIC task generation - -**Reference Files (Current Implementation):** -- `server/reflector/pipelines/main_multitrack_pipeline.py` -- `server/reflector/worker/process.py` -- `server/reflector/worker/webhook.py` - -**Key Considerations:** -- Each worker receives input from previous step via Conductor -- Workers must be idempotent (same input → same output) -- State serialization between steps (JSON-compatible types) - -### Module 3: Workflow Definition - -**Files to Create:** -- `server/reflector/conductor/workflows/diarization_pipeline.json` -- `server/reflector/conductor/workflows/register.py` - Registration script - -**Workflow Structure:** -```json -{ - "name": "daily_diarization_pipeline", - "version": 1, - "tasks": [ - {"name": "get_recording", "type": "SIMPLE"}, - {"name": "get_participants", "type": "SIMPLE"}, - { - "name": "fork_padding", - "type": "FORK_JOIN_DYNAMIC", - "dynamicForkTasksParam": "track_keys" - }, - {"name": "mixdown_tracks", "type": "SIMPLE"}, - {"name": "generate_waveform", "type": "SIMPLE"}, - { - "name": "fork_transcription", - "type": "FORK_JOIN_DYNAMIC", - "dynamicForkTasksParam": "padded_urls" - }, - {"name": "merge_transcripts", "type": "SIMPLE"}, - {"name": "detect_topics", "type": "SIMPLE"}, - { - "name": "fork_generation", - "type": "FORK_JOIN", - "forkTasks": [["generate_title"], ["generate_summary"]] - }, - {"name": "finalize", "type": "SIMPLE"}, - {"name": "cleanup_consent", "type": "SIMPLE"}, - {"name": "post_zulip", "type": "SIMPLE"}, - {"name": "send_webhook", "type": "SIMPLE"} - ] -} -``` - -**Key Considerations:** -- Dynamic FORK for variable number of tracks (N) -- Timeout configuration per task type -- Retry policies with exponential backoff - -### Module 4: Pipeline Trigger Migration - -**Files to Modify:** -- `server/reflector/worker/process.py` - -**Changes:** -- Replace `task_pipeline_multitrack_process.delay()` with Conductor workflow start -- Store workflow ID on Recording for status tracking -- Handle Conductor API errors -- Keep `process_multitrack_recording` as-is (creates DB entities before workflow) - -**Note:** Both webhook AND polling entry points converge at `process_multitrack_recording`, -which then calls `task_pipeline_multitrack_process.delay()`. By modifying this single call site, -we capture both entry paths without duplicating integration logic. - -### Module 5: Task Definition Registration - -**Files to Create:** -- `server/reflector/conductor/tasks/definitions.py` - -**Task Definitions with Timeouts:** - -| Task | Timeout | Response Timeout | Retry Count | -|------|---------|------------------|-------------| -| get_recording | 60s | 30s | 3 | -| get_participants | 60s | 30s | 3 | -| pad_track | 300s | 120s | 3 | -| mixdown_tracks | 600s | 300s | 3 | -| generate_waveform | 120s | 60s | 3 | -| transcribe_track | 1800s | 900s | 3 | -| merge_transcripts | 60s | 30s | 3 | -| detect_topics | 300s | 120s | 3 | -| generate_title | 60s | 30s | 3 | -| generate_summary | 300s | 120s | 3 | -| finalize | 60s | 30s | 3 | -| cleanup_consent | 60s | 30s | 3 | -| post_zulip | 60s | 30s | 5 | -| send_webhook | 60s | 30s | 30 | -| generate_dynamic_fork_tasks | 30s | 15s | 3 | - -### Module 6: Frontend Integration - -**WebSocket Events (Already Defined):** - -Events continue to be broadcast as today. No change to event structure. - -| Event | Triggered By Task | Payload | -|-------|-------------------|---------| -| STATUS | finalize | `{value: "processing"\|"ended"\|"error"}` | -| DURATION | mixdown_tracks | `{duration: float}` | -| WAVEFORM | generate_waveform | `{waveform: float[]}` | -| TRANSCRIPT | merge_transcripts | `{text: string, translation: string\|null}` | -| TOPIC | detect_topics | `{id, title, summary, timestamp, duration}` | -| FINAL_TITLE | generate_title | `{title: string}` | -| FINAL_LONG_SUMMARY | generate_summary | `{long_summary: string}` | -| FINAL_SHORT_SUMMARY | generate_summary | `{short_summary: string}` | - -**New: Progress Events** - -Add new event type for granular progress: - -```python -# PipelineProgressEvent -{ - "event": "PIPELINE_PROGRESS", - "data": { - "workflow_id": str, - "current_step": str, - "step_index": int, - "total_steps": int, - "step_status": "pending" | "in_progress" | "completed" | "failed" - } -} -``` - -### Module 7: State Management & Checkpointing - -**Current State Storage:** -- `transcript.status` - High-level status -- `transcript.events[]` - Append-only event log -- `transcript.topics[]` - Topic results -- `transcript.title`, `transcript.long_summary`, etc. - -**Conductor State Storage:** -- Workflow execution state in Conductor database -- Per-task input/output in Conductor - -**Checkpointing Strategy:** -1. Each task reads required state from DB (not previous task output for large data) -2. Each task writes results to DB before returning -3. Task output contains references (IDs, URLs) not large payloads -4. On retry, task can check DB for existing results (idempotency) - ---- - -## Data Flow Between Tasks - -### Input/Output Contracts - -``` -get_recording - Input: { recording_id: string } - Output: { id, mtg_session_id, room_name, duration } - -get_participants - Input: { mtg_session_id: string } - Output: { participants: [{participant_id, user_name}] } - -pad_track - Input: { track_index: number, s3_key: string } - Output: { padded_url: string, size: number } - -mixdown_tracks - Input: { padded_urls: string[] } - Output: { audio_key: string, duration: number } - -generate_waveform - Input: { audio_key: string } - Output: { waveform: number[] } - -transcribe_track - Input: { track_index: number, audio_url: string } - Output: { words: Word[] } - -merge_transcripts - Input: { transcripts: Word[][] } - Output: { all_words: Word[], word_count: number } - -detect_topics - Input: { words: Word[] } - Output: { topics: Topic[] } - -generate_title - Input: { topics: Topic[] } - Output: { title: string } - -generate_summary - Input: { words: Word[], topics: Topic[] } - Output: { summary: string, short_summary: string } - -finalize - Input: { recording_id, title, summary, duration } - Output: { status: "COMPLETED" } -``` - ---- - -## External API Calls Summary - -### Per-Step External Dependencies - -| Task | External Service | Calls | Notes | -|------|------------------|-------|-------| -| get_recording | Daily.co API | 1 | GET /recordings/{id} | -| get_participants | Daily.co API | 1 | GET /meetings/{id}/participants | -| pad_track | S3 | 2 | presign read + PUT padded | -| mixdown_tracks | S3 | 1 | PUT audio.mp3 | -| transcribe_track | Modal.com GPU | 1 | POST /transcriptions | -| detect_topics | LLM (OpenAI) | C | C = ceil(words/300) | -| generate_title | LLM (OpenAI) | 1 | - | -| generate_summary | LLM (OpenAI) | 2+2M | M = subjects (max 6) | -| post_zulip | Zulip API | 1 | POST or PATCH | -| send_webhook | External | 1 | Customer webhook URL | - -### Cost Attribution Enabled - -With decomposed tasks, costs can be attributed: -- **GPU costs**: Sum of `transcribe_track` durations -- **LLM costs**: Sum of `detect_topics` + `generate_title` + `generate_summary` token usage -- **S3 costs**: Bytes uploaded by `pad_track` + `mixdown_tracks` - ---- - -## Idempotency Requirements - -### By Task - -| Task | Idempotent? | Strategy | -|------|-------------|----------| -| get_recording | ✅ | Read-only API call | -| get_participants | ✅ | Read-only API call | -| pad_track | ⚠️ | Overwrite same S3 key | -| mixdown_tracks | ⚠️ | Overwrite same S3 key | -| generate_waveform | ✅ | Deterministic from audio | -| transcribe_track | ❌ | Cache by hash(audio_url) | -| detect_topics | ❌ | Cache by hash(words) | -| generate_title | ❌ | Cache by hash(topic_titles) | -| generate_summary | ❌ | Cache by hash(words+topics) | -| finalize | ✅ | Upsert status | -| cleanup_consent | ✅ | Idempotent deletes | -| post_zulip | ⚠️ | Use message_id for updates | -| send_webhook | ⚠️ | Receiver's responsibility | - -### Caching Strategy for LLM/GPU Calls - -```python -class TaskCache: - async def get(self, input_hash: str) -> Optional[Output]: ... - async def set(self, input_hash: str, output: Output) -> None: ... - -# Before calling external service: -cached = await cache.get(hash(input)) -if cached: - return cached - -result = await external_service.call(input) -await cache.set(hash(input), result) -return result -``` - ---- - -## Migration Strategy - -### Phase 1: Infrastructure (No Behavior Change) -- Add Conductor container to docker-compose -- Create Conductor client library -- Verify Conductor UI accessible - -### Phase 2: Parallel Implementation -- Implement all worker tasks -- Register workflow definition -- Test with synthetic recordings - -### Phase 3: Shadow Mode -- Trigger both Celery and Conductor pipelines -- Compare results for consistency -- Monitor Conductor execution in UI - -### Phase 4: Cutover -- Disable Celery pipeline trigger -- Enable Conductor-only execution -- Monitor error rates and performance - -### Phase 5: Cleanup -- Remove Celery task definitions -- Remove old pipeline code -- Update documentation - ---- - -## Risks & Mitigations - -| Risk | Mitigation | -|------|------------| -| Conductor server downtime | Health checks, failover to Celery (Phase 3) | -| Worker serialization issues | Extensive testing with real data | -| Performance regression | Benchmark parallel vs serial transcription | -| Data loss on migration | Shadow mode comparison (Phase 3) | -| Learning curve for team | Documentation, Conductor UI training | - ---- - -## Success Metrics - -| Metric | Current | Target | -|--------|---------|--------| -| Pipeline visibility | 3 states | 14+ steps visible | -| Transcription latency (N tracks) | N × 30s | ~30s (parallel) | -| Retry granularity | Entire pipeline | Single step | -| Cost attribution | None | Per-step breakdown | -| Debug time for failures | ~30 min | ~5 min (UI trace) | - ---- - -## Appendix: Conductor Mock Implementation - -A working Python mock demonstrating the target workflow structure is available at: -`docs/conductor-pipeline-mock/` - -To run: -```bash -cd docs/conductor-pipeline-mock -docker compose up --build -./test_workflow.sh -``` - -UI: http://localhost:8127 - -This mock validates: -- Workflow definition structure -- FORK_JOIN parallelism -- Worker task patterns -- Conductor SDK usage - ---- - -## References - -- Diarization Pipeline Diagram: `DIARIZATION_PIPELINE_DIAGRAM.md` -- Current Celery Implementation: `server/reflector/pipelines/main_multitrack_pipeline.py` -- Conductor OSS Documentation: https://conductor-oss.github.io/conductor/ -- Conductor Python SDK: https://github.com/conductor-sdk/conductor-python diff --git a/TASKS.md b/TASKS.md index d038dc18..1b2d27c8 100644 --- a/TASKS.md +++ b/TASKS.md @@ -1,6 +1,6 @@ # Durable Workflow Migration Tasks -This document defines atomic, isolated work items for migrating the Daily.co multitrack diarization pipeline from Celery to durable workflow orchestration. Supports both **Conductor** and **Hatchet** via `DURABLE_WORKFLOW_PROVIDER` env var. +This document defines atomic, isolated work items for migrating the Daily.co multitrack diarization pipeline from Celery to durable workflow orchestration using **Hatchet**. --- @@ -9,91 +9,46 @@ This document defines atomic, isolated work items for migrating the Daily.co mul ```bash # .env DURABLE_WORKFLOW_PROVIDER=none # Celery only (default) -DURABLE_WORKFLOW_PROVIDER=conductor # Use Conductor DURABLE_WORKFLOW_PROVIDER=hatchet # Use Hatchet -DURABLE_WORKFLOW_SHADOW_MODE=true # Run both provider + Celery (for comparison) +DURABLE_WORKFLOW_SHADOW_MODE=true # Run both Hatchet + Celery (for comparison) ``` --- ## Task Index -| ID | Title | Status | Conductor | Hatchet | -|----|-------|--------|-----------|---------| -| INFRA-001 | Add container to docker-compose | Done | ✓ | ✓ | -| INFRA-002 | Create Python client wrapper | Done | ✓ | ✓ | -| INFRA-003 | Add environment configuration | Done | ✓ | ✓ | -| TASK-001 | Create task definitions/workflow | Done | ✓ JSON | ✓ Python | -| TASK-002 | get_recording worker | Done | ✓ | ✓ | -| TASK-003 | get_participants worker | Done | ✓ | ✓ | -| TASK-004 | pad_track worker | Done | ✓ | ✓ | -| TASK-005 | mixdown_tracks worker | Done | ✓ | ✓ | -| TASK-006 | generate_waveform worker | Done | ✓ | ✓ | -| TASK-007 | transcribe_track worker | Done | ✓ | ✓ | -| TASK-008 | merge_transcripts worker | Done | ✓ | ✓ (in process_tracks) | -| TASK-009 | detect_topics worker | Done | ✓ | ✓ | -| TASK-010 | generate_title worker | Done | ✓ | ✓ | -| TASK-011 | generate_summary worker | Done | ✓ | ✓ | -| TASK-012 | finalize worker | Done | ✓ | ✓ | -| TASK-013 | cleanup_consent worker | Done | ✓ | ✓ | -| TASK-014 | post_zulip worker | Done | ✓ | ✓ | -| TASK-015 | send_webhook worker | Done | ✓ | ✓ | -| EVENT-001 | Progress WebSocket events | Done | ✓ | ✓ | -| INTEG-001 | Pipeline trigger integration | Done | ✓ | ✓ | -| SHADOW-001 | Shadow mode toggle | Done | ✓ | ✓ | -| TEST-001 | Integration tests | Pending | - | - | -| TEST-002 | E2E workflow test | Pending | - | - | -| CUTOVER-001 | Production cutover | Pending | - | - | -| CLEANUP-001 | Remove Celery code | Pending | - | - | - ---- - -## Architecture Differences - -| Aspect | Conductor | Hatchet | -|--------|-----------|---------| -| Worker model | Multiprocessing (fork) | Async (single process) | -| Task communication | REST polling | gRPC streaming | -| Workflow definition | JSON files | Python decorators | -| Child workflows | FORK_JOIN_DYNAMIC + JOIN task | `aio_run()` returns directly | -| Task definitions | Separate worker files | Embedded in workflow | -| Debug logging | Limited | Excellent with `HATCHET_DEBUG=true` | +| ID | Title | Status | +|----|-------|--------| +| INFRA-001 | Add container to docker-compose | Done | +| INFRA-002 | Create Python client wrapper | Done | +| INFRA-003 | Add environment configuration | Done | +| TASK-001 | Create workflow definition | Done | +| TASK-002 | get_recording task | Done | +| TASK-003 | get_participants task | Done | +| TASK-004 | pad_track task | Done | +| TASK-005 | mixdown_tracks task | Done | +| TASK-006 | generate_waveform task | Done | +| TASK-007 | transcribe_track task | Done | +| TASK-008 | merge_transcripts task | Done (in process_tracks) | +| TASK-009 | detect_topics task | Done | +| TASK-010 | generate_title task | Done | +| TASK-011 | generate_summary task | Done | +| TASK-012 | finalize task | Done | +| TASK-013 | cleanup_consent task | Done | +| TASK-014 | post_zulip task | Done | +| TASK-015 | send_webhook task | Done | +| EVENT-001 | Progress WebSocket events | Done | +| INTEG-001 | Pipeline trigger integration | Done | +| SHADOW-001 | Shadow mode toggle | Done | +| TEST-001 | Integration tests | Pending | +| TEST-002 | E2E workflow test | Pending | +| CUTOVER-001 | Production cutover | Pending | +| CLEANUP-001 | Remove Celery code | Pending | --- ## File Structure -### Conductor -``` -server/reflector/conductor/ -├── client.py # SDK wrapper -├── progress.py # WebSocket progress emission -├── run_workers.py # Worker startup -├── shadow_compare.py # Shadow mode comparison -├── tasks/ -│ ├── definitions.py # Task definitions with timeouts -│ └── register.py # Registration script -├── workers/ -│ ├── get_recording.py -│ ├── get_participants.py -│ ├── pad_track.py -│ ├── mixdown_tracks.py -│ ├── generate_waveform.py -│ ├── transcribe_track.py -│ ├── merge_transcripts.py -│ ├── detect_topics.py -│ ├── generate_title.py -│ ├── generate_summary.py -│ ├── finalize.py -│ ├── cleanup_consent.py -│ ├── post_zulip.py -│ ├── send_webhook.py -│ └── generate_dynamic_fork_tasks.py -└── workflows/ - └── register.py -``` - -### Hatchet ``` server/reflector/hatchet/ ├── client.py # SDK wrapper @@ -109,9 +64,8 @@ server/reflector/hatchet/ ## Remaining Work ### TEST-001: Integration Tests -- [ ] Test each worker with mocked external services +- [ ] Test each task with mocked external services - [ ] Test error handling and retries -- [ ] Test both Conductor and Hatchet paths ### TEST-002: E2E Workflow Test - [ ] Complete workflow run with real Daily.co recording @@ -119,7 +73,7 @@ server/reflector/hatchet/ - [ ] Performance comparison ### CUTOVER-001: Production Cutover -- [ ] Deploy with `DURABLE_WORKFLOW_PROVIDER=conductor` or `hatchet` +- [ ] Deploy with `DURABLE_WORKFLOW_PROVIDER=hatchet` - [ ] Monitor for failures - [ ] Compare results with shadow mode if needed @@ -132,30 +86,17 @@ server/reflector/hatchet/ ## Known Issues -### Conductor -- See `CONDUCTOR_LLM_OBSERVATIONS.md` for debugging notes -- Ghost workers issue (multiple containers polling) -- Multiprocessing + AsyncIO conflicts - ### Hatchet - See `HATCHET_LLM_OBSERVATIONS.md` for debugging notes - SDK v1.21+ API changes (breaking) - JWT token Docker networking issues - Worker appears hung without debug mode +- Workflow replay is version-locked (use --force to run latest code) --- ## Quick Start -### Conductor -```bash -# Start infrastructure -docker compose up -d conductor conductor-worker - -# Register workflow -docker compose exec conductor-worker uv run python -m reflector.conductor.workflows.register -``` - ### Hatchet ```bash # Start infrastructure @@ -167,7 +108,7 @@ docker compose up -d hatchet hatchet-worker ### Trigger Workflow ```bash # Set provider in .env -DURABLE_WORKFLOW_PROVIDER=hatchet # or conductor +DURABLE_WORKFLOW_PROVIDER=hatchet # Process a Daily.co recording via webhook or API # The pipeline trigger automatically uses the configured provider diff --git a/docker-compose.yml b/docker-compose.yml index d2dc6e5a..a433a33c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -34,20 +34,6 @@ services: environment: ENTRYPOINT: beat - conductor-worker: - build: - context: server - volumes: - - ./server/:/app/ - - /app/.venv - env_file: - - ./server/.env - environment: - ENTRYPOINT: conductor-worker - depends_on: - conductor: - condition: service_healthy - hatchet-worker: build: context: server @@ -92,19 +78,6 @@ services: volumes: - ./data/postgres:/var/lib/postgresql/data - conductor: - image: conductoross/conductor-standalone:3.15.0 - ports: - - "8180:8080" - - "5001:5000" - environment: - - conductor.db.type=memory - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8080/health"] - interval: 30s - timeout: 10s - retries: 5 - hatchet-postgres: image: postgres:15.6 command: postgres -c 'max_connections=200' diff --git a/server/CONDUCTOR_LLM_OBSERVATIONS.md b/server/CONDUCTOR_LLM_OBSERVATIONS.md deleted file mode 100644 index e3b2e9d2..00000000 --- a/server/CONDUCTOR_LLM_OBSERVATIONS.md +++ /dev/null @@ -1,345 +0,0 @@ -# Conductor OSS Migration - LLM Debugging Observations - -This document captures hard-won debugging insights from migrating the multitrack diarization pipeline from Celery to Conductor OSS. These observations are particularly relevant for LLM assistants working on this codebase. - -## Architecture Context - -- **Conductor Python SDK** uses multiprocessing: 1 parent process spawns 15 `TaskRunner` subprocesses -- Each task type gets its own subprocess that polls Conductor server -- Workers are identified by container hostname (e.g., `595f5ddc9711`) -- Shadow mode (`CONDUCTOR_SHADOW_MODE=true`) runs both Celery and Conductor in parallel - ---- - -## Challenge 1: Ghost Workers - Multiple Containers Polling Same Tasks - -### Symptoms -- Tasks complete but with wrong/empty output -- Worker logs show no execution for a task that API shows as COMPLETED -- `workerId` in Conductor API doesn't match expected container - -### Root Cause -Multiple containers may be running Conductor workers: -- `reflector-conductor-worker-1` (dedicated worker) -- `reflector-server-1` (if shadow mode enabled or worker code imported) - -### Debugging Steps -```bash -# 1. Get the mystery worker ID from Conductor API -curl -s "http://localhost:8180/api/workflow/{id}" | jq '.tasks[] | {ref: .referenceTaskName, workerId}' - -# 2. Find which container has that hostname -docker ps -a | grep {workerId} -# or -docker ps -a --format "{{.ID}} {{.Names}}" | grep {first-12-chars} - -# 3. Check that container's code version -docker exec {container} cat /app/reflector/conductor/workers/{worker}.py | head -50 -``` - -### Resolution -Restart ALL containers that might be polling Conductor tasks: -```bash -docker compose restart conductor-worker server -``` - -### Key Insight -**Always verify `workerId` matches your expected container.** In distributed worker setups, know ALL containers that poll for tasks. - ---- - -## Challenge 2: Multiprocessing + AsyncIO + Database Conflicts - -### Symptoms -``` -InterfaceError: cannot perform operation: another operation is in progress -RuntimeError: Task running at /app/.../worker.py -``` - -### Root Cause -Conductor Python SDK forks subprocesses. When subprocess calls `asyncio.run()`: -1. New event loop is created -2. But `get_database()` returns cached connection from parent process context -3. Parent's connection is incompatible with child's event loop - -### Resolution -Reset context and create fresh connection in each subprocess: -```python -async def _process(): - import databases - from reflector.db import _database_context - from reflector.settings import settings - - # Reset context var - don't inherit from parent - _database_context.set(None) - db = databases.Database(settings.DATABASE_URL) - _database_context.set(db) - await db.connect() - - # ... rest of async code -``` - -### Key Insight -**Any singleton/cached resource (DB connections, S3 clients, HTTP sessions) must be recreated AFTER fork.** Never trust inherited state in multiprocessing workers. - -### TODO: The Real Problem with get_database() - -**Current solution is a hack.** The issue runs deeper than multiprocessing fork: - -#### What's Actually Happening -1. Each Conductor subprocess calls `asyncio.run(_process())` repeatedly for each task -2. First `asyncio.run()`: creates DB connection, stores in ContextVar -3. First task completes, `asyncio.run()` exits, **event loop destroyed** -4. **But**: ContextVar still holds the connection reference (ContextVars persist across `asyncio.run()` calls) -5. Second `asyncio.run()`: Creates a **new event loop** -6. Code tries to use the **old connection** (from ContextVar) with the **new event loop** -7. Error: "another operation is in progress" - -**Root issue**: `get_database()` as a global singleton is incompatible with repeated `asyncio.run()` calls in the same process. - -#### Option 1: Explicit Connection Lifecycle (cleanest) -```python -async def _process(): - import databases - from reflector.settings import settings - - # Don't use get_database() - create explicit connection - db = databases.Database(settings.DATABASE_URL) - - try: - await db.connect() - - # Problem: transcripts_controller.get_by_id() uses get_database() internally - # Would need to refactor controllers to accept db parameter - # e.g., await transcripts_controller.get_by_id(transcript_id, db=db) - - finally: - await db.disconnect() -``` - -**Pros**: Clean separation, explicit lifecycle -**Cons**: Requires refactoring all controller methods to accept `db` parameter - -#### Option 2: Reset ContextVar Properly (pragmatic) -```python -async def _process(): - from reflector.db import _database_context, get_database - - # Ensure fresh connection per task - old_db = _database_context.get() - if old_db and old_db.is_connected: - await old_db.disconnect() - _database_context.set(None) - - # Now get_database() will create fresh connection - db = get_database() - await db.connect() - - try: - # ... work ... - finally: - await db.disconnect() - _database_context.set(None) -``` - -**Pros**: Works with existing controller code -**Cons**: Still manipulating globals, cleanup needed in every worker - -#### Option 3: Fix get_database() Itself (best long-term) -```python -# In reflector/db/__init__.py -def get_database() -> databases.Database: - """Get database instance for current event loop""" - import asyncio - - db = _database_context.get() - - # Check if connection is valid for current event loop - if db is not None: - try: - loop = asyncio.get_running_loop() - # If connection's event loop differs, it's stale - if db._connection and hasattr(db._connection, '_loop'): - if db._connection._loop != loop: - # Stale connection from old event loop - db = None - except RuntimeError: - # No running loop - pass - - if db is None: - db = databases.Database(settings.DATABASE_URL) - _database_context.set(db) - - return db -``` - -**Pros**: Fixes root cause, no changes needed in workers -**Cons**: Relies on implementation details of `databases` library - -#### Recommendation -- **Short-term**: Option 2 (explicit cleanup in workers that need DB) -- **Long-term**: Option 1 (refactor to dependency injection) is the only architecturally clean solution - ---- - -## Challenge 3: Type Mismatches Across Serialization Boundary - -### Symptoms -``` -ValidationError: 1 validation error for TranscriptTopic -transcript - Input should be a valid string [type=string_type, input_value={'translation': None, 'words': [...]}] -``` - -### Root Cause -Conductor JSON-serializes all task inputs/outputs. Complex Pydantic models get serialized to dicts: -- `TitleSummary.transcript: Transcript` becomes `{"translation": null, "words": [...]}` -- Next task expects `TranscriptTopic.transcript: str` - -### Resolution -Explicitly reconstruct types when deserializing: -```python -from reflector.processors.types import TitleSummary, Transcript as TranscriptType, Word - -def normalize_topic(t): - topic = dict(t) - transcript_data = topic.get("transcript") - if isinstance(transcript_data, dict): - words_list = transcript_data.get("words", []) - word_objects = [Word(**w) for w in words_list] - topic["transcript"] = TranscriptType( - words=word_objects, - translation=transcript_data.get("translation") - ) - return topic - -topic_objects = [TitleSummary(**normalize_topic(t)) for t in topics] -``` - -### Key Insight -**Conductor task I/O is always JSON.** Design workers to handle dict inputs and reconstruct domain objects explicitly. - ---- - -## Challenge 4: Conductor Health Check Failures - -### Symptoms -``` -dependency failed to start: container reflector-conductor-1 is unhealthy -``` - -### Root Cause -Conductor OSS standalone container health endpoint can be slow/flaky, especially during startup or under load. - -### Resolution -Bypass docker-compose health check dependency: -```bash -# Instead of: docker compose up -d conductor-worker -docker start reflector-conductor-worker-1 -``` - -### Key Insight -For development, consider removing `depends_on.condition: service_healthy` or increasing health check timeout. - ---- - -## Challenge 5: JOIN Task Output Format - -### Symptoms -`merge_transcripts` receives data but outputs `word_count: 0` - -### Root Cause -FORK_JOIN_DYNAMIC's JOIN task outputs a **dict keyed by task reference names**, not an array: -```json -{ - "transcribe_track_0": {"words": [...], "track_index": 0}, - "transcribe_track_1": {"words": [...], "track_index": 1} -} -``` - -### Resolution -Handle both dict and array inputs: -```python -transcripts = task.input_data.get("transcripts", []) - -# Handle JOIN output (dict with task refs as keys) -if isinstance(transcripts, dict): - transcripts = list(transcripts.values()) - -for t in transcripts: - if isinstance(t, dict) and "words" in t: - all_words.extend(t["words"]) -``` - -### Key Insight -**JOIN task output structure differs from FORK input.** Always log input types during debugging. - ---- - -## Debugging Workflow - -### 1. Add DEBUG Prints with Flush -Multiprocessing buffers stdout. Force immediate output: -```python -import sys -print("[DEBUG] worker entered", flush=True) -sys.stdout.flush() -``` - -### 2. Test Worker Functions Directly -Bypass Conductor entirely to verify logic: -```bash -docker compose exec conductor-worker uv run python -c " -from reflector.conductor.workers.merge_transcripts import merge_transcripts -from conductor.client.http.models import Task - -mock_task = Task() -mock_task.input_data = {'transcripts': {...}, 'transcript_id': 'test'} -result = merge_transcripts(mock_task) -print(result.output_data) -" -``` - -### 3. Check Task Timing -Suspiciously fast completion (e.g., 10ms) indicates: -- Cached result from previous run -- Wrong worker processed it -- Task completed without actual execution - -```bash -curl -s "http://localhost:8180/api/workflow/{id}" | \ - jq '.tasks[] | {ref: .referenceTaskName, duration: (.endTime - .startTime)}' -``` - -### 4. Verify Container Code Version -```bash -docker compose exec conductor-worker cat /app/reflector/conductor/workers/{file}.py | head -50 -``` - -### 5. Use Conductor Retry API -Retry from specific failed task without re-running entire workflow: -```bash -curl -X POST "http://localhost:8180/api/workflow/{id}/retry" -``` - ---- - -## Common Gotchas Summary - -| Issue | Signal | Fix | -|-------|--------|-----| -| Wrong worker | `workerId` mismatch | Restart all worker containers | -| DB conflict | "another operation in progress" | Fresh DB connection per subprocess | -| Type mismatch | Pydantic validation error | Reconstruct objects from dicts | -| No logs | Task completes but no output | Check if different container processed | -| 0 results | JOIN output format | Convert dict.values() to list | -| Health check | Compose dependency fails | Use `docker start` directly | - ---- - -## Files Most Likely to Need Conductor-Specific Handling - -- `server/reflector/conductor/workers/*.py` - All workers need multiprocessing-safe patterns -- `server/reflector/db/__init__.py` - Database singleton, needs context reset -- `server/reflector/conductor/workflows/*.json` - Workflow definitions, check input/output mappings diff --git a/server/migrations/versions/0f943fede0e0_add_workflow_run_id_to_transcript.py b/server/migrations/versions/0f943fede0e0_add_workflow_run_id_to_transcript.py index cd1857c1..a32184a8 100644 --- a/server/migrations/versions/0f943fede0e0_add_workflow_run_id_to_transcript.py +++ b/server/migrations/versions/0f943fede0e0_add_workflow_run_id_to_transcript.py @@ -1,7 +1,7 @@ """add workflow_run_id to transcript Revision ID: 0f943fede0e0 -Revises: a326252ac554 +Revises: bbafedfa510c Create Date: 2025-12-16 01:54:13.855106 """ @@ -13,7 +13,7 @@ from alembic import op # revision identifiers, used by Alembic. revision: str = "0f943fede0e0" -down_revision: Union[str, None] = "a326252ac554" +down_revision: Union[str, None] = "bbafedfa510c" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None diff --git a/server/migrations/versions/a326252ac554_add_workflow_id_to_recording.py b/server/migrations/versions/a326252ac554_add_workflow_id_to_recording.py deleted file mode 100644 index b88e2da7..00000000 --- a/server/migrations/versions/a326252ac554_add_workflow_id_to_recording.py +++ /dev/null @@ -1,32 +0,0 @@ -"""add workflow_id to recording - -Revision ID: a326252ac554 -Revises: bbafedfa510c -Create Date: 2025-12-14 11:34:22.137910 - -""" - -from typing import Sequence, Union - -import sqlalchemy as sa -from alembic import op - -# revision identifiers, used by Alembic. -revision: str = "a326252ac554" -down_revision: Union[str, None] = "bbafedfa510c" -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - - -def upgrade() -> None: - with op.batch_alter_table("recording", schema=None) as batch_op: - batch_op.add_column(sa.Column("workflow_id", sa.String(), nullable=True)) - batch_op.create_index( - "idx_recording_workflow_id", ["workflow_id"], unique=False - ) - - -def downgrade() -> None: - with op.batch_alter_table("recording", schema=None) as batch_op: - batch_op.drop_index("idx_recording_workflow_id") - batch_op.drop_column("workflow_id") diff --git a/server/pyproject.toml b/server/pyproject.toml index a3702a7e..ad58954c 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -39,7 +39,6 @@ dependencies = [ "pytest-env>=1.1.5", "webvtt-py>=0.5.0", "icalendar>=6.0.0", - "conductor-python>=1.2.3", "hatchet-sdk>=0.47.0", ] @@ -138,8 +137,6 @@ select = [ "reflector/tools/**.py" = ["PLC0415"] "reflector/hatchet/run_workers.py" = ["PLC0415"] "reflector/hatchet/workflows/**.py" = ["PLC0415"] -"reflector/conductor/run_workers.py" = ["PLC0415"] -"reflector/conductor/workers/**.py" = ["PLC0415"] "reflector/views/hatchet.py" = ["PLC0415"] "migrations/versions/**.py" = ["PLC0415"] "tests/**.py" = ["PLC0415"] diff --git a/server/reflector/app.py b/server/reflector/app.py index f37e19bd..91aad316 100644 --- a/server/reflector/app.py +++ b/server/reflector/app.py @@ -12,7 +12,6 @@ from reflector.events import subscribers_shutdown, subscribers_startup from reflector.logger import logger from reflector.metrics import metrics_init from reflector.settings import settings -from reflector.views.conductor import router as conductor_router from reflector.views.daily import router as daily_router from reflector.views.hatchet import router as hatchet_router from reflector.views.meetings import router as meetings_router @@ -100,7 +99,6 @@ app.include_router(user_ws_router, prefix="/v1") app.include_router(zulip_router, prefix="/v1") app.include_router(whereby_router, prefix="/v1") app.include_router(daily_router, prefix="/v1/daily") -app.include_router(conductor_router, prefix="/v1") app.include_router(hatchet_router, prefix="/v1") add_pagination(app) diff --git a/server/reflector/conductor/__init__.py b/server/reflector/conductor/__init__.py deleted file mode 100644 index 640c6aae..00000000 --- a/server/reflector/conductor/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Conductor workflow orchestration module.""" - -from reflector.conductor.client import ConductorClientManager - -__all__ = ["ConductorClientManager"] diff --git a/server/reflector/conductor/client.py b/server/reflector/conductor/client.py deleted file mode 100644 index 134bafe3..00000000 --- a/server/reflector/conductor/client.py +++ /dev/null @@ -1,40 +0,0 @@ -"""Conductor Python client wrapper.""" - -from conductor.client.configuration.configuration import Configuration -from conductor.client.orkes_clients import OrkesClients -from conductor.client.workflow_client import WorkflowClient -from reflector.settings import settings - - -class ConductorClientManager: - """Singleton manager for Conductor client connections.""" - - _instance: OrkesClients | None = None - - @classmethod - def get_client(cls) -> WorkflowClient: - """Get or create the workflow client.""" - if cls._instance is None: - config = Configuration( - server_api_url=settings.CONDUCTOR_SERVER_URL, - debug=settings.CONDUCTOR_DEBUG, - ) - cls._instance = OrkesClients(config) - return cls._instance.get_workflow_client() - - @classmethod - def start_workflow(cls, name: str, version: int, input_data: dict) -> str: - """Start a workflow and return the workflow ID.""" - client = cls.get_client() - return client.start_workflow_by_name(name, input_data, version=version) - - @classmethod - def get_workflow_status(cls, workflow_id: str) -> dict: - """Get the current status of a workflow.""" - client = cls.get_client() - return client.get_workflow(workflow_id, include_tasks=True) - - @classmethod - def reset(cls) -> None: - """Reset the client instance (for testing).""" - cls._instance = None diff --git a/server/reflector/conductor/progress.py b/server/reflector/conductor/progress.py deleted file mode 100644 index 4287da14..00000000 --- a/server/reflector/conductor/progress.py +++ /dev/null @@ -1,103 +0,0 @@ -"""Progress event emission for Conductor workers.""" - -import asyncio -from typing import Literal - -from reflector.db.transcripts import PipelineProgressData -from reflector.logger import logger -from reflector.ws_manager import get_ws_manager - -# Step mapping for progress tracking -# Maps task names to their index in the pipeline -PIPELINE_STEPS = { - "get_recording": 1, - "get_participants": 2, - "pad_track": 3, # Fork tasks share same step - "mixdown_tracks": 4, - "generate_waveform": 5, - "transcribe_track": 6, # Fork tasks share same step - "merge_transcripts": 7, - "detect_topics": 8, - "generate_title": 9, # Fork tasks share same step - "generate_summary": 9, # Fork tasks share same step - "finalize": 10, - "cleanup_consent": 11, - "post_zulip": 12, - "send_webhook": 13, -} - -TOTAL_STEPS = 13 - - -async def _emit_progress_async( - transcript_id: str, - step: str, - status: Literal["pending", "in_progress", "completed", "failed"], - workflow_id: str | None = None, -) -> None: - """Async implementation of progress emission.""" - ws_manager = get_ws_manager() - step_index = PIPELINE_STEPS.get(step, 0) - - data = PipelineProgressData( - workflow_id=workflow_id, - current_step=step, - step_index=step_index, - total_steps=TOTAL_STEPS, - step_status=status, - ) - - await ws_manager.send_json( - room_id=f"ts:{transcript_id}", - message={ - "event": "PIPELINE_PROGRESS", - "data": data.model_dump(), - }, - ) - - logger.debug( - "[Progress] Emitted", - transcript_id=transcript_id, - step=step, - status=status, - step_index=step_index, - ) - - -def emit_progress( - transcript_id: str, - step: str, - status: Literal["pending", "in_progress", "completed", "failed"], - workflow_id: str | None = None, -) -> None: - """Emit a pipeline progress event (sync wrapper for Conductor workers). - - Args: - transcript_id: The transcript ID to emit progress for - step: The current step name (e.g., "transcribe_track") - status: The step status - workflow_id: Optional workflow ID - """ - try: - # Get or create event loop for sync context - try: - loop = asyncio.get_running_loop() - except RuntimeError: - loop = None - - if loop is not None and loop.is_running(): - # Already in async context, schedule the coroutine - asyncio.create_task( - _emit_progress_async(transcript_id, step, status, workflow_id) - ) - else: - # Not in async context, run synchronously - asyncio.run(_emit_progress_async(transcript_id, step, status, workflow_id)) - except Exception as e: - # Progress emission should never break the pipeline - logger.warning( - "[Progress] Failed to emit progress event", - error=str(e), - transcript_id=transcript_id, - step=step, - ) diff --git a/server/reflector/conductor/run_workers.py b/server/reflector/conductor/run_workers.py deleted file mode 100644 index e24f86df..00000000 --- a/server/reflector/conductor/run_workers.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -Run Conductor workers for the diarization pipeline. - -Usage: - uv run -m reflector.conductor.run_workers - - # Or via docker: - docker compose exec server uv run -m reflector.conductor.run_workers -""" - -import signal -import sys -import time - -from conductor.client.automator.task_handler import TaskHandler -from conductor.client.configuration.configuration import Configuration -from reflector.conductor import workers # noqa: F401 - registers workers via decorators -from reflector.logger import logger -from reflector.settings import settings - - -def main() -> None: - """Start Conductor worker polling.""" - if not settings.CONDUCTOR_ENABLED: - logger.error("CONDUCTOR_ENABLED is False, not starting workers") - sys.exit(1) - - logger.info( - "Starting Conductor workers", - server_url=settings.CONDUCTOR_SERVER_URL, - ) - - config = Configuration( - server_api_url=settings.CONDUCTOR_SERVER_URL, - debug=settings.CONDUCTOR_DEBUG, - ) - - task_handler = TaskHandler(configuration=config) - - # Handle graceful shutdown - def shutdown_handler(signum: int, frame) -> None: - logger.info("Received shutdown signal, stopping workers...") - task_handler.stop_processes() - sys.exit(0) - - signal.signal(signal.SIGINT, shutdown_handler) - signal.signal(signal.SIGTERM, shutdown_handler) - - logger.info("Starting task polling...") - task_handler.start_processes() - - # Keep alive - while True: - time.sleep(1) - - -if __name__ == "__main__": - main() diff --git a/server/reflector/conductor/shadow_compare.py b/server/reflector/conductor/shadow_compare.py deleted file mode 100644 index 90d65b22..00000000 --- a/server/reflector/conductor/shadow_compare.py +++ /dev/null @@ -1,207 +0,0 @@ -"""Shadow mode comparison for Celery vs Conductor pipeline results.""" - -from dataclasses import dataclass -from typing import Any - -from reflector.conductor.client import ConductorClientManager -from reflector.db.transcripts import Transcript, transcripts_controller -from reflector.logger import logger - - -@dataclass -class FieldDifference: - """A difference between Celery and Conductor field values.""" - - field: str - celery_value: Any - conductor_value: Any - - -@dataclass -class ComparisonResult: - """Result of comparing Celery and Conductor outputs.""" - - match: bool - differences: list[FieldDifference] - celery_status: str - conductor_status: str - error: str | None = None - - -async def compare_content_results( - recording_id: str, workflow_id: str -) -> ComparisonResult: - """ - Compare content results from Celery and Conductor pipelines. - - Args: - recording_id: Recording ID to look up Celery transcript - workflow_id: Conductor workflow ID to get workflow output - - Returns: - ComparisonResult with match status and any differences - """ - try: - # Get Celery result from DB - celery_transcript = await transcripts_controller.get_by_recording_id( - recording_id - ) - if not celery_transcript: - return ComparisonResult( - match=False, - differences=[], - celery_status="not_found", - conductor_status="unknown", - error=f"No transcript found for recording_id={recording_id}", - ) - - # Get Conductor workflow status - workflow_status = ConductorClientManager.get_workflow_status(workflow_id) - conductor_status = workflow_status.status if workflow_status else "unknown" - - # If workflow not completed, can't compare - if conductor_status != "COMPLETED": - return ComparisonResult( - match=False, - differences=[], - celery_status=celery_transcript.status, - conductor_status=conductor_status, - error=f"Conductor workflow not completed: {conductor_status}", - ) - - # Extract output from workflow - workflow_output = ( - workflow_status.output if hasattr(workflow_status, "output") else {} - ) - - differences = _compare_fields(celery_transcript, workflow_output) - - result = ComparisonResult( - match=len(differences) == 0, - differences=differences, - celery_status=celery_transcript.status, - conductor_status=conductor_status, - ) - - # Log comparison result - if result.match: - logger.info( - "Shadow mode comparison: MATCH", - recording_id=recording_id, - workflow_id=workflow_id, - ) - else: - logger.warning( - "Shadow mode comparison: MISMATCH", - recording_id=recording_id, - workflow_id=workflow_id, - differences=[ - { - "field": d.field, - "celery": d.celery_value, - "conductor": d.conductor_value, - } - for d in differences - ], - ) - - return result - - except Exception as e: - logger.error( - "Shadow mode comparison failed", - recording_id=recording_id, - workflow_id=workflow_id, - error=str(e), - exc_info=True, - ) - return ComparisonResult( - match=False, - differences=[], - celery_status="unknown", - conductor_status="unknown", - error=str(e), - ) - - -def _compare_fields( - celery_transcript: Transcript, workflow_output: dict -) -> list[FieldDifference]: - """Compare specific content fields between Celery and Conductor.""" - differences = [] - - # Compare title - conductor_title = workflow_output.get("title") - if celery_transcript.title != conductor_title: - differences.append( - FieldDifference( - field="title", - celery_value=celery_transcript.title, - conductor_value=conductor_title, - ) - ) - - # Compare short_summary - conductor_short_summary = workflow_output.get("short_summary") - if celery_transcript.short_summary != conductor_short_summary: - differences.append( - FieldDifference( - field="short_summary", - celery_value=celery_transcript.short_summary, - conductor_value=conductor_short_summary, - ) - ) - - # Compare long_summary - conductor_long_summary = workflow_output.get("summary") - if celery_transcript.long_summary != conductor_long_summary: - differences.append( - FieldDifference( - field="long_summary", - celery_value=celery_transcript.long_summary, - conductor_value=conductor_long_summary, - ) - ) - - # Compare topic count - celery_topics = celery_transcript.topics or [] - conductor_topics = workflow_output.get("topics", []) - if len(celery_topics) != len(conductor_topics): - differences.append( - FieldDifference( - field="topic_count", - celery_value=len(celery_topics), - conductor_value=len(conductor_topics), - ) - ) - - # Compare word count from events - celery_events = celery_transcript.events or {} - celery_words = ( - celery_events.get("words", []) if isinstance(celery_events, dict) else [] - ) - conductor_words = workflow_output.get("all_words", []) - if len(celery_words) != len(conductor_words): - differences.append( - FieldDifference( - field="word_count", - celery_value=len(celery_words), - conductor_value=len(conductor_words), - ) - ) - - # Compare duration - conductor_duration = workflow_output.get("duration") - if ( - conductor_duration is not None - and celery_transcript.duration != conductor_duration - ): - differences.append( - FieldDifference( - field="duration", - celery_value=celery_transcript.duration, - conductor_value=conductor_duration, - ) - ) - - return differences diff --git a/server/reflector/conductor/tasks/__init__.py b/server/reflector/conductor/tasks/__init__.py deleted file mode 100644 index e47fb2f7..00000000 --- a/server/reflector/conductor/tasks/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -"""Conductor task definitions module.""" - -from reflector.conductor.tasks.definitions import TASK_DEFINITIONS -from reflector.conductor.tasks.register import register_task_definitions - -__all__ = ["TASK_DEFINITIONS", "register_task_definitions"] diff --git a/server/reflector/conductor/tasks/definitions.py b/server/reflector/conductor/tasks/definitions.py deleted file mode 100644 index 93edc928..00000000 --- a/server/reflector/conductor/tasks/definitions.py +++ /dev/null @@ -1,161 +0,0 @@ -"""Task definitions for Conductor workflow orchestration. - -Timeout reference (from CONDUCTOR_MIGRATION_REQUIREMENTS.md): -| Task | Timeout (s) | Response Timeout (s) | Retry Count | -|-------------------|-------------|----------------------|-------------| -| get_recording | 60 | 30 | 3 | -| get_participants | 60 | 30 | 3 | -| pad_track | 300 | 120 | 3 | -| mixdown_tracks | 600 | 300 | 3 | -| generate_waveform | 120 | 60 | 3 | -| transcribe_track | 1800 | 900 | 3 | -| merge_transcripts | 60 | 30 | 3 | -| detect_topics | 300 | 120 | 3 | -| generate_title | 60 | 30 | 3 | -| generate_summary | 300 | 120 | 3 | -| finalize | 60 | 30 | 3 | -| cleanup_consent | 60 | 30 | 3 | -| post_zulip | 60 | 30 | 5 | -| send_webhook | 60 | 30 | 30 | -""" - -OWNER_EMAIL = "reflector@example.com" - -TASK_DEFINITIONS = [ - { - "name": "get_recording", - "retryCount": 3, - "timeoutSeconds": 60, - "responseTimeoutSeconds": 30, - "inputKeys": ["recording_id"], - "outputKeys": ["id", "mtg_session_id", "room_name", "duration"], - "ownerEmail": OWNER_EMAIL, - }, - { - "name": "get_participants", - "retryCount": 3, - "timeoutSeconds": 60, - "responseTimeoutSeconds": 30, - "inputKeys": ["mtg_session_id"], - "outputKeys": ["participants"], - "ownerEmail": OWNER_EMAIL, - }, - { - "name": "pad_track", - "retryCount": 3, - "timeoutSeconds": 300, - "responseTimeoutSeconds": 120, - "inputKeys": ["track_index", "s3_key", "bucket_name", "transcript_id"], - "outputKeys": ["padded_url", "size", "track_index"], - "ownerEmail": OWNER_EMAIL, - }, - { - "name": "mixdown_tracks", - "retryCount": 3, - "timeoutSeconds": 600, - "responseTimeoutSeconds": 300, - "inputKeys": ["padded_urls", "transcript_id"], - "outputKeys": ["audio_key", "duration", "size"], - "ownerEmail": OWNER_EMAIL, - }, - { - "name": "generate_waveform", - "retryCount": 3, - "timeoutSeconds": 120, - "responseTimeoutSeconds": 60, - "inputKeys": ["audio_key", "transcript_id"], - "outputKeys": ["waveform"], - "ownerEmail": OWNER_EMAIL, - }, - { - "name": "transcribe_track", - "retryCount": 3, - "timeoutSeconds": 1800, - "responseTimeoutSeconds": 900, - "inputKeys": ["track_index", "audio_url", "language"], - "outputKeys": ["words", "track_index"], - "ownerEmail": OWNER_EMAIL, - }, - { - "name": "merge_transcripts", - "retryCount": 3, - "timeoutSeconds": 60, - "responseTimeoutSeconds": 30, - "inputKeys": ["transcripts", "transcript_id"], - "outputKeys": ["all_words", "word_count"], - "ownerEmail": OWNER_EMAIL, - }, - { - "name": "detect_topics", - "retryCount": 3, - "timeoutSeconds": 300, - "responseTimeoutSeconds": 120, - "inputKeys": ["words", "transcript_id", "target_language"], - "outputKeys": ["topics"], - "ownerEmail": OWNER_EMAIL, - }, - { - "name": "generate_title", - "retryCount": 3, - "timeoutSeconds": 60, - "responseTimeoutSeconds": 30, - "inputKeys": ["topics", "transcript_id"], - "outputKeys": ["title"], - "ownerEmail": OWNER_EMAIL, - }, - { - "name": "generate_summary", - "retryCount": 3, - "timeoutSeconds": 300, - "responseTimeoutSeconds": 120, - "inputKeys": ["words", "topics", "transcript_id"], - "outputKeys": ["summary", "short_summary"], - "ownerEmail": OWNER_EMAIL, - }, - { - "name": "finalize", - "retryCount": 3, - "timeoutSeconds": 60, - "responseTimeoutSeconds": 30, - "inputKeys": ["transcript_id", "title", "summary", "short_summary", "duration"], - "outputKeys": ["status"], - "ownerEmail": OWNER_EMAIL, - }, - { - "name": "cleanup_consent", - "retryCount": 3, - "timeoutSeconds": 60, - "responseTimeoutSeconds": 30, - "inputKeys": ["transcript_id"], - "outputKeys": ["audio_deleted", "reason"], - "ownerEmail": OWNER_EMAIL, - }, - { - "name": "post_zulip", - "retryCount": 5, - "timeoutSeconds": 60, - "responseTimeoutSeconds": 30, - "inputKeys": ["transcript_id"], - "outputKeys": ["message_id"], - "ownerEmail": OWNER_EMAIL, - }, - { - "name": "send_webhook", - "retryCount": 30, - "timeoutSeconds": 60, - "responseTimeoutSeconds": 30, - "inputKeys": ["transcript_id", "room_id"], - "outputKeys": ["sent", "status_code"], - "ownerEmail": OWNER_EMAIL, - }, - { - "name": "generate_dynamic_fork_tasks", - "retryCount": 3, - "timeoutSeconds": 30, - "responseTimeoutSeconds": 15, - "inputKeys": ["tracks", "task_type", "transcript_id", "bucket_name"], - "outputKeys": ["tasks", "inputs"], - "ownerEmail": OWNER_EMAIL, - "description": "Helper task to generate dynamic fork structure for variable track counts", - }, -] diff --git a/server/reflector/conductor/tasks/register.py b/server/reflector/conductor/tasks/register.py deleted file mode 100644 index 32b3f2d8..00000000 --- a/server/reflector/conductor/tasks/register.py +++ /dev/null @@ -1,60 +0,0 @@ -"""Register task definitions with Conductor server.""" - -import httpx - -from reflector.conductor.tasks.definitions import TASK_DEFINITIONS -from reflector.logger import logger -from reflector.settings import settings - - -def register_task_definitions() -> None: - """Register all task definitions with Conductor server. - - Raises: - httpx.HTTPStatusError: If registration fails. - """ - base_url = settings.CONDUCTOR_SERVER_URL.rstrip("/") - url = f"{base_url}/metadata/taskdefs" - - logger.info( - "Registering task definitions", - count=len(TASK_DEFINITIONS), - url=url, - ) - - with httpx.Client(timeout=30.0) as client: - resp = client.post( - url, - json=TASK_DEFINITIONS, - headers={"Content-Type": "application/json"}, - ) - resp.raise_for_status() - - logger.info("Task definitions registered successfully") - - -async def register_task_definitions_async() -> None: - """Async version of register_task_definitions.""" - base_url = settings.CONDUCTOR_SERVER_URL.rstrip("/") - url = f"{base_url}/metadata/taskdefs" - - logger.info( - "Registering task definitions", - count=len(TASK_DEFINITIONS), - url=url, - ) - - async with httpx.AsyncClient(timeout=30.0) as client: - resp = await client.post( - url, - json=TASK_DEFINITIONS, - headers={"Content-Type": "application/json"}, - ) - resp.raise_for_status() - - logger.info("Task definitions registered successfully") - - -if __name__ == "__main__": - register_task_definitions() - print(f"Registered {len(TASK_DEFINITIONS)} task definitions") diff --git a/server/reflector/conductor/workers/__init__.py b/server/reflector/conductor/workers/__init__.py deleted file mode 100644 index a599e61a..00000000 --- a/server/reflector/conductor/workers/__init__.py +++ /dev/null @@ -1,37 +0,0 @@ -"""Conductor workers for the diarization pipeline.""" - -from reflector.conductor.workers.cleanup_consent import cleanup_consent -from reflector.conductor.workers.detect_topics import detect_topics -from reflector.conductor.workers.finalize import finalize -from reflector.conductor.workers.generate_dynamic_fork_tasks import ( - generate_dynamic_fork_tasks, -) -from reflector.conductor.workers.generate_summary import generate_summary -from reflector.conductor.workers.generate_title import generate_title -from reflector.conductor.workers.generate_waveform import generate_waveform -from reflector.conductor.workers.get_participants import get_participants -from reflector.conductor.workers.get_recording import get_recording -from reflector.conductor.workers.merge_transcripts import merge_transcripts -from reflector.conductor.workers.mixdown_tracks import mixdown_tracks -from reflector.conductor.workers.pad_track import pad_track -from reflector.conductor.workers.post_zulip import post_zulip -from reflector.conductor.workers.send_webhook import send_webhook -from reflector.conductor.workers.transcribe_track import transcribe_track - -__all__ = [ - "get_recording", - "get_participants", - "pad_track", - "mixdown_tracks", - "generate_waveform", - "transcribe_track", - "merge_transcripts", - "detect_topics", - "generate_title", - "generate_summary", - "finalize", - "cleanup_consent", - "post_zulip", - "send_webhook", - "generate_dynamic_fork_tasks", -] diff --git a/server/reflector/conductor/workers/cleanup_consent.py b/server/reflector/conductor/workers/cleanup_consent.py deleted file mode 100644 index 04c58cf4..00000000 --- a/server/reflector/conductor/workers/cleanup_consent.py +++ /dev/null @@ -1,126 +0,0 @@ -"""Conductor worker: cleanup_consent - Check consent and delete audio if denied.""" - -from conductor.client.http.models import Task, TaskResult -from conductor.client.http.models.task_result_status import TaskResultStatus -from conductor.client.worker.worker_task import worker_task -from reflector.conductor.progress import emit_progress -from reflector.logger import logger - - -@worker_task(task_definition_name="cleanup_consent") -def cleanup_consent(task: Task) -> TaskResult: - """Check participant consent and delete audio if denied. - - Input: - transcript_id: str - Transcript ID - - Output: - audio_deleted: bool - Whether audio was deleted - reason: str | None - Reason for deletion - """ - transcript_id = task.input_data.get("transcript_id") - - logger.info("[Worker] cleanup_consent", transcript_id=transcript_id) - - if transcript_id: - emit_progress( - transcript_id, "cleanup_consent", "in_progress", task.workflow_instance_id - ) - - task_result = TaskResult( - task_id=task.task_id, - workflow_instance_id=task.workflow_instance_id, - worker_id=task.worker_id, - ) - - if not transcript_id: - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = "Missing transcript_id" - return task_result - - import asyncio - - async def _process(): - import databases - - from reflector.db import _database_context - from reflector.db.transcripts import transcripts_controller - from reflector.settings import settings - from reflector.storage import get_transcripts_storage - - # Create fresh database connection for subprocess (not shared from parent) - _database_context.set(None) - db = databases.Database(settings.DATABASE_URL) - _database_context.set(db) - await db.connect() - - try: - transcript = await transcripts_controller.get_by_id(transcript_id) - if transcript is None: - raise ValueError(f"Transcript {transcript_id} not found in database") - - # Check if any participant denied consent - # This mirrors the logic from main_live_pipeline.task_cleanup_consent - audio_deleted = False - reason = None - - if transcript.participants: - for p in transcript.participants: - if hasattr(p, "consent") and p.consent == "denied": - audio_deleted = True - reason = f"Participant {p.name or p.id} denied consent" - break - - if audio_deleted: - storage = get_transcripts_storage() - audio_key = f"{transcript_id}/audio.mp3" - try: - await storage.delete_file(audio_key) - await transcripts_controller.update( - transcript, {"audio_deleted": True} - ) - logger.info( - "[Worker] cleanup_consent: audio deleted", - transcript_id=transcript_id, - reason=reason, - ) - except Exception as e: - logger.warning( - "[Worker] cleanup_consent: failed to delete audio", - error=str(e), - ) - - return audio_deleted, reason - finally: - await db.disconnect() - _database_context.set(None) - - try: - audio_deleted, reason = asyncio.run(_process()) - task_result.status = TaskResultStatus.COMPLETED - task_result.output_data = { - "audio_deleted": audio_deleted, - "reason": reason, - } - - logger.info( - "[Worker] cleanup_consent complete", - transcript_id=transcript_id, - audio_deleted=audio_deleted, - ) - - if transcript_id: - emit_progress( - transcript_id, "cleanup_consent", "completed", task.workflow_instance_id - ) - - except Exception as e: - logger.error("[Worker] cleanup_consent failed", error=str(e), exc_info=True) - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = str(e) - if transcript_id: - emit_progress( - transcript_id, "cleanup_consent", "failed", task.workflow_instance_id - ) - - return task_result diff --git a/server/reflector/conductor/workers/detect_topics.py b/server/reflector/conductor/workers/detect_topics.py deleted file mode 100644 index bff25ede..00000000 --- a/server/reflector/conductor/workers/detect_topics.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Conductor worker: detect_topics - Detect topics using LLM.""" - -from conductor.client.http.models import Task, TaskResult -from conductor.client.http.models.task_result_status import TaskResultStatus -from conductor.client.worker.worker_task import worker_task -from reflector.conductor.progress import emit_progress -from reflector.logger import logger - - -@worker_task(task_definition_name="detect_topics") -def detect_topics(task: Task) -> TaskResult: - """Detect topics using LLM. - - Input: - words: list[dict] - Transcribed words - transcript_id: str - Transcript ID - target_language: str - Target language code (default: "en") - - Output: - topics: list[dict] - Detected topics - """ - words = task.input_data.get("words", []) - transcript_id = task.input_data.get("transcript_id") - target_language = task.input_data.get("target_language", "en") - - logger.info( - "[Worker] detect_topics", - word_count=len(words), - transcript_id=transcript_id, - ) - - if transcript_id: - emit_progress( - transcript_id, "detect_topics", "in_progress", task.workflow_instance_id - ) - - task_result = TaskResult( - task_id=task.task_id, - workflow_instance_id=task.workflow_instance_id, - worker_id=task.worker_id, - ) - - import asyncio - - async def _process(): - from reflector.pipelines import topic_processing - from reflector.processors.types import Transcript as TranscriptType - from reflector.processors.types import Word - - # Convert word dicts to Word objects - word_objects = [Word(**w) for w in words] - transcript = TranscriptType(words=word_objects) - - empty_pipeline = topic_processing.EmptyPipeline(logger=logger) - - async def noop_callback(t): - pass - - topics = await topic_processing.detect_topics( - transcript, - target_language, - on_topic_callback=noop_callback, - empty_pipeline=empty_pipeline, - ) - - return [t.model_dump() for t in topics] - - try: - topics = asyncio.run(_process()) - task_result.status = TaskResultStatus.COMPLETED - task_result.output_data = {"topics": topics} - - logger.info( - "[Worker] detect_topics complete", - transcript_id=transcript_id, - topic_count=len(topics), - ) - - if transcript_id: - emit_progress( - transcript_id, "detect_topics", "completed", task.workflow_instance_id - ) - - except Exception as e: - logger.error("[Worker] detect_topics failed", error=str(e), exc_info=True) - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = str(e) - if transcript_id: - emit_progress( - transcript_id, "detect_topics", "failed", task.workflow_instance_id - ) - - return task_result diff --git a/server/reflector/conductor/workers/finalize.py b/server/reflector/conductor/workers/finalize.py deleted file mode 100644 index d60e9544..00000000 --- a/server/reflector/conductor/workers/finalize.py +++ /dev/null @@ -1,111 +0,0 @@ -"""Conductor worker: finalize - Finalize transcript status and update database.""" - -from conductor.client.http.models import Task, TaskResult -from conductor.client.http.models.task_result_status import TaskResultStatus -from conductor.client.worker.worker_task import worker_task -from reflector.conductor.progress import emit_progress -from reflector.logger import logger - - -@worker_task(task_definition_name="finalize") -def finalize(task: Task) -> TaskResult: - """Finalize the transcript status and update the database. - - Input: - transcript_id: str - Transcript ID - title: str - Generated title - summary: str - Long summary - short_summary: str - Short summary - duration: float - Audio duration - - Output: - status: str - "COMPLETED" - """ - transcript_id = task.input_data.get("transcript_id") - title = task.input_data.get("title", "") - summary = task.input_data.get("summary", "") - short_summary = task.input_data.get("short_summary", "") - duration = task.input_data.get("duration", 0) - - logger.info( - "[Worker] finalize", - transcript_id=transcript_id, - title=title, - ) - - if transcript_id: - emit_progress( - transcript_id, "finalize", "in_progress", task.workflow_instance_id - ) - - task_result = TaskResult( - task_id=task.task_id, - workflow_instance_id=task.workflow_instance_id, - worker_id=task.worker_id, - ) - - if not transcript_id: - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = "Missing transcript_id" - return task_result - - import asyncio - - async def _process(): - import databases - - from reflector.db import _database_context - from reflector.db.transcripts import transcripts_controller - from reflector.settings import settings - - # Create fresh database connection for subprocess (not shared from parent) - _database_context.set(None) - db = databases.Database(settings.DATABASE_URL) - _database_context.set(db) - await db.connect() - - try: - transcript = await transcripts_controller.get_by_id(transcript_id) - if transcript is None: - raise ValueError(f"Transcript {transcript_id} not found in database") - - await transcripts_controller.update( - transcript, - { - "status": "ended", - "title": title, - "long_summary": summary, - "short_summary": short_summary, - "duration": duration, - }, - ) - return True - finally: - await db.disconnect() - _database_context.set(None) - - try: - asyncio.run(_process()) - task_result.status = TaskResultStatus.COMPLETED - task_result.output_data = {"status": "COMPLETED"} - - logger.info( - "[Worker] finalize complete", - transcript_id=transcript_id, - ) - - if transcript_id: - emit_progress( - transcript_id, "finalize", "completed", task.workflow_instance_id - ) - - except Exception as e: - logger.error("[Worker] finalize failed", error=str(e), exc_info=True) - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = str(e) - if transcript_id: - emit_progress( - transcript_id, "finalize", "failed", task.workflow_instance_id - ) - - return task_result diff --git a/server/reflector/conductor/workers/generate_dynamic_fork_tasks.py b/server/reflector/conductor/workers/generate_dynamic_fork_tasks.py deleted file mode 100644 index b150d074..00000000 --- a/server/reflector/conductor/workers/generate_dynamic_fork_tasks.py +++ /dev/null @@ -1,110 +0,0 @@ -"""Conductor worker: generate_dynamic_fork_tasks - Helper for FORK_JOIN_DYNAMIC.""" - -from conductor.client.http.models import Task, TaskResult -from conductor.client.http.models.task_result_status import TaskResultStatus -from conductor.client.worker.worker_task import worker_task -from reflector.logger import logger - - -@worker_task(task_definition_name="generate_dynamic_fork_tasks") -def generate_dynamic_fork_tasks(task: Task) -> TaskResult: - """Generate dynamic fork task structure for variable track counts. - - This helper task generates the task definitions and inputs needed for - FORK_JOIN_DYNAMIC to process N tracks in parallel. - - Input: - tracks: list[dict] - List of track info with s3_key - task_type: str - Either "pad_track" or "transcribe_track" - transcript_id: str - Transcript ID - bucket_name: str - S3 bucket name (for pad_track) - padded_urls: list[dict] - Padded track outputs (for transcribe_track) - - Output: - tasks: list[dict] - Task definitions for dynamic fork - inputs: dict - Input parameters keyed by task reference name - """ - tracks = task.input_data.get("tracks", []) - task_type = task.input_data.get("task_type") - transcript_id = task.input_data.get("transcript_id") - bucket_name = task.input_data.get("bucket_name") - padded_urls = task.input_data.get("padded_urls", {}) - - logger.info( - "[Worker] generate_dynamic_fork_tasks", - task_type=task_type, - track_count=len(tracks), - ) - - task_result = TaskResult( - task_id=task.task_id, - workflow_instance_id=task.workflow_instance_id, - worker_id=task.worker_id, - ) - - if not tracks or not task_type: - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = "Missing tracks or task_type" - return task_result - - try: - tasks = [] - inputs = {} - - for idx, track in enumerate(tracks): - ref_name = f"{task_type}_{idx}" - - # Task definition - tasks.append( - { - "name": task_type, - "taskReferenceName": ref_name, - "type": "SIMPLE", - } - ) - - # Task input based on type - if task_type == "pad_track": - inputs[ref_name] = { - "track_index": idx, - "s3_key": track.get("s3_key"), - "bucket_name": bucket_name, - "transcript_id": transcript_id, - } - elif task_type == "transcribe_track": - # Get padded URL from previous fork join output - padded_url = None - if isinstance(padded_urls, dict): - # Try to get from join output structure - pad_ref = f"pad_track_{idx}" - if pad_ref in padded_urls: - padded_url = padded_urls[pad_ref].get("padded_url") - elif "padded_url" in padded_urls: - # Single track case - padded_url = padded_urls.get("padded_url") - - inputs[ref_name] = { - "track_index": idx, - "audio_url": padded_url, - "language": "en", - "transcript_id": transcript_id, - } - - task_result.status = TaskResultStatus.COMPLETED - task_result.output_data = { - "tasks": tasks, - "inputs": inputs, - } - - logger.info( - "[Worker] generate_dynamic_fork_tasks complete", - task_type=task_type, - task_count=len(tasks), - ) - - except Exception as e: - logger.error("[Worker] generate_dynamic_fork_tasks failed", error=str(e)) - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = str(e) - - return task_result diff --git a/server/reflector/conductor/workers/generate_summary.py b/server/reflector/conductor/workers/generate_summary.py deleted file mode 100644 index 4c38bb3e..00000000 --- a/server/reflector/conductor/workers/generate_summary.py +++ /dev/null @@ -1,150 +0,0 @@ -"""Conductor worker: generate_summary - Generate meeting summaries using LLM.""" - -from conductor.client.http.models import Task, TaskResult -from conductor.client.http.models.task_result_status import TaskResultStatus -from conductor.client.worker.worker_task import worker_task -from reflector.conductor.progress import emit_progress -from reflector.logger import logger - - -@worker_task(task_definition_name="generate_summary") -def generate_summary(task: Task) -> TaskResult: - """Generate long and short summaries from topics and words using LLM. - - Input: - words: list[dict] - Transcribed words - topics: list[dict] - Detected topics - transcript_id: str - Transcript ID - - Output: - summary: str - Long summary - short_summary: str - Short summary - """ - words = task.input_data.get("words", []) - topics = task.input_data.get("topics", []) - transcript_id = task.input_data.get("transcript_id") - - logger.info( - "[Worker] generate_summary", - word_count=len(words), - topic_count=len(topics), - transcript_id=transcript_id, - ) - - if transcript_id: - emit_progress( - transcript_id, "generate_summary", "in_progress", task.workflow_instance_id - ) - - task_result = TaskResult( - task_id=task.task_id, - workflow_instance_id=task.workflow_instance_id, - worker_id=task.worker_id, - ) - - import asyncio - - async def _process(): - import databases - - from reflector.db import _database_context - from reflector.db.transcripts import transcripts_controller - from reflector.pipelines import topic_processing - from reflector.processors.types import TitleSummary, Word - from reflector.processors.types import Transcript as TranscriptType - from reflector.settings import settings - - # Create fresh database connection for subprocess (not shared from parent) - # Reset context var to ensure we get a fresh connection - _database_context.set(None) - db = databases.Database(settings.DATABASE_URL) - _database_context.set(db) - await db.connect() - - try: - # detect_topics returns TitleSummary objects (with transcript: Transcript) - # When serialized, transcript becomes {translation, words} dict - # We need to reconstruct TitleSummary objects with proper Transcript - def normalize_topic(t): - topic = dict(t) - transcript_data = topic.get("transcript") - if isinstance(transcript_data, dict): - # Reconstruct Transcript object from serialized dict - words_list = transcript_data.get("words", []) - word_objects = [ - Word(**w) if isinstance(w, dict) else w for w in words_list - ] - topic["transcript"] = TranscriptType( - words=word_objects, - translation=transcript_data.get("translation"), - ) - elif transcript_data is None: - topic["transcript"] = TranscriptType(words=[]) - return topic - - topic_objects = [TitleSummary(**normalize_topic(t)) for t in topics] - empty_pipeline = topic_processing.EmptyPipeline(logger=logger) - - transcript = await transcripts_controller.get_by_id(transcript_id) - - long_summary = "" - short_summary = "" - - async def on_long(s): - nonlocal long_summary - # s is FinalLongSummary object - long_summary = s.long_summary if hasattr(s, "long_summary") else str(s) - - async def on_short(s): - nonlocal short_summary - # s is FinalShortSummary object - short_summary = ( - s.short_summary if hasattr(s, "short_summary") else str(s) - ) - - await topic_processing.generate_summaries( - topic_objects, - transcript, - on_long_summary_callback=on_long, - on_short_summary_callback=on_short, - empty_pipeline=empty_pipeline, - logger=logger, - ) - - return long_summary, short_summary - finally: - await db.disconnect() - _database_context.set(None) - - try: - summary, short_summary = asyncio.run(_process()) - task_result.status = TaskResultStatus.COMPLETED - task_result.output_data = { - "summary": summary, - "short_summary": short_summary, - } - - logger.info( - "[Worker] generate_summary complete", - transcript_id=transcript_id, - summary_len=len(summary) if summary else 0, - ) - - if transcript_id: - emit_progress( - transcript_id, - "generate_summary", - "completed", - task.workflow_instance_id, - ) - - except Exception as e: - logger.error("[Worker] generate_summary failed", error=str(e), exc_info=True) - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = str(e) - if transcript_id: - emit_progress( - transcript_id, "generate_summary", "failed", task.workflow_instance_id - ) - - return task_result diff --git a/server/reflector/conductor/workers/generate_title.py b/server/reflector/conductor/workers/generate_title.py deleted file mode 100644 index 75174344..00000000 --- a/server/reflector/conductor/workers/generate_title.py +++ /dev/null @@ -1,111 +0,0 @@ -"""Conductor worker: generate_title - Generate meeting title using LLM.""" - -from conductor.client.http.models import Task, TaskResult -from conductor.client.http.models.task_result_status import TaskResultStatus -from conductor.client.worker.worker_task import worker_task -from reflector.conductor.progress import emit_progress -from reflector.logger import logger - - -@worker_task(task_definition_name="generate_title") -def generate_title(task: Task) -> TaskResult: - """Generate meeting title from detected topics using LLM. - - Input: - topics: list[dict] - Detected topics - transcript_id: str - Transcript ID - - Output: - title: str - Generated title - """ - topics = task.input_data.get("topics", []) - transcript_id = task.input_data.get("transcript_id") - - logger.info( - "[Worker] generate_title", - topic_count=len(topics), - transcript_id=transcript_id, - ) - - if transcript_id: - emit_progress( - transcript_id, "generate_title", "in_progress", task.workflow_instance_id - ) - - task_result = TaskResult( - task_id=task.task_id, - workflow_instance_id=task.workflow_instance_id, - worker_id=task.worker_id, - ) - - if not topics: - task_result.status = TaskResultStatus.COMPLETED - task_result.output_data = {"title": "Untitled Meeting"} - return task_result - - import asyncio - - async def _process(): - from reflector.pipelines import topic_processing - from reflector.processors.types import TitleSummary, Word - from reflector.processors.types import Transcript as TranscriptType - - # detect_topics returns TitleSummary objects (with transcript: Transcript) - # When serialized, transcript becomes {translation, words} dict - # We need to reconstruct TitleSummary objects with proper Transcript - def normalize_topic(t): - topic = dict(t) - transcript_data = topic.get("transcript") - if isinstance(transcript_data, dict): - # Reconstruct Transcript object from serialized dict - words_list = transcript_data.get("words", []) - word_objects = [ - Word(**w) if isinstance(w, dict) else w for w in words_list - ] - topic["transcript"] = TranscriptType( - words=word_objects, translation=transcript_data.get("translation") - ) - elif transcript_data is None: - topic["transcript"] = TranscriptType(words=[]) - return topic - - topic_objects = [TitleSummary(**normalize_topic(t)) for t in topics] - empty_pipeline = topic_processing.EmptyPipeline(logger=logger) - - async def noop_callback(t): - pass - - title = await topic_processing.generate_title( - topic_objects, - on_title_callback=noop_callback, - empty_pipeline=empty_pipeline, - logger=logger, - ) - return title - - try: - title = asyncio.run(_process()) - task_result.status = TaskResultStatus.COMPLETED - task_result.output_data = {"title": title} - - logger.info( - "[Worker] generate_title complete", - transcript_id=transcript_id, - title=title, - ) - - if transcript_id: - emit_progress( - transcript_id, "generate_title", "completed", task.workflow_instance_id - ) - - except Exception as e: - logger.error("[Worker] generate_title failed", error=str(e), exc_info=True) - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = str(e) - if transcript_id: - emit_progress( - transcript_id, "generate_title", "failed", task.workflow_instance_id - ) - - return task_result diff --git a/server/reflector/conductor/workers/generate_waveform.py b/server/reflector/conductor/workers/generate_waveform.py deleted file mode 100644 index 059f28e4..00000000 --- a/server/reflector/conductor/workers/generate_waveform.py +++ /dev/null @@ -1,106 +0,0 @@ -"""Conductor worker: generate_waveform - Generate waveform visualization data.""" - -import tempfile -from pathlib import Path - -import httpx - -from conductor.client.http.models import Task, TaskResult -from conductor.client.http.models.task_result_status import TaskResultStatus -from conductor.client.worker.worker_task import worker_task -from reflector.conductor.progress import emit_progress -from reflector.logger import logger -from reflector.storage import get_transcripts_storage -from reflector.utils.audio_waveform import get_audio_waveform - -PRESIGNED_URL_EXPIRATION_SECONDS = 7200 - - -@worker_task(task_definition_name="generate_waveform") -def generate_waveform(task: Task) -> TaskResult: - """Generate waveform visualization data from mixed audio. - - Input: - audio_key: str - S3 key of the audio file - transcript_id: str - Transcript ID - - Output: - waveform: list[float] - Waveform peaks array - """ - audio_key = task.input_data.get("audio_key") - transcript_id = task.input_data.get("transcript_id") - - logger.info( - "[Worker] generate_waveform", audio_key=audio_key, transcript_id=transcript_id - ) - - if transcript_id: - emit_progress( - transcript_id, "generate_waveform", "in_progress", task.workflow_instance_id - ) - - task_result = TaskResult( - task_id=task.task_id, - workflow_instance_id=task.workflow_instance_id, - worker_id=task.worker_id, - ) - - if not audio_key or not transcript_id: - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = "Missing audio_key or transcript_id" - return task_result - - import asyncio - - async def _process(): - storage = get_transcripts_storage() - audio_url = await storage.get_file_url( - audio_key, - operation="get_object", - expires_in=PRESIGNED_URL_EXPIRATION_SECONDS, - ) - - # Download audio to temp file - with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp: - tmp_path = Path(tmp.name) - async with httpx.AsyncClient() as client: - resp = await client.get(audio_url) - resp.raise_for_status() - tmp.write(resp.content) - - try: - waveform = get_audio_waveform(tmp_path, segments_count=255) - finally: - tmp_path.unlink(missing_ok=True) - - return waveform - - try: - waveform = asyncio.run(_process()) - task_result.status = TaskResultStatus.COMPLETED - task_result.output_data = {"waveform": waveform} - - logger.info( - "[Worker] generate_waveform complete", - transcript_id=transcript_id, - peaks_count=len(waveform) if waveform else 0, - ) - - if transcript_id: - emit_progress( - transcript_id, - "generate_waveform", - "completed", - task.workflow_instance_id, - ) - - except Exception as e: - logger.error("[Worker] generate_waveform failed", error=str(e), exc_info=True) - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = str(e) - if transcript_id: - emit_progress( - transcript_id, "generate_waveform", "failed", task.workflow_instance_id - ) - - return task_result diff --git a/server/reflector/conductor/workers/get_participants.py b/server/reflector/conductor/workers/get_participants.py deleted file mode 100644 index bbf2d21a..00000000 --- a/server/reflector/conductor/workers/get_participants.py +++ /dev/null @@ -1,96 +0,0 @@ -"""Conductor worker: get_participants - Fetch meeting participants from Daily.co API.""" - -from conductor.client.http.models import Task, TaskResult -from conductor.client.http.models.task_result_status import TaskResultStatus -from conductor.client.worker.worker_task import worker_task -from reflector.conductor.progress import emit_progress -from reflector.dailyco_api.client import DailyApiClient -from reflector.logger import logger -from reflector.settings import settings - - -@worker_task(task_definition_name="get_participants") -def get_participants(task: Task) -> TaskResult: - """Fetch meeting participants from Daily.co API. - - Input: - mtg_session_id: str - Daily.co meeting session identifier - transcript_id: str - Transcript ID for progress tracking - - Output: - participants: list[dict] - List of participant info - - participant_id: str - - user_name: str | None - - user_id: str | None - """ - mtg_session_id = task.input_data.get("mtg_session_id") - transcript_id = task.input_data.get("transcript_id") - - logger.info("[Worker] get_participants", mtg_session_id=mtg_session_id) - - if transcript_id: - emit_progress( - transcript_id, "get_participants", "in_progress", task.workflow_instance_id - ) - - task_result = TaskResult( - task_id=task.task_id, - workflow_instance_id=task.workflow_instance_id, - worker_id=task.worker_id, - ) - - if not mtg_session_id: - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = "Missing mtg_session_id" - return task_result - - if not settings.DAILY_API_KEY: - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = "DAILY_API_KEY not configured" - return task_result - - import asyncio - - async def _fetch(): - async with DailyApiClient(api_key=settings.DAILY_API_KEY) as client: - return await client.get_meeting_participants(mtg_session_id) - - try: - response = asyncio.run(_fetch()) - - participants = [ - { - "participant_id": p.participant_id, - "user_name": p.user_name, - "user_id": p.user_id, - } - for p in response.data - ] - - task_result.status = TaskResultStatus.COMPLETED - task_result.output_data = {"participants": participants} - - logger.info( - "[Worker] get_participants complete", - mtg_session_id=mtg_session_id, - count=len(participants), - ) - - if transcript_id: - emit_progress( - transcript_id, - "get_participants", - "completed", - task.workflow_instance_id, - ) - - except Exception as e: - logger.error("[Worker] get_participants failed", error=str(e)) - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = str(e) - if transcript_id: - emit_progress( - transcript_id, "get_participants", "failed", task.workflow_instance_id - ) - - return task_result diff --git a/server/reflector/conductor/workers/get_recording.py b/server/reflector/conductor/workers/get_recording.py deleted file mode 100644 index ce8163d0..00000000 --- a/server/reflector/conductor/workers/get_recording.py +++ /dev/null @@ -1,90 +0,0 @@ -"""Conductor worker: get_recording - Fetch recording metadata from Daily.co API.""" - -from conductor.client.http.models import Task, TaskResult -from conductor.client.http.models.task_result_status import TaskResultStatus -from conductor.client.worker.worker_task import worker_task -from reflector.conductor.progress import emit_progress -from reflector.dailyco_api.client import DailyApiClient -from reflector.logger import logger -from reflector.settings import settings - - -@worker_task(task_definition_name="get_recording") -def get_recording(task: Task) -> TaskResult: - """Fetch recording metadata from Daily.co API. - - Input: - recording_id: str - Daily.co recording identifier - transcript_id: str - Transcript ID for progress tracking - - Output: - id: str - Recording ID - mtg_session_id: str - Meeting session ID - room_name: str - Room name - duration: int - Recording duration in seconds - """ - recording_id = task.input_data.get("recording_id") - transcript_id = task.input_data.get("transcript_id") - - logger.info("[Worker] get_recording", recording_id=recording_id) - - if transcript_id: - emit_progress( - transcript_id, "get_recording", "in_progress", task.workflow_instance_id - ) - - task_result = TaskResult( - task_id=task.task_id, - workflow_instance_id=task.workflow_instance_id, - worker_id=task.worker_id, - ) - - if not recording_id: - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = "Missing recording_id" - return task_result - - if not settings.DAILY_API_KEY: - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = "DAILY_API_KEY not configured" - return task_result - - import asyncio - - async def _fetch(): - async with DailyApiClient(api_key=settings.DAILY_API_KEY) as client: - return await client.get_recording(recording_id) - - try: - recording = asyncio.run(_fetch()) - - task_result.status = TaskResultStatus.COMPLETED - task_result.output_data = { - "id": recording.id, - "mtg_session_id": recording.mtgSessionId, - "room_name": recording.room_name, - "duration": recording.duration, - } - - logger.info( - "[Worker] get_recording complete", - recording_id=recording_id, - room_name=recording.room_name, - duration=recording.duration, - ) - - if transcript_id: - emit_progress( - transcript_id, "get_recording", "completed", task.workflow_instance_id - ) - - except Exception as e: - logger.error("[Worker] get_recording failed", error=str(e)) - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = str(e) - if transcript_id: - emit_progress( - transcript_id, "get_recording", "failed", task.workflow_instance_id - ) - - return task_result diff --git a/server/reflector/conductor/workers/merge_transcripts.py b/server/reflector/conductor/workers/merge_transcripts.py deleted file mode 100644 index 31a05f12..00000000 --- a/server/reflector/conductor/workers/merge_transcripts.py +++ /dev/null @@ -1,89 +0,0 @@ -"""Conductor worker: merge_transcripts - Merge multiple track transcriptions.""" - -from conductor.client.http.models import Task, TaskResult -from conductor.client.http.models.task_result_status import TaskResultStatus -from conductor.client.worker.worker_task import worker_task -from reflector.conductor.progress import emit_progress -from reflector.logger import logger - - -@worker_task(task_definition_name="merge_transcripts") -def merge_transcripts(task: Task) -> TaskResult: - """Merge multiple track transcriptions into single timeline sorted by timestamp. - - Input: - transcripts: list[dict] - List of transcription results with words - transcript_id: str - Transcript ID - - Output: - all_words: list[dict] - Merged and sorted words - word_count: int - Total word count - """ - transcripts = task.input_data.get("transcripts", []) - transcript_id = task.input_data.get("transcript_id") - - logger.info( - "[Worker] merge_transcripts", - transcript_count=len(transcripts) - if isinstance(transcripts, (list, dict)) - else 0, - transcript_id=transcript_id, - ) - - if transcript_id: - emit_progress( - transcript_id, "merge_transcripts", "in_progress", task.workflow_instance_id - ) - - task_result = TaskResult( - task_id=task.task_id, - workflow_instance_id=task.workflow_instance_id, - worker_id=task.worker_id, - ) - - try: - all_words = [] - - # Handle JOIN output (dict with task refs as keys) - if isinstance(transcripts, dict): - transcripts = list(transcripts.values()) - - for t in transcripts: - if isinstance(t, list): - all_words.extend(t) - elif isinstance(t, dict) and "words" in t: - all_words.extend(t["words"]) - - # Sort by start timestamp - all_words.sort(key=lambda w: w.get("start", 0)) - - task_result.status = TaskResultStatus.COMPLETED - task_result.output_data = { - "all_words": all_words, - "word_count": len(all_words), - } - - logger.info( - "[Worker] merge_transcripts complete", - transcript_id=transcript_id, - word_count=len(all_words), - ) - - if transcript_id: - emit_progress( - transcript_id, - "merge_transcripts", - "completed", - task.workflow_instance_id, - ) - - except Exception as e: - logger.error("[Worker] merge_transcripts failed", error=str(e), exc_info=True) - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = str(e) - if transcript_id: - emit_progress( - transcript_id, "merge_transcripts", "failed", task.workflow_instance_id - ) - - return task_result diff --git a/server/reflector/conductor/workers/mixdown_tracks.py b/server/reflector/conductor/workers/mixdown_tracks.py deleted file mode 100644 index 2a7afee4..00000000 --- a/server/reflector/conductor/workers/mixdown_tracks.py +++ /dev/null @@ -1,278 +0,0 @@ -"""Conductor worker: mixdown_tracks - Mix multiple audio tracks into single file. - -Builds PyAV filter graph with amix filter to combine N padded tracks into -a single stereo MP3 file. -""" - -import tempfile -from fractions import Fraction -from pathlib import Path - -import av -from av.audio.resampler import AudioResampler - -from conductor.client.http.models import Task, TaskResult -from conductor.client.http.models.task_result_status import TaskResultStatus -from conductor.client.worker.worker_task import worker_task -from reflector.conductor.progress import emit_progress -from reflector.logger import logger -from reflector.storage import get_transcripts_storage - -PRESIGNED_URL_EXPIRATION_SECONDS = 7200 -MP3_BITRATE = 192000 - - -def _build_mixdown_filter_graph(containers: list, target_sample_rate: int): - """Build PyAV filter graph: N abuffer -> amix -> aformat -> sink. - - Args: - containers: List of PyAV containers for input tracks - target_sample_rate: Output sample rate - - Returns: - Tuple of (graph, inputs list, sink) - """ - graph = av.filter.Graph() - inputs = [] - - for idx in range(len(containers)): - args = ( - f"time_base=1/{target_sample_rate}:" - f"sample_rate={target_sample_rate}:" - f"sample_fmt=s32:" - f"channel_layout=stereo" - ) - in_ctx = graph.add("abuffer", args=args, name=f"in{idx}") - inputs.append(in_ctx) - - # amix with normalize=0 to prevent volume reduction - mixer = graph.add("amix", args=f"inputs={len(containers)}:normalize=0", name="mix") - fmt = graph.add( - "aformat", - args=f"sample_fmts=s16:channel_layouts=stereo:sample_rates={target_sample_rate}", - name="fmt", - ) - sink = graph.add("abuffersink", name="out") - - for idx, in_ctx in enumerate(inputs): - in_ctx.link_to(mixer, 0, idx) - mixer.link_to(fmt) - fmt.link_to(sink) - graph.configure() - - return graph, inputs, sink - - -@worker_task(task_definition_name="mixdown_tracks") -def mixdown_tracks(task: Task) -> TaskResult: - """Mix multiple audio tracks into single stereo file. - - Input: - padded_urls: list[str] - Presigned URLs of padded tracks - transcript_id: str - Transcript ID for storage path - - Output: - audio_key: str - S3 key of mixed audio file - duration: float - Audio duration in seconds - size: int - File size in bytes - """ - padded_urls = task.input_data.get("padded_urls", []) - transcript_id = task.input_data.get("transcript_id") - - logger.info( - "[Worker] mixdown_tracks", - track_count=len(padded_urls), - transcript_id=transcript_id, - ) - - if transcript_id: - emit_progress( - transcript_id, "mixdown_tracks", "in_progress", task.workflow_instance_id - ) - - task_result = TaskResult( - task_id=task.task_id, - workflow_instance_id=task.workflow_instance_id, - worker_id=task.worker_id, - ) - - if not padded_urls or not transcript_id: - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = "Missing padded_urls or transcript_id" - return task_result - - import asyncio - - async def _process(): - storage = get_transcripts_storage() - - # Determine target sample rate from first track - target_sample_rate = None - for url in padded_urls: - if not url: - continue - try: - with av.open(url) as container: - for frame in container.decode(audio=0): - target_sample_rate = frame.sample_rate - break - except Exception: - continue - if target_sample_rate: - break - - if not target_sample_rate: - raise Exception("Mixdown failed: No decodable audio frames in any track") - - # Open all containers with reconnect options for S3 streaming - containers = [] - valid_urls = [url for url in padded_urls if url] - - for url in valid_urls: - try: - c = av.open( - url, - options={ - "reconnect": "1", - "reconnect_streamed": "1", - "reconnect_delay_max": "5", - }, - ) - containers.append(c) - except Exception as e: - logger.warning( - "Mixdown: failed to open container", url=url[:50], error=str(e) - ) - - if not containers: - raise Exception("Mixdown failed: Could not open any track containers") - - try: - # Build filter graph - graph, inputs, sink = _build_mixdown_filter_graph( - containers, target_sample_rate - ) - - # Create temp file for output - with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file: - temp_path = temp_file.name - - try: - # Open output container for MP3 - with av.open(temp_path, "w", format="mp3") as out_container: - out_stream = out_container.add_stream( - "libmp3lame", rate=target_sample_rate - ) - out_stream.bit_rate = MP3_BITRATE - - decoders = [c.decode(audio=0) for c in containers] - active = [True] * len(decoders) - resamplers = [ - AudioResampler( - format="s32", layout="stereo", rate=target_sample_rate - ) - for _ in decoders - ] - - duration_samples = 0 - - while any(active): - for i, (dec, is_active) in enumerate(zip(decoders, active)): - if not is_active: - continue - try: - frame = next(dec) - except StopIteration: - active[i] = False - inputs[i].push(None) - continue - - if frame.sample_rate != target_sample_rate: - continue - - out_frames = resamplers[i].resample(frame) or [] - for rf in out_frames: - rf.sample_rate = target_sample_rate - rf.time_base = Fraction(1, target_sample_rate) - inputs[i].push(rf) - - while True: - try: - mixed = sink.pull() - except Exception: - break - mixed.sample_rate = target_sample_rate - mixed.time_base = Fraction(1, target_sample_rate) - duration_samples += mixed.samples - for packet in out_stream.encode(mixed): - out_container.mux(packet) - - # Flush remaining - while True: - try: - mixed = sink.pull() - except Exception: - break - mixed.sample_rate = target_sample_rate - mixed.time_base = Fraction(1, target_sample_rate) - duration_samples += mixed.samples - for packet in out_stream.encode(mixed): - out_container.mux(packet) - - for packet in out_stream.encode(None): - out_container.mux(packet) - - # Get file size and duration - file_size = Path(temp_path).stat().st_size - duration = ( - duration_samples / target_sample_rate if target_sample_rate else 0 - ) - - # Upload to S3 - storage_path = f"{transcript_id}/audio.mp3" - with open(temp_path, "rb") as mp3_file: - await storage.put_file(storage_path, mp3_file) - - finally: - Path(temp_path).unlink(missing_ok=True) - - finally: - for c in containers: - try: - c.close() - except Exception: - pass - - return { - "audio_key": storage_path, - "duration": duration, - "size": file_size, - } - - try: - result = asyncio.run(_process()) - task_result.status = TaskResultStatus.COMPLETED - task_result.output_data = result - - logger.info( - "[Worker] mixdown_tracks complete", - audio_key=result["audio_key"], - duration=result["duration"], - size=result["size"], - ) - - if transcript_id: - emit_progress( - transcript_id, "mixdown_tracks", "completed", task.workflow_instance_id - ) - - except Exception as e: - logger.error("[Worker] mixdown_tracks failed", error=str(e), exc_info=True) - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = str(e) - if transcript_id: - emit_progress( - transcript_id, "mixdown_tracks", "failed", task.workflow_instance_id - ) - - return task_result diff --git a/server/reflector/conductor/workers/pad_track.py b/server/reflector/conductor/workers/pad_track.py deleted file mode 100644 index a2c6e36d..00000000 --- a/server/reflector/conductor/workers/pad_track.py +++ /dev/null @@ -1,322 +0,0 @@ -"""Conductor worker: pad_track - Pad audio track with silence for alignment. - -This worker extracts stream.start_time from WebM container metadata and applies -silence padding using PyAV filter graph (adelay). The padded audio is uploaded -to S3 and a presigned URL is returned. -""" - -import math -import tempfile -from fractions import Fraction -from pathlib import Path - -import av -from av.audio.resampler import AudioResampler - -from conductor.client.http.models import Task, TaskResult -from conductor.client.http.models.task_result_status import TaskResultStatus -from conductor.client.worker.worker_task import worker_task -from reflector.conductor.progress import emit_progress -from reflector.logger import logger - -# Audio constants matching existing pipeline -OPUS_STANDARD_SAMPLE_RATE = 48000 -OPUS_DEFAULT_BIT_RATE = 64000 -PRESIGNED_URL_EXPIRATION_SECONDS = 7200 - - -def _extract_stream_start_time_from_container(container, track_idx: int) -> float: - """Extract meeting-relative start time from WebM stream metadata. - - Uses PyAV to read stream.start_time from WebM container. - More accurate than filename timestamps by ~209ms due to network/encoding delays. - - Args: - container: PyAV container object - track_idx: Track index for logging - - Returns: - Start time in seconds (0.0 if not found) - """ - start_time_seconds = 0.0 - try: - audio_streams = [s for s in container.streams if s.type == "audio"] - stream = audio_streams[0] if audio_streams else container.streams[0] - - # 1) Try stream-level start_time (most reliable for Daily.co tracks) - if stream.start_time is not None and stream.time_base is not None: - start_time_seconds = float(stream.start_time * stream.time_base) - - # 2) Fallback to container-level start_time (in av.time_base units) - if (start_time_seconds <= 0) and (container.start_time is not None): - start_time_seconds = float(container.start_time * av.time_base) - - # 3) Fallback to first packet DTS in stream.time_base - if start_time_seconds <= 0: - for packet in container.demux(stream): - if packet.dts is not None: - start_time_seconds = float(packet.dts * stream.time_base) - break - except Exception as e: - logger.warning( - "PyAV metadata read failed; assuming 0 start_time", - track_idx=track_idx, - error=str(e), - ) - start_time_seconds = 0.0 - - logger.info( - f"Track {track_idx} stream metadata: start_time={start_time_seconds:.3f}s", - track_idx=track_idx, - ) - return start_time_seconds - - -def _apply_audio_padding_to_file( - in_container, - output_path: str, - start_time_seconds: float, - track_idx: int, -) -> None: - """Apply silence padding to audio track using PyAV filter graph. - - Filter chain: abuffer -> aresample -> adelay -> abuffersink - - Args: - in_container: PyAV input container - output_path: Path to write padded output - start_time_seconds: Amount of silence to prepend - track_idx: Track index for logging - """ - delay_ms = math.floor(start_time_seconds * 1000) - - logger.info( - f"Padding track {track_idx} with {delay_ms}ms delay using PyAV", - track_idx=track_idx, - delay_ms=delay_ms, - ) - - with av.open(output_path, "w", format="webm") as out_container: - in_stream = next((s for s in in_container.streams if s.type == "audio"), None) - if in_stream is None: - raise Exception("No audio stream in input") - - out_stream = out_container.add_stream("libopus", rate=OPUS_STANDARD_SAMPLE_RATE) - out_stream.bit_rate = OPUS_DEFAULT_BIT_RATE - graph = av.filter.Graph() - - abuf_args = ( - f"time_base=1/{OPUS_STANDARD_SAMPLE_RATE}:" - f"sample_rate={OPUS_STANDARD_SAMPLE_RATE}:" - f"sample_fmt=s16:" - f"channel_layout=stereo" - ) - src = graph.add("abuffer", args=abuf_args, name="src") - aresample_f = graph.add("aresample", args="async=1", name="ares") - # adelay requires one delay value per channel separated by '|' - delays_arg = f"{delay_ms}|{delay_ms}" - adelay_f = graph.add("adelay", args=f"delays={delays_arg}:all=1", name="delay") - sink = graph.add("abuffersink", name="sink") - - src.link_to(aresample_f) - aresample_f.link_to(adelay_f) - adelay_f.link_to(sink) - graph.configure() - - resampler = AudioResampler( - format="s16", layout="stereo", rate=OPUS_STANDARD_SAMPLE_RATE - ) - - # Decode -> resample -> push through graph -> encode Opus - for frame in in_container.decode(in_stream): - out_frames = resampler.resample(frame) or [] - for rframe in out_frames: - rframe.sample_rate = OPUS_STANDARD_SAMPLE_RATE - rframe.time_base = Fraction(1, OPUS_STANDARD_SAMPLE_RATE) - src.push(rframe) - - while True: - try: - f_out = sink.pull() - except Exception: - break - f_out.sample_rate = OPUS_STANDARD_SAMPLE_RATE - f_out.time_base = Fraction(1, OPUS_STANDARD_SAMPLE_RATE) - for packet in out_stream.encode(f_out): - out_container.mux(packet) - - # Flush remaining frames - src.push(None) - while True: - try: - f_out = sink.pull() - except Exception: - break - f_out.sample_rate = OPUS_STANDARD_SAMPLE_RATE - f_out.time_base = Fraction(1, OPUS_STANDARD_SAMPLE_RATE) - for packet in out_stream.encode(f_out): - out_container.mux(packet) - - for packet in out_stream.encode(None): - out_container.mux(packet) - - -@worker_task(task_definition_name="pad_track") -def pad_track(task: Task) -> TaskResult: - """Pad audio track with silence for alignment. - - Input: - track_index: int - Index of the track - s3_key: str - S3 key of the source audio file - bucket_name: str - S3 bucket name - transcript_id: str - Transcript ID for storage path - - Output: - padded_url: str - Presigned URL of padded track - size: int - File size in bytes - track_index: int - Track index (echoed back) - """ - track_index = task.input_data.get("track_index", 0) - s3_key = task.input_data.get("s3_key") - bucket_name = task.input_data.get("bucket_name") - transcript_id = task.input_data.get("transcript_id") - - logger.info( - "[Worker] pad_track", - track_index=track_index, - s3_key=s3_key, - transcript_id=transcript_id, - ) - - if transcript_id: - emit_progress( - transcript_id, "pad_track", "in_progress", task.workflow_instance_id - ) - - task_result = TaskResult( - task_id=task.task_id, - workflow_instance_id=task.workflow_instance_id, - worker_id=task.worker_id, - ) - - if not s3_key or not transcript_id: - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = "Missing s3_key or transcript_id" - return task_result - - import asyncio - - async def _process(): - # Create fresh storage instance to avoid aioboto3 fork issues - from reflector.settings import settings - from reflector.storage.storage_aws import AwsStorage - - storage = AwsStorage( - aws_bucket_name=settings.TRANSCRIPT_STORAGE_AWS_BUCKET_NAME, - aws_region=settings.TRANSCRIPT_STORAGE_AWS_REGION, - aws_access_key_id=settings.TRANSCRIPT_STORAGE_AWS_ACCESS_KEY_ID, - aws_secret_access_key=settings.TRANSCRIPT_STORAGE_AWS_SECRET_ACCESS_KEY, - ) - - # Get presigned URL for source file - source_url = await storage.get_file_url( - s3_key, - operation="get_object", - expires_in=PRESIGNED_URL_EXPIRATION_SECONDS, - bucket=bucket_name, - ) - - # Open container and extract start time - with av.open(source_url) as in_container: - start_time_seconds = _extract_stream_start_time_from_container( - in_container, track_index - ) - - # If no padding needed, return original URL - if start_time_seconds <= 0: - logger.info( - f"Track {track_index} requires no padding", - track_index=track_index, - ) - return { - "padded_url": source_url, - "size": 0, - "track_index": track_index, - } - - # Create temp file for padded output - with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as temp_file: - temp_path = temp_file.name - - try: - _apply_audio_padding_to_file( - in_container, temp_path, start_time_seconds, track_index - ) - - # Get file size - file_size = Path(temp_path).stat().st_size - - # Upload using storage layer (use separate path in shadow mode to avoid conflicts) - storage_path = f"file_pipeline_conductor/{transcript_id}/tracks/padded_{track_index}.webm" - - logger.info( - f"About to upload padded track", - key=storage_path, - size=file_size, - ) - - with open(temp_path, "rb") as padded_file: - upload_result = await storage.put_file(storage_path, padded_file) - logger.info( - f"storage.put_file returned", - result=str(upload_result), - ) - - logger.info( - f"Uploaded padded track to S3", - key=storage_path, - size=file_size, - ) - - finally: - Path(temp_path).unlink(missing_ok=True) - - # Get presigned URL for padded file - padded_url = await storage.get_file_url( - storage_path, - operation="get_object", - expires_in=PRESIGNED_URL_EXPIRATION_SECONDS, - ) - - return { - "padded_url": padded_url, - "size": file_size, - "track_index": track_index, - } - - try: - result = asyncio.run(_process()) - task_result.status = TaskResultStatus.COMPLETED - task_result.output_data = result - - logger.info( - "[Worker] pad_track complete", - track_index=track_index, - padded_url=result["padded_url"][:50] + "...", - ) - - if transcript_id: - emit_progress( - transcript_id, "pad_track", "completed", task.workflow_instance_id - ) - - except Exception as e: - logger.error("[Worker] pad_track failed", error=str(e), exc_info=True) - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = str(e) - if transcript_id: - emit_progress( - transcript_id, "pad_track", "failed", task.workflow_instance_id - ) - - return task_result diff --git a/server/reflector/conductor/workers/post_zulip.py b/server/reflector/conductor/workers/post_zulip.py deleted file mode 100644 index e668b3da..00000000 --- a/server/reflector/conductor/workers/post_zulip.py +++ /dev/null @@ -1,101 +0,0 @@ -"""Conductor worker: post_zulip - Post or update Zulip message with transcript summary.""" - -from conductor.client.http.models import Task, TaskResult -from conductor.client.http.models.task_result_status import TaskResultStatus -from conductor.client.worker.worker_task import worker_task -from reflector.conductor.progress import emit_progress -from reflector.logger import logger -from reflector.settings import settings - - -@worker_task(task_definition_name="post_zulip") -def post_zulip(task: Task) -> TaskResult: - """Post or update a Zulip message with the transcript summary. - - Input: - transcript_id: str - Transcript ID - - Output: - message_id: str | None - Zulip message ID - """ - transcript_id = task.input_data.get("transcript_id") - - logger.info("[Worker] post_zulip", transcript_id=transcript_id) - - if transcript_id: - emit_progress( - transcript_id, "post_zulip", "in_progress", task.workflow_instance_id - ) - - task_result = TaskResult( - task_id=task.task_id, - workflow_instance_id=task.workflow_instance_id, - worker_id=task.worker_id, - ) - - if not transcript_id: - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = "Missing transcript_id" - return task_result - - # Check if Zulip is configured - if not settings.ZULIP_REALM or not settings.ZULIP_API_KEY: - logger.info("[Worker] post_zulip: Zulip not configured, skipping") - task_result.status = TaskResultStatus.COMPLETED - task_result.output_data = {"message_id": None} - return task_result - - import asyncio - - async def _process(): - import databases - - from reflector.db import _database_context - from reflector.db.transcripts import transcripts_controller - from reflector.settings import settings as app_settings - from reflector.zulip import post_transcript_to_zulip - - # Create fresh database connection for subprocess (not shared from parent) - _database_context.set(None) - db = databases.Database(app_settings.DATABASE_URL) - _database_context.set(db) - await db.connect() - - try: - transcript = await transcripts_controller.get_by_id(transcript_id) - if transcript is None: - raise ValueError(f"Transcript {transcript_id} not found in database") - message_id = await post_transcript_to_zulip(transcript) - return message_id - finally: - await db.disconnect() - _database_context.set(None) - - try: - message_id = asyncio.run(_process()) - task_result.status = TaskResultStatus.COMPLETED - task_result.output_data = { - "message_id": str(message_id) if message_id else None - } - - logger.info( - "[Worker] post_zulip complete", - transcript_id=transcript_id, - message_id=message_id, - ) - - if transcript_id: - emit_progress( - transcript_id, "post_zulip", "completed", task.workflow_instance_id - ) - - except Exception as e: - logger.error("[Worker] post_zulip failed", error=str(e), exc_info=True) - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = str(e) - if transcript_id: - emit_progress( - transcript_id, "post_zulip", "failed", task.workflow_instance_id - ) - - return task_result diff --git a/server/reflector/conductor/workers/send_webhook.py b/server/reflector/conductor/workers/send_webhook.py deleted file mode 100644 index b6ed9514..00000000 --- a/server/reflector/conductor/workers/send_webhook.py +++ /dev/null @@ -1,115 +0,0 @@ -"""Conductor worker: send_webhook - Send transcript completion webhook.""" - -from conductor.client.http.models import Task, TaskResult -from conductor.client.http.models.task_result_status import TaskResultStatus -from conductor.client.worker.worker_task import worker_task -from reflector.conductor.progress import emit_progress -from reflector.logger import logger - - -@worker_task(task_definition_name="send_webhook") -def send_webhook(task: Task) -> TaskResult: - """Send the transcript completion webhook to the configured URL. - - Input: - transcript_id: str - Transcript ID - room_id: str - Room ID - - Output: - sent: bool - Whether webhook was sent - status_code: int | None - HTTP status code - """ - transcript_id = task.input_data.get("transcript_id") - room_id = task.input_data.get("room_id") - - logger.info("[Worker] send_webhook", transcript_id=transcript_id, room_id=room_id) - - if transcript_id: - emit_progress( - transcript_id, "send_webhook", "in_progress", task.workflow_instance_id - ) - - task_result = TaskResult( - task_id=task.task_id, - workflow_instance_id=task.workflow_instance_id, - worker_id=task.worker_id, - ) - - if not transcript_id: - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = "Missing transcript_id" - return task_result - - import asyncio - - async def _process(): - import databases - - from reflector.db import _database_context - from reflector.db.rooms import rooms_controller - from reflector.db.transcripts import transcripts_controller - from reflector.settings import settings - from reflector.worker.webhook import send_transcript_webhook - - # Create fresh database connection for subprocess (not shared from parent) - _database_context.set(None) - db = databases.Database(settings.DATABASE_URL) - _database_context.set(db) - await db.connect() - - try: - transcript = await transcripts_controller.get_by_id(transcript_id) - if transcript is None: - raise ValueError(f"Transcript {transcript_id} not found in database") - - # Get room for webhook URL - room = None - if room_id: - try: - room = await rooms_controller.get_by_id(room_id) - except Exception: - pass - - if not room or not room.webhook_url: - logger.info( - "[Worker] send_webhook: No webhook URL configured", - transcript_id=transcript_id, - ) - return False, None - - status_code = await send_transcript_webhook(transcript, room) - return True, status_code - finally: - await db.disconnect() - _database_context.set(None) - - try: - sent, status_code = asyncio.run(_process()) - task_result.status = TaskResultStatus.COMPLETED - task_result.output_data = { - "sent": sent, - "status_code": status_code, - } - - logger.info( - "[Worker] send_webhook complete", - transcript_id=transcript_id, - sent=sent, - status_code=status_code, - ) - - if transcript_id: - emit_progress( - transcript_id, "send_webhook", "completed", task.workflow_instance_id - ) - - except Exception as e: - logger.error("[Worker] send_webhook failed", error=str(e), exc_info=True) - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = str(e) - if transcript_id: - emit_progress( - transcript_id, "send_webhook", "failed", task.workflow_instance_id - ) - - return task_result diff --git a/server/reflector/conductor/workers/transcribe_track.py b/server/reflector/conductor/workers/transcribe_track.py deleted file mode 100644 index 008ce8d1..00000000 --- a/server/reflector/conductor/workers/transcribe_track.py +++ /dev/null @@ -1,96 +0,0 @@ -"""Conductor worker: transcribe_track - Transcribe audio track using GPU service.""" - -from conductor.client.http.models import Task, TaskResult -from conductor.client.http.models.task_result_status import TaskResultStatus -from conductor.client.worker.worker_task import worker_task -from reflector.conductor.progress import emit_progress -from reflector.logger import logger - - -@worker_task(task_definition_name="transcribe_track") -def transcribe_track(task: Task) -> TaskResult: - """Transcribe audio track using GPU (Modal.com) or local Whisper. - - Input: - track_index: int - Index of the track - audio_url: str - Presigned URL of the audio file - language: str - Language code (default: "en") - transcript_id: str - Transcript ID for progress tracking - - Output: - words: list[dict] - List of transcribed words with timestamps and speaker - track_index: int - Track index (echoed back) - """ - track_index = task.input_data.get("track_index", 0) - audio_url = task.input_data.get("audio_url") - language = task.input_data.get("language", "en") - transcript_id = task.input_data.get("transcript_id") - - logger.info("[Worker] transcribe_track", track_index=track_index, language=language) - - if transcript_id: - emit_progress( - transcript_id, "transcribe_track", "in_progress", task.workflow_instance_id - ) - - task_result = TaskResult( - task_id=task.task_id, - workflow_instance_id=task.workflow_instance_id, - worker_id=task.worker_id, - ) - - if not audio_url: - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = "Missing audio_url" - return task_result - - import asyncio - - async def _process(): - from reflector.pipelines.transcription_helpers import ( - transcribe_file_with_processor, - ) - - transcript = await transcribe_file_with_processor(audio_url, language) - - # Tag all words with speaker index - words = [] - for word in transcript.words: - word_dict = word.model_dump() - word_dict["speaker"] = track_index - words.append(word_dict) - - return words - - try: - words = asyncio.run(_process()) - task_result.status = TaskResultStatus.COMPLETED - task_result.output_data = { - "words": words, - "track_index": track_index, - } - - logger.info( - "[Worker] transcribe_track complete", - track_index=track_index, - word_count=len(words), - ) - - if transcript_id: - emit_progress( - transcript_id, - "transcribe_track", - "completed", - task.workflow_instance_id, - ) - - except Exception as e: - logger.error("[Worker] transcribe_track failed", error=str(e), exc_info=True) - task_result.status = TaskResultStatus.FAILED - task_result.reason_for_incompletion = str(e) - if transcript_id: - emit_progress( - transcript_id, "transcribe_track", "failed", task.workflow_instance_id - ) - - return task_result diff --git a/server/reflector/conductor/workflows/diarization_pipeline.json b/server/reflector/conductor/workflows/diarization_pipeline.json deleted file mode 100644 index 4810e57e..00000000 --- a/server/reflector/conductor/workflows/diarization_pipeline.json +++ /dev/null @@ -1,205 +0,0 @@ -{ - "name": "diarization_pipeline", - "description": "Reflector multitrack diarization pipeline", - "version": 1, - "schemaVersion": 2, - "inputParameters": [ - "recording_id", - "room_name", - "tracks", - "bucket_name", - "transcript_id", - "room_id" - ], - "tasks": [ - { - "name": "get_recording", - "taskReferenceName": "get_recording", - "type": "SIMPLE", - "inputParameters": { - "recording_id": "${workflow.input.recording_id}", - "transcript_id": "${workflow.input.transcript_id}" - } - }, - { - "name": "get_participants", - "taskReferenceName": "get_participants", - "type": "SIMPLE", - "inputParameters": { - "mtg_session_id": "${get_recording.output.mtg_session_id}", - "transcript_id": "${workflow.input.transcript_id}" - } - }, - { - "name": "generate_dynamic_fork_tasks", - "taskReferenceName": "generate_padding_tasks", - "type": "SIMPLE", - "inputParameters": { - "tracks": "${workflow.input.tracks}", - "task_type": "pad_track", - "transcript_id": "${workflow.input.transcript_id}", - "bucket_name": "${workflow.input.bucket_name}" - } - }, - { - "name": "fork_track_padding", - "taskReferenceName": "fork_track_padding", - "type": "FORK_JOIN_DYNAMIC", - "inputParameters": { - "dynamicTasks": "${generate_padding_tasks.output.tasks}", - "dynamicTasksInput": "${generate_padding_tasks.output.inputs}" - }, - "dynamicForkTasksParam": "dynamicTasks", - "dynamicForkTasksInputParamName": "dynamicTasksInput" - }, - { - "name": "join_padding", - "taskReferenceName": "join_padding", - "type": "JOIN" - }, - { - "name": "mixdown_tracks", - "taskReferenceName": "mixdown_tracks", - "type": "SIMPLE", - "inputParameters": { - "padded_urls": "${join_padding.output..padded_url}", - "transcript_id": "${workflow.input.transcript_id}" - } - }, - { - "name": "generate_waveform", - "taskReferenceName": "generate_waveform", - "type": "SIMPLE", - "inputParameters": { - "audio_key": "${mixdown_tracks.output.audio_key}", - "transcript_id": "${workflow.input.transcript_id}" - } - }, - { - "name": "generate_dynamic_fork_tasks", - "taskReferenceName": "generate_transcription_tasks", - "type": "SIMPLE", - "inputParameters": { - "tracks": "${workflow.input.tracks}", - "task_type": "transcribe_track", - "transcript_id": "${workflow.input.transcript_id}", - "padded_urls": "${join_padding.output}" - } - }, - { - "name": "fork_transcription", - "taskReferenceName": "fork_transcription", - "type": "FORK_JOIN_DYNAMIC", - "inputParameters": { - "dynamicTasks": "${generate_transcription_tasks.output.tasks}", - "dynamicTasksInput": "${generate_transcription_tasks.output.inputs}" - }, - "dynamicForkTasksParam": "dynamicTasks", - "dynamicForkTasksInputParamName": "dynamicTasksInput" - }, - { - "name": "join_transcription", - "taskReferenceName": "join_transcription", - "type": "JOIN" - }, - { - "name": "merge_transcripts", - "taskReferenceName": "merge_transcripts", - "type": "SIMPLE", - "inputParameters": { - "transcripts": "${join_transcription.output}", - "transcript_id": "${workflow.input.transcript_id}" - } - }, - { - "name": "detect_topics", - "taskReferenceName": "detect_topics", - "type": "SIMPLE", - "inputParameters": { - "words": "${merge_transcripts.output.all_words}", - "transcript_id": "${workflow.input.transcript_id}", - "target_language": "en" - } - }, - { - "name": "fork_generation", - "taskReferenceName": "fork_generation", - "type": "FORK_JOIN", - "forkTasks": [ - [ - { - "name": "generate_title", - "taskReferenceName": "generate_title", - "type": "SIMPLE", - "inputParameters": { - "topics": "${detect_topics.output.topics}", - "transcript_id": "${workflow.input.transcript_id}" - } - } - ], - [ - { - "name": "generate_summary", - "taskReferenceName": "generate_summary", - "type": "SIMPLE", - "inputParameters": { - "words": "${merge_transcripts.output.all_words}", - "topics": "${detect_topics.output.topics}", - "transcript_id": "${workflow.input.transcript_id}" - } - } - ] - ] - }, - { - "name": "join_generation", - "taskReferenceName": "join_generation", - "type": "JOIN", - "joinOn": ["generate_title", "generate_summary"] - }, - { - "name": "finalize", - "taskReferenceName": "finalize", - "type": "SIMPLE", - "inputParameters": { - "transcript_id": "${workflow.input.transcript_id}", - "title": "${generate_title.output.title}", - "summary": "${generate_summary.output.summary}", - "short_summary": "${generate_summary.output.short_summary}", - "duration": "${mixdown_tracks.output.duration}" - } - }, - { - "name": "cleanup_consent", - "taskReferenceName": "cleanup_consent", - "type": "SIMPLE", - "inputParameters": { - "transcript_id": "${workflow.input.transcript_id}" - } - }, - { - "name": "post_zulip", - "taskReferenceName": "post_zulip", - "type": "SIMPLE", - "inputParameters": { - "transcript_id": "${workflow.input.transcript_id}" - } - }, - { - "name": "send_webhook", - "taskReferenceName": "send_webhook", - "type": "SIMPLE", - "inputParameters": { - "transcript_id": "${workflow.input.transcript_id}", - "room_id": "${workflow.input.room_id}" - } - } - ], - "outputParameters": { - "transcript_id": "${workflow.input.transcript_id}", - "title": "${generate_title.output.title}", - "summary": "${generate_summary.output.summary}", - "duration": "${mixdown_tracks.output.duration}", - "word_count": "${merge_transcripts.output.word_count}" - } -} diff --git a/server/reflector/conductor/workflows/register.py b/server/reflector/conductor/workflows/register.py deleted file mode 100644 index fadae51c..00000000 --- a/server/reflector/conductor/workflows/register.py +++ /dev/null @@ -1,74 +0,0 @@ -"""Register workflow definition with Conductor server.""" - -import json -from pathlib import Path - -import httpx - -from reflector.logger import logger -from reflector.settings import settings - - -def register_workflow() -> None: - """Register the diarization pipeline workflow with Conductor server. - - Raises: - httpx.HTTPStatusError: If registration fails. - """ - workflow_path = Path(__file__).parent / "diarization_pipeline.json" - - with open(workflow_path) as f: - workflow = json.load(f) - - base_url = settings.CONDUCTOR_SERVER_URL.rstrip("/") - url = f"{base_url}/metadata/workflow" - - logger.info( - "Registering workflow", - name=workflow["name"], - version=workflow["version"], - url=url, - ) - - with httpx.Client(timeout=30.0) as client: - resp = client.put( - url, - json=[workflow], - headers={"Content-Type": "application/json"}, - ) - resp.raise_for_status() - - logger.info("Workflow registered successfully", name=workflow["name"]) - - -async def register_workflow_async() -> None: - """Async version of register_workflow.""" - workflow_path = Path(__file__).parent / "diarization_pipeline.json" - - with open(workflow_path) as f: - workflow = json.load(f) - - base_url = settings.CONDUCTOR_SERVER_URL.rstrip("/") - url = f"{base_url}/metadata/workflow" - - logger.info( - "Registering workflow", - name=workflow["name"], - version=workflow["version"], - url=url, - ) - - async with httpx.AsyncClient(timeout=30.0) as client: - resp = await client.put( - url, - json=[workflow], - headers={"Content-Type": "application/json"}, - ) - resp.raise_for_status() - - logger.info("Workflow registered successfully", name=workflow["name"]) - - -if __name__ == "__main__": - register_workflow() - print("Workflow registration complete!") diff --git a/server/reflector/db/recordings.py b/server/reflector/db/recordings.py index 26eb5275..18fe358b 100644 --- a/server/reflector/db/recordings.py +++ b/server/reflector/db/recordings.py @@ -22,9 +22,7 @@ recordings = sa.Table( ), sa.Column("meeting_id", sa.String), sa.Column("track_keys", sa.JSON, nullable=True), - sa.Column("workflow_id", sa.String, nullable=True), sa.Index("idx_recording_meeting_id", "meeting_id"), - sa.Index("idx_recording_workflow_id", "workflow_id"), ) @@ -40,8 +38,6 @@ class Recording(BaseModel): # track_keys can be empty list [] if recording finished but no audio was captured (silence/muted) # None means not a multitrack recording, [] means multitrack with no tracks track_keys: list[str] | None = None - # Conductor workflow ID for tracking pipeline execution - workflow_id: str | None = None @property def is_multitrack(self) -> bool: diff --git a/server/reflector/services/transcript_process.py b/server/reflector/services/transcript_process.py index 1c386a86..06f2e6d6 100644 --- a/server/reflector/services/transcript_process.py +++ b/server/reflector/services/transcript_process.py @@ -12,7 +12,6 @@ from typing import Literal, Union, assert_never import celery from celery.result import AsyncResult -from reflector.conductor.client import ConductorClientManager from reflector.db.recordings import recordings_controller from reflector.db.transcripts import Transcript from reflector.hatchet.client import HatchetClientManager @@ -263,26 +262,6 @@ def dispatch_transcript_processing( logger.info("Hatchet workflow dispatched", workflow_id=workflow_id) durable_started = True - elif settings.CONDUCTOR_ENABLED: - workflow_id = ConductorClientManager.start_workflow( - name="diarization_pipeline", - version=1, - input_data={ - "recording_id": config.recording_id, - "room_name": None, # Not available in reprocess path - "tracks": [{"s3_key": k} for k in config.track_keys], - "bucket_name": config.bucket_name, - "transcript_id": config.transcript_id, - "room_id": config.room_id, - }, - ) - logger.info( - "Started Conductor workflow (reprocess)", - workflow_id=workflow_id, - transcript_id=config.transcript_id, - ) - durable_started = True - # If durable workflow started and not in shadow mode, skip Celery if durable_started and not settings.DURABLE_WORKFLOW_SHADOW_MODE: return None diff --git a/server/reflector/settings.py b/server/reflector/settings.py index b0c49907..1d7b3e45 100644 --- a/server/reflector/settings.py +++ b/server/reflector/settings.py @@ -151,33 +151,19 @@ class Settings(BaseSettings): ZULIP_BOT_EMAIL: str | None = None # Durable workflow orchestration - # Provider: "hatchet" or "conductor" (or "none" to disable) + # Provider: "hatchet" (or "none" to disable) DURABLE_WORKFLOW_PROVIDER: str = "none" DURABLE_WORKFLOW_SHADOW_MODE: bool = False # Run both provider + Celery - # Conductor workflow orchestration - CONDUCTOR_SERVER_URL: str = "http://conductor:8080/api" - CONDUCTOR_DEBUG: bool = False - # Hatchet workflow orchestration HATCHET_CLIENT_TOKEN: str | None = None HATCHET_CLIENT_TLS_STRATEGY: str = "none" # none, tls, mtls HATCHET_DEBUG: bool = False - @property - def CONDUCTOR_ENABLED(self) -> bool: - """Legacy compatibility: True if Conductor is the active provider.""" - return self.DURABLE_WORKFLOW_PROVIDER == "conductor" - @property def HATCHET_ENABLED(self) -> bool: """True if Hatchet is the active provider.""" return self.DURABLE_WORKFLOW_PROVIDER == "hatchet" - @property - def CONDUCTOR_SHADOW_MODE(self) -> bool: - """Legacy compatibility for shadow mode.""" - return self.DURABLE_WORKFLOW_SHADOW_MODE and self.CONDUCTOR_ENABLED - settings = Settings() diff --git a/server/reflector/views/conductor.py b/server/reflector/views/conductor.py deleted file mode 100644 index 6394965e..00000000 --- a/server/reflector/views/conductor.py +++ /dev/null @@ -1,45 +0,0 @@ -"""Conductor health and status endpoints.""" - -import httpx -from fastapi import APIRouter - -from reflector.settings import settings - -router = APIRouter(prefix="/conductor", tags=["conductor"]) - - -@router.get("/health") -async def conductor_health(): - """Check Conductor server connectivity and status.""" - if not settings.CONDUCTOR_ENABLED: - return {"status": "disabled", "connected": False} - - # Extract base URL (remove /api suffix for health check) - base_url = settings.CONDUCTOR_SERVER_URL.rstrip("/api").rstrip("/") - health_url = f"{base_url}/health" - - try: - async with httpx.AsyncClient(timeout=5.0) as client: - resp = await client.get(health_url) - if resp.status_code == 200: - return {"status": "healthy", "connected": True} - else: - return { - "status": "unhealthy", - "connected": True, - "error": f"Health check returned {resp.status_code}", - } - except httpx.TimeoutException: - return { - "status": "unhealthy", - "connected": False, - "error": "Connection timeout", - } - except httpx.ConnectError as e: - return { - "status": "unhealthy", - "connected": False, - "error": f"Connection failed: {e}", - } - except Exception as e: - return {"status": "unhealthy", "connected": False, "error": str(e)} diff --git a/server/reflector/worker/process.py b/server/reflector/worker/process.py index 26d4e1d9..4309b486 100644 --- a/server/reflector/worker/process.py +++ b/server/reflector/worker/process.py @@ -286,7 +286,7 @@ async def _process_multitrack_recording_inner( room_id=room.id, ) - # Start durable workflow if enabled (Hatchet or Conductor) + # Start durable workflow if enabled (Hatchet) durable_started = False if settings.HATCHET_ENABLED: @@ -309,33 +309,10 @@ async def _process_multitrack_recording_inner( transcript_id=transcript.id, ) - # Store workflow_id on recording for status tracking - await recordings_controller.update(recording, {"workflow_id": workflow_id}) - durable_started = True - - elif settings.CONDUCTOR_ENABLED: - from reflector.conductor.client import ConductorClientManager # noqa: PLC0415 - - workflow_id = ConductorClientManager.start_workflow( - name="diarization_pipeline", - version=1, - input_data={ - "recording_id": recording_id, - "room_name": daily_room_name, - "tracks": [{"s3_key": k} for k in filter_cam_audio_tracks(track_keys)], - "bucket_name": bucket_name, - "transcript_id": transcript.id, - "room_id": room.id, - }, + # Store workflow_run_id on transcript for replay/resume + await transcripts_controller.update( + transcript, {"workflow_run_id": workflow_id} ) - logger.info( - "Started Conductor workflow", - workflow_id=workflow_id, - transcript_id=transcript.id, - ) - - # Store workflow_id on recording for status tracking - await recordings_controller.update(recording, {"workflow_id": workflow_id}) durable_started = True # If durable workflow started and not in shadow mode, skip Celery diff --git a/server/runserver.sh b/server/runserver.sh index b1b52ab2..3b8976db 100755 --- a/server/runserver.sh +++ b/server/runserver.sh @@ -7,8 +7,6 @@ elif [ "${ENTRYPOINT}" = "worker" ]; then uv run celery -A reflector.worker.app worker --loglevel=info elif [ "${ENTRYPOINT}" = "beat" ]; then uv run celery -A reflector.worker.app beat --loglevel=info -elif [ "${ENTRYPOINT}" = "conductor-worker" ]; then - uv run python -m reflector.conductor.run_workers elif [ "${ENTRYPOINT}" = "hatchet-worker" ]; then uv run python -m reflector.hatchet.run_workers else diff --git a/server/tests/test_hatchet_client.py b/server/tests/test_hatchet_client.py new file mode 100644 index 00000000..8336440c --- /dev/null +++ b/server/tests/test_hatchet_client.py @@ -0,0 +1,59 @@ +""" +Tests for HatchetClientManager error handling and validation. + +Only tests that catch real bugs - not mock verification tests. +""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + + +@pytest.mark.asyncio +async def test_hatchet_client_can_replay_handles_exception(): + """Test can_replay returns False when status check fails. + + Useful: Ensures network/API errors don't crash the system and + gracefully allow reprocessing when workflow state is unknown. + """ + from reflector.hatchet.client import HatchetClientManager + + HatchetClientManager._instance = None + + with patch("reflector.hatchet.client.settings") as mock_settings: + mock_settings.HATCHET_CLIENT_TOKEN = "test-token" + mock_settings.HATCHET_DEBUG = False + + with patch("reflector.hatchet.client.Hatchet") as mock_hatchet_class: + mock_client = MagicMock() + mock_hatchet_class.return_value = mock_client + + mock_client.runs.aio_get_status = AsyncMock( + side_effect=Exception("Network error") + ) + + can_replay = await HatchetClientManager.can_replay("workflow-123") + + # Should return False on error (workflow might be gone) + assert can_replay is False + + HatchetClientManager._instance = None + + +def test_hatchet_client_raises_without_token(): + """Test that get_client raises ValueError without token. + + Useful: Catches if someone removes the token validation, + which would cause cryptic errors later. + """ + from reflector.hatchet.client import HatchetClientManager + + HatchetClientManager._instance = None + + with patch("reflector.hatchet.client.settings") as mock_settings: + mock_settings.HATCHET_CLIENT_TOKEN = None + + with pytest.raises(ValueError, match="HATCHET_CLIENT_TOKEN must be set"): + HatchetClientManager.get_client() + + HatchetClientManager._instance = None diff --git a/server/tests/test_hatchet_dispatch.py b/server/tests/test_hatchet_dispatch.py new file mode 100644 index 00000000..99b5bcd2 --- /dev/null +++ b/server/tests/test_hatchet_dispatch.py @@ -0,0 +1,376 @@ +""" +Tests for Hatchet workflow dispatch and routing logic. + +These tests verify: +1. Routing to Hatchet when HATCHET_ENABLED=True +2. Replay logic for failed workflows +3. Force flag to cancel and restart +4. Validation prevents concurrent workflows +""" + +from unittest.mock import AsyncMock, patch + +import pytest + +from reflector.db.transcripts import Transcript + + +@pytest.mark.usefixtures("setup_database") +@pytest.mark.asyncio +async def test_hatchet_validation_blocks_running_workflow(): + """Test that validation blocks reprocessing when workflow is running.""" + from reflector.services.transcript_process import ( + ValidationAlreadyScheduled, + validate_transcript_for_processing, + ) + + mock_transcript = Transcript( + id="test-transcript-id", + name="Test", + status="processing", + source_kind="room", + workflow_run_id="running-workflow-123", + ) + + with patch("reflector.services.transcript_process.settings") as mock_settings: + mock_settings.HATCHET_ENABLED = True + + with patch("reflector.hatchet.client.HatchetClientManager") as mock_hatchet: + mock_hatchet.get_workflow_run_status = AsyncMock(return_value="RUNNING") + + with patch( + "reflector.services.transcript_process.task_is_scheduled_or_active" + ) as mock_celery_check: + mock_celery_check.return_value = False + + result = await validate_transcript_for_processing(mock_transcript) + + assert isinstance(result, ValidationAlreadyScheduled) + assert "running" in result.detail.lower() + + +@pytest.mark.usefixtures("setup_database") +@pytest.mark.asyncio +async def test_hatchet_validation_blocks_queued_workflow(): + """Test that validation blocks reprocessing when workflow is queued.""" + from reflector.services.transcript_process import ( + ValidationAlreadyScheduled, + validate_transcript_for_processing, + ) + + mock_transcript = Transcript( + id="test-transcript-id", + name="Test", + status="processing", + source_kind="room", + workflow_run_id="queued-workflow-123", + ) + + with patch("reflector.services.transcript_process.settings") as mock_settings: + mock_settings.HATCHET_ENABLED = True + + with patch("reflector.hatchet.client.HatchetClientManager") as mock_hatchet: + mock_hatchet.get_workflow_run_status = AsyncMock(return_value="QUEUED") + + with patch( + "reflector.services.transcript_process.task_is_scheduled_or_active" + ) as mock_celery_check: + mock_celery_check.return_value = False + + result = await validate_transcript_for_processing(mock_transcript) + + assert isinstance(result, ValidationAlreadyScheduled) + + +@pytest.mark.usefixtures("setup_database") +@pytest.mark.asyncio +async def test_hatchet_validation_allows_failed_workflow(): + """Test that validation allows reprocessing when workflow has failed.""" + from reflector.services.transcript_process import ( + ValidationOk, + validate_transcript_for_processing, + ) + + mock_transcript = Transcript( + id="test-transcript-id", + name="Test", + status="error", + source_kind="room", + workflow_run_id="failed-workflow-123", + recording_id="test-recording-id", + ) + + with patch("reflector.services.transcript_process.settings") as mock_settings: + mock_settings.HATCHET_ENABLED = True + + with patch("reflector.hatchet.client.HatchetClientManager") as mock_hatchet: + mock_hatchet.get_workflow_run_status = AsyncMock(return_value="FAILED") + + with patch( + "reflector.services.transcript_process.task_is_scheduled_or_active" + ) as mock_celery_check: + mock_celery_check.return_value = False + + result = await validate_transcript_for_processing(mock_transcript) + + assert isinstance(result, ValidationOk) + assert result.transcript_id == "test-transcript-id" + + +@pytest.mark.usefixtures("setup_database") +@pytest.mark.asyncio +async def test_hatchet_validation_allows_completed_workflow(): + """Test that validation allows reprocessing when workflow has completed.""" + from reflector.services.transcript_process import ( + ValidationOk, + validate_transcript_for_processing, + ) + + mock_transcript = Transcript( + id="test-transcript-id", + name="Test", + status="ended", + source_kind="room", + workflow_run_id="completed-workflow-123", + recording_id="test-recording-id", + ) + + with patch("reflector.services.transcript_process.settings") as mock_settings: + mock_settings.HATCHET_ENABLED = True + + with patch("reflector.hatchet.client.HatchetClientManager") as mock_hatchet: + mock_hatchet.get_workflow_run_status = AsyncMock(return_value="COMPLETED") + + with patch( + "reflector.services.transcript_process.task_is_scheduled_or_active" + ) as mock_celery_check: + mock_celery_check.return_value = False + + result = await validate_transcript_for_processing(mock_transcript) + + assert isinstance(result, ValidationOk) + + +@pytest.mark.usefixtures("setup_database") +@pytest.mark.asyncio +async def test_hatchet_validation_allows_when_status_check_fails(): + """Test that validation allows reprocessing when status check fails (workflow might be gone).""" + from reflector.services.transcript_process import ( + ValidationOk, + validate_transcript_for_processing, + ) + + mock_transcript = Transcript( + id="test-transcript-id", + name="Test", + status="error", + source_kind="room", + workflow_run_id="old-workflow-123", + recording_id="test-recording-id", + ) + + with patch("reflector.services.transcript_process.settings") as mock_settings: + mock_settings.HATCHET_ENABLED = True + + with patch("reflector.hatchet.client.HatchetClientManager") as mock_hatchet: + # Status check fails (workflow might be deleted) + mock_hatchet.get_workflow_run_status = AsyncMock( + side_effect=Exception("Workflow not found") + ) + + with patch( + "reflector.services.transcript_process.task_is_scheduled_or_active" + ) as mock_celery_check: + mock_celery_check.return_value = False + + result = await validate_transcript_for_processing(mock_transcript) + + # Should allow processing when we can't get status + assert isinstance(result, ValidationOk) + + +@pytest.mark.usefixtures("setup_database") +@pytest.mark.asyncio +async def test_hatchet_validation_skipped_when_no_workflow_id(): + """Test that Hatchet validation is skipped when transcript has no workflow_run_id.""" + from reflector.services.transcript_process import ( + ValidationOk, + validate_transcript_for_processing, + ) + + mock_transcript = Transcript( + id="test-transcript-id", + name="Test", + status="uploaded", + source_kind="room", + workflow_run_id=None, # No workflow yet + recording_id="test-recording-id", + ) + + with patch("reflector.services.transcript_process.settings") as mock_settings: + mock_settings.HATCHET_ENABLED = True + + with patch("reflector.hatchet.client.HatchetClientManager") as mock_hatchet: + # Should not be called + mock_hatchet.get_workflow_run_status = AsyncMock() + + with patch( + "reflector.services.transcript_process.task_is_scheduled_or_active" + ) as mock_celery_check: + mock_celery_check.return_value = False + + result = await validate_transcript_for_processing(mock_transcript) + + # Should not check Hatchet status + mock_hatchet.get_workflow_run_status.assert_not_called() + assert isinstance(result, ValidationOk) + + +@pytest.mark.usefixtures("setup_database") +@pytest.mark.asyncio +async def test_hatchet_validation_skipped_when_disabled(): + """Test that Hatchet validation is skipped when HATCHET_ENABLED is False.""" + from reflector.services.transcript_process import ( + ValidationOk, + validate_transcript_for_processing, + ) + + mock_transcript = Transcript( + id="test-transcript-id", + name="Test", + status="uploaded", + source_kind="room", + workflow_run_id="some-workflow-123", + recording_id="test-recording-id", + ) + + with patch("reflector.services.transcript_process.settings") as mock_settings: + mock_settings.HATCHET_ENABLED = False # Hatchet disabled + + with patch( + "reflector.services.transcript_process.task_is_scheduled_or_active" + ) as mock_celery_check: + mock_celery_check.return_value = False + + result = await validate_transcript_for_processing(mock_transcript) + + # Should not check Hatchet at all + assert isinstance(result, ValidationOk) + + +@pytest.mark.usefixtures("setup_database") +@pytest.mark.asyncio +async def test_validation_locked_transcript(): + """Test that validation rejects locked transcripts.""" + from reflector.services.transcript_process import ( + ValidationLocked, + validate_transcript_for_processing, + ) + + mock_transcript = Transcript( + id="test-transcript-id", + name="Test", + status="ended", + source_kind="room", + locked=True, + ) + + result = await validate_transcript_for_processing(mock_transcript) + + assert isinstance(result, ValidationLocked) + assert "locked" in result.detail.lower() + + +@pytest.mark.usefixtures("setup_database") +@pytest.mark.asyncio +async def test_validation_idle_transcript(): + """Test that validation rejects idle transcripts (not ready).""" + from reflector.services.transcript_process import ( + ValidationNotReady, + validate_transcript_for_processing, + ) + + mock_transcript = Transcript( + id="test-transcript-id", + name="Test", + status="idle", + source_kind="room", + ) + + result = await validate_transcript_for_processing(mock_transcript) + + assert isinstance(result, ValidationNotReady) + assert "not ready" in result.detail.lower() + + +@pytest.mark.usefixtures("setup_database") +@pytest.mark.asyncio +async def test_prepare_multitrack_config(): + """Test preparing multitrack processing config.""" + from reflector.db.recordings import Recording + from reflector.services.transcript_process import ( + MultitrackProcessingConfig, + ValidationOk, + prepare_transcript_processing, + ) + + validation = ValidationOk( + recording_id="test-recording-id", + transcript_id="test-transcript-id", + ) + + mock_recording = Recording( + id="test-recording-id", + bucket_name="test-bucket", + object_key="recordings/test", + recorded_at="2024-01-01T00:00:00Z", + track_keys=["track1.webm", "track2.webm"], + ) + + with patch( + "reflector.services.transcript_process.recordings_controller" + ) as mock_rc: + mock_rc.get_by_id = AsyncMock(return_value=mock_recording) + + result = await prepare_transcript_processing(validation, room_id="test-room") + + assert isinstance(result, MultitrackProcessingConfig) + assert result.bucket_name == "test-bucket" + assert result.track_keys == ["track1.webm", "track2.webm"] + assert result.transcript_id == "test-transcript-id" + assert result.room_id == "test-room" + + +@pytest.mark.usefixtures("setup_database") +@pytest.mark.asyncio +async def test_prepare_file_config(): + """Test preparing file processing config (no track keys).""" + from reflector.db.recordings import Recording + from reflector.services.transcript_process import ( + FileProcessingConfig, + ValidationOk, + prepare_transcript_processing, + ) + + validation = ValidationOk( + recording_id="test-recording-id", + transcript_id="test-transcript-id", + ) + + mock_recording = Recording( + id="test-recording-id", + bucket_name="test-bucket", + object_key="recordings/test.mp4", + recorded_at="2024-01-01T00:00:00Z", + track_keys=None, # No track keys = file pipeline + ) + + with patch( + "reflector.services.transcript_process.recordings_controller" + ) as mock_rc: + mock_rc.get_by_id = AsyncMock(return_value=mock_recording) + + result = await prepare_transcript_processing(validation) + + assert isinstance(result, FileProcessingConfig) + assert result.transcript_id == "test-transcript-id" diff --git a/server/tests/test_hatchet_progress.py b/server/tests/test_hatchet_progress.py new file mode 100644 index 00000000..059f68e0 --- /dev/null +++ b/server/tests/test_hatchet_progress.py @@ -0,0 +1,62 @@ +""" +Tests for Hatchet progress emission. + +Only tests that catch real bugs - error handling and step completeness. +""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + + +@pytest.mark.asyncio +async def test_emit_progress_async_handles_exception(): + """Test that emit_progress_async catches exceptions gracefully. + + Critical: Progress emission must NEVER crash the pipeline. + WebSocket errors should be silently caught. + """ + from reflector.hatchet.progress import emit_progress_async + + with patch("reflector.hatchet.progress.get_ws_manager") as mock_get_ws: + mock_ws = MagicMock() + mock_ws.send_json = AsyncMock(side_effect=Exception("WebSocket error")) + mock_get_ws.return_value = mock_ws + + # Should not raise - exceptions are caught + await emit_progress_async( + transcript_id="test-transcript-123", + step="finalize", + status="completed", + ) + + +@pytest.mark.asyncio +async def test_pipeline_steps_mapping_complete(): + """Test the PIPELINE_STEPS mapping includes all expected steps. + + Useful: Catches when someone adds a new pipeline step but forgets + to add it to the progress mapping, resulting in missing UI updates. + """ + from reflector.hatchet.progress import PIPELINE_STEPS, TOTAL_STEPS + + expected_steps = [ + "get_recording", + "get_participants", + "pad_track", + "mixdown_tracks", + "generate_waveform", + "transcribe_track", + "merge_transcripts", + "detect_topics", + "generate_title", + "generate_summary", + "finalize", + "cleanup_consent", + "post_zulip", + "send_webhook", + ] + + for step in expected_steps: + assert step in PIPELINE_STEPS, f"Missing step in PIPELINE_STEPS: {step}" + assert 1 <= PIPELINE_STEPS[step] <= TOTAL_STEPS diff --git a/server/uv.lock b/server/uv.lock index b6ddf93e..fd8389d3 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -283,15 +283,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c5/7c/83ff6046176a675e6a1e8aeefed8892cd97fe7c46af93cc540d1b24b8323/asteroid_filterbanks-0.4.0-py3-none-any.whl", hash = "sha256:4932ac8b6acc6e08fb87cbe8ece84215b5a74eee284fe83acf3540a72a02eaf5", size = 29912 }, ] -[[package]] -name = "astor" -version = "0.8.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/5a/21/75b771132fee241dfe601d39ade629548a9626d1d39f333fde31bc46febe/astor-0.8.1.tar.gz", hash = "sha256:6a6effda93f4e1ce9f618779b2dd1d9d84f1e32812c23a29b3fff6fd7f63fa5e", size = 35090 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c3/88/97eef84f48fa04fbd6750e62dcceafba6c63c81b7ac1420856c8dcc0a3f9/astor-0.8.1-py2.py3-none-any.whl", hash = "sha256:070a54e890cefb5b3739d19f30f5a5ec840ffc9c50ffa7d23cc9fc1a38ebbfc5", size = 27488 }, -] - [[package]] name = "async-timeout" version = "5.0.1" @@ -629,27 +620,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e3/51/9b208e85196941db2f0654ad0357ca6388ab3ed67efdbfc799f35d1f83aa/colorlog-6.9.0-py3-none-any.whl", hash = "sha256:5906e71acd67cb07a71e779c47c4bcb45fb8c2993eebe9e5adcd6a6f1b283eff", size = 11424 }, ] -[[package]] -name = "conductor-python" -version = "1.2.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "astor" }, - { name = "certifi" }, - { name = "dacite" }, - { name = "deprecated" }, - { name = "prometheus-client" }, - { name = "python-dateutil" }, - { name = "requests" }, - { name = "shortuuid" }, - { name = "six" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/8f/cb/b20991a9b4da4fa6e21cb596ef5b3a7ed61ce2a00a6ec63a901c01f01272/conductor_python-1.2.3.tar.gz", hash = "sha256:50c5e8414e6606a964ecf92b263f1a6af397ad6948640cc8e1e953b2a1615777", size = 175152 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ba/a1/8aeed6eb02d80bc86ea41b807e36d765c79943aaccd2247912181e21d0c9/conductor_python-1.2.3-py3-none-any.whl", hash = "sha256:07feb303d4732916c979e8d5567259b1ff7ff0d4c6b2f5f627473cfeaa026b94", size = 296261 }, -] - [[package]] name = "contourpy" version = "1.3.3" @@ -797,15 +767,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321 }, ] -[[package]] -name = "dacite" -version = "1.9.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/55/a0/7ca79796e799a3e782045d29bf052b5cde7439a2bbb17f15ff44f7aacc63/dacite-1.9.2.tar.gz", hash = "sha256:6ccc3b299727c7aa17582f0021f6ae14d5de47c7227932c47fec4cdfefd26f09", size = 22420 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/94/35/386550fd60316d1e37eccdda609b074113298f23cef5bddb2049823fe666/dacite-1.9.2-py3-none-any.whl", hash = "sha256:053f7c3f5128ca2e9aceb66892b1a3c8936d02c686e707bee96e19deef4bc4a0", size = 16600 }, -] - [[package]] name = "databases" version = "0.8.0" @@ -3231,7 +3192,6 @@ dependencies = [ { name = "alembic" }, { name = "av" }, { name = "celery" }, - { name = "conductor-python" }, { name = "databases", extra = ["aiosqlite", "asyncpg"] }, { name = "fastapi", extra = ["standard"] }, { name = "fastapi-pagination" }, @@ -3309,7 +3269,6 @@ requires-dist = [ { name = "alembic", specifier = ">=1.11.3" }, { name = "av", specifier = ">=10.0.0" }, { name = "celery", specifier = ">=5.3.4" }, - { name = "conductor-python", specifier = ">=1.2.3" }, { name = "databases", extras = ["aiosqlite", "asyncpg"], specifier = ">=0.7.0" }, { name = "fastapi", extras = ["standard"], specifier = ">=0.100.1" }, { name = "fastapi-pagination", specifier = ">=0.12.6" }, @@ -3754,15 +3713,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755 }, ] -[[package]] -name = "shortuuid" -version = "1.0.13" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/8c/e2/bcf761f3bff95856203f9559baf3741c416071dd200c0fc19fad7f078f86/shortuuid-1.0.13.tar.gz", hash = "sha256:3bb9cf07f606260584b1df46399c0b87dd84773e7b25912b7e391e30797c5e72", size = 9662 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c0/44/21d6bf170bf40b41396480d8d49ad640bca3f2b02139cd52aa1e272830a5/shortuuid-1.0.13-py3-none-any.whl", hash = "sha256:a482a497300b49b4953e15108a7913244e1bb0d41f9d332f5e9925dba33a3c5a", size = 10529 }, -] - [[package]] name = "silero-vad" version = "6.0.0"