"""
|
|
Hatchet main workflow: DiarizationPipeline
|
|
|
|
Multitrack diarization pipeline for Daily.co recordings.
|
|
Orchestrates the full processing flow from recording metadata to final transcript.
|
|
|
|
Note: This file uses deferred imports (inside functions/tasks) intentionally.
|
|
Hatchet workers run in forked processes; fresh imports per task ensure DB connections
|
|
are not shared across forks, avoiding connection pooling issues.
|
|
"""
|
|
|
|
import asyncio
import functools
import json
import tempfile
from contextlib import asynccontextmanager
from datetime import timedelta
from pathlib import Path
from typing import Callable

import httpx
from hatchet_sdk import Context
from pydantic import BaseModel

from reflector.dailyco_api.client import DailyApiClient
from reflector.hatchet.broadcast import (
    append_event_and_broadcast,
    set_status_and_broadcast,
)
from reflector.hatchet.client import HatchetClientManager
from reflector.hatchet.constants import (
    TIMEOUT_AUDIO,
    TIMEOUT_HEAVY,
    TIMEOUT_LONG,
    TIMEOUT_MEDIUM,
    TIMEOUT_SHORT,
)
from reflector.hatchet.workflows.models import (
    ActionItemsResult,
    ConsentResult,
    FinalizeResult,
    MixdownResult,
    PaddedTrackInfo,
    PadTrackResult,
    ParticipantInfo,
    ParticipantsResult,
    ProcessSubjectsResult,
    ProcessTracksResult,
    RecapResult,
    RecordingResult,
    SubjectsResult,
    SubjectSummaryResult,
    TitleResult,
    TopicChunkResult,
    TopicsResult,
    TranscribeTrackResult,
    WaveformResult,
    WebhookResult,
    ZulipResult,
)
from reflector.hatchet.workflows.subject_processing import (
    SubjectInput,
    subject_workflow,
)
from reflector.hatchet.workflows.topic_chunk_processing import (
    TopicChunkInput,
    topic_chunk_workflow,
)
from reflector.hatchet.workflows.track_processing import TrackInput, track_workflow
from reflector.logger import logger
from reflector.pipelines import topic_processing
from reflector.processors import AudioFileWriterProcessor
from reflector.processors.types import TitleSummary, Word
from reflector.processors.types import Transcript as TranscriptType
from reflector.settings import settings
from reflector.storage.storage_aws import AwsStorage
from reflector.utils.audio_constants import (
    PRESIGNED_URL_EXPIRATION_SECONDS,
    WAVEFORM_SEGMENTS,
)
from reflector.utils.audio_mixdown import (
    detect_sample_rate_from_tracks,
    mixdown_tracks_pyav,
)
from reflector.utils.audio_waveform import get_audio_waveform
from reflector.utils.daily import (
    filter_cam_audio_tracks,
    parse_daily_recording_filename,
)
from reflector.utils.string import NonEmptyString, assert_non_none_and_non_empty
from reflector.utils.transcript_constants import TOPIC_CHUNK_WORD_COUNT
from reflector.zulip import post_transcript_notification


class PipelineInput(BaseModel):
    """Input to trigger the diarization pipeline."""

    recording_id: NonEmptyString
    tracks: list[dict]  # List of {"s3_key": str}
    bucket_name: NonEmptyString
    transcript_id: NonEmptyString
    room_id: NonEmptyString | None = None


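# Illustrative input sketch (field names come from PipelineInput above; the
# values are placeholders, not taken from a real recording). Whatever code
# triggers DiarizationPipeline is expected to pass something shaped like:
#
#   PipelineInput(
#       recording_id="rec_123",
#       tracks=[{"s3_key": "some/participant-track.webm"}],
#       bucket_name="daily-recordings",
#       transcript_id="tr_456",
#       room_id="room_789",  # optional
#   )
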
hatchet = HatchetClientManager.get_client()

diarization_pipeline = hatchet.workflow(
    name="DiarizationPipeline", input_validator=PipelineInput
)


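# Task graph of this workflow, as declared via parents= on the tasks below:
#
#   get_recording -> get_participants -> process_tracks -> mixdown_tracks
#   mixdown_tracks -> generate_waveform
#   mixdown_tracks -> detect_topics -> generate_title
#   detect_topics -> extract_subjects -> process_subjects -> generate_recap
#   extract_subjects -> identify_action_items
#   [generate_waveform, generate_title, generate_recap, identify_action_items] -> finalize
#   finalize -> cleanup_consent -> post_zulip -> send_webhook
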
@asynccontextmanager
async def fresh_db_connection():
    """Context manager for database connections in Hatchet workers.

    TECH DEBT: added to make the connection fork-aware without changing the db
    module too much. The real fix would be making the db module fork-aware
    instead of bypassing it. The current pattern is acceptable given Hatchet's
    process model.
    """
    import databases  # noqa: PLC0415

    from reflector.db import _database_context  # noqa: PLC0415

    _database_context.set(None)
    db = databases.Database(settings.DATABASE_URL)
    _database_context.set(db)
    await db.connect()
    try:
        yield db
    finally:
        await db.disconnect()
        _database_context.set(None)


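# Typical usage inside a task body (a minimal sketch; the deferred import is
# intentional, see the module docstring):
#
#   async with fresh_db_connection():
#       from reflector.db.transcripts import transcripts_controller  # noqa: PLC0415
#
#       transcript = await transcripts_controller.get_by_id(transcript_id)
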
async def set_workflow_error_status(transcript_id: NonEmptyString) -> bool:
    """Set transcript status to 'error' on workflow failure.

    Returns:
        True if the status was set successfully, False if it failed.
        Failure is logged as CRITICAL since it means the transcript may be stuck.
    """
    try:
        async with fresh_db_connection():
            await set_status_and_broadcast(transcript_id, "error", logger=logger)
        return True
    except Exception as e:
        logger.critical(
            "[Hatchet] CRITICAL: Failed to set error status - transcript may be stuck in 'processing'",
            transcript_id=transcript_id,
            error=str(e),
            exc_info=True,
        )
        return False


def _spawn_storage():
    """Create fresh storage instance."""
    return AwsStorage(
        aws_bucket_name=settings.TRANSCRIPT_STORAGE_AWS_BUCKET_NAME,
        aws_region=settings.TRANSCRIPT_STORAGE_AWS_REGION,
        aws_access_key_id=settings.TRANSCRIPT_STORAGE_AWS_ACCESS_KEY_ID,
        aws_secret_access_key=settings.TRANSCRIPT_STORAGE_AWS_SECRET_ACCESS_KEY,
    )


def with_error_handling(step_name: str, set_error_status: bool = True) -> Callable:
    """Decorator that handles task failures uniformly.

    Args:
        step_name: Name of the step for logging and progress tracking.
        set_error_status: Whether to set transcript status to 'error' on failure.
    """

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        async def wrapper(input: PipelineInput, ctx: Context):
            try:
                return await func(input, ctx)
            except Exception as e:
                logger.error(
                    f"[Hatchet] {step_name} failed",
                    transcript_id=input.transcript_id,
                    error=str(e),
                    exc_info=True,
                )
                if set_error_status:
                    await set_workflow_error_status(input.transcript_id)
                raise

        return wrapper

    return decorator


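# Decorator order matters: @diarization_pipeline.task(...) is listed first so it
# wraps the error-handled function. The handler then runs inside every task
# attempt, logs (and optionally sets error status), and re-raises so Hatchet can
# still retry. Every task below follows this pattern:
#
#   @diarization_pipeline.task(execution_timeout=..., retries=3)
#   @with_error_handling("step_name")
#   async def step_name(input: PipelineInput, ctx: Context) -> ...:
#       ...
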
@diarization_pipeline.task(
    execution_timeout=timedelta(seconds=TIMEOUT_SHORT), retries=3
)
@with_error_handling("get_recording")
async def get_recording(input: PipelineInput, ctx: Context) -> RecordingResult:
    """Fetch recording metadata from Daily.co API."""
    ctx.log(f"get_recording: starting for recording_id={input.recording_id}")
    ctx.log(
        f"get_recording: transcript_id={input.transcript_id}, room_id={input.room_id}"
    )
    ctx.log(
        f"get_recording: bucket_name={input.bucket_name}, tracks={len(input.tracks)}"
    )

    # Set transcript status to "processing" at workflow start (broadcasts to WebSocket)
    ctx.log("get_recording: establishing DB connection...")
    async with fresh_db_connection():
        from reflector.db.transcripts import transcripts_controller  # noqa: PLC0415

        ctx.log("get_recording: DB connection established, fetching transcript...")
        transcript = await transcripts_controller.get_by_id(input.transcript_id)
        ctx.log(f"get_recording: transcript exists={transcript is not None}")
        if transcript:
            ctx.log(
                f"get_recording: current status={transcript.status}, setting to 'processing'..."
            )
            await set_status_and_broadcast(
                input.transcript_id, "processing", logger=logger
            )
            ctx.log("get_recording: status set to 'processing' and broadcasted")

    if not settings.DAILY_API_KEY:
        ctx.log("get_recording: ERROR - DAILY_API_KEY not configured")
        raise ValueError("DAILY_API_KEY not configured")

    ctx.log(
        f"get_recording: calling Daily.co API for recording_id={input.recording_id}..."
    )
    async with DailyApiClient(api_key=settings.DAILY_API_KEY) as client:
        recording = await client.get_recording(input.recording_id)
        ctx.log("get_recording: Daily.co API returned successfully")

    ctx.log(
        f"get_recording complete: room={recording.room_name}, duration={recording.duration}s, mtg_session_id={recording.mtgSessionId}"
    )

    return RecordingResult(
        id=recording.id,
        mtg_session_id=recording.mtgSessionId,
        duration=recording.duration,
    )


@diarization_pipeline.task(
    parents=[get_recording],
    execution_timeout=timedelta(seconds=TIMEOUT_SHORT),
    retries=3,
)
@with_error_handling("get_participants")
async def get_participants(input: PipelineInput, ctx: Context) -> ParticipantsResult:
    """Fetch participant list from Daily.co API and update transcript in database."""
    ctx.log(f"get_participants: transcript_id={input.transcript_id}")

    recording = ctx.task_output(get_recording)
    mtg_session_id = recording.mtg_session_id

    async with fresh_db_connection():
        from reflector.db.transcripts import (  # noqa: PLC0415
            TranscriptParticipant,
            transcripts_controller,
        )

        transcript = await transcripts_controller.get_by_id(input.transcript_id)
        if transcript:
            # Note: title NOT cleared - preserves existing titles
            await transcripts_controller.update(
                transcript,
                {
                    "events": [],
                    "topics": [],
                    "participants": [],
                },
            )

        mtg_session_id = assert_non_none_and_non_empty(
            mtg_session_id, "mtg_session_id is required"
        )
        daily_api_key = assert_non_none_and_non_empty(
            settings.DAILY_API_KEY, "DAILY_API_KEY is required"
        )

        async with DailyApiClient(api_key=daily_api_key) as client:
            participants = await client.get_meeting_participants(mtg_session_id)

        id_to_name = {}
        id_to_user_id = {}
        for p in participants.data:
            if p.user_name:
                id_to_name[p.participant_id] = p.user_name
            if p.user_id:
                id_to_user_id[p.participant_id] = p.user_id

        track_keys = [t["s3_key"] for t in input.tracks]
        cam_audio_keys = filter_cam_audio_tracks(track_keys)

        participants_list: list[ParticipantInfo] = []
        for idx, key in enumerate(cam_audio_keys):
            try:
                parsed = parse_daily_recording_filename(key)
                participant_id = parsed.participant_id
            except ValueError as e:
                logger.error(
                    "Failed to parse Daily recording filename",
                    error=str(e),
                    key=key,
                )
                continue

            default_name = f"Speaker {idx}"
            name = id_to_name.get(participant_id, default_name)
            user_id = id_to_user_id.get(participant_id)

            participant = TranscriptParticipant(
                id=participant_id, speaker=idx, name=name, user_id=user_id
            )
            await transcripts_controller.upsert_participant(transcript, participant)
            participants_list.append(
                ParticipantInfo(
                    participant_id=participant_id,
                    user_name=name,
                    speaker=idx,
                )
            )

        ctx.log(f"get_participants complete: {len(participants_list)} participants")

        return ParticipantsResult(
            participants=participants_list,
            num_tracks=len(input.tracks),
            source_language=transcript.source_language if transcript else "en",
            target_language=transcript.target_language if transcript else "en",
        )


@diarization_pipeline.task(
    parents=[get_participants],
    execution_timeout=timedelta(seconds=TIMEOUT_HEAVY),
    retries=3,
)
@with_error_handling("process_tracks")
async def process_tracks(input: PipelineInput, ctx: Context) -> ProcessTracksResult:
    """Spawn child workflows for each track (dynamic fan-out)."""
    ctx.log(f"process_tracks: spawning {len(input.tracks)} track workflows")

    participants_result = ctx.task_output(get_participants)
    source_language = participants_result.source_language

    bulk_runs = [
        track_workflow.create_bulk_run_item(
            input=TrackInput(
                track_index=i,
                s3_key=track["s3_key"],
                bucket_name=input.bucket_name,
                transcript_id=input.transcript_id,
                language=source_language,
            )
        )
        for i, track in enumerate(input.tracks)
    ]

    results = await track_workflow.aio_run_many(bulk_runs)

    target_language = participants_result.target_language

    track_words: list[list[Word]] = []
    padded_tracks = []
    created_padded_files = set()

    for result in results:
        transcribe_result = TranscribeTrackResult(**result["transcribe_track"])
        track_words.append(transcribe_result.words)

        pad_result = PadTrackResult(**result["pad_track"])

        # Store S3 key info (not presigned URL) - consumer tasks presign on demand
        if pad_result.padded_key:
            padded_tracks.append(
                PaddedTrackInfo(
                    key=pad_result.padded_key, bucket_name=pad_result.bucket_name
                )
            )

        if pad_result.size > 0:
            storage_path = f"file_pipeline_hatchet/{input.transcript_id}/tracks/padded_{pad_result.track_index}.webm"
            created_padded_files.add(storage_path)

    all_words = [word for words in track_words for word in words]
    all_words.sort(key=lambda w: w.start)

    ctx.log(
        f"process_tracks complete: {len(all_words)} words from {len(input.tracks)} tracks"
    )

    return ProcessTracksResult(
        all_words=all_words,
        padded_tracks=padded_tracks,
        word_count=len(all_words),
        num_tracks=len(input.tracks),
        target_language=target_language,
        created_padded_files=list(created_padded_files),
    )


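# Note on the fan-out in process_tracks above: each child run of track_workflow
# is assumed to expose its step outputs under the keys "transcribe_track" and
# "pad_track" (validated via TranscribeTrackResult / PadTrackResult), which is
# how the result dicts returned by aio_run_many() are unpacked.
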
@diarization_pipeline.task(
    parents=[process_tracks],
    execution_timeout=timedelta(seconds=TIMEOUT_AUDIO),
    retries=3,
)
@with_error_handling("mixdown_tracks")
async def mixdown_tracks(input: PipelineInput, ctx: Context) -> MixdownResult:
    """Mix all padded tracks into single audio file using PyAV (same as Celery)."""
    ctx.log("mixdown_tracks: mixing padded tracks into single audio file")

    track_result = ctx.task_output(process_tracks)
    padded_tracks = track_result.padded_tracks

    # TODO think of NonEmpty type to avoid those checks, e.g. sized.NonEmpty from https://github.com/antonagestam/phantom-types/
    if not padded_tracks:
        raise ValueError("No padded tracks to mixdown")

    storage = _spawn_storage()

    # Presign URLs on demand (avoids stale URLs on workflow replay)
    padded_urls = []
    for track_info in padded_tracks:
        if track_info.key:
            url = await storage.get_file_url(
                track_info.key,
                operation="get_object",
                expires_in=PRESIGNED_URL_EXPIRATION_SECONDS,
                bucket=track_info.bucket_name,
            )
            padded_urls.append(url)

    valid_urls = [url for url in padded_urls if url]
    if not valid_urls:
        raise ValueError("No valid padded tracks to mixdown")

    target_sample_rate = detect_sample_rate_from_tracks(valid_urls, logger=logger)
    if not target_sample_rate:
        logger.error("Mixdown failed - no decodable audio frames found")
        raise ValueError("No decodable audio frames in any track")

    output_path = tempfile.mktemp(suffix=".mp3")
    duration_ms_callback_capture_container = [0.0]

    async def capture_duration(d):
        duration_ms_callback_capture_container[0] = d

    writer = AudioFileWriterProcessor(path=output_path, on_duration=capture_duration)

    await mixdown_tracks_pyav(
        valid_urls,
        writer,
        target_sample_rate,
        offsets_seconds=None,
        logger=logger,
    )
    await writer.flush()

    file_size = Path(output_path).stat().st_size
    storage_path = f"{input.transcript_id}/audio.mp3"

    with open(output_path, "rb") as mixed_file:
        await storage.put_file(storage_path, mixed_file)

    Path(output_path).unlink(missing_ok=True)

    async with fresh_db_connection():
        from reflector.db.transcripts import transcripts_controller  # noqa: PLC0415

        transcript = await transcripts_controller.get_by_id(input.transcript_id)
        if transcript:
            await transcripts_controller.update(
                transcript, {"audio_location": "storage"}
            )

    ctx.log(f"mixdown_tracks complete: uploaded {file_size} bytes to {storage_path}")

    return MixdownResult(
        audio_key=storage_path,
        duration=duration_ms_callback_capture_container[0],
        tracks_mixed=len(valid_urls),
    )


@diarization_pipeline.task(
    parents=[mixdown_tracks],
    execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM),
    retries=3,
)
@with_error_handling("generate_waveform")
async def generate_waveform(input: PipelineInput, ctx: Context) -> WaveformResult:
    """Generate audio waveform visualization using AudioWaveformProcessor (matches Celery)."""
    ctx.log(f"generate_waveform: transcript_id={input.transcript_id}")

    from reflector.db.transcripts import (  # noqa: PLC0415
        TranscriptWaveform,
        transcripts_controller,
    )

    mixdown_result = ctx.task_output(mixdown_tracks)
    audio_key = mixdown_result.audio_key

    storage = _spawn_storage()
    audio_url = await storage.get_file_url(
        audio_key,
        operation="get_object",
        expires_in=PRESIGNED_URL_EXPIRATION_SECONDS,
    )

    # Download MP3 to temp file (AudioWaveformProcessor needs local file)
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file:
        temp_path = temp_file.name

    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(audio_url, timeout=120)
            response.raise_for_status()
            with open(temp_path, "wb") as f:
                f.write(response.content)

        waveform = get_audio_waveform(
            path=Path(temp_path), segments_count=WAVEFORM_SEGMENTS
        )

        async with fresh_db_connection():
            transcript = await transcripts_controller.get_by_id(input.transcript_id)
            if transcript:
                # Write waveform to file (same as Celery AudioWaveformProcessor)
                transcript.data_path.mkdir(parents=True, exist_ok=True)
                with open(transcript.audio_waveform_filename, "w") as f:
                    json.dump(waveform, f)
                ctx.log(
                    f"generate_waveform: wrote waveform to {transcript.audio_waveform_filename}"
                )

                waveform_data = TranscriptWaveform(waveform=waveform)
                await append_event_and_broadcast(
                    input.transcript_id,
                    transcript,
                    "WAVEFORM",
                    waveform_data,
                    logger=logger,
                )

    finally:
        Path(temp_path).unlink(missing_ok=True)

    ctx.log("generate_waveform complete")

    return WaveformResult(waveform_generated=True)


@diarization_pipeline.task(
    parents=[mixdown_tracks],
    execution_timeout=timedelta(seconds=TIMEOUT_HEAVY),
    retries=3,
)
@with_error_handling("detect_topics")
async def detect_topics(input: PipelineInput, ctx: Context) -> TopicsResult:
    """Detect topics using parallel child workflows (one per chunk)."""
    ctx.log("detect_topics: analyzing transcript for topics")

    track_result = ctx.task_output(process_tracks)
    words = track_result.all_words

    if not words:
        ctx.log("detect_topics: no words, returning empty topics")
        return TopicsResult(topics=[])

    # Deferred imports: Hatchet workers fork processes
    from reflector.db.transcripts import (  # noqa: PLC0415
        TranscriptTopic,
        transcripts_controller,
    )

    chunk_size = TOPIC_CHUNK_WORD_COUNT
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunk_words = words[i : i + chunk_size]
        if not chunk_words:
            continue

        first_word = chunk_words[0]
        last_word = chunk_words[-1]
        timestamp = first_word.start
        duration = last_word.end - timestamp
        chunk_text = " ".join(w.text for w in chunk_words)

        chunks.append(
            {
                "index": len(chunks),
                "text": chunk_text,
                "timestamp": timestamp,
                "duration": duration,
                "words": chunk_words,
            }
        )

    if not chunks:
        ctx.log("detect_topics: no chunks generated, returning empty topics")
        return TopicsResult(topics=[])

    ctx.log(f"detect_topics: spawning {len(chunks)} topic chunk workflows in parallel")

    bulk_runs = [
        topic_chunk_workflow.create_bulk_run_item(
            input=TopicChunkInput(
                chunk_index=chunk["index"],
                chunk_text=chunk["text"],
                timestamp=chunk["timestamp"],
                duration=chunk["duration"],
                words=chunk["words"],
            )
        )
        for chunk in chunks
    ]

    results = await topic_chunk_workflow.aio_run_many(bulk_runs)

    topic_chunks = [
        TopicChunkResult(**result["detect_chunk_topic"]) for result in results
    ]

    async with fresh_db_connection():
        transcript = await transcripts_controller.get_by_id(input.transcript_id)

        for chunk in topic_chunks:
            topic = TranscriptTopic(
                title=chunk.title,
                summary=chunk.summary,
                timestamp=chunk.timestamp,
                transcript=" ".join(w.text for w in chunk.words),
                words=[w.model_dump() for w in chunk.words],
            )
            await transcripts_controller.upsert_topic(transcript, topic)
            await append_event_and_broadcast(
                input.transcript_id, transcript, "TOPIC", topic, logger=logger
            )

    topics_list = [
        TitleSummary(
            title=chunk.title,
            summary=chunk.summary,
            timestamp=chunk.timestamp,
            duration=chunk.duration,
            transcript=TranscriptType(words=chunk.words),
        )
        for chunk in topic_chunks
    ]

    ctx.log(f"detect_topics complete: found {len(topics_list)} topics")

    return TopicsResult(topics=topics_list)


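# Chunking sketch for detect_topics above: with TOPIC_CHUNK_WORD_COUNT words per
# chunk, a transcript of N words fans out into ceil(N / TOPIC_CHUNK_WORD_COUNT)
# topic_chunk_workflow runs. For example, with a hypothetical chunk size of 300,
# a 2,500-word transcript would spawn 9 parallel child workflows
# (8 full chunks + 1 partial chunk of 100 words).
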
@diarization_pipeline.task(
    parents=[detect_topics],
    execution_timeout=timedelta(seconds=TIMEOUT_HEAVY),
    retries=3,
)
@with_error_handling("generate_title")
async def generate_title(input: PipelineInput, ctx: Context) -> TitleResult:
    """Generate meeting title using LLM and save to database (matches Celery on_title callback)."""
    ctx.log(f"generate_title: starting for transcript_id={input.transcript_id}")

    topics_result = ctx.task_output(detect_topics)
    topics = topics_result.topics
    ctx.log(f"generate_title: received {len(topics)} topics from detect_topics")

    from reflector.db.transcripts import (  # noqa: PLC0415
        TranscriptFinalTitle,
        transcripts_controller,
    )

    ctx.log(f"generate_title: received {len(topics)} TitleSummary objects")

    empty_pipeline = topic_processing.EmptyPipeline(logger=logger)
    title_result = None

    async with fresh_db_connection():
        ctx.log("generate_title: DB connection established")
        transcript = await transcripts_controller.get_by_id(input.transcript_id)
        ctx.log(f"generate_title: fetched transcript, exists={transcript is not None}")

        async def on_title_callback(data):
            nonlocal title_result
            ctx.log(f"generate_title: on_title_callback received title='{data.title}'")
            title_result = data.title
            final_title = TranscriptFinalTitle(title=data.title)
            if not transcript.title:
                await transcripts_controller.update(
                    transcript,
                    {"title": final_title.title},
                )
                ctx.log("generate_title: saved title to DB")
            await append_event_and_broadcast(
                input.transcript_id,
                transcript,
                "FINAL_TITLE",
                final_title,
                logger=logger,
            )
            ctx.log("generate_title: broadcasted FINAL_TITLE event")

        ctx.log("generate_title: calling topic_processing.generate_title (LLM call)...")
        await topic_processing.generate_title(
            topics,
            on_title_callback=on_title_callback,
            empty_pipeline=empty_pipeline,
            logger=logger,
        )
        ctx.log("generate_title: topic_processing.generate_title returned")

    ctx.log(f"generate_title complete: '{title_result}'")

    return TitleResult(title=title_result)


@diarization_pipeline.task(
    parents=[detect_topics],
    execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM),
    retries=3,
)
@with_error_handling("extract_subjects")
async def extract_subjects(input: PipelineInput, ctx: Context) -> SubjectsResult:
    """Extract main subjects/topics from transcript for parallel processing."""
    ctx.log(f"extract_subjects: starting for transcript_id={input.transcript_id}")

    topics_result = ctx.task_output(detect_topics)
    topics = topics_result.topics

    if not topics:
        ctx.log("extract_subjects: no topics, returning empty subjects")
        return SubjectsResult(
            subjects=[],
            transcript_text="",
            participant_names=[],
            participant_name_to_id={},
        )

    # Deferred imports: Hatchet workers fork processes, fresh imports avoid
    # sharing DB connections and LLM HTTP pools across forks
    from reflector.db.transcripts import transcripts_controller  # noqa: PLC0415
    from reflector.llm import LLM  # noqa: PLC0415
    from reflector.processors.summary.summary_builder import (  # noqa: PLC0415
        SummaryBuilder,
    )

    async with fresh_db_connection():
        transcript = await transcripts_controller.get_by_id(input.transcript_id)

    # Build transcript text from topics (same logic as TranscriptFinalSummaryProcessor)
    speakermap = {}
    if transcript and transcript.participants:
        speakermap = {
            p.speaker: p.name
            for p in transcript.participants
            if p.speaker is not None and p.name
        }

    text_lines = []
    for topic in topics:
        for segment in topic.transcript.as_segments():
            name = speakermap.get(segment.speaker, f"Speaker {segment.speaker}")
            text_lines.append(f"{name}: {segment.text}")

    transcript_text = "\n".join(text_lines)

    participant_names = []
    participant_name_to_id = {}
    if transcript and transcript.participants:
        participant_names = [p.name for p in transcript.participants if p.name]
        participant_name_to_id = {
            p.name: p.id for p in transcript.participants if p.name and p.id
        }

    # TODO: refactor SummaryBuilder methods into standalone functions
    llm = LLM(settings=settings)
    builder = SummaryBuilder(llm, logger=logger)
    builder.set_transcript(transcript_text)

    if participant_names:
        builder.set_known_participants(
            participant_names, participant_name_to_id=participant_name_to_id
        )

    ctx.log("extract_subjects: calling LLM to extract subjects")
    await builder.extract_subjects()

    ctx.log(f"extract_subjects complete: {len(builder.subjects)} subjects")

    return SubjectsResult(
        subjects=builder.subjects,
        transcript_text=transcript_text,
        participant_names=participant_names,
        participant_name_to_id=participant_name_to_id,
    )


@diarization_pipeline.task(
    parents=[extract_subjects],
    execution_timeout=timedelta(seconds=TIMEOUT_HEAVY),
    retries=3,
)
@with_error_handling("process_subjects")
async def process_subjects(input: PipelineInput, ctx: Context) -> ProcessSubjectsResult:
    """Spawn child workflows for each subject (dynamic fan-out, parallel LLM calls)."""
    subjects_result = ctx.task_output(extract_subjects)
    subjects = subjects_result.subjects

    if not subjects:
        ctx.log("process_subjects: no subjects to process")
        return ProcessSubjectsResult(subject_summaries=[])

    ctx.log(f"process_subjects: spawning {len(subjects)} subject workflows in parallel")

    bulk_runs = [
        subject_workflow.create_bulk_run_item(
            input=SubjectInput(
                subject=subject,
                subject_index=i,
                transcript_text=subjects_result.transcript_text,
                participant_names=subjects_result.participant_names,
                participant_name_to_id=subjects_result.participant_name_to_id,
            )
        )
        for i, subject in enumerate(subjects)
    ]

    results = await subject_workflow.aio_run_many(bulk_runs)

    subject_summaries = [
        SubjectSummaryResult(**result["generate_detailed_summary"])
        for result in results
    ]

    ctx.log(f"process_subjects complete: {len(subject_summaries)} summaries")

    return ProcessSubjectsResult(subject_summaries=subject_summaries)


@diarization_pipeline.task(
    parents=[process_subjects],
    execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM),
    retries=3,
)
@with_error_handling("generate_recap")
async def generate_recap(input: PipelineInput, ctx: Context) -> RecapResult:
    """Generate recap and long summary from subject summaries, save to database."""
    ctx.log(f"generate_recap: starting for transcript_id={input.transcript_id}")

    subjects_result = ctx.task_output(extract_subjects)
    process_result = ctx.task_output(process_subjects)

    # Deferred imports: Hatchet workers fork processes, fresh imports avoid
    # sharing DB connections and LLM HTTP pools across forks
    from reflector.db.transcripts import (  # noqa: PLC0415
        TranscriptFinalLongSummary,
        TranscriptFinalShortSummary,
        transcripts_controller,
    )
    from reflector.llm import LLM  # noqa: PLC0415
    from reflector.processors.summary.prompts import (  # noqa: PLC0415
        RECAP_PROMPT,
        build_participant_instructions,
        build_summary_markdown,
    )

    subject_summaries = process_result.subject_summaries

    if not subject_summaries:
        ctx.log("generate_recap: no subject summaries, returning empty")
        return RecapResult(short_summary="", long_summary="")

    summaries = [
        {"subject": s.subject, "summary": s.paragraph_summary}
        for s in subject_summaries
    ]

    summaries_text = "\n\n".join([f"{s['subject']}: {s['summary']}" for s in summaries])

    llm = LLM(settings=settings)

    participant_instructions = build_participant_instructions(
        subjects_result.participant_names
    )

    recap_prompt = RECAP_PROMPT
    if participant_instructions:
        recap_prompt = f"{recap_prompt}\n\n{participant_instructions}"

    ctx.log("generate_recap: calling LLM for recap")
    recap_response = await llm.get_response(
        recap_prompt,
        [summaries_text],
        tone_name="Recap summarizer",
    )
    short_summary = str(recap_response)

    long_summary = build_summary_markdown(short_summary, summaries)

    async with fresh_db_connection():
        transcript = await transcripts_controller.get_by_id(input.transcript_id)
        if transcript:
            await transcripts_controller.update(
                transcript,
                {
                    "short_summary": short_summary,
                    "long_summary": long_summary,
                },
            )

            final_short = TranscriptFinalShortSummary(short_summary=short_summary)
            await append_event_and_broadcast(
                input.transcript_id,
                transcript,
                "FINAL_SHORT_SUMMARY",
                final_short,
                logger=logger,
            )

            final_long = TranscriptFinalLongSummary(long_summary=long_summary)
            await append_event_and_broadcast(
                input.transcript_id,
                transcript,
                "FINAL_LONG_SUMMARY",
                final_long,
                logger=logger,
            )

    ctx.log("generate_recap complete")

    return RecapResult(short_summary=short_summary, long_summary=long_summary)


@diarization_pipeline.task(
    parents=[extract_subjects],
    execution_timeout=timedelta(seconds=TIMEOUT_LONG),
    retries=3,
)
@with_error_handling("identify_action_items")
async def identify_action_items(
    input: PipelineInput, ctx: Context
) -> ActionItemsResult:
    """Identify action items from transcript (parallel with subject processing)."""
    ctx.log(f"identify_action_items: starting for transcript_id={input.transcript_id}")

    subjects_result = ctx.task_output(extract_subjects)

    if not subjects_result.transcript_text:
        ctx.log("identify_action_items: no transcript text, returning empty")
        return ActionItemsResult(action_items={"decisions": [], "next_steps": []})

    # Deferred imports: Hatchet workers fork processes, fresh imports avoid
    # sharing DB connections and LLM HTTP pools across forks
    from reflector.db.transcripts import (  # noqa: PLC0415
        TranscriptActionItems,
        transcripts_controller,
    )
    from reflector.llm import LLM  # noqa: PLC0415
    from reflector.processors.summary.summary_builder import (  # noqa: PLC0415
        SummaryBuilder,
    )

    # TODO: refactor SummaryBuilder methods into standalone functions
    llm = LLM(settings=settings)
    builder = SummaryBuilder(llm, logger=logger)
    builder.set_transcript(subjects_result.transcript_text)

    if subjects_result.participant_names:
        builder.set_known_participants(
            subjects_result.participant_names,
            participant_name_to_id=subjects_result.participant_name_to_id,
        )

    ctx.log("identify_action_items: calling LLM")
    action_items_response = await builder.identify_action_items()

    if action_items_response is None:
        raise RuntimeError("Failed to identify action items - LLM call failed")

    action_items_dict = action_items_response.model_dump()

    async with fresh_db_connection():
        transcript = await transcripts_controller.get_by_id(input.transcript_id)
        if transcript:
            action_items = TranscriptActionItems(action_items=action_items_dict)
            await transcripts_controller.update(
                transcript, {"action_items": action_items.action_items}
            )
            await append_event_and_broadcast(
                input.transcript_id,
                transcript,
                "ACTION_ITEMS",
                action_items,
                logger=logger,
            )

    ctx.log(
        f"identify_action_items complete: {len(action_items_dict.get('decisions', []))} decisions, "
        f"{len(action_items_dict.get('next_steps', []))} next steps"
    )

    return ActionItemsResult(action_items=action_items_dict)


@diarization_pipeline.task(
    parents=[generate_waveform, generate_title, generate_recap, identify_action_items],
    execution_timeout=timedelta(seconds=TIMEOUT_SHORT),
    retries=3,
)
@with_error_handling("finalize")
async def finalize(input: PipelineInput, ctx: Context) -> FinalizeResult:
    """Finalize transcript: save words, emit TRANSCRIPT event, set status to 'ended'.

    Matches Celery's on_transcript + set_status behavior.
    Note: Title and summaries are already saved by their respective task callbacks.
    """
    ctx.log("finalize: saving transcript and setting status to 'ended'")

    mixdown_result = ctx.task_output(mixdown_tracks)
    track_result = ctx.task_output(process_tracks)

    duration = mixdown_result.duration
    all_words = track_result.all_words

    # Cleanup temporary padded S3 files (deferred until finalize for semantic parity with Celery)
    created_padded_files = track_result.created_padded_files
    if created_padded_files:
        ctx.log(f"Cleaning up {len(created_padded_files)} temporary S3 files")
        storage = _spawn_storage()
        cleanup_results = await asyncio.gather(
            *[storage.delete_file(path) for path in created_padded_files],
            return_exceptions=True,
        )
        for storage_path, result in zip(created_padded_files, cleanup_results):
            if isinstance(result, Exception):
                logger.warning(
                    "[Hatchet] Failed to cleanup temporary padded track",
                    storage_path=storage_path,
                    error=str(result),
                )

    async with fresh_db_connection():
        from reflector.db.transcripts import (  # noqa: PLC0415
            TranscriptDuration,
            TranscriptText,
            transcripts_controller,
        )

        transcript = await transcripts_controller.get_by_id(input.transcript_id)
        if transcript is None:
            raise ValueError(f"Transcript {input.transcript_id} not found in database")

        merged_transcript = TranscriptType(words=all_words, translation=None)

        await append_event_and_broadcast(
            input.transcript_id,
            transcript,
            "TRANSCRIPT",
            TranscriptText(
                text=merged_transcript.text,
                translation=merged_transcript.translation,
            ),
            logger=logger,
        )

        # Save duration and clear workflow_run_id (workflow completed successfully)
        # Note: title/long_summary/short_summary already saved by their callbacks
        await transcripts_controller.update(
            transcript,
            {
                "duration": duration,
                "workflow_run_id": None,  # Clear on success - no need to resume
            },
        )

        duration_data = TranscriptDuration(duration=duration)
        await append_event_and_broadcast(
            input.transcript_id, transcript, "DURATION", duration_data, logger=logger
        )

        await set_status_and_broadcast(input.transcript_id, "ended", logger=logger)

    ctx.log(
        f"finalize complete: transcript {input.transcript_id} status set to 'ended'"
    )

    return FinalizeResult(status="COMPLETED")


@diarization_pipeline.task(
    parents=[finalize], execution_timeout=timedelta(seconds=TIMEOUT_SHORT), retries=3
)
@with_error_handling("cleanup_consent", set_error_status=False)
async def cleanup_consent(input: PipelineInput, ctx: Context) -> ConsentResult:
    """Check consent and delete audio files if any participant denied."""
    ctx.log(f"cleanup_consent: transcript_id={input.transcript_id}")

    async with fresh_db_connection():
        from reflector.db.meetings import (  # noqa: PLC0415
            meeting_consent_controller,
            meetings_controller,
        )
        from reflector.db.recordings import recordings_controller  # noqa: PLC0415
        from reflector.db.transcripts import transcripts_controller  # noqa: PLC0415
        from reflector.storage import get_transcripts_storage  # noqa: PLC0415

        transcript = await transcripts_controller.get_by_id(input.transcript_id)
        if not transcript:
            ctx.log("cleanup_consent: transcript not found")
            return ConsentResult()

        consent_denied = False
        if transcript.meeting_id:
            meeting = await meetings_controller.get_by_id(transcript.meeting_id)
            if meeting:
                consent_denied = await meeting_consent_controller.has_any_denial(
                    meeting.id
                )

        if not consent_denied:
            ctx.log("cleanup_consent: consent approved, keeping all files")
            return ConsentResult()

        ctx.log("cleanup_consent: consent denied, deleting audio files")

        input_track_keys = set(t["s3_key"] for t in input.tracks)

        # Detect if recording.track_keys was manually modified after workflow started
        if transcript.recording_id:
            recording = await recordings_controller.get_by_id(transcript.recording_id)
            if recording and recording.track_keys:
                db_track_keys = set(filter_cam_audio_tracks(recording.track_keys))

                if input_track_keys != db_track_keys:
                    added = db_track_keys - input_track_keys
                    removed = input_track_keys - db_track_keys
                    logger.warning(
                        "[Hatchet] Track keys mismatch: DB changed since workflow start",
                        transcript_id=input.transcript_id,
                        recording_id=transcript.recording_id,
                        input_count=len(input_track_keys),
                        db_count=len(db_track_keys),
                        added_in_db=list(added) if added else None,
                        removed_from_db=list(removed) if removed else None,
                    )
                    ctx.log(
                        f"WARNING: track_keys mismatch - "
                        f"input has {len(input_track_keys)}, DB has {len(db_track_keys)}. "
                        f"Using input tracks for deletion."
                    )

        deletion_errors = []

        if input_track_keys and input.bucket_name:
            master_storage = get_transcripts_storage()
            for key in input_track_keys:
                try:
                    await master_storage.delete_file(key, bucket=input.bucket_name)
                    ctx.log(f"Deleted recording file: {input.bucket_name}/{key}")
                except Exception as e:
                    error_msg = f"Failed to delete {key}: {e}"
                    logger.error(error_msg, exc_info=True)
                    deletion_errors.append(error_msg)

        if transcript.audio_location == "storage":
            storage = get_transcripts_storage()
            try:
                await storage.delete_file(transcript.storage_audio_path)
                ctx.log(f"Deleted processed audio: {transcript.storage_audio_path}")
            except Exception as e:
                error_msg = f"Failed to delete processed audio: {e}"
                logger.error(error_msg, exc_info=True)
                deletion_errors.append(error_msg)

        if deletion_errors:
            logger.warning(
                "[Hatchet] cleanup_consent completed with errors",
                transcript_id=input.transcript_id,
                error_count=len(deletion_errors),
                errors=deletion_errors,
            )
            ctx.log(f"cleanup_consent completed with {len(deletion_errors)} errors")
        else:
            await transcripts_controller.update(transcript, {"audio_deleted": True})
            ctx.log("cleanup_consent: all audio deleted successfully")

    return ConsentResult()


@diarization_pipeline.task(
    parents=[cleanup_consent],
    execution_timeout=timedelta(seconds=TIMEOUT_SHORT),
    retries=5,
)
@with_error_handling("post_zulip", set_error_status=False)
async def post_zulip(input: PipelineInput, ctx: Context) -> ZulipResult:
    """Post notification to Zulip."""
    ctx.log(f"post_zulip: transcript_id={input.transcript_id}")

    if not settings.ZULIP_REALM:
        ctx.log("post_zulip skipped (Zulip not configured)")
        return ZulipResult(zulip_message_id=None, skipped=True)

    async with fresh_db_connection():
        from reflector.db.transcripts import transcripts_controller  # noqa: PLC0415

        transcript = await transcripts_controller.get_by_id(input.transcript_id)
        if transcript:
            message_id = await post_transcript_notification(transcript)
            ctx.log(f"post_zulip complete: zulip_message_id={message_id}")
        else:
            message_id = None

    return ZulipResult(zulip_message_id=message_id)


@diarization_pipeline.task(
    parents=[post_zulip],
    execution_timeout=timedelta(seconds=TIMEOUT_MEDIUM),
    retries=30,
)
@with_error_handling("send_webhook", set_error_status=False)
async def send_webhook(input: PipelineInput, ctx: Context) -> WebhookResult:
    """Send completion webhook to external service."""
    ctx.log(f"send_webhook: transcript_id={input.transcript_id}")

    if not input.room_id:
        ctx.log("send_webhook skipped (no room_id)")
        return WebhookResult(webhook_sent=False, skipped=True)

    async with fresh_db_connection():
        from reflector.db.rooms import rooms_controller  # noqa: PLC0415
        from reflector.db.transcripts import transcripts_controller  # noqa: PLC0415

        room = await rooms_controller.get_by_id(input.room_id)
        transcript = await transcripts_controller.get_by_id(input.transcript_id)

        if room and room.webhook_url and transcript:
            webhook_payload = {
                "event": "transcript.completed",
                "transcript_id": input.transcript_id,
                "title": transcript.title,
                "duration": transcript.duration,
            }

            async with httpx.AsyncClient() as client:
                response = await client.post(
                    room.webhook_url, json=webhook_payload, timeout=30
                )
                response.raise_for_status()

            ctx.log(f"send_webhook complete: status_code={response.status_code}")

            return WebhookResult(webhook_sent=True, response_code=response.status_code)

    return WebhookResult(webhook_sent=False, skipped=True)