self-review round
@@ -1,17 +0,0 @@
-"""
-Hatchet workflow utilities.
-
-Shared helpers for Hatchet task implementations.
-"""
-
-
-def to_dict(output) -> dict:
-    """Convert task output to dict, handling both dict and Pydantic model returns.
-
-    Hatchet SDK can return task outputs as either raw dicts or Pydantic models
-    depending on serialization context. This normalizes the output for consistent
-    downstream processing.
-    """
-    if isinstance(output, dict):
-        return output
-    return output.model_dump()
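Review note: the deleted to_dict() helper existed because Hatchet could hand a task output back either as a raw dict or as a parsed Pydantic model; the rest of this commit removes that normalization and uses attribute access directly on ctx.task_output(...). A minimal, self-contained sketch of what the helper normalized (RecordingResult is a hypothetical stand-in, not a model from this repo):

    # Sketch only: reproduces the removed helper against a made-up output model.
    from pydantic import BaseModel

    class RecordingResult(BaseModel):  # hypothetical stand-in for a task output model
        mtg_session_id: str | None = None

    def to_dict(output) -> dict:
        if isinstance(output, dict):
            return output
        return output.model_dump()

    assert to_dict({"mtg_session_id": "abc"}) == {"mtg_session_id": "abc"}
    assert to_dict(RecordingResult(mtg_session_id="abc")) == {"mtg_session_id": "abc"}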
@@ -27,7 +27,6 @@ from reflector.hatchet.broadcast import (
     set_status_and_broadcast,
 )
 from reflector.hatchet.client import HatchetClientManager
-from reflector.hatchet.utils import to_dict
 from reflector.hatchet.workflows.models import (
     ConsentResult,
     FinalizeResult,
@@ -69,7 +68,7 @@ from reflector.utils.daily import (
     filter_cam_audio_tracks,
     parse_daily_recording_filename,
 )
-from reflector.utils.string import NonEmptyString
+from reflector.utils.string import NonEmptyString, assert_non_none_and_non_empty
 from reflector.zulip import post_transcript_notification
 
 
@@ -212,8 +211,8 @@ async def get_participants(input: PipelineInput, ctx: Context) -> ParticipantsRe
     """Fetch participant list from Daily.co API and update transcript in database."""
     ctx.log(f"get_participants: transcript_id={input.transcript_id}")
 
-    recording_data = to_dict(ctx.task_output(get_recording))
-    mtg_session_id = recording_data.get("mtg_session_id")
+    recording = ctx.task_output(get_recording)
+    mtg_session_id = recording.mtg_session_id
 
     async with fresh_db_connection():
         from reflector.db.transcripts import (  # noqa: PLC0415
@@ -233,15 +232,14 @@ async def get_participants(input: PipelineInput, ctx: Context) -> ParticipantsRe
             },
         )
 
-    if not mtg_session_id or not settings.DAILY_API_KEY:
-        return ParticipantsResult(
-            participants=[],
-            num_tracks=len(input.tracks),
-            source_language=transcript.source_language if transcript else "en",
-            target_language=transcript.target_language if transcript else "en",
-        )
+    mtg_session_id = assert_non_none_and_non_empty(
+        mtg_session_id, "mtg_session_id is required"
+    )
+    daily_api_key = assert_non_none_and_non_empty(
+        settings.DAILY_API_KEY, "DAILY_API_KEY is required"
+    )
 
-    async with DailyApiClient(api_key=settings.DAILY_API_KEY) as client:
+    async with DailyApiClient(api_key=daily_api_key) as client:
         participants = await client.get_meeting_participants(mtg_session_id)
 
     id_to_name = {}
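Review note: the early return of an empty ParticipantsResult is replaced with fail-fast validation. A simplified stand-in for assert_non_none_and_non_empty (the real helper, added at the bottom of this diff, delegates to parse_non_empty_string and assert_not_none, so the exact exception type may differ):

    # Hedged sketch of the fail-fast behavior; ValueError is an assumption here.
    def assert_non_none_and_non_empty(value: str | None, error: str | None = None) -> str:
        if value is None or value == "":
            raise ValueError(error or "Value is None")
        return value

    try:
        assert_non_none_and_non_empty(None, "DAILY_API_KEY is required")
    except ValueError as exc:
        print(exc)  # DAILY_API_KEY is required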
@@ -302,8 +300,8 @@ async def process_tracks(input: PipelineInput, ctx: Context) -> ProcessTracksRes
     """Spawn child workflows for each track (dynamic fan-out)."""
    ctx.log(f"process_tracks: spawning {len(input.tracks)} track workflows")
 
-    participants_data = to_dict(ctx.task_output(get_participants))
-    source_language = participants_data.get("source_language", "en")
+    participants_result = ctx.task_output(get_participants)
+    source_language = participants_result.source_language
 
     child_coroutines = [
         track_workflow.aio_run(
@@ -320,7 +318,7 @@ async def process_tracks(input: PipelineInput, ctx: Context) -> ProcessTracksRes
 
     results = await asyncio.gather(*child_coroutines)
 
-    target_language = participants_data.get("target_language", "en")
+    target_language = participants_result.target_language
 
     # Collect results from each track (don't mutate lists while iterating)
     track_words = []
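Review note: process_tracks keeps the same dynamic fan-out shape, only the output access changes. A self-contained sketch of the fan-out/fan-in pattern with asyncio.gather (plain coroutines stand in for track_workflow.aio_run and the Hatchet Context):

    import asyncio

    async def process_one_track(track_id: str, source_language: str) -> dict:
        await asyncio.sleep(0)  # placeholder for per-track transcription work
        return {"track_id": track_id, "language": source_language}

    async def process_tracks(track_ids: list[str], source_language: str) -> list[dict]:
        # one coroutine per track, awaited concurrently; results keep input order
        coroutines = [process_one_track(t, source_language) for t in track_ids]
        return await asyncio.gather(*coroutines)

    print(asyncio.run(process_tracks(["cam-1", "cam-2"], "en")))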
@@ -371,31 +369,23 @@ async def mixdown_tracks(input: PipelineInput, ctx: Context) -> MixdownResult:
     """Mix all padded tracks into single audio file using PyAV (same as Celery)."""
     ctx.log("mixdown_tracks: mixing padded tracks into single audio file")
 
-    track_data = to_dict(ctx.task_output(process_tracks))
-    padded_tracks_data = track_data.get("padded_tracks", [])
+    track_result = ctx.task_output(process_tracks)
+    padded_tracks = track_result.padded_tracks
 
-    if not padded_tracks_data:
+    if not padded_tracks:
         raise ValueError("No padded tracks to mixdown")
 
     storage = _spawn_storage()
 
     # Presign URLs on demand (avoids stale URLs on workflow replay)
     padded_urls = []
-    for track_info in padded_tracks_data:
-        # Handle both dict (from to_dict) and PaddedTrackInfo
-        if isinstance(track_info, dict):
-            key = track_info.get("key")
-            bucket = track_info.get("bucket_name")
-        else:
-            key = track_info.key
-            bucket = track_info.bucket_name
-
-        if key:
+    for track_info in padded_tracks:
+        if track_info.key:
             url = await storage.get_file_url(
-                key,
+                track_info.key,
                 operation="get_object",
                 expires_in=PRESIGNED_URL_EXPIRATION_SECONDS,
-                bucket=bucket,
+                bucket=track_info.bucket_name,
             )
             padded_urls.append(url)
 
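Review note: the "presign on demand" comment is the key design point here — URLs are minted when mixdown actually needs them rather than stored in earlier task output, so a replayed step never sees an expired link. A hedged boto3 sketch of the same idea (not the project's storage layer; bucket and key names are placeholders):

    import boto3

    def presign_get_url(bucket: str, key: str, expires_in: int = 3600) -> str:
        # generate_presigned_url mints a short-lived GET URL at call time
        s3 = boto3.client("s3")
        return s3.generate_presigned_url(
            ClientMethod="get_object",
            Params={"Bucket": bucket, "Key": key},
            ExpiresIn=expires_in,
        )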
@@ -465,8 +455,8 @@ async def generate_waveform(input: PipelineInput, ctx: Context) -> WaveformResul
     )
 
     # Cleanup temporary padded S3 files (deferred until after mixdown)
-    track_data = to_dict(ctx.task_output(process_tracks))
-    created_padded_files = track_data.get("created_padded_files", [])
+    track_result = ctx.task_output(process_tracks)
+    created_padded_files = track_result.created_padded_files
     if created_padded_files:
         ctx.log(f"Cleaning up {len(created_padded_files)} temporary S3 files")
         storage = _spawn_storage()
@@ -483,8 +473,8 @@ async def generate_waveform(input: PipelineInput, ctx: Context) -> WaveformResul
                 error=str(result),
             )
 
-    mixdown_data = to_dict(ctx.task_output(mixdown_tracks))
-    audio_key = mixdown_data.get("audio_key")
+    mixdown_result = ctx.task_output(mixdown_tracks)
+    audio_key = mixdown_result.audio_key
 
     storage = _spawn_storage()
     audio_url = await storage.get_file_url(
@@ -532,9 +522,9 @@ async def detect_topics(input: PipelineInput, ctx: Context) -> TopicsResult:
     """Detect topics using LLM and save to database (matches Celery on_topic callback)."""
     ctx.log("detect_topics: analyzing transcript for topics")
 
-    track_data = to_dict(ctx.task_output(process_tracks))
-    words = track_data.get("all_words", [])
-    target_language = track_data.get("target_language", "en")
+    track_result = ctx.task_output(process_tracks)
+    words = track_result.all_words
+    target_language = track_result.target_language
 
     from reflector.db.transcripts import (  # noqa: PLC0415
         TranscriptTopic,
@@ -589,8 +579,8 @@ async def generate_title(input: PipelineInput, ctx: Context) -> TitleResult:
     """Generate meeting title using LLM and save to database (matches Celery on_title callback)."""
     ctx.log("generate_title: generating title from topics")
 
-    topics_data = to_dict(ctx.task_output(detect_topics))
-    topics = topics_data.get("topics", [])
+    topics_result = ctx.task_output(detect_topics)
+    topics = topics_result.topics
 
     from reflector.db.transcripts import (  # noqa: PLC0415
         TranscriptFinalTitle,
@@ -638,8 +628,8 @@ async def generate_summary(input: PipelineInput, ctx: Context) -> SummaryResult:
     """Generate meeting summary using LLM and save to database (matches Celery callbacks)."""
     ctx.log("generate_summary: generating long and short summaries")
 
-    topics_data = to_dict(ctx.task_output(detect_topics))
-    topics = topics_data.get("topics", [])
+    topics_result = ctx.task_output(detect_topics)
+    topics = topics_result.topics
 
     from reflector.db.transcripts import (  # noqa: PLC0415
         TranscriptFinalLongSummary,
@@ -718,11 +708,11 @@ async def finalize(input: PipelineInput, ctx: Context) -> FinalizeResult:
     """
     ctx.log("finalize: saving transcript and setting status to 'ended'")
 
-    mixdown_data = to_dict(ctx.task_output(mixdown_tracks))
-    track_data = to_dict(ctx.task_output(process_tracks))
+    mixdown_result = ctx.task_output(mixdown_tracks)
+    track_result = ctx.task_output(process_tracks)
 
-    duration = mixdown_data.get("duration", 0)
-    all_words = track_data.get("all_words", [])
+    duration = mixdown_result.duration
+    all_words = track_result.all_words
 
     async with fresh_db_connection():
         from reflector.db.transcripts import (  # noqa: PLC0415
@@ -23,7 +23,6 @@ from hatchet_sdk import Context
 from pydantic import BaseModel
 
 from reflector.hatchet.client import HatchetClientManager
-from reflector.hatchet.utils import to_dict
 from reflector.hatchet.workflows.models import PadTrackResult, TranscribeTrackResult
 from reflector.logger import logger
 from reflector.utils.audio_constants import PRESIGNED_URL_EXPIRATION_SECONDS
@@ -166,9 +165,9 @@ async def transcribe_track(input: TrackInput, ctx: Context) -> TranscribeTrackRe
     )
 
     try:
-        pad_result = to_dict(ctx.task_output(pad_track))
-        padded_key = pad_result.get("padded_key")
-        bucket_name = pad_result.get("bucket_name")
+        pad_result = ctx.task_output(pad_track)
+        padded_key = pad_result.padded_key
+        bucket_name = pad_result.bucket_name
 
         if not padded_key:
             raise ValueError("Missing padded_key from pad_track")
@@ -2,6 +2,8 @@ from typing import Annotated, TypeVar
 
 from pydantic import Field, TypeAdapter, constr
 
+from reflector.utils.common import assert_not_none
+
 NonEmptyStringBase = constr(min_length=1, strip_whitespace=False)
 NonEmptyString = Annotated[
     NonEmptyStringBase,
@@ -30,3 +32,11 @@ def assert_equal[T](s1: T, s2: T) -> T:
     if s1 != s2:
         raise ValueError(f"assert_equal: {s1} != {s2}")
     return s1
+
+
+def assert_non_none_and_non_empty(
+    value: str | None, error: str | None = None
+) -> NonEmptyString:
+    return parse_non_empty_string(
+        assert_not_none(value, error or "Value is None"), error
+    )
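Review note: usage sketch for the new helper. parse_non_empty_string and assert_not_none live elsewhere in reflector.utils and are not shown in this diff, so simplified stand-ins are used to keep the example self-contained:

    def assert_not_none(value, error: str):
        if value is None:
            raise ValueError(error)
        return value

    def parse_non_empty_string(value: str, error: str | None = None) -> str:
        if value == "":
            raise ValueError(error or "Empty string")
        return value

    def assert_non_none_and_non_empty(value: str | None, error: str | None = None) -> str:
        return parse_non_empty_string(
            assert_not_none(value, error or "Value is None"), error
        )

    print(assert_non_none_and_non_empty("session-123"))  # session-123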