self-review

This commit is contained in:
Igor Loskutov
2025-12-22 18:06:59 -05:00
parent 8665204ab1
commit 2cbc373cc3
4 changed files with 73 additions and 76 deletions

View File

@@ -34,14 +34,19 @@ from reflector.hatchet.workflows.models import (
FinalizeResult,
MixdownResult,
PaddedTrackInfo,
PadTrackResult,
ParticipantInfo,
ParticipantsResult,
ProcessSubjectsResult,
ProcessTracksResult,
RecapResult,
RecordingResult,
SubjectsResult,
SubjectSummaryResult,
TitleResult,
TopicChunkResult,
TopicsResult,
TranscribeTrackResult,
WaveformResult,
WebhookResult,
ZulipResult,
@@ -58,13 +63,8 @@ from reflector.hatchet.workflows.track_processing import TrackInput, track_workf
from reflector.logger import logger
from reflector.pipelines import topic_processing
from reflector.processors import AudioFileWriterProcessor
from reflector.processors.types import (
TitleSummary,
Word,
)
from reflector.processors.types import (
Transcript as TranscriptType,
)
from reflector.processors.types import TitleSummary, Word
from reflector.processors.types import Transcript as TranscriptType
from reflector.settings import settings
from reflector.storage.storage_aws import AwsStorage
from reflector.utils.audio_constants import (
@@ -285,7 +285,7 @@ async def get_participants(input: PipelineInput, ctx: Context) -> ParticipantsRe
track_keys = [t["s3_key"] for t in input.tracks]
cam_audio_keys = filter_cam_audio_tracks(track_keys)
participants_list = []
participants_list: list[ParticipantInfo] = []
for idx, key in enumerate(cam_audio_keys):
try:
parsed = parse_daily_recording_filename(key)
@@ -307,11 +307,11 @@ async def get_participants(input: PipelineInput, ctx: Context) -> ParticipantsRe
)
await transcripts_controller.upsert_participant(transcript, participant)
participants_list.append(
{
"participant_id": participant_id,
"user_name": name,
"speaker": idx,
}
ParticipantInfo(
participant_id=participant_id,
user_name=name,
speaker=idx,
)
)
ctx.log(f"get_participants complete: {len(participants_list)} participants")
@@ -352,31 +352,30 @@ async def process_tracks(input: PipelineInput, ctx: Context) -> ProcessTracksRes
target_language = participants_result.target_language
track_words = []
track_words: list[list[Word]] = []
padded_tracks = []
created_padded_files = set()
for result in results:
transcribe_result = result.get("transcribe_track", {})
track_words.append(transcribe_result.get("words", []))
transcribe_result = TranscribeTrackResult(**result["transcribe_track"])
track_words.append(transcribe_result.words)
pad_result = result.get("pad_track", {})
padded_key = pad_result.get("padded_key")
bucket_name = pad_result.get("bucket_name")
pad_result = PadTrackResult(**result["pad_track"])
# Store S3 key info (not presigned URL) - consumer tasks presign on demand
if padded_key:
if pad_result.padded_key:
padded_tracks.append(
PaddedTrackInfo(key=padded_key, bucket_name=bucket_name)
PaddedTrackInfo(
key=pad_result.padded_key, bucket_name=pad_result.bucket_name
)
)
track_index = pad_result.get("track_index")
if pad_result.get("size", 0) > 0 and track_index is not None:
storage_path = f"file_pipeline_hatchet/{input.transcript_id}/tracks/padded_{track_index}.webm"
if pad_result.size > 0:
storage_path = f"file_pipeline_hatchet/{input.transcript_id}/tracks/padded_{pad_result.track_index}.webm"
created_padded_files.add(storage_path)
all_words = [word for words in track_words for word in words]
all_words.sort(key=lambda w: w.get("start", 0))
all_words.sort(key=lambda w: w.start)
ctx.log(
f"process_tracks complete: {len(all_words)} words from {len(input.tracks)} tracks"
@@ -569,9 +568,9 @@ async def detect_topics(input: PipelineInput, ctx: Context) -> TopicsResult:
first_word = chunk_words[0]
last_word = chunk_words[-1]
timestamp = first_word.get("start", 0)
duration = last_word.get("end", 0) - timestamp
chunk_text = " ".join(w.get("word", "") for w in chunk_words)
timestamp = first_word.start
duration = last_word.end - timestamp
chunk_text = " ".join(w.text for w in chunk_words)
chunks.append(
{
@@ -604,40 +603,37 @@ async def detect_topics(input: PipelineInput, ctx: Context) -> TopicsResult:
results = await topic_chunk_workflow.aio_run_many(bulk_runs)
topic_results = [
result.get("detect_chunk_topic", {})
topic_chunks = [
TopicChunkResult(**result["detect_chunk_topic"])
for result in results
if result.get("detect_chunk_topic")
if "detect_chunk_topic" in result
]
async with fresh_db_connection():
transcript = await transcripts_controller.get_by_id(input.transcript_id)
for topic_data in topic_results:
for chunk in topic_chunks:
topic = TranscriptTopic(
title=topic_data.get("title", ""),
summary=topic_data.get("summary", ""),
timestamp=topic_data.get("timestamp", 0),
transcript=" ".join(
w.get("word", "") for w in topic_data.get("words", [])
),
words=topic_data.get("words", []),
title=chunk.title,
summary=chunk.summary,
timestamp=chunk.timestamp,
transcript=" ".join(w.text for w in chunk.words),
words=[w.model_dump() for w in chunk.words],
)
await transcripts_controller.upsert_topic(transcript, topic)
await append_event_and_broadcast(
input.transcript_id, transcript, "TOPIC", topic, logger=logger
)
# Convert to TitleSummary format for downstream steps
topics_list = [
{
"title": t.get("title", ""),
"summary": t.get("summary", ""),
"timestamp": t.get("timestamp", 0),
"duration": t.get("duration", 0),
"transcript": {"words": t.get("words", [])},
}
for t in topic_results
TitleSummary(
title=chunk.title,
summary=chunk.summary,
timestamp=chunk.timestamp,
duration=chunk.duration,
transcript=TranscriptType(words=chunk.words),
)
for chunk in topic_chunks
]
ctx.log(f"detect_topics complete: found {len(topics_list)} topics")
@@ -662,8 +658,7 @@ async def generate_title(input: PipelineInput, ctx: Context) -> TitleResult:
transcripts_controller,
)
topic_objects = [TitleSummary(**t) for t in topics]
ctx.log(f"generate_title: created {len(topic_objects)} TitleSummary objects")
ctx.log(f"generate_title: received {len(topics)} TitleSummary objects")
empty_pipeline = topic_processing.EmptyPipeline(logger=logger)
title_result = None
@@ -695,7 +690,7 @@ async def generate_title(input: PipelineInput, ctx: Context) -> TitleResult:
ctx.log("generate_title: calling topic_processing.generate_title (LLM call)...")
await topic_processing.generate_title(
topic_objects,
topics,
on_title_callback=on_title_callback,
empty_pipeline=empty_pipeline,
logger=logger,
@@ -735,8 +730,6 @@ async def extract_subjects(input: PipelineInput, ctx: Context) -> SubjectsResult
SummaryBuilder,
)
topic_objects = [TitleSummary(**t) for t in topics]
async with fresh_db_connection():
transcript = await transcripts_controller.get_by_id(input.transcript_id)
@@ -750,7 +743,7 @@ async def extract_subjects(input: PipelineInput, ctx: Context) -> SubjectsResult
}
text_lines = []
for topic in topic_objects:
for topic in topics:
for segment in topic.transcript.as_segments():
name = speakermap.get(segment.speaker, f"Speaker {segment.speaker}")
text_lines.append(f"{name}: {segment.text}")
@@ -818,7 +811,9 @@ async def process_subjects(input: PipelineInput, ctx: Context) -> ProcessSubject
results = await subject_workflow.aio_run_many(bulk_runs)
subject_summaries = [
result.get("generate_detailed_summary", {}) for result in results
SubjectSummaryResult(**result["generate_detailed_summary"])
for result in results
if "generate_detailed_summary" in result
]
ctx.log(f"process_subjects complete: {len(subject_summaries)} summaries")
@@ -858,7 +853,7 @@ async def generate_recap(input: PipelineInput, ctx: Context) -> RecapResult:
return RecapResult(short_summary="", long_summary="")
summaries = [
{"subject": s.get("subject", ""), "summary": s.get("paragraph_summary", "")}
{"subject": s.subject, "summary": s.paragraph_summary}
for s in subject_summaries
]
@@ -963,7 +958,6 @@ async def identify_action_items(
action_items_dict = action_items_response.model_dump()
# Save to database and broadcast
async with fresh_db_connection():
transcript = await transcripts_controller.get_by_id(input.transcript_id)
if transcript:
@@ -1035,8 +1029,7 @@ async def finalize(input: PipelineInput, ctx: Context) -> FinalizeResult:
if transcript is None:
raise ValueError(f"Transcript {input.transcript_id} not found in database")
word_objects = [Word(**w) for w in all_words]
merged_transcript = TranscriptType(words=word_objects, translation=None)
merged_transcript = TranscriptType(words=all_words, translation=None)
await append_event_and_broadcast(
input.transcript_id,

View File

@@ -5,13 +5,20 @@ Provides static typing for all task outputs, enabling type checking
and better IDE support.
"""
from typing import Any
from pydantic import BaseModel
from reflector.processors.types import TitleSummary, Word
from reflector.utils.string import NonEmptyString
class ParticipantInfo(BaseModel):
    """Participant info with speaker index for workflow result."""

    # Identifier of the participant record upserted into the transcript store.
    participant_id: NonEmptyString
    # Display name; presumably parsed from the recording filename — TODO confirm against get_participants.
    user_name: NonEmptyString
    # Zero-based speaker index: the enumeration position of the participant's
    # cam-audio track when participants were collected.
    speaker: int
class PadTrackResult(BaseModel):
"""Result from pad_track task."""
@@ -26,7 +33,7 @@ class PadTrackResult(BaseModel):
class TranscribeTrackResult(BaseModel):
"""Result from transcribe_track task."""
words: list[dict[str, Any]]
words: list[Word]
track_index: int
@@ -41,7 +48,7 @@ class RecordingResult(BaseModel):
class ParticipantsResult(BaseModel):
"""Result from get_participants task."""
participants: list[dict[str, Any]]
participants: list[ParticipantInfo]
num_tracks: int
source_language: NonEmptyString
target_language: NonEmptyString
@@ -57,7 +64,7 @@ class PaddedTrackInfo(BaseModel):
class ProcessTracksResult(BaseModel):
"""Result from process_tracks task."""
all_words: list[dict[str, Any]]
all_words: list[Word]
padded_tracks: list[PaddedTrackInfo] # S3 keys, not presigned URLs
word_count: int
num_tracks: int
@@ -87,13 +94,13 @@ class TopicChunkResult(BaseModel):
summary: str
timestamp: float
duration: float
words: list[dict[str, Any]]
words: list[Word]
class TopicsResult(BaseModel):
"""Result from detect_topics task."""
topics: list[dict[str, Any]]
topics: list[TitleSummary]
class TitleResult(BaseModel):
@@ -123,7 +130,7 @@ class SubjectSummaryResult(BaseModel):
class ProcessSubjectsResult(BaseModel):
"""Result from process_subjects fan-out task."""
subject_summaries: list[dict[str, Any]] # List of SubjectSummaryResult dicts
subject_summaries: list[SubjectSummaryResult]
class RecapResult(BaseModel):

View File

@@ -6,7 +6,6 @@ Spawned dynamically by detect_topics via aio_run_many() for parallel processing.
"""
from datetime import timedelta
from typing import Any
from hatchet_sdk import Context
from pydantic import BaseModel
@@ -15,6 +14,7 @@ from reflector.hatchet.client import HatchetClientManager
from reflector.hatchet.workflows.models import TopicChunkResult
from reflector.logger import logger
from reflector.processors.prompts import TOPIC_PROMPT
from reflector.processors.types import Word
class TopicChunkInput(BaseModel):
@@ -24,7 +24,7 @@ class TopicChunkInput(BaseModel):
chunk_text: str
timestamp: float
duration: float
words: list[dict[str, Any]]
words: list[Word]
hatchet = HatchetClientManager.get_client()

View File

@@ -197,23 +197,20 @@ async def transcribe_track(input: TrackInput, ctx: Context) -> TranscribeTrackRe
transcript = await transcribe_file_with_processor(audio_url, input.language)
# Tag all words with speaker index
words = []
for word in transcript.words:
word_dict = word.model_dump()
word_dict["speaker"] = input.track_index
words.append(word_dict)
word.speaker = input.track_index
ctx.log(
f"transcribe_track complete: track {input.track_index}, {len(words)} words"
f"transcribe_track complete: track {input.track_index}, {len(transcript.words)} words"
)
logger.info(
"[Hatchet] transcribe_track complete",
track_index=input.track_index,
word_count=len(words),
word_count=len(transcript.words),
)
return TranscribeTrackResult(
words=words,
words=transcript.words,
track_index=input.track_index,
)