self-review

Igor Loskutov
2025-12-22 18:06:59 -05:00
parent 8665204ab1
commit 2cbc373cc3
4 changed files with 73 additions and 76 deletions

View File

@@ -34,14 +34,19 @@ from reflector.hatchet.workflows.models import (
     FinalizeResult,
     MixdownResult,
     PaddedTrackInfo,
+    PadTrackResult,
+    ParticipantInfo,
     ParticipantsResult,
     ProcessSubjectsResult,
     ProcessTracksResult,
     RecapResult,
     RecordingResult,
     SubjectsResult,
+    SubjectSummaryResult,
     TitleResult,
+    TopicChunkResult,
     TopicsResult,
+    TranscribeTrackResult,
     WaveformResult,
     WebhookResult,
     ZulipResult,
@@ -58,13 +63,8 @@ from reflector.hatchet.workflows.track_processing import TrackInput, track_workf
 from reflector.logger import logger
 from reflector.pipelines import topic_processing
 from reflector.processors import AudioFileWriterProcessor
-from reflector.processors.types import (
-    TitleSummary,
-    Word,
-)
-from reflector.processors.types import (
-    Transcript as TranscriptType,
-)
+from reflector.processors.types import TitleSummary, Word
+from reflector.processors.types import Transcript as TranscriptType
 from reflector.settings import settings
 from reflector.storage.storage_aws import AwsStorage
 from reflector.utils.audio_constants import (
@@ -285,7 +285,7 @@ async def get_participants(input: PipelineInput, ctx: Context) -> ParticipantsRe
     track_keys = [t["s3_key"] for t in input.tracks]
     cam_audio_keys = filter_cam_audio_tracks(track_keys)

-    participants_list = []
+    participants_list: list[ParticipantInfo] = []
     for idx, key in enumerate(cam_audio_keys):
         try:
             parsed = parse_daily_recording_filename(key)
@@ -307,11 +307,11 @@ async def get_participants(input: PipelineInput, ctx: Context) -> ParticipantsRe
) )
await transcripts_controller.upsert_participant(transcript, participant) await transcripts_controller.upsert_participant(transcript, participant)
participants_list.append( participants_list.append(
{ ParticipantInfo(
"participant_id": participant_id, participant_id=participant_id,
"user_name": name, user_name=name,
"speaker": idx, speaker=idx,
} )
) )
ctx.log(f"get_participants complete: {len(participants_list)} participants") ctx.log(f"get_participants complete: {len(participants_list)} participants")
@@ -352,31 +352,30 @@ async def process_tracks(input: PipelineInput, ctx: Context) -> ProcessTracksRes
     target_language = participants_result.target_language

-    track_words = []
+    track_words: list[list[Word]] = []
     padded_tracks = []
     created_padded_files = set()
     for result in results:
-        transcribe_result = result.get("transcribe_track", {})
-        track_words.append(transcribe_result.get("words", []))
+        transcribe_result = TranscribeTrackResult(**result["transcribe_track"])
+        track_words.append(transcribe_result.words)

-        pad_result = result.get("pad_track", {})
-        padded_key = pad_result.get("padded_key")
-        bucket_name = pad_result.get("bucket_name")
+        pad_result = PadTrackResult(**result["pad_track"])
         # Store S3 key info (not presigned URL) - consumer tasks presign on demand
-        if padded_key:
+        if pad_result.padded_key:
             padded_tracks.append(
-                PaddedTrackInfo(key=padded_key, bucket_name=bucket_name)
+                PaddedTrackInfo(
+                    key=pad_result.padded_key, bucket_name=pad_result.bucket_name
+                )
             )

-        track_index = pad_result.get("track_index")
-        if pad_result.get("size", 0) > 0 and track_index is not None:
-            storage_path = f"file_pipeline_hatchet/{input.transcript_id}/tracks/padded_{track_index}.webm"
+        if pad_result.size > 0:
+            storage_path = f"file_pipeline_hatchet/{input.transcript_id}/tracks/padded_{pad_result.track_index}.webm"
             created_padded_files.add(storage_path)

     all_words = [word for words in track_words for word in words]
-    all_words.sort(key=lambda w: w.get("start", 0))
+    all_words.sort(key=lambda w: w.start)

     ctx.log(
         f"process_tracks complete: {len(all_words)} words from {len(input.tracks)} tracks"
@@ -569,9 +568,9 @@ async def detect_topics(input: PipelineInput, ctx: Context) -> TopicsResult:
         first_word = chunk_words[0]
         last_word = chunk_words[-1]
-        timestamp = first_word.get("start", 0)
-        duration = last_word.get("end", 0) - timestamp
-        chunk_text = " ".join(w.get("word", "") for w in chunk_words)
+        timestamp = first_word.start
+        duration = last_word.end - timestamp
+        chunk_text = " ".join(w.text for w in chunk_words)

         chunks.append(
             {
@@ -604,40 +603,37 @@ async def detect_topics(input: PipelineInput, ctx: Context) -> TopicsResult:
     results = await topic_chunk_workflow.aio_run_many(bulk_runs)

-    topic_results = [
-        result.get("detect_chunk_topic", {})
+    topic_chunks = [
+        TopicChunkResult(**result["detect_chunk_topic"])
         for result in results
-        if result.get("detect_chunk_topic")
+        if "detect_chunk_topic" in result
     ]

     async with fresh_db_connection():
         transcript = await transcripts_controller.get_by_id(input.transcript_id)
-        for topic_data in topic_results:
+        for chunk in topic_chunks:
             topic = TranscriptTopic(
-                title=topic_data.get("title", ""),
-                summary=topic_data.get("summary", ""),
-                timestamp=topic_data.get("timestamp", 0),
-                transcript=" ".join(
-                    w.get("word", "") for w in topic_data.get("words", [])
-                ),
-                words=topic_data.get("words", []),
+                title=chunk.title,
+                summary=chunk.summary,
+                timestamp=chunk.timestamp,
+                transcript=" ".join(w.text for w in chunk.words),
+                words=[w.model_dump() for w in chunk.words],
             )
             await transcripts_controller.upsert_topic(transcript, topic)
             await append_event_and_broadcast(
                 input.transcript_id, transcript, "TOPIC", topic, logger=logger
             )

-    # Convert to TitleSummary format for downstream steps
     topics_list = [
-        {
-            "title": t.get("title", ""),
-            "summary": t.get("summary", ""),
-            "timestamp": t.get("timestamp", 0),
-            "duration": t.get("duration", 0),
-            "transcript": {"words": t.get("words", [])},
-        }
-        for t in topic_results
+        TitleSummary(
+            title=chunk.title,
+            summary=chunk.summary,
+            timestamp=chunk.timestamp,
+            duration=chunk.duration,
+            transcript=TranscriptType(words=chunk.words),
+        )
+        for chunk in topic_chunks
     ]

     ctx.log(f"detect_topics complete: found {len(topics_list)} topics")
@@ -662,8 +658,7 @@ async def generate_title(input: PipelineInput, ctx: Context) -> TitleResult:
         transcripts_controller,
     )

-    topic_objects = [TitleSummary(**t) for t in topics]
-    ctx.log(f"generate_title: created {len(topic_objects)} TitleSummary objects")
+    ctx.log(f"generate_title: received {len(topics)} TitleSummary objects")

     empty_pipeline = topic_processing.EmptyPipeline(logger=logger)
     title_result = None
@@ -695,7 +690,7 @@ async def generate_title(input: PipelineInput, ctx: Context) -> TitleResult:
     ctx.log("generate_title: calling topic_processing.generate_title (LLM call)...")
     await topic_processing.generate_title(
-        topic_objects,
+        topics,
         on_title_callback=on_title_callback,
         empty_pipeline=empty_pipeline,
         logger=logger,
@@ -735,8 +730,6 @@ async def extract_subjects(input: PipelineInput, ctx: Context) -> SubjectsResult
         SummaryBuilder,
     )

-    topic_objects = [TitleSummary(**t) for t in topics]
-
     async with fresh_db_connection():
         transcript = await transcripts_controller.get_by_id(input.transcript_id)
@@ -750,7 +743,7 @@ async def extract_subjects(input: PipelineInput, ctx: Context) -> SubjectsResult
         }

     text_lines = []
-    for topic in topic_objects:
+    for topic in topics:
         for segment in topic.transcript.as_segments():
             name = speakermap.get(segment.speaker, f"Speaker {segment.speaker}")
             text_lines.append(f"{name}: {segment.text}")
@@ -818,7 +811,9 @@ async def process_subjects(input: PipelineInput, ctx: Context) -> ProcessSubject
     results = await subject_workflow.aio_run_many(bulk_runs)

     subject_summaries = [
-        result.get("generate_detailed_summary", {}) for result in results
+        SubjectSummaryResult(**result["generate_detailed_summary"])
+        for result in results
+        if "generate_detailed_summary" in result
     ]

     ctx.log(f"process_subjects complete: {len(subject_summaries)} summaries")
@@ -858,7 +853,7 @@ async def generate_recap(input: PipelineInput, ctx: Context) -> RecapResult:
         return RecapResult(short_summary="", long_summary="")

     summaries = [
-        {"subject": s.get("subject", ""), "summary": s.get("paragraph_summary", "")}
+        {"subject": s.subject, "summary": s.paragraph_summary}
         for s in subject_summaries
     ]
@@ -963,7 +958,6 @@ async def identify_action_items(
     action_items_dict = action_items_response.model_dump()

-    # Save to database and broadcast
     async with fresh_db_connection():
         transcript = await transcripts_controller.get_by_id(input.transcript_id)
         if transcript:
@@ -1035,8 +1029,7 @@ async def finalize(input: PipelineInput, ctx: Context) -> FinalizeResult:
         if transcript is None:
             raise ValueError(f"Transcript {input.transcript_id} not found in database")

-        word_objects = [Word(**w) for w in all_words]
-        merged_transcript = TranscriptType(words=word_objects, translation=None)
+        merged_transcript = TranscriptType(words=all_words, translation=None)

         await append_event_and_broadcast(
             input.transcript_id,
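The recurring pattern in this file is rehydrating typed models from the dicts Hatchet returns for child-workflow results. A minimal sketch of that pattern, not the repo's code: Word here is a stand-in with only the fields the diff relies on.

# Sketch only: Hatchet hands child-workflow results back as plain dicts after
# a JSON round trip, so each consumer re-validates them into Pydantic models.
from pydantic import BaseModel


class Word(BaseModel):
    text: str
    start: float
    end: float
    speaker: int = 0


class TranscribeTrackResult(BaseModel):
    words: list[Word]
    track_index: int


# Shape of a child result after serialization:
raw = {
    "transcribe_track": {
        "words": [{"text": "hello", "start": 0.0, "end": 0.4, "speaker": 1}],
        "track_index": 1,
    }
}

# Model(**payload) validates field types and coerces nested dicts into Word
# objects, so downstream code can write w.start instead of w.get("start", 0).
result = TranscribeTrackResult(**raw["transcribe_track"])
assert result.words[0].start == 0.0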

View File

@@ -5,13 +5,20 @@ Provides static typing for all task outputs, enabling type checking
 and better IDE support.
 """

-from typing import Any
-
 from pydantic import BaseModel

+from reflector.processors.types import TitleSummary, Word
 from reflector.utils.string import NonEmptyString


+class ParticipantInfo(BaseModel):
+    """Participant info with speaker index for workflow result."""
+
+    participant_id: NonEmptyString
+    user_name: NonEmptyString
+    speaker: int
+
+
 class PadTrackResult(BaseModel):
     """Result from pad_track task."""
@@ -26,7 +33,7 @@ class PadTrackResult(BaseModel):
 class TranscribeTrackResult(BaseModel):
     """Result from transcribe_track task."""

-    words: list[dict[str, Any]]
+    words: list[Word]
     track_index: int
@@ -41,7 +48,7 @@ class RecordingResult(BaseModel):
 class ParticipantsResult(BaseModel):
     """Result from get_participants task."""

-    participants: list[dict[str, Any]]
+    participants: list[ParticipantInfo]
     num_tracks: int
     source_language: NonEmptyString
     target_language: NonEmptyString
@@ -57,7 +64,7 @@ class PaddedTrackInfo(BaseModel):
 class ProcessTracksResult(BaseModel):
     """Result from process_tracks task."""

-    all_words: list[dict[str, Any]]
+    all_words: list[Word]
     padded_tracks: list[PaddedTrackInfo]  # S3 keys, not presigned URLs
     word_count: int
     num_tracks: int
@@ -87,13 +94,13 @@ class TopicChunkResult(BaseModel):
     summary: str
     timestamp: float
     duration: float
-    words: list[dict[str, Any]]
+    words: list[Word]


 class TopicsResult(BaseModel):
     """Result from detect_topics task."""

-    topics: list[dict[str, Any]]
+    topics: list[TitleSummary]


 class TitleResult(BaseModel):
@@ -123,7 +130,7 @@ class SubjectSummaryResult(BaseModel):
 class ProcessSubjectsResult(BaseModel):
     """Result from process_subjects fan-out task."""

-    subject_summaries: list[dict[str, Any]]  # List of SubjectSummaryResult dicts
+    subject_summaries: list[SubjectSummaryResult]


 class RecapResult(BaseModel):
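The new ParticipantInfo model leans on NonEmptyString, whose definition lives in reflector.utils.string and is outside this diff. A plausible pydantic v2 equivalent, sketched here only to show what the validation buys, is an Annotated constrained str:

# Sketch: one plausible implementation of NonEmptyString (assumption, not the
# repo's actual helper).
from typing import Annotated

from pydantic import BaseModel, StringConstraints

NonEmptyString = Annotated[str, StringConstraints(min_length=1)]


class ParticipantInfo(BaseModel):
    """Participant info with speaker index for workflow result."""

    participant_id: NonEmptyString
    user_name: NonEmptyString
    speaker: int


ParticipantInfo(participant_id="p1", user_name="Ada", speaker=0)  # validates
# ParticipantInfo(participant_id="", user_name="Ada", speaker=0)  # would raise ValidationError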

View File

@@ -6,7 +6,6 @@ Spawned dynamically by detect_topics via aio_run_many() for parallel processing.
 """

 from datetime import timedelta
-from typing import Any

 from hatchet_sdk import Context
 from pydantic import BaseModel
@@ -15,6 +14,7 @@ from reflector.hatchet.client import HatchetClientManager
 from reflector.hatchet.workflows.models import TopicChunkResult
 from reflector.logger import logger
 from reflector.processors.prompts import TOPIC_PROMPT
+from reflector.processors.types import Word


 class TopicChunkInput(BaseModel):
@@ -24,7 +24,7 @@ class TopicChunkInput(BaseModel):
     chunk_text: str
     timestamp: float
     duration: float
-    words: list[dict[str, Any]]
+    words: list[Word]


 hatchet = HatchetClientManager.get_client()
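Typing the workflow input as list[Word] works because the models survive the JSON round trip between parent and child. A minimal sketch under that assumption, with stand-in Word/TopicChunkInput models carrying only the fields shown in this diff:

# Sketch of the round trip behind aio_run_many(): the parent's typed input is
# dumped to JSON-safe dicts, and the child validates it back into models.
from pydantic import BaseModel


class Word(BaseModel):
    text: str
    start: float
    end: float
    speaker: int = 0


class TopicChunkInput(BaseModel):
    chunk_text: str
    timestamp: float
    duration: float
    words: list[Word]


payload = TopicChunkInput(
    chunk_text="hello world",
    timestamp=0.0,
    duration=0.9,
    words=[Word(text="hello", start=0.0, end=0.4), Word(text="world", start=0.5, end=0.9)],
)

wire = payload.model_dump(mode="json")  # plain dicts/lists, JSON-serializable
restored = TopicChunkInput(**wire)      # child side: back to typed models
assert restored.words[1].text == "world"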

View File

@@ -197,23 +197,20 @@ async def transcribe_track(input: TrackInput, ctx: Context) -> TranscribeTrackRe
     transcript = await transcribe_file_with_processor(audio_url, input.language)

     # Tag all words with speaker index
-    words = []
     for word in transcript.words:
-        word_dict = word.model_dump()
-        word_dict["speaker"] = input.track_index
-        words.append(word_dict)
+        word.speaker = input.track_index

     ctx.log(
-        f"transcribe_track complete: track {input.track_index}, {len(words)} words"
+        f"transcribe_track complete: track {input.track_index}, {len(transcript.words)} words"
     )
     logger.info(
         "[Hatchet] transcribe_track complete",
         track_index=input.track_index,
-        word_count=len(words),
+        word_count=len(transcript.words),
     )

     return TranscribeTrackResult(
-        words=words,
+        words=transcript.words,
         track_index=input.track_index,
     )
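A sketch of the speaker-tagging change above: instead of dumping each Word to a dict and mutating the dict, the task now sets the model field in place. Word is again a stand-in; note that pydantic v2 models are mutable by default, but plain assignment is not re-validated unless the model opts in.

# Sketch only, assuming a Word model with a speaker field.
from pydantic import BaseModel


class Word(BaseModel):
    text: str
    start: float
    end: float
    speaker: int = 0
    # To re-validate on assignment, a model can set:
    # model_config = ConfigDict(validate_assignment=True)


words = [Word(text="hello", start=0.0, end=0.4)]
for word in words:
    word.speaker = 2  # tag with the track's speaker index, in place

assert words[0].speaker == 2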