self-review

This commit is contained in:
Igor Loskutov
2025-12-22 18:06:59 -05:00
parent 8665204ab1
commit 2cbc373cc3
4 changed files with 73 additions and 76 deletions

View File

@@ -34,14 +34,19 @@ from reflector.hatchet.workflows.models import (
FinalizeResult,
MixdownResult,
PaddedTrackInfo,
PadTrackResult,
ParticipantInfo,
ParticipantsResult,
ProcessSubjectsResult,
ProcessTracksResult,
RecapResult,
RecordingResult,
SubjectsResult,
SubjectSummaryResult,
TitleResult,
TopicChunkResult,
TopicsResult,
TranscribeTrackResult,
WaveformResult,
WebhookResult,
ZulipResult,
@@ -58,13 +63,8 @@ from reflector.hatchet.workflows.track_processing import TrackInput, track_workf
from reflector.logger import logger
from reflector.pipelines import topic_processing
from reflector.processors import AudioFileWriterProcessor
from reflector.processors.types import (
TitleSummary,
Word,
)
from reflector.processors.types import (
Transcript as TranscriptType,
)
from reflector.processors.types import TitleSummary, Word
from reflector.processors.types import Transcript as TranscriptType
from reflector.settings import settings
from reflector.storage.storage_aws import AwsStorage
from reflector.utils.audio_constants import (
@@ -285,7 +285,7 @@ async def get_participants(input: PipelineInput, ctx: Context) -> ParticipantsRe
track_keys = [t["s3_key"] for t in input.tracks]
cam_audio_keys = filter_cam_audio_tracks(track_keys)
participants_list = []
participants_list: list[ParticipantInfo] = []
for idx, key in enumerate(cam_audio_keys):
try:
parsed = parse_daily_recording_filename(key)
@@ -307,11 +307,11 @@ async def get_participants(input: PipelineInput, ctx: Context) -> ParticipantsRe
)
await transcripts_controller.upsert_participant(transcript, participant)
participants_list.append(
{
"participant_id": participant_id,
"user_name": name,
"speaker": idx,
}
ParticipantInfo(
participant_id=participant_id,
user_name=name,
speaker=idx,
)
)
ctx.log(f"get_participants complete: {len(participants_list)} participants")
@@ -352,31 +352,30 @@ async def process_tracks(input: PipelineInput, ctx: Context) -> ProcessTracksRes
target_language = participants_result.target_language
track_words = []
track_words: list[list[Word]] = []
padded_tracks = []
created_padded_files = set()
for result in results:
transcribe_result = result.get("transcribe_track", {})
track_words.append(transcribe_result.get("words", []))
transcribe_result = TranscribeTrackResult(**result["transcribe_track"])
track_words.append(transcribe_result.words)
pad_result = result.get("pad_track", {})
padded_key = pad_result.get("padded_key")
bucket_name = pad_result.get("bucket_name")
pad_result = PadTrackResult(**result["pad_track"])
# Store S3 key info (not presigned URL) - consumer tasks presign on demand
if padded_key:
if pad_result.padded_key:
padded_tracks.append(
PaddedTrackInfo(key=padded_key, bucket_name=bucket_name)
PaddedTrackInfo(
key=pad_result.padded_key, bucket_name=pad_result.bucket_name
)
)
track_index = pad_result.get("track_index")
if pad_result.get("size", 0) > 0 and track_index is not None:
storage_path = f"file_pipeline_hatchet/{input.transcript_id}/tracks/padded_{track_index}.webm"
if pad_result.size > 0:
storage_path = f"file_pipeline_hatchet/{input.transcript_id}/tracks/padded_{pad_result.track_index}.webm"
created_padded_files.add(storage_path)
all_words = [word for words in track_words for word in words]
all_words.sort(key=lambda w: w.get("start", 0))
all_words.sort(key=lambda w: w.start)
ctx.log(
f"process_tracks complete: {len(all_words)} words from {len(input.tracks)} tracks"
@@ -569,9 +568,9 @@ async def detect_topics(input: PipelineInput, ctx: Context) -> TopicsResult:
first_word = chunk_words[0]
last_word = chunk_words[-1]
timestamp = first_word.get("start", 0)
duration = last_word.get("end", 0) - timestamp
chunk_text = " ".join(w.get("word", "") for w in chunk_words)
timestamp = first_word.start
duration = last_word.end - timestamp
chunk_text = " ".join(w.text for w in chunk_words)
chunks.append(
{
@@ -604,40 +603,37 @@ async def detect_topics(input: PipelineInput, ctx: Context) -> TopicsResult:
results = await topic_chunk_workflow.aio_run_many(bulk_runs)
topic_results = [
result.get("detect_chunk_topic", {})
topic_chunks = [
TopicChunkResult(**result["detect_chunk_topic"])
for result in results
if result.get("detect_chunk_topic")
if "detect_chunk_topic" in result
]
async with fresh_db_connection():
transcript = await transcripts_controller.get_by_id(input.transcript_id)
for topic_data in topic_results:
for chunk in topic_chunks:
topic = TranscriptTopic(
title=topic_data.get("title", ""),
summary=topic_data.get("summary", ""),
timestamp=topic_data.get("timestamp", 0),
transcript=" ".join(
w.get("word", "") for w in topic_data.get("words", [])
),
words=topic_data.get("words", []),
title=chunk.title,
summary=chunk.summary,
timestamp=chunk.timestamp,
transcript=" ".join(w.text for w in chunk.words),
words=[w.model_dump() for w in chunk.words],
)
await transcripts_controller.upsert_topic(transcript, topic)
await append_event_and_broadcast(
input.transcript_id, transcript, "TOPIC", topic, logger=logger
)
# Convert to TitleSummary format for downstream steps
topics_list = [
{
"title": t.get("title", ""),
"summary": t.get("summary", ""),
"timestamp": t.get("timestamp", 0),
"duration": t.get("duration", 0),
"transcript": {"words": t.get("words", [])},
}
for t in topic_results
TitleSummary(
title=chunk.title,
summary=chunk.summary,
timestamp=chunk.timestamp,
duration=chunk.duration,
transcript=TranscriptType(words=chunk.words),
)
for chunk in topic_chunks
]
ctx.log(f"detect_topics complete: found {len(topics_list)} topics")
@@ -662,8 +658,7 @@ async def generate_title(input: PipelineInput, ctx: Context) -> TitleResult:
transcripts_controller,
)
topic_objects = [TitleSummary(**t) for t in topics]
ctx.log(f"generate_title: created {len(topic_objects)} TitleSummary objects")
ctx.log(f"generate_title: received {len(topics)} TitleSummary objects")
empty_pipeline = topic_processing.EmptyPipeline(logger=logger)
title_result = None
@@ -695,7 +690,7 @@ async def generate_title(input: PipelineInput, ctx: Context) -> TitleResult:
ctx.log("generate_title: calling topic_processing.generate_title (LLM call)...")
await topic_processing.generate_title(
topic_objects,
topics,
on_title_callback=on_title_callback,
empty_pipeline=empty_pipeline,
logger=logger,
@@ -735,8 +730,6 @@ async def extract_subjects(input: PipelineInput, ctx: Context) -> SubjectsResult
SummaryBuilder,
)
topic_objects = [TitleSummary(**t) for t in topics]
async with fresh_db_connection():
transcript = await transcripts_controller.get_by_id(input.transcript_id)
@@ -750,7 +743,7 @@ async def extract_subjects(input: PipelineInput, ctx: Context) -> SubjectsResult
}
text_lines = []
for topic in topic_objects:
for topic in topics:
for segment in topic.transcript.as_segments():
name = speakermap.get(segment.speaker, f"Speaker {segment.speaker}")
text_lines.append(f"{name}: {segment.text}")
@@ -818,7 +811,9 @@ async def process_subjects(input: PipelineInput, ctx: Context) -> ProcessSubject
results = await subject_workflow.aio_run_many(bulk_runs)
subject_summaries = [
result.get("generate_detailed_summary", {}) for result in results
SubjectSummaryResult(**result["generate_detailed_summary"])
for result in results
if "generate_detailed_summary" in result
]
ctx.log(f"process_subjects complete: {len(subject_summaries)} summaries")
@@ -858,7 +853,7 @@ async def generate_recap(input: PipelineInput, ctx: Context) -> RecapResult:
return RecapResult(short_summary="", long_summary="")
summaries = [
{"subject": s.get("subject", ""), "summary": s.get("paragraph_summary", "")}
{"subject": s.subject, "summary": s.paragraph_summary}
for s in subject_summaries
]
@@ -963,7 +958,6 @@ async def identify_action_items(
action_items_dict = action_items_response.model_dump()
# Save to database and broadcast
async with fresh_db_connection():
transcript = await transcripts_controller.get_by_id(input.transcript_id)
if transcript:
@@ -1035,8 +1029,7 @@ async def finalize(input: PipelineInput, ctx: Context) -> FinalizeResult:
if transcript is None:
raise ValueError(f"Transcript {input.transcript_id} not found in database")
word_objects = [Word(**w) for w in all_words]
merged_transcript = TranscriptType(words=word_objects, translation=None)
merged_transcript = TranscriptType(words=all_words, translation=None)
await append_event_and_broadcast(
input.transcript_id,

View File

@@ -5,13 +5,20 @@ Provides static typing for all task outputs, enabling type checking
and better IDE support.
"""
from typing import Any
from pydantic import BaseModel
from reflector.processors.types import TitleSummary, Word
from reflector.utils.string import NonEmptyString
class ParticipantInfo(BaseModel):
    """Participant info with speaker index for workflow result."""

    # Identifier of the participant record upserted into the transcript store.
    participant_id: NonEmptyString
    # Display name; presumably parsed from the recording filename — TODO confirm against get_participants.
    user_name: NonEmptyString
    # Zero-based speaker index: the enumeration position of the participant's
    # cam-audio track when participants were collected.
    speaker: int
class PadTrackResult(BaseModel):
"""Result from pad_track task."""
@@ -26,7 +33,7 @@ class PadTrackResult(BaseModel):
class TranscribeTrackResult(BaseModel):
"""Result from transcribe_track task."""
words: list[dict[str, Any]]
words: list[Word]
track_index: int
@@ -41,7 +48,7 @@ class RecordingResult(BaseModel):
class ParticipantsResult(BaseModel):
"""Result from get_participants task."""
participants: list[dict[str, Any]]
participants: list[ParticipantInfo]
num_tracks: int
source_language: NonEmptyString
target_language: NonEmptyString
@@ -57,7 +64,7 @@ class PaddedTrackInfo(BaseModel):
class ProcessTracksResult(BaseModel):
"""Result from process_tracks task."""
all_words: list[dict[str, Any]]
all_words: list[Word]
padded_tracks: list[PaddedTrackInfo] # S3 keys, not presigned URLs
word_count: int
num_tracks: int
@@ -87,13 +94,13 @@ class TopicChunkResult(BaseModel):
summary: str
timestamp: float
duration: float
words: list[dict[str, Any]]
words: list[Word]
class TopicsResult(BaseModel):
"""Result from detect_topics task."""
topics: list[dict[str, Any]]
topics: list[TitleSummary]
class TitleResult(BaseModel):
@@ -123,7 +130,7 @@ class SubjectSummaryResult(BaseModel):
class ProcessSubjectsResult(BaseModel):
"""Result from process_subjects fan-out task."""
subject_summaries: list[dict[str, Any]] # List of SubjectSummaryResult dicts
subject_summaries: list[SubjectSummaryResult]
class RecapResult(BaseModel):

View File

@@ -6,7 +6,6 @@ Spawned dynamically by detect_topics via aio_run_many() for parallel processing.
"""
from datetime import timedelta
from typing import Any
from hatchet_sdk import Context
from pydantic import BaseModel
@@ -15,6 +14,7 @@ from reflector.hatchet.client import HatchetClientManager
from reflector.hatchet.workflows.models import TopicChunkResult
from reflector.logger import logger
from reflector.processors.prompts import TOPIC_PROMPT
from reflector.processors.types import Word
class TopicChunkInput(BaseModel):
@@ -24,7 +24,7 @@ class TopicChunkInput(BaseModel):
chunk_text: str
timestamp: float
duration: float
words: list[dict[str, Any]]
words: list[Word]
hatchet = HatchetClientManager.get_client()

View File

@@ -197,23 +197,20 @@ async def transcribe_track(input: TrackInput, ctx: Context) -> TranscribeTrackRe
transcript = await transcribe_file_with_processor(audio_url, input.language)
# Tag all words with speaker index
words = []
for word in transcript.words:
word_dict = word.model_dump()
word_dict["speaker"] = input.track_index
words.append(word_dict)
word.speaker = input.track_index
ctx.log(
f"transcribe_track complete: track {input.track_index}, {len(words)} words"
f"transcribe_track complete: track {input.track_index}, {len(transcript.words)} words"
)
logger.info(
"[Hatchet] transcribe_track complete",
track_index=input.track_index,
word_count=len(words),
word_count=len(transcript.words),
)
return TranscribeTrackResult(
words=words,
words=transcript.words,
track_index=input.track_index,
)