feat: durable (#794)

* durable (no-mistakes) * hatchet no-mistake * hatchet no-mistake * hatchet no-mistake, better logging * remove conductor and add hatchet tests (no-mistakes) * self-review (no-mistakes) * hatched logs * remove shadow mode for hatchet * and add hatchet processor setting to room * . * cleanup * hatchet init db * self-review (no-mistakes) * self-review (no-mistakes) * hatchet: restore zullip report * self-review round * self-review round * self-review round * dry hatchet with celery * dry hatched with celery - 2 * self-review round * more NES instead of str * self-review wip * self-review round * self-review round * self-review round * can_replay cancelled * add forgotten file * pr autoreviewer fixes * better log webhook events * durable_started return * migration sync * latest changes feature parity * migration merge * pr review --------- Co-authored-by: Igor Loskutov <igor.loskutoff@gmail.com>
2026-02-05 10:26:48 +00:00 · 2025-12-22 12:09:20 -05:00
parent f580b996ee
commit 1dac999b56
36 changed files with 4908 additions and 2009 deletions
--- a/server/reflector/pipelines/main_file_pipeline.py
+++ b/server/reflector/pipelines/main_file_pipeline.py
@@ -97,13 +97,8 @@ class PipelineMainFile(PipelineMainBase):
                },
            )

-        # Extract audio and write to transcript location
        audio_path = await self.extract_and_write_audio(file_path, transcript)
-
-        # Upload for processing
        audio_url = await self.upload_audio(audio_path, transcript)
-
-        # Run parallel processing
        await self.run_parallel_processing(
            audio_path,
            audio_url,
@@ -197,7 +192,6 @@ class PipelineMainFile(PipelineMainBase):
        transcript_result = results[0]
        diarization_result = results[1]

-        # Handle errors - raise any exception that occurred
        self._handle_gather_exceptions(results, "parallel processing")
        for result in results:
            if isinstance(result, Exception):
@@ -212,7 +206,6 @@ class PipelineMainFile(PipelineMainBase):
            transcript=transcript_result, diarization=diarization_result or []
        )

-        # Store result for retrieval
        diarized_transcript: Transcript | None = None

        async def capture_result(transcript):
@@ -349,7 +342,6 @@ async def task_pipeline_file_process(*, transcript_id: str):
    try:
        await pipeline.set_status(transcript_id, "processing")

-        # Find the file to process
        audio_file = next(transcript.data_path.glob("upload.*"), None)
        if not audio_file:
            audio_file = next(transcript.data_path.glob("audio.*"), None)
--- a/server/reflector/pipelines/main_multitrack_pipeline.py
+++ b/server/reflector/pipelines/main_multitrack_pipeline.py
@@ -1,11 +1,8 @@
 import asyncio
-import math
 import tempfile
-from fractions import Fraction
 from pathlib import Path

 import av
-from av.audio.resampler import AudioResampler
 from celery import chain, shared_task

 from reflector.asynctask import asynctask
@@ -32,6 +29,15 @@ from reflector.processors.audio_waveform_processor import AudioWaveformProcessor
 from reflector.processors.types import TitleSummary
 from reflector.processors.types import Transcript as TranscriptType
 from reflector.storage import Storage, get_transcripts_storage
+from reflector.utils.audio_constants import PRESIGNED_URL_EXPIRATION_SECONDS
+from reflector.utils.audio_mixdown import (
+    detect_sample_rate_from_tracks,
+    mixdown_tracks_pyav,
+)
+from reflector.utils.audio_padding import (
+    apply_audio_padding_to_file,
+    extract_stream_start_time_from_container,
+)
 from reflector.utils.daily import (
    filter_cam_audio_tracks,
    parse_daily_recording_filename,
@@ -39,13 +45,6 @@ from reflector.utils.daily import (
 from reflector.utils.string import NonEmptyString
 from reflector.video_platforms.factory import create_platform_client

-# Audio encoding constants
-OPUS_STANDARD_SAMPLE_RATE = 48000
-OPUS_DEFAULT_BIT_RATE = 128000
-
-# Storage operation constants
-PRESIGNED_URL_EXPIRATION_SECONDS = 7200  # 2 hours
-

 class PipelineMainMultitrack(PipelineMainBase):
    def __init__(self, transcript_id: str):
@@ -125,8 +124,8 @@ class PipelineMainMultitrack(PipelineMainBase):
        try:
            # PyAV streams input from S3 URL efficiently (2-5MB fixed overhead for codec/filters)
            with av.open(track_url) as in_container:
-                start_time_seconds = self._extract_stream_start_time_from_container(
-                    in_container, track_idx
+                start_time_seconds = extract_stream_start_time_from_container(
+                    in_container, track_idx, logger=self.logger
                )

                if start_time_seconds <= 0:
@@ -144,8 +143,12 @@ class PipelineMainMultitrack(PipelineMainBase):
                    temp_path = temp_file.name

                try:
-                    self._apply_audio_padding_to_file(
-                        in_container, temp_path, start_time_seconds, track_idx
+                    apply_audio_padding_to_file(
+                        in_container,
+                        temp_path,
+                        start_time_seconds,
+                        track_idx,
+                        logger=self.logger,
                    )

                    storage_path = (
@@ -156,7 +159,6 @@ class PipelineMainMultitrack(PipelineMainBase):
                    with open(temp_path, "rb") as padded_file:
                        await storage.put_file(storage_path, padded_file)
                finally:
-                    # Clean up temp file
                    Path(temp_path).unlink(missing_ok=True)

                padded_url = await storage.get_file_url(
@@ -186,317 +188,28 @@ class PipelineMainMultitrack(PipelineMainBase):
                f"Track {track_idx} padding failed - transcript would have incorrect timestamps"
            ) from e

-    def _extract_stream_start_time_from_container(
-        self, container, track_idx: int
-    ) -> float:
-        """
-        Extract meeting-relative start time from WebM stream metadata.
-        Uses PyAV to read stream.start_time from WebM container.
-        More accurate than filename timestamps by ~209ms due to network/encoding delays.
-        """
-        start_time_seconds = 0.0
-        try:
-            audio_streams = [s for s in container.streams if s.type == "audio"]
-            stream = audio_streams[0] if audio_streams else container.streams[0]
-
-            # 1) Try stream-level start_time (most reliable for Daily.co tracks)
-            if stream.start_time is not None and stream.time_base is not None:
-                start_time_seconds = float(stream.start_time * stream.time_base)
-
-            # 2) Fallback to container-level start_time (in av.time_base units)
-            if (start_time_seconds <= 0) and (container.start_time is not None):
-                start_time_seconds = float(container.start_time * av.time_base)
-
-            # 3) Fallback to first packet DTS in stream.time_base
-            if start_time_seconds <= 0:
-                for packet in container.demux(stream):
-                    if packet.dts is not None:
-                        start_time_seconds = float(packet.dts * stream.time_base)
-                        break
-        except Exception as e:
-            self.logger.warning(
-                "PyAV metadata read failed; assuming 0 start_time",
-                track_idx=track_idx,
-                error=str(e),
-            )
-            start_time_seconds = 0.0
-
-        self.logger.info(
-            f"Track {track_idx} stream metadata: start_time={start_time_seconds:.3f}s",
-            track_idx=track_idx,
-        )
-        return start_time_seconds
-
-    def _apply_audio_padding_to_file(
-        self,
-        in_container,
-        output_path: str,
-        start_time_seconds: float,
-        track_idx: int,
-    ) -> None:
-        """Apply silence padding to audio track using PyAV filter graph, writing to file"""
-        delay_ms = math.floor(start_time_seconds * 1000)
-
-        self.logger.info(
-            f"Padding track {track_idx} with {delay_ms}ms delay using PyAV",
-            track_idx=track_idx,
-            delay_ms=delay_ms,
-        )
-
-        try:
-            with av.open(output_path, "w", format="webm") as out_container:
-                in_stream = next(
-                    (s for s in in_container.streams if s.type == "audio"), None
-                )
-                if in_stream is None:
-                    raise Exception("No audio stream in input")
-
-                out_stream = out_container.add_stream(
-                    "libopus", rate=OPUS_STANDARD_SAMPLE_RATE
-                )
-                out_stream.bit_rate = OPUS_DEFAULT_BIT_RATE
-                graph = av.filter.Graph()
-
-                abuf_args = (
-                    f"time_base=1/{OPUS_STANDARD_SAMPLE_RATE}:"
-                    f"sample_rate={OPUS_STANDARD_SAMPLE_RATE}:"
-                    f"sample_fmt=s16:"
-                    f"channel_layout=stereo"
-                )
-                src = graph.add("abuffer", args=abuf_args, name="src")
-                aresample_f = graph.add("aresample", args="async=1", name="ares")
-                # adelay requires one delay value per channel separated by '|'
-                delays_arg = f"{delay_ms}|{delay_ms}"
-                adelay_f = graph.add(
-                    "adelay", args=f"delays={delays_arg}:all=1", name="delay"
-                )
-                sink = graph.add("abuffersink", name="sink")
-
-                src.link_to(aresample_f)
-                aresample_f.link_to(adelay_f)
-                adelay_f.link_to(sink)
-                graph.configure()
-
-                resampler = AudioResampler(
-                    format="s16", layout="stereo", rate=OPUS_STANDARD_SAMPLE_RATE
-                )
-                # Decode -> resample -> push through graph -> encode Opus
-                for frame in in_container.decode(in_stream):
-                    out_frames = resampler.resample(frame) or []
-                    for rframe in out_frames:
-                        rframe.sample_rate = OPUS_STANDARD_SAMPLE_RATE
-                        rframe.time_base = Fraction(1, OPUS_STANDARD_SAMPLE_RATE)
-                        src.push(rframe)
-
-                        while True:
-                            try:
-                                f_out = sink.pull()
-                            except Exception:
-                                break
-                            f_out.sample_rate = OPUS_STANDARD_SAMPLE_RATE
-                            f_out.time_base = Fraction(1, OPUS_STANDARD_SAMPLE_RATE)
-                            for packet in out_stream.encode(f_out):
-                                out_container.mux(packet)
-
-                src.push(None)
-                while True:
-                    try:
-                        f_out = sink.pull()
-                    except Exception:
-                        break
-                    f_out.sample_rate = OPUS_STANDARD_SAMPLE_RATE
-                    f_out.time_base = Fraction(1, OPUS_STANDARD_SAMPLE_RATE)
-                    for packet in out_stream.encode(f_out):
-                        out_container.mux(packet)
-
-                for packet in out_stream.encode(None):
-                    out_container.mux(packet)
-        except Exception as e:
-            self.logger.error(
-                "PyAV padding failed for track",
-                track_idx=track_idx,
-                delay_ms=delay_ms,
-                error=str(e),
-                exc_info=True,
-            )
-            raise
-
    async def mixdown_tracks(
        self,
        track_urls: list[str],
        writer: AudioFileWriterProcessor,
        offsets_seconds: list[float] | None = None,
    ) -> None:
-        """Multi-track mixdown using PyAV filter graph (amix), reading from S3 presigned URLs"""
-
-        target_sample_rate: int | None = None
-        for url in track_urls:
-            if not url:
-                continue
-            container = None
-            try:
-                container = av.open(url)
-                for frame in container.decode(audio=0):
-                    target_sample_rate = frame.sample_rate
-                    break
-            except Exception:
-                continue
-            finally:
-                if container is not None:
-                    container.close()
-            if target_sample_rate:
-                break
-
+        """Multi-track mixdown using PyAV filter graph (amix), reading from S3 presigned URLs."""
+        target_sample_rate = detect_sample_rate_from_tracks(
+            track_urls, logger=self.logger
+        )
        if not target_sample_rate:
            self.logger.error("Mixdown failed - no decodable audio frames found")
            raise Exception("Mixdown failed: No decodable audio frames in any track")
-        # Build PyAV filter graph:
-        # N abuffer (s32/stereo)
-        #   -> optional adelay per input (for alignment)
-        #   -> amix (s32)
-        #   -> aformat(s16)
-        #   -> sink
-        graph = av.filter.Graph()
-        inputs = []
-        valid_track_urls = [url for url in track_urls if url]
-        input_offsets_seconds = None
-        if offsets_seconds is not None:
-            input_offsets_seconds = [
-                offsets_seconds[i] for i, url in enumerate(track_urls) if url
-            ]
-        for idx, url in enumerate(valid_track_urls):
-            args = (
-                f"time_base=1/{target_sample_rate}:"
-                f"sample_rate={target_sample_rate}:"
-                f"sample_fmt=s32:"
-                f"channel_layout=stereo"
-            )
-            in_ctx = graph.add("abuffer", args=args, name=f"in{idx}")
-            inputs.append(in_ctx)

-        if not inputs:
-            self.logger.error("Mixdown failed - no valid inputs for graph")
-            raise Exception("Mixdown failed: No valid inputs for filter graph")
-
-        mixer = graph.add("amix", args=f"inputs={len(inputs)}:normalize=0", name="mix")
-
-        fmt = graph.add(
-            "aformat",
-            args=(
-                f"sample_fmts=s32:channel_layouts=stereo:sample_rates={target_sample_rate}"
-            ),
-            name="fmt",
+        await mixdown_tracks_pyav(
+            track_urls,
+            writer,
+            target_sample_rate,
+            offsets_seconds=offsets_seconds,
+            logger=self.logger,
        )

-        sink = graph.add("abuffersink", name="out")
-
-        # Optional per-input delay before mixing
-        delays_ms: list[int] = []
-        if input_offsets_seconds is not None:
-            base = min(input_offsets_seconds) if input_offsets_seconds else 0.0
-            delays_ms = [
-                max(0, int(round((o - base) * 1000))) for o in input_offsets_seconds
-            ]
-        else:
-            delays_ms = [0 for _ in inputs]
-
-        for idx, in_ctx in enumerate(inputs):
-            delay_ms = delays_ms[idx] if idx < len(delays_ms) else 0
-            if delay_ms > 0:
-                # adelay requires one value per channel; use same for stereo
-                adelay = graph.add(
-                    "adelay",
-                    args=f"delays={delay_ms}|{delay_ms}:all=1",
-                    name=f"delay{idx}",
-                )
-                in_ctx.link_to(adelay)
-                adelay.link_to(mixer, 0, idx)
-            else:
-                in_ctx.link_to(mixer, 0, idx)
-        mixer.link_to(fmt)
-        fmt.link_to(sink)
-        graph.configure()
-
-        containers = []
-        try:
-            # Open all containers with cleanup guaranteed
-            for i, url in enumerate(valid_track_urls):
-                try:
-                    c = av.open(
-                        url,
-                        options={
-                            # it's trying to stream from s3 by default
-                            "reconnect": "1",
-                            "reconnect_streamed": "1",
-                            "reconnect_delay_max": "5",
-                        },
-                    )
-                    containers.append(c)
-                except Exception as e:
-                    self.logger.warning(
-                        "Mixdown: failed to open container from URL",
-                        input=i,
-                        url=url,
-                        error=str(e),
-                    )
-
-            if not containers:
-                self.logger.error("Mixdown failed - no valid containers opened")
-                raise Exception("Mixdown failed: Could not open any track containers")
-
-            decoders = [c.decode(audio=0) for c in containers]
-            active = [True] * len(decoders)
-            resamplers = [
-                AudioResampler(format="s32", layout="stereo", rate=target_sample_rate)
-                for _ in decoders
-            ]
-
-            while any(active):
-                for i, (dec, is_active) in enumerate(zip(decoders, active)):
-                    if not is_active:
-                        continue
-                    try:
-                        frame = next(dec)
-                    except StopIteration:
-                        active[i] = False
-                        # causes stream to move on / unclogs memory
-                        inputs[i].push(None)
-                        continue
-
-                    if frame.sample_rate != target_sample_rate:
-                        continue
-                    out_frames = resamplers[i].resample(frame) or []
-                    for rf in out_frames:
-                        rf.sample_rate = target_sample_rate
-                        rf.time_base = Fraction(1, target_sample_rate)
-                        inputs[i].push(rf)
-
-                    while True:
-                        try:
-                            mixed = sink.pull()
-                        except Exception:
-                            break
-                        mixed.sample_rate = target_sample_rate
-                        mixed.time_base = Fraction(1, target_sample_rate)
-                        await writer.push(mixed)
-
-            while True:
-                try:
-                    mixed = sink.pull()
-                except Exception:
-                    break
-                mixed.sample_rate = target_sample_rate
-                mixed.time_base = Fraction(1, target_sample_rate)
-                await writer.push(mixed)
-        finally:
-            # Cleanup all containers, even if processing failed
-            for c in containers:
-                if c is not None:
-                    try:
-                        c.close()
-                    except Exception:
-                        pass  # Best effort cleanup
-
    @broadcast_to_sockets
    async def set_status(self, transcript_id: str, status: TranscriptStatus):
        async with self.lock_transaction():
--- a/server/reflector/pipelines/topic_processing.py
+++ b/server/reflector/pipelines/topic_processing.py
@@ -74,13 +74,24 @@ async def generate_title(
        logger.warning("No topics for title generation")
        return

+    logger.info(
+        "generate_title: creating TranscriptFinalTitleProcessor",
+        topic_count=len(topics),
+    )
    processor = TranscriptFinalTitleProcessor(callback=on_title_callback)
    processor.set_pipeline(empty_pipeline)

-    for topic in topics:
+    for i, topic in enumerate(topics):
+        logger.info(
+            "generate_title: pushing topic to processor",
+            topic_index=i,
+            topic_title=topic.title[:50] if topic.title else None,
+        )
        await processor.push(topic)

+    logger.info("generate_title: calling processor.flush() - this triggers LLM call")
    await processor.flush()
+    logger.info("generate_title: processor.flush() completed")


 async def generate_summaries(
@@ -97,6 +108,10 @@ async def generate_summaries(
        logger.warning("No topics for summary generation")
        return

+    logger.info(
+        "generate_summaries: creating TranscriptFinalSummaryProcessor",
+        topic_count=len(topics),
+    )
    processor_kwargs = {
        "transcript": transcript,
        "callback": on_long_summary_callback,
@@ -107,7 +122,16 @@ async def generate_summaries(
    processor = TranscriptFinalSummaryProcessor(**processor_kwargs)
    processor.set_pipeline(empty_pipeline)

-    for topic in topics:
+    for i, topic in enumerate(topics):
+        logger.info(
+            "generate_summaries: pushing topic to processor",
+            topic_index=i,
+            topic_title=topic.title[:50] if topic.title else None,
+        )
        await processor.push(topic)

+    logger.info(
+        "generate_summaries: calling processor.flush() - this triggers LLM calls"
+    )
    await processor.flush()
+    logger.info("generate_summaries: processor.flush() completed")