reflector/server/reflector/pipelines/main_multitrack_pipeline.py

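"""Multitrack pipeline: transcribe per-participant audio tracks.

Each track is padded with leading silence (based on its stream start_time) so
Whisper timestamps line up with the recording start; the padded tracks are
mixed into a single MP3 for playback and waveform display; each track is
transcribed separately with its own speaker ID; and the merged transcript
feeds topic detection, title, and summary generation.
"""
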
import asyncio
import io
from fractions import Fraction

import av
import boto3
import structlog
from av.audio.resampler import AudioResampler
from celery import chain, shared_task

from reflector.asynctask import asynctask
from reflector.db.transcripts import (
    TranscriptStatus,
    TranscriptText,
    TranscriptWaveform,
    transcripts_controller,
)
from reflector.logger import logger
from reflector.pipelines.main_file_pipeline import task_send_webhook_if_needed
from reflector.pipelines.main_live_pipeline import (
    PipelineMainBase,
    task_cleanup_consent,
    task_pipeline_post_to_zulip,
)
from reflector.processors import (
    AudioFileWriterProcessor,
    TranscriptFinalSummaryProcessor,
    TranscriptFinalTitleProcessor,
    TranscriptTopicDetectorProcessor,
)
from reflector.processors.audio_waveform_processor import AudioWaveformProcessor
from reflector.processors.file_transcript import FileTranscriptInput
from reflector.processors.file_transcript_auto import FileTranscriptAutoProcessor
from reflector.processors.types import TitleSummary, Transcript as TranscriptType
from reflector.settings import settings
from reflector.storage import get_transcripts_storage


class EmptyPipeline:
    """No-op pipeline stub so standalone processors can run outside a pipeline."""

    def __init__(self, logger: structlog.BoundLogger):
        self.logger = logger

    def get_pref(self, k, d=None):
        return d

    async def emit(self, event):
        pass


class PipelineMainMultitrack(PipelineMainBase):
    """Process multiple participant tracks for a transcript.

    Each track is transcribed separately (no mixed-audio transcription); a
    mixed audio file is still produced for playback and the waveform.
    """

    def __init__(self, transcript_id: str):
        super().__init__(transcript_id=transcript_id)
        self.logger = logger.bind(transcript_id=self.transcript_id)
        self.empty_pipeline = EmptyPipeline(logger=self.logger)

    async def pad_track_for_transcription(
        self,
        track_data: bytes,
        track_idx: int,
        storage,
    ) -> tuple[bytes, str]:
        """
        Pad a single track with leading silence based on the stream metadata's
        start_time, so that Whisper timestamps are relative to the recording
        start.

        Uses an ffmpeg subprocess, an approach proven to work with
        python-raw-tracks-align.

        Returns: (padded_data, storage_url)
        """
        import json
        import math
        import os
        import subprocess
        import tempfile

        if not track_data:
            return b"", ""

        transcript = await self.get_transcript()

        # Create temp files for ffmpeg processing
        with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as input_file:
            input_file.write(track_data)
            input_file_path = input_file.name
        output_file_path = input_file_path.replace(".webm", "_padded.webm")

        try:
            # Get stream metadata using ffprobe
            ffprobe_cmd = [
                "ffprobe",
                "-v",
                "error",
                "-show_entries",
                "stream=start_time",
                "-of",
                "json",
                input_file_path,
            ]
            result = subprocess.run(
                ffprobe_cmd, capture_output=True, text=True, check=True
            )
            metadata = json.loads(result.stdout)
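            # Expected shape (an assumption about ffprobe's JSON output), e.g.:
            #   {"streams": [{"start_time": "2.840000"}]}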
            # Extract start_time from stream metadata
            start_time_seconds = 0.0
            if metadata.get("streams") and len(metadata["streams"]) > 0:
                start_time_str = metadata["streams"][0].get("start_time", "0")
                start_time_seconds = float(start_time_str)
            self.logger.info(
                f"Track {track_idx} stream metadata: start_time={start_time_seconds:.3f}s",
                track_idx=track_idx,
            )

            # If no padding is needed, use the original track as-is
            if start_time_seconds <= 0:
                storage_path = f"file_pipeline/{transcript.id}/tracks/original_track_{track_idx}.webm"
                await storage.put_file(storage_path, track_data)
                url = await storage.get_file_url(storage_path)
                return track_data, url

            # Calculate delay in milliseconds
            delay_ms = math.floor(start_time_seconds * 1000)
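            # e.g. a stream start_time of 2.84 s becomes a 2840 ms delay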

            # Run ffmpeg to pad the audio while keeping the WebM/Opus format
            # for Modal compatibility.
            # ffmpeg quirk: aresample needs to come before adelay in the filter chain
            ffmpeg_cmd = [
                "ffmpeg",
                "-hide_banner",
                "-loglevel",
                "error",
                "-y",  # overwrite output
                "-i",
                input_file_path,
                "-af",
                f"aresample=async=1,adelay={delay_ms}:all=true",
                "-c:a",
                "libopus",  # keep Opus codec for Modal compatibility
                "-b:a",
                "128k",  # standard bitrate for Opus
                output_file_path,
            ]
            self.logger.info(
                f"Padding track {track_idx} with {delay_ms}ms delay using ffmpeg",
                track_idx=track_idx,
                delay_ms=delay_ms,
                command=" ".join(ffmpeg_cmd),
            )
            result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True)
            if result.returncode != 0:
                self.logger.error(
                    f"ffmpeg padding failed for track {track_idx}",
                    track_idx=track_idx,
                    stderr=result.stderr,
                    returncode=result.returncode,
                )
                raise Exception(f"ffmpeg padding failed: {result.stderr}")

            # Read the padded output
            with open(output_file_path, "rb") as f:
                padded_data = f.read()

            # Store the padded track
            storage_path = (
                f"file_pipeline/{transcript.id}/tracks/padded_track_{track_idx}.webm"
            )
            await storage.put_file(storage_path, padded_data)
            padded_url = await storage.get_file_url(storage_path)
            self.logger.info(
                f"Successfully padded track {track_idx} with {start_time_seconds:.3f}s offset, stored at {storage_path}",
                track_idx=track_idx,
                delay_ms=delay_ms,
                padded_url=padded_url,
                padded_size=len(padded_data),
            )
            return padded_data, padded_url
        finally:
            # Clean up temp files
            for path in (input_file_path, output_file_path):
                try:
                    os.unlink(path)
                except OSError:
                    pass

    async def mixdown_tracks(
        self,
        track_datas: list[bytes],
        writer: AudioFileWriterProcessor,
        offsets_seconds: list[float] | None = None,
    ) -> None:
        """
        Minimal multi-track mixdown using a PyAV filter graph (amix), with no
        sample-rate resampling.
        """
        # Discover the target sample rate from the first decodable frame
        target_sample_rate: int | None = None
        for data in track_datas:
            if not data:
                continue
            try:
                container = av.open(io.BytesIO(data))
                try:
                    for frame in container.decode(audio=0):
                        target_sample_rate = frame.sample_rate
                        break
                finally:
                    container.close()
            except Exception:
                continue
            if target_sample_rate:
                break
        if not target_sample_rate:
            self.logger.warning("Mixdown skipped - no decodable audio frames found")
            return

        # Build the PyAV filter graph:
        #   N x abuffer (s32/stereo)
        #     -> optional adelay per input (for alignment)
        #     -> amix (s32)
        #     -> aformat (s32)
        #     -> sink
        graph = av.filter.Graph()
        inputs = []
        valid_track_datas = [d for d in track_datas if d]

        # Align the offsets list with the filtered inputs (skip empties)
        input_offsets_seconds = None
        if offsets_seconds is not None:
            input_offsets_seconds = [
                offsets_seconds[i] for i, d in enumerate(track_datas) if d
            ]

        for idx, data in enumerate(valid_track_datas):
            args = (
                f"time_base=1/{target_sample_rate}:"
                f"sample_rate={target_sample_rate}:"
                f"sample_fmt=s32:"
                f"channel_layout=stereo"
            )
            in_ctx = graph.add("abuffer", args=args, name=f"in{idx}")
            inputs.append(in_ctx)

        if not inputs:
            self.logger.warning("Mixdown skipped - no valid inputs for graph")
            return

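        # amix scales inputs down by default; normalize=0 disables that
        # rescaling so tracks are summed at their original gain (may clip)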
        mixer = graph.add("amix", args=f"inputs={len(inputs)}:normalize=0", name="mix")
        fmt = graph.add(
            "aformat",
            args=(
                f"sample_fmts=s32:channel_layouts=stereo:sample_rates={target_sample_rate}"
            ),
            name="fmt",
        )
        sink = graph.add("abuffersink", name="out")

        # Optional per-input delay before mixing
        delays_ms: list[int] = []
        if input_offsets_seconds is not None:
            base = min(input_offsets_seconds) if input_offsets_seconds else 0.0
            delays_ms = [
                max(0, int(round((o - base) * 1000))) for o in input_offsets_seconds
            ]
        else:
            delays_ms = [0 for _ in inputs]

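        # Delays are relative to the earliest offset, e.g. offsets of
        # [1.0, 2.5] seconds become adelay values of [0, 1500] ms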
        for idx, in_ctx in enumerate(inputs):
            delay_ms = delays_ms[idx] if idx < len(delays_ms) else 0
            if delay_ms > 0:
                # adelay requires one value per channel; use the same for stereo
                adelay = graph.add(
                    "adelay",
                    args=f"delays={delay_ms}|{delay_ms}:all=1",
                    name=f"delay{idx}",
                )
                in_ctx.link_to(adelay)
                adelay.link_to(mixer, 0, idx)
            else:
                in_ctx.link_to(mixer, 0, idx)
        mixer.link_to(fmt)
        fmt.link_to(sink)
        graph.configure()

        # Open containers for decoding
        containers = []
        for i, d in enumerate(valid_track_datas):
            try:
                containers.append(av.open(io.BytesIO(d)))
            except Exception as e:
                self.logger.warning(
                    "Mixdown: failed to open container", input=i, error=str(e)
                )
                containers.append(None)
        # Keep each opened container paired with its abuffer input, so that a
        # failed open does not shift later decoders onto the wrong graph input
        opened = [(c, inputs[i]) for i, c in enumerate(containers) if c is not None]
        containers = [c for c, _ in opened]
        graph_inputs = [in_ctx for _, in_ctx in opened]
        decoders = [c.decode(audio=0) for c in containers]
        active = [True] * len(decoders)

        # Per-input resamplers to enforce s32/stereo at the target rate
        # (format/layout conversion only, no sample-rate resampling)
        resamplers = [
            AudioResampler(format="s32", layout="stereo", rate=target_sample_rate)
            for _ in decoders
        ]

        try:
            # Round-robin: feed frames into the graph, pulling mixed frames as
            # they become available
            while any(active):
                for i, (dec, is_active) in enumerate(zip(decoders, active)):
                    if not is_active:
                        continue
                    try:
                        frame = next(dec)
                    except StopIteration:
                        active[i] = False
                        continue
                    # Enforce the same sample rate; skip frames with a
                    # differing rate rather than resampling them
                    if frame.sample_rate != target_sample_rate:
                        continue
                    out_frames = resamplers[i].resample(frame) or []
                    for rf in out_frames:
                        rf.sample_rate = target_sample_rate
                        rf.time_base = Fraction(1, target_sample_rate)
                        graph_inputs[i].push(rf)
                # Drain whatever mixed frames are available
                while True:
                    try:
                        mixed = sink.pull()
                    except Exception:
                        break
                    mixed.sample_rate = target_sample_rate
                    mixed.time_base = Fraction(1, target_sample_rate)
                    await writer.push(mixed)
            # Signal EOF to all inputs and drain the remainder
            for in_ctx in inputs:
                in_ctx.push(None)
            while True:
                try:
                    mixed = sink.pull()
                except Exception:
                    break
                mixed.sample_rate = target_sample_rate
                mixed.time_base = Fraction(1, target_sample_rate)
                await writer.push(mixed)
        finally:
            for c in containers:
                c.close()

    async def set_status(self, transcript_id: str, status: TranscriptStatus):
        async with self.lock_transaction():
            return await transcripts_controller.set_status(transcript_id, status)

    async def on_waveform(self, data):
        async with self.transaction():
            waveform = TranscriptWaveform(waveform=data)
            transcript = await self.get_transcript()
            return await transcripts_controller.append_event(
                transcript=transcript, event="WAVEFORM", data=waveform
            )

    async def process(self, bucket_name: str, track_keys: list[str]):
        transcript = await self.get_transcript()
        s3 = boto3.client(
            "s3",
            region_name=settings.RECORDING_STORAGE_AWS_REGION,
            aws_access_key_id=settings.RECORDING_STORAGE_AWS_ACCESS_KEY_ID,
            aws_secret_access_key=settings.RECORDING_STORAGE_AWS_SECRET_ACCESS_KEY,
        )
        storage = get_transcripts_storage()

        # Pre-download bytes for all tracks for mixing and transcription
        track_datas: list[bytes] = []
        for key in track_keys:
            try:
                obj = s3.get_object(Bucket=bucket_name, Key=key)
                track_datas.append(obj["Body"].read())
            except Exception as e:
                self.logger.warning(
                    "Skipping track - cannot read S3 object", key=key, error=str(e)
                )
                track_datas.append(b"")

        # Pad tracks first: this creates full-length tracks aligned to the
        # recording timeline
        padded_track_datas: list[bytes] = []
        padded_track_urls: list[str] = []
        for idx, data in enumerate(track_datas):
            if not data:
                padded_track_datas.append(b"")
                padded_track_urls.append("")
                continue
            padded_data, padded_url = await self.pad_track_for_transcription(
                data, idx, storage
            )
            padded_track_datas.append(padded_data)
            padded_track_urls.append(padded_url)
            self.logger.info(f"Padded track {idx} for transcription: {padded_url}")

        # Mix the padded tracks (already aligned with the timeline) into
        # transcript.audio_mp3_filename
        try:
            # Ensure the data directory exists
            transcript.data_path.mkdir(parents=True, exist_ok=True)
            mp3_writer = AudioFileWriterProcessor(
                path=str(transcript.audio_mp3_filename)
            )
            # Use the padded tracks with no additional offsets (they are
            # already aligned by the padding)
            await self.mixdown_tracks(
                padded_track_datas, mp3_writer, offsets_seconds=None
            )
            await mp3_writer.flush()

            # Upload the mixed audio to S3 for web playback
            if transcript.audio_mp3_filename.exists():
                mp3_data = transcript.audio_mp3_filename.read_bytes()
                storage_path = f"{transcript.id}/audio.mp3"
                await storage.put_file(storage_path, mp3_data)
                mp3_url = await storage.get_file_url(storage_path)
                # Update the transcript to indicate the audio is in storage
                await transcripts_controller.update(
                    transcript, {"audio_location": "storage"}
                )
                self.logger.info(
                    "Uploaded mixed audio to storage",
                    storage_path=storage_path,
                    size=len(mp3_data),
                    url=mp3_url,
                )
            else:
                self.logger.warning("Mixdown file does not exist after processing")
        except Exception as e:
            self.logger.error("Mixdown failed", error=str(e), exc_info=True)

        # Generate a waveform from the mixed audio file
        if transcript.audio_mp3_filename.exists():
            try:
                self.logger.info("Generating waveform from mixed audio")
                waveform_processor = AudioWaveformProcessor(
                    audio_path=transcript.audio_mp3_filename,
                    waveform_path=transcript.audio_waveform_filename,
                    on_waveform=self.on_waveform,
                )
                waveform_processor.set_pipeline(self.empty_pipeline)
                await waveform_processor.flush()
                self.logger.info("Waveform generated successfully")
            except Exception as e:
                self.logger.error(
                    "Waveform generation failed", error=str(e), exc_info=True
                )

        # Transcribe the padded tracks; timestamps come out aligned to the
        # recording start without further adjustment
        speaker_transcripts: list[TranscriptType] = []
        for idx, padded_url in enumerate(padded_track_urls):
            if not padded_url:
                continue
            try:
                # Transcribe the padded track
                t = await self.transcribe_file(padded_url, transcript.source_language)
            except Exception as e:
                self.logger.error(
                    "Transcription via default backend failed, trying local whisper",
                    track_idx=idx,
                    url=padded_url,
                    error=str(e),
                )
                try:
                    fallback = FileTranscriptAutoProcessor(name="whisper")
                    result = None

                    async def capture_result(r):
                        nonlocal result
                        result = r

                    fallback.on(capture_result)
                    await fallback.push(
                        FileTranscriptInput(
                            audio_url=padded_url, language=transcript.source_language
                        )
                    )
                    await fallback.flush()
                    if not result:
                        raise Exception("No transcript captured in fallback")
                    t = result
                except Exception as e2:
                    self.logger.error(
                        "Skipping track - transcription failed after fallback",
                        track_idx=idx,
                        url=padded_url,
                        error=str(e2),
                    )
                    continue
            if not t.words:
                continue
            # No offset adjustment is needed: timestamps are already correct
            # because we transcribed the padded tracks. Just set the speaker ID.
            for w in t.words:
                w.speaker = idx
            speaker_transcripts.append(t)
            self.logger.info(
                f"Track {idx} transcribed successfully with {len(t.words)} words",
                track_idx=idx,
            )

        if not speaker_transcripts:
            raise Exception("No valid track transcriptions")

        # Merge all words and sort by timestamp
        merged_words = []
        for t in speaker_transcripts:
            merged_words.extend(t.words)
        merged_words.sort(
            key=lambda w: w.start if hasattr(w, "start") and w.start is not None else 0
        )
        merged_transcript = TranscriptType(words=merged_words, translation=None)

        await transcripts_controller.append_event(
            transcript,
            event="TRANSCRIPT",
            data=TranscriptText(
                text=merged_transcript.text, translation=merged_transcript.translation
            ),
        )

        topics = await self.detect_topics(merged_transcript, transcript.target_language)
        await asyncio.gather(
            self.generate_title(topics),
            self.generate_summaries(topics),
            return_exceptions=False,
        )
        await self.set_status(transcript.id, "ended")

    async def transcribe_file(self, audio_url: str, language: str) -> TranscriptType:
        processor = FileTranscriptAutoProcessor()
        input_data = FileTranscriptInput(audio_url=audio_url, language=language)
        result: TranscriptType | None = None

        async def capture_result(transcript):
            nonlocal result
            result = transcript

        processor.on(capture_result)
        await processor.push(input_data)
        await processor.flush()
        if not result:
            raise ValueError("No transcript captured")
        return result

    async def detect_topics(
        self, transcript: TranscriptType, target_language: str
    ) -> list[TitleSummary]:
        chunk_size = 300
        topics: list[TitleSummary] = []

        async def on_topic(topic: TitleSummary):
            topics.append(topic)
            return await self.on_topic(topic)

        topic_detector = TranscriptTopicDetectorProcessor(callback=on_topic)
        topic_detector.set_pipeline(self.empty_pipeline)
        for i in range(0, len(transcript.words), chunk_size):
            chunk_words = transcript.words[i : i + chunk_size]
            if not chunk_words:
                continue
            chunk_transcript = TranscriptType(
                words=chunk_words, translation=transcript.translation
            )
            await topic_detector.push(chunk_transcript)
        await topic_detector.flush()
        return topics

    async def generate_title(self, topics: list[TitleSummary]):
        if not topics:
            self.logger.warning("No topics for title generation")
            return
        processor = TranscriptFinalTitleProcessor(callback=self.on_title)
        processor.set_pipeline(self.empty_pipeline)
        for topic in topics:
            await processor.push(topic)
        await processor.flush()

    async def generate_summaries(self, topics: list[TitleSummary]):
        if not topics:
            self.logger.warning("No topics for summary generation")
            return
        transcript = await self.get_transcript()
        processor = TranscriptFinalSummaryProcessor(
            transcript=transcript,
            callback=self.on_long_summary,
            on_short_summary=self.on_short_summary,
        )
        processor.set_pipeline(self.empty_pipeline)
        for topic in topics:
            await processor.push(topic)
        await processor.flush()


@shared_task
@asynctask
async def task_pipeline_multitrack_process(
    *, transcript_id: str, bucket_name: str, track_keys: list[str]
):
    pipeline = PipelineMainMultitrack(transcript_id=transcript_id)
    try:
        await pipeline.set_status(transcript_id, "processing")
        await pipeline.process(bucket_name, track_keys)
    except Exception:
        await pipeline.set_status(transcript_id, "error")
        raise

    # On success, run the follow-up tasks as a Celery chain
    post_chain = chain(
        task_cleanup_consent.si(transcript_id=transcript_id),
        task_pipeline_post_to_zulip.si(transcript_id=transcript_id),
        task_send_webhook_if_needed.si(transcript_id=transcript_id),
    )
    post_chain.delay()
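

# Example invocation (hypothetical IDs, bucket, and keys), enqueued from
# wherever a multitrack recording becomes available:
#
#   task_pipeline_multitrack_process.delay(
#       transcript_id="abc123",
#       bucket_name="recordings",
#       track_keys=["room1/track_0.webm", "room1/track_1.webm"],
#   )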