Add multitrack pipeline

2025-12-21 04:39:06 +00:00 · 2025-10-10 23:03:25 +02:00
parent 9ac7f0e8e2
commit 6301f2afa6
2 changed files with 429 additions and 0 deletions
--- a/server/reflector/pipelines/main_multitrack_pipeline.py
+++ b/server/reflector/pipelines/main_multitrack_pipeline.py
@@ -0,0 +1,292 @@
+import asyncio
+
+import boto3
+import structlog
+from celery import chain, shared_task
+
+from reflector.asynctask import asynctask
+from reflector.db.transcripts import (
+    TranscriptStatus,
+    TranscriptText,
+    transcripts_controller,
+)
+from reflector.logger import logger
+from reflector.pipelines.main_file_pipeline import task_send_webhook_if_needed
+from reflector.pipelines.main_live_pipeline import (
+    PipelineMainBase,
+    task_cleanup_consent,
+    task_pipeline_post_to_zulip,
+)
+from reflector.processors import (
+    TranscriptFinalSummaryProcessor,
+    TranscriptFinalTitleProcessor,
+    TranscriptTopicDetectorProcessor,
+)
+from reflector.processors.file_transcript import FileTranscriptInput
+from reflector.processors.file_transcript_auto import FileTranscriptAutoProcessor
+from reflector.processors.types import TitleSummary
+from reflector.processors.types import (
+    Transcript as TranscriptType,
+)
+from reflector.settings import settings
+from reflector.storage import get_transcripts_storage
+
+
+class EmptyPipeline:
+    def __init__(self, logger: structlog.BoundLogger):
+        self.logger = logger
+
+    def get_pref(self, k, d=None):
+        return d
+
+    async def emit(self, event):
+        pass
+
+
+class PipelineMainMultitrack(PipelineMainBase):
+    """Process multiple participant tracks for a transcript without mixing audio."""
+
+    def __init__(self, transcript_id: str):
+        super().__init__(transcript_id=transcript_id)
+        self.logger = logger.bind(transcript_id=self.transcript_id)
+        self.empty_pipeline = EmptyPipeline(logger=self.logger)
+
+    async def set_status(self, transcript_id: str, status: TranscriptStatus):
+        async with self.lock_transaction():
+            return await transcripts_controller.set_status(transcript_id, status)
+
+    async def _list_immediate_keys(
+        self, s3, bucket_name: str, prefix: str
+    ) -> list[str]:
+        paginator = s3.get_paginator("list_objects_v2")
+        raw_prefix = prefix.rstrip("/")
+        prefixes = [raw_prefix, raw_prefix + "/"]
+
+        keys: set[str] = set()
+        for pref in prefixes:
+            for page in paginator.paginate(Bucket=bucket_name, Prefix=pref):
+                for obj in page.get("Contents", []):
+                    key = obj["Key"]
+                    if not key.startswith(pref):
+                        continue
+                    if pref.endswith("/"):
+                        rel = key[len(pref) :]
+                        if not rel or rel.endswith("/") or "/" in rel:
+                            continue
+                    else:
+                        if key != pref:
+                            continue
+                    keys.add(key)
+        result = sorted(keys)
+        self.logger.info(
+            "S3 list immediate files",
+            prefixes=prefixes,
+            total_keys=len(result),
+            sample=result[:5],
+        )
+        return result
+
+    async def process(self, bucket_name: str, prefix: str):
+        transcript = await self.get_transcript()
+
+        s3 = boto3.client(
+            "s3",
+            region_name=settings.RECORDING_STORAGE_AWS_REGION,
+            aws_access_key_id=settings.RECORDING_STORAGE_AWS_ACCESS_KEY_ID,
+            aws_secret_access_key=settings.RECORDING_STORAGE_AWS_SECRET_ACCESS_KEY,
+        )
+
+        keys = await self._list_immediate_keys(s3, bucket_name, prefix)
+        if not keys:
+            raise Exception("No audio tracks found under prefix")
+
+        storage = get_transcripts_storage()
+
+        speaker_transcripts: list[TranscriptType] = []
+        for idx, key in enumerate(keys):
+            ext = ".mp4"
+
+            try:
+                obj = s3.get_object(Bucket=bucket_name, Key=key)
+                data = obj["Body"].read()
+            except Exception as e:
+                self.logger.warning(
+                    "Skipping track - cannot read S3 object", key=key, error=str(e)
+                )
+                continue
+
+            storage_path = f"file_pipeline/{transcript.id}/tracks/track_{idx}{ext}"
+            try:
+                await storage.put_file(storage_path, data)
+                audio_url = await storage.get_file_url(storage_path)
+            except Exception as e:
+                self.logger.warning(
+                    "Skipping track - cannot upload to storage", key=key, error=str(e)
+                )
+                continue
+
+            try:
+                t = await self.transcribe_file(audio_url, transcript.source_language)
+            except Exception as e:
+                self.logger.warning(
+                    "Transcription via default backend failed, trying local whisper",
+                    key=key,
+                    url=audio_url,
+                    error=str(e),
+                )
+                try:
+                    fallback = FileTranscriptAutoProcessor(name="whisper")
+                    result = None
+
+                    async def capture_result(r):
+                        nonlocal result
+                        result = r
+
+                    fallback.on(capture_result)
+                    await fallback.push(
+                        FileTranscriptInput(
+                            audio_url=audio_url, language=transcript.source_language
+                        )
+                    )
+                    await fallback.flush()
+                    if not result:
+                        raise Exception("No transcript captured in fallback")
+                    t = result
+                except Exception as e2:
+                    self.logger.warning(
+                        "Skipping track - transcription failed after fallback",
+                        key=key,
+                        url=audio_url,
+                        error=str(e2),
+                    )
+                    continue
+
+            if not t.words:
+                continue
+            for w in t.words:
+                w.speaker = idx
+            speaker_transcripts.append(t)
+
+        if not speaker_transcripts:
+            raise Exception("No valid track transcriptions")
+
+        merged_words = []
+        for t in speaker_transcripts:
+            merged_words.extend(t.words)
+        merged_words.sort(key=lambda w: w.start)
+
+        merged_transcript = TranscriptType(words=merged_words, translation=None)
+
+        await transcripts_controller.append_event(
+            transcript,
+            event="TRANSCRIPT",
+            data=TranscriptText(
+                text=merged_transcript.text, translation=merged_transcript.translation
+            ),
+        )
+
+        topics = await self.detect_topics(merged_transcript, transcript.target_language)
+        await asyncio.gather(
+            self.generate_title(topics),
+            self.generate_summaries(topics),
+            return_exceptions=False,
+        )
+
+        await self.set_status(transcript.id, "ended")
+
+    async def transcribe_file(self, audio_url: str, language: str) -> TranscriptType:
+        processor = FileTranscriptAutoProcessor()
+        input_data = FileTranscriptInput(audio_url=audio_url, language=language)
+
+        result: TranscriptType | None = None
+
+        async def capture_result(transcript):
+            nonlocal result
+            result = transcript
+
+        processor.on(capture_result)
+        await processor.push(input_data)
+        await processor.flush()
+
+        if not result:
+            raise ValueError("No transcript captured")
+
+        return result
+
+    async def detect_topics(
+        self, transcript: TranscriptType, target_language: str
+    ) -> list[TitleSummary]:
+        chunk_size = 300
+        topics: list[TitleSummary] = []
+
+        async def on_topic(topic: TitleSummary):
+            topics.append(topic)
+            return await self.on_topic(topic)
+
+        topic_detector = TranscriptTopicDetectorProcessor(callback=on_topic)
+        topic_detector.set_pipeline(self.empty_pipeline)
+
+        for i in range(0, len(transcript.words), chunk_size):
+            chunk_words = transcript.words[i : i + chunk_size]
+            if not chunk_words:
+                continue
+
+            chunk_transcript = TranscriptType(
+                words=chunk_words, translation=transcript.translation
+            )
+            await topic_detector.push(chunk_transcript)
+
+        await topic_detector.flush()
+        return topics
+
+    async def generate_title(self, topics: list[TitleSummary]):
+        if not topics:
+            self.logger.warning("No topics for title generation")
+            return
+
+        processor = TranscriptFinalTitleProcessor(callback=self.on_title)
+        processor.set_pipeline(self.empty_pipeline)
+
+        for topic in topics:
+            await processor.push(topic)
+
+        await processor.flush()
+
+    async def generate_summaries(self, topics: list[TitleSummary]):
+        if not topics:
+            self.logger.warning("No topics for summary generation")
+            return
+
+        transcript = await self.get_transcript()
+        processor = TranscriptFinalSummaryProcessor(
+            transcript=transcript,
+            callback=self.on_long_summary,
+            on_short_summary=self.on_short_summary,
+        )
+        processor.set_pipeline(self.empty_pipeline)
+
+        for topic in topics:
+            await processor.push(topic)
+
+        await processor.flush()
+
+
+@shared_task
+@asynctask
+async def task_pipeline_multitrack_process(
+    *, transcript_id: str, bucket_name: str, prefix: str
+):
+    pipeline = PipelineMainMultitrack(transcript_id=transcript_id)
+    try:
+        await pipeline.set_status(transcript_id, "processing")
+        await pipeline.process(bucket_name, prefix)
+    except Exception:
+        await pipeline.set_status(transcript_id, "error")
+        raise
+
+    post_chain = chain(
+        task_cleanup_consent.si(transcript_id=transcript_id),
+        task_pipeline_post_to_zulip.si(transcript_id=transcript_id),
+        task_send_webhook_if_needed.si(transcript_id=transcript_id),
+    )
+    post_chain.delay()