feat: search backend (#537)

* docs: transient docs

* chore: cleanup

* webvtt WIP

* webvtt field

* chore: webvtt tests comments

* chore: remove useless tests

* feat: search TASK.md

* feat: full text search by title/webvtt

* chore: search api task

* feat: search api

* feat: search API

* chore: rm task md

* chore: roll back unnecessary validators

* chore: pr review WIP

* chore: pr review WIP

* chore: pr review

* chore: top imports

* feat: better lint + ci

* feat: better lint + ci

* feat: better lint + ci

* feat: better lint + ci

* chore: lint

* chore: lint

* fix: db datetime definitions

* fix: flush() params

* fix: update transcript mutability expectation / test

* fix: update transcript mutability expectation / test

* chore: auto review

* chore: new controller extraction

* chore: new controller extraction

* chore: cleanup

* chore: review WIP

* chore: pr WIP

* chore: remove ci lint

* chore: openapi regeneration

* chore: openapi regeneration

* chore: postgres test doc

* fix: .dockerignore for arm binaries

* fix: .dockerignore for arm binaries

* fix: cap test loops

* fix: cap test loops

* fix: cap test loops

* fix: get_transcript_topics

* chore: remove flow.md docs and claude guidance

* chore: remove claude.md db doc

* chore: remove claude.md db doc

* chore: remove claude.md db doc

* chore: remove claude.md db doc
Igor Loskutov committed 2025-08-13 10:03:38 -04:00 (committed by GitHub)
parent a42ed12982
commit 6fb5cb21c2
29 changed files with 3213 additions and 1493 deletions

View File

@@ -14,12 +14,15 @@ It is directly linked to our data model.
 import asyncio
 import functools
 from contextlib import asynccontextmanager
+from typing import Generic
 
+import av
 import boto3
 from celery import chord, current_task, group, shared_task
 from pydantic import BaseModel
 from structlog import BoundLogger as Logger
 
+from reflector.db import database
 from reflector.db.meetings import meeting_consent_controller, meetings_controller
 from reflector.db.recordings import recordings_controller
 from reflector.db.rooms import rooms_controller
@@ -35,7 +38,7 @@ from reflector.db.transcripts import (
     transcripts_controller,
 )
 from reflector.logger import logger
-from reflector.pipelines.runner import PipelineRunner
+from reflector.pipelines.runner import PipelineMessage, PipelineRunner
 from reflector.processors import (
     AudioChunkerProcessor,
     AudioDiarizationAutoProcessor,
@@ -69,8 +72,6 @@ def asynctask(f):
     @functools.wraps(f)
     def wrapper(*args, **kwargs):
         async def run_with_db():
-            from reflector.db import database
-
             await database.connect()
             try:
                 return await f(*args, **kwargs)
@@ -144,7 +145,7 @@ class StrValue(BaseModel):
     value: str
 
 
-class PipelineMainBase(PipelineRunner):
+class PipelineMainBase(PipelineRunner[PipelineMessage], Generic[PipelineMessage]):
     transcript_id: str
     ws_room_id: str | None = None
     ws_manager: WebsocketManager | None = None
@@ -164,7 +165,11 @@ class PipelineMainBase(PipelineRunner):
             raise Exception("Transcript not found")
         return result
 
-    def get_transcript_topics(self, transcript: Transcript) -> list[TranscriptTopic]:
+    @staticmethod
+    def wrap_transcript_topics(
+        topics: list[TranscriptTopic],
+    ) -> list[TitleSummaryWithIdProcessorType]:
+        # transformation to a pipe-supported format
         return [
             TitleSummaryWithIdProcessorType(
                 id=topic.id,
@@ -174,7 +179,7 @@ class PipelineMainBase(PipelineRunner):
                 duration=topic.duration,
                 transcript=TranscriptProcessorType(words=topic.words),
             )
-            for topic in transcript.topics
+            for topic in topics
        ]
 
     @asynccontextmanager
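
The hunk above replaces the instance method get_transcript_topics with a static wrap_transcript_topics that takes the topics list directly, so callers no longer need a Transcript or a pipeline instance to perform the conversion. A minimal sketch of the pattern, with hypothetical stand-in models in place of TranscriptTopic and TitleSummaryWithIdProcessorType:

from pydantic import BaseModel


class Topic(BaseModel):  # hypothetical stand-in for TranscriptTopic
    id: str
    title: str


class PipeTopic(BaseModel):  # hypothetical stand-in for TitleSummaryWithIdProcessorType
    id: str
    title: str


class Base:
    @staticmethod
    def wrap_topics(topics: list[Topic]) -> list[PipeTopic]:
        # transformation to a pipe-supported format
        return [PipeTopic(id=t.id, title=t.title) for t in topics]


# No instance needed; any topics list can be wrapped:
wrapped = Base.wrap_topics([Topic(id="t1", title="Intro")])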
@@ -380,7 +385,7 @@ class PipelineMainLive(PipelineMainBase):
         pipeline_post(transcript_id=self.transcript_id)
 
 
-class PipelineMainDiarization(PipelineMainBase):
+class PipelineMainDiarization(PipelineMainBase[AudioDiarizationInput]):
     """
     Diarize the audio and update topics
     """
@@ -404,11 +409,10 @@ class PipelineMainDiarization(PipelineMainBase):
             pipeline.logger.info("Audio is local, skipping diarization")
             return
 
-        topics = self.get_transcript_topics(transcript)
         audio_url = await transcript.get_audio_url()
         audio_diarization_input = AudioDiarizationInput(
             audio_url=audio_url,
-            topics=topics,
+            topics=self.wrap_transcript_topics(transcript.topics),
         )
 
         # as tempting to use pipeline.push, prefer to use the runner
@@ -421,7 +425,7 @@
         return pipeline
 
 
-class PipelineMainFromTopics(PipelineMainBase):
+class PipelineMainFromTopics(PipelineMainBase[TitleSummaryWithIdProcessorType]):
     """
     Pseudo class for generating a pipeline from topics
     """
@@ -443,7 +447,7 @@ class PipelineMainFromTopics(PipelineMainBase):
         pipeline.logger.info(f"{self.__class__.__name__} pipeline created")
 
         # push topics
-        topics = self.get_transcript_topics(transcript)
+        topics = PipelineMainBase.wrap_transcript_topics(transcript.topics)
         for topic in topics:
             await self.push(topic)
@@ -524,8 +528,6 @@ async def pipeline_convert_to_mp3(transcript: Transcript, logger: Logger):
     # Convert to mp3
     mp3_filename = transcript.audio_mp3_filename
 
-    import av
-
     with av.open(wav_filename.as_posix()) as in_container:
         in_stream = in_container.streams.audio[0]
         with av.open(mp3_filename.as_posix(), "w") as out_container:
@@ -604,7 +606,7 @@ async def cleanup_consent(transcript: Transcript, logger: Logger):
                 meeting.id
             )
         except Exception as e:
-            logger.error(f"Failed to get fetch consent: {e}")
+            logger.error(f"Failed to get fetch consent: {e}", exc_info=e)
             consent_denied = True
 
     if not consent_denied:
@@ -627,7 +629,7 @@ async def cleanup_consent(transcript: Transcript, logger: Logger):
                     f"Deleted original Whereby recording: {recording.bucket_name}/{recording.object_key}"
                 )
             except Exception as e:
-                logger.error(f"Failed to delete Whereby recording: {e}")
+                logger.error(f"Failed to delete Whereby recording: {e}", exc_info=e)
 
     # non-transactional, files marked for deletion not actually deleted is possible
     await transcripts_controller.update(transcript, {"audio_deleted": True})
@@ -640,7 +642,7 @@ async def cleanup_consent(transcript: Transcript, logger: Logger):
             f"Deleted processed audio from storage: {transcript.storage_audio_path}"
         )
     except Exception as e:
-        logger.error(f"Failed to delete processed audio: {e}")
+        logger.error(f"Failed to delete processed audio: {e}", exc_info=e)
 
     # 3. Delete local audio files
     try:
@@ -649,7 +651,7 @@ async def cleanup_consent(transcript: Transcript, logger: Logger):
         if hasattr(transcript, "audio_wav_filename") and transcript.audio_wav_filename:
             transcript.audio_wav_filename.unlink(missing_ok=True)
     except Exception as e:
-        logger.error(f"Failed to delete local audio files: {e}")
+        logger.error(f"Failed to delete local audio files: {e}", exc_info=e)
 
     logger.info("Consent cleanup done")
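
The logging hunks above all make the same change: the caught exception is now passed as exc_info, so the handler records the full traceback instead of only the interpolated message. A minimal sketch with stdlib logging (structlog's BoundLogger forwards the keyword the same way); the logger name here is hypothetical:

import logging

logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger("cleanup")  # hypothetical name

try:
    raise RuntimeError("boom")
except Exception as e:
    # exc_info=e attaches the exception and its traceback to the record;
    # without it, only the formatted message string is kept.
    logger.error(f"Failed to delete processed audio: {e}", exc_info=e)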
@@ -794,8 +796,6 @@ def pipeline_post(*, transcript_id: str):
 @get_transcript
 async def pipeline_process(transcript: Transcript, logger: Logger):
-    import av
-
     try:
         if transcript.audio_location == "storage":
             await transcripts_controller.download_mp3_from_storage(transcript)
View File

@@ -16,14 +16,17 @@ During its lifecycle, it will emit the following status:
 """
 
 import asyncio
+from typing import Generic, TypeVar
 
 from pydantic import BaseModel, ConfigDict
 
 from reflector.logger import logger
 from reflector.processors import Pipeline
 
+PipelineMessage = TypeVar("PipelineMessage")
+
 
-class PipelineRunner(BaseModel):
+class PipelineRunner(BaseModel, Generic[PipelineMessage]):
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
     status: str = "idle"
@@ -67,7 +70,7 @@ class PipelineRunner(BaseModel):
         coro = self.run()
         asyncio.run(coro)
 
-    async def push(self, data):
+    async def push(self, data: PipelineMessage):
         """
         Push data to the pipeline
         """
@@ -92,7 +95,11 @@ class PipelineRunner(BaseModel):
         pass
 
     async def _add_cmd(
-        self, cmd: str, data, max_retries: int = 3, retry_time_limit: int = 3
+        self,
+        cmd: str,
+        data: PipelineMessage,
+        max_retries: int = 3,
+        retry_time_limit: int = 3,
     ):
         """
         Enqueue a command to be executed in the runner.
@@ -143,7 +150,10 @@ class PipelineRunner(BaseModel):
                 cmd, data = await self._q_cmd.get()
                 func = getattr(self, f"cmd_{cmd.lower()}")
                 if func:
-                    await func(data)
+                    if cmd.upper() == "FLUSH":
+                        await func()
+                    else:
+                        await func(data)
                 else:
                     raise Exception(f"Unknown command {cmd}")
             except Exception:
@@ -152,13 +162,13 @@ class PipelineRunner(BaseModel):
                 self._ev_done.set()
                 raise
 
-    async def cmd_push(self, data):
+    async def cmd_push(self, data: PipelineMessage):
         if self._is_first_push:
             await self._set_status("push")
             self._is_first_push = False
         await self.pipeline.push(data)
 
-    async def cmd_flush(self, data):
+    async def cmd_flush(self):
         await self._set_status("flush")
         await self.pipeline.flush()
         await self._set_status("ended")
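
Taken together, the runner changes type the command queue end to end: push() and cmd_push() accept a PipelineMessage, while cmd_flush() loses its unused payload and the dispatch loop calls it without arguments. A minimal runnable sketch of that dispatch, using a hypothetical MiniRunner stripped of retries and status tracking:

import asyncio


class MiniRunner:
    """Hypothetical reduction of PipelineRunner's command loop."""

    def __init__(self) -> None:
        self._q_cmd: asyncio.Queue = asyncio.Queue()

    async def cmd_push(self, data: str) -> None:
        print(f"pushed {data!r}")

    async def cmd_flush(self) -> None:
        # FLUSH no longer carries a payload, matching the diff above
        print("flushed")

    async def run_once(self) -> None:
        cmd, data = await self._q_cmd.get()
        func = getattr(self, f"cmd_{cmd.lower()}")
        # FLUSH is dispatched without arguments; everything else gets data
        if cmd.upper() == "FLUSH":
            await func()
        else:
            await func(data)


async def main() -> None:
    runner = MiniRunner()
    await runner._q_cmd.put(("PUSH", "hello"))
    await runner._q_cmd.put(("FLUSH", None))
    await runner.run_once()  # -> pushed 'hello'
    await runner.run_once()  # -> flushed


asyncio.run(main())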