server: full diarization processor implementation based on gokul app

This commit is contained in:
2023-10-27 20:00:07 +02:00
committed by Mathieu Virbel
parent 07c4d080c2
commit d8a842f099
15 changed files with 186 additions and 110 deletions

View File

@@ -1,5 +1,5 @@
from .audio_chunker import AudioChunkerProcessor # noqa: F401
from .audio_diarization import AudioDiarizationProcessor # noqa: F401
from .audio_diarization_auto import AudioDiarizationAutoProcessor # noqa: F401
from .audio_file_writer import AudioFileWriterProcessor # noqa: F401
from .audio_merge import AudioMergeProcessor # noqa: F401
from .audio_transcript import AudioTranscriptProcessor # noqa: F401

View File

@@ -1,65 +0,0 @@
from reflector.processors.base import Processor
from reflector.processors.types import AudioDiarizationInput, TitleSummary
class AudioDiarizationProcessor(Processor):
    """Label transcript words with speakers from a static diarization table.

    For every word of every topic transcript, the word's ``speaker`` is set to
    the speaker of the *last* table segment whose ``[start, stop]`` interval
    (seconds) contains the word's start time.  Topics are mutated in place and
    then re-emitted downstream.

    NOTE(review): the segment table below is hard-coded sample data —
    presumably a development stub until a real diarization backend supplies
    real segments; confirm before shipping.
    """

    INPUT_TYPE = AudioDiarizationInput
    OUTPUT_TYPE = TitleSummary

    # Static diarization segments (times in seconds, speaker ids as ints).
    # Segments may overlap; the last matching one wins when labeling a word.
    # Hoisted to a class attribute so the table is built once, not rebuilt on
    # every _push call.
    DIARIZATION = [
        {"start": 0.0, "stop": 4.9, "speaker": 2},
        {"start": 5.6, "stop": 6.7, "speaker": 2},
        {"start": 7.3, "stop": 8.9, "speaker": 2},
        {"start": 7.3, "stop": 7.9, "speaker": 0},
        {"start": 9.4, "stop": 11.2, "speaker": 2},
        {"start": 9.7, "stop": 10.0, "speaker": 0},
        {"start": 10.0, "stop": 10.1, "speaker": 0},
        {"start": 11.7, "stop": 16.1, "speaker": 2},
        {"start": 11.8, "stop": 12.1, "speaker": 1},
        {"start": 16.4, "stop": 21.0, "speaker": 2},
        {"start": 21.1, "stop": 22.6, "speaker": 2},
        {"start": 24.7, "stop": 31.9, "speaker": 2},
        {"start": 32.0, "stop": 32.8, "speaker": 1},
        {"start": 33.4, "stop": 37.8, "speaker": 2},
        {"start": 37.9, "stop": 40.3, "speaker": 0},
        {"start": 39.2, "stop": 40.4, "speaker": 2},
        {"start": 40.7, "stop": 41.4, "speaker": 0},
        {"start": 41.6, "stop": 45.7, "speaker": 2},
        {"start": 46.4, "stop": 53.1, "speaker": 2},
        {"start": 53.6, "stop": 56.5, "speaker": 2},
        {"start": 54.9, "stop": 75.4, "speaker": 1},
        {"start": 57.3, "stop": 58.0, "speaker": 2},
        {"start": 65.7, "stop": 66.0, "speaker": 2},
        {"start": 75.8, "stop": 78.8, "speaker": 1},
        {"start": 79.0, "stop": 82.6, "speaker": 1},
        {"start": 83.2, "stop": 83.3, "speaker": 1},
        {"start": 84.5, "stop": 94.3, "speaker": 1},
        {"start": 95.1, "stop": 100.7, "speaker": 1},
        {"start": 100.7, "stop": 102.0, "speaker": 0},
        {"start": 100.7, "stop": 101.8, "speaker": 1},
        {"start": 102.0, "stop": 103.0, "speaker": 1},
        {"start": 103.0, "stop": 103.7, "speaker": 0},
        {"start": 103.7, "stop": 103.8, "speaker": 1},
        {"start": 103.8, "stop": 113.9, "speaker": 0},
        {"start": 114.7, "stop": 117.0, "speaker": 0},
        {"start": 117.0, "stop": 117.4, "speaker": 1},
    ]

    async def _push(self, data: AudioDiarizationInput):
        """Assign speakers to all topic words (in place) and emit each topic.

        ``data.topics`` is a list[BaseModel] with a ``transcript.words``
        attribute; each word has ``text``, ``start`` and ``speaker``.
        """
        # Removed a leftover debug print of the whole input payload here; it
        # spammed stdout on every push.
        for topic in data.topics:
            for word in topic.transcript.words:
                for segment in self.DIARIZATION:
                    if segment["start"] <= word.start <= segment["stop"]:
                        word.speaker = segment["speaker"]
        # Emit topics only after every word has been labeled.
        for topic in data.topics:
            await self.emit(topic)

View File

@@ -0,0 +1,34 @@
import importlib
from reflector.processors.base import Processor
from reflector.settings import settings
class AudioDiarizationAutoProcessor(Processor):
    """Factory/registry that resolves the configured diarization backend.

    Backend implementations call :meth:`register` at import time; callers use
    :meth:`get_instance` to build the backend selected by
    ``settings.DIARIZATION_BACKEND`` (or an explicit ``name``).
    """

    _registry = {}

    @classmethod
    def register(cls, name, kclass):
        """Record *kclass* as the implementation for backend *name*."""
        cls._registry[name] = kclass

    @classmethod
    def get_instance(cls, name: str | None = None, **kwargs):
        """Instantiate the backend *name*, defaulting to the settings value.

        Importing ``reflector.processors.audio_diarization_<name>`` is
        expected to trigger the module's ``register`` call as a side effect.
        Settings named ``DIARIZATION_<NAME>_*`` are forwarded to the backend
        constructor (key lowercased, ``DIARIZATION_`` prefix stripped), with
        explicit ``kwargs`` taking precedence.
        """
        if name is None:
            name = settings.DIARIZATION_BACKEND
        if name not in cls._registry:
            importlib.import_module(
                f"reflector.processors.audio_diarization_{name}"
            )
        # Collect backend-specific configuration from the settings object.
        prefix = "DIARIZATION_"
        backend_prefix = f"{prefix}{name.upper()}_"
        config = {
            key[len(prefix):].lower(): value
            for key, value in settings
            if key.startswith(backend_prefix)
        }
        return cls._registry[name](**config | kwargs)

View File

@@ -0,0 +1,28 @@
from reflector.processors.base import Processor
from reflector.processors.types import AudioDiarizationInput, TitleSummary
class AudioDiarizationBaseProcessor(Processor):
    """Common diarization flow: fetch segments, label words, emit topics.

    Subclasses implement :meth:`_diarize` to return a list of segment dicts
    with ``start``, ``end`` (seconds) and ``speaker`` keys.
    """

    INPUT_TYPE = AudioDiarizationInput
    OUTPUT_TYPE = TitleSummary

    @staticmethod
    def _label_words(topic, segments):
        """Set each word's speaker from the segments (mutates in place).

        Words carry ``text``, ``start`` and ``speaker`` attributes; when
        several segments contain a word's start time, the last one wins.
        """
        for word in topic.transcript.words:
            for segment in segments:
                if segment["start"] <= word.start <= segment["end"]:
                    word.speaker = segment["speaker"]

    async def _push(self, data: AudioDiarizationInput):
        """Diarize the audio, relabel all topics, then emit each topic."""
        segments = await self._diarize(data)
        for topic in data.topics:
            self._label_words(topic, segments)
        for topic in data.topics:
            await self.emit(topic)

    async def _diarize(self, data: AudioDiarizationInput):
        """Return diarization segments for *data*; subclass responsibility."""
        raise NotImplementedError

View File

@@ -0,0 +1,36 @@
import httpx
from reflector.processors.audio_diarization_auto import AudioDiarizationAutoProcessor
from reflector.processors.audio_diarization_base import AudioDiarizationBaseProcessor
from reflector.processors.types import AudioDiarizationInput, TitleSummary
from reflector.settings import settings
class AudioDiarizationModalProcessor(AudioDiarizationBaseProcessor):
    """Diarization backend that POSTs the audio URL to a Modal HTTP endpoint.

    NOTE(review): authentication reuses ``settings.LLM_MODAL_API_KEY`` rather
    than a diarization-specific key — confirm this is intentional.
    """

    INPUT_TYPE = AudioDiarizationInput
    OUTPUT_TYPE = TitleSummary

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Endpoint is derived from the configured base URL at construction.
        self.diarization_url = settings.DIARIZATION_URL + "/diarize"
        self.headers = {
            "Authorization": f"Bearer {settings.LLM_MODAL_API_KEY}",
        }

    async def _diarize(self, data: AudioDiarizationInput):
        """POST the audio URL to the service and return its segment list.

        Returns the ``"text"`` field of the JSON response, which the base
        class consumes as a list of segment dicts (start/end/speaker).
        NOTE(review): the key name ``"text"`` for structured segments looks
        odd — verify against the Modal service's response schema.
        """
        # Gather diarization data
        params = {
            "audio_file_url": data.audio_url,
            "timestamp": 0,
        }
        async with httpx.AsyncClient() as client:
            # timeout=None: diarization of long audio can exceed httpx's
            # default timeout, so waiting is unbounded here.
            response = await client.post(
                self.diarization_url,
                headers=self.headers,
                params=params,
                timeout=None,
            )
            response.raise_for_status()
            return response.json()["text"]


# Make this backend selectable as DIARIZATION_BACKEND = "modal".
AudioDiarizationAutoProcessor.register("modal", AudioDiarizationModalProcessor)

View File

@@ -1,8 +1,6 @@
import importlib
from reflector.processors.audio_transcript import AudioTranscriptProcessor
from reflector.processors.base import Pipeline, Processor
from reflector.processors.types import AudioFile
from reflector.settings import settings
@@ -14,7 +12,9 @@ class AudioTranscriptAutoProcessor(AudioTranscriptProcessor):
cls._registry[name] = kclass
@classmethod
def get_instance(cls, name):
def get_instance(cls, name: str | None = None, **kwargs):
if name is None:
name = settings.TRANSCRIPT_BACKEND
if name not in cls._registry:
module_name = f"reflector.processors.audio_transcript_{name}"
importlib.import_module(module_name)
@@ -30,30 +30,4 @@ class AudioTranscriptAutoProcessor(AudioTranscriptProcessor):
config_name = key[len(settings_prefix) :].lower()
config[config_name] = value
return cls._registry[name](**config)
def __init__(self, **kwargs):
    # Resolve the concrete backend up front so all later pipeline wiring
    # can be delegated to it.
    self.processor = self.get_instance(settings.TRANSCRIPT_BACKEND)
    super().__init__(**kwargs)
def set_pipeline(self, pipeline: Pipeline):
    # Attach the pipeline to this facade and to the wrapped backend.
    super().set_pipeline(pipeline)
    self.processor.set_pipeline(pipeline)
def connect(self, processor: Processor):
self.processor.connect(processor)
def disconnect(self, processor: Processor):
    # Delegate downstream un-wiring to the wrapped backend processor.
    self.processor.disconnect(processor)
def on(self, callback):
self.processor.on(callback)
def off(self, callback):
self.processor.off(callback)
async def _push(self, data: AudioFile):
    # Forward audio straight to the wrapped backend's push hook.
    return await self.processor._push(data)
async def _flush(self):
    # Flush the wrapped backend so buffered audio is fully processed.
    return await self.processor._flush()
return cls._registry[name](**config | kwargs)

View File

@@ -385,5 +385,5 @@ class TranslationLanguages(BaseModel):
class AudioDiarizationInput(BaseModel):
    """Input payload for diarization processors."""

    # Local path of the audio file to diarize.
    audio_filename: Path
    # URL of the same audio — presumably reachable by remote backends
    # (e.g. the Modal service); verify against callers.
    audio_url: str
    # Topics whose transcript words will receive speaker labels.
    topics: list[TitleSummary]