server: move out profanity filter to transcript, and implement segmentation

2026-02-04 18:06:48 +00:00 · 2023-10-19 21:05:13 +02:00
parent 0d9f66c097
commit b323254376
6 changed files with 78 additions and 19 deletions
--- a/server/reflector/processors/audio_transcript.py
+++ b/server/reflector/processors/audio_transcript.py
@@ -1,6 +1,4 @@
-from profanityfilter import ProfanityFilter
 from prometheus_client import Counter, Histogram
-
 from reflector.processors.base import Processor
 from reflector.processors.types import AudioFile, Transcript

@@ -40,8 +38,6 @@ class AudioTranscriptProcessor(Processor):
        self.m_transcript_call = self.m_transcript_call.labels(name)
        self.m_transcript_success = self.m_transcript_success.labels(name)
        self.m_transcript_failure = self.m_transcript_failure.labels(name)
-        self.profanity_filter = ProfanityFilter()
-        self.profanity_filter.set_censor("*")
        super().__init__(*args, **kwargs)

    async def _push(self, data: AudioFile):
@@ -60,9 +56,3 @@ class AudioTranscriptProcessor(Processor):

    async def _transcript(self, data: AudioFile):
        raise NotImplementedError
-
-    def filter_profanity(self, text: str) -> str:
-        """
-        Remove censored words from the transcript
-        """
-        return self.profanity_filter.censor(text)
--- a/server/reflector/processors/audio_transcript_modal.py
+++ b/server/reflector/processors/audio_transcript_modal.py
@@ -48,10 +48,7 @@ class AudioTranscriptModalProcessor(AudioTranscriptProcessor):
            )
            response.raise_for_status()
            result = response.json()
-            text = result["text"][source_language]
-            text = self.filter_profanity(text)
            transcript = Transcript(
-                text=text,
                words=[
                    Word(
                        text=word["text"],
--- a/server/reflector/processors/audio_transcript_whisper.py
+++ b/server/reflector/processors/audio_transcript_whisper.py
@@ -30,7 +30,6 @@ class AudioTranscriptWhisperProcessor(AudioTranscriptProcessor):
        ts = data.timestamp

        for segment in segments:
-            transcript.text += segment.text
            for word in segment.words:
                transcript.words.append(
                    Word(
--- a/server/reflector/processors/transcript_liner.py
+++ b/server/reflector/processors/transcript_liner.py
@@ -36,7 +36,6 @@ class TranscriptLinerProcessor(Processor):
        # cut to the next .
        partial = Transcript(words=[])
        for word in self.transcript.words[:]:
-            partial.text += word.text
            partial.words.append(word)
            if not self.is_sentence_terminated(word.text):
                continue
--- a/server/reflector/processors/types.py
+++ b/server/reflector/processors/types.py
@@ -2,8 +2,12 @@ import io
 import tempfile
 from pathlib import Path

+from profanityfilter import ProfanityFilter
 from pydantic import BaseModel, PrivateAttr

+profanity_filter = ProfanityFilter()
+profanity_filter.set_censor("*")
+

 class AudioFile(BaseModel):
    name: str
@@ -43,13 +47,29 @@ class Word(BaseModel):
    text: str
    start: float
    end: float
+    speaker: int = 0
+
+
+class TranscriptSegment(BaseModel):
+    text: str  # concatenated text of the words grouped into this segment
+    start: float  # `start` of the first word placed in the segment
+    speaker: int = 0  # speaker of the segment's first word; defaults to 0


 class Transcript(BaseModel):
-    text: str = ""
    translation: str | None = None
    words: list[Word] = None

+    @property
+    def raw_text(self):
+        # Uncensored text
+        return "".join([word.text for word in self.words])
+
+    @property
+    def text(self):
+        # raw_text censored by the module-level profanity_filter, then stripped
+        return profanity_filter.censor(self.raw_text).strip()
+
    @property
    def human_timestamp(self):
        minutes = int(self.timestamp / 60)
@@ -74,7 +94,6 @@ class Transcript(BaseModel):
            self.words = other.words
        else:
            self.words.extend(other.words)
-        self.text += other.text

    def add_offset(self, offset: float):
        for word in self.words:
@@ -87,6 +106,48 @@ class Transcript(BaseModel):
        ]
        return Transcript(text=self.text, translation=self.translation, words=words)

+    def as_segments(self):
+        # from a list of word, create a list of segments
+        # join the word that are less than 2 seconds apart
+        # but separate if the speaker changes, or if the punctuation is a . , ; : ? !
+        segments = []
+        current_segment = None
+        last_word = None
+        BLANK_TIME_SECS = 2
+        MAX_SEGMENT_LENGTH = 80
+        for word in self.words:
+            if current_segment is None:
+                current_segment = TranscriptSegment(
+                    text=word.text,
+                    start=word.start,
+                    speaker=word.speaker,
+                )
+                continue
+            is_blank = False
+            if last_word:
+                is_blank = word.start - last_word.end > BLANK_TIME_SECS
+            if (
+                word.speaker != current_segment.speaker
+                or (
+                    word.text in ".;:?!…"
+                    and len(current_segment.text) > MAX_SEGMENT_LENGTH
+                )
+                or is_blank
+            ):
+                # check which condition triggered
+                segments.append(current_segment)
+                current_segment = TranscriptSegment(
+                    text=word.text,
+                    start=word.start,
+                    speaker=word.speaker,
+                )
+            else:
+                current_segment.text += word.text
+            last_word = word
+        if current_segment:
+            segments.append(current_segment)
+        return segments
+

 class TitleSummary(BaseModel):
    title: str
--- a/server/reflector/views/transcripts.py
+++ b/server/reflector/views/transcripts.py
@@ -49,12 +49,18 @@ class TranscriptText(BaseModel):
    translation: str | None


+class TranscriptSegmentTopic(BaseModel):
+    speaker: int  # copied from the transcript segment's `speaker`
+    text: str  # copied from the transcript segment's `text`
+    timestamp: float  # filled from the transcript segment's `start`
+
+
 class TranscriptTopic(BaseModel):
    id: str = Field(default_factory=generate_uuid4)
    title: str
    summary: str
-    transcript: str | None = None
    timestamp: float
+    segments: list[TranscriptSegmentTopic] = []


 class TranscriptFinalShortSummary(BaseModel):
@@ -523,8 +529,15 @@ async def handle_rtc_event(event: PipelineEvent, args, data):
        topic = TranscriptTopic(
            title=data.title,
            summary=data.summary,
-            transcript=data.transcript.text,
            timestamp=data.timestamp,
+            segments=[
+                TranscriptSegmentTopic(
+                    speaker=segment.speaker,
+                    text=segment.text,
+                    timestamp=segment.start,
+                )
+                for segment in data.transcript.as_segments()
+            ],
        )
        resp = transcript.add_event(event=event, data=topic)
        transcript.upsert_topic(topic)