server: move out profanity filter to transcript, and implement segmentation

2025-12-20 20:29:06 +00:00 · 2023-10-19 21:05:13 +02:00
parent 0d9f66c097
commit b323254376
6 changed files with 78 additions and 19 deletions
--- a/server/reflector/processors/audio_transcript.py
+++ b/server/reflector/processors/audio_transcript.py
@@ -1,6 +1,4 @@
 from profanityfilter import ProfanityFilter
 from prometheus_client import Counter, Histogram
 from reflector.processors.base import Processor
 from reflector.processors.types import AudioFile, Transcript
@@ -40,8 +38,6 @@ class AudioTranscriptProcessor(Processor):
        self.m_transcript_call = self.m_transcript_call.labels(name)
        self.m_transcript_success = self.m_transcript_success.labels(name)
        self.m_transcript_failure = self.m_transcript_failure.labels(name)
        self.profanity_filter = ProfanityFilter()
        self.profanity_filter.set_censor("*")
        super().__init__(*args, **kwargs)
    async def _push(self, data: AudioFile):
@@ -60,9 +56,3 @@ class AudioTranscriptProcessor(Processor):
    async def _transcript(self, data: AudioFile):
        """
        Transcribe one audio file; must be overridden by concrete processors.

        The base implementation only raises NotImplementedError.
        NOTE(review): subclasses appear to build and return a `Transcript`
        from `data` - confirm against the caller in `_push`.
        """
        raise NotImplementedError
    def filter_profanity(self, text: str) -> str:
        """
        Censor the transcript text

        Runs `text` through the processor's profanity filter, which is
        configured in `__init__` with "*" as censor character, and returns
        the result.
        """
        censored = self.profanity_filter.censor(text)
        return censored
--- a/server/reflector/processors/audio_transcript_modal.py
+++ b/server/reflector/processors/audio_transcript_modal.py
@@ -48,10 +48,7 @@ class AudioTranscriptModalProcessor(AudioTranscriptProcessor):
            )
            response.raise_for_status()
            result = response.json()
            text = result["text"][source_language]
            text = self.filter_profanity(text)
            transcript = Transcript(
                text=text,
                words=[
                    Word(
                        text=word["text"],
--- a/server/reflector/processors/audio_transcript_whisper.py
+++ b/server/reflector/processors/audio_transcript_whisper.py
@@ -30,7 +30,6 @@ class AudioTranscriptWhisperProcessor(AudioTranscriptProcessor):
        ts = data.timestamp
        for segment in segments:
            transcript.text += segment.text
            for word in segment.words:
                transcript.words.append(
                    Word(
--- a/server/reflector/processors/transcript_liner.py
+++ b/server/reflector/processors/transcript_liner.py
@@ -36,7 +36,6 @@ class TranscriptLinerProcessor(Processor):
        # cut to the next .
        partial = Transcript(words=[])
        for word in self.transcript.words[:]:
            partial.text += word.text
            partial.words.append(word)
            if not self.is_sentence_terminated(word.text):
                continue
--- a/server/reflector/processors/types.py
+++ b/server/reflector/processors/types.py
@@ -2,8 +2,12 @@ import io
 import tempfile
 from pathlib import Path
 from profanityfilter import ProfanityFilter
 from pydantic import BaseModel, PrivateAttr
 profanity_filter = ProfanityFilter()
 profanity_filter.set_censor("*")
 class AudioFile(BaseModel):
    name: str
@@ -43,13 +47,29 @@ class Word(BaseModel):
    text: str
    start: float
    end: float
    speaker: int = 0
 class TranscriptSegment(BaseModel):
    """
    A run of consecutive words from one speaker, merged into a single text.

    Instances are built by `Transcript.as_segments()`.
    """
    # concatenation of the text of every word in the segment
    text: str
    # start time of the first word of the segment (same unit as Word.start,
    # compared against BLANK_TIME_SECS in as_segments, so seconds)
    start: float
    # speaker of the first word of the segment
    speaker: int = 0
 class Transcript(BaseModel):
    text: str = ""
    translation: str | None = None
    words: list[Word] = None
    @property
    def raw_text(self):
        """
        Uncensored text: the text of every word, concatenated in order.

        Returns an empty string when the transcript has no words; `words`
        defaults to None on the model, so it cannot be iterated blindly.
        """
        return "".join(word.text for word in self.words or [])
    @property
    def text(self):
        """
        Censored text: `raw_text` passed through the module-level profanity
        filter, with leading and trailing whitespace removed.
        """
        censored = profanity_filter.censor(self.raw_text)
        return censored.strip()
    @property
    def human_timestamp(self):
        minutes = int(self.timestamp / 60)
@@ -74,7 +94,6 @@ class Transcript(BaseModel):
            self.words = other.words
        else:
            self.words.extend(other.words)
        self.text += other.text
    def add_offset(self, offset: float):
        for word in self.words:
@@ -87,6 +106,48 @@ class Transcript(BaseModel):
        ]
        return Transcript(text=self.text, translation=self.translation, words=words)
    def as_segments(self) -> list[TranscriptSegment]:
        """
        Group the words of the transcript into a list of TranscriptSegment.

        Consecutive words are merged into the same segment. A segment is
        closed when:

        - the speaker changes,
        - more than BLANK_TIME_SECS separate the end of the previous word
          from the start of the next one,
        - a word ends with one of . ; : ? ! … and the segment is already
          longer than MAX_SEGMENT_LENGTH characters (the punctuated word
          stays in the segment it terminates).

        Returns an empty list when the transcript has no words.
        """
        BLANK_TIME_SECS = 2
        MAX_SEGMENT_LENGTH = 80
        SENTENCE_ENDINGS = (".", ";", ":", "?", "!", "…")

        segments = []
        current_segment = None
        last_word = None

        # `words` defaults to None on the model, treat it as "no words"
        for word in self.words or []:
            # `last_word` is always set while a segment is open, so the
            # silence check also covers the gap between the first two words
            if current_segment is not None and (
                word.speaker != current_segment.speaker
                or word.start - last_word.end > BLANK_TIME_SECS
            ):
                segments.append(current_segment)
                current_segment = None

            if current_segment is None:
                current_segment = TranscriptSegment(
                    text=word.text,
                    start=word.start,
                    speaker=word.speaker,
                )
            else:
                current_segment.text += word.text
            last_word = word

            # Word texts are joined with "" to form the transcript text, so
            # they may carry whitespace and attached punctuation ("end. "):
            # look at the stripped suffix instead of requiring the whole
            # token to be a punctuation mark.
            if (
                word.text.rstrip().endswith(SENTENCE_ENDINGS)
                and len(current_segment.text) > MAX_SEGMENT_LENGTH
            ):
                segments.append(current_segment)
                current_segment = None

        if current_segment is not None:
            segments.append(current_segment)

        return segments
 class TitleSummary(BaseModel):
    title: str
--- a/server/reflector/views/transcripts.py
+++ b/server/reflector/views/transcripts.py
@@ -49,12 +49,18 @@ class TranscriptText(BaseModel):
    translation: str | None
 class TranscriptSegmentTopic(BaseModel):
    """
    One transcript segment attached to a topic.

    Built in `handle_rtc_event` from the segments returned by
    `data.transcript.as_segments()`.
    """
    # speaker of the segment
    speaker: int
    # text of the segment
    text: str
    # start time of the segment (filled from `segment.start`)
    timestamp: float
 class TranscriptTopic(BaseModel):
    """
    A topic detected in a transcript.

    Built in `handle_rtc_event` from the pipeline data, then recorded on the
    transcript with `add_event()` and `upsert_topic()`.
    """
    # unique identifier, generated with generate_uuid4 when not provided
    id: str = Field(default_factory=generate_uuid4)
    title: str
    summary: str
    # text of the topic transcript (filled from `data.transcript.text`)
    transcript: str | None = None
    timestamp: float
    # the topic transcript split by `as_segments()`; empty when not provided
    segments: list[TranscriptSegmentTopic] = []
 class TranscriptFinalShortSummary(BaseModel):
@@ -523,8 +529,15 @@ async def handle_rtc_event(event: PipelineEvent, args, data):
        topic = TranscriptTopic(
            title=data.title,
            summary=data.summary,
            transcript=data.transcript.text,
            timestamp=data.timestamp,
            segments=[
                TranscriptSegmentTopic(
                    speaker=segment.speaker,
                    text=segment.text,
                    timestamp=segment.start,
                )
                for segment in data.transcript.as_segments()
            ],
        )
        resp = transcript.add_event(event=event, data=topic)
        transcript.upsert_topic(topic)