add profanity filter, post-process topic/title

2026-02-05 02:16:46 +00:00 · 2023-09-21 11:12:00 +05:30
parent 19dfb1d027
commit ab41ce90e8
8 changed files with 224 additions and 5 deletions
--- a/server/reflector/processors/audio_transcript.py
+++ b/server/reflector/processors/audio_transcript.py
@@ -1,4 +1,6 @@
+from profanityfilter import ProfanityFilter
 from prometheus_client import Counter, Histogram
+
 from reflector.processors.base import Processor
 from reflector.processors.types import AudioFile, Transcript

@@ -38,6 +40,8 @@ class AudioTranscriptProcessor(Processor):
        self.m_transcript_call = self.m_transcript_call.labels(name)
        self.m_transcript_success = self.m_transcript_success.labels(name)
        self.m_transcript_failure = self.m_transcript_failure.labels(name)
+        self.profanity_filter = ProfanityFilter()
+        self.profanity_filter.set_censor("|*|")
        super().__init__(*args, **kwargs)

    async def _push(self, data: AudioFile):
@@ -56,3 +60,11 @@ class AudioTranscriptProcessor(Processor):

    async def _transcript(self, data: AudioFile):
        raise NotImplementedError
+
+    def filter_profanity(self, text: str) -> str:
+        """
+        Remove profane words from the transcript text
+        """
+        text = self.profanity_filter.censor(text)
+        text = text.replace("|*|", "")
+        return text
--- a/server/reflector/processors/audio_transcript_modal.py
+++ b/server/reflector/processors/audio_transcript_modal.py
@@ -15,6 +15,7 @@ API will be a POST request to TRANSCRIPT_URL:
 from time import monotonic

 import httpx
+
 from reflector.processors.audio_transcript import AudioTranscriptProcessor
 from reflector.processors.audio_transcript_auto import AudioTranscriptAutoProcessor
 from reflector.processors.types import AudioFile, Transcript, TranslationLanguages, Word
@@ -86,7 +87,7 @@ class AudioTranscriptModalProcessor(AudioTranscriptProcessor):
            if source_language != target_language and target_language in result["text"]:
                translation = result["text"][target_language]
            text = result["text"][source_language]
-
+            text = self.filter_profanity(text)
            transcript = Transcript(
                text=text,
                translation=translation,
--- a/server/reflector/processors/transcript_final_title.py
+++ b/server/reflector/processors/transcript_final_title.py
@@ -60,6 +60,8 @@ class TranscriptFinalTitleProcessor(Processor):

        accumulated_titles = ".".join([chunk.title for chunk in self.chunks])
        title_result = await self.get_title(accumulated_titles)
+        final_title = self.llm.ensure_casing(title_result["title"])
+        final_title = self.llm.trim_title(final_title)

-        final_title = FinalTitle(title=title_result["title"])
+        final_title = FinalTitle(title=final_title)
        await self.emit(final_title)
--- a/server/reflector/processors/transcript_topic_detector.py
+++ b/server/reflector/processors/transcript_topic_detector.py
@@ -55,8 +55,11 @@ class TranscriptTopicDetectorProcessor(Processor):
        self.logger.info(f"Topic detector got {len(text)} length transcript")
        topic_result = await self.get_topic(text=text)

+        title = self.llm.ensure_casing(topic_result["title"])
+        title = self.llm.trim_title(title)
+
        summary = TitleSummary(
-            title=self.llm.ensure_casing(topic_result["title"]),
+            title=title,
            summary=topic_result["summary"],
            timestamp=self.transcript.timestamp,
            duration=self.transcript.duration,