add profanity filter, post-process topic/title

This commit is contained in:
Gokul Mohanarangan
2023-09-21 11:12:00 +05:30
parent 19dfb1d027
commit ab41ce90e8
8 changed files with 224 additions and 5 deletions

View File

@@ -1,4 +1,6 @@
from profanityfilter import ProfanityFilter
from prometheus_client import Counter, Histogram
from reflector.processors.base import Processor
from reflector.processors.types import AudioFile, Transcript
@@ -38,6 +40,8 @@ class AudioTranscriptProcessor(Processor):
self.m_transcript_call = self.m_transcript_call.labels(name)
self.m_transcript_success = self.m_transcript_success.labels(name)
self.m_transcript_failure = self.m_transcript_failure.labels(name)
self.profanity_filter = ProfanityFilter()
self.profanity_filter.set_censor("|*|")
super().__init__(*args, **kwargs)
async def _push(self, data: AudioFile):
@@ -56,3 +60,11 @@ class AudioTranscriptProcessor(Processor):
async def _transcript(self, data: AudioFile):
    """
    Transcription hook; this base version always raises NotImplementedError.

    NOTE(review): presumably concrete processors override this to turn
    ``data`` into a transcript -- confirm against the subclasses.
    """
    raise NotImplementedError
def filter_profanity(self, text: str) -> str:
    """
    Strip profane words out of ``text``.

    The text is first run through ``self.profanity_filter.censor``; every
    occurrence of the ``"|*|"`` marker in the censored result is then
    deleted. Whitespace around a removed word is left untouched.
    """
    censored = self.profanity_filter.censor(text)
    # Delete the censor markers so no placeholder remains in the output.
    return censored.replace("|*|", "")

View File

@@ -15,6 +15,7 @@ API will be a POST request to TRANSCRIPT_URL:
from time import monotonic
import httpx
from reflector.processors.audio_transcript import AudioTranscriptProcessor
from reflector.processors.audio_transcript_auto import AudioTranscriptAutoProcessor
from reflector.processors.types import AudioFile, Transcript, TranslationLanguages, Word
@@ -86,7 +87,7 @@ class AudioTranscriptModalProcessor(AudioTranscriptProcessor):
if source_language != target_language and target_language in result["text"]:
translation = result["text"][target_language]
text = result["text"][source_language]
text = self.filter_profanity(text)
transcript = Transcript(
text=text,
translation=translation,

View File

@@ -60,6 +60,8 @@ class TranscriptFinalTitleProcessor(Processor):
accumulated_titles = ".".join([chunk.title for chunk in self.chunks])
title_result = await self.get_title(accumulated_titles)
final_title = self.llm.ensure_casing(title_result["title"])
final_title = self.llm.trim_title(final_title)
final_title = FinalTitle(title=title_result["title"])
final_title = FinalTitle(title=final_title)
await self.emit(final_title)

View File

@@ -55,8 +55,11 @@ class TranscriptTopicDetectorProcessor(Processor):
self.logger.info(f"Topic detector got {len(text)} length transcript")
topic_result = await self.get_topic(text=text)
title = self.llm.ensure_casing(topic_result["title"])
title = self.llm.trim_title(title)
summary = TitleSummary(
title=self.llm.ensure_casing(topic_result["title"]),
title=title,
summary=topic_result["summary"],
timestamp=self.transcript.timestamp,
duration=self.transcript.duration,