mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-21 20:59:05 +00:00
add profanity filter, post-process topic/title
This commit is contained in:
@@ -1,4 +1,6 @@
|
||||
from profanityfilter import ProfanityFilter
|
||||
from prometheus_client import Counter, Histogram
|
||||
|
||||
from reflector.processors.base import Processor
|
||||
from reflector.processors.types import AudioFile, Transcript
|
||||
|
||||
@@ -38,6 +40,8 @@ class AudioTranscriptProcessor(Processor):
|
||||
self.m_transcript_call = self.m_transcript_call.labels(name)
|
||||
self.m_transcript_success = self.m_transcript_success.labels(name)
|
||||
self.m_transcript_failure = self.m_transcript_failure.labels(name)
|
||||
self.profanity_filter = ProfanityFilter()
|
||||
self.profanity_filter.set_censor("|*|")
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
async def _push(self, data: AudioFile):
|
||||
@@ -56,3 +60,11 @@ class AudioTranscriptProcessor(Processor):
|
||||
|
||||
async def _transcript(self, data: AudioFile):
    """
    Transcription hook for a single audio file.

    This implementation performs no work and always raises
    NotImplementedError.
    NOTE(review): presumably concrete processors override this method;
    confirm against the subclasses.
    """
    raise NotImplementedError()
|
||||
|
||||
def filter_profanity(self, text: str) -> str:
    """
    Remove censored words from the transcript.

    The text is passed through ``self.profanity_filter.censor`` and every
    "|*|" marker found in the result is deleted. The horizontal whitespace
    directly in front of a removed marker run is deleted as well, so a
    dropped word does not leave a doubled space or a stray space before
    punctuation.

    :param text: transcript text to clean
    :return: the text without censored words; empty input is returned
        unchanged
    """
    import re

    # Nothing to censor; skip the filter entirely.
    if not text:
        return text

    censored = self.profanity_filter.censor(text)
    # One or more consecutive markers form a single removed word; eat the
    # spaces/tabs that preceded it together with the markers themselves.
    return re.sub(r"[ \t]*(?:\|\*\|)+", "", censored)
|
||||
|
||||
@@ -15,6 +15,7 @@ API will be a POST request to TRANSCRIPT_URL:
|
||||
from time import monotonic
|
||||
|
||||
import httpx
|
||||
|
||||
from reflector.processors.audio_transcript import AudioTranscriptProcessor
|
||||
from reflector.processors.audio_transcript_auto import AudioTranscriptAutoProcessor
|
||||
from reflector.processors.types import AudioFile, Transcript, TranslationLanguages, Word
|
||||
@@ -86,7 +87,7 @@ class AudioTranscriptModalProcessor(AudioTranscriptProcessor):
|
||||
if source_language != target_language and target_language in result["text"]:
|
||||
translation = result["text"][target_language]
|
||||
text = result["text"][source_language]
|
||||
|
||||
text = self.filter_profanity(text)
|
||||
transcript = Transcript(
|
||||
text=text,
|
||||
translation=translation,
|
||||
|
||||
@@ -60,6 +60,8 @@ class TranscriptFinalTitleProcessor(Processor):
|
||||
|
||||
accumulated_titles = ".".join([chunk.title for chunk in self.chunks])
|
||||
title_result = await self.get_title(accumulated_titles)
|
||||
final_title = self.llm.ensure_casing(title_result["title"])
|
||||
final_title = self.llm.trim_title(final_title)
|
||||
|
||||
final_title = FinalTitle(title=title_result["title"])
|
||||
final_title = FinalTitle(title=final_title)
|
||||
await self.emit(final_title)
|
||||
|
||||
@@ -55,8 +55,11 @@ class TranscriptTopicDetectorProcessor(Processor):
|
||||
self.logger.info(f"Topic detector got {len(text)} length transcript")
|
||||
topic_result = await self.get_topic(text=text)
|
||||
|
||||
title = self.llm.ensure_casing(topic_result["title"])
|
||||
title = self.llm.trim_title(title)
|
||||
|
||||
summary = TitleSummary(
|
||||
title=self.llm.ensure_casing(topic_result["title"]),
|
||||
title=title,
|
||||
summary=topic_result["summary"],
|
||||
timestamp=self.transcript.timestamp,
|
||||
duration=self.transcript.duration,
|
||||
|
||||
Reference in New Issue
Block a user