diff --git a/server/reflector/processors/audio_transcript.py b/server/reflector/processors/audio_transcript.py
index f029b587..3f9dc85b 100644
--- a/server/reflector/processors/audio_transcript.py
+++ b/server/reflector/processors/audio_transcript.py
@@ -1,6 +1,4 @@
-from profanityfilter import ProfanityFilter
 from prometheus_client import Counter, Histogram
-
 from reflector.processors.base import Processor
 from reflector.processors.types import AudioFile, Transcript
 
@@ -40,8 +38,6 @@ class AudioTranscriptProcessor(Processor):
         self.m_transcript_call = self.m_transcript_call.labels(name)
         self.m_transcript_success = self.m_transcript_success.labels(name)
         self.m_transcript_failure = self.m_transcript_failure.labels(name)
-        self.profanity_filter = ProfanityFilter()
-        self.profanity_filter.set_censor("*")
         super().__init__(*args, **kwargs)
 
     async def _push(self, data: AudioFile):
@@ -60,9 +56,3 @@ class AudioTranscriptProcessor(Processor):
 
     async def _transcript(self, data: AudioFile):
         raise NotImplementedError
-
-    def filter_profanity(self, text: str) -> str:
-        """
-        Remove censored words from the transcript
-        """
-        return self.profanity_filter.censor(text)
diff --git a/server/reflector/processors/audio_transcript_modal.py b/server/reflector/processors/audio_transcript_modal.py
index 201ed9d4..23c9d74e 100644
--- a/server/reflector/processors/audio_transcript_modal.py
+++ b/server/reflector/processors/audio_transcript_modal.py
@@ -48,10 +48,7 @@ class AudioTranscriptModalProcessor(AudioTranscriptProcessor):
         )
         response.raise_for_status()
         result = response.json()
-        text = result["text"][source_language]
-        text = self.filter_profanity(text)
         transcript = Transcript(
-            text=text,
             words=[
                 Word(
                     text=word["text"],
diff --git a/server/reflector/processors/audio_transcript_whisper.py b/server/reflector/processors/audio_transcript_whisper.py
index e3bd595b..cd96e01a 100644
--- a/server/reflector/processors/audio_transcript_whisper.py
+++ b/server/reflector/processors/audio_transcript_whisper.py
@@ -30,7 +30,6 @@ class AudioTranscriptWhisperProcessor(AudioTranscriptProcessor):
         ts = data.timestamp
 
         for segment in segments:
-            transcript.text += segment.text
             for word in segment.words:
                 transcript.words.append(
                     Word(
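With the hunks above, processors no longer assemble `transcript.text` by hand or filter it themselves; censoring moves into the `Transcript` type (see the types.py hunk below). A minimal sketch of the resulting behavior, with invented word data:

from reflector.processors.types import Transcript, Word

# Build a transcript the way a processor now does: words only, no text field.
transcript = Transcript(
    words=[
        Word(text=" hello", start=0.0, end=0.4),
        Word(text=" world.", start=0.5, end=0.9),
    ]
)
print(transcript.raw_text)  # " hello world." - plain join of the word texts
print(transcript.text)      # same text run through the profanity filter, stripped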
diff --git a/server/reflector/processors/transcript_liner.py b/server/reflector/processors/transcript_liner.py
index c1aa14a0..b4e7b5e3 100644
--- a/server/reflector/processors/transcript_liner.py
+++ b/server/reflector/processors/transcript_liner.py
@@ -36,7 +36,6 @@ class TranscriptLinerProcessor(Processor):
         # cut to the next .
         partial = Transcript(words=[])
         for word in self.transcript.words[:]:
-            partial.text += word.text
             partial.words.append(word)
             if not self.is_sentence_terminated(word.text):
                 continue
diff --git a/server/reflector/processors/types.py b/server/reflector/processors/types.py
index e867becf..686c5785 100644
--- a/server/reflector/processors/types.py
+++ b/server/reflector/processors/types.py
@@ -2,8 +2,12 @@ import io
 import tempfile
 from pathlib import Path
 
+from profanityfilter import ProfanityFilter
 from pydantic import BaseModel, PrivateAttr
 
+profanity_filter = ProfanityFilter()
+profanity_filter.set_censor("*")
+
 
 class AudioFile(BaseModel):
     name: str
@@ -43,13 +47,29 @@ class Word(BaseModel):
     text: str
     start: float
    end: float
+    speaker: int = 0
+
+
+class TranscriptSegment(BaseModel):
+    text: str
+    start: float
+    speaker: int = 0
 
 
 class Transcript(BaseModel):
-    text: str = ""
     translation: str | None = None
     words: list[Word] = None
 
+    @property
+    def raw_text(self):
+        # Uncensored text
+        return "".join([word.text for word in self.words])
+
+    @property
+    def text(self):
+        # Censored text
+        return profanity_filter.censor(self.raw_text).strip()
+
     @property
     def human_timestamp(self):
         minutes = int(self.timestamp / 60)
@@ -74,7 +94,6 @@
             self.words = other.words
         else:
             self.words.extend(other.words)
-        self.text += other.text
 
     def add_offset(self, offset: float):
         for word in self.words:
@@ -87,6 +106,49 @@
         ]
-        return Transcript(text=self.text, translation=self.translation, words=words)
+        return Transcript(translation=self.translation, words=words)
 
+    def as_segments(self):
+        # From the list of words, build a list of segments: join words that
+        # are less than 2 seconds apart, but split on a speaker change, on a
+        # long pause, or at . ; : ? ! … once the segment exceeds 80 chars
+        segments = []
+        current_segment = None
+        last_word = None
+        BLANK_TIME_SECS = 2
+        MAX_SEGMENT_LENGTH = 80
+        for word in self.words:
+            if current_segment is None:
+                current_segment = TranscriptSegment(
+                    text=word.text,
+                    start=word.start,
+                    speaker=word.speaker,
+                )
+                last_word = word
+                continue
+            is_blank = False
+            if last_word:
+                is_blank = word.start - last_word.end > BLANK_TIME_SECS
+            if (
+                word.speaker != current_segment.speaker
+                or (
+                    word.text in ".;:?!…"
+                    and len(current_segment.text) > MAX_SEGMENT_LENGTH
+                )
+                or is_blank
+            ):
+                # close the current segment and start a new one
+                segments.append(current_segment)
+                current_segment = TranscriptSegment(
+                    text=word.text,
+                    start=word.start,
+                    speaker=word.speaker,
+                )
+            else:
+                current_segment.text += word.text
+            last_word = word
+        if current_segment:
+            segments.append(current_segment)
+        return segments
+
 
 class TitleSummary(BaseModel):
     title: str
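The new `as_segments()` feeds the per-speaker segments used in the views change below. A quick sketch of the splitting rules, with invented words and timings: a speaker change or a gap longer than BLANK_TIME_SECS starts a new segment:

from reflector.processors.types import Transcript, Word

transcript = Transcript(
    words=[
        Word(text=" hi", start=0.0, end=0.3, speaker=0),
        Word(text=" there", start=0.4, end=0.8, speaker=0),
        Word(text=" hello", start=1.0, end=1.4, speaker=1),  # speaker change
        Word(text=" anyway", start=4.0, end=4.5, speaker=1), # >2 s gap
    ]
)
for segment in transcript.as_segments():
    print(segment.speaker, segment.start, repr(segment.text))
# 0 0.0 ' hi there'
# 1 1.0 ' hello'
# 1 4.0 ' anyway'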
diff --git a/server/reflector/views/transcripts.py b/server/reflector/views/transcripts.py
index a7e01b8c..0a068c17 100644
--- a/server/reflector/views/transcripts.py
+++ b/server/reflector/views/transcripts.py
@@ -49,12 +49,18 @@ class TranscriptText(BaseModel):
     translation: str | None
 
 
+class TranscriptSegmentTopic(BaseModel):
+    speaker: int
+    text: str
+    timestamp: float
+
+
 class TranscriptTopic(BaseModel):
     id: str = Field(default_factory=generate_uuid4)
     title: str
     summary: str
-    transcript: str | None = None
     timestamp: float
+    segments: list[TranscriptSegmentTopic] = []
 
 
 class TranscriptFinalShortSummary(BaseModel):
@@ -523,8 +529,15 @@ async def handle_rtc_event(event: PipelineEvent, args, data):
             topic = TranscriptTopic(
                 title=data.title,
                 summary=data.summary,
-                transcript=data.transcript.text,
                 timestamp=data.timestamp,
+                segments=[
+                    TranscriptSegmentTopic(
+                        speaker=segment.speaker,
+                        text=segment.text,
+                        timestamp=segment.start,
+                    )
+                    for segment in data.transcript.as_segments()
+                ],
             )
             resp = transcript.add_event(event=event, data=topic)
             transcript.upsert_topic(topic)
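For reference, a sketch of the topic payload handle_rtc_event now emits, per the models above (title, summary, and timings invented; `.dict()` assumes pydantic v1, use `.model_dump()` on v2):

from reflector.views.transcripts import TranscriptSegmentTopic, TranscriptTopic

topic = TranscriptTopic(
    title="Weekly sync",
    summary="Status updates from the team.",
    timestamp=12.5,
    segments=[
        TranscriptSegmentTopic(speaker=0, text=" hi there", timestamp=12.5),
        TranscriptSegmentTopic(speaker=1, text=" hello", timestamp=14.0),
    ],
)
# The stored event carries a generated "id" plus per-speaker segments
# instead of the old flat transcript string.
print(topic.dict())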