Files
reflector/server/reflector/processors/transcript_liner.py
2023-09-26 19:49:54 +05:30

54 lines
1.6 KiB
Python

from reflector.processors.base import Processor
from reflector.processors.types import Transcript
class TranscriptLinerProcessor(Processor):
    """
    Buffer a stream of transcripts and re-emit it cut into lines.

    Incoming transcripts are merged into an internal buffer through
    ``Transcript.merge`` (any de-duplication of words is whatever that
    method implements). A line is emitted every time a word containing a
    sentence terminator (".", "?" or "!") is reached. Text that reaches
    ``max_text`` characters without any terminator is emitted as-is, so
    the buffer cannot grow without bound.
    """

    INPUT_TYPE = Transcript
    OUTPUT_TYPE = Transcript

    def __init__(self, max_text: int = 1000, **kwargs):
        """
        Create the processor with an empty buffer.

        :param max_text: length, in characters, at or above which buffered
            text is emitted even though it has no sentence terminator.
        :param kwargs: forwarded unchanged to ``Processor.__init__``.
        """
        super().__init__(**kwargs)
        # Words received so far that have not been emitted as a line yet.
        self.transcript = Transcript(words=[])
        self.max_text = max_text

    def is_sentence_terminated(self, sentence: str) -> bool:
        """
        Return True if ``sentence`` contains ".", "?" or "!" anywhere.

        This is a substring test, not an end-of-string test: a terminator
        in the middle of the text (e.g. "3.14") also counts.
        """
        return any(terminator in sentence for terminator in (".", "?", "!"))

    async def _push(self, data: Transcript):
        """
        Merge ``data`` into the buffer and emit every line that is ready.

        Nothing is emitted while the buffered text has no sentence
        terminator and is shorter than ``max_text``. Otherwise the buffered
        words are walked in order and a transcript is emitted after each
        word that contains a terminator. The trailing unterminated words
        stay buffered, unless their text has reached ``max_text``, in which
        case they are emitted too.
        """
        # merge both transcript
        self.transcript.merge(data)

        # no complete line yet and still short enough: wait for more input
        buffered_text = self.transcript.text
        if (
            not self.is_sentence_terminated(buffered_text)
            and len(buffered_text) < self.max_text
        ):
            return

        # cut after each word that carries a sentence terminator
        partial = Transcript(words=[])
        # iterate over a snapshot so the walk is unaffected if the buffer's
        # word list changes while awaiting emit()
        for word in self.transcript.words[:]:
            partial.text += word.text
            partial.words.append(word)

            if not self.is_sentence_terminated(word.text):
                continue

            # emit line
            await self.emit(partial)

            # start collecting the next line
            partial = Transcript(words=[])

        # The remainder has no terminator. Previously it was always put back
        # in the buffer, so max_text never forced anything out and the
        # buffer could grow indefinitely. Emit it once it is too long.
        if partial.words and len(partial.text) >= self.max_text:
            await self.emit(partial)
            partial = Transcript(words=[])

        # keep whatever is left for the next push
        self.transcript = partial

    async def _flush(self):
        """Emit the words still buffered, if any, as a final line."""
        if self.transcript.words:
            await self.emit(self.transcript)