server: improve split algorithm

This commit is contained in:
2023-10-20 16:06:35 +02:00
committed by Mathieu Virbel
parent 6d074ed457
commit 00eb9bbf3c

View File

@@ -1,10 +1,13 @@
import io import io
import re
import tempfile import tempfile
from pathlib import Path from pathlib import Path
from profanityfilter import ProfanityFilter from profanityfilter import ProfanityFilter
from pydantic import BaseModel, PrivateAttr from pydantic import BaseModel, PrivateAttr
PUNC_RE = re.compile(r"[.;:?!…]")
profanity_filter = ProfanityFilter() profanity_filter = ProfanityFilter()
profanity_filter.set_censor("*") profanity_filter.set_censor("*")
@@ -106,15 +109,14 @@ class Transcript(BaseModel):
] ]
return Transcript(text=self.text, translation=self.translation, words=words) return Transcript(text=self.text, translation=self.translation, words=words)
def as_segments(self): def as_segments(self) -> list[TranscriptSegment]:
# from a list of words, create a list of segments # from a list of words, create a list of segments
# join the words that are less than 2 seconds apart # join the words that are less than 2 seconds apart
# but separate if the speaker changes, or if the punctuation is a . , ; : ? ! # but separate if the speaker changes, or if the punctuation is a . , ; : ? !
segments = [] segments = []
current_segment = None current_segment = None
last_word = None MAX_SEGMENT_LENGTH = 120
BLANK_TIME_SECS = 2
MAX_SEGMENT_LENGTH = 80
for word in self.words: for word in self.words:
if current_segment is None: if current_segment is None:
current_segment = TranscriptSegment( current_segment = TranscriptSegment(
@@ -123,27 +125,14 @@ class Transcript(BaseModel):
speaker=word.speaker, speaker=word.speaker,
) )
continue continue
is_blank = False current_segment.text += word.text
if last_word:
is_blank = word.start - last_word.end > BLANK_TIME_SECS have_punc = PUNC_RE.search(word.text)
if ( if word.speaker != current_segment.speaker or (
word.speaker != current_segment.speaker have_punc and (len(current_segment.text) > MAX_SEGMENT_LENGTH)
or (
word.text in ".;:?!…"
and len(current_segment.text) > MAX_SEGMENT_LENGTH
)
or is_blank
): ):
# check which condition triggered
segments.append(current_segment) segments.append(current_segment)
current_segment = TranscriptSegment( current_segment = None
text=word.text,
start=word.start,
speaker=word.speaker,
)
else:
current_segment.text += word.text
last_word = word
if current_segment: if current_segment:
segments.append(current_segment) segments.append(current_segment)
return segments return segments