mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-20 20:29:06 +00:00
server: improve split algorithm
This commit is contained in:
@@ -1,10 +1,13 @@
|
|||||||
import io
|
import io
|
||||||
|
import re
|
||||||
import tempfile
|
import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from profanityfilter import ProfanityFilter
|
from profanityfilter import ProfanityFilter
|
||||||
from pydantic import BaseModel, PrivateAttr
|
from pydantic import BaseModel, PrivateAttr
|
||||||
|
|
||||||
|
PUNC_RE = re.compile(r"[.;:?!…]")
|
||||||
|
|
||||||
profanity_filter = ProfanityFilter()
|
profanity_filter = ProfanityFilter()
|
||||||
profanity_filter.set_censor("*")
|
profanity_filter.set_censor("*")
|
||||||
|
|
||||||
@@ -106,15 +109,14 @@ class Transcript(BaseModel):
|
|||||||
]
|
]
|
||||||
return Transcript(text=self.text, translation=self.translation, words=words)
|
return Transcript(text=self.text, translation=self.translation, words=words)
|
||||||
|
|
||||||
def as_segments(self):
|
def as_segments(self) -> list[TranscriptSegment]:
|
||||||
# from a list of word, create a list of segments
|
# from a list of word, create a list of segments
|
||||||
# join the word that are less than 2 seconds apart
|
# join the word that are less than 2 seconds apart
|
||||||
# but separate if the speaker changes, or if the punctuation is a . , ; : ? !
|
# but separate if the speaker changes, or if the punctuation is a . , ; : ? !
|
||||||
segments = []
|
segments = []
|
||||||
current_segment = None
|
current_segment = None
|
||||||
last_word = None
|
MAX_SEGMENT_LENGTH = 120
|
||||||
BLANK_TIME_SECS = 2
|
|
||||||
MAX_SEGMENT_LENGTH = 80
|
|
||||||
for word in self.words:
|
for word in self.words:
|
||||||
if current_segment is None:
|
if current_segment is None:
|
||||||
current_segment = TranscriptSegment(
|
current_segment = TranscriptSegment(
|
||||||
@@ -123,27 +125,14 @@ class Transcript(BaseModel):
|
|||||||
speaker=word.speaker,
|
speaker=word.speaker,
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
is_blank = False
|
current_segment.text += word.text
|
||||||
if last_word:
|
|
||||||
is_blank = word.start - last_word.end > BLANK_TIME_SECS
|
have_punc = PUNC_RE.search(word.text)
|
||||||
if (
|
if word.speaker != current_segment.speaker or (
|
||||||
word.speaker != current_segment.speaker
|
have_punc and (len(current_segment.text) > MAX_SEGMENT_LENGTH)
|
||||||
or (
|
|
||||||
word.text in ".;:?!…"
|
|
||||||
and len(current_segment.text) > MAX_SEGMENT_LENGTH
|
|
||||||
)
|
|
||||||
or is_blank
|
|
||||||
):
|
):
|
||||||
# check which condition triggered
|
|
||||||
segments.append(current_segment)
|
segments.append(current_segment)
|
||||||
current_segment = TranscriptSegment(
|
current_segment = None
|
||||||
text=word.text,
|
|
||||||
start=word.start,
|
|
||||||
speaker=word.speaker,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
current_segment.text += word.text
|
|
||||||
last_word = word
|
|
||||||
if current_segment:
|
if current_segment:
|
||||||
segments.append(current_segment)
|
segments.append(current_segment)
|
||||||
return segments
|
return segments
|
||||||
|
|||||||
Reference in New Issue
Block a user