server: improve split algorithm

2026-02-04 18:06:48 +00:00 · 2023-10-20 16:06:35 +02:00
parent 6d074ed457
commit 00eb9bbf3c
1 changed files with 12 additions and 23 deletions
--- a/server/reflector/processors/types.py
+++ b/server/reflector/processors/types.py
@@ -1,10 +1,13 @@
 import io
 import re
 import tempfile
 from pathlib import Path
 from profanityfilter import ProfanityFilter
 from pydantic import BaseModel, PrivateAttr
 PUNC_RE = re.compile(r"[.;:?!…]")
 profanity_filter = ProfanityFilter()
 profanity_filter.set_censor("*")
@@ -106,15 +109,14 @@ class Transcript(BaseModel):
        ]
        return Transcript(text=self.text, translation=self.translation, words=words)
-    def as_segments(self):
+    def as_segments(self) -> list[TranscriptSegment]:
        # from a list of words, create a list of segments
        # join the words that are less than 2 seconds apart
        # but separate if the speaker changes, or if the punctuation is one of . ; : ? ! …
        segments = []
        current_segment = None
-        last_word = None
+        MAX_SEGMENT_LENGTH = 120
-        BLANK_TIME_SECS = 2
+
        MAX_SEGMENT_LENGTH = 80
        for word in self.words:
            if current_segment is None:
                current_segment = TranscriptSegment(
@@ -123,27 +125,14 @@ class Transcript(BaseModel):
                    speaker=word.speaker,
                )
                continue
-            is_blank = False
+            current_segment.text += word.text
-            if last_word:
+
-                is_blank = word.start - last_word.end > BLANK_TIME_SECS
+            have_punc = PUNC_RE.search(word.text)
-            if (
+            if word.speaker != current_segment.speaker or (
-                word.speaker != current_segment.speaker
+                have_punc and (len(current_segment.text) > MAX_SEGMENT_LENGTH)
                or (
                    word.text in ".;:?!…"
                    and len(current_segment.text) > MAX_SEGMENT_LENGTH
                )
                or is_blank
            ):
                # check which condition triggered
                segments.append(current_segment)
-                current_segment = TranscriptSegment(
+                current_segment = None
                    text=word.text,
                    start=word.start,
                    speaker=word.speaker,
                )
            else:
                current_segment.text += word.text
            last_word = word
        if current_segment:
            segments.append(current_segment)
        return segments