From 00eb9bbf3c5ccdb8f7f3f9364ae952d821897759 Mon Sep 17 00:00:00 2001
From: Mathieu Virbel <mat@meltingrocks.com>
Date: Fri, 20 Oct 2023 16:06:35 +0200
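Subject: [PATCH] server: improve split algorithm

Rework Transcript.as_segments():

- Match punctuation with a compiled regex (PUNC_RE) anywhere in the word
  text, instead of requiring the whole word to be a substring of ".;:?!…".
- Append each word to the current segment before testing the split
  condition, so a punctuated word now closes its segment instead of
  opening the next one.
- Raise MAX_SEGMENT_LENGTH from 80 to 120.
- Drop the split on silences longer than BLANK_TIME_SECS (2 seconds).
- Annotate the return type as list[TranscriptSegment].

Known issues: because the word is appended before the speaker comparison,
the first word of a new speaker ends up in the previous speaker's segment.
The "less than 2 seconds apart" comment in as_segments() is stale after
this change.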
---
 server/reflector/processors/types.py | 35 ++++++++++------------------
 1 file changed, 12 insertions(+), 23 deletions(-)

diff --git a/server/reflector/processors/types.py b/server/reflector/processors/types.py
index 686c5785..ba0cccf9 100644
--- a/server/reflector/processors/types.py
+++ b/server/reflector/processors/types.py
@@ -1,10 +1,13 @@
 import io
+import re
 import tempfile
 from pathlib import Path
 
 from profanityfilter import ProfanityFilter
 from pydantic import BaseModel, PrivateAttr
 
+PUNC_RE = re.compile(r"[.;:?!…]")
+
 profanity_filter = ProfanityFilter()
 profanity_filter.set_censor("*")
 
@@ -106,15 +109,14 @@ class Transcript(BaseModel):
         ]
         return Transcript(text=self.text, translation=self.translation, words=words)
 
-    def as_segments(self):
+    def as_segments(self) -> list[TranscriptSegment]:
         # from a list of word, create a list of segments
         # join the word that are less than 2 seconds apart
         # but separate if the speaker changes, or if the punctuation is a . , ; : ? !
         segments = []
         current_segment = None
-        last_word = None
-        BLANK_TIME_SECS = 2
-        MAX_SEGMENT_LENGTH = 80
+        MAX_SEGMENT_LENGTH = 120
+
         for word in self.words:
             if current_segment is None:
                 current_segment = TranscriptSegment(
@@ -123,27 +125,14 @@ class Transcript(BaseModel):
                     speaker=word.speaker,
                 )
                 continue
-            is_blank = False
-            if last_word:
-                is_blank = word.start - last_word.end > BLANK_TIME_SECS
-            if (
-                word.speaker != current_segment.speaker
-                or (
-                    word.text in ".;:?!…"
-                    and len(current_segment.text) > MAX_SEGMENT_LENGTH
-                )
-                or is_blank
+            current_segment.text += word.text
+
+            have_punc = PUNC_RE.search(word.text)
+            if word.speaker != current_segment.speaker or (
+                have_punc and (len(current_segment.text) > MAX_SEGMENT_LENGTH)
             ):
-                # check which condition triggered
                 segments.append(current_segment)
-                current_segment = TranscriptSegment(
-                    text=word.text,
-                    start=word.start,
-                    speaker=word.speaker,
-                )
-            else:
-                current_segment.text += word.text
-            last_word = word
+                current_segment = None
         if current_segment:
             segments.append(current_segment)
         return segments