server: add tests on segmentation and fix issue with speaker

This commit is contained in:
2023-10-20 16:14:30 +02:00
committed by Mathieu Virbel
parent 01d7add6cc
commit f4cffc0e66
2 changed files with 34 additions and 5 deletions

View File

@@ -125,16 +125,30 @@ class Transcript(BaseModel):
speaker=word.speaker,
)
continue
# If the word is attached to another speaker, push the current segment
# and start a new one
if word.speaker != current_segment.speaker:
segments.append(current_segment)
current_segment = TranscriptSegment(
text=word.text,
start=word.start,
speaker=word.speaker,
)
continue
# if the word is the end of a sentence, and we have enough content,
# add the word to the current segment and push it
current_segment.text += word.text
have_punc = PUNC_RE.search(word.text)
if word.speaker != current_segment.speaker or (
have_punc and (len(current_segment.text) > MAX_SEGMENT_LENGTH)
):
if have_punc and (len(current_segment.text) > MAX_SEGMENT_LENGTH):
segments.append(current_segment)
current_segment = None
if current_segment:
segments.append(current_segment)
return segments

View File

@@ -142,5 +142,20 @@ def test_processor_transcript_segment():
]
)
for segment in transcript.as_segments():
print(segment)
segments = transcript.as_segments()
assert len(segments) == 7
# check speaker order
assert segments[0].speaker == 0
assert segments[1].speaker == 0
assert segments[2].speaker == 0
assert segments[3].speaker == 1
assert segments[4].speaker == 2
assert segments[5].speaker == 0
assert segments[6].speaker == 0
# check the timing (first entry, and the first segment after each speaker change)
assert segments[0].start == 5.12
assert segments[3].start == 30.72
assert segments[4].start == 31.56
assert segments[5].start == 32.38