diff --git a/server/reflector/processors/types.py b/server/reflector/processors/types.py index ba0cccf9..d2c32d17 100644 --- a/server/reflector/processors/types.py +++ b/server/reflector/processors/types.py @@ -125,16 +125,30 @@ class Transcript(BaseModel): speaker=word.speaker, ) continue + + # If the word is attached to another speaker, push the current segment + # and start a new one + if word.speaker != current_segment.speaker: + segments.append(current_segment) + current_segment = TranscriptSegment( + text=word.text, + start=word.start, + speaker=word.speaker, + ) + continue + + # if the word is the end of a sentence, and we have enough content, + # add the word to the current segment and push it current_segment.text += word.text have_punc = PUNC_RE.search(word.text) - if word.speaker != current_segment.speaker or ( - have_punc and (len(current_segment.text) > MAX_SEGMENT_LENGTH) - ): + if have_punc and (len(current_segment.text) > MAX_SEGMENT_LENGTH): segments.append(current_segment) current_segment = None + if current_segment: segments.append(current_segment) + return segments diff --git a/server/tests/test_processor_transcript_segment.py b/server/tests/test_processor_transcript_segment.py index 3bb6182f..6fde0dd1 100644 --- a/server/tests/test_processor_transcript_segment.py +++ b/server/tests/test_processor_transcript_segment.py @@ -142,5 +142,20 @@ def test_processor_transcript_segment(): ] ) - for segment in transcript.as_segments(): - print(segment) + segments = transcript.as_segments() + assert len(segments) == 7 + + # check speaker order + assert segments[0].speaker == 0 + assert segments[1].speaker == 0 + assert segments[2].speaker == 0 + assert segments[3].speaker == 1 + assert segments[4].speaker == 2 + assert segments[5].speaker == 0 + assert segments[6].speaker == 0 + + # check the timing (first entry, and first of other speakers) + assert segments[0].start == 5.12 + assert segments[3].start == 30.72 + assert segments[4].start == 31.56 + assert 
segments[5].start == 32.38