feat: Multitrack segmentation (#747)
* segmentation multitrack (no-mistakes)
* segmentation multitrack (no-mistakes)
* self review
* self review
* recording poll daily doc
* filter cam_audio tracks to remove screensharing from daily processing
* pr review

---------

Co-authored-by: Igor Loskutov <igor.loskutoff@gmail.com>
@@ -35,8 +35,15 @@ class Recording(BaseModel):
     status: Literal["pending", "processing", "completed", "failed"] = "pending"
     meeting_id: str | None = None
     # for multitrack reprocessing
+    # track_keys can be empty list [] if recording finished but no audio was captured (silence/muted)
+    # None means not a multitrack recording, [] means multitrack with no tracks
     track_keys: list[str] | None = None
 
+    @property
+    def is_multitrack(self) -> bool:
+        """True if recording has separate audio tracks (1+ tracks counts as multitrack)."""
+        return self.track_keys is not None and len(self.track_keys) > 0
+
 
 class RecordingController:
     async def create(self, recording: Recording):
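A quick sanity check of the new property; a minimal sketch that builds Recording with the same constructor arguments the new tests in this commit use:

    from datetime import datetime, timezone

    from reflector.db.recordings import Recording

    kwargs = dict(
        bucket_name="test-bucket",
        object_key="test-key",
        recorded_at=datetime.now(timezone.utc),
    )

    # None: classic single-file recording, not multitrack
    assert not Recording(**kwargs).is_multitrack
    # []: multitrack recording that finished with no audio captured
    assert not Recording(**kwargs, track_keys=[]).is_multitrack
    # one or more tracks: multitrack
    assert Recording(**kwargs, track_keys=["track1.webm"]).is_multitrack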
@@ -1,6 +1,7 @@
 import io
 import re
 import tempfile
+from collections import defaultdict
 from pathlib import Path
 from typing import Annotated, TypedDict
 
@@ -16,6 +17,17 @@ class DiarizationSegment(TypedDict):
 
 
 PUNC_RE = re.compile(r"[.;:?!…]")
+SENTENCE_END_RE = re.compile(r"[.?!…]$")
+
+# Max segment length for words_to_segments() - breaks on any punctuation (. ; : ? ! …)
+# when segment exceeds this limit. Used for non-multitrack recordings.
+MAX_SEGMENT_CHARS = 120
+
+# Max segment length for words_to_segments_by_sentence() - only breaks on sentence-ending
+# punctuation (. ? ! …) when segment exceeds this limit. Higher threshold allows complete
+# sentences in multitrack recordings where speakers overlap.
+# similar number to server/reflector/processors/transcript_liner.py
+MAX_SENTENCE_SEGMENT_CHARS = 1000
 
 
 class AudioFile(BaseModel):
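The two regexes encode the difference between the modes: PUNC_RE matches any clause punctuation, while SENTENCE_END_RE only matches sentence-ending punctuation anchored at the end of a word. An illustrative comparison, standalone, redefining the patterns from above:

    import re

    PUNC_RE = re.compile(r"[.;:?!…]")
    SENTENCE_END_RE = re.compile(r"[.?!…]$")

    # a semicolon is a break point for words_to_segments() ...
    assert PUNC_RE.search("however;")
    # ... but not a sentence end for words_to_segments_by_sentence()
    assert not SENTENCE_END_RE.search("however;")
    # a period qualifies for both
    assert PUNC_RE.search("done.") and SENTENCE_END_RE.search("done.")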
@@ -76,7 +88,6 @@ def words_to_segments(words: list[Word]) -> list[TranscriptSegment]:
     # but separate if the speaker changes, or if the punctuation is a . , ; : ? !
     segments = []
     current_segment = None
-    MAX_SEGMENT_LENGTH = 120
 
     for word in words:
         if current_segment is None:
@@ -106,7 +117,7 @@ def words_to_segments(words: list[Word]) -> list[TranscriptSegment]:
         current_segment.end = word.end
 
         have_punc = PUNC_RE.search(word.text)
-        if have_punc and (len(current_segment.text) > MAX_SEGMENT_LENGTH):
+        if have_punc and (len(current_segment.text) > MAX_SEGMENT_CHARS):
             segments.append(current_segment)
             current_segment = None
 
@@ -116,6 +127,70 @@ def words_to_segments(words: list[Word]) -> list[TranscriptSegment]:
     return segments
 
 
+def words_to_segments_by_sentence(words: list[Word]) -> list[TranscriptSegment]:
+    """Group words by speaker, then split into sentences.
+
+    For multitrack recordings where words from different speakers are interleaved
+    by timestamp, this function first groups all words by speaker, then creates
+    segments based on sentence boundaries within each speaker's words.
+
+    This produces cleaner output than words_to_segments() which breaks on every
+    speaker change, resulting in many tiny segments when speakers overlap.
+    """
+    if not words:
+        return []
+
+    # Group words by speaker, preserving order within each speaker
+    by_speaker: dict[int, list[Word]] = defaultdict(list)
+    for w in words:
+        by_speaker[w.speaker].append(w)
+
+    segments: list[TranscriptSegment] = []
+
+    for speaker, speaker_words in by_speaker.items():
+        current_text = ""
+        current_start: float | None = None
+        current_end: float = 0.0
+
+        for word in speaker_words:
+            if current_start is None:
+                current_start = word.start
+
+            current_text += word.text
+            current_end = word.end
+
+            # Check for sentence end or max length
+            is_sentence_end = SENTENCE_END_RE.search(word.text.strip())
+            is_too_long = len(current_text) >= MAX_SENTENCE_SEGMENT_CHARS
+
+            if is_sentence_end or is_too_long:
+                segments.append(
+                    TranscriptSegment(
+                        text=current_text,
+                        start=current_start,
+                        end=current_end,
+                        speaker=speaker,
+                    )
+                )
+                current_text = ""
+                current_start = None
+
+        # Flush remaining words for this speaker
+        if current_text and current_start is not None:
+            segments.append(
+                TranscriptSegment(
+                    text=current_text,
+                    start=current_start,
+                    end=current_end,
+                    speaker=speaker,
+                )
+            )
+
+    # Sort segments by start time
+    segments.sort(key=lambda s: s.start)
+    return segments
+
+
 class Transcript(BaseModel):
     translation: str | None = None
     words: list[Word] = []
@@ -154,7 +229,9 @@ class Transcript(BaseModel):
             word.start += offset
             word.end += offset
 
-    def as_segments(self) -> list[TranscriptSegment]:
+    def as_segments(self, is_multitrack: bool = False) -> list[TranscriptSegment]:
+        if is_multitrack:
+            return words_to_segments_by_sentence(self.words)
         return words_to_segments(self.words)
 
 
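The effect of the new flag in one snippet, using the same interleaved words as the unit test added later in this commit:

    from reflector.processors.types import Transcript, Word

    transcript = Transcript(
        words=[
            Word(text="Hello ", start=0.0, end=0.5, speaker=0),
            Word(text="I'm ", start=0.5, end=0.8, speaker=1),
            Word(text="there.", start=0.5, end=1.0, speaker=0),
            Word(text="good.", start=1.0, end=1.5, speaker=1),
        ]
    )

    # default: a new segment on every speaker change -> 4 fragments
    assert len(transcript.as_segments()) == 4
    # multitrack: group by speaker, split on sentence ends -> 2 segments
    assert len(transcript.as_segments(is_multitrack=True)) == 2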
@@ -64,6 +64,11 @@ def recording_lock_key(recording_id: NonEmptyString) -> NonEmptyString:
     return f"recording:{recording_id}"
 
 
+def filter_cam_audio_tracks(track_keys: list[str]) -> list[str]:
+    """Filter track keys to cam-audio tracks only (skip screen-audio, etc.)."""
+    return [k for k in track_keys if "cam-audio" in k]
+
+
 def extract_base_room_name(daily_room_name: DailyRoomName) -> NonEmptyString:
     """
     Extract base room name from Daily.co timestamped room name.
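Usage sketch for the new helper; the key names here are hypothetical, only the "cam-audio" substring matters to the filter:

    tracks = [
        "room/1700000000000_participantA_cam-audio.webm",
        "room/1700000000000_participantA_screen-audio.webm",
        "room/1700000000000_participantB_cam-audio.webm",
    ]
    assert filter_cam_audio_tracks(tracks) == [tracks[0], tracks[2]]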
@@ -6,9 +6,6 @@ from reflector.db.transcripts import TranscriptParticipant, TranscriptTopic
 from reflector.processors.types import (
     Transcript as ProcessorTranscript,
 )
-from reflector.processors.types import (
-    words_to_segments,
-)
 from reflector.schemas.transcript_formats import TranscriptSegment
 from reflector.utils.webvtt import seconds_to_timestamp
 
@@ -32,7 +29,9 @@ def format_timestamp_mmss(seconds: float | int) -> str:
 
 
 def transcript_to_text(
-    topics: list[TranscriptTopic], participants: list[TranscriptParticipant] | None
+    topics: list[TranscriptTopic],
+    participants: list[TranscriptParticipant] | None,
+    is_multitrack: bool = False,
 ) -> str:
     """Convert transcript topics to plain text with speaker names."""
     lines = []
@@ -41,7 +40,7 @@ def transcript_to_text(
             continue
 
         transcript = ProcessorTranscript(words=topic.words)
-        segments = transcript.as_segments()
+        segments = transcript.as_segments(is_multitrack)
 
         for segment in segments:
             speaker_name = get_speaker_name(segment.speaker, participants)
@@ -52,7 +51,9 @@ def transcript_to_text(
 
 
 def transcript_to_text_timestamped(
-    topics: list[TranscriptTopic], participants: list[TranscriptParticipant] | None
+    topics: list[TranscriptTopic],
+    participants: list[TranscriptParticipant] | None,
+    is_multitrack: bool = False,
 ) -> str:
     """Convert transcript topics to timestamped text with speaker names."""
     lines = []
@@ -61,7 +62,7 @@ def transcript_to_text_timestamped(
             continue
 
         transcript = ProcessorTranscript(words=topic.words)
-        segments = transcript.as_segments()
+        segments = transcript.as_segments(is_multitrack)
 
         for segment in segments:
             speaker_name = get_speaker_name(segment.speaker, participants)
@@ -73,7 +74,9 @@ def transcript_to_text_timestamped(
 
 
 def topics_to_webvtt_named(
-    topics: list[TranscriptTopic], participants: list[TranscriptParticipant] | None
+    topics: list[TranscriptTopic],
+    participants: list[TranscriptParticipant] | None,
+    is_multitrack: bool = False,
 ) -> str:
     """Convert transcript topics to WebVTT format with participant names."""
     vtt = webvtt.WebVTT()
@@ -82,7 +85,8 @@ def topics_to_webvtt_named(
         if not topic.words:
             continue
 
-        segments = words_to_segments(topic.words)
+        transcript = ProcessorTranscript(words=topic.words)
+        segments = transcript.as_segments(is_multitrack)
 
         for segment in segments:
             speaker_name = get_speaker_name(segment.speaker, participants)
@@ -100,19 +104,23 @@ def topics_to_webvtt_named(
 
 
 def transcript_to_json_segments(
-    topics: list[TranscriptTopic], participants: list[TranscriptParticipant] | None
+    topics: list[TranscriptTopic],
+    participants: list[TranscriptParticipant] | None,
+    is_multitrack: bool = False,
 ) -> list[TranscriptSegment]:
     """Convert transcript topics to a flat list of JSON segments."""
-    segments = []
+    result = []
 
     for topic in topics:
         if not topic.words:
             continue
 
         transcript = ProcessorTranscript(words=topic.words)
-        for segment in transcript.as_segments():
+        segments = transcript.as_segments(is_multitrack)
+
+        for segment in segments:
             speaker_name = get_speaker_name(segment.speaker, participants)
-            segments.append(
+            result.append(
                 TranscriptSegment(
                     speaker=segment.speaker,
                     speaker_name=speaker_name,
@@ -122,4 +130,4 @@ def transcript_to_json_segments(
             )
         )
 
-    return segments
+    return result
@@ -16,6 +16,7 @@ from pydantic import (
 
 import reflector.auth as auth
 from reflector.db import get_database
+from reflector.db.recordings import recordings_controller
 from reflector.db.search import (
     DEFAULT_SEARCH_LIMIT,
     SearchLimit,
@@ -60,6 +61,14 @@ ALGORITHM = "HS256"
 DOWNLOAD_EXPIRE_MINUTES = 60
 
 
+async def _get_is_multitrack(transcript) -> bool:
+    """Detect if transcript is from multitrack recording."""
+    if not transcript.recording_id:
+        return False
+    recording = await recordings_controller.get_by_id(transcript.recording_id)
+    return recording is not None and recording.is_multitrack
+
+
 def create_access_token(data: dict, expires_delta: timedelta):
     to_encode = data.copy()
     expire = datetime.now(timezone.utc) + expires_delta
@@ -360,7 +369,7 @@ class GetTranscriptTopic(BaseModel):
     segments: list[GetTranscriptSegmentTopic] = []
 
     @classmethod
-    def from_transcript_topic(cls, topic: TranscriptTopic):
+    def from_transcript_topic(cls, topic: TranscriptTopic, is_multitrack: bool = False):
         if not topic.words:
             # In previous version, words were missing
             # Just output a segment with speaker 0
@@ -384,7 +393,7 @@ class GetTranscriptTopic(BaseModel):
                 start=segment.start,
                 speaker=segment.speaker,
             )
-            for segment in transcript.as_segments()
+            for segment in transcript.as_segments(is_multitrack)
         ]
         return cls(
             id=topic.id,
@@ -401,8 +410,8 @@ class GetTranscriptTopicWithWords(GetTranscriptTopic):
     words: list[Word] = []
 
     @classmethod
-    def from_transcript_topic(cls, topic: TranscriptTopic):
-        instance = super().from_transcript_topic(topic)
+    def from_transcript_topic(cls, topic: TranscriptTopic, is_multitrack: bool = False):
+        instance = super().from_transcript_topic(topic, is_multitrack)
         if topic.words:
             instance.words = topic.words
         return instance
@@ -417,8 +426,8 @@ class GetTranscriptTopicWithWordsPerSpeaker(GetTranscriptTopic):
     words_per_speaker: list[SpeakerWords] = []
 
     @classmethod
-    def from_transcript_topic(cls, topic: TranscriptTopic):
-        instance = super().from_transcript_topic(topic)
+    def from_transcript_topic(cls, topic: TranscriptTopic, is_multitrack: bool = False):
+        instance = super().from_transcript_topic(topic, is_multitrack)
        if topic.words:
             words_per_speakers = []
             # group words by speaker
@@ -457,6 +466,8 @@ async def transcript_get(
         transcript_id, user_id=user_id
     )
 
+    is_multitrack = await _get_is_multitrack(transcript)
+
     base_data = {
         "id": transcript.id,
         "user_id": transcript.user_id,
@@ -483,14 +494,16 @@ async def transcript_get(
         return GetTranscriptWithText(
             **base_data,
             transcript_format="text",
-            transcript=transcript_to_text(transcript.topics, transcript.participants),
+            transcript=transcript_to_text(
+                transcript.topics, transcript.participants, is_multitrack
+            ),
         )
     elif transcript_format == "text-timestamped":
         return GetTranscriptWithTextTimestamped(
             **base_data,
             transcript_format="text-timestamped",
             transcript=transcript_to_text_timestamped(
-                transcript.topics, transcript.participants
+                transcript.topics, transcript.participants, is_multitrack
             ),
         )
     elif transcript_format == "webvtt-named":
@@ -498,7 +511,7 @@ async def transcript_get(
             **base_data,
             transcript_format="webvtt-named",
             transcript=topics_to_webvtt_named(
-                transcript.topics, transcript.participants
+                transcript.topics, transcript.participants, is_multitrack
             ),
         )
     elif transcript_format == "json":
@@ -506,7 +519,7 @@ async def transcript_get(
             **base_data,
             transcript_format="json",
             transcript=transcript_to_json_segments(
-                transcript.topics, transcript.participants
+                transcript.topics, transcript.participants, is_multitrack
             ),
         )
     else:
@@ -565,9 +578,12 @@ async def transcript_get_topics(
         transcript_id, user_id=user_id
     )
 
+    is_multitrack = await _get_is_multitrack(transcript)
+
     # convert to GetTranscriptTopic
     return [
-        GetTranscriptTopic.from_transcript_topic(topic) for topic in transcript.topics
+        GetTranscriptTopic.from_transcript_topic(topic, is_multitrack)
+        for topic in transcript.topics
     ]
 
 
@@ -584,9 +600,11 @@ async def transcript_get_topics_with_words(
         transcript_id, user_id=user_id
     )
 
+    is_multitrack = await _get_is_multitrack(transcript)
+
     # convert to GetTranscriptTopicWithWords
     return [
-        GetTranscriptTopicWithWords.from_transcript_topic(topic)
+        GetTranscriptTopicWithWords.from_transcript_topic(topic, is_multitrack)
         for topic in transcript.topics
     ]
 
@@ -605,13 +623,17 @@ async def transcript_get_topics_with_words_per_speaker(
         transcript_id, user_id=user_id
     )
 
+    is_multitrack = await _get_is_multitrack(transcript)
+
     # get the topic from the transcript
     topic = next((t for t in transcript.topics if t.id == topic_id), None)
     if not topic:
         raise HTTPException(status_code=404, detail="Topic not found")
 
     # convert to GetTranscriptTopicWithWordsPerSpeaker
-    return GetTranscriptTopicWithWordsPerSpeaker.from_transcript_topic(topic)
+    return GetTranscriptTopicWithWordsPerSpeaker.from_transcript_topic(
+        topic, is_multitrack
+    )
 
 
 @router.post("/transcripts/{transcript_id}/zulip")
@@ -2,6 +2,7 @@ import json
 import os
 import re
 from datetime import datetime, timezone
+from typing import List
 from urllib.parse import unquote
 
 import av
@@ -11,7 +12,7 @@ from celery import shared_task
 from celery.utils.log import get_task_logger
 from pydantic import ValidationError
 
-from reflector.dailyco_api import MeetingParticipantsResponse
+from reflector.dailyco_api import MeetingParticipantsResponse, RecordingResponse
 from reflector.db.daily_participant_sessions import (
     DailyParticipantSession,
     daily_participant_sessions_controller,
@@ -38,6 +39,7 @@ from reflector.storage import get_transcripts_storage
 from reflector.utils.daily import (
     DailyRoomName,
     extract_base_room_name,
+    filter_cam_audio_tracks,
     parse_daily_recording_filename,
     recording_lock_key,
 )
@@ -338,7 +340,9 @@ async def _process_multitrack_recording_inner(
             exc_info=True,
         )
 
-    for idx, key in enumerate(track_keys):
+    cam_audio_keys = filter_cam_audio_tracks(track_keys)
+
+    for idx, key in enumerate(cam_audio_keys):
         try:
             parsed = parse_daily_recording_filename(key)
             participant_id = parsed.participant_id
@@ -366,7 +370,7 @@ async def _process_multitrack_recording_inner(
     task_pipeline_multitrack_process.delay(
         transcript_id=transcript.id,
         bucket_name=bucket_name,
-        track_keys=track_keys,
+        track_keys=filter_cam_audio_tracks(track_keys),
     )
 
 
@@ -391,7 +395,7 @@ async def poll_daily_recordings():
 
     async with create_platform_client("daily") as daily_client:
         # latest 100. TODO cursor-based state
-        api_recordings = await daily_client.list_recordings()
+        api_recordings: List[RecordingResponse] = await daily_client.list_recordings()
 
         if not api_recordings:
             logger.debug(
@@ -422,17 +426,19 @@ async def poll_daily_recordings():
 
         for recording in missing_recordings:
             if not recording.tracks:
-                assert recording.status != "finished", (
-                    f"Recording {recording.id} has status='finished' but no tracks. "
-                    f"Daily.co API guarantees finished recordings have tracks available. "
-                    f"room_name={recording.room_name}"
-                )
-                logger.debug(
-                    "No tracks in recording yet",
-                    recording_id=recording.id,
-                    room_name=recording.room_name,
-                    status=recording.status,
-                )
+                if recording.status == "finished":
+                    logger.warning(
+                        "Finished recording has no tracks (no audio captured)",
+                        recording_id=recording.id,
+                        room_name=recording.room_name,
+                    )
+                else:
+                    logger.debug(
+                        "No tracks in recording yet",
+                        recording_id=recording.id,
+                        room_name=recording.room_name,
+                        status=recording.status,
+                    )
                 continue
 
             track_keys = [t.s3Key for t in recording.tracks if t.type == "audio"]
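The hunk above downgrades a hard assert to logging: a finished recording with no tracks is now treated as "no audio captured" and skipped, instead of crashing the poll loop. A minimal sketch of the new branch; the non-finished status string is illustrative:

    def log_level_for_trackless(status: str) -> str:
        # mirrors the poller's branch for recordings without tracks
        if status == "finished":
            return "warning"  # previously an AssertionError
        return "debug"  # tracks not ready yet; recording will be re-polled

    assert log_level_for_trackless("finished") == "warning"
    assert log_level_for_trackless("in-progress") == "debug"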
@@ -159,3 +159,78 @@ def test_processor_transcript_segment():
     assert segments[3].start == 30.72
     assert segments[4].start == 31.56
     assert segments[5].start == 32.38
+
+
+def test_processor_transcript_segment_multitrack_interleaved():
+    """Test as_segments(is_multitrack=True) with interleaved speakers.
+
+    Multitrack recordings have words from different speakers sorted by start time,
+    causing frequent speaker alternation. The multitrack mode should group by
+    speaker first, then split into sentences.
+    """
+    from reflector.processors.types import Transcript, Word
+
+    # Simulate real multitrack data: words sorted by start time, speakers interleave
+    # Speaker 0 says: "Hello there."
+    # Speaker 1 says: "I'm good."
+    # When sorted by time, words interleave
+    transcript = Transcript(
+        words=[
+            Word(text="Hello ", start=0.0, end=0.5, speaker=0),
+            Word(text="I'm ", start=0.5, end=0.8, speaker=1),
+            Word(text="there.", start=0.5, end=1.0, speaker=0),
+            Word(text="good.", start=1.0, end=1.5, speaker=1),
+        ]
+    )
+
+    # Default behavior (is_multitrack=False): breaks on every speaker change = 4 segments
+    segments_default = transcript.as_segments(is_multitrack=False)
+    assert len(segments_default) == 4
+
+    # Multitrack behavior: groups by speaker, then sentences = 2 segments
+    segments_multitrack = transcript.as_segments(is_multitrack=True)
+    assert len(segments_multitrack) == 2
+
+    # Check content - sorted by start time
+    assert segments_multitrack[0].speaker == 0
+    assert segments_multitrack[0].text == "Hello there."
+    assert segments_multitrack[0].start == 0.0
+    assert segments_multitrack[0].end == 1.0
+
+    assert segments_multitrack[1].speaker == 1
+    assert segments_multitrack[1].text == "I'm good."
+    assert segments_multitrack[1].start == 0.5
+    assert segments_multitrack[1].end == 1.5
+
+
+def test_processor_transcript_segment_multitrack_overlapping_timestamps():
+    """Test multitrack with exactly overlapping timestamps (real Daily.co data pattern)."""
+    from reflector.processors.types import Transcript, Word
+
+    # Real pattern from transcript 38d84d57: words with identical timestamps
+    transcript = Transcript(
+        words=[
+            Word(text="speaking ", start=6.71, end=7.11, speaker=0),
+            Word(text="Speaking ", start=6.71, end=7.11, speaker=1),
+            Word(text="at ", start=7.11, end=7.27, speaker=0),
+            Word(text="at ", start=7.11, end=7.27, speaker=1),
+            Word(text="the ", start=7.27, end=7.43, speaker=0),
+            Word(text="the ", start=7.27, end=7.43, speaker=1),
+            Word(text="same ", start=7.43, end=7.59, speaker=0),
+            Word(text="same ", start=7.43, end=7.59, speaker=1),
+            Word(text="time.", start=7.59, end=8.0, speaker=0),
+            Word(text="time.", start=7.59, end=8.0, speaker=1),
+        ]
+    )
+
+    # Default: 10 segments (one per speaker change)
+    segments_default = transcript.as_segments(is_multitrack=False)
+    assert len(segments_default) == 10
+
+    # Multitrack: 2 segments (one per speaker sentence)
+    segments_multitrack = transcript.as_segments(is_multitrack=True)
+    assert len(segments_multitrack) == 2
+
+    # Both should have complete sentences
+    assert "speaking at the same time." in segments_multitrack[0].text
+    assert "Speaking at the same time." in segments_multitrack[1].text
@@ -273,8 +273,17 @@ async def test_transcript_formats_with_multiple_speakers():
 
 
 @pytest.mark.asyncio
-async def test_transcript_formats_with_overlapping_speakers():
-    """Test format conversion when multiple speakers speak at the same time (overlapping timestamps)."""
+async def test_transcript_formats_with_overlapping_speakers_multitrack():
+    """Test format conversion for multitrack recordings with truly interleaved words.
+
+    Multitrack recordings have words from different speakers sorted by start time,
+    causing frequent speaker alternation. This tests the sentence-based segmentation
+    that groups each speaker's words into complete sentences.
+    """
+    # Real multitrack data: words sorted by start time, speakers interleave
+    # Alice says: "Hello there." (0.0-1.0)
+    # Bob says: "I'm good." (0.5-1.5)
+    # When sorted by time, words interleave: Hello, I'm, there., good.
     topics = [
         TranscriptTopic(
             id="1",
@@ -282,11 +291,10 @@ async def test_transcript_formats_with_overlapping_speakers():
             summary="Summary 1",
             timestamp=0.0,
             words=[
-                Word(text="Hello", start=0.0, end=0.5, speaker=0),
-                Word(text=" there.", start=0.5, end=1.0, speaker=0),
-                # Speaker 1 overlaps with speaker 0 at 0.5-1.0
-                Word(text="I'm", start=0.5, end=1.0, speaker=1),
-                Word(text=" good.", start=1.0, end=1.5, speaker=1),
+                Word(text="Hello ", start=0.0, end=0.5, speaker=0),
+                Word(text="I'm ", start=0.5, end=0.8, speaker=1),
+                Word(text="there.", start=0.5, end=1.0, speaker=0),
+                Word(text="good.", start=1.0, end=1.5, speaker=1),
             ],
         ),
     ]
@@ -296,20 +304,9 @@ async def test_transcript_formats_with_overlapping_speakers():
         TranscriptParticipant(id="2", speaker=1, name="Bob"),
     ]
 
-    text_result = transcript_to_text(topics, participants)
-    lines = text_result.split("\n")
-    assert len(lines) >= 2
-    assert any("Alice:" in line for line in lines)
-    assert any("Bob:" in line for line in lines)
-
-    timestamped_result = transcript_to_text_timestamped(topics, participants)
-    timestamped_lines = timestamped_result.split("\n")
-    assert len(timestamped_lines) >= 2
-    assert any("Alice:" in line for line in timestamped_lines)
-    assert any("Bob:" in line for line in timestamped_lines)
-    assert any("[00:00]" in line for line in timestamped_lines)
-
-    webvtt_result = topics_to_webvtt_named(topics, participants)
+    # With is_multitrack=True, should produce 2 segments (one per speaker sentence)
+    # not 4 segments (one per speaker change)
+    webvtt_result = topics_to_webvtt_named(topics, participants, is_multitrack=True)
     expected_webvtt = """WEBVTT
 
 00:00:00.000 --> 00:00:01.000
@@ -320,23 +317,26 @@ async def test_transcript_formats_with_overlapping_speakers():
 """
     assert webvtt_result == expected_webvtt
 
-    segments = transcript_to_json_segments(topics, participants)
-    assert len(segments) >= 2
-    speakers = {seg.speaker for seg in segments}
-    assert 0 in speakers and 1 in speakers
-
-    alice_seg = next(seg for seg in segments if seg.speaker == 0)
-    bob_seg = next(seg for seg in segments if seg.speaker == 1)
-
-    # Verify timestamps overlap: Alice (0.0-1.0) and Bob (0.5-1.5) overlap at 0.5-1.0
-    assert alice_seg.start < bob_seg.end, "Alice segment should start before Bob ends"
-    assert bob_seg.start < alice_seg.end, "Bob segment should start before Alice ends"
-
-    overlap_start = max(alice_seg.start, bob_seg.start)
-    overlap_end = min(alice_seg.end, bob_seg.end)
-    assert (
-        overlap_start < overlap_end
-    ), f"Segments should overlap between {overlap_start} and {overlap_end}"
+    text_result = transcript_to_text(topics, participants, is_multitrack=True)
+    lines = text_result.split("\n")
+    assert len(lines) == 2
+    assert "Alice: Hello there." in lines[0]
+    assert "Bob: I'm good." in lines[1]
+
+    timestamped_result = transcript_to_text_timestamped(
+        topics, participants, is_multitrack=True
+    )
+    timestamped_lines = timestamped_result.split("\n")
+    assert len(timestamped_lines) == 2
+    assert "[00:00] Alice: Hello there." in timestamped_lines[0]
+    assert "[00:00] Bob: I'm good." in timestamped_lines[1]
+
+    segments = transcript_to_json_segments(topics, participants, is_multitrack=True)
+    assert len(segments) == 2
+    assert segments[0].speaker_name == "Alice"
+    assert segments[0].text == "Hello there."
+    assert segments[1].speaker_name == "Bob"
+    assert segments[1].text == "I'm good."
 
 
 @pytest.mark.asyncio
@@ -573,3 +573,207 @@ async def test_api_transcript_format_default_is_text(client):
 
     assert data["transcript_format"] == "text"
     assert "transcript" in data
+
+
+@pytest.mark.asyncio
+async def test_api_topics_endpoint_multitrack_segmentation(client):
+    """Test GET /transcripts/{id}/topics uses sentence-based segmentation for multitrack.
+
+    This tests the fix for TASKS2.md - ensuring /topics endpoints correctly detect
+    multitrack recordings and use sentence-based segmentation instead of fragmenting
+    on every speaker change.
+    """
+    from datetime import datetime, timezone
+
+    from reflector.db.recordings import Recording, recordings_controller
+    from reflector.db.transcripts import (
+        TranscriptParticipant,
+        TranscriptTopic,
+        transcripts_controller,
+    )
+    from reflector.processors.types import Word
+
+    # Create a multitrack recording (has track_keys)
+    recording = Recording(
+        bucket_name="test-bucket",
+        object_key="test-key",
+        recorded_at=datetime.now(timezone.utc),
+        track_keys=["track1.webm", "track2.webm"],  # This makes it multitrack
+    )
+    await recordings_controller.create(recording)
+
+    # Create transcript linked to the recording
+    transcript = await transcripts_controller.add(
+        name="Multitrack Test",
+        source_kind="file",
+        recording_id=recording.id,
+    )
+
+    await transcripts_controller.update(
+        transcript,
+        {
+            "participants": [
+                TranscriptParticipant(id="1", speaker=0, name="Alice").model_dump(),
+                TranscriptParticipant(id="2", speaker=1, name="Bob").model_dump(),
+            ]
+        },
+    )
+
+    # Add interleaved words (as they appear in real multitrack data)
+    await transcripts_controller.upsert_topic(
+        transcript,
+        TranscriptTopic(
+            title="Topic 1",
+            summary="Summary 1",
+            timestamp=0,
+            words=[
+                Word(text="Hello ", start=0.0, end=0.5, speaker=0),
+                Word(text="I'm ", start=0.5, end=0.8, speaker=1),
+                Word(text="there.", start=0.5, end=1.0, speaker=0),
+                Word(text="good.", start=1.0, end=1.5, speaker=1),
+            ],
+        ),
+    )
+
+    # Test /topics endpoint
+    response = await client.get(f"/transcripts/{transcript.id}/topics")
+    assert response.status_code == 200
+    data = response.json()
+
+    assert len(data) == 1
+    topic = data[0]
+
+    # Key assertion: multitrack should produce 2 segments (one per speaker sentence)
+    # Not 4 segments (one per speaker change)
+    assert len(topic["segments"]) == 2
+
+    # Check content
+    segment_texts = [s["text"] for s in topic["segments"]]
+    assert "Hello there." in segment_texts
+    assert "I'm good." in segment_texts
+
+
+@pytest.mark.asyncio
+async def test_api_topics_endpoint_non_multitrack_segmentation(client):
+    """Test GET /transcripts/{id}/topics uses default segmentation for non-multitrack.
+
+    Ensures backward compatibility - transcripts without multitrack recordings
+    should continue using the default speaker-change-based segmentation.
+    """
+    from reflector.db.transcripts import (
+        TranscriptParticipant,
+        TranscriptTopic,
+        transcripts_controller,
+    )
+    from reflector.processors.types import Word
+
+    # Create transcript WITHOUT recording (defaulted as not multitrack) TODO better heuristic
+    response = await client.post("/transcripts", json={"name": "Test transcript"})
+    assert response.status_code == 200
+    tid = response.json()["id"]
+
+    transcript = await transcripts_controller.get_by_id(tid)
+
+    await transcripts_controller.update(
+        transcript,
+        {
+            "participants": [
+                TranscriptParticipant(id="1", speaker=0, name="Alice").model_dump(),
+                TranscriptParticipant(id="2", speaker=1, name="Bob").model_dump(),
+            ]
+        },
+    )
+
+    # Add interleaved words
+    await transcripts_controller.upsert_topic(
+        transcript,
+        TranscriptTopic(
+            title="Topic 1",
+            summary="Summary 1",
+            timestamp=0,
+            words=[
+                Word(text="Hello ", start=0.0, end=0.5, speaker=0),
+                Word(text="I'm ", start=0.5, end=0.8, speaker=1),
+                Word(text="there.", start=0.5, end=1.0, speaker=0),
+                Word(text="good.", start=1.0, end=1.5, speaker=1),
+            ],
+        ),
+    )
+
+    # Test /topics endpoint
+    response = await client.get(f"/transcripts/{tid}/topics")
+    assert response.status_code == 200
+    data = response.json()
+
+    assert len(data) == 1
+    topic = data[0]
+
+    # Non-multitrack: should produce 4 segments (one per speaker change)
+    assert len(topic["segments"]) == 4
+
+
+@pytest.mark.asyncio
+async def test_api_topics_with_words_endpoint_multitrack(client):
+    """Test GET /transcripts/{id}/topics/with-words uses multitrack segmentation."""
+    from datetime import datetime, timezone
+
+    from reflector.db.recordings import Recording, recordings_controller
+    from reflector.db.transcripts import (
+        TranscriptParticipant,
+        TranscriptTopic,
+        transcripts_controller,
+    )
+    from reflector.processors.types import Word
+
+    # Create multitrack recording
+    recording = Recording(
+        bucket_name="test-bucket",
+        object_key="test-key-2",
+        recorded_at=datetime.now(timezone.utc),
+        track_keys=["track1.webm", "track2.webm"],
+    )
+    await recordings_controller.create(recording)
+
+    transcript = await transcripts_controller.add(
+        name="Multitrack Test 2",
+        source_kind="file",
+        recording_id=recording.id,
+    )
+
+    await transcripts_controller.update(
+        transcript,
+        {
+            "participants": [
+                TranscriptParticipant(id="1", speaker=0, name="Alice").model_dump(),
+                TranscriptParticipant(id="2", speaker=1, name="Bob").model_dump(),
+            ]
+        },
+    )
+
+    await transcripts_controller.upsert_topic(
+        transcript,
+        TranscriptTopic(
+            title="Topic 1",
+            summary="Summary 1",
+            timestamp=0,
+            words=[
+                Word(text="Hello ", start=0.0, end=0.5, speaker=0),
+                Word(text="I'm ", start=0.5, end=0.8, speaker=1),
+                Word(text="there.", start=0.5, end=1.0, speaker=0),
+                Word(text="good.", start=1.0, end=1.5, speaker=1),
+            ],
+        ),
+    )
+
+    response = await client.get(f"/transcripts/{transcript.id}/topics/with-words")
+    assert response.status_code == 200
+    data = response.json()
+
+    assert len(data) == 1
+    topic = data[0]
+
+    # Should have 2 segments (multitrack sentence-based)
+    assert len(topic["segments"]) == 2
+    # Should also have words field
+    assert "words" in topic
+    assert len(topic["words"]) == 4