mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-20 12:19:06 +00:00
feat: Multitrack segmentation (#747)
* segmentation multitrack (no-mistakes)
* segmentation multitrack (no-mistakes)
* self review
* self review
* recording poll daily doc
* filter cam_audio tracks to remove screensharing from daily processing
* pr review

---------

Co-authored-by: Igor Loskutov <igor.loskutoff@gmail.com>
@@ -35,8 +35,15 @@ class Recording(BaseModel):
     status: Literal["pending", "processing", "completed", "failed"] = "pending"
     meeting_id: str | None = None
+    # for multitrack reprocessing
+    # track_keys can be empty list [] if recording finished but no audio was captured (silence/muted)
+    # None means not a multitrack recording, [] means multitrack with no tracks
+    track_keys: list[str] | None = None
+
+    @property
+    def is_multitrack(self) -> bool:
+        """True if recording has separate audio tracks (1+ tracks counts as multitrack)."""
+        return self.track_keys is not None and len(self.track_keys) > 0


 class RecordingController:
     async def create(self, recording: Recording):
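A minimal standalone sketch (not part of the diff) of the track_keys semantics above: None marks a regular recording, [] a multitrack recording that captured no audio, and one or more keys a true multitrack recording. RecordingSketch is a hypothetical stand-in for the full Recording model, which has more required fields.

from pydantic import BaseModel

class RecordingSketch(BaseModel):
    # replicates only the track_keys/is_multitrack logic from the diff above
    track_keys: list[str] | None = None

    @property
    def is_multitrack(self) -> bool:
        return self.track_keys is not None and len(self.track_keys) > 0

assert RecordingSketch().is_multitrack is False               # None: not a multitrack recording
assert RecordingSketch(track_keys=[]).is_multitrack is False  # []: multitrack, but no audio captured
assert RecordingSketch(track_keys=["x-cam-audio.webm"]).is_multitrack is True  # 1+ tracks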
@@ -1,6 +1,7 @@
 import io
 import re
 import tempfile
+from collections import defaultdict
 from pathlib import Path
 from typing import Annotated, TypedDict
@@ -16,6 +17,17 @@ class DiarizationSegment(TypedDict):


 PUNC_RE = re.compile(r"[.;:?!…]")
+SENTENCE_END_RE = re.compile(r"[.?!…]$")
+
+# Max segment length for words_to_segments() - breaks on any punctuation (. ; : ? ! …)
+# when segment exceeds this limit. Used for non-multitrack recordings.
+MAX_SEGMENT_CHARS = 120
+
+# Max segment length for words_to_segments_by_sentence() - only breaks on sentence-ending
+# punctuation (. ? ! …) when segment exceeds this limit. Higher threshold allows complete
+# sentences in multitrack recordings where speakers overlap.
+# similar number to server/reflector/processors/transcript_liner.py
+MAX_SENTENCE_SEGMENT_CHARS = 1000


 class AudioFile(BaseModel):
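A quick hedged illustration (not in the diff) of why two patterns exist: PUNC_RE treats any clause punctuation as a potential break point, while SENTENCE_END_RE only fires on a sentence-final mark at the end of a word.

import re

PUNC_RE = re.compile(r"[.;:?!…]")
SENTENCE_END_RE = re.compile(r"[.?!…]$")

assert PUNC_RE.search("well;")                  # ';' can break a segment
assert SENTENCE_END_RE.search("well;") is None  # but it does not end a sentence
assert SENTENCE_END_RE.search("done.")          # '.', '?', '!', '…' at word end do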
@@ -76,7 +88,6 @@ def words_to_segments(words: list[Word]) -> list[TranscriptSegment]:
     # but separate if the speaker changes, or if the punctuation is a . , ; : ? !
     segments = []
     current_segment = None
-    MAX_SEGMENT_LENGTH = 120

     for word in words:
         if current_segment is None:
@@ -106,7 +117,7 @@ def words_to_segments(words: list[Word]) -> list[TranscriptSegment]:
         current_segment.end = word.end

         have_punc = PUNC_RE.search(word.text)
-        if have_punc and (len(current_segment.text) > MAX_SEGMENT_LENGTH):
+        if have_punc and (len(current_segment.text) > MAX_SEGMENT_CHARS):
             segments.append(current_segment)
             current_segment = None
@@ -116,6 +127,70 @@ def words_to_segments(words: list[Word]) -> list[TranscriptSegment]:
     return segments


+def words_to_segments_by_sentence(words: list[Word]) -> list[TranscriptSegment]:
+    """Group words by speaker, then split into sentences.
+
+    For multitrack recordings where words from different speakers are interleaved
+    by timestamp, this function first groups all words by speaker, then creates
+    segments based on sentence boundaries within each speaker's words.
+
+    This produces cleaner output than words_to_segments() which breaks on every
+    speaker change, resulting in many tiny segments when speakers overlap.
+    """
+    if not words:
+        return []
+
+    # Group words by speaker, preserving order within each speaker
+    by_speaker: dict[int, list[Word]] = defaultdict(list)
+    for w in words:
+        by_speaker[w.speaker].append(w)
+
+    segments: list[TranscriptSegment] = []
+
+    for speaker, speaker_words in by_speaker.items():
+        current_text = ""
+        current_start: float | None = None
+        current_end: float = 0.0
+
+        for word in speaker_words:
+            if current_start is None:
+                current_start = word.start
+
+            current_text += word.text
+            current_end = word.end
+
+            # Check for sentence end or max length
+            is_sentence_end = SENTENCE_END_RE.search(word.text.strip())
+            is_too_long = len(current_text) >= MAX_SENTENCE_SEGMENT_CHARS
+
+            if is_sentence_end or is_too_long:
+                segments.append(
+                    TranscriptSegment(
+                        text=current_text,
+                        start=current_start,
+                        end=current_end,
+                        speaker=speaker,
+                    )
+                )
+                current_text = ""
+                current_start = None
+
+        # Flush remaining words for this speaker
+        if current_text and current_start is not None:
+            segments.append(
+                TranscriptSegment(
+                    text=current_text,
+                    start=current_start,
+                    end=current_end,
+                    speaker=speaker,
+                )
+            )
+
+    # Sort segments by start time
+    segments.sort(key=lambda s: s.start)
+    return segments
+
+
 class Transcript(BaseModel):
     translation: str | None = None
     words: list[Word] = []
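For a feel of the algorithm, here is a hedged, self-contained sketch (not part of the diff) of the group-by-speaker, split-on-sentence-end idea, using plain tuples instead of the project's Word/TranscriptSegment types; the max-length fallback and the end-of-stream flush are omitted for brevity.

import re
from collections import defaultdict

SENTENCE_END = re.compile(r"[.?!…]$")

def by_sentence(words):
    """words: list of (text, start, end, speaker) tuples, sorted by start."""
    grouped = defaultdict(list)
    for word in words:
        grouped[word[3]].append(word)  # bucket by speaker, preserving order

    out = []
    for speaker, ws in grouped.items():
        text, start = "", None
        for t, s, e, _ in ws:
            if start is None:
                start = s
            text += t
            if SENTENCE_END.search(t.strip()):  # sentence boundary: emit a segment
                out.append((text, start, e, speaker))
                text, start = "", None

    out.sort(key=lambda seg: seg[1])  # interleave segments back by start time
    return out

print(by_sentence([
    ("Hello ", 0.0, 0.5, 0), ("I'm ", 0.5, 0.8, 1),
    ("there.", 0.5, 1.0, 0), ("good.", 1.0, 1.5, 1),
]))
# [('Hello there.', 0.0, 1.0, 0), ("I'm good.", 0.5, 1.5, 1)]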
@@ -154,7 +229,9 @@ class Transcript(BaseModel):
             word.start += offset
             word.end += offset

-    def as_segments(self) -> list[TranscriptSegment]:
+    def as_segments(self, is_multitrack: bool = False) -> list[TranscriptSegment]:
+        if is_multitrack:
+            return words_to_segments_by_sentence(self.words)
         return words_to_segments(self.words)
@@ -64,6 +64,11 @@ def recording_lock_key(recording_id: NonEmptyString) -> NonEmptyString:
     return f"recording:{recording_id}"


+def filter_cam_audio_tracks(track_keys: list[str]) -> list[str]:
+    """Filter track keys to cam-audio tracks only (skip screen-audio, etc.)."""
+    return [k for k in track_keys if "cam-audio" in k]
+
+
 def extract_base_room_name(daily_room_name: DailyRoomName) -> NonEmptyString:
     """
     Extract base room name from Daily.co timestamped room name.
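A hedged usage example (not in the diff); the key names are invented to mimic Daily.co track keys, and only the "cam-audio" substring check matters.

def filter_cam_audio_tracks(track_keys: list[str]) -> list[str]:
    # copied from the diff above for a self-contained demo
    return [k for k in track_keys if "cam-audio" in k]

keys = [
    "room1/uuid1-cam-audio-1234.webm",     # microphone track: kept
    "room1/uuid1-screen-audio-1234.webm",  # screenshare audio: dropped
    "room1/uuid2-cam-audio-5678.webm",
]
assert filter_cam_audio_tracks(keys) == [
    "room1/uuid1-cam-audio-1234.webm",
    "room1/uuid2-cam-audio-5678.webm",
]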
@@ -6,9 +6,6 @@ from reflector.db.transcripts import TranscriptParticipant, TranscriptTopic
 from reflector.processors.types import (
     Transcript as ProcessorTranscript,
 )
-from reflector.processors.types import (
-    words_to_segments,
-)
 from reflector.schemas.transcript_formats import TranscriptSegment
 from reflector.utils.webvtt import seconds_to_timestamp
@@ -32,7 +29,9 @@ def format_timestamp_mmss(seconds: float | int) -> str:


 def transcript_to_text(
-    topics: list[TranscriptTopic], participants: list[TranscriptParticipant] | None
+    topics: list[TranscriptTopic],
+    participants: list[TranscriptParticipant] | None,
+    is_multitrack: bool = False,
 ) -> str:
     """Convert transcript topics to plain text with speaker names."""
     lines = []
@@ -41,7 +40,7 @@ def transcript_to_text(
             continue

         transcript = ProcessorTranscript(words=topic.words)
-        segments = transcript.as_segments()
+        segments = transcript.as_segments(is_multitrack)

         for segment in segments:
             speaker_name = get_speaker_name(segment.speaker, participants)
@@ -52,7 +51,9 @@


 def transcript_to_text_timestamped(
-    topics: list[TranscriptTopic], participants: list[TranscriptParticipant] | None
+    topics: list[TranscriptTopic],
+    participants: list[TranscriptParticipant] | None,
+    is_multitrack: bool = False,
 ) -> str:
     """Convert transcript topics to timestamped text with speaker names."""
     lines = []
@@ -61,7 +62,7 @@ def transcript_to_text_timestamped(
             continue

         transcript = ProcessorTranscript(words=topic.words)
-        segments = transcript.as_segments()
+        segments = transcript.as_segments(is_multitrack)

         for segment in segments:
             speaker_name = get_speaker_name(segment.speaker, participants)
@@ -73,7 +74,9 @@


 def topics_to_webvtt_named(
-    topics: list[TranscriptTopic], participants: list[TranscriptParticipant] | None
+    topics: list[TranscriptTopic],
+    participants: list[TranscriptParticipant] | None,
+    is_multitrack: bool = False,
 ) -> str:
     """Convert transcript topics to WebVTT format with participant names."""
     vtt = webvtt.WebVTT()
@@ -82,7 +85,8 @@ def topics_to_webvtt_named(
         if not topic.words:
             continue

-        segments = words_to_segments(topic.words)
+        transcript = ProcessorTranscript(words=topic.words)
+        segments = transcript.as_segments(is_multitrack)

         for segment in segments:
             speaker_name = get_speaker_name(segment.speaker, participants)
@@ -100,19 +104,23 @@ def topics_to_webvtt_named(


 def transcript_to_json_segments(
-    topics: list[TranscriptTopic], participants: list[TranscriptParticipant] | None
+    topics: list[TranscriptTopic],
+    participants: list[TranscriptParticipant] | None,
+    is_multitrack: bool = False,
 ) -> list[TranscriptSegment]:
     """Convert transcript topics to a flat list of JSON segments."""
-    segments = []
+    result = []

     for topic in topics:
         if not topic.words:
             continue

         transcript = ProcessorTranscript(words=topic.words)
-        for segment in transcript.as_segments():
+        segments = transcript.as_segments(is_multitrack)
+
+        for segment in segments:
             speaker_name = get_speaker_name(segment.speaker, participants)
-            segments.append(
+            result.append(
                 TranscriptSegment(
                     speaker=segment.speaker,
                     speaker_name=speaker_name,
@@ -122,4 +130,4 @@ def transcript_to_json_segments(
                 )
             )

-    return segments
+    return result
@@ -16,6 +16,7 @@ from pydantic import (

 import reflector.auth as auth
 from reflector.db import get_database
+from reflector.db.recordings import recordings_controller
 from reflector.db.search import (
     DEFAULT_SEARCH_LIMIT,
     SearchLimit,
@@ -60,6 +61,14 @@ ALGORITHM = "HS256"
 DOWNLOAD_EXPIRE_MINUTES = 60


+async def _get_is_multitrack(transcript) -> bool:
+    """Detect if transcript is from multitrack recording."""
+    if not transcript.recording_id:
+        return False
+    recording = await recordings_controller.get_by_id(transcript.recording_id)
+    return recording is not None and recording.is_multitrack
+
+
 def create_access_token(data: dict, expires_delta: timedelta):
     to_encode = data.copy()
     expire = datetime.now(timezone.utc) + expires_delta
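A hedged sketch (not part of the diff) of the detection logic in isolation. FakeRecordingsController and the injected-controller signature are hypothetical stand-ins so the helper can be exercised without a database; the real helper uses the module-level recordings_controller.

import asyncio
from types import SimpleNamespace

class FakeRecordingsController:
    """Hypothetical in-memory stand-in for recordings_controller."""
    def __init__(self, rows):
        self.rows = rows

    async def get_by_id(self, recording_id):
        return self.rows.get(recording_id)

async def get_is_multitrack(transcript, recordings) -> bool:
    # same logic as _get_is_multitrack above, with the controller injected
    if not transcript.recording_id:
        return False
    recording = await recordings.get_by_id(transcript.recording_id)
    return recording is not None and recording.is_multitrack

recordings = FakeRecordingsController({"r1": SimpleNamespace(is_multitrack=True)})
print(asyncio.run(get_is_multitrack(SimpleNamespace(recording_id="r1"), recordings)))    # True
print(asyncio.run(get_is_multitrack(SimpleNamespace(recording_id=None), recordings)))    # False
print(asyncio.run(get_is_multitrack(SimpleNamespace(recording_id="gone"), recordings)))  # False: recording not found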
@@ -360,7 +369,7 @@ class GetTranscriptTopic(BaseModel):
     segments: list[GetTranscriptSegmentTopic] = []

     @classmethod
-    def from_transcript_topic(cls, topic: TranscriptTopic):
+    def from_transcript_topic(cls, topic: TranscriptTopic, is_multitrack: bool = False):
         if not topic.words:
             # In previous version, words were missing
             # Just output a segment with speaker 0
@@ -384,7 +393,7 @@ class GetTranscriptTopic(BaseModel):
                 start=segment.start,
                 speaker=segment.speaker,
             )
-            for segment in transcript.as_segments()
+            for segment in transcript.as_segments(is_multitrack)
         ]
         return cls(
             id=topic.id,
@@ -401,8 +410,8 @@ class GetTranscriptTopicWithWords(GetTranscriptTopic):
     words: list[Word] = []

     @classmethod
-    def from_transcript_topic(cls, topic: TranscriptTopic):
-        instance = super().from_transcript_topic(topic)
+    def from_transcript_topic(cls, topic: TranscriptTopic, is_multitrack: bool = False):
+        instance = super().from_transcript_topic(topic, is_multitrack)
         if topic.words:
             instance.words = topic.words
         return instance
@@ -417,8 +426,8 @@ class GetTranscriptTopicWithWordsPerSpeaker(GetTranscriptTopic):
     words_per_speaker: list[SpeakerWords] = []

     @classmethod
-    def from_transcript_topic(cls, topic: TranscriptTopic):
-        instance = super().from_transcript_topic(topic)
+    def from_transcript_topic(cls, topic: TranscriptTopic, is_multitrack: bool = False):
+        instance = super().from_transcript_topic(topic, is_multitrack)
         if topic.words:
             words_per_speakers = []
             # group words by speaker
@@ -457,6 +466,8 @@ async def transcript_get(
         transcript_id, user_id=user_id
     )

+    is_multitrack = await _get_is_multitrack(transcript)
+
     base_data = {
         "id": transcript.id,
         "user_id": transcript.user_id,
@@ -483,14 +494,16 @@ async def transcript_get(
         return GetTranscriptWithText(
             **base_data,
             transcript_format="text",
-            transcript=transcript_to_text(transcript.topics, transcript.participants),
+            transcript=transcript_to_text(
+                transcript.topics, transcript.participants, is_multitrack
+            ),
         )
     elif transcript_format == "text-timestamped":
         return GetTranscriptWithTextTimestamped(
             **base_data,
             transcript_format="text-timestamped",
             transcript=transcript_to_text_timestamped(
-                transcript.topics, transcript.participants
+                transcript.topics, transcript.participants, is_multitrack
             ),
         )
     elif transcript_format == "webvtt-named":
@@ -498,7 +511,7 @@ async def transcript_get(
             **base_data,
             transcript_format="webvtt-named",
             transcript=topics_to_webvtt_named(
-                transcript.topics, transcript.participants
+                transcript.topics, transcript.participants, is_multitrack
             ),
         )
     elif transcript_format == "json":
@@ -506,7 +519,7 @@ async def transcript_get(
             **base_data,
             transcript_format="json",
             transcript=transcript_to_json_segments(
-                transcript.topics, transcript.participants
+                transcript.topics, transcript.participants, is_multitrack
             ),
         )
     else:
@@ -565,9 +578,12 @@ async def transcript_get_topics(
         transcript_id, user_id=user_id
     )

+    is_multitrack = await _get_is_multitrack(transcript)
+
     # convert to GetTranscriptTopic
     return [
-        GetTranscriptTopic.from_transcript_topic(topic) for topic in transcript.topics
+        GetTranscriptTopic.from_transcript_topic(topic, is_multitrack)
+        for topic in transcript.topics
     ]


@@ -584,9 +600,11 @@ async def transcript_get_topics_with_words(
         transcript_id, user_id=user_id
     )

+    is_multitrack = await _get_is_multitrack(transcript)
+
     # convert to GetTranscriptTopicWithWords
     return [
-        GetTranscriptTopicWithWords.from_transcript_topic(topic)
+        GetTranscriptTopicWithWords.from_transcript_topic(topic, is_multitrack)
         for topic in transcript.topics
     ]

@@ -605,13 +623,17 @@ async def transcript_get_topics_with_words_per_speaker(
         transcript_id, user_id=user_id
     )

+    is_multitrack = await _get_is_multitrack(transcript)
+
     # get the topic from the transcript
     topic = next((t for t in transcript.topics if t.id == topic_id), None)
     if not topic:
         raise HTTPException(status_code=404, detail="Topic not found")

     # convert to GetTranscriptTopicWithWordsPerSpeaker
-    return GetTranscriptTopicWithWordsPerSpeaker.from_transcript_topic(topic)
+    return GetTranscriptTopicWithWordsPerSpeaker.from_transcript_topic(
+        topic, is_multitrack
+    )


 @router.post("/transcripts/{transcript_id}/zulip")
@@ -2,6 +2,7 @@ import json
 import os
 import re
 from datetime import datetime, timezone
+from typing import List
 from urllib.parse import unquote

 import av
@@ -11,7 +12,7 @@ from celery import shared_task
 from celery.utils.log import get_task_logger
 from pydantic import ValidationError

-from reflector.dailyco_api import MeetingParticipantsResponse
+from reflector.dailyco_api import MeetingParticipantsResponse, RecordingResponse
 from reflector.db.daily_participant_sessions import (
     DailyParticipantSession,
     daily_participant_sessions_controller,
@@ -38,6 +39,7 @@ from reflector.storage import get_transcripts_storage
 from reflector.utils.daily import (
     DailyRoomName,
     extract_base_room_name,
+    filter_cam_audio_tracks,
     parse_daily_recording_filename,
     recording_lock_key,
 )
@@ -338,7 +340,9 @@ async def _process_multitrack_recording_inner(
             exc_info=True,
         )

-    for idx, key in enumerate(track_keys):
+    cam_audio_keys = filter_cam_audio_tracks(track_keys)
+
+    for idx, key in enumerate(cam_audio_keys):
         try:
             parsed = parse_daily_recording_filename(key)
             participant_id = parsed.participant_id
@@ -366,7 +370,7 @@ async def _process_multitrack_recording_inner(
     task_pipeline_multitrack_process.delay(
         transcript_id=transcript.id,
         bucket_name=bucket_name,
-        track_keys=track_keys,
+        track_keys=filter_cam_audio_tracks(track_keys),
     )
@@ -391,7 +395,7 @@ async def poll_daily_recordings():

     async with create_platform_client("daily") as daily_client:
         # latest 100. TODO cursor-based state
-        api_recordings = await daily_client.list_recordings()
+        api_recordings: List[RecordingResponse] = await daily_client.list_recordings()

         if not api_recordings:
             logger.debug(
@@ -422,17 +426,19 @@ async def poll_daily_recordings():

     for recording in missing_recordings:
         if not recording.tracks:
-            assert recording.status != "finished", (
-                f"Recording {recording.id} has status='finished' but no tracks. "
-                f"Daily.co API guarantees finished recordings have tracks available. "
-                f"room_name={recording.room_name}"
-            )
-            logger.debug(
-                "No tracks in recording yet",
-                recording_id=recording.id,
-                room_name=recording.room_name,
-                status=recording.status,
-            )
+            if recording.status == "finished":
+                logger.warning(
+                    "Finished recording has no tracks (no audio captured)",
+                    recording_id=recording.id,
+                    room_name=recording.room_name,
+                )
+            else:
+                logger.debug(
+                    "No tracks in recording yet",
+                    recording_id=recording.id,
+                    room_name=recording.room_name,
+                    status=recording.status,
+                )
             continue

         track_keys = [t.s3Key for t in recording.tracks if t.type == "audio"]
@@ -159,3 +159,78 @@ def test_processor_transcript_segment():
     assert segments[3].start == 30.72
     assert segments[4].start == 31.56
     assert segments[5].start == 32.38
+
+
+def test_processor_transcript_segment_multitrack_interleaved():
+    """Test as_segments(is_multitrack=True) with interleaved speakers.
+
+    Multitrack recordings have words from different speakers sorted by start time,
+    causing frequent speaker alternation. The multitrack mode should group by
+    speaker first, then split into sentences.
+    """
+    from reflector.processors.types import Transcript, Word
+
+    # Simulate real multitrack data: words sorted by start time, speakers interleave
+    # Speaker 0 says: "Hello there."
+    # Speaker 1 says: "I'm good."
+    # When sorted by time, words interleave
+    transcript = Transcript(
+        words=[
+            Word(text="Hello ", start=0.0, end=0.5, speaker=0),
+            Word(text="I'm ", start=0.5, end=0.8, speaker=1),
+            Word(text="there.", start=0.5, end=1.0, speaker=0),
+            Word(text="good.", start=1.0, end=1.5, speaker=1),
+        ]
+    )
+
+    # Default behavior (is_multitrack=False): breaks on every speaker change = 4 segments
+    segments_default = transcript.as_segments(is_multitrack=False)
+    assert len(segments_default) == 4
+
+    # Multitrack behavior: groups by speaker, then sentences = 2 segments
+    segments_multitrack = transcript.as_segments(is_multitrack=True)
+    assert len(segments_multitrack) == 2
+
+    # Check content - sorted by start time
+    assert segments_multitrack[0].speaker == 0
+    assert segments_multitrack[0].text == "Hello there."
+    assert segments_multitrack[0].start == 0.0
+    assert segments_multitrack[0].end == 1.0
+
+    assert segments_multitrack[1].speaker == 1
+    assert segments_multitrack[1].text == "I'm good."
+    assert segments_multitrack[1].start == 0.5
+    assert segments_multitrack[1].end == 1.5
+
+
+def test_processor_transcript_segment_multitrack_overlapping_timestamps():
+    """Test multitrack with exactly overlapping timestamps (real Daily.co data pattern)."""
+    from reflector.processors.types import Transcript, Word
+
+    # Real pattern from transcript 38d84d57: words with identical timestamps
+    transcript = Transcript(
+        words=[
+            Word(text="speaking ", start=6.71, end=7.11, speaker=0),
+            Word(text="Speaking ", start=6.71, end=7.11, speaker=1),
+            Word(text="at ", start=7.11, end=7.27, speaker=0),
+            Word(text="at ", start=7.11, end=7.27, speaker=1),
+            Word(text="the ", start=7.27, end=7.43, speaker=0),
+            Word(text="the ", start=7.27, end=7.43, speaker=1),
+            Word(text="same ", start=7.43, end=7.59, speaker=0),
+            Word(text="same ", start=7.43, end=7.59, speaker=1),
+            Word(text="time.", start=7.59, end=8.0, speaker=0),
+            Word(text="time.", start=7.59, end=8.0, speaker=1),
+        ]
+    )
+
+    # Default: 10 segments (one per speaker change)
+    segments_default = transcript.as_segments(is_multitrack=False)
+    assert len(segments_default) == 10
+
+    # Multitrack: 2 segments (one per speaker sentence)
+    segments_multitrack = transcript.as_segments(is_multitrack=True)
+    assert len(segments_multitrack) == 2
+
+    # Both should have complete sentences
+    assert "speaking at the same time." in segments_multitrack[0].text
+    assert "Speaking at the same time." in segments_multitrack[1].text
@@ -273,8 +273,17 @@ async def test_transcript_formats_with_multiple_speakers():


 @pytest.mark.asyncio
-async def test_transcript_formats_with_overlapping_speakers():
-    """Test format conversion when multiple speakers speak at the same time (overlapping timestamps)."""
+async def test_transcript_formats_with_overlapping_speakers_multitrack():
+    """Test format conversion for multitrack recordings with truly interleaved words.
+
+    Multitrack recordings have words from different speakers sorted by start time,
+    causing frequent speaker alternation. This tests the sentence-based segmentation
+    that groups each speaker's words into complete sentences.
+    """
+    # Real multitrack data: words sorted by start time, speakers interleave
+    # Alice says: "Hello there." (0.0-1.0)
+    # Bob says: "I'm good." (0.5-1.5)
+    # When sorted by time, words interleave: Hello, I'm, there., good.
     topics = [
         TranscriptTopic(
             id="1",
@@ -282,11 +291,10 @@
             summary="Summary 1",
             timestamp=0.0,
             words=[
-                Word(text="Hello", start=0.0, end=0.5, speaker=0),
-                Word(text=" there.", start=0.5, end=1.0, speaker=0),
-                # Speaker 1 overlaps with speaker 0 at 0.5-1.0
-                Word(text="I'm", start=0.5, end=1.0, speaker=1),
-                Word(text=" good.", start=1.0, end=1.5, speaker=1),
+                Word(text="Hello ", start=0.0, end=0.5, speaker=0),
+                Word(text="I'm ", start=0.5, end=0.8, speaker=1),
+                Word(text="there.", start=0.5, end=1.0, speaker=0),
+                Word(text="good.", start=1.0, end=1.5, speaker=1),
             ],
         ),
     ]
@@ -296,20 +304,9 @@
         TranscriptParticipant(id="2", speaker=1, name="Bob"),
     ]

-    text_result = transcript_to_text(topics, participants)
-    lines = text_result.split("\n")
-    assert len(lines) >= 2
-    assert any("Alice:" in line for line in lines)
-    assert any("Bob:" in line for line in lines)
-
-    timestamped_result = transcript_to_text_timestamped(topics, participants)
-    timestamped_lines = timestamped_result.split("\n")
-    assert len(timestamped_lines) >= 2
-    assert any("Alice:" in line for line in timestamped_lines)
-    assert any("Bob:" in line for line in timestamped_lines)
-    assert any("[00:00]" in line for line in timestamped_lines)
-
-    webvtt_result = topics_to_webvtt_named(topics, participants)
+    # With is_multitrack=True, should produce 2 segments (one per speaker sentence)
+    # not 4 segments (one per speaker change)
+    webvtt_result = topics_to_webvtt_named(topics, participants, is_multitrack=True)
     expected_webvtt = """WEBVTT

00:00:00.000 --> 00:00:01.000
@@ -320,23 +317,26 @@
"""
     assert webvtt_result == expected_webvtt

-    segments = transcript_to_json_segments(topics, participants)
-    assert len(segments) >= 2
-    speakers = {seg.speaker for seg in segments}
-    assert 0 in speakers and 1 in speakers
+    text_result = transcript_to_text(topics, participants, is_multitrack=True)
+    lines = text_result.split("\n")
+    assert len(lines) == 2
+    assert "Alice: Hello there." in lines[0]
+    assert "Bob: I'm good." in lines[1]

-    alice_seg = next(seg for seg in segments if seg.speaker == 0)
-    bob_seg = next(seg for seg in segments if seg.speaker == 1)
+    timestamped_result = transcript_to_text_timestamped(
+        topics, participants, is_multitrack=True
+    )
+    timestamped_lines = timestamped_result.split("\n")
+    assert len(timestamped_lines) == 2
+    assert "[00:00] Alice: Hello there." in timestamped_lines[0]
+    assert "[00:00] Bob: I'm good." in timestamped_lines[1]

-    # Verify timestamps overlap: Alice (0.0-1.0) and Bob (0.5-1.5) overlap at 0.5-1.0
-    assert alice_seg.start < bob_seg.end, "Alice segment should start before Bob ends"
-    assert bob_seg.start < alice_seg.end, "Bob segment should start before Alice ends"
-
-    overlap_start = max(alice_seg.start, bob_seg.start)
-    overlap_end = min(alice_seg.end, bob_seg.end)
-    assert (
-        overlap_start < overlap_end
-    ), f"Segments should overlap between {overlap_start} and {overlap_end}"
+    segments = transcript_to_json_segments(topics, participants, is_multitrack=True)
+    assert len(segments) == 2
+    assert segments[0].speaker_name == "Alice"
+    assert segments[0].text == "Hello there."
+    assert segments[1].speaker_name == "Bob"
+    assert segments[1].text == "I'm good."


 @pytest.mark.asyncio
@@ -573,3 +573,207 @@ async def test_api_transcript_format_default_is_text(client):

     assert data["transcript_format"] == "text"
     assert "transcript" in data
+
+
+@pytest.mark.asyncio
+async def test_api_topics_endpoint_multitrack_segmentation(client):
+    """Test GET /transcripts/{id}/topics uses sentence-based segmentation for multitrack.
+
+    This tests the fix for TASKS2.md - ensuring /topics endpoints correctly detect
+    multitrack recordings and use sentence-based segmentation instead of fragmenting
+    on every speaker change.
+    """
+    from datetime import datetime, timezone
+
+    from reflector.db.recordings import Recording, recordings_controller
+    from reflector.db.transcripts import (
+        TranscriptParticipant,
+        TranscriptTopic,
+        transcripts_controller,
+    )
+    from reflector.processors.types import Word
+
+    # Create a multitrack recording (has track_keys)
+    recording = Recording(
+        bucket_name="test-bucket",
+        object_key="test-key",
+        recorded_at=datetime.now(timezone.utc),
+        track_keys=["track1.webm", "track2.webm"],  # This makes it multitrack
+    )
+    await recordings_controller.create(recording)
+
+    # Create transcript linked to the recording
+    transcript = await transcripts_controller.add(
+        name="Multitrack Test",
+        source_kind="file",
+        recording_id=recording.id,
+    )
+
+    await transcripts_controller.update(
+        transcript,
+        {
+            "participants": [
+                TranscriptParticipant(id="1", speaker=0, name="Alice").model_dump(),
+                TranscriptParticipant(id="2", speaker=1, name="Bob").model_dump(),
+            ]
+        },
+    )
+
+    # Add interleaved words (as they appear in real multitrack data)
+    await transcripts_controller.upsert_topic(
+        transcript,
+        TranscriptTopic(
+            title="Topic 1",
+            summary="Summary 1",
+            timestamp=0,
+            words=[
+                Word(text="Hello ", start=0.0, end=0.5, speaker=0),
+                Word(text="I'm ", start=0.5, end=0.8, speaker=1),
+                Word(text="there.", start=0.5, end=1.0, speaker=0),
+                Word(text="good.", start=1.0, end=1.5, speaker=1),
+            ],
+        ),
+    )
+
+    # Test /topics endpoint
+    response = await client.get(f"/transcripts/{transcript.id}/topics")
+    assert response.status_code == 200
+    data = response.json()
+
+    assert len(data) == 1
+    topic = data[0]
+
+    # Key assertion: multitrack should produce 2 segments (one per speaker sentence)
+    # Not 4 segments (one per speaker change)
+    assert len(topic["segments"]) == 2
+
+    # Check content
+    segment_texts = [s["text"] for s in topic["segments"]]
+    assert "Hello there." in segment_texts
+    assert "I'm good." in segment_texts
+
+
+@pytest.mark.asyncio
+async def test_api_topics_endpoint_non_multitrack_segmentation(client):
+    """Test GET /transcripts/{id}/topics uses default segmentation for non-multitrack.
+
+    Ensures backward compatibility - transcripts without multitrack recordings
+    should continue using the default speaker-change-based segmentation.
+    """
+    from reflector.db.transcripts import (
+        TranscriptParticipant,
+        TranscriptTopic,
+        transcripts_controller,
+    )
+    from reflector.processors.types import Word
+
+    # Create transcript WITHOUT recording (defaulted as not multitrack) TODO better heuristic
+    response = await client.post("/transcripts", json={"name": "Test transcript"})
+    assert response.status_code == 200
+    tid = response.json()["id"]
+
+    transcript = await transcripts_controller.get_by_id(tid)
+
+    await transcripts_controller.update(
+        transcript,
+        {
+            "participants": [
+                TranscriptParticipant(id="1", speaker=0, name="Alice").model_dump(),
+                TranscriptParticipant(id="2", speaker=1, name="Bob").model_dump(),
+            ]
+        },
+    )
+
+    # Add interleaved words
+    await transcripts_controller.upsert_topic(
+        transcript,
+        TranscriptTopic(
+            title="Topic 1",
+            summary="Summary 1",
+            timestamp=0,
+            words=[
+                Word(text="Hello ", start=0.0, end=0.5, speaker=0),
+                Word(text="I'm ", start=0.5, end=0.8, speaker=1),
+                Word(text="there.", start=0.5, end=1.0, speaker=0),
+                Word(text="good.", start=1.0, end=1.5, speaker=1),
+            ],
+        ),
+    )
+
+    # Test /topics endpoint
+    response = await client.get(f"/transcripts/{tid}/topics")
+    assert response.status_code == 200
+    data = response.json()
+
+    assert len(data) == 1
+    topic = data[0]
+
+    # Non-multitrack: should produce 4 segments (one per speaker change)
+    assert len(topic["segments"]) == 4
+
+
+@pytest.mark.asyncio
+async def test_api_topics_with_words_endpoint_multitrack(client):
+    """Test GET /transcripts/{id}/topics/with-words uses multitrack segmentation."""
+    from datetime import datetime, timezone
+
+    from reflector.db.recordings import Recording, recordings_controller
+    from reflector.db.transcripts import (
+        TranscriptParticipant,
+        TranscriptTopic,
+        transcripts_controller,
+    )
+    from reflector.processors.types import Word
+
+    # Create multitrack recording
+    recording = Recording(
+        bucket_name="test-bucket",
+        object_key="test-key-2",
+        recorded_at=datetime.now(timezone.utc),
+        track_keys=["track1.webm", "track2.webm"],
+    )
+    await recordings_controller.create(recording)
+
+    transcript = await transcripts_controller.add(
+        name="Multitrack Test 2",
+        source_kind="file",
+        recording_id=recording.id,
+    )
+
+    await transcripts_controller.update(
+        transcript,
+        {
+            "participants": [
+                TranscriptParticipant(id="1", speaker=0, name="Alice").model_dump(),
+                TranscriptParticipant(id="2", speaker=1, name="Bob").model_dump(),
+            ]
+        },
+    )
+
+    await transcripts_controller.upsert_topic(
+        transcript,
+        TranscriptTopic(
+            title="Topic 1",
+            summary="Summary 1",
+            timestamp=0,
+            words=[
+                Word(text="Hello ", start=0.0, end=0.5, speaker=0),
+                Word(text="I'm ", start=0.5, end=0.8, speaker=1),
+                Word(text="there.", start=0.5, end=1.0, speaker=0),
+                Word(text="good.", start=1.0, end=1.5, speaker=1),
+            ],
+        ),
+    )
+
+    response = await client.get(f"/transcripts/{transcript.id}/topics/with-words")
+    assert response.status_code == 200
+    data = response.json()
+
+    assert len(data) == 1
+    topic = data[0]
+
+    # Should have 2 segments (multitrack sentence-based)
+    assert len(topic["segments"]) == 2
+    # Should also have words field
+    assert "words" in topic
+    assert len(topic["words"]) == 4