mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2026-03-21 22:56:47 +00:00
Remove ~6.5MB of redundant Word data from Hatchet task boundaries:
- Remove words from TopicChunkInput/TopicChunkResult (child workflow I/O)
- detect_topics maps words from local chunks by chunk_index instead
- TopicsResult carries empty transcript words (persisted to DB already)
- extract_subjects refetches topics from DB instead of task output
- Clear topics at detect_topics start for retry idempotency
186 lines
6.5 KiB
Python
186 lines
6.5 KiB
Python
"""
Tests for Hatchet payload thinning optimizations.

Verifies that:
1. TopicChunkInput no longer carries words
2. TopicChunkResult no longer carries words
3. words_to_segments() matches Transcript.as_segments(is_multitrack=False) — behavioral equivalence
   for the extract_subjects refactoring
4. TopicsResult can be constructed with empty transcript words
"""
|
|
|
|
from reflector.hatchet.workflows.models import TopicChunkResult
|
|
from reflector.hatchet.workflows.topic_chunk_processing import TopicChunkInput
|
|
from reflector.processors.types import Word
|
|
|
|
|
|
def _make_words(speaker: int = 0, start: float = 0.0) -> list[Word]:
    """Build a two-word "Hello world." fixture attributed to one speaker.

    Each word lasts half a second: the first begins at ``start`` and the
    second begins where the first ends.
    """
    midpoint = start + 0.5
    finish = start + 1.0
    hello = Word(text="Hello", start=start, end=midpoint, speaker=speaker)
    world = Word(text=" world.", start=midpoint, end=finish, speaker=speaker)
    return [hello, world]
|
|
|
|
|
|
class TestTopicChunkInputNoWords:
    """TopicChunkInput must not have a words field."""

    def test_no_words_field(self):
        """The model declares no ``words`` field."""
        assert "words" not in TopicChunkInput.model_fields

    def test_construction_without_words(self):
        """The model can be built from the chunk fields alone."""
        inp = TopicChunkInput(
            chunk_index=0, chunk_text="Hello world.", timestamp=0.0, duration=1.0
        )
        assert inp.chunk_index == 0
        assert inp.chunk_text == "Hello world."

    def test_rejects_words_kwarg(self):
        """Passing words= must never put words into the serialized payload.

        Either outcome is acceptable:
        - construction raises ``pydantic.ValidationError`` (extras forbidden), or
        - construction succeeds but the extra kwarg is dropped.

        What must not happen is the extra being kept on the instance, because
        it would then be serialized across the task boundary again.
        """
        import pydantic

        # Keep the try body to the single call that is allowed to raise.
        try:
            inp = TopicChunkInput(
                chunk_index=0,
                chunk_text="text",
                timestamp=0.0,
                duration=1.0,
                words=_make_words(),
            )
        except pydantic.ValidationError:
            return  # Expected when the model forbids extra fields

        # The model accepted the call: the field must still be absent from
        # the schema AND from the dumped payload (guards against extra="allow").
        assert "words" not in TopicChunkInput.model_fields
        assert "words" not in inp.model_dump()
|
|
|
|
|
|
class TestTopicChunkResultNoWords:
    """TopicChunkResult must not have a words field."""

    def test_no_words_field(self):
        """The model declares no ``words`` field."""
        declared = TopicChunkResult.model_fields
        assert "words" not in declared

    def test_construction_without_words(self):
        """The model can be built from its remaining fields alone."""
        fields = {
            "chunk_index": 0,
            "title": "Test",
            "summary": "Summary",
            "timestamp": 0.0,
            "duration": 1.0,
        }
        result = TopicChunkResult(**fields)
        assert result.title == "Test"
        assert result.chunk_index == 0

    def test_serialization_roundtrip(self):
        """Serialized TopicChunkResult has no words key."""
        fields = {
            "chunk_index": 0,
            "title": "Test",
            "summary": "Summary",
            "timestamp": 0.0,
            "duration": 1.0,
        }
        result = TopicChunkResult(**fields)
        dumped = result.model_dump()
        assert "words" not in dumped
        # Rebuilding from the dump must give back an equal model.
        assert TopicChunkResult(**dumped) == result
|
|
|
|
|
|
class TestWordsToSegmentsBehavioralEquivalence:
    """words_to_segments() must produce same output as Transcript.as_segments(is_multitrack=False).

    This ensures the extract_subjects refactoring (from task output topic.transcript.as_segments()
    to words_to_segments(db_topic.words)) preserves identical behavior.
    """

    @staticmethod
    def _assert_same_segments(direct, via_transcript):
        """Assert two segment lists agree pairwise on text, speaker, start and end."""
        # Check the length first so zip() cannot silently drop trailing items.
        assert len(direct) == len(via_transcript)
        for d, v in zip(direct, via_transcript):
            assert d.text == v.text
            assert d.speaker == v.speaker
            assert d.start == v.start
            assert d.end == v.end

    def test_single_speaker(self):
        """Both code paths agree for words from a single speaker."""
        from reflector.processors.types import Transcript as TranscriptType
        from reflector.processors.types import words_to_segments

        words = _make_words(speaker=0)
        direct = words_to_segments(words)
        via_transcript = TranscriptType(words=words).as_segments(is_multitrack=False)

        self._assert_same_segments(direct, via_transcript)

    def test_multiple_speakers(self):
        """Both code paths agree, including timings, across a speaker change."""
        from reflector.processors.types import Transcript as TranscriptType
        from reflector.processors.types import words_to_segments

        words = [
            Word(text="Hello", start=0.0, end=0.5, speaker=0),
            Word(text=" world.", start=0.5, end=1.0, speaker=0),
            Word(text=" How", start=1.0, end=1.5, speaker=1),
            Word(text=" are", start=1.5, end=2.0, speaker=1),
            Word(text=" you?", start=2.0, end=2.5, speaker=1),
        ]

        direct = words_to_segments(words)
        via_transcript = TranscriptType(words=words).as_segments(is_multitrack=False)

        # Compare start/end as well as text/speaker, as test_single_speaker does.
        self._assert_same_segments(direct, via_transcript)

    def test_empty_words(self):
        """Both code paths return an empty list for no words."""
        from reflector.processors.types import Transcript as TranscriptType
        from reflector.processors.types import words_to_segments

        assert words_to_segments([]) == []
        assert TranscriptType(words=[]).as_segments(is_multitrack=False) == []
|
|
|
|
|
|
class TestTopicsResultEmptyWords:
    """TopicsResult can carry topics with empty transcript words."""

    def test_construction_with_empty_words(self):
        """Two topics with empty transcripts are accepted and kept as given."""
        from reflector.hatchet.workflows.models import TopicsResult
        from reflector.processors.types import TitleSummary
        from reflector.processors.types import Transcript as TranscriptType

        # (title, summary, timestamp) for each topic; both last 5 seconds.
        specs = [
            ("Topic A", "Summary A", 0.0),
            ("Topic B", "Summary B", 5.0),
        ]
        topics = [
            TitleSummary(
                title=title,
                summary=summary,
                timestamp=timestamp,
                duration=5.0,
                transcript=TranscriptType(words=[]),
            )
            for title, summary, timestamp in specs
        ]

        result = TopicsResult(topics=topics)

        assert len(result.topics) == 2
        for topic in result.topics:
            assert topic.transcript.words == []

    def test_serialization_roundtrip(self):
        """A dumped TopicsResult rebuilds with its empty transcript words intact."""
        from reflector.hatchet.workflows.models import TopicsResult
        from reflector.processors.types import TitleSummary
        from reflector.processors.types import Transcript as TranscriptType

        only_topic = TitleSummary(
            title="Topic",
            summary="Summary",
            timestamp=0.0,
            duration=1.0,
            transcript=TranscriptType(words=[]),
        )
        result = TopicsResult(topics=[only_topic])

        dumped = result.model_dump()
        reconstructed = TopicsResult(**dumped)

        assert len(reconstructed.topics) == 1
        assert reconstructed.topics[0].transcript.words == []
|