Files
reflector/server/reflector/utils/webvtt.py
Igor Loskutov 6fb5cb21c2 feat: search backend (#537)
* docs: transient docs

* chore: cleanup

* webvtt WIP

* webvtt field

* chore: webvtt tests comments

* chore: remove useless tests

* feat: search TASK.md

* feat: full text search by title/webvtt

* chore: search api task

* feat: search api

* feat: search API

* chore: rm task md

* chore: roll back unnecessary validators

* chore: pr review WIP

* chore: pr review WIP

* chore: pr review

* chore: top imports

* feat: better lint + ci

* feat: better lint + ci

* feat: better lint + ci

* feat: better lint + ci

* chore: lint

* chore: lint

* fix: db datetime definitions

* fix: flush() params

* fix: update transcript mutability expectation / test

* fix: update transcript mutability expectation / test

* chore: auto review

* chore: new controller extraction

* chore: new controller extraction

* chore: cleanup

* chore: review WIP

* chore: pr WIP

* chore: remove ci lint

* chore: openapi regeneration

* chore: openapi regeneration

* chore: postgres test doc

* fix: .dockerignore for arm binaries

* fix: .dockerignore for arm binaries

* fix: cap test loops

* fix: cap test loops

* fix: cap test loops

* fix: get_transcript_topics

* chore: remove flow.md docs and claude guidance

* chore: remove claude.md db doc

* chore: remove claude.md db doc

* chore: remove claude.md db doc

* chore: remove claude.md db doc
2025-08-13 10:03:38 -04:00

64 lines
1.8 KiB
Python

"""WebVTT utilities for generating subtitle files from transcript data."""
from typing import TYPE_CHECKING, Annotated
import webvtt
from reflector.processors.types import Seconds, Word, words_to_segments
if TYPE_CHECKING:
from reflector.db.transcripts import TranscriptTopic
VttTimestamp = Annotated[str, "vtt_timestamp"]
WebVTTStr = Annotated[str, "webvtt_str"]
def _seconds_to_timestamp(seconds: Seconds) -> VttTimestamp:
# lib doesn't do that
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = int(seconds % 60)
milliseconds = int((seconds % 1) * 1000)
return f"{hours:02d}:{minutes:02d}:{secs:02d}.{milliseconds:03d}"
def words_to_webvtt(words: list[Word]) -> WebVTTStr:
"""Convert words to WebVTT using existing segmentation logic."""
vtt = webvtt.WebVTT()
if not words:
return vtt.content
segments = words_to_segments(words)
for segment in segments:
text = segment.text.strip()
# lib doesn't do that
text = f"<v Speaker{segment.speaker}>{text}"
caption = webvtt.Caption(
start=_seconds_to_timestamp(segment.start),
end=_seconds_to_timestamp(segment.end),
text=text,
)
vtt.captions.append(caption)
return vtt.content
def topics_to_webvtt(topics: list["TranscriptTopic"]) -> WebVTTStr:
if not topics:
return webvtt.WebVTT().content
all_words: list[Word] = []
for topic in topics:
all_words.extend(topic.words)
# assert it's in sequence
for i in range(len(all_words) - 1):
assert (
all_words[i].start <= all_words[i + 1].start
), f"Words are not in sequence: {all_words[i].text} and {all_words[i + 1].text} are not consecutive: {all_words[i].start} > {all_words[i + 1].start}"
return words_to_webvtt(all_words)