mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-20 20:29:06 +00:00
* docs: transient docs * chore: cleanup * webvtt WIP * webvtt field * chore: webvtt tests comments * chore: remove useless tests * feat: search TASK.md * feat: full text search by title/webvtt * chore: search api task * feat: search api * feat: search API * chore: rm task md * chore: roll back unnecessary validators * chore: pr review WIP * chore: pr review WIP * chore: pr review * chore: top imports * feat: better lint + ci * feat: better lint + ci * feat: better lint + ci * feat: better lint + ci * chore: lint * chore: lint * fix: db datetime definitions * fix: flush() params * fix: update transcript mutability expectation / test * fix: update transcript mutability expectation / test * chore: auto review * chore: new controller extraction * chore: new controller extraction * chore: cleanup * chore: review WIP * chore: pr WIP * chore: remove ci lint * chore: openapi regeneration * chore: openapi regeneration * chore: postgres test doc * fix: .dockerignore for arm binaries * fix: .dockerignore for arm binaries * fix: cap test loops * fix: cap test loops * fix: cap test loops * fix: get_transcript_topics * chore: remove flow.md docs and claude guidance * chore: remove claude.md db doc * chore: remove claude.md db doc * chore: remove claude.md db doc * chore: remove claude.md db doc
64 lines
1.8 KiB
Python
64 lines
1.8 KiB
Python
"""WebVTT utilities for generating subtitle files from transcript data."""
|
|
|
|
from typing import TYPE_CHECKING, Annotated
|
|
|
|
import webvtt
|
|
|
|
from reflector.processors.types import Seconds, Word, words_to_segments
|
|
|
|
if TYPE_CHECKING:
|
|
from reflector.db.transcripts import TranscriptTopic
|
|
|
|
VttTimestamp = Annotated[str, "vtt_timestamp"]
|
|
WebVTTStr = Annotated[str, "webvtt_str"]
|
|
|
|
|
|
def _seconds_to_timestamp(seconds: Seconds) -> VttTimestamp:
|
|
# lib doesn't do that
|
|
hours = int(seconds // 3600)
|
|
minutes = int((seconds % 3600) // 60)
|
|
secs = int(seconds % 60)
|
|
milliseconds = int((seconds % 1) * 1000)
|
|
|
|
return f"{hours:02d}:{minutes:02d}:{secs:02d}.{milliseconds:03d}"
|
|
|
|
|
|
def words_to_webvtt(words: list[Word]) -> WebVTTStr:
    """Build a WebVTT document from words, one caption per segment.

    Segmentation is delegated to ``words_to_segments``. Each resulting
    segment becomes a caption spanning ``segment.start`` to ``segment.end``
    whose text is the stripped segment text prefixed with a
    ``<v Speaker{speaker}>`` voice tag.

    Args:
        words: Words to render; an empty list yields a document without
            captions.

    Returns:
        The ``content`` of the assembled ``webvtt.WebVTT`` object.
    """
    document = webvtt.WebVTT()

    if words:
        for seg in words_to_segments(words):
            # The library does not emit voice tags, so prepend one by hand.
            cue_text = f"<v Speaker{seg.speaker}>{seg.text.strip()}"
            document.captions.append(
                webvtt.Caption(
                    start=_seconds_to_timestamp(seg.start),
                    end=_seconds_to_timestamp(seg.end),
                    text=cue_text,
                )
            )

    return document.content
|
|
|
|
|
|
def topics_to_webvtt(topics: list["TranscriptTopic"]) -> WebVTTStr:
    """Render the words of all topics as one WebVTT document.

    The ``words`` of every topic are concatenated in topic order, checked to
    be sorted by non-decreasing ``start``, and passed to ``words_to_webvtt``.

    Args:
        topics: Topics whose ``words`` are to be rendered. An empty list
            yields the content of an empty ``webvtt.WebVTT`` document.

    Returns:
        The WebVTT content produced for the combined words.

    Raises:
        AssertionError: If a word starts earlier than the word before it.
    """
    if not topics:
        empty_document = webvtt.WebVTT()
        return empty_document.content

    # Flatten every topic's words into one list, preserving topic order.
    all_words: list[Word] = [word for topic in topics for word in topic.words]

    # Sanity check: each word must not start before its predecessor.
    # ``i`` runs over every index that has a successor.
    for i, _ in enumerate(all_words[1:]):
        assert (
            all_words[i].start <= all_words[i + 1].start
        ), f"Words are not in sequence: {all_words[i].text} and {all_words[i + 1].text} are not consecutive: {all_words[i].start} > {all_words[i + 1].start}"

    return words_to_webvtt(all_words)
|