mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-22 13:19:05 +00:00
feat: search backend (#537)
* docs: transient docs * chore: cleanup * webvtt WIP * webvtt field * chore: webvtt tests comments * chore: remove useless tests * feat: search TASK.md * feat: full text search by title/webvtt * chore: search api task * feat: search api * feat: search API * chore: rm task md * chore: roll back unnecessary validators * chore: pr review WIP * chore: pr review WIP * chore: pr review * chore: top imports * feat: better lint + ci * feat: better lint + ci * feat: better lint + ci * feat: better lint + ci * chore: lint * chore: lint * fix: db datetime definitions * fix: flush() params * fix: update transcript mutability expectation / test * fix: update transcript mutability expectation / test * chore: auto review * chore: new controller extraction * chore: new controller extraction * chore: cleanup * chore: review WIP * chore: pr WIP * chore: remove ci lint * chore: openapi regeneration * chore: openapi regeneration * chore: postgres test doc * fix: .dockerignore for arm binaries * fix: .dockerignore for arm binaries * fix: cap test loops * fix: cap test loops * fix: cap test loops * fix: get_transcript_topics * chore: remove flow.md docs and claude guidance * chore: remove claude.md db doc * chore: remove claude.md db doc * chore: remove claude.md db doc * chore: remove claude.md db doc
This commit is contained in:
63
server/reflector/utils/webvtt.py
Normal file
63
server/reflector/utils/webvtt.py
Normal file
@@ -0,0 +1,63 @@
|
||||
"""WebVTT utilities for generating subtitle files from transcript data."""
|
||||
|
||||
from typing import TYPE_CHECKING, Annotated
|
||||
|
||||
import webvtt
|
||||
|
||||
from reflector.processors.types import Seconds, Word, words_to_segments
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from reflector.db.transcripts import TranscriptTopic
|
||||
|
||||
# Alias for a string holding a WebVTT cue timestamp as built by
# _seconds_to_timestamp ("HH:MM:SS.mmm"); the Annotated tag is metadata only.
VttTimestamp = Annotated[str, "vtt_timestamp"]
|
||||
# Alias for a string holding serialized WebVTT content (the `.content` of a
# `webvtt.WebVTT` object); the Annotated tag is metadata only.
WebVTTStr = Annotated[str, "webvtt_str"]
|
||||
|
||||
|
||||
def _seconds_to_timestamp(seconds: Seconds) -> VttTimestamp:
|
||||
# lib doesn't do that
|
||||
hours = int(seconds // 3600)
|
||||
minutes = int((seconds % 3600) // 60)
|
||||
secs = int(seconds % 60)
|
||||
milliseconds = int((seconds % 1) * 1000)
|
||||
|
||||
return f"{hours:02d}:{minutes:02d}:{secs:02d}.{milliseconds:03d}"
|
||||
|
||||
|
||||
def words_to_webvtt(words: list[Word]) -> WebVTTStr:
    """Build WebVTT content from words, emitting one cue per segment.

    Words are grouped with ``words_to_segments``; each resulting segment
    becomes a caption whose text is prefixed with a ``<v SpeakerN>`` voice
    tag. An empty ``words`` list yields the content of an empty WebVTT
    document.
    """
    document = webvtt.WebVTT()

    if words:
        for seg in words_to_segments(words):
            stripped = seg.text.strip()
            # The voice tag is written by hand: per the original note, the
            # webvtt library does not add it.
            cue_text = f"<v Speaker{seg.speaker}>{stripped}"

            cue = webvtt.Caption(
                start=_seconds_to_timestamp(seg.start),
                end=_seconds_to_timestamp(seg.end),
                text=cue_text,
            )
            document.captions.append(cue)

    return document.content
|
||||
|
||||
|
||||
def topics_to_webvtt(topics: list["TranscriptTopic"]) -> WebVTTStr:
    """Build WebVTT content from the words of all topics, in topic order.

    The ``words`` of every topic are concatenated and handed to
    ``words_to_webvtt``. An empty ``topics`` list yields the content of an
    empty WebVTT document.

    Raises:
        AssertionError: If any word starts earlier than the word before it
            in the concatenated list. This is raised explicitly rather than
            via ``assert`` so the check still runs under ``python -O``; the
            exception type and message are the same as before.
    """
    if not topics:
        return webvtt.WebVTT().content

    all_words: list[Word] = [word for topic in topics for word in topic.words]

    # Verify start times never decrease between neighbouring words.
    for current, following in zip(all_words, all_words[1:]):
        if not (current.start <= following.start):
            raise AssertionError(
                f"Words are not in sequence: {current.text} and {following.text} are not consecutive: {current.start} > {following.start}"
            )

    return words_to_webvtt(all_words)
|
||||
Reference in New Issue
Block a user