feat: search backend (#537)

* docs: transient docs * chore: cleanup * webvtt WIP * webvtt field * chore: webvtt tests comments * chore: remove useless tests * feat: search TASK.md * feat: full text search by title/webvtt * chore: search api task * feat: search api * feat: search API * chore: rm task md * chore: roll back unnecessary validators * chore: pr review WIP * chore: pr review WIP * chore: pr review * chore: top imports * feat: better lint + ci * feat: better lint + ci * feat: better lint + ci * feat: better lint + ci * chore: lint * chore: lint * fix: db datetime definitions * fix: flush() params * fix: update transcript mutability expectation / test * fix: update transcript mutability expectation / test * chore: auto review * chore: new controller extraction * chore: new controller extraction * chore: cleanup * chore: review WIP * chore: pr WIP * chore: remove ci lint * chore: openapi regeneration * chore: openapi regeneration * chore: postgres test doc * fix: .dockerignore for arm binaries * fix: .dockerignore for arm binaries * fix: cap test loops * fix: cap test loops * fix: cap test loops * fix: get_transcript_topics * chore: remove flow.md docs and claude guidance * chore: remove claude.md db doc * chore: remove claude.md db doc * chore: remove claude.md db doc * chore: remove claude.md db doc
2026-04-23 13:45:18 +00:00 · 2025-08-13 10:03:38 -04:00
parent a42ed12982
commit 6fb5cb21c2
29 changed files with 3213 additions and 1493 deletions
--- a/server/reflector/processors/types.py
+++ b/server/reflector/processors/types.py
@@ -2,9 +2,10 @@ import io
 import re
 import tempfile
 from pathlib import Path
+from typing import Annotated

 from profanityfilter import ProfanityFilter
-from pydantic import BaseModel, PrivateAttr
+from pydantic import BaseModel, Field, PrivateAttr

 from reflector.redis_cache import redis_cache

@@ -48,20 +49,70 @@ class AudioFile(BaseModel):
            self._path.unlink()


+# non-negative seconds with float part
+Seconds = Annotated[float, Field(ge=0.0, description="Time in seconds with float part")]
+
+
 class Word(BaseModel):
    text: str
-    start: float
-    end: float
+    start: Seconds
+    end: Seconds
    speaker: int = 0


 class TranscriptSegment(BaseModel):
    text: str
-    start: float
-    end: float
+    start: Seconds
+    end: Seconds
    speaker: int = 0


+def words_to_segments(words: list[Word]) -> list[TranscriptSegment]:
+    # Build segments from a list of words: consecutive words of one speaker
+    # are concatenated; a segment is closed when the speaker changes, or at a
+    # PUNC_RE match once its text exceeds MAX_SEGMENT_LENGTH characters.
+    segments = []
+    current_segment = None
+    MAX_SEGMENT_LENGTH = 120
+
+    for word in words:
+        if current_segment is None:
+            current_segment = TranscriptSegment(
+                text=word.text,
+                start=word.start,
+                end=word.end,
+                speaker=word.speaker,
+            )
+            continue
+
+        # If the word belongs to a different speaker, push the current segment
+        # and start a new one
+        if word.speaker != current_segment.speaker:
+            segments.append(current_segment)
+            current_segment = TranscriptSegment(
+                text=word.text,
+                start=word.start,
+                end=word.end,
+                speaker=word.speaker,
+            )
+            continue
+
+        # Otherwise append the word to the current segment; if the word has
+        # punctuation and the segment is long enough, push the segment
+        current_segment.text += word.text
+        current_segment.end = word.end
+
+        have_punc = PUNC_RE.search(word.text)
+        if have_punc and (len(current_segment.text) > MAX_SEGMENT_LENGTH):
+            segments.append(current_segment)
+            current_segment = None
+
+    if current_segment:
+        segments.append(current_segment)
+
+    return segments
+
+
 class Transcript(BaseModel):
    translation: str | None = None
    words: list[Word] = None
@@ -117,49 +168,7 @@ class Transcript(BaseModel):
        return Transcript(text=self.text, translation=self.translation, words=words)

    def as_segments(self) -> list[TranscriptSegment]:
-        # from a list of word, create a list of segments
-        # join the word that are less than 2 seconds apart
-        # but separate if the speaker changes, or if the punctuation is a . , ; : ? !
-        segments = []
-        current_segment = None
-        MAX_SEGMENT_LENGTH = 120
-
-        for word in self.words:
-            if current_segment is None:
-                current_segment = TranscriptSegment(
-                    text=word.text,
-                    start=word.start,
-                    end=word.end,
-                    speaker=word.speaker,
-                )
-                continue
-
-            # If the word is attach to another speaker, push the current segment
-            # and start a new one
-            if word.speaker != current_segment.speaker:
-                segments.append(current_segment)
-                current_segment = TranscriptSegment(
-                    text=word.text,
-                    start=word.start,
-                    end=word.end,
-                    speaker=word.speaker,
-                )
-                continue
-
-            # if the word is the end of a sentence, and we have enough content,
-            # add the word to the current segment and push it
-            current_segment.text += word.text
-            current_segment.end = word.end
-
-            have_punc = PUNC_RE.search(word.text)
-            if have_punc and (len(current_segment.text) > MAX_SEGMENT_LENGTH):
-                segments.append(current_segment)
-                current_segment = None
-
-        if current_segment:
-            segments.append(current_segment)
-
-        return segments
+        return words_to_segments(self.words)


 class TitleSummary(BaseModel):