feat: search frontend (#551)

* feat: better highlight * feat(search): add long_summary to search vector for improved search results - Update search vector to include long_summary with weight B (between title A and webvtt C) - Modify SearchController to fetch long_summary and prioritize its snippets - Generate snippets from long_summary first (max 2), then from webvtt for remaining slots - Add comprehensive tests for long_summary search functionality - Create migration to update search_vector_en column in PostgreSQL This improves search quality by including summarized content which often contains key topics and themes that may not be explicitly mentioned in the transcript. * fix: address code review feedback for search enhancements - Fix test file inconsistencies by removing references to non-existent model fields - Comment out tests for unimplemented features (room_ids, status filters, date ranges) - Update tests to only use currently available fields (room_id singular, no room_name/processing_status) - Mark future functionality tests with @pytest.mark.skip - Make snippet counts configurable - Add LONG_SUMMARY_MAX_SNIPPETS constant (default: 2) - Replace hardcoded value with configurable constant - Improve error handling consistency in WebVTT parsing - Use different log levels for different error types (debug for malformed, warning for decode, error for unexpected) - Add catch-all exception handler for unexpected errors - Include stack trace for critical errors All existing tests pass with these changes. 
* fix: correct datetime test to include required duration field * feat: better highlight * feat: search room names * feat: acknowledge deleted room * feat: search filters fix and rank removal * chore: minor refactoring * feat: better matches frontend * chore: self-review (vibe) * chore: self-review WIP * chore: self-review WIP * chore: self-review WIP * chore: self-review WIP * chore: self-review WIP * chore: self-review WIP * chore: self-review WIP * remove swc (vibe) * search url query sync (vibe) * search url query sync (vibe) * better casts and cap while * PR review + simplify frontend hook * pr: remove search db timeouts * cleanup tests * tests cleanup * frontend cleanup * index declarations * refactor frontend (self-review) * fix search pagination * clear "x" for search input * pagination max pages fix * chore: cleanup * cleanup * cleanup * cleanup * cleanup * cleanup * cleanup * cleanup * lockfile * pr review
2025-12-22 21:29:05 +00:00 · 2025-08-20 20:56:45 -04:00
parent fe5d344cff
commit 009590c080
32 changed files with 2311 additions and 618 deletions
--- a/server/reflector/db/search.py
+++ b/server/reflector/db/search.py
@@ -1,24 +1,37 @@
 """Search functionality for transcripts and other entities."""

+import itertools
+from dataclasses import dataclass
 from datetime import datetime
 from io import StringIO
-from typing import Annotated, Any, Dict
+from typing import Annotated, Any, Dict, Iterator

 import sqlalchemy
 import webvtt
-from pydantic import BaseModel, Field, constr, field_serializer
+from fastapi import HTTPException
+from pydantic import (
+    BaseModel,
+    Field,
+    NonNegativeFloat,
+    NonNegativeInt,
+    ValidationError,
+    constr,
+    field_serializer,
+)

 from reflector.db import get_database
+from reflector.db.rooms import rooms
 from reflector.db.transcripts import SourceKind, transcripts
 from reflector.db.utils import is_postgresql
 from reflector.logger import logger

 DEFAULT_SEARCH_LIMIT = 20
 SNIPPET_CONTEXT_LENGTH = 50  # Characters before/after match to include
-DEFAULT_SNIPPET_MAX_LENGTH = 150
-DEFAULT_MAX_SNIPPETS = 3
+DEFAULT_SNIPPET_MAX_LENGTH = NonNegativeInt(150)
+DEFAULT_MAX_SNIPPETS = NonNegativeInt(3)
+LONG_SUMMARY_MAX_SNIPPETS = 2

-SearchQueryBase = constr(min_length=1, strip_whitespace=True)
+SearchQueryBase = constr(min_length=0, strip_whitespace=True)
 SearchLimitBase = Annotated[int, Field(ge=1, le=100)]
 SearchOffsetBase = Annotated[int, Field(ge=0)]
 SearchTotalBase = Annotated[int, Field(ge=0)]
@@ -32,6 +45,82 @@ SearchTotal = Annotated[
    SearchTotalBase, Field(description="Total number of search results")
 ]

+WEBVTT_SPEC_HEADER = "WEBVTT\n\n"
+
+WebVTTContent = Annotated[
+    str,
+    Field(min_length=len(WEBVTT_SPEC_HEADER), description="WebVTT content"),
+]
+
+
+class WebVTTProcessor:
+    """Stateless processor for WebVTT content operations."""
+
+    @staticmethod
+    def parse(raw_content: str) -> WebVTTContent:
+        """Validate that the content starts with the WebVTT header and return it unchanged."""
+        if not raw_content.startswith(WEBVTT_SPEC_HEADER):
+            raise ValueError(f"Invalid WebVTT content, no header {WEBVTT_SPEC_HEADER}")
+        return raw_content
+
+    @staticmethod
+    def extract_text(webvtt_content: WebVTTContent) -> str:
+        """Extract plain text from WebVTT content using webvtt library."""
+        try:
+            buffer = StringIO(webvtt_content)
+            vtt = webvtt.read_buffer(buffer)
+            return " ".join(caption.text for caption in vtt if caption.text)
+        except webvtt.errors.MalformedFileError as e:
+            logger.warning(f"Malformed WebVTT content: {e}")
+            return ""
+        except (UnicodeDecodeError, ValueError) as e:
+            logger.warning(f"Failed to decode WebVTT content: {e}")
+            return ""
+        except AttributeError as e:
+            logger.error(
+                f"WebVTT parsing error - unexpected format: {e}", exc_info=True
+            )
+            return ""
+        except Exception as e:
+            logger.error(f"Unexpected error parsing WebVTT: {e}", exc_info=True)
+            return ""
+
+    @staticmethod
+    def generate_snippets(
+        webvtt_content: WebVTTContent,
+        query: str,
+        max_snippets: NonNegativeInt = DEFAULT_MAX_SNIPPETS,
+    ) -> list[str]:
+        """Generate snippets from WebVTT content."""
+        return SnippetGenerator.generate(
+            WebVTTProcessor.extract_text(webvtt_content),
+            query,
+            max_snippets=max_snippets,
+        )
+
+
+@dataclass(frozen=True)
+class SnippetCandidate:
+    """Represents a candidate snippet with its position."""
+
+    _text: str
+    start: NonNegativeInt
+    _original_text_length: int
+
+    @property
+    def end(self) -> NonNegativeInt:
+        """Calculate end position from start and raw text length."""
+        return self.start + len(self._text)
+
+    def text(self) -> str:
+        """Get display text with ellipses added if needed."""
+        result = self._text.strip()
+        if self.start > 0:
+            result = "..." + result
+        if self.end < self._original_text_length:
+            result = result + "..."
+        return result
+

 class SearchParameters(BaseModel):
    """Validated search parameters for full-text search."""
@@ -41,6 +130,7 @@ class SearchParameters(BaseModel):
    offset: SearchOffset = 0
    user_id: str | None = None
    room_id: str | None = None
+    source_kind: SourceKind | None = None


 class SearchResultDB(BaseModel):
@@ -64,13 +154,18 @@ class SearchResult(BaseModel):
    title: str | None = None
    user_id: str | None = None
    room_id: str | None = None
+    room_name: str | None = None
+    source_kind: SourceKind
    created_at: datetime
    status: str = Field(..., min_length=1)
    rank: float = Field(..., ge=0, le=1)
-    duration: float | None = Field(..., ge=0, description="Duration in seconds")
+    duration: NonNegativeFloat | None = Field(..., description="Duration in seconds")
    search_snippets: list[str] = Field(
        description="Text snippets around search matches"
    )
+    total_match_count: NonNegativeInt = Field(
+        default=0, description="Total number of matches found in the transcript"
+    )

    @field_serializer("created_at", when_used="json")
    def serialize_datetime(self, dt: datetime) -> str:
@@ -79,84 +174,153 @@ class SearchResult(BaseModel):
        return dt.isoformat()


-class SearchController:
-    """Controller for search operations across different entities."""
+class SnippetGenerator:
+    """Stateless generator for text snippets and match operations."""

    @staticmethod
-    def _extract_webvtt_text(webvtt_content: str) -> str:
-        """Extract plain text from WebVTT content using webvtt library."""
-        if not webvtt_content:
-            return ""
+    def find_all_matches(text: str, query: str) -> Iterator[int]:
+        """Generate all match positions for a query in text."""
+        if not text:
+            logger.warning("Empty text for search query in find_all_matches")
+            return
+        if not query:
+            logger.warning("Empty query for search text in find_all_matches")
+            return

-        try:
-            buffer = StringIO(webvtt_content)
-            vtt = webvtt.read_buffer(buffer)
-            return " ".join(caption.text for caption in vtt if caption.text)
-        except (webvtt.errors.MalformedFileError, UnicodeDecodeError, ValueError) as e:
-            logger.warning(f"Failed to parse WebVTT content: {e}", exc_info=e)
-            return ""
-        except AttributeError as e:
-            logger.warning(f"WebVTT parsing error - unexpected format: {e}", exc_info=e)
-            return ""
+        text_lower = text.lower()
+        query_lower = query.lower()
+        start = 0
+        prev_start = start
+        while (pos := text_lower.find(query_lower, start)) != -1:
+            yield pos
+            start = pos + len(query_lower)
+            if start <= prev_start:
+                raise ValueError("panic! find_all_matches is not incremental")
+            prev_start = start

    @staticmethod
-    def _generate_snippets(
+    def count_matches(text: str, query: str) -> NonNegativeInt:
+        """Count total number of matches for a query in text."""
+        ZERO = NonNegativeInt(0)
+        if not text:
+            logger.warning("Empty text for search query in count_matches")
+            return ZERO
+        if not query:
+            logger.warning("Empty query for search text in count_matches")
+            return ZERO
+        return NonNegativeInt(
+            sum(1 for _ in SnippetGenerator.find_all_matches(text, query))
+        )
+
+    @staticmethod
+    def create_snippet(
+        text: str, match_pos: int, max_length: int = DEFAULT_SNIPPET_MAX_LENGTH
+    ) -> SnippetCandidate:
+        """Create a snippet from a match position."""
+        snippet_start = NonNegativeInt(max(0, match_pos - SNIPPET_CONTEXT_LENGTH))
+        snippet_end = min(len(text), match_pos + max_length - SNIPPET_CONTEXT_LENGTH)
+
+        snippet_text = text[snippet_start:snippet_end]
+
+        return SnippetCandidate(
+            _text=snippet_text, start=snippet_start, _original_text_length=len(text)
+        )
+
+    @staticmethod
+    def filter_non_overlapping(
+        candidates: Iterator[SnippetCandidate],
+    ) -> Iterator[str]:
+        """Filter out overlapping snippets and return only display text."""
+        last_end = 0
+        for candidate in candidates:
+            display_text = candidate.text()
+            # Candidates that overlap the previously emitted snippet are simply skipped.
+            # This simplistic rule is acceptable: users already have their search results, so losing an overlapping snippet costs little.
+            if candidate.start >= last_end and display_text:
+                yield display_text
+                last_end = candidate.end
+
+    @staticmethod
+    def generate(
        text: str,
-        q: SearchQuery,
-        max_length: int = DEFAULT_SNIPPET_MAX_LENGTH,
-        max_snippets: int = DEFAULT_MAX_SNIPPETS,
+        query: str,
+        max_length: NonNegativeInt = DEFAULT_SNIPPET_MAX_LENGTH,
+        max_snippets: NonNegativeInt = DEFAULT_MAX_SNIPPETS,
    ) -> list[str]:
-        """Generate multiple snippets around all occurrences of search term."""
-        if not text or not q:
+        """Generate snippets from text."""
+        if not text or not query:
+            logger.warning("Empty text or query for generate_snippets")
            return []

-        snippets = []
-        lower_text = text.lower()
-        search_lower = q.lower()
+        candidates = (
+            SnippetGenerator.create_snippet(text, pos, max_length)
+            for pos in SnippetGenerator.find_all_matches(text, query)
+        )
+        filtered = SnippetGenerator.filter_non_overlapping(candidates)
+        snippets = list(itertools.islice(filtered, max_snippets))

-        last_snippet_end = 0
-        start_pos = 0
-
-        while len(snippets) < max_snippets:
-            match_pos = lower_text.find(search_lower, start_pos)
-
-            if match_pos == -1:
-                if not snippets and search_lower.split():
-                    first_word = search_lower.split()[0]
-                    match_pos = lower_text.find(first_word, start_pos)
-                    if match_pos == -1:
-                        break
-                else:
-                    break
-
-            snippet_start = max(0, match_pos - SNIPPET_CONTEXT_LENGTH)
-            snippet_end = min(
-                len(text), match_pos + max_length - SNIPPET_CONTEXT_LENGTH
-            )
-
-            if snippet_start < last_snippet_end:
-                start_pos = match_pos + len(search_lower)
-                continue
-
-            snippet = text[snippet_start:snippet_end]
-
-            if snippet_start > 0:
-                snippet = "..." + snippet
-            if snippet_end < len(text):
-                snippet = snippet + "..."
-
-            snippet = snippet.strip()
-
-            if snippet:
-                snippets.append(snippet)
-                last_snippet_end = snippet_end
-
-            start_pos = match_pos + len(search_lower)
-            if start_pos >= len(text):
-                break
+        # Fallback to first word search if no full matches
+        # This is a deliberate simplification: proper snippet generation is quite complicated and tied to the DB search logic.
+        if not snippets and " " in query:
+            first_word = query.split()[0]
+            return SnippetGenerator.generate(text, first_word, max_length, max_snippets)

        return snippets

+    @staticmethod
+    def from_summary(
+        summary: str,
+        query: str,
+        max_snippets: NonNegativeInt = LONG_SUMMARY_MAX_SNIPPETS,
+    ) -> list[str]:
+        """Generate snippets from summary text."""
+        return SnippetGenerator.generate(summary, query, max_snippets=max_snippets)
+
+    @staticmethod
+    def combine_sources(
+        summary: str | None,
+        webvtt: WebVTTContent | None,
+        query: str,
+        max_total: NonNegativeInt = DEFAULT_MAX_SNIPPETS,
+    ) -> tuple[list[str], NonNegativeInt]:
+        """Combine snippets from multiple sources and return total match count.
+
+        Returns (snippets, total_match_count) tuple.
+
+        The snippet list can legitimately be empty, e.g. when only the title matched.
+        """
+        webvtt_matches = 0
+        summary_matches = 0
+
+        if webvtt:
+            webvtt_text = WebVTTProcessor.extract_text(webvtt)
+            webvtt_matches = SnippetGenerator.count_matches(webvtt_text, query)
+
+        if summary:
+            summary_matches = SnippetGenerator.count_matches(summary, query)
+
+        total_matches = NonNegativeInt(webvtt_matches + summary_matches)
+
+        summary_snippets = (
+            SnippetGenerator.from_summary(summary, query) if summary else []
+        )
+
+        if len(summary_snippets) >= max_total:
+            return summary_snippets[:max_total], total_matches
+
+        remaining = max_total - len(summary_snippets)
+        webvtt_snippets = (
+            WebVTTProcessor.generate_snippets(webvtt, query, remaining)
+            if webvtt
+            else []
+        )
+
+        return summary_snippets + webvtt_snippets, total_matches
+
+
+class SearchController:
+    """Controller for search operations across different entities."""
+
    @classmethod
    async def search_transcripts(
        cls, params: SearchParameters
@@ -172,39 +336,64 @@ class SearchController:
            )
            return [], 0

-        search_query = sqlalchemy.func.websearch_to_tsquery(
-            "english", params.query_text
+        base_columns = [
+            transcripts.c.id,
+            transcripts.c.title,
+            transcripts.c.created_at,
+            transcripts.c.duration,
+            transcripts.c.status,
+            transcripts.c.user_id,
+            transcripts.c.room_id,
+            transcripts.c.source_kind,
+            transcripts.c.webvtt,
+            transcripts.c.long_summary,
+            sqlalchemy.case(
+                (
+                    transcripts.c.room_id.isnot(None) & rooms.c.id.is_(None),
+                    "Deleted Room",
+                ),
+                else_=rooms.c.name,
+            ).label("room_name"),
+        ]
+
+        if params.query_text:
+            search_query = sqlalchemy.func.websearch_to_tsquery(
+                "english", params.query_text
+            )
+            rank_column = sqlalchemy.func.ts_rank(
+                transcripts.c.search_vector_en,
+                search_query,
+                32,  # normalization flag: rank/(rank+1) for 0-1 range
+            ).label("rank")
+        else:
+            rank_column = sqlalchemy.cast(1.0, sqlalchemy.Float).label("rank")
+
+        columns = base_columns + [rank_column]
+        base_query = sqlalchemy.select(columns).select_from(
+            transcripts.join(rooms, transcripts.c.room_id == rooms.c.id, isouter=True)
        )

-        base_query = sqlalchemy.select(
-            [
-                transcripts.c.id,
-                transcripts.c.title,
-                transcripts.c.created_at,
-                transcripts.c.duration,
-                transcripts.c.status,
-                transcripts.c.user_id,
-                transcripts.c.room_id,
-                transcripts.c.source_kind,
-                transcripts.c.webvtt,
-                sqlalchemy.func.ts_rank(
-                    transcripts.c.search_vector_en,
-                    search_query,
-                    32,  # normalization flag: rank/(rank+1) for 0-1 range
-                ).label("rank"),
-            ]
-        ).where(transcripts.c.search_vector_en.op("@@")(search_query))
+        if params.query_text:
+            base_query = base_query.where(
+                transcripts.c.search_vector_en.op("@@")(search_query)
+            )

        if params.user_id:
            base_query = base_query.where(transcripts.c.user_id == params.user_id)
        if params.room_id:
            base_query = base_query.where(transcripts.c.room_id == params.room_id)
+        if params.source_kind:
+            base_query = base_query.where(
+                transcripts.c.source_kind == params.source_kind
+            )
+
+        if params.query_text:
+            order_by = sqlalchemy.desc(sqlalchemy.text("rank"))
+        else:
+            order_by = sqlalchemy.desc(transcripts.c.created_at)
+
+        query = base_query.order_by(order_by).limit(params.limit).offset(params.offset)

-        query = (
-            base_query.order_by(sqlalchemy.desc(sqlalchemy.text("rank")))
-            .limit(params.limit)
-            .offset(params.offset)
-        )
        rs = await get_database().fetch_all(query)

        count_query = sqlalchemy.select([sqlalchemy.func.count()]).select_from(
@@ -214,18 +403,40 @@ class SearchController:

        def _process_result(r) -> SearchResult:
            r_dict: Dict[str, Any] = dict(r)
-            webvtt: str | None = r_dict.pop("webvtt", None)
+            webvtt_raw: str | None = r_dict.pop("webvtt", None)
+            if webvtt_raw:
+                webvtt = WebVTTProcessor.parse(webvtt_raw)
+            else:
+                webvtt = None
+            long_summary: str | None = r_dict.pop("long_summary", None)
+            room_name: str | None = r_dict.pop("room_name", None)
            db_result = SearchResultDB.model_validate(r_dict)

-            snippets = []
-            if webvtt:
-                plain_text = cls._extract_webvtt_text(webvtt)
-                snippets = cls._generate_snippets(plain_text, params.query_text)
+            snippets, total_match_count = SnippetGenerator.combine_sources(
+                long_summary, webvtt, params.query_text, DEFAULT_MAX_SNIPPETS
+            )

-            return SearchResult(**db_result.model_dump(), search_snippets=snippets)
+            return SearchResult(
+                **db_result.model_dump(),
+                room_name=room_name,
+                search_snippets=snippets,
+                total_match_count=total_match_count,
+            )
+
+        try:
+            results = [_process_result(r) for r in rs]
+        except ValidationError as e:
+            logger.error(f"Invalid search result data: {e}", exc_info=True)
+            raise HTTPException(
+                status_code=500, detail="Internal search result data consistency error"
+            )
+        except Exception as e:
+            logger.error(f"Error processing search results: {e}", exc_info=True)
+            raise

-        results = [_process_result(r) for r in rs]
        return results, total


 search_controller = SearchController()
+webvtt_processor = WebVTTProcessor()
+snippet_generator = SnippetGenerator()