Files
reflector/server/tests/test_search_long_summary.py
Igor Loskutov d70beee51b fix: include shared rooms to search (#558)
* include shared rooms to search

* tests vibe

* tests vibe

* tests vibe

* tests vibe

* tests vibe

* tests vibe

* tests vibe

* remove tests, that's too much
2025-08-21 14:52:29 -04:00

167 lines
6.0 KiB
Python

"""Tests for long_summary in search functionality."""
import json
from datetime import datetime, timezone
import pytest
from reflector.db import get_database
from reflector.db.search import SearchParameters, search_controller
from reflector.db.transcripts import transcripts
@pytest.mark.asyncio
async def test_long_summary_snippet_prioritization():
    """Check that long_summary text leads the snippets returned by search.

    A transcript row is inserted whose ``long_summary`` and ``webvtt`` both
    mention "robotics". After searching for that word, the test asserts that
    the first snippet carries wording found only in the long summary, that
    no more than three snippets come back, and that each snippet contains
    the search term. The row is deleted and the database disconnected on
    exit, whether or not the assertions pass.
    """
    test_id = "test-snippet-priority-3f9a2b8c"

    async def purge_row():
        # Delete the fixture row; runs before the insert and again on teardown.
        await get_database().execute(
            transcripts.delete().where(transcripts.c.id == test_id)
        )

    try:
        # Start from a clean slate in case an earlier run left the row behind.
        await purge_row()

        # "robotics" appears here together with richer, summary-only wording.
        summary_text = (
            "The team discussed advanced robotics applications including "
            "autonomous navigation systems and sensor fusion techniques. "
            "Robotics development will focus on real-time processing."
        )
        # The captions mention "robotics" too, but without that wording.
        captions = """WEBVTT
00:00:00.000 --> 00:00:10.000
We talked about many different topics today.
00:00:10.000 --> 00:00:20.000
The robotics project is making good progress.
00:00:20.000 --> 00:00:30.000
We need to consider various implementation approaches."""
        row = {
            "id": test_id,
            "name": "Test Snippet Priority",
            "title": "Meeting About Projects",
            "status": "completed",
            "locked": False,
            "duration": 1800.0,
            "created_at": datetime.now(timezone.utc),
            "short_summary": "Project discussion",
            "long_summary": summary_text,
            "topics": json.dumps([]),
            "events": json.dumps([]),
            "participants": json.dumps([]),
            "source_language": "en",
            "target_language": "en",
            "reviewed": False,
            "audio_location": "local",
            "share_mode": "private",
            "source_kind": "room",
            "webvtt": captions,
            "user_id": "test-user-priority",
        }
        await get_database().execute(transcripts.insert().values(**row))

        # Query a term present in both the long summary and the captions.
        query = SearchParameters(query_text="robotics", user_id="test-user-priority")
        hits, hit_count = await search_controller.search_transcripts(query)
        assert hit_count >= 1

        match = next((hit for hit in hits if hit.id == test_id), None)
        assert match, "Should find the test transcript"

        found_snippets = match.search_snippets
        assert len(found_snippets) > 0, "Should have at least one snippet"

        # The leading snippet must carry wording that only the summary has.
        lead = found_snippets[0].lower()
        summary_markers = ("advanced robotics", "autonomous")
        assert any(
            marker in lead for marker in summary_markers
        ), f"First snippet should be from long_summary with detailed content. Got: {found_snippets[0]}"

        # No more than three snippets may be returned.
        assert len(found_snippets) <= 3, "Should respect max snippets limit"

        # Each returned snippet has to mention the query term.
        for text in found_snippets:
            assert (
                "robotics" in text.lower()
            ), f"Snippet should contain search term: {text}"
    finally:
        await purge_row()
        await get_database().disconnect()
@pytest.mark.asyncio
async def test_long_summary_only_search():
    """Test searching for content that only exists in long_summary.

    Inserts a transcript whose ``long_summary`` talks about cryptocurrency
    and yield farming while its ``webvtt`` does not, then asserts that:

    * searching "cryptocurrency" returns the transcript and its first
      snippet contains that word;
    * searching "yield farming" also returns the transcript.

    The fixture row is deleted and the database disconnected in ``finally``.
    """
    test_id = "test-long-only-8b3c9f2a"
    try:
        # Remove any leftover row from a previous run before inserting.
        await get_database().execute(
            transcripts.delete().where(transcripts.c.id == test_id)
        )
        test_data = {
            "id": test_id,
            "name": "Test Long Only",
            "title": "Standard Meeting",
            "status": "completed",
            "locked": False,
            "duration": 1800.0,
            "created_at": datetime.now(timezone.utc),
            "short_summary": "Team sync",
            "long_summary": (
                "Detailed analysis of cryptocurrency market trends and "
                "decentralized finance protocols. Discussion included "
                "yield farming strategies and liquidity pool mechanics."
            ),
            "topics": json.dumps([]),
            "events": json.dumps([]),
            "participants": json.dumps([]),
            "source_language": "en",
            "target_language": "en",
            "reviewed": False,
            "audio_location": "local",
            "share_mode": "private",
            "source_kind": "room",
            "webvtt": """WEBVTT
00:00:00.000 --> 00:00:10.000
Team meeting about general project updates.
00:00:10.000 --> 00:00:20.000
Discussion of timeline and deliverables.""",
            "user_id": "test-user-long",
        }
        await get_database().execute(transcripts.insert().values(**test_data))

        # Search for a term that only appears in long_summary.
        params = SearchParameters(query_text="cryptocurrency", user_id="test-user-long")
        # Only the result rows matter here; the total count is not asserted.
        results, _ = await search_controller.search_transcripts(params)

        # One lookup both proves the transcript was found and yields the row.
        test_result = next((r for r in results if r.id == test_id), None)
        assert (
            test_result is not None
        ), "Should find transcript by long_summary-only content"
        assert len(test_result.search_snippets) > 0

        # Verify the snippet is about cryptocurrency.
        snippet = test_result.search_snippets[0].lower()
        assert "cryptocurrency" in snippet, "Snippet should contain the search term"

        # Search for "yield farming" - a more specific, two-word term.
        params2 = SearchParameters(query_text="yield farming", user_id="test-user-long")
        results2, _ = await search_controller.search_transcripts(params2)
        assert any(
            r.id == test_id for r in results2
        ), "Should find transcript by specific long_summary phrase"
    finally:
        await get_database().execute(
            transcripts.delete().where(transcripts.c.id == test_id)
        )
        await get_database().disconnect()