mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-20 20:29:06 +00:00
feat: search backend (#537)
* docs: transient docs * chore: cleanup * webvtt WIP * webvtt field * chore: webvtt tests comments * chore: remove useless tests * feat: search TASK.md * feat: full text search by title/webvtt * chore: search api task * feat: search api * feat: search API * chore: rm task md * chore: roll back unnecessary validators * chore: pr review WIP * chore: pr review WIP * chore: pr review * chore: top imports * feat: better lint + ci * feat: better lint + ci * feat: better lint + ci * feat: better lint + ci * chore: lint * chore: lint * fix: db datetime definitions * fix: flush() params * fix: update transcript mutability expectation / test * fix: update transcript mutability expectation / test * chore: auto review * chore: new controller extraction * chore: new controller extraction * chore: cleanup * chore: review WIP * chore: pr WIP * chore: remove ci lint * chore: openapi regeneration * chore: openapi regeneration * chore: postgres test doc * fix: .dockerignore for arm binaries * fix: .dockerignore for arm binaries * fix: cap test loops * fix: cap test loops * fix: cap test loops * fix: get_transcript_topics * chore: remove flow.md docs and claude guidance * chore: remove claude.md db doc * chore: remove claude.md db doc * chore: remove claude.md db doc * chore: remove claude.md db doc
This commit is contained in:
@@ -1 +1,3 @@
|
||||
Generic single-database configuration.
|
||||
Generic single-database configuration.
|
||||
|
||||
Both data migrations and schema migrations must be in migrations.
|
||||
@@ -0,0 +1,27 @@
|
||||
"""add_webvtt_field_to_transcript
|
||||
|
||||
Revision ID: 0bc0f3ff0111
|
||||
Revises: b7df9609542c
|
||||
Create Date: 2025-08-05 19:36:41.740957
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
revision: str = '0bc0f3ff0111'
|
||||
down_revision: Union[str, None] = 'b7df9609542c'
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column('transcript',
|
||||
sa.Column('webvtt', sa.Text(), nullable=True)
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column('transcript', 'webvtt')
|
||||
@@ -0,0 +1,47 @@
|
||||
"""add_full_text_search
|
||||
|
||||
Revision ID: 116b2f287eab
|
||||
Revises: 0bc0f3ff0111
|
||||
Create Date: 2025-08-07 11:27:38.473517
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
revision: str = '116b2f287eab'
|
||||
down_revision: Union[str, None] = '0bc0f3ff0111'
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
conn = op.get_bind()
|
||||
if conn.dialect.name != 'postgresql':
|
||||
return
|
||||
|
||||
op.execute("""
|
||||
ALTER TABLE transcript ADD COLUMN search_vector_en tsvector
|
||||
GENERATED ALWAYS AS (
|
||||
setweight(to_tsvector('english', coalesce(title, '')), 'A') ||
|
||||
setweight(to_tsvector('english', coalesce(webvtt, '')), 'B')
|
||||
) STORED
|
||||
""")
|
||||
|
||||
op.create_index(
|
||||
'idx_transcript_search_vector_en',
|
||||
'transcript',
|
||||
['search_vector_en'],
|
||||
postgresql_using='gin'
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
conn = op.get_bind()
|
||||
if conn.dialect.name != 'postgresql':
|
||||
return
|
||||
|
||||
op.drop_index('idx_transcript_search_vector_en', table_name='transcript')
|
||||
op.drop_column('transcript', 'search_vector_en')
|
||||
@@ -0,0 +1,109 @@
|
||||
"""populate_webvtt_from_topics
|
||||
|
||||
Revision ID: 8120ebc75366
|
||||
Revises: 116b2f287eab
|
||||
Create Date: 2025-08-11 19:11:01.316947
|
||||
|
||||
"""
|
||||
import json
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy import text
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = '8120ebc75366'
|
||||
down_revision: Union[str, None] = '116b2f287eab'
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def topics_to_webvtt(topics):
|
||||
"""Convert topics list to WebVTT format string."""
|
||||
if not topics:
|
||||
return None
|
||||
|
||||
lines = ["WEBVTT", ""]
|
||||
|
||||
for topic in topics:
|
||||
start_time = format_timestamp(topic.get("start"))
|
||||
end_time = format_timestamp(topic.get("end"))
|
||||
text = topic.get("text", "").strip()
|
||||
|
||||
if start_time and end_time and text:
|
||||
lines.append(f"{start_time} --> {end_time}")
|
||||
lines.append(text)
|
||||
lines.append("")
|
||||
|
||||
return "\n".join(lines).strip()
|
||||
|
||||
|
||||
def format_timestamp(seconds):
|
||||
"""Format seconds to WebVTT timestamp format (HH:MM:SS.mmm)."""
|
||||
if seconds is None:
|
||||
return None
|
||||
|
||||
hours = int(seconds // 3600)
|
||||
minutes = int((seconds % 3600) // 60)
|
||||
secs = seconds % 60
|
||||
|
||||
return f"{hours:02d}:{minutes:02d}:{secs:06.3f}"
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
"""Populate WebVTT field for all transcripts with topics."""
|
||||
|
||||
# Get connection
|
||||
connection = op.get_bind()
|
||||
|
||||
# Query all transcripts with topics
|
||||
result = connection.execute(
|
||||
text("SELECT id, topics FROM transcript WHERE topics IS NOT NULL")
|
||||
)
|
||||
|
||||
rows = result.fetchall()
|
||||
print(f"Found {len(rows)} transcripts with topics")
|
||||
|
||||
updated_count = 0
|
||||
error_count = 0
|
||||
|
||||
for row in rows:
|
||||
transcript_id = row[0]
|
||||
topics_data = row[1]
|
||||
|
||||
if not topics_data:
|
||||
continue
|
||||
|
||||
try:
|
||||
# Parse JSON if it's a string
|
||||
if isinstance(topics_data, str):
|
||||
topics_data = json.loads(topics_data)
|
||||
|
||||
# Convert topics to WebVTT format
|
||||
webvtt_content = topics_to_webvtt(topics_data)
|
||||
|
||||
if webvtt_content:
|
||||
# Update the webvtt field
|
||||
connection.execute(
|
||||
text("UPDATE transcript SET webvtt = :webvtt WHERE id = :id"),
|
||||
{"webvtt": webvtt_content, "id": transcript_id}
|
||||
)
|
||||
updated_count += 1
|
||||
print(f"✓ Updated transcript {transcript_id}")
|
||||
|
||||
except Exception as e:
|
||||
error_count += 1
|
||||
print(f"✗ Error updating transcript {transcript_id}: {e}")
|
||||
|
||||
print(f"\nMigration complete!")
|
||||
print(f" Updated: {updated_count}")
|
||||
print(f" Errors: {error_count}")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
"""Clear WebVTT field for all transcripts."""
|
||||
op.execute(
|
||||
text("UPDATE transcript SET webvtt = NULL")
|
||||
)
|
||||
Reference in New Issue
Block a user