refactor: improve transcript list performance (#480)

* refactor: improve transcript list performance

* fix: sync openapi

* fix: frontend types

* fix: remove drop table _alembic_tmp_meeting

* fix: remove create table too

* fix: remove uq_recording_object_key
This commit is contained in:
2025-07-15 15:10:05 -06:00
committed by GitHub
parent 3d370336cc
commit 9deb717e5b
21 changed files with 470 additions and 126 deletions

View File

@@ -0,0 +1,59 @@
"""add_performance_indexes
Revision ID: ccd68dc784ff
Revises: 20250618140000
Create Date: 2025-07-15 11:48:42.854741
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
# `revision` is this migration's own ID; `down_revision` is the revision it
# revises (both match the "Revision ID" / "Revises" lines in the module docstring).
revision: str = "ccd68dc784ff"
down_revision: Union[str, None] = "20250618140000"
# No branch labels or cross-revision dependencies are declared for this migration.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Create the non-unique indexes introduced by this revision.

    Each table is opened once through ``op.batch_alter_table`` and all of
    its indexes are created inside that single batch context. Tables and
    indexes are processed in the order listed below.
    """
    # (table name, ((index name, indexed columns), ...)) in creation order.
    indexes_by_table = (
        ("meeting", (("idx_meeting_room_id", ["room_id"]),)),
        ("recording", (("idx_recording_meeting_id", ["meeting_id"]),)),
        ("room", (("idx_room_is_shared", ["is_shared"]),)),
        (
            "transcript",
            (
                ("idx_transcript_created_at", ["created_at"]),
                ("idx_transcript_recording_id", ["recording_id"]),
                ("idx_transcript_user_id", ["user_id"]),
            ),
        ),
    )

    for table_name, index_specs in indexes_by_table:
        with op.batch_alter_table(table_name, schema=None) as batch_op:
            for index_name, columns in index_specs:
                batch_op.create_index(index_name, columns, unique=False)
def downgrade() -> None:
    """Drop every index that ``upgrade`` in this revision creates.

    Tables are visited in the reverse of the upgrade order, and within the
    ``transcript`` table the indexes are dropped in reverse creation order.
    Each table is opened once through ``op.batch_alter_table``.
    """
    # (table name, (index names...)) in drop order.
    drops_by_table = (
        (
            "transcript",
            (
                "idx_transcript_user_id",
                "idx_transcript_recording_id",
                "idx_transcript_created_at",
            ),
        ),
        ("room", ("idx_room_is_shared",)),
        ("recording", ("idx_recording_meeting_id",)),
        ("meeting", ("idx_meeting_room_id",)),
    )

    for table_name, index_names in drops_by_table:
        with op.batch_alter_table(table_name, schema=None) as batch_op:
            for index_name in index_names:
                batch_op.drop_index(index_name)

View File

@@ -40,6 +40,7 @@ meetings = sa.Table(
nullable=False,
server_default=sa.true(),
),
sa.Index("idx_meeting_room_id", "room_id"),
)
meeting_consent = sa.Table(

View File

@@ -20,6 +20,7 @@ recordings = sa.Table(
server_default="pending",
),
sa.Column("meeting_id", sa.String),
sa.Index("idx_recording_meeting_id", "meeting_id"),
)

View File

@@ -39,6 +39,7 @@ rooms = sqlalchemy.Table(
sqlalchemy.Column(
"is_shared", sqlalchemy.Boolean, nullable=False, server_default=false()
),
sqlalchemy.Index("idx_room_is_shared", "is_shared"),
)

View File

@@ -6,7 +6,6 @@ from contextlib import asynccontextmanager
from datetime import datetime
from pathlib import Path
from typing import Any, Literal
from reflector.utils import generate_uuid4
import sqlalchemy
from fastapi import HTTPException
@@ -15,6 +14,7 @@ from reflector.db import database, metadata
from reflector.processors.types import Word as ProcessorWord
from reflector.settings import settings
from reflector.storage import get_transcripts_storage
from reflector.utils import generate_uuid4
from sqlalchemy import Enum
from sqlalchemy.sql import false, or_
@@ -74,6 +74,9 @@ transcripts = sqlalchemy.Table(
# the main "audio deleted" is the presence of the audio itself / consents not-given
# same field could've been in recording/meeting, and it's maybe even ok to dupe it at need
sqlalchemy.Column("audio_deleted", sqlalchemy.Boolean, nullable=True),
sqlalchemy.Index("idx_transcript_recording_id", "recording_id"),
sqlalchemy.Index("idx_transcript_user_id", "user_id"),
sqlalchemy.Index("idx_transcript_created_at", "created_at"),
)
@@ -306,6 +309,7 @@ class TranscriptController:
room_id: str | None = None,
search_term: str | None = None,
return_query: bool = False,
exclude_columns: list[str] = ["topics", "events", "participants"],
) -> list[Transcript]:
"""
Get all transcripts
@@ -348,9 +352,14 @@ class TranscriptController:
if search_term:
query = query.where(transcripts.c.title.ilike(f"%{search_term}%"))
# Exclude heavy JSON columns from list queries
transcript_columns = [
col for col in transcripts.c if col.name not in exclude_columns
]
query = query.with_only_columns(
[
transcripts,
transcript_columns
+ [
rooms.c.id.label("room_id"),
rooms.c.name.label("room_name"),
]

View File

@@ -45,7 +45,7 @@ def create_access_token(data: dict, expires_delta: timedelta):
# ==============================================================
class GetTranscript(BaseModel):
class GetTranscriptMinimal(BaseModel):
id: str
user_id: str | None
name: str
@@ -59,7 +59,6 @@ class GetTranscript(BaseModel):
share_mode: str = Field("private")
source_language: str | None
target_language: str | None
participants: list[TranscriptParticipant] | None
reviewed: bool
meeting_id: str | None
source_kind: SourceKind
@@ -68,6 +67,10 @@ class GetTranscript(BaseModel):
audio_deleted: bool | None = None
class GetTranscript(GetTranscriptMinimal):
participants: list[TranscriptParticipant] | None
class CreateTranscript(BaseModel):
name: str
source_language: str = Field("en")
@@ -90,7 +93,7 @@ class DeletionStatus(BaseModel):
status: str
@router.get("/transcripts", response_model=Page[GetTranscript])
@router.get("/transcripts", response_model=Page[GetTranscriptMinimal])
async def transcripts_list(
user: Annotated[Optional[auth.UserInfo], Depends(auth.current_user_optional)],
source_kind: SourceKind | None = None,