From 033bd4bc48af7ed664bc10ff60dcf20df7293604 Mon Sep 17 00:00:00 2001
From: Mathieu Virbel
Date: Thu, 17 Jul 2025 15:43:36 -0600
Subject: [PATCH] feat: improve transcript listing with room_id (#496)

Added a new field in transcript for room_id, and set room_id/meeting_id
in a transcript now. Use this field to list the transcripts. URL is now
very fast.
---
Review note: get_by_room_id now strips the "-" prefix from order_by only
when it is present (order_by[1:] also dropped the first character of
ascending column names, e.g. "created_at" -> "reated_at"), and the new
migration file now ends with a newline. Hunk sizes are unchanged.

 .../d7fbb74b673b_add_room_id_to_transcript.py | 59 +++++++++++++++++++
 server/reflector/db/transcripts.py            | 37 ++++++++----
 server/reflector/worker/process.py            |  2 +
 3 files changed, 87 insertions(+), 11 deletions(-)
 create mode 100644 server/migrations/versions/d7fbb74b673b_add_room_id_to_transcript.py

diff --git a/server/migrations/versions/d7fbb74b673b_add_room_id_to_transcript.py b/server/migrations/versions/d7fbb74b673b_add_room_id_to_transcript.py
new file mode 100644
index 00000000..337fbc94
--- /dev/null
+++ b/server/migrations/versions/d7fbb74b673b_add_room_id_to_transcript.py
@@ -0,0 +1,59 @@
+"""Add room_id to transcript
+
+Revision ID: d7fbb74b673b
+Revises: a9c9c229ee36
+Create Date: 2025-07-17 12:00:00.000000
+
+"""
+
+from typing import Sequence, Union
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "d7fbb74b673b"
+down_revision: Union[str, None] = "a9c9c229ee36"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # Add room_id column to transcript table
+    op.add_column("transcript", sa.Column("room_id", sa.String(), nullable=True))
+
+    # Add index for room_id for better query performance
+    op.create_index("idx_transcript_room_id", "transcript", ["room_id"])
+
+    # Populate room_id for existing ROOM-type transcripts
+    # This joins through recording -> meeting -> room to get the room_id
+    op.execute("""
+        UPDATE transcript AS t
+        SET room_id = r.id
+        FROM recording rec
+        JOIN meeting m ON rec.meeting_id = m.id
+        JOIN room r ON m.room_id = r.id
+        WHERE t.recording_id = rec.id
+        AND t.source_kind = 'room'
+        AND t.room_id IS NULL
+    """)
+
+    # Fix missing meeting_id for ROOM-type transcripts
+    # The meeting_id field exists but was never populated
+    op.execute("""
+        UPDATE transcript AS t
+        SET meeting_id = rec.meeting_id
+        FROM recording rec
+        WHERE t.recording_id = rec.id
+        AND t.source_kind = 'room'
+        AND t.meeting_id IS NULL
+        AND rec.meeting_id IS NOT NULL
+    """)
+
+
+def downgrade() -> None:
+    # Drop the index first
+    op.drop_index("idx_transcript_room_id", "transcript")
+
+    # Drop the room_id column
+    op.drop_column("transcript", "room_id")
diff --git a/server/reflector/db/transcripts.py b/server/reflector/db/transcripts.py
index de55cac9..7974df8b 100644
--- a/server/reflector/db/transcripts.py
+++ b/server/reflector/db/transcripts.py
@@ -74,10 +74,12 @@ transcripts = sqlalchemy.Table(
     # the main "audio deleted" is the presence of the audio itself / consents not-given
     # same field could've been in recording/meeting, and it's maybe even ok to dupe it at need
     sqlalchemy.Column("audio_deleted", sqlalchemy.Boolean),
+    sqlalchemy.Column("room_id", sqlalchemy.String),
     sqlalchemy.Index("idx_transcript_recording_id", "recording_id"),
     sqlalchemy.Index("idx_transcript_user_id", "user_id"),
     sqlalchemy.Index("idx_transcript_created_at", "created_at"),
     sqlalchemy.Index("idx_transcript_user_id_recording_id", "user_id", "recording_id"),
+    sqlalchemy.Index("idx_transcript_room_id", "room_id"),
 )
 
 
@@ -167,6 +169,7 @@ class Transcript(BaseModel):
     zulip_message_id: int | None = None
     source_kind: SourceKind
     audio_deleted: bool | None = None
+    room_id: str | None = None
 
     @field_serializer("created_at", when_used="json")
     def serialize_datetime(self, dt: datetime) -> str:
@@ -331,17 +334,10 @@ class TranscriptController:
         - `room_id`: filter transcripts by room ID
         - `search_term`: filter transcripts by search term
         """
-        from reflector.db.meetings import meetings
-        from reflector.db.recordings import recordings
         from reflector.db.rooms import rooms
 
-        query = (
-            transcripts.select()
-            .join(
-                recordings, transcripts.c.recording_id == recordings.c.id, isouter=True
-            )
-            .join(meetings, recordings.c.meeting_id == meetings.c.id, isouter=True)
-            .join(rooms, meetings.c.room_id == rooms.c.id, isouter=True)
+        query = transcripts.select().join(
+            rooms, transcripts.c.room_id == rooms.c.id, isouter=True
         )
 
         if user_id:
@@ -355,7 +351,7 @@ class TranscriptController:
             query = query.where(transcripts.c.source_kind == source_kind)
 
         if room_id:
-            query = query.where(rooms.c.id == room_id)
+            query = query.where(transcripts.c.room_id == room_id)
 
         if search_term:
             query = query.where(transcripts.c.title.ilike(f"%{search_term}%"))
@@ -368,7 +364,6 @@ class TranscriptController:
         query = query.with_only_columns(
             transcript_columns
             + [
-                rooms.c.id.label("room_id"),
                 rooms.c.name.label("room_name"),
             ]
         )
@@ -419,6 +414,22 @@ class TranscriptController:
             return None
         return Transcript(**result)
 
+    async def get_by_room_id(self, room_id: str, **kwargs) -> list[Transcript]:
+        """
+        Get transcripts by room_id (direct access without joins)
+        """
+        query = transcripts.select().where(transcripts.c.room_id == room_id)
+        if "user_id" in kwargs:
+            query = query.where(transcripts.c.user_id == kwargs["user_id"])
+        if "order_by" in kwargs:
+            order_by = kwargs["order_by"]
+            field = getattr(transcripts.c, order_by.removeprefix("-"))
+            if order_by.startswith("-"):
+                field = field.desc()
+            query = query.order_by(field)
+        results = await database.fetch_all(query)
+        return [Transcript(**result) for result in results]
+
     async def get_by_id_for_http(
         self,
         transcript_id: str,
@@ -469,6 +480,8 @@ class TranscriptController:
         user_id: str | None = None,
         recording_id: str | None = None,
         share_mode: str = "private",
+        meeting_id: str | None = None,
+        room_id: str | None = None,
     ):
         """
         Add a new transcript
@@ -481,6 +494,8 @@ class TranscriptController:
             user_id=user_id,
             recording_id=recording_id,
             share_mode=share_mode,
+            meeting_id=meeting_id,
+            room_id=room_id,
         )
         query = transcripts.insert().values(**transcript.model_dump())
         await database.execute(query)
diff --git a/server/reflector/worker/process.py b/server/reflector/worker/process.py
index 75623de1..24d7359a 100644
--- a/server/reflector/worker/process.py
+++ b/server/reflector/worker/process.py
@@ -101,6 +101,8 @@ async def process_recording(bucket_name: str, object_key: str):
             user_id=room.user_id,
             recording_id=recording.id,
             share_mode="public",
+            meeting_id=meeting.id,
+            room_id=room.id,
         )
 
     _, extension = os.path.splitext(object_key)