feat: download files, show cloud video, soft deletion with no reprocessing (#920)

* fix: move UDP ports out of macOS internal range
* feat: download files, show cloud video, soft deletion with no reprocessing
2026-05-06 11:15:18 +00:00 · 2026-03-20 11:04:53 -05:00
parent cb1beae90d
commit a76f114378
21 changed files with 1413 additions and 77 deletions
--- a/server/migrations/versions/501c73a6b0d5_add_soft_delete_fields.py
+++ b/server/migrations/versions/501c73a6b0d5_add_soft_delete_fields.py
@@ -0,0 +1,47 @@
+"""add soft delete fields to transcript and recording
+
+Revision ID: 501c73a6b0d5
+Revises: e1f093f7f124
+Create Date: 2026-03-19 00:00:00.000000
+
+"""
+
+from typing import Sequence, Union
+
+import sqlalchemy as sa
+from alembic import op
+
+revision: str = "501c73a6b0d5"
+down_revision: Union[str, None] = "e1f093f7f124"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    op.add_column(
+        "transcript",
+        sa.Column("deleted_at", sa.DateTime(timezone=True), nullable=True),
+    )
+    op.add_column(
+        "recording",
+        sa.Column("deleted_at", sa.DateTime(timezone=True), nullable=True),
+    )
+    op.create_index(
+        "idx_transcript_not_deleted",
+        "transcript",
+        ["id"],
+        postgresql_where=sa.text("deleted_at IS NULL"),
+    )
+    op.create_index(
+        "idx_recording_not_deleted",
+        "recording",
+        ["id"],
+        postgresql_where=sa.text("deleted_at IS NULL"),
+    )
+
+
+def downgrade() -> None:
+    op.drop_index("idx_recording_not_deleted", table_name="recording")
+    op.drop_index("idx_transcript_not_deleted", table_name="transcript")
+    op.drop_column("recording", "deleted_at")
+    op.drop_column("transcript", "deleted_at")
--- a/server/reflector/app.py
+++ b/server/reflector/app.py
@@ -19,12 +19,14 @@ from reflector.views.rooms import router as rooms_router
 from reflector.views.rtc_offer import router as rtc_offer_router
 from reflector.views.transcripts import router as transcripts_router
 from reflector.views.transcripts_audio import router as transcripts_audio_router
+from reflector.views.transcripts_download import router as transcripts_download_router
 from reflector.views.transcripts_participants import (
    router as transcripts_participants_router,
 )
 from reflector.views.transcripts_process import router as transcripts_process_router
 from reflector.views.transcripts_speaker import router as transcripts_speaker_router
 from reflector.views.transcripts_upload import router as transcripts_upload_router
+from reflector.views.transcripts_video import router as transcripts_video_router
 from reflector.views.transcripts_webrtc import router as transcripts_webrtc_router
 from reflector.views.transcripts_websocket import router as transcripts_websocket_router
 from reflector.views.user import router as user_router
@@ -97,6 +99,8 @@ app.include_router(transcripts_audio_router, prefix="/v1")
 app.include_router(transcripts_participants_router, prefix="/v1")
 app.include_router(transcripts_speaker_router, prefix="/v1")
 app.include_router(transcripts_upload_router, prefix="/v1")
+app.include_router(transcripts_download_router, prefix="/v1")
+app.include_router(transcripts_video_router, prefix="/v1")
 app.include_router(transcripts_websocket_router, prefix="/v1")
 app.include_router(transcripts_webrtc_router, prefix="/v1")
 app.include_router(transcripts_process_router, prefix="/v1")
--- a/server/reflector/db/recordings.py
+++ b/server/reflector/db/recordings.py
@@ -1,4 +1,4 @@
-from datetime import datetime
+from datetime import datetime, timezone
 from typing import Literal

 import sqlalchemy as sa
@@ -24,6 +24,7 @@ recordings = sa.Table(
    ),
    sa.Column("meeting_id", sa.String),
    sa.Column("track_keys", sa.JSON, nullable=True),
+    sa.Column("deleted_at", sa.DateTime(timezone=True), nullable=True),
    sa.Index("idx_recording_meeting_id", "meeting_id"),
 )

@@ -40,6 +41,7 @@ class Recording(BaseModel):
    # track_keys can be empty list [] if recording finished but no audio was captured (silence/muted)
    # None means not a multitrack recording, [] means multitrack with no tracks
    track_keys: list[str] | None = None
+    deleted_at: datetime | None = None

    @property
    def is_multitrack(self) -> bool:
@@ -69,7 +71,11 @@ class RecordingController:
        return Recording(**result) if result else None

    async def remove_by_id(self, id: str) -> None:
-        query = recordings.delete().where(recordings.c.id == id)
+        query = (
+            recordings.update()
+            .where(recordings.c.id == id)
+            .values(deleted_at=datetime.now(timezone.utc))
+        )
        await get_database().execute(query)

    async def set_meeting_id(
@@ -114,6 +120,7 @@ class RecordingController:
            .where(
                recordings.c.bucket_name == bucket_name,
                recordings.c.track_keys.isnot(None),
+                recordings.c.deleted_at.is_(None),
                or_(
                    transcripts.c.id.is_(None),
                    transcripts.c.status == "error",
--- a/server/reflector/db/search.py
+++ b/server/reflector/db/search.py
@@ -387,6 +387,8 @@ class SearchController:
            transcripts.join(rooms, transcripts.c.room_id == rooms.c.id, isouter=True)
        )

+        base_query = base_query.where(transcripts.c.deleted_at.is_(None))
+
        if params.query_text is not None:
            # because already initialized based on params.query_text presence above
            assert search_query is not None
--- a/server/reflector/db/transcripts.py
+++ b/server/reflector/db/transcripts.py
@@ -91,6 +91,7 @@ transcripts = sqlalchemy.Table(
    sqlalchemy.Column("webvtt", sqlalchemy.Text),
    # Hatchet workflow run ID for resumption of failed workflows
    sqlalchemy.Column("workflow_run_id", sqlalchemy.String),
+    sqlalchemy.Column("deleted_at", sqlalchemy.DateTime(timezone=True), nullable=True),
    sqlalchemy.Column(
        "change_seq",
        sqlalchemy.BigInteger,
@@ -238,6 +239,7 @@ class Transcript(BaseModel):
    webvtt: str | None = None
    workflow_run_id: str | None = None  # Hatchet workflow run ID for resumption
    change_seq: int | None = None
+    deleted_at: datetime | None = None

    @field_serializer("created_at", when_used="json")
    def serialize_datetime(self, dt: datetime) -> str:
@@ -418,6 +420,8 @@ class TranscriptController:
            rooms, transcripts.c.room_id == rooms.c.id, isouter=True
        )

+        query = query.where(transcripts.c.deleted_at.is_(None))
+
        if user_id:
            query = query.where(
                or_(transcripts.c.user_id == user_id, rooms.c.is_shared)
@@ -500,7 +504,10 @@ class TranscriptController:
        """
        Get transcripts by room_id (direct access without joins)
        """
-        query = transcripts.select().where(transcripts.c.room_id == room_id)
+        query = transcripts.select().where(
+            transcripts.c.room_id == room_id,
+            transcripts.c.deleted_at.is_(None),
+        )
        if "user_id" in kwargs:
            query = query.where(transcripts.c.user_id == kwargs["user_id"])
        if "order_by" in kwargs:
@@ -531,8 +538,11 @@ class TranscriptController:
        if not result:
            raise HTTPException(status_code=404, detail="Transcript not found")

-        # if the transcript is anonymous, share mode is not checked
        transcript = Transcript(**result)
+        if transcript.deleted_at is not None:
+            raise HTTPException(status_code=404, detail="Transcript not found")
+
+        # if the transcript is anonymous, share mode is not checked
        if transcript.user_id is None:
            return transcript

@@ -632,56 +642,49 @@ class TranscriptController:
        user_id: str | None = None,
    ) -> None:
        """
-        Remove a transcript by id
+        Soft-delete a transcript by id.
+
+        Sets deleted_at on the transcript and its associated recording.
+        All files (S3 and local) are preserved for later retrieval.
        """
        transcript = await self.get_by_id(transcript_id)
        if not transcript:
            return
        if user_id is not None and transcript.user_id != user_id:
            return
-        if transcript.audio_location == "storage" and not transcript.audio_deleted:
-            try:
-                await get_transcripts_storage().delete_file(
-                    transcript.storage_audio_path
-                )
-            except Exception as e:
-                logger.warning(
-                    "Failed to delete transcript audio from storage",
-                    exc_info=e,
-                    transcript_id=transcript.id,
-                )
-        transcript.unlink()
+        if transcript.deleted_at is not None:
+            return
+
+        now = datetime.now(timezone.utc)
+
+        # Soft-delete the associated recording (keeps S3 files intact)
        if transcript.recording_id:
            try:
-                recording = await recordings_controller.get_by_id(
-                    transcript.recording_id
-                )
-                if recording:
-                    try:
-                        await get_transcripts_storage().delete_file(
-                            recording.object_key, bucket=recording.bucket_name
-                        )
-                    except Exception as e:
-                        logger.warning(
-                            "Failed to delete recording object from S3",
-                            exc_info=e,
-                            recording_id=transcript.recording_id,
-                        )
-                    await recordings_controller.remove_by_id(transcript.recording_id)
+                await recordings_controller.remove_by_id(transcript.recording_id)
            except Exception as e:
                logger.warning(
-                    "Failed to delete recording row",
+                    "Failed to soft-delete recording",
                    exc_info=e,
                    recording_id=transcript.recording_id,
                )
-        query = transcripts.delete().where(transcripts.c.id == transcript_id)
+
+        # Soft-delete the transcript (keeps all files intact)
+        query = (
+            transcripts.update()
+            .where(transcripts.c.id == transcript_id)
+            .values(deleted_at=now)
+        )
        await get_database().execute(query)

    async def remove_by_recording_id(self, recording_id: str):
        """
-        Remove a transcript by recording_id
+        Soft-delete a transcript by recording_id
        """
-        query = transcripts.delete().where(transcripts.c.recording_id == recording_id)
+        query = (
+            transcripts.update()
+            .where(transcripts.c.recording_id == recording_id)
+            .values(deleted_at=datetime.now(timezone.utc))
+        )
        await get_database().execute(query)

    @staticmethod
--- a/server/reflector/tools/deleted_transcripts.py
+++ b/server/reflector/tools/deleted_transcripts.py
@@ -0,0 +1,257 @@
+#!/usr/bin/env python
+"""
+CLI tool for managing soft-deleted transcripts.
+
+Usage:
+    uv run python -m reflector.tools.deleted_transcripts list
+    uv run python -m reflector.tools.deleted_transcripts files <transcript_id>
+    uv run python -m reflector.tools.deleted_transcripts download <transcript_id> [--output-dir ./]
+"""
+
+import argparse
+import asyncio
+import json
+import os
+
+import structlog
+
+from reflector.db import get_database
+from reflector.db.meetings import meetings_controller
+from reflector.db.recordings import recordings_controller
+from reflector.db.transcripts import Transcript, transcripts
+from reflector.storage import get_source_storage, get_transcripts_storage
+
+logger = structlog.get_logger(__name__)
+
+
+async def list_deleted():
+    """List all soft-deleted transcripts."""
+    database = get_database()
+    await database.connect()
+    try:
+        query = (
+            transcripts.select()
+            .where(transcripts.c.deleted_at.isnot(None))
+            .order_by(transcripts.c.deleted_at.desc())
+        )
+        results = await database.fetch_all(query)
+
+        if not results:
+            print("No deleted transcripts found.")
+            return
+
+        print(
+            f"{'ID':<40} {'Title':<40} {'Deleted At':<28} {'Recording ID':<40} {'Meeting ID'}"
+        )
+        print("-" * 180)
+        for row in results:
+            t = Transcript(**row)
+            title = (t.title or "")[:38]
+            deleted = t.deleted_at.isoformat() if t.deleted_at else ""
+            print(
+                f"{t.id:<40} {title:<40} {deleted:<28} {t.recording_id or '':<40} {t.meeting_id or ''}"
+            )
+
+        print(f"\nTotal: {len(results)} deleted transcript(s)")
+    finally:
+        await database.disconnect()
+
+
+async def list_files(transcript_id: str):
+    """List all S3 keys associated with a deleted transcript."""
+    database = get_database()
+    await database.connect()
+    try:
+        query = transcripts.select().where(transcripts.c.id == transcript_id)
+        result = await database.fetch_one(query)
+        if not result:
+            print(f"Transcript {transcript_id} not found.")
+            return
+
+        t = Transcript(**result)
+        if t.deleted_at is None:
+            print(f"Transcript {transcript_id} is not deleted.")
+            return
+
+        print(f"Transcript: {t.id}")
+        print(f"Title: {t.title}")
+        print(f"Deleted at: {t.deleted_at}")
+        print()
+
+        files = []
+
+        # Transcript audio
+        if t.audio_location == "storage" and not t.audio_deleted:
+            files.append(("Transcript audio", t.storage_audio_path, None))
+
+        # Recording files
+        if t.recording_id:
+            recording = await recordings_controller.get_by_id(t.recording_id)
+            if recording:
+                if recording.object_key:
+                    files.append(
+                        (
+                            "Recording object_key",
+                            recording.object_key,
+                            recording.bucket_name,
+                        )
+                    )
+                if recording.track_keys:
+                    for i, key in enumerate(recording.track_keys):
+                        files.append((f"Track {i}", key, recording.bucket_name))
+
+        # Cloud video
+        if t.meeting_id:
+            meeting = await meetings_controller.get_by_id(t.meeting_id)
+            if meeting and meeting.daily_composed_video_s3_key:
+                files.append(("Cloud video", meeting.daily_composed_video_s3_key, None))
+
+        if not files:
+            print("No associated files found.")
+            return
+
+        print(f"{'Type':<25} {'Bucket':<30} {'S3 Key'}")
+        print("-" * 120)
+        for label, key, bucket in files:
+            print(f"{label:<25} {bucket or '(default)':<30} {key}")
+
+        # Generate presigned URLs
+        print("\nPresigned URLs (valid for 1 hour):")
+        print("-" * 120)
+        storage = get_transcripts_storage()
+        for label, key, bucket in files:
+            try:
+                url = await storage.get_file_url(key, bucket=bucket, expires_in=3600)
+                print(f"{label}: {url}")
+            except Exception as e:
+                print(f"{label}: ERROR - {e}")
+    finally:
+        await database.disconnect()
+
+
+async def download_files(transcript_id: str, output_dir: str):
+    """Download all files associated with a deleted transcript."""
+    database = get_database()
+    await database.connect()
+    try:
+        query = transcripts.select().where(transcripts.c.id == transcript_id)
+        result = await database.fetch_one(query)
+        if not result:
+            print(f"Transcript {transcript_id} not found.")
+            return
+
+        t = Transcript(**result)
+        if t.deleted_at is None:
+            print(f"Transcript {transcript_id} is not deleted.")
+            return
+
+        dest = os.path.join(output_dir, t.id)
+        os.makedirs(dest, exist_ok=True)
+
+        storage = get_transcripts_storage()
+
+        # Download transcript audio
+        if t.audio_location == "storage" and not t.audio_deleted:
+            try:
+                data = await storage.get_file(t.storage_audio_path)
+                path = os.path.join(dest, "audio.mp3")
+                with open(path, "wb") as f:
+                    f.write(data)
+                print(f"Downloaded: {path}")
+            except Exception as e:
+                print(f"Failed to download audio: {e}")
+
+        # Download recording files
+        if t.recording_id:
+            recording = await recordings_controller.get_by_id(t.recording_id)
+            if recording and recording.track_keys:
+                tracks_dir = os.path.join(dest, "tracks")
+                os.makedirs(tracks_dir, exist_ok=True)
+                for i, key in enumerate(recording.track_keys):
+                    try:
+                        data = await storage.get_file(key, bucket=recording.bucket_name)
+                        filename = os.path.basename(key) or f"track_{i}"
+                        path = os.path.join(tracks_dir, filename)
+                        with open(path, "wb") as f:
+                            f.write(data)
+                        print(f"Downloaded: {path}")
+                    except Exception as e:
+                        print(f"Failed to download track {i}: {e}")
+
+        # Download cloud video
+        if t.meeting_id:
+            meeting = await meetings_controller.get_by_id(t.meeting_id)
+            if meeting and meeting.daily_composed_video_s3_key:
+                try:
+                    source_storage = get_source_storage("daily")
+                    data = await source_storage.get_file(
+                        meeting.daily_composed_video_s3_key
+                    )
+                    path = os.path.join(dest, "cloud_video.mp4")
+                    with open(path, "wb") as f:
+                        f.write(data)
+                    print(f"Downloaded: {path}")
+                except Exception as e:
+                    print(f"Failed to download cloud video: {e}")
+
+        # Write metadata
+        metadata = {
+            "id": t.id,
+            "title": t.title,
+            "created_at": t.created_at.isoformat() if t.created_at else None,
+            "deleted_at": t.deleted_at.isoformat() if t.deleted_at else None,
+            "duration": t.duration,
+            "source_language": t.source_language,
+            "target_language": t.target_language,
+            "short_summary": t.short_summary,
+            "long_summary": t.long_summary,
+            "topics": [topic.model_dump() for topic in t.topics] if t.topics else [],
+            "participants": [p.model_dump() for p in t.participants]
+            if t.participants
+            else [],
+            "action_items": t.action_items,
+            "webvtt": t.webvtt,
+            "recording_id": t.recording_id,
+            "meeting_id": t.meeting_id,
+        }
+        path = os.path.join(dest, "metadata.json")
+        with open(path, "w") as f:
+            json.dump(metadata, f, indent=2, default=str)
+        print(f"Downloaded: {path}")
+
+        print(f"\nAll files saved to: {dest}")
+    finally:
+        await database.disconnect()
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Manage soft-deleted transcripts")
+    subparsers = parser.add_subparsers(dest="command", required=True)
+
+    subparsers.add_parser("list", help="List all deleted transcripts")
+
+    files_parser = subparsers.add_parser(
+        "files", help="List S3 keys for a deleted transcript"
+    )
+    files_parser.add_argument("transcript_id", help="Transcript ID")
+
+    download_parser = subparsers.add_parser(
+        "download", help="Download files for a deleted transcript"
+    )
+    download_parser.add_argument("transcript_id", help="Transcript ID")
+    download_parser.add_argument(
+        "--output-dir", default=".", help="Output directory (default: .)"
+    )
+
+    args = parser.parse_args()
+
+    if args.command == "list":
+        asyncio.run(list_deleted())
+    elif args.command == "files":
+        asyncio.run(list_files(args.transcript_id))
+    elif args.command == "download":
+        asyncio.run(download_files(args.transcript_id, args.output_dir))
+
+
+if __name__ == "__main__":
+    main()
--- a/server/reflector/views/transcripts.py
+++ b/server/reflector/views/transcripts.py
@@ -16,6 +16,7 @@ from pydantic import (

 import reflector.auth as auth
 from reflector.db import get_database
+from reflector.db.meetings import meetings_controller
 from reflector.db.recordings import recordings_controller
 from reflector.db.rooms import rooms_controller
 from reflector.db.search import (
@@ -112,6 +113,8 @@ class GetTranscriptMinimal(BaseModel):
    room_name: str | None = None
    audio_deleted: bool | None = None
    change_seq: int | None = None
+    has_cloud_video: bool = False
+    cloud_video_duration: int | None = None


 class TranscriptParticipantWithEmail(TranscriptParticipant):
@@ -501,6 +504,14 @@ async def transcript_get(
                )
            )

+    has_cloud_video = False
+    cloud_video_duration = None
+    if transcript.meeting_id:
+        meeting = await meetings_controller.get_by_id(transcript.meeting_id)
+        if meeting and meeting.daily_composed_video_s3_key:
+            has_cloud_video = True
+            cloud_video_duration = meeting.daily_composed_video_duration
+
    base_data = {
        "id": transcript.id,
        "user_id": transcript.user_id,
@@ -524,6 +535,8 @@ async def transcript_get(
        "audio_deleted": transcript.audio_deleted,
        "change_seq": transcript.change_seq,
        "participants": participants,
+        "has_cloud_video": has_cloud_video,
+        "cloud_video_duration": cloud_video_duration,
    }

    if transcript_format == "text":
--- a/server/reflector/views/transcripts_download.py
+++ b/server/reflector/views/transcripts_download.py
@@ -0,0 +1,169 @@
+"""
+Transcript download endpoint — generates a zip archive with all transcript files.
+"""
+
+import json
+import os
+import tempfile
+import zipfile
+from typing import Annotated
+
+from fastapi import APIRouter, Depends, HTTPException
+from fastapi.responses import StreamingResponse
+
+import reflector.auth as auth
+from reflector.db.meetings import meetings_controller
+from reflector.db.recordings import recordings_controller
+from reflector.db.transcripts import transcripts_controller
+from reflector.logger import logger
+from reflector.storage import get_source_storage, get_transcripts_storage
+
+router = APIRouter()
+
+
+@router.get(
+    "/transcripts/{transcript_id}/download/zip",
+    operation_id="transcript_download_zip",
+)
+async def transcript_download_zip(
+    transcript_id: str,
+    user: Annotated[auth.UserInfo, Depends(auth.current_user)],
+):
+    user_id = user["sub"]
+    transcript = await transcripts_controller.get_by_id_for_http(
+        transcript_id, user_id=user_id
+    )
+    if not transcripts_controller.user_can_mutate(transcript, user_id):
+        raise HTTPException(status_code=403, detail="Not authorized")
+
+    recording = None
+    if transcript.recording_id:
+        recording = await recordings_controller.get_by_id(transcript.recording_id)
+
+    meeting = None
+    if transcript.meeting_id:
+        meeting = await meetings_controller.get_by_id(transcript.meeting_id)
+
+    truncated_id = str(transcript.id).split("-")[0]
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        zip_path = os.path.join(tmpdir, f"transcript_{truncated_id}.zip")
+
+        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
+            # Transcript audio
+            if transcript.audio_location == "storage" and not transcript.audio_deleted:
+                try:
+                    storage = get_transcripts_storage()
+                    data = await storage.get_file(transcript.storage_audio_path)
+                    audio_path = os.path.join(tmpdir, "audio.mp3")
+                    with open(audio_path, "wb") as f:
+                        f.write(data)
+                    zf.write(audio_path, "audio.mp3")
+                except Exception as e:
+                    logger.warning(
+                        "Failed to download transcript audio for zip",
+                        exc_info=e,
+                        transcript_id=transcript.id,
+                    )
+            elif (
+                not transcript.audio_deleted
+                and hasattr(transcript, "audio_mp3_filename")
+                and transcript.audio_mp3_filename
+                and transcript.audio_mp3_filename.exists()
+            ):
+                zf.write(str(transcript.audio_mp3_filename), "audio.mp3")
+
+            # Recording tracks (multitrack)
+            if recording and recording.track_keys:
+                try:
+                    source_storage = get_source_storage(
+                        "daily" if recording.track_keys else None
+                    )
+                except Exception:
+                    source_storage = get_transcripts_storage()
+
+                for i, key in enumerate(recording.track_keys):
+                    try:
+                        data = await source_storage.get_file(
+                            key, bucket=recording.bucket_name
+                        )
+                        filename = os.path.basename(key) or f"track_{i}"
+                        track_path = os.path.join(tmpdir, f"track_{i}")
+                        with open(track_path, "wb") as f:
+                            f.write(data)
+                        zf.write(track_path, f"tracks/{filename}")
+                    except Exception as e:
+                        logger.warning(
+                            "Failed to download track for zip",
+                            exc_info=e,
+                            track_key=key,
+                        )
+
+            # Cloud video
+            if meeting and meeting.daily_composed_video_s3_key:
+                try:
+                    source_storage = get_source_storage("daily")
+                    data = await source_storage.get_file(
+                        meeting.daily_composed_video_s3_key
+                    )
+                    video_path = os.path.join(tmpdir, "cloud_video.mp4")
+                    with open(video_path, "wb") as f:
+                        f.write(data)
+                    zf.write(video_path, "cloud_video.mp4")
+                except Exception as e:
+                    logger.warning(
+                        "Failed to download cloud video for zip",
+                        exc_info=e,
+                        s3_key=meeting.daily_composed_video_s3_key,
+                    )
+
+            # Metadata JSON
+            metadata = {
+                "id": transcript.id,
+                "title": transcript.title,
+                "created_at": (
+                    transcript.created_at.isoformat() if transcript.created_at else None
+                ),
+                "duration": transcript.duration,
+                "source_language": transcript.source_language,
+                "target_language": transcript.target_language,
+                "short_summary": transcript.short_summary,
+                "long_summary": transcript.long_summary,
+                "topics": (
+                    [t.model_dump() for t in transcript.topics]
+                    if transcript.topics
+                    else []
+                ),
+                "participants": (
+                    [p.model_dump() for p in transcript.participants]
+                    if transcript.participants
+                    else []
+                ),
+                "action_items": transcript.action_items,
+                "webvtt": transcript.webvtt,
+                "recording_id": transcript.recording_id,
+                "meeting_id": transcript.meeting_id,
+            }
+            meta_path = os.path.join(tmpdir, "metadata.json")
+            with open(meta_path, "w") as f:
+                json.dump(metadata, f, indent=2, default=str)
+            zf.write(meta_path, "metadata.json")
+
+        # Read zip into memory before tmpdir is cleaned up
+        with open(zip_path, "rb") as f:
+            zip_bytes = f.read()
+
+    def iter_zip():
+        offset = 0
+        chunk_size = 64 * 1024
+        while offset < len(zip_bytes):
+            yield zip_bytes[offset : offset + chunk_size]
+            offset += chunk_size
+
+    return StreamingResponse(
+        iter_zip(),
+        media_type="application/zip",
+        headers={
+            "Content-Disposition": f"attachment; filename=transcript_{truncated_id}.zip"
+        },
+    )
--- a/server/reflector/views/transcripts_video.py
+++ b/server/reflector/views/transcripts_video.py
@@ -0,0 +1,75 @@
+"""
+Transcript cloud video endpoint — returns a presigned URL for streaming playback.
+"""
+
+from typing import Annotated, Optional
+
+import jwt
+from fastapi import APIRouter, Depends, HTTPException, status
+from pydantic import BaseModel
+
+import reflector.auth as auth
+from reflector.db.meetings import meetings_controller
+from reflector.db.transcripts import transcripts_controller
+from reflector.settings import settings
+from reflector.storage import get_source_storage
+
+router = APIRouter()
+
+
+class VideoUrlResponse(BaseModel):
+    url: str
+    duration: int | None = None
+    content_type: str = "video/mp4"
+
+
+@router.get(
+    "/transcripts/{transcript_id}/video/url",
+    operation_id="transcript_get_video_url",
+    response_model=VideoUrlResponse,
+)
+async def transcript_get_video_url(
+    transcript_id: str,
+    user: Annotated[Optional[auth.UserInfo], Depends(auth.current_user_optional)],
+    token: str | None = None,
+):
+    user_id = user["sub"] if user else None
+    if not user_id and token:
+        try:
+            token_user = await auth.verify_raw_token(token)
+        except Exception:
+            token_user = None
+        if not token_user:
+            try:
+                payload = jwt.decode(token, settings.SECRET_KEY, algorithms=["HS256"])
+                user_id = payload.get("sub")
+            except jwt.PyJWTError:
+                raise HTTPException(
+                    status_code=status.HTTP_401_UNAUTHORIZED,
+                    detail="Invalid or expired token",
+                )
+        else:
+            user_id = token_user["sub"]
+
+    transcript = await transcripts_controller.get_by_id_for_http(
+        transcript_id, user_id=user_id
+    )
+
+    if not transcript.meeting_id:
+        raise HTTPException(status_code=404, detail="No video available")
+
+    meeting = await meetings_controller.get_by_id(transcript.meeting_id)
+    if not meeting or not meeting.daily_composed_video_s3_key:
+        raise HTTPException(status_code=404, detail="No video available")
+
+    source_storage = get_source_storage("daily")
+    url = await source_storage.get_file_url(
+        meeting.daily_composed_video_s3_key,
+        operation="get_object",
+        expires_in=3600,
+    )
+
+    return VideoUrlResponse(
+        url=url,
+        duration=meeting.daily_composed_video_duration,
+    )
--- a/server/reflector/worker/cleanup.py
+++ b/server/reflector/worker/cleanup.py
@@ -90,7 +90,9 @@ async def cleanup_old_transcripts(
 ):
    """Delete old anonymous transcripts and their associated recordings/meetings."""
    query = transcripts.select().where(
-        (transcripts.c.created_at < cutoff_date) & (transcripts.c.user_id.is_(None))
+        (transcripts.c.created_at < cutoff_date)
+        & (transcripts.c.user_id.is_(None))
+        & (transcripts.c.deleted_at.is_(None))
    )
    old_transcripts = await db.fetch_all(query)

--- a/server/reflector/worker/process.py
+++ b/server/reflector/worker/process.py
@@ -104,6 +104,12 @@ async def process_recording(bucket_name: str, object_key: str):
    room = await rooms_controller.get_by_id(meeting.room_id)

    recording = await recordings_controller.get_by_object_key(bucket_name, object_key)
+    if recording and recording.deleted_at is not None:
+        logger.info(
+            "Skipping soft-deleted recording",
+            recording_id=recording.id,
+        )
+        return
    if not recording:
        recording = await recordings_controller.create(
            Recording(
@@ -115,6 +121,13 @@ async def process_recording(bucket_name: str, object_key: str):
        )

    transcript = await transcripts_controller.get_by_recording_id(recording.id)
+    if transcript and transcript.deleted_at is not None:
+        logger.info(
+            "Skipping soft-deleted transcript for recording",
+            recording_id=recording.id,
+            transcript_id=transcript.id,
+        )
+        return
    if transcript:
        await transcripts_controller.update(
            transcript,
@@ -262,6 +275,13 @@ async def _process_multitrack_recording_inner(
    # Check if recording already exists (reprocessing path)
    recording = await recordings_controller.get_by_id(recording_id)

+    if recording and recording.deleted_at is not None:
+        logger.info(
+            "Skipping soft-deleted recording",
+            recording_id=recording_id,
+        )
+        return
+
    if recording and recording.meeting_id:
        # Reprocessing: recording exists with meeting already linked
        meeting = await meetings_controller.get_by_id(recording.meeting_id)
@@ -341,6 +361,13 @@ async def _process_multitrack_recording_inner(
        )

    transcript = await transcripts_controller.get_by_recording_id(recording.id)
+    if transcript and transcript.deleted_at is not None:
+        logger.info(
+            "Skipping soft-deleted transcript for recording",
+            recording_id=recording.id,
+            transcript_id=transcript.id,
+        )
+        return
    if not transcript:
        transcript = await transcripts_controller.add(
            "",
--- a/server/tests/test_cleanup.py
+++ b/server/tests/test_cleanup.py
@@ -76,8 +76,10 @@ async def test_cleanup_old_public_data_deletes_old_anonymous_transcripts():
    assert result["transcripts_deleted"] == 1
    assert result["errors"] == []

-    # Verify old anonymous transcript was deleted
-    assert await transcripts_controller.get_by_id(old_transcript.id) is None
+    # Verify old anonymous transcript was soft-deleted
+    old = await transcripts_controller.get_by_id(old_transcript.id)
+    assert old is not None
+    assert old.deleted_at is not None

    # Verify new anonymous transcript still exists
    assert await transcripts_controller.get_by_id(new_transcript.id) is not None
@@ -150,15 +152,17 @@ async def test_cleanup_deletes_associated_meeting_and_recording():
    assert result["recordings_deleted"] == 1
    assert result["errors"] == []

-    # Verify transcript was deleted
-    assert await transcripts_controller.get_by_id(old_transcript.id) is None
+    # Verify transcript was soft-deleted
+    old = await transcripts_controller.get_by_id(old_transcript.id)
+    assert old is not None
+    assert old.deleted_at is not None

-    # Verify meeting was deleted
+    # Verify meeting was hard-deleted (cleanup deletes meetings directly)
    query = meetings.select().where(meetings.c.id == meeting_id)
    meeting_result = await get_database().fetch_one(query)
    assert meeting_result is None

-    # Verify recording was deleted
+    # Verify recording was hard-deleted (cleanup deletes recordings directly)
    assert await recordings_controller.get_by_id(recording.id) is None


--- a/server/tests/test_transcripts.py
+++ b/server/tests/test_transcripts.py
@@ -1,7 +1,8 @@
 import pytest

+from reflector.db.recordings import Recording, recordings_controller
 from reflector.db.rooms import rooms_controller
-from reflector.db.transcripts import transcripts_controller
+from reflector.db.transcripts import SourceKind, transcripts_controller


@pytest.mark.asyncio
@@ -192,9 +193,93 @@ async def test_transcript_delete(authenticated_client, client):
    assert response.status_code == 200
    assert response.json()["status"] == "ok"

+    # API returns 404 for soft-deleted transcripts
    response = await client.get(f"/transcripts/{tid}")
    assert response.status_code == 404

+    # But the transcript still exists in DB with deleted_at set
+    transcript = await transcripts_controller.get_by_id(tid)
+    assert transcript is not None
+    assert transcript.deleted_at is not None
+
+
+@pytest.mark.asyncio
+async def test_deleted_transcript_not_in_list(authenticated_client, client):
+    """The list endpoint stops returning a transcript once it is deleted."""
+    created = await client.post("/transcripts", json={"name": "testdel_list"})
+    assert created.status_code == 200
+    tid = created.json()["id"]
+
+    # Before deletion the transcript is listed
+    listing = await client.get("/transcripts")
+    assert listing.status_code == 200
+    assert any(item["id"] == tid for item in listing.json()["items"])
+
+    # Delete it through the API
+    deletion = await client.delete(f"/transcripts/{tid}")
+    assert deletion.status_code == 200
+
+    # After deletion it is gone from the listing
+    listing = await client.get("/transcripts")
+    assert listing.status_code == 200
+    assert all(item["id"] != tid for item in listing.json()["items"])
+
+
+@pytest.mark.asyncio
+async def test_delete_already_deleted_is_idempotent(authenticated_client, client):
+    """Deleting an already-deleted transcript is idempotent (returns 200).
+
+    The second delete must succeed and must leave the ``deleted_at`` timestamp
+    written by the first delete untouched.
+    """
+    response = await client.post("/transcripts", json={"name": "testdel_idem"})
+    assert response.status_code == 200
+    tid = response.json()["id"]
+
+    # First delete
+    response = await client.delete(f"/transcripts/{tid}")
+    assert response.status_code == 200
+
+    # Capture the timestamp written by the first delete so the second delete
+    # can be checked against it
+    transcript = await transcripts_controller.get_by_id(tid)
+    assert transcript is not None
+    first_deleted_at = transcript.deleted_at
+    assert first_deleted_at is not None
+
+    # Second delete — idempotent, still returns ok
+    response = await client.delete(f"/transcripts/{tid}")
+    assert response.status_code == 200
+
+    # deleted_at was only set once: the second delete did not overwrite it
+    transcript = await transcripts_controller.get_by_id(tid)
+    assert transcript is not None
+    assert transcript.deleted_at == first_deleted_at
+
+
+@pytest.mark.asyncio
+async def test_deleted_transcript_recording_soft_deleted(authenticated_client, client):
+    """Deleting a transcript through the API marks its recording as deleted too."""
+    from datetime import datetime, timezone
+
+    new_recording = Recording(
+        bucket_name="test-bucket",
+        object_key="test.mp4",
+        recorded_at=datetime.now(timezone.utc),
+    )
+    recording = await recordings_controller.create(new_recording)
+    transcript = await transcripts_controller.add(
+        name="with-recording",
+        source_kind=SourceKind.ROOM,
+        recording_id=recording.id,
+        user_id="randomuserid",
+    )
+
+    deletion = await client.delete(f"/transcripts/{transcript.id}")
+    assert deletion.status_code == 200
+
+    # The recording row is kept and carries a deletion timestamp
+    stored_recording = await recordings_controller.get_by_id(recording.id)
+    assert stored_recording is not None
+    assert stored_recording.deleted_at is not None
+
+    # The transcript row is kept and carries a deletion timestamp
+    stored_transcript = await transcripts_controller.get_by_id(transcript.id)
+    assert stored_transcript is not None
+    assert stored_transcript.deleted_at is not None
+

@pytest.mark.asyncio
 async def test_transcript_mark_reviewed(authenticated_client, client):
--- a/server/tests/test_transcripts_download.py
+++ b/server/tests/test_transcripts_download.py
@@ -0,0 +1,36 @@
+import io
+import zipfile
+
+import pytest
+
+
+@pytest.mark.asyncio
+async def test_download_zip_returns_valid_zip(
+    authenticated_client, client, fake_transcript_with_topics
+):
+    """The zip download endpoint answers with a readable zip archive."""
+    download_path = f"/transcripts/{fake_transcript_with_topics.id}/download/zip"
+    response = await client.get(download_path)
+    assert response.status_code == 200
+    assert response.headers["content-type"] == "application/zip"
+
+    # The payload must open as a zip archive holding the expected members
+    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
+        members = archive.namelist()
+    assert "metadata.json" in members
+    assert "audio.mp3" in members
+
+
+@pytest.mark.asyncio
+async def test_download_zip_requires_auth(client):
+    """A zip download without authentication is rejected (401, 403 or 422)."""
+    rejected_statuses = (401, 403, 422)
+    reply = await client.get("/transcripts/nonexistent/download/zip")
+    assert reply.status_code in rejected_statuses
+
+
+@pytest.mark.asyncio
+async def test_download_zip_not_found(authenticated_client, client):
+    """Requesting the zip of an unknown transcript id yields 404."""
+    missing_path = "/transcripts/nonexistent-id/download/zip"
+    reply = await client.get(missing_path)
+    assert reply.status_code == 404
--- a/server/tests/test_transcripts_recording_deletion.py
+++ b/server/tests/test_transcripts_recording_deletion.py
@@ -1,5 +1,4 @@
 from datetime import datetime, timezone
-from unittest.mock import AsyncMock, patch

 import pytest

@@ -9,6 +8,7 @@ from reflector.db.transcripts import SourceKind, transcripts_controller

@pytest.mark.asyncio
 async def test_recording_deleted_with_transcript():
+    """Soft-delete: recording and transcript remain in DB with deleted_at set, no files deleted."""
    recording = await recordings_controller.create(
        Recording(
            bucket_name="test-bucket",
@@ -22,16 +22,13 @@ async def test_recording_deleted_with_transcript():
        recording_id=recording.id,
    )

-    with patch("reflector.db.transcripts.get_transcripts_storage") as mock_get_storage:
-        storage_instance = mock_get_storage.return_value
-        storage_instance.delete_file = AsyncMock()
+    await transcripts_controller.remove_by_id(transcript.id)

-        await transcripts_controller.remove_by_id(transcript.id)
+    # Both should still exist in DB but with deleted_at set
+    rec = await recordings_controller.get_by_id(recording.id)
+    assert rec is not None
+    assert rec.deleted_at is not None

-        # Should be called with bucket override
-        storage_instance.delete_file.assert_awaited_once_with(
-            recording.object_key, bucket=recording.bucket_name
-        )
-
-    assert await recordings_controller.get_by_id(recording.id) is None
-    assert await transcripts_controller.get_by_id(transcript.id) is None
+    tr = await transcripts_controller.get_by_id(transcript.id)
+    assert tr is not None
+    assert tr.deleted_at is not None
--- a/server/tests/test_transcripts_video.py
+++ b/server/tests/test_transcripts_video.py
@@ -0,0 +1,105 @@
+from datetime import datetime, timedelta, timezone
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+from reflector.db.transcripts import SourceKind, transcripts_controller
+
+
+@pytest.mark.asyncio
+async def test_video_url_returns_404_when_no_meeting(authenticated_client, client):
+    """The video URL endpoint answers 404 for a transcript without a meeting."""
+    created = await client.post("/transcripts", json={"name": "no-meeting"})
+    assert created.status_code == 200
+    tid = created.json()["id"]
+
+    video_reply = await client.get(f"/transcripts/{tid}/video/url")
+    assert video_reply.status_code == 404
+
+
+@pytest.mark.asyncio
+async def test_video_url_returns_404_when_no_cloud_video(authenticated_client, client):
+    """The video URL endpoint answers 404 when the meeting has no video key."""
+    from reflector.db import get_database
+    from reflector.db.meetings import meetings
+
+    meeting_id = "test-meeting-no-video"
+
+    # Meeting row inserted without any composed-video columns
+    insert_meeting = meetings.insert().values(
+        id=meeting_id,
+        room_name="No Video Meeting",
+        room_url="https://example.com",
+        host_room_url="https://example.com/host",
+        start_date=datetime.now(timezone.utc),
+        end_date=datetime.now(timezone.utc) + timedelta(hours=1),
+        room_id=None,
+    )
+    await get_database().execute(insert_meeting)
+
+    transcript = await transcripts_controller.add(
+        name="with-meeting",
+        source_kind=SourceKind.ROOM,
+        meeting_id=meeting_id,
+        user_id="randomuserid",
+    )
+
+    video_reply = await client.get(f"/transcripts/{transcript.id}/video/url")
+    assert video_reply.status_code == 404
+
+
+@pytest.mark.asyncio
+async def test_video_url_returns_presigned_url(authenticated_client, client):
+    """The video URL endpoint returns the storage URL, duration and MIME type."""
+    from reflector.db import get_database
+    from reflector.db.meetings import meetings
+
+    meeting_id = "test-meeting-with-video"
+
+    # Meeting row carrying a composed video key and duration
+    insert_meeting = meetings.insert().values(
+        id=meeting_id,
+        room_name="Video Meeting",
+        room_url="https://example.com",
+        host_room_url="https://example.com/host",
+        start_date=datetime.now(timezone.utc),
+        end_date=datetime.now(timezone.utc) + timedelta(hours=1),
+        room_id=None,
+        daily_composed_video_s3_key="recordings/video.mp4",
+        daily_composed_video_duration=120,
+    )
+    await get_database().execute(insert_meeting)
+
+    transcript = await transcripts_controller.add(
+        name="with-video",
+        source_kind=SourceKind.ROOM,
+        meeting_id=meeting_id,
+        user_id="randomuserid",
+    )
+
+    # Stand-in storage whose get_file_url yields a fixed URL
+    fake_storage = AsyncMock()
+    fake_storage.get_file_url = AsyncMock(
+        return_value="https://s3.example.com/presigned-url"
+    )
+    with patch(
+        "reflector.views.transcripts_video.get_source_storage",
+        return_value=fake_storage,
+    ):
+        response = await client.get(f"/transcripts/{transcript.id}/video/url")
+
+    assert response.status_code == 200
+    body = response.json()
+    assert body["url"] == "https://s3.example.com/presigned-url"
+    assert body["duration"] == 120
+    assert body["content_type"] == "video/mp4"
+
+
+@pytest.mark.asyncio
+async def test_transcript_get_includes_video_fields(authenticated_client, client):
+    """A fresh transcript reports no cloud video and no cloud video duration."""
+    created = await client.post("/transcripts", json={"name": "video-fields"})
+    assert created.status_code == 200
+    tid = created.json()["id"]
+
+    fetched = await client.get(f"/transcripts/{tid}")
+    assert fetched.status_code == 200
+    body = fetched.json()
+    assert body["has_cloud_video"] is False
+    assert body["cloud_video_duration"] is None