update audio-deleted flow

This commit is contained in:
Igor Loskutov
2025-06-18 15:43:34 -04:00
parent 6cb6d90b9a
commit c23e0e07ef
15 changed files with 282 additions and 76 deletions

View File

@@ -0,0 +1,25 @@
"""add audio_deleted field to transcript
Revision ID: 20250618140000
Revises: 20250617140003
Create Date: 2025-06-18 14:00:00.000000
"""
from typing import Sequence, Union
import sqlalchemy as sa
from alembic import op
# revision identifiers, used by Alembic.
revision: str = "20250618140000"
down_revision: Union[str, None] = "20250617140003"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
op.add_column("transcript", sa.Column("audio_deleted", sa.Boolean(), nullable=True))
def downgrade() -> None:
op.drop_column("transcript", "audio_deleted")

View File

@@ -70,6 +70,7 @@ transcripts = sqlalchemy.Table(
Enum(SourceKind, values_callable=lambda obj: [e.value for e in obj]),
nullable=False,
),
sqlalchemy.Column("audio_deleted", sqlalchemy.Boolean, nullable=True),
)
def generate_transcript_name() -> str:
@@ -157,6 +158,7 @@ class Transcript(BaseModel):
recording_id: str | None = None
zulip_message_id: int | None = None
source_kind: SourceKind
audio_deleted: bool | None = None
def add_event(self, event: str, data: BaseModel) -> TranscriptEvent:
ev = TranscriptEvent(event=event, data=data.model_dump())
@@ -544,8 +546,14 @@ class TranscriptController:
Move mp3 file to storage
"""
if transcript.audio_deleted:
raise FileNotFoundError(f"Invalid state of transcript {transcript.id}: audio_deleted mark is set true")
if transcript.audio_location == "local":
# store the audio on external storage if it's not already there
if not transcript.audio_mp3_filename.exists():
raise FileNotFoundError(f"Audio file not found: {transcript.audio_mp3_filename}")
await get_transcripts_storage().put_file(
transcript.storage_audio_path,
transcript.audio_mp3_filename.read_bytes(),

View File

@@ -59,6 +59,13 @@ from reflector.zulip import (
send_message_to_zulip,
update_zulip_message,
)
from reflector.db.meetings import meetings_controller, meeting_consent_controller
from reflector.db.recordings import recordings_controller
from reflector.storage import get_transcripts_storage
import boto3
from structlog import BoundLogger as Logger
@@ -470,6 +477,7 @@ class PipelineMainWaveform(PipelineMainFromTopics):
@get_transcript
async def pipeline_remove_upload(transcript: Transcript, logger: Logger):
# Note for future changes: a consent process also happens beforehand, and users may not consent to keeping files. Currently we delete the uploads regardless, so no consent check is needed here.
logger.info("Starting remove upload")
uploads = transcript.data_path.glob("upload.*")
for upload in uploads:
@@ -520,6 +528,10 @@ async def pipeline_upload_mp3(transcript: Transcript, logger: Logger):
logger.info("No storage backend configured, skipping mp3 upload")
return
if transcript.audio_deleted:
logger.info("Skipping MP3 upload - audio marked as deleted")
return
logger.info("Starting upload mp3")
# If the audio mp3 is not available, just skip
@@ -558,6 +570,67 @@ async def pipeline_summaries(transcript: Transcript, logger: Logger):
logger.info("Summaries done")
@get_transcript
async def cleanup_consent(transcript: Transcript, logger: Logger):
logger.info("Starting consent cleanup")
consent_denied = False
recording = None
try:
if transcript.recording_id:
recording = await recordings_controller.get_by_id(transcript.recording_id)
if recording and recording.meeting_id:
meeting = await meetings_controller.get_by_id(recording.meeting_id)
if meeting:
consent_denied = await meeting_consent_controller.has_any_denial(meeting.id)
except Exception as e:
logger.error(f"Failed to get fetch consent: {e}")
consent_denied = True
if not consent_denied:
logger.info("Consent approved, keeping all files")
return
logger.info("Consent denied, cleaning up all related audio files")
# 1. Delete original Whereby recording from S3
if recording and recording.s3_bucket and recording.s3_key:
s3_whereby = boto3.client(
"s3",
aws_access_key_id=settings.AWS_WHEREBY_ACCESS_KEY_ID,
aws_secret_access_key=settings.AWS_WHEREBY_ACCESS_KEY_SECRET,
)
try:
s3_whereby.delete_object(Bucket=recording.s3_bucket, Key=recording.s3_key)
logger.info(f"Deleted original Whereby recording: {recording.s3_bucket}/{recording.s3_key}")
except Exception as e:
logger.error(f"Failed to delete Whereby recording: {e}")
# Non-transactional: the transcript may be marked audio_deleted even if some of the deletions below fail.
await transcripts_controller.update(transcript, {"audio_deleted": True})
# 2. Delete processed audio from transcript storage S3 bucket
if transcript.audio_location == "storage":
storage = get_transcripts_storage()
try:
await storage.delete_file(transcript.storage_audio_path)
logger.info(f"Deleted processed audio from storage: {transcript.storage_audio_path}")
except Exception as e:
logger.error(f"Failed to delete processed audio: {e}")
# 3. Delete local audio files
try:
if hasattr(transcript, "audio_mp3_filename") and transcript.audio_mp3_filename:
transcript.audio_mp3_filename.unlink(missing_ok=True)
if hasattr(transcript, "audio_wav_filename") and transcript.audio_wav_filename:
transcript.audio_wav_filename.unlink(missing_ok=True)
except Exception as e:
logger.error(f"Failed to delete local audio files: {e}")
logger.info("Consent cleanup done")
@get_transcript
async def pipeline_post_to_zulip(transcript: Transcript, logger: Logger):
logger.info("Starting post to zulip")
@@ -659,6 +732,12 @@ async def task_pipeline_final_summaries(*, transcript_id: str):
await pipeline_summaries(transcript_id=transcript_id)
@shared_task
@asynctask
async def task_cleanup_consent(*, transcript_id: str):
await cleanup_consent(transcript_id=transcript_id)
@shared_task
@asynctask
async def task_pipeline_post_to_zulip(*, transcript_id: str):
@@ -675,6 +754,7 @@ def pipeline_post(*, transcript_id: str):
| task_pipeline_upload_mp3.si(transcript_id=transcript_id)
| task_pipeline_remove_upload.si(transcript_id=transcript_id)
| task_pipeline_diarization.si(transcript_id=transcript_id)
| task_cleanup_consent.si(transcript_id=transcript_id)
)
chain_title_preview = task_pipeline_title.si(transcript_id=transcript_id)
chain_final_summaries = task_pipeline_final_summaries.si(
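For context on the chain above: each step is added with .si(), Celery's immutable signature, so a task receives only its own transcript_id keyword and never the previous task's return value. A minimal sketch of that pattern with illustrative task names (not from this codebase):

from celery import shared_task

@shared_task
def step_a(*, transcript_id: str) -> None:
    ...

@shared_task
def step_b(*, transcript_id: str) -> None:
    ...

# .si() freezes the arguments and ignores the parent task's result, so a new
# step such as task_cleanup_consent can be appended to the chain without
# changing the signature of any neighbouring task.
workflow = step_a.si(transcript_id="abc") | step_b.si(transcript_id="abc")
# workflow.delay()  # enqueue the whole chain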

View File

@@ -43,6 +43,10 @@ def range_requests_response(
):
"""Returns StreamingResponse using Range Requests of a given file"""
if not os.path.exists(file_path):
from fastapi import HTTPException
raise HTTPException(status_code=404, detail="File not found")
file_size = os.stat(file_path).st_size
range_header = request.headers.get("range")

View File

@@ -65,6 +65,7 @@ class GetTranscript(BaseModel):
source_kind: SourceKind
room_id: str | None = None
room_name: str | None = None
audio_deleted: bool | None = None
class CreateTranscript(BaseModel):
@@ -82,6 +83,7 @@ class UpdateTranscript(BaseModel):
share_mode: Optional[Literal["public", "semi-private", "private"]] = Field(None)
participants: Optional[list[TranscriptParticipant]] = Field(None)
reviewed: Optional[bool] = Field(None)
audio_deleted: Optional[bool] = Field(None)
class DeletionStatus(BaseModel):

View File

@@ -86,8 +86,11 @@ async def transcript_get_audio_mp3(
headers=resp.headers,
)
if transcript.audio_deleted:
raise HTTPException(status_code=404, detail="Audio unavailable due to privacy settings")
if not transcript.audio_mp3_filename.exists():
raise HTTPException(status_code=500, detail="Audio not found")
raise HTTPException(status_code=404, detail="Audio file not found")
truncated_id = str(transcript.id).split("-")[0]
filename = f"recording_{truncated_id}.mp3"

View File

@@ -9,7 +9,7 @@ import structlog
from celery import shared_task
from celery.utils.log import get_task_logger
from pydantic import ValidationError
from reflector.db.meetings import meeting_consent_controller, meetings_controller
from reflector.db.meetings import meetings_controller
from reflector.db.recordings import Recording, recordings_controller
from reflector.db.rooms import rooms_controller
from reflector.db.transcripts import SourceKind, transcripts_controller
@@ -131,52 +131,6 @@ async def process_recording(bucket_name: str, object_key: str):
await transcripts_controller.update(transcript, {"status": "uploaded"})
task_pipeline_process.delay(transcript_id=transcript.id)
# Check if any participant denied consent after transcript processing is complete
should_delete = await meeting_consent_controller.has_any_denial(meeting.id)
if should_delete:
logger.info(f"Deleting audio files for {object_key} due to consent denial")
await delete_audio_files(transcript, bucket_name, object_key)
async def delete_audio_files(transcript, bucket_name: str, object_key: str):
"""Delete ONLY audio files from all locations, keep transcript data"""
try:
# 1. Delete original Whereby recording from S3
s3_whereby = boto3.client(
"s3",
aws_access_key_id=settings.AWS_WHEREBY_ACCESS_KEY_ID,
aws_secret_access_key=settings.AWS_WHEREBY_ACCESS_KEY_SECRET,
)
s3_whereby.delete_object(Bucket=bucket_name, Key=object_key)
logger.info(f"Deleted original Whereby recording: {bucket_name}/{object_key}")
# 2. Delete processed audio from transcript storage S3 bucket
if transcript.audio_location == "storage":
storage = get_transcripts_storage()
await storage.delete_file(transcript.storage_audio_path)
logger.info(f"Deleted processed audio from storage: {transcript.storage_audio_path}")
# 3. Delete local audio files (if any remain)
if hasattr(transcript, 'audio_mp3_filename') and transcript.audio_mp3_filename:
transcript.audio_mp3_filename.unlink(missing_ok=True)
if hasattr(transcript, 'audio_wav_filename') and transcript.audio_wav_filename:
transcript.audio_wav_filename.unlink(missing_ok=True)
upload_path = transcript.data_path / f"upload{os.path.splitext(object_key)[1]}"
upload_path.unlink(missing_ok=True)
# 4. Update transcript to reflect audio deletion (keep all other data)
await transcripts_controller.update(transcript, {
'audio_location_deleted': True
})
logger.info(f"Deleted all audio files for transcript {transcript.id}, kept transcript data")
except Exception as e:
logger.error(f"Failed to delete audio files for {object_key}: {str(e)}")
@shared_task
@asynctask