mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-20 12:19:06 +00:00
update audio-deleted flow
This commit is contained in:
@@ -0,0 +1,25 @@
|
||||
"""add audio_deleted field to transcript
|
||||
|
||||
Revision ID: 20250618140000
|
||||
Revises: 20250617140003
|
||||
Create Date: 2025-06-18 14:00:00.000000
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
import sqlalchemy as sa
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = "20250618140000"
|
||||
down_revision: Union[str, None] = "20250617140003"
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add the nullable boolean ``audio_deleted`` column to ``transcript``."""
    audio_deleted_column = sa.Column("audio_deleted", sa.Boolean(), nullable=True)
    op.add_column("transcript", audio_deleted_column)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop the ``audio_deleted`` column from ``transcript``."""
    op.drop_column(table_name="transcript", column_name="audio_deleted")
|
||||
@@ -70,6 +70,7 @@ transcripts = sqlalchemy.Table(
|
||||
Enum(SourceKind, values_callable=lambda obj: [e.value for e in obj]),
|
||||
nullable=False,
|
||||
),
|
||||
sqlalchemy.Column("audio_deleted", sqlalchemy.Boolean, nullable=True),
|
||||
)
|
||||
|
||||
def generate_transcript_name() -> str:
|
||||
@@ -157,6 +158,7 @@ class Transcript(BaseModel):
|
||||
recording_id: str | None = None
|
||||
zulip_message_id: int | None = None
|
||||
source_kind: SourceKind
|
||||
audio_deleted: bool | None = None
|
||||
|
||||
def add_event(self, event: str, data: BaseModel) -> TranscriptEvent:
|
||||
ev = TranscriptEvent(event=event, data=data.model_dump())
|
||||
@@ -544,8 +546,14 @@ class TranscriptController:
|
||||
Move mp3 file to storage
|
||||
"""
|
||||
|
||||
if transcript.audio_deleted:
|
||||
raise FileNotFoundError(f"Invalid state of transcript {transcript.id}: audio_deleted mark is set true")
|
||||
|
||||
if transcript.audio_location == "local":
|
||||
# store the audio on external storage if it's not already there
|
||||
if not transcript.audio_mp3_filename.exists():
|
||||
raise FileNotFoundError(f"Audio file not found: {transcript.audio_mp3_filename}")
|
||||
|
||||
await get_transcripts_storage().put_file(
|
||||
transcript.storage_audio_path,
|
||||
transcript.audio_mp3_filename.read_bytes(),
|
||||
|
||||
@@ -59,6 +59,13 @@ from reflector.zulip import (
|
||||
send_message_to_zulip,
|
||||
update_zulip_message,
|
||||
)
|
||||
|
||||
from reflector.db.meetings import meetings_controller, meeting_consent_controller
|
||||
from reflector.db.recordings import recordings_controller
|
||||
from reflector.storage import get_transcripts_storage
|
||||
|
||||
import boto3
|
||||
|
||||
from structlog import BoundLogger as Logger
|
||||
|
||||
|
||||
@@ -470,6 +477,7 @@ class PipelineMainWaveform(PipelineMainFromTopics):
|
||||
|
||||
@get_transcript
|
||||
async def pipeline_remove_upload(transcript: Transcript, logger: Logger):
|
||||
# NOTE for future changes: a consent process also happens beforehand, and users may not consent to keeping files. Currently we delete the uploads regardless, so no consent check is needed here.
|
||||
logger.info("Starting remove upload")
|
||||
uploads = transcript.data_path.glob("upload.*")
|
||||
for upload in uploads:
|
||||
@@ -520,6 +528,10 @@ async def pipeline_upload_mp3(transcript: Transcript, logger: Logger):
|
||||
logger.info("No storage backend configured, skipping mp3 upload")
|
||||
return
|
||||
|
||||
if transcript.audio_deleted:
|
||||
logger.info("Skipping MP3 upload - audio marked as deleted")
|
||||
return
|
||||
|
||||
logger.info("Starting upload mp3")
|
||||
|
||||
# If the audio mp3 is not available, just skip
|
||||
@@ -558,6 +570,67 @@ async def pipeline_summaries(transcript: Transcript, logger: Logger):
|
||||
logger.info("Summaries done")
|
||||
|
||||
|
||||
@get_transcript
async def cleanup_consent(transcript: Transcript, logger: Logger):
    """Remove all audio belonging to a transcript when consent was denied.

    The recording and meeting linked to the transcript are looked up and
    ``meeting_consent_controller.has_any_denial`` decides whether any
    denial exists. If that lookup raises, consent is treated as denied so
    that audio is not kept by accident.

    When consent is denied, the following is attempted, each step being
    best-effort (failures are logged and do not stop the remaining steps):

    1. delete the original Whereby recording from its S3 bucket;
    2. mark the transcript with ``audio_deleted = True``;
    3. delete the processed audio from transcript storage;
    4. delete the local mp3 and wav files.

    Args:
        transcript: Transcript whose audio may have to be removed.
        logger: Logger used for progress and error reporting.
    """
    logger.info("Starting consent cleanup")

    consent_denied = False
    recording = None
    try:
        if transcript.recording_id:
            recording = await recordings_controller.get_by_id(transcript.recording_id)
            if recording and recording.meeting_id:
                meeting = await meetings_controller.get_by_id(recording.meeting_id)
                if meeting:
                    consent_denied = await meeting_consent_controller.has_any_denial(meeting.id)
    except Exception as e:
        logger.error(f"Failed to fetch consent: {e}")
        # Fail closed: without a reliable answer, behave as if consent was denied.
        consent_denied = True

    if not consent_denied:
        logger.info("Consent approved, keeping all files")
        return

    logger.info("Consent denied, cleaning up all related audio files")

    # 1. Delete original Whereby recording from S3
    if recording and recording.s3_bucket and recording.s3_key:
        s3_whereby = boto3.client(
            "s3",
            aws_access_key_id=settings.AWS_WHEREBY_ACCESS_KEY_ID,
            aws_secret_access_key=settings.AWS_WHEREBY_ACCESS_KEY_SECRET,
        )
        try:
            s3_whereby.delete_object(Bucket=recording.s3_bucket, Key=recording.s3_key)
            logger.info(f"Deleted original Whereby recording: {recording.s3_bucket}/{recording.s3_key}")
        except Exception as e:
            logger.error(f"Failed to delete Whereby recording: {e}")

    # Not transactional: the transcript can end up marked as audio-deleted
    # even though one of the deletions below failed.
    await transcripts_controller.update(transcript, {"audio_deleted": True})

    # 2. Delete processed audio from transcript storage S3 bucket
    if transcript.audio_location == "storage":
        storage = get_transcripts_storage()
        try:
            await storage.delete_file(transcript.storage_audio_path)
            logger.info(f"Deleted processed audio from storage: {transcript.storage_audio_path}")
        except Exception as e:
            logger.error(f"Failed to delete processed audio: {e}")

    # 3. Delete local audio files. Each file is handled on its own so that a
    # failure on one of them does not leave the other one on disk.
    for path_attr in ("audio_mp3_filename", "audio_wav_filename"):
        try:
            local_audio = getattr(transcript, path_attr, None)
            if local_audio:
                local_audio.unlink(missing_ok=True)
        except Exception as e:
            logger.error(f"Failed to delete local audio files: {e}")

    logger.info("Consent cleanup done")
|
||||
|
||||
|
||||
@get_transcript
|
||||
async def pipeline_post_to_zulip(transcript: Transcript, logger: Logger):
|
||||
logger.info("Starting post to zulip")
|
||||
@@ -659,6 +732,12 @@ async def task_pipeline_final_summaries(*, transcript_id: str):
|
||||
await pipeline_summaries(transcript_id=transcript_id)
|
||||
|
||||
|
||||
@shared_task
@asynctask
async def task_cleanup_consent(*, transcript_id: str):
    """Celery task wrapper that runs ``cleanup_consent`` for ``transcript_id``."""
    await cleanup_consent(transcript_id=transcript_id)
|
||||
|
||||
|
||||
@shared_task
|
||||
@asynctask
|
||||
async def task_pipeline_post_to_zulip(*, transcript_id: str):
|
||||
@@ -675,6 +754,7 @@ def pipeline_post(*, transcript_id: str):
|
||||
| task_pipeline_upload_mp3.si(transcript_id=transcript_id)
|
||||
| task_pipeline_remove_upload.si(transcript_id=transcript_id)
|
||||
| task_pipeline_diarization.si(transcript_id=transcript_id)
|
||||
| task_cleanup_consent.si(transcript_id=transcript_id)
|
||||
)
|
||||
chain_title_preview = task_pipeline_title.si(transcript_id=transcript_id)
|
||||
chain_final_summaries = task_pipeline_final_summaries.si(
|
||||
|
||||
@@ -43,6 +43,10 @@ def range_requests_response(
|
||||
):
|
||||
"""Returns StreamingResponse using Range Requests of a given file"""
|
||||
|
||||
if not os.path.exists(file_path):
|
||||
from fastapi import HTTPException
|
||||
raise HTTPException(status_code=404, detail="File not found")
|
||||
|
||||
file_size = os.stat(file_path).st_size
|
||||
range_header = request.headers.get("range")
|
||||
|
||||
|
||||
@@ -65,6 +65,7 @@ class GetTranscript(BaseModel):
|
||||
source_kind: SourceKind
|
||||
room_id: str | None = None
|
||||
room_name: str | None = None
|
||||
audio_deleted: bool | None = None
|
||||
|
||||
|
||||
class CreateTranscript(BaseModel):
|
||||
@@ -82,6 +83,7 @@ class UpdateTranscript(BaseModel):
|
||||
share_mode: Optional[Literal["public", "semi-private", "private"]] = Field(None)
|
||||
participants: Optional[list[TranscriptParticipant]] = Field(None)
|
||||
reviewed: Optional[bool] = Field(None)
|
||||
audio_deleted: Optional[bool] = Field(None)
|
||||
|
||||
|
||||
class DeletionStatus(BaseModel):
|
||||
|
||||
@@ -86,8 +86,11 @@ async def transcript_get_audio_mp3(
|
||||
headers=resp.headers,
|
||||
)
|
||||
|
||||
if transcript.audio_deleted:
|
||||
raise HTTPException(status_code=404, detail="Audio unavailable due to privacy settings")
|
||||
|
||||
if not transcript.audio_mp3_filename.exists():
|
||||
raise HTTPException(status_code=500, detail="Audio not found")
|
||||
raise HTTPException(status_code=404, detail="Audio file not found")
|
||||
|
||||
truncated_id = str(transcript.id).split("-")[0]
|
||||
filename = f"recording_{truncated_id}.mp3"
|
||||
|
||||
@@ -9,7 +9,7 @@ import structlog
|
||||
from celery import shared_task
|
||||
from celery.utils.log import get_task_logger
|
||||
from pydantic import ValidationError
|
||||
from reflector.db.meetings import meeting_consent_controller, meetings_controller
|
||||
from reflector.db.meetings import meetings_controller
|
||||
from reflector.db.recordings import Recording, recordings_controller
|
||||
from reflector.db.rooms import rooms_controller
|
||||
from reflector.db.transcripts import SourceKind, transcripts_controller
|
||||
@@ -131,52 +131,6 @@ async def process_recording(bucket_name: str, object_key: str):
|
||||
await transcripts_controller.update(transcript, {"status": "uploaded"})
|
||||
|
||||
task_pipeline_process.delay(transcript_id=transcript.id)
|
||||
|
||||
# Check if any participant denied consent after transcript processing is complete
|
||||
should_delete = await meeting_consent_controller.has_any_denial(meeting.id)
|
||||
if should_delete:
|
||||
logger.info(f"Deleting audio files for {object_key} due to consent denial")
|
||||
await delete_audio_files(transcript, bucket_name, object_key)
|
||||
|
||||
|
||||
async def delete_audio_files(transcript, bucket_name: str, object_key: str):
|
||||
"""Delete ONLY audio files from all locations, keep transcript data"""
|
||||
|
||||
try:
|
||||
# 1. Delete original Whereby recording from S3
|
||||
s3_whereby = boto3.client(
|
||||
"s3",
|
||||
aws_access_key_id=settings.AWS_WHEREBY_ACCESS_KEY_ID,
|
||||
aws_secret_access_key=settings.AWS_WHEREBY_ACCESS_KEY_SECRET,
|
||||
)
|
||||
s3_whereby.delete_object(Bucket=bucket_name, Key=object_key)
|
||||
logger.info(f"Deleted original Whereby recording: {bucket_name}/{object_key}")
|
||||
|
||||
# 2. Delete processed audio from transcript storage S3 bucket
|
||||
if transcript.audio_location == "storage":
|
||||
storage = get_transcripts_storage()
|
||||
await storage.delete_file(transcript.storage_audio_path)
|
||||
logger.info(f"Deleted processed audio from storage: {transcript.storage_audio_path}")
|
||||
|
||||
# 3. Delete local audio files (if any remain)
|
||||
if hasattr(transcript, 'audio_mp3_filename') and transcript.audio_mp3_filename:
|
||||
transcript.audio_mp3_filename.unlink(missing_ok=True)
|
||||
if hasattr(transcript, 'audio_wav_filename') and transcript.audio_wav_filename:
|
||||
transcript.audio_wav_filename.unlink(missing_ok=True)
|
||||
|
||||
upload_path = transcript.data_path / f"upload{os.path.splitext(object_key)[1]}"
|
||||
upload_path.unlink(missing_ok=True)
|
||||
|
||||
# 4. Update transcript to reflect audio deletion (keep all other data)
|
||||
await transcripts_controller.update(transcript, {
|
||||
'audio_location_deleted': True
|
||||
})
|
||||
|
||||
logger.info(f"Deleted all audio files for transcript {transcript.id}, kept transcript data")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to delete audio files for {object_key}: {str(e)}")
|
||||
|
||||
|
||||
@shared_task
|
||||
@asynctask
|
||||
|
||||
Reference in New Issue
Block a user