mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-20 20:29:06 +00:00
* feat(cleanup): add automatic data retention for public instances - Add Celery task to clean up anonymous data after configurable retention period - Delete transcripts, meetings, and orphaned recordings older than retention days - Only runs when PUBLIC_MODE is enabled to prevent accidental data loss - Properly removes all associated files (local and S3 storage) - Add manual cleanup tool for testing and intervention - Configure retention via PUBLIC_DATA_RETENTION_DAYS setting (default: 7 days) Fixes #571 * fix: apply pre-commit formatting fixes * fix: properly delete recording files from storage during cleanup - Add storage deletion for orphaned recordings in both cleanup task and manual tool - Delete from storage before removing database records - Log warnings if storage deletion fails but continue with database cleanup * Apply suggestion from @pr-agent-monadical[bot] Co-authored-by: pr-agent-monadical[bot] <198624643+pr-agent-monadical[bot]@users.noreply.github.com> * Apply suggestion from @pr-agent-monadical[bot] Co-authored-by: pr-agent-monadical[bot] <198624643+pr-agent-monadical[bot]@users.noreply.github.com> * refactor: cleanup_old_data for better logging * fix: linting * test: fix meeting cleanup test to not require room controller - Simplify test by directly inserting meetings into database - Remove dependency on non-existent rooms_controller.create method - Tests now pass successfully * fix: linting * refactor: simplify cleanup tool to use worker implementation - Remove duplicate cleanup logic from manual tool - Use the same _cleanup_old_public_data function from worker - Remove dry-run feature as requested - Prevent code duplication and ensure consistency - Update documentation to reflect changes * refactor: split cleanup worker into smaller functions - Move all imports to the top of the file - Extract cleanup logic into separate functions: - cleanup_old_transcripts() - cleanup_old_meetings() - cleanup_orphaned_recordings() - log_cleanup_results() - Make 
code more maintainable and testable - Add days parameter support to Celery task - Update manual tool to work with refactored code * feat: add TypedDict typing for cleanup stats - Add CleanupStats TypedDict for better type safety - Update all function signatures to use proper typing - Add return type annotations to _cleanup_old_public_data - Improves code maintainability and IDE support * feat: add CASCADE DELETE to meeting_consent foreign key - Add ondelete="CASCADE" to meeting_consent.meeting_id foreign key - Generate and apply migration to update existing constraint - Remove manual consent deletion from cleanup code - Add unit test to verify CASCADE DELETE behavior * style: linting * fix: alembic migration branchpoint * fix: correct downgrade constraint name in CASCADE DELETE migration * fix: regenerate CASCADE DELETE migration with proper constraint names - Delete problematic migration and regenerate with correct names - Use explicit constraint name in both upgrade and downgrade - Ensure migration works bidirectionally - All tests passing including CASCADE DELETE test * style: linting * refactor: simplify cleanup to use transcripts as entry point - Remove orphaned_recordings cleanup (not part of this PR scope) - Remove separate old_meetings cleanup - Transcripts are now the main entry point for cleanup - Associated meetings and recordings are deleted with their transcript - Use single database connection for all operations - Update tests to reflect new approach * refactor: cleanup and rename functions for clarity - Rename _cleanup_old_public_data to cleanup_old_public_data (make public) - Rename celery task to cleanup_old_public_data_task for clarity - Update docstrings and improve code organization - Remove unnecessary comments and simplify deletion logic - Update tests to use new function names - All tests passing * style: linting\ * style: typing and review * fix: add transaction on cleanup_single_transcript * fix: naming --------- Co-authored-by: 
pr-agent-monadical[bot] <198624643+pr-agent-monadical[bot]@users.noreply.github.com>
288 lines
9.7 KiB
Python
288 lines
9.7 KiB
Python
from datetime import datetime, timedelta, timezone
|
|
from unittest.mock import AsyncMock, patch
|
|
|
|
import pytest
|
|
|
|
from reflector.db.recordings import Recording, recordings_controller
|
|
from reflector.db.transcripts import SourceKind, transcripts_controller
|
|
from reflector.worker.cleanup import cleanup_old_public_data
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_cleanup_old_public_data_skips_when_not_public():
    """Cleanup returns None when the patched settings have PUBLIC_MODE off."""
    settings_patch = patch("reflector.worker.cleanup.settings")
    with settings_patch as fake_settings:
        fake_settings.PUBLIC_MODE = False

        outcome = await cleanup_old_public_data()

        # The function is expected to bail out before producing any result.
        assert outcome is None
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_cleanup_old_public_data_deletes_old_anonymous_transcripts():
    """Only anonymous transcripts older than the retention window are deleted.

    Three fixtures are created and backdated:

    - an anonymous transcript aged 8 days (expected to be deleted),
    - an anonymous transcript aged 2 days (expected to survive),
    - a user-owned transcript aged 8 days (expected to survive).

    Cleanup then runs with PUBLIC_MODE enabled and a 7-day retention period.
    """
    from reflector.db import get_database
    from reflector.db.transcripts import transcripts

    old_date = datetime.now(timezone.utc) - timedelta(days=8)
    new_date = datetime.now(timezone.utc) - timedelta(days=2)

    async def set_created_at(transcript_id, created_at):
        # add() is not given a creation date in this test, so the row is
        # rewritten directly to simulate the desired age.
        await get_database().execute(
            transcripts.update()
            .where(transcripts.c.id == transcript_id)
            .values(created_at=created_at)
        )

    # Old anonymous transcript (should be deleted)
    old_transcript = await transcripts_controller.add(
        name="Old Anonymous Transcript",
        source_kind=SourceKind.FILE,
        user_id=None,  # Anonymous
    )
    await set_created_at(old_transcript.id, old_date)

    # Recent anonymous transcript (should NOT be deleted). It is explicitly
    # backdated to new_date so the test exercises a non-zero age that is still
    # inside the retention window, instead of relying on the default timestamp.
    new_transcript = await transcripts_controller.add(
        name="New Anonymous Transcript",
        source_kind=SourceKind.FILE,
        user_id=None,  # Anonymous
    )
    await set_created_at(new_transcript.id, new_date)

    # Old transcript owned by a user (should NOT be deleted)
    old_user_transcript = await transcripts_controller.add(
        name="Old User Transcript",
        source_kind=SourceKind.FILE,
        user_id="user123",
    )
    await set_created_at(old_user_transcript.id, old_date)

    with patch("reflector.worker.cleanup.settings") as mock_settings:
        mock_settings.PUBLIC_MODE = True
        mock_settings.PUBLIC_DATA_RETENTION_DAYS = 7

        # Mock the storage deletion so no real storage backend is touched
        with patch("reflector.db.transcripts.get_transcripts_storage") as mock_storage:
            mock_storage.return_value.delete_file = AsyncMock()

            result = await cleanup_old_public_data()

    # Exactly one transcript removed, and no errors reported
    assert result["transcripts_deleted"] == 1
    assert result["errors"] == []

    # Old anonymous transcript is gone
    assert await transcripts_controller.get_by_id(old_transcript.id) is None

    # Recent anonymous transcript still exists
    assert await transcripts_controller.get_by_id(new_transcript.id) is not None

    # User-owned transcript still exists despite its age
    assert await transcripts_controller.get_by_id(old_user_transcript.id) is not None
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_cleanup_deletes_associated_meeting_and_recording():
    """Deleting an expired transcript also removes its linked meeting and recording."""
    from reflector.db import get_database
    from reflector.db.meetings import meetings
    from reflector.db.transcripts import transcripts

    expired_at = datetime.now(timezone.utc) - timedelta(days=8)

    # Meeting row inserted directly into the table
    meeting_id = "test-meeting-for-transcript"
    meeting_row = dict(
        id=meeting_id,
        room_name="Meeting with Transcript",
        room_url="https://example.com/meeting",
        host_room_url="https://example.com/meeting-host",
        start_date=expired_at,
        end_date=expired_at + timedelta(hours=1),
        user_id=None,
        room_id=None,
    )
    await get_database().execute(meetings.insert().values(**meeting_row))

    # Recording created through its controller
    recording_fixture = Recording(
        bucket_name="test-bucket",
        object_key="test-recording.mp4",
        recorded_at=expired_at,
    )
    recording = await recordings_controller.create(recording_fixture)

    # Anonymous transcript tied to both the meeting and the recording
    stale_transcript = await transcripts_controller.add(
        name="Old Transcript with Meeting and Recording",
        source_kind=SourceKind.ROOM,
        user_id=None,
        meeting_id=meeting_id,
        recording_id=recording.id,
    )

    # Push the transcript's creation date past the 7-day retention used below
    backdate = (
        transcripts.update()
        .where(transcripts.c.id == stale_transcript.id)
        .values(created_at=expired_at)
    )
    await get_database().execute(backdate)

    with patch("reflector.worker.cleanup.settings") as fake_settings:
        fake_settings.PUBLIC_MODE = True
        fake_settings.PUBLIC_DATA_RETENTION_DAYS = 7

        # Both storage factories are replaced so delete_file is an awaitable mock
        with patch(
            "reflector.db.transcripts.get_transcripts_storage"
        ) as transcript_storage, patch(
            "reflector.worker.cleanup.get_recordings_storage"
        ) as recording_storage:
            transcript_storage.return_value.delete_file = AsyncMock()
            recording_storage.return_value.delete_file = AsyncMock()

            result = await cleanup_old_public_data()

    # Reported stats: one of each entity removed, no errors
    assert result["transcripts_deleted"] == 1
    assert result["meetings_deleted"] == 1
    assert result["recordings_deleted"] == 1
    assert result["errors"] == []

    # Transcript is gone
    assert await transcripts_controller.get_by_id(stale_transcript.id) is None

    # Meeting row is gone
    remaining_meeting = await get_database().fetch_one(
        meetings.select().where(meetings.c.id == meeting_id)
    )
    assert remaining_meeting is None

    # Recording is gone
    assert await recordings_controller.get_by_id(recording.id) is None
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_cleanup_handles_errors_gracefully():
    """A failure on one transcript is reported while the other is still deleted."""
    from reflector.db import get_database
    from reflector.db.transcripts import transcripts

    expired_at = datetime.now(timezone.utc) - timedelta(days=8)

    # Two anonymous transcripts, created one after the other
    stale_transcripts = [
        await transcripts_controller.add(
            name=label,
            source_kind=SourceKind.FILE,
            user_id=None,
        )
        for label in ("Transcript 1", "Transcript 2")
    ]

    # Backdate both past the 7-day retention used below
    for stale in stale_transcripts:
        backdate = (
            transcripts.update()
            .where(transcripts.c.id == stale.id)
            .values(created_at=expired_at)
        )
        await get_database().execute(backdate)

    # Keep a handle on the real method before it gets patched
    real_remove = transcripts_controller.remove_by_id
    calls_seen = 0

    async def fail_first_then_delegate(transcript_id, user_id=None):
        # The first invocation raises; every later one runs the real removal.
        nonlocal calls_seen
        calls_seen += 1
        if calls_seen > 1:
            return await real_remove(transcript_id, user_id)
        raise Exception("Simulated deletion error")

    with patch("reflector.worker.cleanup.settings") as fake_settings:
        fake_settings.PUBLIC_MODE = True
        fake_settings.PUBLIC_DATA_RETENTION_DAYS = 7

        remove_patch = patch.object(
            transcripts_controller,
            "remove_by_id",
            side_effect=fail_first_then_delegate,
        )
        with remove_patch:
            result = await cleanup_old_public_data()

    # One deletion succeeded, one failure was recorded
    assert result["transcripts_deleted"] == 1
    assert len(result["errors"]) == 1
    assert "Failed to delete transcript" in result["errors"][0]
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_meeting_consent_cascade_delete():
    """Deleting a meeting row leaves no meeting_consent rows for that meeting."""
    from reflector.db import get_database
    from reflector.db.meetings import (
        meeting_consent,
        meeting_consent_controller,
        meetings,
    )

    # Parent meeting, inserted directly into the table
    meeting_id = "test-cascade-meeting"
    meeting_row = dict(
        id=meeting_id,
        room_name="Test Meeting for CASCADE",
        room_url="https://example.com/cascade-test",
        host_room_url="https://example.com/cascade-test-host",
        start_date=datetime.now(timezone.utc),
        end_date=datetime.now(timezone.utc) + timedelta(hours=1),
        user_id="test-user",
        room_id=None,
    )
    await get_database().execute(meetings.insert().values(**meeting_row))

    # Two consent rows referencing the meeting: one granted, one refused
    consent_fixtures = (
        ("consent-1", "user1", True),
        ("consent-2", "user2", False),
    )
    for consent_id, consent_user, was_given in consent_fixtures:
        await get_database().execute(
            meeting_consent.insert().values(
                id=consent_id,
                meeting_id=meeting_id,
                user_id=consent_user,
                consent_given=was_given,
                consent_timestamp=datetime.now(timezone.utc),
            )
        )

    # Sanity check: both consent rows are visible before the delete
    consents_before = await meeting_consent_controller.get_by_meeting_id(meeting_id)
    assert len(consents_before) == 2

    # Remove only the meeting row; consent rows are not touched explicitly
    delete_meeting = meetings.delete().where(meetings.c.id == meeting_id)
    await get_database().execute(delete_meeting)

    # The meeting itself is gone
    remaining_meeting = await get_database().fetch_one(
        meetings.select().where(meetings.c.id == meeting_id)
    )
    assert remaining_meeting is None

    # Consent rows disappeared along with it (CASCADE DELETE)
    consents_after = await meeting_consent_controller.get_by_meeting_id(meeting_id)
    assert len(consents_after) == 0
|