mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-20 20:29:06 +00:00
* feat(cleanup): add automatic data retention for public instances - Add Celery task to clean up anonymous data after configurable retention period - Delete transcripts, meetings, and orphaned recordings older than retention days - Only runs when PUBLIC_MODE is enabled to prevent accidental data loss - Properly removes all associated files (local and S3 storage) - Add manual cleanup tool for testing and intervention - Configure retention via PUBLIC_DATA_RETENTION_DAYS setting (default: 7 days) Fixes #571 * fix: apply pre-commit formatting fixes * fix: properly delete recording files from storage during cleanup - Add storage deletion for orphaned recordings in both cleanup task and manual tool - Delete from storage before removing database records - Log warnings if storage deletion fails but continue with database cleanup * Apply suggestion from @pr-agent-monadical[bot] Co-authored-by: pr-agent-monadical[bot] <198624643+pr-agent-monadical[bot]@users.noreply.github.com> * Apply suggestion from @pr-agent-monadical[bot] Co-authored-by: pr-agent-monadical[bot] <198624643+pr-agent-monadical[bot]@users.noreply.github.com> * refactor: cleanup_old_data for better logging * fix: linting * test: fix meeting cleanup test to not require room controller - Simplify test by directly inserting meetings into database - Remove dependency on non-existent rooms_controller.create method - Tests now pass successfully * fix: linting * refactor: simplify cleanup tool to use worker implementation - Remove duplicate cleanup logic from manual tool - Use the same _cleanup_old_public_data function from worker - Remove dry-run feature as requested - Prevent code duplication and ensure consistency - Update documentation to reflect changes * refactor: split cleanup worker into smaller functions - Move all imports to the top of the file - Extract cleanup logic into separate functions: - cleanup_old_transcripts() - cleanup_old_meetings() - cleanup_orphaned_recordings() - log_cleanup_results() - Make 
code more maintainable and testable - Add days parameter support to Celery task - Update manual tool to work with refactored code * feat: add TypedDict typing for cleanup stats - Add CleanupStats TypedDict for better type safety - Update all function signatures to use proper typing - Add return type annotations to _cleanup_old_public_data - Improves code maintainability and IDE support * feat: add CASCADE DELETE to meeting_consent foreign key - Add ondelete="CASCADE" to meeting_consent.meeting_id foreign key - Generate and apply migration to update existing constraint - Remove manual consent deletion from cleanup code - Add unit test to verify CASCADE DELETE behavior * style: linting * fix: alembic migration branchpoint * fix: correct downgrade constraint name in CASCADE DELETE migration * fix: regenerate CASCADE DELETE migration with proper constraint names - Delete problematic migration and regenerate with correct names - Use explicit constraint name in both upgrade and downgrade - Ensure migration works bidirectionally - All tests passing including CASCADE DELETE test * style: linting * refactor: simplify cleanup to use transcripts as entry point - Remove orphaned_recordings cleanup (not part of this PR scope) - Remove separate old_meetings cleanup - Transcripts are now the main entry point for cleanup - Associated meetings and recordings are deleted with their transcript - Use single database connection for all operations - Update tests to reflect new approach * refactor: cleanup and rename functions for clarity - Rename _cleanup_old_public_data to cleanup_old_public_data (make public) - Rename celery task to cleanup_old_public_data_task for clarity - Update docstrings and improve code organization - Remove unnecessary comments and simplify deletion logic - Update tests to use new function names - All tests passing * style: linting\ * style: typing and review * fix: add transaction on cleanup_single_transcript * fix: naming --------- Co-authored-by: 
pr-agent-monadical[bot] <198624643+pr-agent-monadical[bot]@users.noreply.github.com>
60 lines
2.0 KiB
Python
60 lines
2.0 KiB
Python
import celery
import structlog
from celery import Celery
from celery.schedules import crontab

from reflector.settings import settings

logger = structlog.get_logger(__name__)

# Celery application bootstrap for the reflector worker.
#
# `app` is always bound when this module is imported.  A new Celery app is
# created and configured only while the current app's `main` is still
# "default"; otherwise the current app is reused and nothing below is applied
# to it.
if celery.current_app.main == "default":
    app = Celery(__name__)

    # Connection settings are taken from the project settings object.
    app.conf.broker_url = settings.CELERY_BROKER_URL
    app.conf.result_backend = settings.CELERY_RESULT_BACKEND
    app.conf.broker_connection_retry_on_startup = True

    # Modules handed to Celery's task autodiscovery.
    _task_modules = [
        "reflector.pipelines.main_live_pipeline",
        "reflector.worker.healthcheck",
        "reflector.worker.process",
        "reflector.worker.cleanup",
    ]
    app.autodiscover_tasks(_task_modules)

    # Periodic (beat) schedule.  The two polling tasks share one interval,
    # read from SQS_POLLING_TIMEOUT_SECONDS and coerced to float.
    _poll_interval = float(settings.SQS_POLLING_TIMEOUT_SECONDS)
    app.conf.beat_schedule = {
        "process_messages": {
            "task": "reflector.worker.process.process_messages",
            "schedule": _poll_interval,
        },
        "process_meetings": {
            "task": "reflector.worker.process.process_meetings",
            "schedule": _poll_interval,
        },
        "reprocess_failed_recordings": {
            "task": "reflector.worker.process.reprocess_failed_recordings",
            # Original note: "Midnight EST".  That reading presumes the
            # crontab is evaluated in UTC -- confirm against the timezone
            # configuration.
            "schedule": crontab(hour=5, minute=0),
        },
    }

    # The retention cleanup entry is added only when PUBLIC_MODE is set.
    if settings.PUBLIC_MODE:
        app.conf.beat_schedule["cleanup_old_public_data"] = {
            "task": "reflector.worker.cleanup.cleanup_old_public_data_task",
            "schedule": crontab(hour=3, minute=0),
        }
        logger.info(
            "Public mode cleanup enabled",
            retention_days=settings.PUBLIC_DATA_RETENTION_DAYS,
        )

    # The healthcheck ping is scheduled only when a URL is configured.
    if not settings.HEALTHCHECK_URL:
        logger.warning("Healthcheck disabled, no url configured")
    else:
        app.conf.beat_schedule["healthcheck_ping"] = {
            "task": "reflector.worker.healthcheck.healthcheck_ping",
            # 600.0; a plain number is an interval for beat (ten minutes if
            # interpreted as seconds, per Celery's documented convention).
            "schedule": 10 * 60.0,
        }
        logger.info("Healthcheck enabled", url=settings.HEALTHCHECK_URL)
else:
    # Another app is already current: log it and expose it as `app`.
    logger.info(f"Celery already configured ({celery.current_app})")
    app = celery.current_app