mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-21 20:59:05 +00:00
* feat(cleanup): add automatic data retention for public instances - Add Celery task to clean up anonymous data after configurable retention period - Delete transcripts, meetings, and orphaned recordings older than retention days - Only runs when PUBLIC_MODE is enabled to prevent accidental data loss - Properly removes all associated files (local and S3 storage) - Add manual cleanup tool for testing and intervention - Configure retention via PUBLIC_DATA_RETENTION_DAYS setting (default: 7 days) Fixes #571 * fix: apply pre-commit formatting fixes * fix: properly delete recording files from storage during cleanup - Add storage deletion for orphaned recordings in both cleanup task and manual tool - Delete from storage before removing database records - Log warnings if storage deletion fails but continue with database cleanup * Apply suggestion from @pr-agent-monadical[bot] Co-authored-by: pr-agent-monadical[bot] <198624643+pr-agent-monadical[bot]@users.noreply.github.com> * Apply suggestion from @pr-agent-monadical[bot] Co-authored-by: pr-agent-monadical[bot] <198624643+pr-agent-monadical[bot]@users.noreply.github.com> * refactor: cleanup_old_data for better logging * fix: linting * test: fix meeting cleanup test to not require room controller - Simplify test by directly inserting meetings into database - Remove dependency on non-existent rooms_controller.create method - Tests now pass successfully * fix: linting * refactor: simplify cleanup tool to use worker implementation - Remove duplicate cleanup logic from manual tool - Use the same _cleanup_old_public_data function from worker - Remove dry-run feature as requested - Prevent code duplication and ensure consistency - Update documentation to reflect changes * refactor: split cleanup worker into smaller functions - Move all imports to the top of the file - Extract cleanup logic into separate functions: - cleanup_old_transcripts() - cleanup_old_meetings() - cleanup_orphaned_recordings() - log_cleanup_results() - Make 
code more maintainable and testable - Add days parameter support to Celery task - Update manual tool to work with refactored code * feat: add TypedDict typing for cleanup stats - Add CleanupStats TypedDict for better type safety - Update all function signatures to use proper typing - Add return type annotations to _cleanup_old_public_data - Improves code maintainability and IDE support * feat: add CASCADE DELETE to meeting_consent foreign key - Add ondelete="CASCADE" to meeting_consent.meeting_id foreign key - Generate and apply migration to update existing constraint - Remove manual consent deletion from cleanup code - Add unit test to verify CASCADE DELETE behavior * style: linting * fix: alembic migration branchpoint * fix: correct downgrade constraint name in CASCADE DELETE migration * fix: regenerate CASCADE DELETE migration with proper constraint names - Delete problematic migration and regenerate with correct names - Use explicit constraint name in both upgrade and downgrade - Ensure migration works bidirectionally - All tests passing including CASCADE DELETE test * style: linting * refactor: simplify cleanup to use transcripts as entry point - Remove orphaned_recordings cleanup (not part of this PR scope) - Remove separate old_meetings cleanup - Transcripts are now the main entry point for cleanup - Associated meetings and recordings are deleted with their transcript - Use single database connection for all operations - Update tests to reflect new approach * refactor: cleanup and rename functions for clarity - Rename _cleanup_old_public_data to cleanup_old_public_data (make public) - Rename celery task to cleanup_old_public_data_task for clarity - Update docstrings and improve code organization - Remove unnecessary comments and simplify deletion logic - Update tests to use new function names - All tests passing * style: linting\ * style: typing and review * fix: add transaction on cleanup_single_transcript * fix: naming --------- Co-authored-by: 
pr-agent-monadical[bot] <198624643+pr-agent-monadical[bot]@users.noreply.github.com>
137 lines
3.8 KiB
Python
137 lines
3.8 KiB
Python
from pydantic.types import PositiveInt
|
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
|
|
|
|
class Settings(BaseSettings):
    """Application configuration, declared as typed fields with defaults.

    Values are resolved by pydantic-settings: each field can be overridden by
    an environment variable of the same name or by an entry in the ``.env``
    file configured below; otherwise the default declared here is used.
    Fields typed ``str | None`` default to ``None``, i.e. "not configured".
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        # Unknown keys found in the environment / .env file are ignored
        # instead of raising a validation error.
        extra="ignore",
    )

    # CORS
    UI_BASE_URL: str = "http://localhost:3000"
    CORS_ORIGIN: str = "*"
    CORS_ALLOW_CREDENTIALS: bool = False

    # Database
    DATABASE_URL: str = (
        "postgresql+asyncpg://reflector:reflector@localhost:5432/reflector"
    )

    # local data directory
    DATA_DIR: str = "./data"

    # Audio Chunking
    # backends: silero, frames
    AUDIO_CHUNKER_BACKEND: str = "frames"

    # Audio Transcription
    # backends: whisper, modal
    TRANSCRIPT_BACKEND: str = "whisper"
    TRANSCRIPT_URL: str | None = None
    # NOTE(review): timeout units are not stated here (presumably seconds) —
    # confirm against the code that consumes them.
    TRANSCRIPT_TIMEOUT: int = 90
    TRANSCRIPT_FILE_TIMEOUT: int = 600

    # Audio Transcription: modal backend
    TRANSCRIPT_MODAL_API_KEY: str | None = None

    # Audio transcription storage
    TRANSCRIPT_STORAGE_BACKEND: str | None = None

    # Storage configuration for AWS
    TRANSCRIPT_STORAGE_AWS_BUCKET_NAME: str = "reflector-bucket"
    TRANSCRIPT_STORAGE_AWS_REGION: str = "us-east-1"
    TRANSCRIPT_STORAGE_AWS_ACCESS_KEY_ID: str | None = None
    TRANSCRIPT_STORAGE_AWS_SECRET_ACCESS_KEY: str | None = None

    # Recording storage
    RECORDING_STORAGE_BACKEND: str | None = None

    # Recording storage configuration for AWS
    RECORDING_STORAGE_AWS_BUCKET_NAME: str = "recording-bucket"
    RECORDING_STORAGE_AWS_REGION: str = "us-east-1"
    RECORDING_STORAGE_AWS_ACCESS_KEY_ID: str | None = None
    RECORDING_STORAGE_AWS_SECRET_ACCESS_KEY: str | None = None

    # Translate into the target language
    TRANSLATION_BACKEND: str = "passthrough"
    TRANSLATE_URL: str | None = None
    TRANSLATE_TIMEOUT: int = 90

    # Translation: modal backend
    TRANSLATE_MODAL_API_KEY: str | None = None

    # LLM
    LLM_MODEL: str = "microsoft/phi-4"
    LLM_URL: str | None = None
    LLM_API_KEY: str | None = None
    LLM_CONTEXT_WINDOW: int = 16000

    # Diarization
    DIARIZATION_ENABLED: bool = True
    DIARIZATION_BACKEND: str = "modal"
    DIARIZATION_URL: str | None = None
    DIARIZATION_FILE_TIMEOUT: int = 600

    # Diarization: modal backend
    DIARIZATION_MODAL_API_KEY: str | None = None

    # Diarization: local pyannote.audio
    DIARIZATION_PYANNOTE_AUTH_TOKEN: str | None = None

    # Sentry
    SENTRY_DSN: str | None = None

    # User authentication (none, jwt)
    AUTH_BACKEND: str = "none"

    # User authentication using JWT
    AUTH_JWT_ALGORITHM: str = "RS256"
    AUTH_JWT_PUBLIC_KEY: str | None = "authentik.monadical.com_public.pem"
    AUTH_JWT_AUDIENCE: str | None = None

    # Public instance mode and data retention.
    # PositiveInt makes validation reject zero or negative retention values.
    # NOTE(review): presumably the retention cleanup only runs when
    # PUBLIC_MODE is enabled — confirm against the cleanup worker.
    PUBLIC_MODE: bool = False
    PUBLIC_DATA_RETENTION_DAYS: PositiveInt = 7

    # Min transcript length to generate topic + summary
    MIN_TRANSCRIPT_LENGTH: int = 750

    # Celery
    CELERY_BROKER_URL: str = "redis://localhost:6379/1"
    CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1"

    # Redis
    REDIS_HOST: str = "localhost"
    REDIS_PORT: int = 6379
    REDIS_CACHE_DB: int = 2

    # Secret key
    # NOTE(review): the default is a "changeme-" placeholder; presumably it
    # must be overridden in any real deployment — confirm.
    SECRET_KEY: str = "changeme-f02f86fd8b3e4fd892c6043e5a298e21"

    # Current hosting/domain
    BASE_URL: str = "http://localhost:1250"

    # Profiling
    PROFILING: bool = False

    # Healthcheck
    HEALTHCHECK_URL: str | None = None

    # Whereby integration
    WHEREBY_API_URL: str = "https://api.whereby.dev/v1"
    WHEREBY_API_KEY: str | None = None
    WHEREBY_WEBHOOK_SECRET: str | None = None
    AWS_WHEREBY_ACCESS_KEY_ID: str | None = None
    AWS_WHEREBY_ACCESS_KEY_SECRET: str | None = None
    AWS_PROCESS_RECORDING_QUEUE_URL: str | None = None
    SQS_POLLING_TIMEOUT_SECONDS: int = 60

    # Zulip integration
    ZULIP_REALM: str | None = None
    ZULIP_API_KEY: str | None = None
    ZULIP_BOT_EMAIL: str | None = None
|
|
|
|
# Module-level Settings instance, constructed once when this module is
# imported; field values are resolved (environment / .env / defaults) at that
# point.
settings = Settings()
|