feat: enable daily co in selfhosted + only schedule tasks when necessary (#883)

* feat: enable daily co in selfhosted + only schedule tasks when necessary

* feat: refactor aws storage to be platform agnostic + add local track padding with selfhosted support
Juan Diego García
2026-03-02 11:08:20 -05:00
committed by GitHub
parent f6cc03286b
commit 045eae8ff2
23 changed files with 1442 additions and 165 deletions

View File

@@ -86,6 +86,18 @@ LLM_API_KEY=not-needed
## Context size for summary generation (tokens)
LLM_CONTEXT_WINDOW=16000
## =======================================================
## Audio Padding
##
## backends: local (in-process PyAV), modal (HTTP API client)
## Default is "local" — no external service needed.
## Set to "modal" when using Modal.com or self-hosted gpu/self_hosted/ container.
## =======================================================
#PADDING_BACKEND=local
#PADDING_BACKEND=modal
#PADDING_URL=https://xxxxx--reflector-padding-web.modal.run
#PADDING_MODAL_API_KEY=xxxxx
## =======================================================
## Diarization
##
@@ -137,6 +149,10 @@ TRANSCRIPT_STORAGE_AWS_REGION=us-east-1
#DAILYCO_STORAGE_AWS_ROLE_ARN=... # IAM role ARN for Daily.co S3 access
#DAILYCO_STORAGE_AWS_BUCKET_NAME=reflector-dailyco
#DAILYCO_STORAGE_AWS_REGION=us-west-2
# Worker credentials for reading/deleting from Daily's recording bucket
# Required when transcript storage is separate from Daily's bucket (e.g., selfhosted with Garage)
#DAILYCO_STORAGE_AWS_ACCESS_KEY_ID=your-aws-access-key
#DAILYCO_STORAGE_AWS_SECRET_ACCESS_KEY=your-aws-secret-key
## Whereby (optional separate bucket)
#WHEREBY_STORAGE_AWS_BUCKET_NAME=reflector-whereby

View File

@@ -47,6 +47,9 @@ DIARIZATION_URL=http://transcription:8000
TRANSLATION_BACKEND=modal
TRANSLATE_URL=http://transcription:8000
PADDING_BACKEND=modal
PADDING_URL=http://transcription:8000
# HuggingFace token — optional, for gated models (e.g. pyannote).
# Falls back to public S3 model bundle if not set.
# HF_TOKEN=hf_xxxxx
@@ -93,15 +96,42 @@ TRANSCRIPT_STORAGE_AWS_REGION=us-east-1
# =======================================================
# Daily.co Live Rooms (Optional)
# Enable real-time meeting rooms with Daily.co integration.
# Requires a Daily.co account: https://www.daily.co/
# Configure these BEFORE running setup-selfhosted.sh and the
# script will auto-detect and start Hatchet workflow services.
#
# Prerequisites:
# 1. Daily.co account: https://www.daily.co/
# 2. API key: Dashboard → Developers → API Keys
# 3. S3 bucket for recordings: https://docs.daily.co/guides/products/live-streaming-recording/storing-recordings-in-a-custom-s3-bucket
# 4. IAM role ARN for Daily.co to write recordings to your bucket
#
# After configuring, run: ./scripts/setup-selfhosted.sh <your-flags>
# The script will detect DAILY_API_KEY and automatically:
# - Start Hatchet workflow engine + CPU/LLM workers
# - Generate a Hatchet API token
# - Enable FEATURE_ROOMS in the frontend
# =======================================================
# DEFAULT_VIDEO_PLATFORM=daily
# DAILY_API_KEY=your-daily-api-key
# DAILY_SUBDOMAIN=your-subdomain
# DAILY_WEBHOOK_SECRET=your-daily-webhook-secret
# DEFAULT_VIDEO_PLATFORM=daily
# DAILYCO_STORAGE_AWS_BUCKET_NAME=reflector-dailyco
# DAILYCO_STORAGE_AWS_REGION=us-east-1
# DAILYCO_STORAGE_AWS_ROLE_ARN=arn:aws:iam::role/DailyCoAccess
# Worker credentials for reading/deleting from Daily's recording bucket
# Required when transcript storage is separate from Daily's bucket (e.g., selfhosted with Garage)
# DAILYCO_STORAGE_AWS_ACCESS_KEY_ID=your-aws-access-key
# DAILYCO_STORAGE_AWS_SECRET_ACCESS_KEY=your-aws-secret-key
# DAILY_WEBHOOK_SECRET=your-daily-webhook-secret # optional, for faster recording discovery
# =======================================================
# Hatchet Workflow Engine (Auto-configured for Daily.co)
# Required for Daily.co multitrack recording processing.
# The setup script generates HATCHET_CLIENT_TOKEN automatically.
# Do not set these manually unless you know what you're doing.
# =======================================================
# HATCHET_CLIENT_TOKEN=<auto-generated-by-script>
# HATCHET_CLIENT_SERVER_URL=http://hatchet:8888
# HATCHET_CLIENT_HOST_PORT=hatchet:7077
# =======================================================
# Feature Flags

View File

@@ -90,7 +90,6 @@ from reflector.processors.summary.summary_builder import SummaryBuilder
from reflector.processors.types import TitleSummary, Word
from reflector.processors.types import Transcript as TranscriptType
from reflector.settings import settings
from reflector.storage.storage_aws import AwsStorage
from reflector.utils.audio_constants import (
PRESIGNED_URL_EXPIRATION_SECONDS,
WAVEFORM_SEGMENTS,
@@ -117,6 +116,7 @@ class PipelineInput(BaseModel):
bucket_name: NonEmptyString
transcript_id: NonEmptyString
room_id: NonEmptyString | None = None
source_platform: str = "daily"
hatchet = HatchetClientManager.get_client()
@@ -170,15 +170,10 @@ async def set_workflow_error_status(transcript_id: NonEmptyString) -> bool:
def _spawn_storage():
"""Create fresh storage instance."""
# TODO: replace direct AwsStorage construction with get_transcripts_storage() factory
return AwsStorage(
aws_bucket_name=settings.TRANSCRIPT_STORAGE_AWS_BUCKET_NAME,
aws_region=settings.TRANSCRIPT_STORAGE_AWS_REGION,
aws_access_key_id=settings.TRANSCRIPT_STORAGE_AWS_ACCESS_KEY_ID,
aws_secret_access_key=settings.TRANSCRIPT_STORAGE_AWS_SECRET_ACCESS_KEY,
aws_endpoint_url=settings.TRANSCRIPT_STORAGE_AWS_ENDPOINT_URL,
)
"""Create fresh storage instance for writing to our transcript bucket."""
from reflector.storage import get_transcripts_storage # noqa: PLC0415
return get_transcripts_storage()
class Loggable(Protocol):
@@ -434,6 +429,7 @@ async def process_tracks(input: PipelineInput, ctx: Context) -> ProcessTracksRes
bucket_name=input.bucket_name,
transcript_id=input.transcript_id,
language=source_language,
source_platform=input.source_platform,
)
)
for i, track in enumerate(input.tracks)
@@ -1195,7 +1191,10 @@ async def cleanup_consent(input: PipelineInput, ctx: Context) -> ConsentResult:
)
from reflector.db.recordings import recordings_controller # noqa: PLC0415
from reflector.db.transcripts import transcripts_controller # noqa: PLC0415
from reflector.storage import get_transcripts_storage # noqa: PLC0415
from reflector.storage import ( # noqa: PLC0415
get_source_storage,
get_transcripts_storage,
)
transcript = await transcripts_controller.get_by_id(input.transcript_id)
if not transcript:
@@ -1245,7 +1244,7 @@ async def cleanup_consent(input: PipelineInput, ctx: Context) -> ConsentResult:
deletion_errors = []
if input_track_keys and input.bucket_name:
master_storage = get_transcripts_storage()
master_storage = get_source_storage(input.source_platform)
for key in input_track_keys:
try:
await master_storage.delete_file(key, bucket=input.bucket_name)
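
For context, the cleanup step now deletes the raw per-track objects through storage bound to the recording platform's bucket instead of assuming the transcript bucket. A minimal sketch of that pattern, using only names visible in the diff above (the helper function itself is hypothetical, error handling condensed):

from reflector.storage import get_source_storage

async def delete_source_tracks(track_keys, bucket_name, source_platform="daily"):
    # get_source_storage falls back to transcript storage when no
    # platform-specific credentials are configured (single-bucket setups)
    storage = get_source_storage(source_platform)
    deletion_errors = []
    for key in track_keys:
        try:
            await storage.delete_file(key, bucket=bucket_name)
        except Exception as exc:
            # collect failures instead of aborting on the first one
            deletion_errors.append(f"{key}: {exc}")
    return deletion_errors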

View File

@@ -24,6 +24,7 @@ class PaddingInput(BaseModel):
s3_key: str
bucket_name: str
transcript_id: str
source_platform: str = "daily"
hatchet = HatchetClientManager.get_client()
@@ -45,20 +46,14 @@ async def pad_track(input: PaddingInput, ctx: Context) -> PadTrackResult:
)
try:
# Create fresh storage instance to avoid aioboto3 fork issues
from reflector.settings import settings # noqa: PLC0415
from reflector.storage.storage_aws import AwsStorage # noqa: PLC0415
# TODO: replace direct AwsStorage construction with get_transcripts_storage() factory
storage = AwsStorage(
aws_bucket_name=settings.TRANSCRIPT_STORAGE_AWS_BUCKET_NAME,
aws_region=settings.TRANSCRIPT_STORAGE_AWS_REGION,
aws_access_key_id=settings.TRANSCRIPT_STORAGE_AWS_ACCESS_KEY_ID,
aws_secret_access_key=settings.TRANSCRIPT_STORAGE_AWS_SECRET_ACCESS_KEY,
aws_endpoint_url=settings.TRANSCRIPT_STORAGE_AWS_ENDPOINT_URL,
from reflector.storage import ( # noqa: PLC0415
get_source_storage,
get_transcripts_storage,
)
source_url = await storage.get_file_url(
# Source reads: use platform-specific credentials
source_storage = get_source_storage(input.source_platform)
source_url = await source_storage.get_file_url(
input.s3_key,
operation="get_object",
expires_in=PRESIGNED_URL_EXPIRATION_SECONDS,
@@ -96,52 +91,28 @@ async def pad_track(input: PaddingInput, ctx: Context) -> PadTrackResult:
storage_path = f"file_pipeline_hatchet/{input.transcript_id}/tracks/padded_{input.track_index}.webm"
# Presign PUT URL for output (Modal will upload directly)
output_url = await storage.get_file_url(
# Output writes: use transcript storage (our own bucket)
output_storage = get_transcripts_storage()
output_url = await output_storage.get_file_url(
storage_path,
operation="put_object",
expires_in=PRESIGNED_URL_EXPIRATION_SECONDS,
)
import httpx # noqa: PLC0415
from reflector.processors.audio_padding_modal import ( # noqa: PLC0415
AudioPaddingModalProcessor,
from reflector.processors.audio_padding_auto import ( # noqa: PLC0415
AudioPaddingAutoProcessor,
)
try:
processor = AudioPaddingModalProcessor()
result = await processor.pad_track(
track_url=source_url,
output_url=output_url,
start_time_seconds=start_time_seconds,
track_index=input.track_index,
)
file_size = result.size
processor = AudioPaddingAutoProcessor()
result = await processor.pad_track(
track_url=source_url,
output_url=output_url,
start_time_seconds=start_time_seconds,
track_index=input.track_index,
)
file_size = result.size
ctx.log(f"pad_track: Modal returned size={file_size}")
except httpx.HTTPStatusError as e:
error_detail = e.response.text if hasattr(e.response, "text") else str(e)
logger.error(
"[Hatchet] Modal padding HTTP error",
transcript_id=input.transcript_id,
track_index=input.track_index,
status_code=e.response.status_code if hasattr(e, "response") else None,
error=error_detail,
exc_info=True,
)
raise Exception(
f"Modal padding failed: HTTP {e.response.status_code}"
) from e
except httpx.TimeoutException as e:
logger.error(
"[Hatchet] Modal padding timeout",
transcript_id=input.transcript_id,
track_index=input.track_index,
error=str(e),
exc_info=True,
)
raise Exception("Modal padding timeout") from e
ctx.log(f"pad_track: padding returned size={file_size}")
logger.info(
"[Hatchet] pad_track complete",

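To summarize the read/write split introduced above: the worker presigns a GET URL against the platform's source bucket and a PUT URL against our own transcript bucket, then hands both to whichever padding backend is configured. A hedged sketch of that flow (the wrapper function and the expiration value are illustrative; the storage and processor calls come from the diff):

from reflector.processors.audio_padding_auto import AudioPaddingAutoProcessor
from reflector.storage import get_source_storage, get_transcripts_storage

PRESIGNED_URL_EXPIRATION_SECONDS = 3600  # assumed value for illustration

async def pad_one_track(s3_key, bucket_name, storage_path, start_time_seconds, track_index, source_platform="daily"):
    # Source reads: presign against the platform's recording bucket
    source_url = await get_source_storage(source_platform).get_file_url(
        s3_key,
        operation="get_object",
        expires_in=PRESIGNED_URL_EXPIRATION_SECONDS,
        bucket=bucket_name,
    )
    # Output writes: presign against our transcript bucket
    output_url = await get_transcripts_storage().get_file_url(
        storage_path,
        operation="put_object",
        expires_in=PRESIGNED_URL_EXPIRATION_SECONDS,
    )
    # Backend chosen by PADDING_BACKEND ("local" in-process PyAV, or "modal" HTTP)
    processor = AudioPaddingAutoProcessor()
    return await processor.pad_track(
        track_url=source_url,
        output_url=output_url,
        start_time_seconds=start_time_seconds,
        track_index=track_index,
    )
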
View File

@@ -36,6 +36,7 @@ class TrackInput(BaseModel):
bucket_name: str
transcript_id: str
language: str = "en"
source_platform: str = "daily"
hatchet = HatchetClientManager.get_client()
@@ -59,20 +60,14 @@ async def pad_track(input: TrackInput, ctx: Context) -> PadTrackResult:
)
try:
# Create fresh storage instance to avoid aioboto3 fork issues
# TODO: replace direct AwsStorage construction with get_transcripts_storage() factory
from reflector.settings import settings # noqa: PLC0415
from reflector.storage.storage_aws import AwsStorage # noqa: PLC0415
storage = AwsStorage(
aws_bucket_name=settings.TRANSCRIPT_STORAGE_AWS_BUCKET_NAME,
aws_region=settings.TRANSCRIPT_STORAGE_AWS_REGION,
aws_access_key_id=settings.TRANSCRIPT_STORAGE_AWS_ACCESS_KEY_ID,
aws_secret_access_key=settings.TRANSCRIPT_STORAGE_AWS_SECRET_ACCESS_KEY,
aws_endpoint_url=settings.TRANSCRIPT_STORAGE_AWS_ENDPOINT_URL,
from reflector.storage import ( # noqa: PLC0415
get_source_storage,
get_transcripts_storage,
)
source_url = await storage.get_file_url(
# Source reads: use platform-specific credentials
source_storage = get_source_storage(input.source_platform)
source_url = await source_storage.get_file_url(
input.s3_key,
operation="get_object",
expires_in=PRESIGNED_URL_EXPIRATION_SECONDS,
@@ -99,18 +94,19 @@ async def pad_track(input: TrackInput, ctx: Context) -> PadTrackResult:
storage_path = f"file_pipeline_hatchet/{input.transcript_id}/tracks/padded_{input.track_index}.webm"
# Presign PUT URL for output (Modal uploads directly)
output_url = await storage.get_file_url(
# Output writes: use transcript storage (our own bucket)
output_storage = get_transcripts_storage()
output_url = await output_storage.get_file_url(
storage_path,
operation="put_object",
expires_in=PRESIGNED_URL_EXPIRATION_SECONDS,
)
from reflector.processors.audio_padding_modal import ( # noqa: PLC0415
AudioPaddingModalProcessor,
from reflector.processors.audio_padding_auto import ( # noqa: PLC0415
AudioPaddingAutoProcessor,
)
processor = AudioPaddingModalProcessor()
processor = AudioPaddingAutoProcessor()
result = await processor.pad_track(
track_url=source_url,
output_url=output_url,
@@ -161,18 +157,18 @@ async def transcribe_track(input: TrackInput, ctx: Context) -> TranscribeTrackRe
raise ValueError("Missing padded_key from pad_track")
# Presign URL on demand (avoids stale URLs on workflow replay)
# TODO: replace direct AwsStorage construction with get_transcripts_storage() factory
from reflector.settings import settings # noqa: PLC0415
from reflector.storage.storage_aws import AwsStorage # noqa: PLC0415
storage = AwsStorage(
aws_bucket_name=settings.TRANSCRIPT_STORAGE_AWS_BUCKET_NAME,
aws_region=settings.TRANSCRIPT_STORAGE_AWS_REGION,
aws_access_key_id=settings.TRANSCRIPT_STORAGE_AWS_ACCESS_KEY_ID,
aws_secret_access_key=settings.TRANSCRIPT_STORAGE_AWS_SECRET_ACCESS_KEY,
aws_endpoint_url=settings.TRANSCRIPT_STORAGE_AWS_ENDPOINT_URL,
from reflector.storage import ( # noqa: PLC0415
get_source_storage,
get_transcripts_storage,
)
# If bucket_name is set, file is still in the platform's source bucket (no padding applied).
# If bucket_name is None, padded file was written to our transcript storage.
if bucket_name:
storage = get_source_storage(input.source_platform)
else:
storage = get_transcripts_storage()
audio_url = await storage.get_file_url(
padded_key,
operation="get_object",

View File

@@ -0,0 +1,31 @@
import importlib

from reflector.settings import settings


class AudioPaddingAutoProcessor:
    _registry = {}

    @classmethod
    def register(cls, name, kclass):
        cls._registry[name] = kclass

    def __new__(cls, name: str | None = None, **kwargs):
        if name is None:
            name = settings.PADDING_BACKEND
        if name not in cls._registry:
            module_name = f"reflector.processors.audio_padding_{name}"
            importlib.import_module(module_name)

        # gather specific configuration for the processor
        # search `PADDING_XXX_YYY`, push to constructor as `xxx_yyy`
        config = {}
        name_upper = name.upper()
        settings_prefix = "PADDING_"
        config_prefix = f"{settings_prefix}{name_upper}_"
        for key, value in settings:
            if key.startswith(config_prefix):
                config_name = key[len(settings_prefix) :].lower()
                config[config_name] = value

        return cls._registry[name](**config | kwargs)
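
The registry above keeps the padding backend pluggable: each backend module registers itself on import, and instantiating the auto class picks a backend from settings and forwards any matching PADDING_<BACKEND>_* settings to its constructor (PADDING_ prefix stripped, lowercased). A small usage sketch under those assumptions:

from reflector.processors.audio_padding_auto import AudioPaddingAutoProcessor

processor = AudioPaddingAutoProcessor()     # backend taken from settings.PADDING_BACKEND
modal = AudioPaddingAutoProcessor("modal")  # explicit override, lazily imports audio_padding_modal
# e.g. with PADDING_MODAL_API_KEY set, the loop above forwards it to the
# backend constructor as the keyword argument `modal_api_key`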

View File

@@ -0,0 +1,133 @@
"""
Local audio padding processor using PyAV.

Pads audio tracks with silence directly in-process (no HTTP).
Reuses the shared PyAV utilities from reflector.utils.audio_padding.
"""

import asyncio
import os
import tempfile

import av

from reflector.logger import logger
from reflector.processors.audio_padding_auto import AudioPaddingAutoProcessor
from reflector.processors.audio_padding_modal import PaddingResponse
from reflector.utils.audio_padding import apply_audio_padding_to_file

S3_TIMEOUT = 60


class AudioPaddingLocalProcessor:
    """Audio padding processor using local PyAV (no HTTP backend)."""

    async def pad_track(
        self,
        track_url: str,
        output_url: str,
        start_time_seconds: float,
        track_index: int,
    ) -> PaddingResponse:
        """Pad audio track with silence locally via PyAV.

        Args:
            track_url: Presigned GET URL for source audio track
            output_url: Presigned PUT URL for output WebM
            start_time_seconds: Amount of silence to prepend
            track_index: Track index for logging
        """
        if not track_url:
            raise ValueError("track_url cannot be empty")
        if start_time_seconds <= 0:
            raise ValueError(
                f"start_time_seconds must be positive, got {start_time_seconds}"
            )

        log = logger.bind(track_index=track_index, padding_seconds=start_time_seconds)
        log.info("Starting local PyAV padding")

        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(
            None,
            self._pad_track_blocking,
            track_url,
            output_url,
            start_time_seconds,
            track_index,
        )

    def _pad_track_blocking(
        self,
        track_url: str,
        output_url: str,
        start_time_seconds: float,
        track_index: int,
    ) -> PaddingResponse:
        """Blocking padding work: download, pad with PyAV, upload."""
        import requests

        log = logger.bind(track_index=track_index, padding_seconds=start_time_seconds)
        temp_dir = tempfile.mkdtemp()
        input_path = None
        output_path = None
        try:
            # Download source audio
            log.info("Downloading track for local padding")
            response = requests.get(track_url, stream=True, timeout=S3_TIMEOUT)
            response.raise_for_status()

            input_path = os.path.join(temp_dir, "track.webm")
            total_bytes = 0
            with open(input_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        total_bytes += len(chunk)
            log.info("Track downloaded", bytes=total_bytes)

            # Apply padding using shared PyAV utility
            output_path = os.path.join(temp_dir, "padded.webm")
            with av.open(input_path) as in_container:
                apply_audio_padding_to_file(
                    in_container,
                    output_path,
                    start_time_seconds,
                    track_index,
                    logger=logger,
                )

            file_size = os.path.getsize(output_path)
            log.info("Local padding complete", size=file_size)

            # Upload padded track
            log.info("Uploading padded track to S3")
            with open(output_path, "rb") as f:
                upload_response = requests.put(output_url, data=f, timeout=S3_TIMEOUT)
                upload_response.raise_for_status()
            log.info("Upload complete", size=file_size)

            return PaddingResponse(size=file_size)
        except Exception as e:
            log.error("Local padding failed", error=str(e), exc_info=True)
            raise
        finally:
            if input_path and os.path.exists(input_path):
                try:
                    os.unlink(input_path)
                except Exception as e:
                    log.warning("Failed to cleanup input file", error=str(e))
            if output_path and os.path.exists(output_path):
                try:
                    os.unlink(output_path)
                except Exception as e:
                    log.warning("Failed to cleanup output file", error=str(e))
            try:
                os.rmdir(temp_dir)
            except Exception as e:
                log.warning("Failed to cleanup temp directory", error=str(e))


AudioPaddingAutoProcessor.register("local", AudioPaddingLocalProcessor)
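
A design note plus a hypothetical standalone call: the download, PyAV remux, and upload are all blocking, so pad_track pushes them onto the default executor to keep the worker's event loop free. The URLs below are placeholders for the presigned GET/PUT URLs the workflow normally supplies:

import asyncio

from reflector.processors.audio_padding_local import AudioPaddingLocalProcessor

async def main():
    result = await AudioPaddingLocalProcessor().pad_track(
        track_url="https://example.invalid/presigned-get/track.webm",      # placeholder
        output_url="https://example.invalid/presigned-put/padded_0.webm",  # placeholder
        start_time_seconds=2.5,
        track_index=0,
    )
    print(result.size)  # bytes of the padded WebM that was uploaded

asyncio.run(main())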

View File

@@ -10,6 +10,7 @@ from pydantic import BaseModel
from reflector.hatchet.constants import TIMEOUT_AUDIO
from reflector.logger import logger
from reflector.processors.audio_padding_auto import AudioPaddingAutoProcessor
class PaddingResponse(BaseModel):
@@ -111,3 +112,6 @@ class AudioPaddingModalProcessor:
except Exception as e:
log.error("Modal padding unexpected error", error=str(e), exc_info=True)
raise
AudioPaddingAutoProcessor.register("modal", AudioPaddingModalProcessor)

View File

@@ -40,6 +40,7 @@ class MultitrackProcessingConfig:
track_keys: list[str]
recording_id: NonEmptyString | None = None
room_id: NonEmptyString | None = None
source_platform: str = "daily"
mode: Literal["multitrack"] = "multitrack"
@@ -256,6 +257,7 @@ async def dispatch_transcript_processing(
"bucket_name": config.bucket_name,
"transcript_id": config.transcript_id,
"room_id": config.room_id,
"source_platform": config.source_platform,
},
additional_metadata={
"transcript_id": config.transcript_id,

View File

@@ -73,6 +73,9 @@ class Settings(BaseSettings):
DAILYCO_STORAGE_AWS_BUCKET_NAME: str | None = None
DAILYCO_STORAGE_AWS_REGION: str | None = None
DAILYCO_STORAGE_AWS_ROLE_ARN: str | None = None
# Worker credentials for reading/deleting from Daily's recording bucket
DAILYCO_STORAGE_AWS_ACCESS_KEY_ID: str | None = None
DAILYCO_STORAGE_AWS_SECRET_ACCESS_KEY: str | None = None
# Translate into the target language
TRANSLATION_BACKEND: str = "passthrough"
@@ -106,7 +109,11 @@ class Settings(BaseSettings):
# Diarization: modal backend
DIARIZATION_MODAL_API_KEY: str | None = None
# Audio Padding (Modal.com backend)
# Audio Padding
# backends:
# - local: in-process PyAV padding (no HTTP, runs in same process)
# - modal: HTTP API client (works with Modal.com OR self-hosted gpu/self_hosted/)
PADDING_BACKEND: str = "local"
PADDING_URL: str | None = None
PADDING_MODAL_API_KEY: str | None = None

View File

@@ -17,6 +17,49 @@ def get_transcripts_storage() -> Storage:
)

def get_source_storage(platform: str) -> Storage:
    """Get storage for reading/deleting source recording files from the platform's bucket.

    Returns an AwsStorage configured with the platform's worker credentials
    (access keys), or falls back to get_transcripts_storage() when platform-specific
    credentials aren't configured (e.g., single-bucket setups).

    Args:
        platform: Recording platform name ("daily", "whereby", or other).
    """
    if platform == "daily":
        if (
            settings.DAILYCO_STORAGE_AWS_ACCESS_KEY_ID
            and settings.DAILYCO_STORAGE_AWS_SECRET_ACCESS_KEY
            and settings.DAILYCO_STORAGE_AWS_BUCKET_NAME
        ):
            from reflector.storage.storage_aws import AwsStorage

            return AwsStorage(
                aws_bucket_name=settings.DAILYCO_STORAGE_AWS_BUCKET_NAME,
                aws_region=settings.DAILYCO_STORAGE_AWS_REGION or "us-east-1",
                aws_access_key_id=settings.DAILYCO_STORAGE_AWS_ACCESS_KEY_ID,
                aws_secret_access_key=settings.DAILYCO_STORAGE_AWS_SECRET_ACCESS_KEY,
            )
    elif platform == "whereby":
        if (
            settings.WHEREBY_STORAGE_AWS_ACCESS_KEY_ID
            and settings.WHEREBY_STORAGE_AWS_SECRET_ACCESS_KEY
            and settings.WHEREBY_STORAGE_AWS_BUCKET_NAME
        ):
            from reflector.storage.storage_aws import AwsStorage

            return AwsStorage(
                aws_bucket_name=settings.WHEREBY_STORAGE_AWS_BUCKET_NAME,
                aws_region=settings.WHEREBY_STORAGE_AWS_REGION or "us-east-1",
                aws_access_key_id=settings.WHEREBY_STORAGE_AWS_ACCESS_KEY_ID,
                aws_secret_access_key=settings.WHEREBY_STORAGE_AWS_SECRET_ACCESS_KEY,
            )
    return get_transcripts_storage()
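
A hedged illustration of how the factory resolves credentials (setting names from the diff, values assumed to be configured):

# With DAILYCO_STORAGE_AWS_ACCESS_KEY_ID / _SECRET_ACCESS_KEY / _BUCKET_NAME set,
# this returns an AwsStorage bound to the Daily.co recording bucket:
storage = get_source_storage("daily")

# Without those settings (e.g. selfhosted single-bucket setups with Garage),
# or for any unrecognized platform string, it falls back to transcript storage:
storage = get_source_storage("unknown-platform")  # -> get_transcripts_storage()
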
def get_whereby_storage() -> Storage:
"""
Get storage config for Whereby (for passing to Whereby API).

View File

@@ -24,6 +24,118 @@ RECONCILIATION_INTERVAL = _override or 30.0
ICS_SYNC_INTERVAL = _override or 60.0
UPCOMING_MEETINGS_INTERVAL = _override or 30.0

def build_beat_schedule(
    *,
    whereby_api_key=None,
    aws_process_recording_queue_url=None,
    daily_api_key=None,
    public_mode=False,
    public_data_retention_days=None,
    healthcheck_url=None,
):
    """Build the Celery beat schedule based on configured services.

    Only registers tasks for services that are actually configured,
    avoiding unnecessary worker wake-ups in selfhosted deployments.
    """
    beat_schedule = {}

    _whereby_enabled = bool(whereby_api_key) or bool(aws_process_recording_queue_url)
    if _whereby_enabled:
        beat_schedule["process_messages"] = {
            "task": "reflector.worker.process.process_messages",
            "schedule": SQS_POLL_INTERVAL,
        }
        beat_schedule["reprocess_failed_recordings"] = {
            "task": "reflector.worker.process.reprocess_failed_recordings",
            "schedule": crontab(hour=5, minute=0),  # Midnight EST
        }
        logger.info(
            "Whereby beat tasks enabled",
            tasks=["process_messages", "reprocess_failed_recordings"],
        )
    else:
        logger.info("Whereby beat tasks disabled (no WHEREBY_API_KEY or SQS URL)")

    _daily_enabled = bool(daily_api_key)
    if _daily_enabled:
        beat_schedule["poll_daily_recordings"] = {
            "task": "reflector.worker.process.poll_daily_recordings",
            "schedule": POLL_DAILY_RECORDINGS_INTERVAL_SEC,
        }
        beat_schedule["trigger_daily_reconciliation"] = {
            "task": "reflector.worker.process.trigger_daily_reconciliation",
            "schedule": RECONCILIATION_INTERVAL,
        }
        beat_schedule["reprocess_failed_daily_recordings"] = {
            "task": "reflector.worker.process.reprocess_failed_daily_recordings",
            "schedule": crontab(hour=5, minute=0),  # Midnight EST
        }
        logger.info(
            "Daily.co beat tasks enabled",
            tasks=[
                "poll_daily_recordings",
                "trigger_daily_reconciliation",
                "reprocess_failed_daily_recordings",
            ],
        )
    else:
        logger.info("Daily.co beat tasks disabled (no DAILY_API_KEY)")

    _any_platform = _whereby_enabled or _daily_enabled
    if _any_platform:
        beat_schedule["process_meetings"] = {
            "task": "reflector.worker.process.process_meetings",
            "schedule": SQS_POLL_INTERVAL,
        }
        beat_schedule["sync_all_ics_calendars"] = {
            "task": "reflector.worker.ics_sync.sync_all_ics_calendars",
            "schedule": ICS_SYNC_INTERVAL,
        }
        beat_schedule["create_upcoming_meetings"] = {
            "task": "reflector.worker.ics_sync.create_upcoming_meetings",
            "schedule": UPCOMING_MEETINGS_INTERVAL,
        }
        logger.info(
            "Platform tasks enabled",
            tasks=[
                "process_meetings",
                "sync_all_ics_calendars",
                "create_upcoming_meetings",
            ],
        )
    else:
        logger.info("Platform tasks disabled (no video platform configured)")

    if public_mode:
        beat_schedule["cleanup_old_public_data"] = {
            "task": "reflector.worker.cleanup.cleanup_old_public_data_task",
            "schedule": crontab(hour=3, minute=0),
        }
        logger.info(
            "Public mode cleanup enabled",
            retention_days=public_data_retention_days,
        )

    if healthcheck_url:
        beat_schedule["healthcheck_ping"] = {
            "task": "reflector.worker.healthcheck.healthcheck_ping",
            "schedule": 60.0 * 10,
        }
        logger.info("Healthcheck enabled", url=healthcheck_url)
    else:
        logger.warning("Healthcheck disabled, no url configured")

    logger.info(
        "Beat schedule configured",
        total_tasks=len(beat_schedule),
        task_names=sorted(beat_schedule.keys()),
    )
    return beat_schedule

if celery.current_app.main != "default":
logger.info(f"Celery already configured ({celery.current_app})")
app = celery.current_app
@@ -42,57 +154,11 @@ else:
]
)
# crontab
app.conf.beat_schedule = {
"process_messages": {
"task": "reflector.worker.process.process_messages",
"schedule": SQS_POLL_INTERVAL,
},
"process_meetings": {
"task": "reflector.worker.process.process_meetings",
"schedule": SQS_POLL_INTERVAL,
},
"reprocess_failed_recordings": {
"task": "reflector.worker.process.reprocess_failed_recordings",
"schedule": crontab(hour=5, minute=0), # Midnight EST
},
"reprocess_failed_daily_recordings": {
"task": "reflector.worker.process.reprocess_failed_daily_recordings",
"schedule": crontab(hour=5, minute=0), # Midnight EST
},
"poll_daily_recordings": {
"task": "reflector.worker.process.poll_daily_recordings",
"schedule": POLL_DAILY_RECORDINGS_INTERVAL_SEC,
},
"trigger_daily_reconciliation": {
"task": "reflector.worker.process.trigger_daily_reconciliation",
"schedule": RECONCILIATION_INTERVAL,
},
"sync_all_ics_calendars": {
"task": "reflector.worker.ics_sync.sync_all_ics_calendars",
"schedule": ICS_SYNC_INTERVAL,
},
"create_upcoming_meetings": {
"task": "reflector.worker.ics_sync.create_upcoming_meetings",
"schedule": UPCOMING_MEETINGS_INTERVAL,
},
}
if settings.PUBLIC_MODE:
app.conf.beat_schedule["cleanup_old_public_data"] = {
"task": "reflector.worker.cleanup.cleanup_old_public_data_task",
"schedule": crontab(hour=3, minute=0),
}
logger.info(
"Public mode cleanup enabled",
retention_days=settings.PUBLIC_DATA_RETENTION_DAYS,
)
if settings.HEALTHCHECK_URL:
app.conf.beat_schedule["healthcheck_ping"] = {
"task": "reflector.worker.healthcheck.healthcheck_ping",
"schedule": 60.0 * 10,
}
logger.info("Healthcheck enabled", url=settings.HEALTHCHECK_URL)
else:
logger.warning("Healthcheck disabled, no url configured")
app.conf.beat_schedule = build_beat_schedule(
whereby_api_key=settings.WHEREBY_API_KEY,
aws_process_recording_queue_url=settings.AWS_PROCESS_RECORDING_QUEUE_URL,
daily_api_key=settings.DAILY_API_KEY,
public_mode=settings.PUBLIC_MODE,
public_data_retention_days=settings.PUBLIC_DATA_RETENTION_DAYS,
healthcheck_url=settings.HEALTHCHECK_URL,
)
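
As a quick illustration of the conditional registration (mirroring the new tests added below, dummy values):

from reflector.worker.app import build_beat_schedule

schedule = build_beat_schedule(daily_api_key="dummy")
assert "poll_daily_recordings" in schedule    # Daily.co polling is scheduled
assert "process_meetings" in schedule         # shared platform tasks come along
assert "process_messages" not in schedule     # Whereby/SQS polling is skipped
assert "healthcheck_ping" not in schedule     # no HEALTHCHECK_URL configured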

View File

@@ -357,6 +357,7 @@ async def _process_multitrack_recording_inner(
"bucket_name": bucket_name,
"transcript_id": transcript.id,
"room_id": room.id,
"source_platform": "daily",
},
additional_metadata={
"transcript_id": transcript.id,
@@ -1068,6 +1069,7 @@ async def reprocess_failed_daily_recordings():
"bucket_name": bucket_name,
"transcript_id": transcript.id,
"room_id": room.id if room else None,
"source_platform": "daily",
},
additional_metadata={
"transcript_id": transcript.id,

View File

@@ -0,0 +1,247 @@
"""Tests for conditional Celery beat schedule registration.
Verifies that beat tasks are only registered when their corresponding
services are configured (WHEREBY_API_KEY, DAILY_API_KEY, etc.).
"""
import pytest
from reflector.worker.app import build_beat_schedule
# Override autouse fixtures from conftest — these tests don't need database or websockets
@pytest.fixture(autouse=True)
def setup_database():
yield
@pytest.fixture(autouse=True)
def ws_manager_in_memory():
yield
@pytest.fixture(autouse=True)
def reset_hatchet_client():
yield
# Task name sets for each group
WHEREBY_TASKS = {"process_messages", "reprocess_failed_recordings"}
DAILY_TASKS = {
"poll_daily_recordings",
"trigger_daily_reconciliation",
"reprocess_failed_daily_recordings",
}
PLATFORM_TASKS = {
"process_meetings",
"sync_all_ics_calendars",
"create_upcoming_meetings",
}
class TestNoPlatformConfigured:
"""When no video platform is configured, no platform tasks should be registered."""
def test_no_platform_tasks(self):
schedule = build_beat_schedule()
task_names = set(schedule.keys())
assert not task_names & WHEREBY_TASKS
assert not task_names & DAILY_TASKS
assert not task_names & PLATFORM_TASKS
def test_only_healthcheck_disabled_warning(self):
"""With no config at all, schedule should be empty (healthcheck needs URL)."""
schedule = build_beat_schedule()
assert len(schedule) == 0
def test_healthcheck_only(self):
schedule = build_beat_schedule(healthcheck_url="https://hc.example.com/ping")
assert set(schedule.keys()) == {"healthcheck_ping"}
def test_public_mode_only(self):
schedule = build_beat_schedule(public_mode=True)
assert set(schedule.keys()) == {"cleanup_old_public_data"}
class TestWherebyOnly:
"""When only Whereby is configured."""
def test_whereby_api_key(self):
schedule = build_beat_schedule(whereby_api_key="test-key")
task_names = set(schedule.keys())
assert WHEREBY_TASKS <= task_names
assert PLATFORM_TASKS <= task_names
assert not task_names & DAILY_TASKS
def test_whereby_sqs_url(self):
schedule = build_beat_schedule(
aws_process_recording_queue_url="https://sqs.us-east-1.amazonaws.com/123/queue"
)
task_names = set(schedule.keys())
assert WHEREBY_TASKS <= task_names
assert PLATFORM_TASKS <= task_names
assert not task_names & DAILY_TASKS
def test_whereby_task_count(self):
schedule = build_beat_schedule(whereby_api_key="test-key")
# Whereby (2) + Platform (3) = 5
assert len(schedule) == 5
class TestDailyOnly:
"""When only Daily.co is configured."""
def test_daily_api_key(self):
schedule = build_beat_schedule(daily_api_key="test-daily-key")
task_names = set(schedule.keys())
assert DAILY_TASKS <= task_names
assert PLATFORM_TASKS <= task_names
assert not task_names & WHEREBY_TASKS
def test_daily_task_count(self):
schedule = build_beat_schedule(daily_api_key="test-daily-key")
# Daily (3) + Platform (3) = 6
assert len(schedule) == 6
class TestBothPlatforms:
"""When both Whereby and Daily.co are configured."""
def test_all_tasks_registered(self):
schedule = build_beat_schedule(
whereby_api_key="test-key",
daily_api_key="test-daily-key",
)
task_names = set(schedule.keys())
assert WHEREBY_TASKS <= task_names
assert DAILY_TASKS <= task_names
assert PLATFORM_TASKS <= task_names
def test_combined_task_count(self):
schedule = build_beat_schedule(
whereby_api_key="test-key",
daily_api_key="test-daily-key",
)
# Whereby (2) + Daily (3) + Platform (3) = 8
assert len(schedule) == 8
class TestConditionalFlags:
"""Test PUBLIC_MODE and HEALTHCHECK_URL interact correctly with platform tasks."""
def test_all_flags_enabled(self):
schedule = build_beat_schedule(
whereby_api_key="test-key",
daily_api_key="test-daily-key",
public_mode=True,
healthcheck_url="https://hc.example.com/ping",
)
task_names = set(schedule.keys())
assert "cleanup_old_public_data" in task_names
assert "healthcheck_ping" in task_names
assert WHEREBY_TASKS <= task_names
assert DAILY_TASKS <= task_names
assert PLATFORM_TASKS <= task_names
# Whereby (2) + Daily (3) + Platform (3) + cleanup (1) + healthcheck (1) = 10
assert len(schedule) == 10
def test_public_mode_with_whereby(self):
schedule = build_beat_schedule(
whereby_api_key="test-key",
public_mode=True,
)
task_names = set(schedule.keys())
assert "cleanup_old_public_data" in task_names
assert WHEREBY_TASKS <= task_names
assert PLATFORM_TASKS <= task_names
def test_healthcheck_with_daily(self):
schedule = build_beat_schedule(
daily_api_key="test-daily-key",
healthcheck_url="https://hc.example.com/ping",
)
task_names = set(schedule.keys())
assert "healthcheck_ping" in task_names
assert DAILY_TASKS <= task_names
assert PLATFORM_TASKS <= task_names
class TestTaskDefinitions:
"""Verify task definitions have correct structure."""
def test_whereby_task_paths(self):
schedule = build_beat_schedule(whereby_api_key="test-key")
assert (
schedule["process_messages"]["task"]
== "reflector.worker.process.process_messages"
)
assert (
schedule["reprocess_failed_recordings"]["task"]
== "reflector.worker.process.reprocess_failed_recordings"
)
def test_daily_task_paths(self):
schedule = build_beat_schedule(daily_api_key="test-daily-key")
assert (
schedule["poll_daily_recordings"]["task"]
== "reflector.worker.process.poll_daily_recordings"
)
assert (
schedule["trigger_daily_reconciliation"]["task"]
== "reflector.worker.process.trigger_daily_reconciliation"
)
assert (
schedule["reprocess_failed_daily_recordings"]["task"]
== "reflector.worker.process.reprocess_failed_daily_recordings"
)
def test_platform_task_paths(self):
schedule = build_beat_schedule(daily_api_key="test-daily-key")
assert (
schedule["process_meetings"]["task"]
== "reflector.worker.process.process_meetings"
)
assert (
schedule["sync_all_ics_calendars"]["task"]
== "reflector.worker.ics_sync.sync_all_ics_calendars"
)
assert (
schedule["create_upcoming_meetings"]["task"]
== "reflector.worker.ics_sync.create_upcoming_meetings"
)
def test_all_tasks_have_schedule(self):
"""Every registered task must have a 'schedule' key."""
schedule = build_beat_schedule(
whereby_api_key="test-key",
daily_api_key="test-daily-key",
public_mode=True,
healthcheck_url="https://hc.example.com/ping",
)
for name, config in schedule.items():
assert "schedule" in config, f"Task '{name}' missing 'schedule' key"
assert "task" in config, f"Task '{name}' missing 'task' key"
class TestEmptyStringValues:
"""Empty strings should be treated as not configured (falsy)."""
def test_empty_whereby_key(self):
schedule = build_beat_schedule(whereby_api_key="")
assert not set(schedule.keys()) & WHEREBY_TASKS
def test_empty_daily_key(self):
schedule = build_beat_schedule(daily_api_key="")
assert not set(schedule.keys()) & DAILY_TASKS
def test_empty_sqs_url(self):
schedule = build_beat_schedule(aws_process_recording_queue_url="")
assert not set(schedule.keys()) & WHEREBY_TASKS
def test_none_values(self):
schedule = build_beat_schedule(
whereby_api_key=None,
daily_api_key=None,
aws_process_recording_queue_url=None,
)
assert len(schedule) == 0

View File

@@ -367,3 +367,126 @@ async def test_aws_storage_none_endpoint_url():
assert storage.base_url == "https://reflector-bucket.s3.amazonaws.com/"
# No s3 addressing_style override — boto_config should only have retries
assert not hasattr(storage.boto_config, "s3") or storage.boto_config.s3 is None
# --- Tests for get_source_storage() ---
def test_get_source_storage_daily_with_credentials():
"""Daily platform with access keys returns AwsStorage with Daily credentials."""
with patch("reflector.storage.settings") as mock_settings:
mock_settings.DAILYCO_STORAGE_AWS_ACCESS_KEY_ID = "daily-key"
mock_settings.DAILYCO_STORAGE_AWS_SECRET_ACCESS_KEY = "daily-secret"
mock_settings.DAILYCO_STORAGE_AWS_BUCKET_NAME = "daily-bucket"
mock_settings.DAILYCO_STORAGE_AWS_REGION = "us-west-2"
from reflector.storage import get_source_storage
storage = get_source_storage("daily")
assert isinstance(storage, AwsStorage)
assert storage._bucket_name == "daily-bucket"
assert storage._region == "us-west-2"
assert storage._access_key_id == "daily-key"
assert storage._secret_access_key == "daily-secret"
assert storage._endpoint_url is None
def test_get_source_storage_daily_falls_back_without_credentials():
"""Daily platform without access keys falls back to transcript storage."""
with patch("reflector.storage.settings") as mock_settings:
mock_settings.DAILYCO_STORAGE_AWS_ACCESS_KEY_ID = None
mock_settings.DAILYCO_STORAGE_AWS_SECRET_ACCESS_KEY = None
mock_settings.DAILYCO_STORAGE_AWS_BUCKET_NAME = "daily-bucket"
mock_settings.TRANSCRIPT_STORAGE_BACKEND = "aws"
mock_settings.TRANSCRIPT_STORAGE_AWS_BUCKET_NAME = "transcript-bucket"
mock_settings.TRANSCRIPT_STORAGE_AWS_REGION = "us-east-1"
mock_settings.TRANSCRIPT_STORAGE_AWS_ACCESS_KEY_ID = "transcript-key"
mock_settings.TRANSCRIPT_STORAGE_AWS_SECRET_ACCESS_KEY = "transcript-secret"
mock_settings.TRANSCRIPT_STORAGE_AWS_ENDPOINT_URL = None
from reflector.storage import get_source_storage
with patch("reflector.storage.get_transcripts_storage") as mock_get_transcripts:
fallback = AwsStorage(
aws_bucket_name="transcript-bucket",
aws_region="us-east-1",
aws_access_key_id="transcript-key",
aws_secret_access_key="transcript-secret",
)
mock_get_transcripts.return_value = fallback
storage = get_source_storage("daily")
mock_get_transcripts.assert_called_once()
assert storage is fallback
def test_get_source_storage_whereby_with_credentials():
"""Whereby platform with access keys returns AwsStorage with Whereby credentials."""
with patch("reflector.storage.settings") as mock_settings:
mock_settings.WHEREBY_STORAGE_AWS_ACCESS_KEY_ID = "whereby-key"
mock_settings.WHEREBY_STORAGE_AWS_SECRET_ACCESS_KEY = "whereby-secret"
mock_settings.WHEREBY_STORAGE_AWS_BUCKET_NAME = "whereby-bucket"
mock_settings.WHEREBY_STORAGE_AWS_REGION = "eu-west-1"
from reflector.storage import get_source_storage
storage = get_source_storage("whereby")
assert isinstance(storage, AwsStorage)
assert storage._bucket_name == "whereby-bucket"
assert storage._region == "eu-west-1"
assert storage._access_key_id == "whereby-key"
assert storage._secret_access_key == "whereby-secret"
def test_get_source_storage_unknown_platform_falls_back():
"""Unknown platform falls back to transcript storage."""
with patch("reflector.storage.settings"):
from reflector.storage import get_source_storage
with patch("reflector.storage.get_transcripts_storage") as mock_get_transcripts:
fallback = MagicMock()
mock_get_transcripts.return_value = fallback
storage = get_source_storage("unknown-platform")
mock_get_transcripts.assert_called_once()
assert storage is fallback
@pytest.mark.asyncio
async def test_source_storage_presigns_for_correct_bucket():
"""Source storage presigns URLs using the platform's credentials and the override bucket."""
with patch("reflector.storage.settings") as mock_settings:
mock_settings.DAILYCO_STORAGE_AWS_ACCESS_KEY_ID = "daily-key"
mock_settings.DAILYCO_STORAGE_AWS_SECRET_ACCESS_KEY = "daily-secret"
mock_settings.DAILYCO_STORAGE_AWS_BUCKET_NAME = "daily-bucket"
mock_settings.DAILYCO_STORAGE_AWS_REGION = "us-west-2"
from reflector.storage import get_source_storage
storage = get_source_storage("daily")
mock_client = AsyncMock()
mock_client.generate_presigned_url = AsyncMock(
return_value="https://daily-bucket.s3.amazonaws.com/track.webm?signed"
)
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
mock_client.__aexit__ = AsyncMock(return_value=None)
with patch.object(storage.session, "client", return_value=mock_client):
url = await storage.get_file_url(
"track.webm",
operation="get_object",
expires_in=3600,
bucket="override-bucket",
)
assert "track.webm" in url
mock_client.generate_presigned_url.assert_called_once()
call_kwargs = mock_client.generate_presigned_url.call_args
params = call_kwargs[1].get("Params") or call_kwargs[0][1]
assert params["Bucket"] == "override-bucket"
assert params["Key"] == "track.webm"