feat: 3-mode selfhosted refactoring (--gpu, --cpu, --hosted) + audio token auth fallback (#896)

* fix: use local in-process processing instead of the HTTP server for CPU mode

* add a fallback auth token when the service worker doesn't work

* chore: rename processors to keep the processor pattern up to date and allow other processors to be created and selected via env vars
Juan Diego García authored 2026-03-04 16:31:08 -05:00, committed by GitHub
parent 4235ab4293, commit a682846645
34 changed files with 2640 additions and 172 deletions
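
The three modes map onto the setup-script invocations documented in the updated compose header below. A minimal usage sketch, assuming defaults for the optional flags (flag combinations are taken from the new usage line; nothing else is confirmed by this diff):

  # --gpu: run the NVIDIA ML container (compose profile "gpu"),
  # optionally with local Ollama on GPU for summarization/topics
  ./scripts/setup-selfhosted.sh --gpu --ollama-gpu

  # --cpu: no ML container; transcription/diarization/translation
  # run in-process on the server/worker (HF_TOKEN needed for pyannote)
  ./scripts/setup-selfhosted.sh --cpu

  # --hosted: no local ML container; use a remote GPU service URL
  ./scripts/setup-selfhosted.sh --hosted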

@@ -1,11 +1,12 @@
 # Self-hosted production Docker Compose — single file for everything.
 #
-# Usage: ./scripts/setup-selfhosted.sh --gpu --ollama-gpu --garage --caddy
-# or: docker compose -f docker-compose.selfhosted.yml --profile gpu [--profile ollama-gpu] [--profile garage] [--profile caddy] up -d
+# Usage: ./scripts/setup-selfhosted.sh <--gpu|--cpu|--hosted> [--ollama-gpu|--ollama-cpu] [--garage] [--caddy]
+# or: docker compose -f docker-compose.selfhosted.yml [--profile gpu] [--profile ollama-gpu] [--profile garage] [--profile caddy] up -d
 #
-# Specialized models (pick ONE — required):
-# --profile gpu NVIDIA GPU for transcription/diarization/translation
-# --profile cpu CPU-only for transcription/diarization/translation
+# ML processing modes (pick ONE — required):
+# --gpu NVIDIA GPU container for transcription/diarization/translation (profile: gpu)
+# --cpu In-process CPU processing on server/worker (no ML container needed)
+# --hosted Remote GPU service URL (no ML container needed)
 #
 # Local LLM (optional — for summarization/topics):
 # --profile ollama-gpu Local Ollama with NVIDIA GPU
@@ -45,16 +46,9 @@ services:
 REDIS_HOST: redis
 CELERY_BROKER_URL: redis://redis:6379/1
 CELERY_RESULT_BACKEND: redis://redis:6379/1
-# Specialized models via gpu/cpu container (aliased as "transcription")
-TRANSCRIPT_BACKEND: modal
-TRANSCRIPT_URL: http://transcription:8000
-TRANSCRIPT_MODAL_API_KEY: selfhosted
-DIARIZATION_BACKEND: modal
-DIARIZATION_URL: http://transcription:8000
-TRANSLATION_BACKEND: modal
-TRANSLATE_URL: http://transcription:8000
-PADDING_BACKEND: modal
-PADDING_URL: http://transcription:8000
+# ML backend config comes from env_file (server/.env), set per-mode by setup script
+# HF_TOKEN needed for in-process pyannote diarization (--cpu mode)
+HF_TOKEN: ${HF_TOKEN:-}
 # WebRTC: fixed UDP port range for ICE candidates (mapped above)
 WEBRTC_PORT_RANGE: "51000-51100"
 depends_on:
@@ -79,15 +73,8 @@ services:
 REDIS_HOST: redis
 CELERY_BROKER_URL: redis://redis:6379/1
 CELERY_RESULT_BACKEND: redis://redis:6379/1
-TRANSCRIPT_BACKEND: modal
-TRANSCRIPT_URL: http://transcription:8000
-TRANSCRIPT_MODAL_API_KEY: selfhosted
-DIARIZATION_BACKEND: modal
-DIARIZATION_URL: http://transcription:8000
-TRANSLATION_BACKEND: modal
-TRANSLATE_URL: http://transcription:8000
-PADDING_BACKEND: modal
-PADDING_URL: http://transcription:8000
+# ML backend config comes from env_file (server/.env), set per-mode by setup script
+HF_TOKEN: ${HF_TOKEN:-}
 depends_on:
 postgres:
 condition: service_healthy
@@ -165,7 +152,10 @@ services:
 # ===========================================================
 # Specialized model containers (transcription, diarization, translation)
-# Both gpu and cpu get alias "transcription" so server config never changes.
+# Only the gpu profile is activated by the setup script (--gpu mode).
+# The cpu service definition is kept for manual/standalone use but is
+# NOT activated by --cpu mode (which uses in-process local backends).
+# Both services get alias "transcription" so server config never changes.
 # ===========================================================
 gpu:
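
The hard-coded backend settings removed above now come from server/.env, which the setup script writes per mode. A hedged sketch of what each mode's env block might look like: the modal backend and container URL are taken from the old compose values, while the "local" backend name and the hosted URL are hypothetical placeholders, not confirmed by this diff.

  # --gpu mode: keep routing to the "transcription" container alias
  TRANSCRIPT_BACKEND=modal
  TRANSCRIPT_URL=http://transcription:8000
  TRANSCRIPT_MODAL_API_KEY=selfhosted

  # --cpu mode: hypothetical in-process backend selector, no URL needed
  TRANSCRIPT_BACKEND=local
  HF_TOKEN=hf_xxx  # placeholder; required for in-process pyannote diarization

  # --hosted mode: hypothetical remote GPU service URL
  TRANSCRIPT_BACKEND=modal
  TRANSCRIPT_URL=https://your-gpu-service.example.com

The DIARIZATION_*, TRANSLATION_*, and PADDING_* variables from the removed block would presumably follow the same per-mode pattern.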