feat: 3-mode selfhosted refactoring (--gpu, --cpu, --hosted) + audio token auth fallback (#896)

* fix: use local in-process processing instead of the HTTP server for CPU mode

* add a fallback auth token when the service worker doesn't work

* chore: rename processors to keep the processor pattern up to date and allow other processors to be created and selected via env vars
Juan Diego García authored 2026-03-04 16:31:08 -05:00, committed by GitHub
parent 4235ab4293, commit a682846645
34 changed files with 2640 additions and 172 deletions
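
The three modes map onto the setup-script invocations documented in the updated compose header below. A minimal usage sketch, assuming defaults for the optional flags (flag combinations are taken from the new usage line; nothing else is confirmed by this diff):

  # --gpu: run the NVIDIA ML container (compose profile "gpu"),
  # optionally with local Ollama on GPU for summarization/topics
  ./scripts/setup-selfhosted.sh --gpu --ollama-gpu

  # --cpu: no ML container; transcription/diarization/translation
  # run in-process on the server/worker (HF_TOKEN needed for pyannote)
  ./scripts/setup-selfhosted.sh --cpu

  # --hosted: no local ML container; use a remote GPU service URL
  ./scripts/setup-selfhosted.sh --hosted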

@@ -1,11 +1,12 @@
 # Self-hosted production Docker Compose — single file for everything.
 #
-# Usage: ./scripts/setup-selfhosted.sh --gpu --ollama-gpu --garage --caddy
-# or: docker compose -f docker-compose.selfhosted.yml --profile gpu [--profile ollama-gpu] [--profile garage] [--profile caddy] up -d
+# Usage: ./scripts/setup-selfhosted.sh <--gpu|--cpu|--hosted> [--ollama-gpu|--ollama-cpu] [--garage] [--caddy]
+# or: docker compose -f docker-compose.selfhosted.yml [--profile gpu] [--profile ollama-gpu] [--profile garage] [--profile caddy] up -d
 #
-# Specialized models (pick ONE — required):
-# --profile gpu NVIDIA GPU for transcription/diarization/translation
-# --profile cpu CPU-only for transcription/diarization/translation
+# ML processing modes (pick ONE — required):
+# --gpu NVIDIA GPU container for transcription/diarization/translation (profile: gpu)
+# --cpu In-process CPU processing on server/worker (no ML container needed)
+# --hosted Remote GPU service URL (no ML container needed)
 #
 # Local LLM (optional — for summarization/topics):
 # --profile ollama-gpu Local Ollama with NVIDIA GPU
@@ -45,16 +46,9 @@ services:
 REDIS_HOST: redis
 CELERY_BROKER_URL: redis://redis:6379/1
 CELERY_RESULT_BACKEND: redis://redis:6379/1
-# Specialized models via gpu/cpu container (aliased as "transcription")
-TRANSCRIPT_BACKEND: modal
-TRANSCRIPT_URL: http://transcription:8000
-TRANSCRIPT_MODAL_API_KEY: selfhosted
-DIARIZATION_BACKEND: modal
-DIARIZATION_URL: http://transcription:8000
-TRANSLATION_BACKEND: modal
-TRANSLATE_URL: http://transcription:8000
-PADDING_BACKEND: modal
-PADDING_URL: http://transcription:8000
+# ML backend config comes from env_file (server/.env), set per-mode by setup script
+# HF_TOKEN needed for in-process pyannote diarization (--cpu mode)
+HF_TOKEN: ${HF_TOKEN:-}
 # WebRTC: fixed UDP port range for ICE candidates (mapped above)
 WEBRTC_PORT_RANGE: "51000-51100"
 depends_on:
@@ -79,15 +73,8 @@ services:
 REDIS_HOST: redis
 CELERY_BROKER_URL: redis://redis:6379/1
 CELERY_RESULT_BACKEND: redis://redis:6379/1
-TRANSCRIPT_BACKEND: modal
-TRANSCRIPT_URL: http://transcription:8000
-TRANSCRIPT_MODAL_API_KEY: selfhosted
-DIARIZATION_BACKEND: modal
-DIARIZATION_URL: http://transcription:8000
-TRANSLATION_BACKEND: modal
-TRANSLATE_URL: http://transcription:8000
-PADDING_BACKEND: modal
-PADDING_URL: http://transcription:8000
+# ML backend config comes from env_file (server/.env), set per-mode by setup script
+HF_TOKEN: ${HF_TOKEN:-}
 depends_on:
 postgres:
 condition: service_healthy
@@ -165,7 +152,10 @@ services:
 # ===========================================================
 # Specialized model containers (transcription, diarization, translation)
-# Both gpu and cpu get alias "transcription" so server config never changes.
+# Only the gpu profile is activated by the setup script (--gpu mode).
+# The cpu service definition is kept for manual/standalone use but is
+# NOT activated by --cpu mode (which uses in-process local backends).
+# Both services get alias "transcription" so server config never changes.
 # ===========================================================
 gpu:
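
The hard-coded backend settings removed above now come from server/.env, which the setup script writes per mode. A hedged sketch of what each mode's env block might look like: the modal backend and container URL are taken from the old compose values, while the "local" backend name and the hosted URL are hypothetical placeholders, not confirmed by this diff.

  # --gpu mode: keep routing to the "transcription" container alias
  TRANSCRIPT_BACKEND=modal
  TRANSCRIPT_URL=http://transcription:8000
  TRANSCRIPT_MODAL_API_KEY=selfhosted

  # --cpu mode: hypothetical in-process backend selector, no URL needed
  TRANSCRIPT_BACKEND=local
  HF_TOKEN=hf_xxx  # placeholder; required for in-process pyannote diarization

  # --hosted mode: hypothetical remote GPU service URL
  TRANSCRIPT_BACKEND=modal
  TRANSCRIPT_URL=https://your-gpu-service.example.com

The DIARIZATION_*, TRANSLATION_*, and PADDING_* variables from the removed block would presumably follow the same per-mode pattern.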