feat: 3-mode selfhosted refactoring (--gpu, --cpu, --hosted) + audio token auth fallback (#896)

* fix: local processing instead of http server for cpu * add fallback token if service worker doesn't work * chore: rename processors to keep the processor pattern up to date and allow other processors to be created and selected with env vars
2026-04-13 08:46:55 +00:00 · 2026-03-04 16:31:08 -05:00
parent 4235ab4293
commit a682846645
34 changed files with 2640 additions and 172 deletions
--- a/server/.env.selfhosted.example
+++ b/server/.env.selfhosted.example
@@ -32,26 +32,46 @@ AUTH_BACKEND=none

 # =======================================================
 # Specialized Models (Transcription, Diarization, Translation)
-# These run in the gpu/cpu container — NOT an LLM.
-# The "modal" backend means "HTTP API client" — it talks to
-# the self-hosted container, not Modal.com cloud.
+# These do NOT use an LLM. Configured per mode by the setup script:
+#
+# --gpu mode:    modal backends → GPU container (http://transcription:8000)
+# --cpu mode:    whisper/pyannote/marian/pyav → in-process ML on server/worker
+# --hosted mode: modal backends → user-provided remote GPU service URL
 # =======================================================
+
+# --- --gpu mode (default) ---
 TRANSCRIPT_BACKEND=modal
 TRANSCRIPT_URL=http://transcription:8000
 TRANSCRIPT_MODAL_API_KEY=selfhosted
-
 DIARIZATION_ENABLED=true
 DIARIZATION_BACKEND=modal
 DIARIZATION_URL=http://transcription:8000
-
 TRANSLATION_BACKEND=modal
 TRANSLATE_URL=http://transcription:8000
-
 PADDING_BACKEND=modal
 PADDING_URL=http://transcription:8000

-# HuggingFace token — optional, for gated models (e.g. pyannote).
-# Falls back to public S3 model bundle if not set.
+# --- --cpu mode (set by setup script) ---
+# TRANSCRIPT_BACKEND=whisper
+# DIARIZATION_BACKEND=pyannote
+# TRANSLATION_BACKEND=marian
+# PADDING_BACKEND=pyav
+
+# --- --hosted mode (set by setup script) ---
+# TRANSCRIPT_BACKEND=modal
+# TRANSCRIPT_URL=https://your-gpu-service.example.com
+# DIARIZATION_BACKEND=modal
+# DIARIZATION_URL=https://your-gpu-service.example.com
+# ... (all URLs point to one remote service)
+
+# Whisper model sizes for local transcription (--cpu mode)
+# Options: "tiny", "base", "small", "medium", "large-v2"
+# WHISPER_CHUNK_MODEL=tiny
+# WHISPER_FILE_MODEL=tiny
+
+# HuggingFace token — for gated models (e.g. pyannote diarization).
+# Used by --gpu and --cpu modes; optional — falls back to public S3 bundle if not set.
+# Not needed for --hosted mode (remote service handles its own auth).
 # HF_TOKEN=hf_xxxxx

 # =======================================================