feat: add local pyannote file diarization processor (#858)

* feat: add local pyannote file diarization processor Enables file diarization without Modal by using pyannote.audio locally. Downloads model bundle from S3 on first use, caches locally, patches config to use local paths. Set DIARIZATION_BACKEND=pyannote to enable. * fix: standalone setup enables pyannote diarization and public mode Replace DIARIZATION_ENABLED=false with DIARIZATION_BACKEND=pyannote so file uploads get speaker diarization out of the box. Add PUBLIC_MODE=true so unauthenticated users can list/browse transcripts. * fix: touch env files before first compose_cmd in standalone setup docker-compose.yml references www/.env.local as env_file, but the setup script only creates it in step 4. compose_cmd calls in step 3 (Garage) fail on a fresh clone when the file doesn't exist yet. * feat: standalone uses self-hosted GPU service for transcription+diarization Replace in-process pyannote approach with self-hosted gpu/self_hosted/ service. Same HTTP API as Modal — just TRANSCRIPT_URL/DIARIZATION_URL point to local container. - Add gpu/self_hosted/Dockerfile.cpu (GPU Dockerfile minus NVIDIA CUDA) - Add S3 model bundle fallback in diarizer.py when HF_TOKEN not set - Add gpu service to docker-compose.standalone.yml with compose env overrides - Fix /browse empty in PUBLIC_MODE (search+list queries filtered out roomless transcripts) - Remove audio_diarization_pyannote.py, file_diarization_pyannote.py and tests - Remove pyannote-audio from server local deps * fix: allow unauthenticated GPU requests when no API key configured OAuth2PasswordBearer with auto_error=True rejects requests without Authorization header before apikey_auth can check if auth is needed. 
* fix: rename standalone gpu service to cpu to match Dockerfile.cpu usage * docs: add programmatic testing section and fix gpu->cpu naming in setup script/docs - Add "Testing programmatically" section to standalone docs with curl commands for creating transcript, uploading audio, polling status, checking result - Fix setup-standalone.sh to reference `cpu` service (was still `gpu` after rename) - Update all docs references from gpu to cpu service naming --------- Co-authored-by: Igor Loskutov <igor.loskutoff@gmail.com>
2026-04-16 10:16:55 +00:00 · 2026-02-11 12:41:32 -05:00
parent ec4f356b4c
commit adc4c20bf4
12 changed files with 248 additions and 777 deletions
--- a/gpu/self_hosted/Dockerfile.cpu
+++ b/gpu/self_hosted/Dockerfile.cpu
@@ -0,0 +1,39 @@
+# CPU-only image for the self-hosted service: plain python:3.12-slim base (no
+# NVIDIA CUDA layer), ffmpeg from apt, dependencies installed with uv, and the
+# server started through runserver.sh on port 8000.
+FROM python:3.12-slim
+
+# PYTHONUNBUFFERED: write Python output straight to the container log.
+# UV_LINK_MODE=copy: make uv copy packages into the environment instead of linking.
+# UV_NO_CACHE=1: make uv avoid its persistent cache.
+ENV PYTHONUNBUFFERED=1 \
+    UV_LINK_MODE=copy \
+    UV_NO_CACHE=1
+
+# System packages; the apt cache and list directories are BuildKit cache mounts
+# shared (locked) between builds.
+WORKDIR /tmp
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+ --mount=type=cache,target=/var/lib/apt,sharing=locked \
+ apt-get update \
+ && apt-get install -y \
+    ffmpeg \
+    curl \
+    ca-certificates \
+    gnupg \
+    wget
+# Install uv with the upstream installer script, then expose it on PATH
+# (the PATH entry below is /root/.local/bin).
+ADD https://astral.sh/uv/install.sh /uv-installer.sh
+RUN sh /uv-installer.sh && rm /uv-installer.sh
+ENV PATH="/root/.local/bin/:$PATH"
+
+# Project manifest and lockfile first, then the application sources.
+RUN mkdir -p /app
+WORKDIR /app
+COPY pyproject.toml uv.lock /app/
+
+
+COPY ./app /app/app
+COPY ./main.py /app/
+COPY ./runserver.sh /app/
+
+# prevent uv failing with too many open files on big cpus
+ENV UV_CONCURRENT_INSTALLS=16
+
+# Install the dependency set pinned in uv.lock (--locked) and precompile
+# bytecode so the first start does not have to.
+# NOTE(review): UV_NO_CACHE=1 is set above, so the /root/.cache/uv cache mount
+# presumably has no effect here — confirm and drop one of the two.
+# NOTE(review): this step runs after the source COPYs, so any source change
+# re-runs the install layer — confirm whether that is intended.
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv sync --compile-bytecode --locked
+
+EXPOSE 8000
+
+# runserver.sh is the container entry point.
+CMD ["sh", "/app/runserver.sh"]
--- a/gpu/self_hosted/app/auth.py
+++ b/gpu/self_hosted/app/auth.py
@@ -3,14 +3,14 @@ import os
 from fastapi import Depends, HTTPException, status
 from fastapi.security import OAuth2PasswordBearer

-oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token")
+oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token", auto_error=False)


-def apikey_auth(apikey: str = Depends(oauth2_scheme)):
+def apikey_auth(apikey: str | None = Depends(oauth2_scheme)):
    required_key = os.environ.get("REFLECTOR_GPU_APIKEY")
    if not required_key:
        return
-    if apikey == required_key:
+    if apikey and apikey == required_key:
        return
    raise HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
--- a/gpu/self_hosted/app/services/diarizer.py
+++ b/gpu/self_hosted/app/services/diarizer.py
@@ -1,10 +1,65 @@
+import logging
 import os
+import tarfile
 import threading
+from pathlib import Path
+from urllib.request import urlopen

 import torch
 import torchaudio
+import yaml
 from pyannote.audio import Pipeline

+logger = logging.getLogger(__name__)
+
+# Tarball with the pyannote speaker-diarization-3.1 model bundle; _ensure_model()
+# fetches it with a plain urlopen() (no credentials) on a cache miss.
+S3_BUNDLE_URL = "https://reflector-public.s3.us-east-1.amazonaws.com/pyannote-speaker-diarization-3.1.tar.gz"
+# Directory the bundle is extracted into and reused from on later loads.
+BUNDLE_CACHE_DIR = Path("/root/.cache/pyannote-bundle")
+
+
+def _ensure_model(cache_dir: Path) -> str:
+    """Download and extract S3 model bundle if not cached."""
+    model_dir = cache_dir / "pyannote-speaker-diarization-3.1"
+    config_path = model_dir / "config.yaml"
+
+    if config_path.exists():
+        logger.info("Using cached model bundle at %s", model_dir)
+        return str(model_dir)
+
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    tarball_path = cache_dir / "model.tar.gz"
+
+    logger.info("Downloading model bundle from %s", S3_BUNDLE_URL)
+    with urlopen(S3_BUNDLE_URL) as response, open(tarball_path, "wb") as f:
+        while chunk := response.read(8192):
+            f.write(chunk)
+
+    logger.info("Extracting model bundle")
+    with tarfile.open(tarball_path, "r:gz") as tar:
+        tar.extractall(path=cache_dir, filter="data")
+    tarball_path.unlink()
+
+    _patch_config(model_dir, cache_dir)
+    return str(model_dir)
+
+
+def _patch_config(model_dir: Path, cache_dir: Path) -> None:
+    """Rewrite config.yaml to reference local pytorch_model.bin paths."""
+    config_path = model_dir / "config.yaml"
+    with open(config_path) as f:
+        config = yaml.safe_load(f)
+
+    config["pipeline"]["params"]["segmentation"] = str(
+        cache_dir / "pyannote-segmentation-3.0" / "pytorch_model.bin"
+    )
+    config["pipeline"]["params"]["embedding"] = str(
+        cache_dir / "pyannote-wespeaker-voxceleb-resnet34-LM" / "pytorch_model.bin"
+    )
+
+    with open(config_path, "w") as f:
+        yaml.dump(config, f)
+
+    logger.info("Patched config.yaml with local model paths")
+

 class PyannoteDiarizationService:
    def __init__(self):
@@ -14,10 +69,20 @@ class PyannoteDiarizationService:

    def load(self):
+        """Load the pyannote diarization pipeline and move it to the chosen device.
+
+        The device is CUDA when torch reports it as available, otherwise CPU.
+        With HF_TOKEN set the pipeline comes from HuggingFace; without it the
+        S3 model bundle is fetched/cached by _ensure_model() and the pipeline is
+        built from that bundle's config.yaml.
+        """
        self._device = "cuda" if torch.cuda.is_available() else "cpu"
-        self._pipeline = Pipeline.from_pretrained(
-            "pyannote/speaker-diarization-3.1",
-            use_auth_token=os.environ.get("HF_TOKEN"),
-        )
+        hf_token = os.environ.get("HF_TOKEN")
+
+        # An unset or empty HF_TOKEN selects the S3 bundle path below.
+        if hf_token:
+            logger.info("Loading pyannote model from HuggingFace (HF_TOKEN set)")
+            self._pipeline = Pipeline.from_pretrained(
+                "pyannote/speaker-diarization-3.1",
+                use_auth_token=hf_token,
+            )
+        else:
+            logger.info("HF_TOKEN not set — loading model from S3 bundle")
+            model_path = _ensure_model(BUNDLE_CACHE_DIR)
+            # Pipeline is given the path of the local config.yaml, not a model id.
+            config_path = Path(model_path) / "config.yaml"
+            self._pipeline = Pipeline.from_pretrained(str(config_path))
+
        self._pipeline.to(torch.device(self._device))

    def diarize_file(self, file_path: str, timestamp: float = 0.0) -> dict: