docs: docs website + installation (#778)

* feat: WIP doc (vibe started and iterated) * install from scratch docs * caddyfile.example * gitignore * authentik script * authentik script * authentik script * llm doc * authentik ongoing * more daily setup logs * doc website * gpu self hosted setup guide (no-mistakes) * doc review round * doc review round * doc review round * update doc site sidebars * feat(docs): add mermaid diagram support * docs polishing * live pipeline doc * move pipeline dev docs to dev docs location * doc pr review iteration * dockerfile healthcheck * docs/pr-comments * remove jwt comment * llm suggestion * pr comments * pr comments * document auto migrations * cleanup docs --------- Co-authored-by: Mathieu Virbel <mat@meltingrocks.com> Co-authored-by: Igor Loskutov <igor.loskutoff@gmail.com>
2026-02-04 09:56:47 +00:00 · 2026-01-06 17:25:02 -05:00
parent e644d6497b
commit 407c15299f
61 changed files with 32653 additions and 26 deletions
--- a/gpu/modal_deployments/deploy-all.sh
+++ b/gpu/modal_deployments/deploy-all.sh
@@ -0,0 +1,150 @@
+#!/bin/bash
+set -e
+
+# --- Usage ---
+usage() {
+    echo "Usage: $0 [OPTIONS]"
+    echo ""
+    echo "Options:"
+    echo "  --hf-token TOKEN    HuggingFace token"
+    echo "  --help              Show this help message"
+    echo ""
+    echo "Examples:"
+    echo "  $0                              # Interactive mode"
+    echo "  $0 --hf-token hf_xxxxx          # Non-interactive mode"
+    echo ""
+    exit 0
+}
+
+# --- Parse Arguments ---
+HF_TOKEN=""
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --hf-token)
+            HF_TOKEN="$2"
+            shift 2
+            ;;
+        --help)
+            usage
+            ;;
+        *)
+            echo "Unknown option: $1"
+            usage
+            ;;
+    esac
+done
+
+echo "=========================================="
+echo "Reflector GPU Functions Deployment"
+echo "=========================================="
+echo ""
+
+# --- Check Dependencies ---
+if ! command -v modal &> /dev/null; then
+    echo "Error: Modal CLI not installed."
+    echo "  Install with: pip install modal"
+    exit 1
+fi
+
+if ! command -v openssl &> /dev/null; then
+    echo "Error: openssl not found."
+    echo "  Mac: brew install openssl"
+    echo "  Ubuntu: sudo apt-get install openssl"
+    exit 1
+fi
+
+# Check Modal authentication
+if ! modal profile current &> /dev/null; then
+    echo "Error: Not authenticated with Modal."
+    echo "  Run: modal setup"
+    exit 1
+fi
+
+# --- HuggingFace Token Setup ---
+# Prompt only when no token was supplied on the command line.
+if [ -z "$HF_TOKEN" ]; then
+    echo "HuggingFace token required for Pyannote diarization model."
+    echo "1. Create account at https://huggingface.co"
+    echo "2. Accept license at https://huggingface.co/pyannote/speaker-diarization-3.1"
+    echo "3. Generate token at https://huggingface.co/settings/tokens"
+    echo ""
+    # -r keeps backslashes literal. `|| true` stops `set -e` from ending the
+    # script silently when stdin is at end-of-file (e.g. a piped or CI run);
+    # the empty-token check below then reports the problem.
+    read -r -p "Enter your HuggingFace token: " HF_TOKEN || true
+fi
+
+if [ -z "$HF_TOKEN" ]; then
+    echo "Error: HuggingFace token is required for diarization"
+    exit 1
+fi
+
+# Basic token format validation
+if [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
+    echo "Warning: HuggingFace tokens usually start with 'hf_'"
+    # Only ask for confirmation when stdin is a terminal.
+    if [ -t 0 ]; then
+        # A failed read leaves `confirm` without a "y", which aborts below.
+        read -r -p "Continue anyway? (y/n): " confirm || true
+        if [ "$confirm" != "y" ]; then
+            exit 1
+        fi
+    else
+        echo "Non-interactive mode: proceeding anyway"
+    fi
+fi
+
+# --- Auto-generate reflector<->GPU API Key ---
+echo ""
+echo "Generating API key for GPU services..."
+API_KEY=$(openssl rand -hex 32)
+
+# --- Create Modal Secrets ---
+echo "Creating Modal secrets..."
+
+# Create or update hf_token secret (delete first if exists)
+if modal secret list 2>/dev/null | grep -q "hf_token"; then
+    echo "  -> Recreating secret: hf_token"
+    modal secret delete hf_token --yes 2>/dev/null || true
+fi
+echo "  -> Creating secret: hf_token"
+modal secret create hf_token HF_TOKEN="$HF_TOKEN"
+
+# Create or update reflector-gpu secret (delete first if exists)
+if modal secret list 2>/dev/null | grep -q "reflector-gpu"; then
+    echo "  -> Recreating secret: reflector-gpu"
+    modal secret delete reflector-gpu --yes 2>/dev/null || true
+fi
+echo "  -> Creating secret: reflector-gpu"
+modal secret create reflector-gpu REFLECTOR_GPU_APIKEY="$API_KEY"
+
+# --- Deploy Functions ---
+echo ""
+echo "Deploying transcriber (Whisper)..."
+TRANSCRIBER_URL=$(modal deploy reflector_transcriber.py 2>&1 | grep -o 'https://[^ ]*web.modal.run' | head -1)
+if [ -z "$TRANSCRIBER_URL" ]; then
+    echo "Error: Failed to deploy transcriber. Check Modal dashboard for details."
+    exit 1
+fi
+echo "  -> $TRANSCRIBER_URL"
+
+echo ""
+echo "Deploying diarizer (Pyannote)..."
+DIARIZER_URL=$(modal deploy reflector_diarizer.py 2>&1 | grep -o 'https://[^ ]*web.modal.run' | head -1)
+if [ -z "$DIARIZER_URL" ]; then
+    echo "Error: Failed to deploy diarizer. Check Modal dashboard for details."
+    exit 1
+fi
+echo "  -> $DIARIZER_URL"
+
+# --- Output Configuration ---
+# Print the completion banner followed by the .env snippet, with the deployed
+# URLs and the generated API key substituted in.
+cat <<EOF
+
+==========================================
+Deployment complete!
+==========================================
+
+Copy these values to your server's server/.env file:
+
+# --- Modal GPU Configuration ---
+TRANSCRIPT_BACKEND=modal
+TRANSCRIPT_URL=$TRANSCRIBER_URL
+TRANSCRIPT_MODAL_API_KEY=$API_KEY
+
+DIARIZATION_BACKEND=modal
+DIARIZATION_URL=$DIARIZER_URL
+DIARIZATION_MODAL_API_KEY=$API_KEY
+# --- End Modal Configuration ---
+EOF
--- a/gpu/modal_deployments/reflector_diarizer.py
+++ b/gpu/modal_deployments/reflector_diarizer.py
@@ -24,6 +24,12 @@ app = modal.App(name="reflector-diarizer")
 upload_volume = modal.Volume.from_name("diarizer-uploads", create_if_missing=True)


+# IMPORTANT: This function is duplicated in multiple files for deployment isolation.
+# If you modify the audio format detection logic, you MUST update all copies:
+#   - gpu/self_hosted/app/utils.py
+#   - gpu/modal_deployments/reflector_transcriber.py (2 copies)
+#   - gpu/modal_deployments/reflector_transcriber_parakeet.py
+#   - gpu/modal_deployments/reflector_diarizer.py (this file)
 def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtension:
    parsed_url = urlparse(url)
    url_path = parsed_url.path
@@ -39,6 +45,8 @@ def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtens
        return AudioFileExtension("wav")
    if "audio/mp4" in content_type:
        return AudioFileExtension("mp4")
+    if "audio/webm" in content_type or "video/webm" in content_type:
+        return AudioFileExtension("webm")

    raise ValueError(
        f"Unsupported audio format for URL: {url}. "
@@ -105,7 +113,7 @@ def download_pyannote_audio():


 diarizer_image = (
-    modal.Image.debian_slim(python_version="3.10.8")
+    modal.Image.debian_slim(python_version="3.10")
    .pip_install(
        "pyannote.audio==3.1.0",
        "requests",
@@ -116,7 +124,7 @@ diarizer_image = (
        "transformers==4.34.0",
        "sentencepiece",
        "protobuf",
-        "numpy",
+        "numpy<2",
        "huggingface_hub",
        "hf-transfer",
    )
--- a/gpu/modal_deployments/reflector_transcriber.py
+++ b/gpu/modal_deployments/reflector_transcriber.py
@@ -89,6 +89,7 @@ image = (
        "torch==2.5.1",
        "faster-whisper==1.1.1",
        "fastapi==0.115.12",
+        "python-multipart",
        "requests",
        "librosa==0.10.1",
        "numpy<2",
@@ -98,6 +99,12 @@ image = (
 )


+# IMPORTANT: This function is duplicated in multiple files for deployment isolation.
+# If you modify the audio format detection logic, you MUST update all copies:
+#   - gpu/self_hosted/app/utils.py
+#   - gpu/modal_deployments/reflector_transcriber.py (this file - 2 copies!)
+#   - gpu/modal_deployments/reflector_transcriber_parakeet.py
+#   - gpu/modal_deployments/reflector_diarizer.py
 def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtension:
    parsed_url = urlparse(url)
    url_path = parsed_url.path
@@ -113,6 +120,8 @@ def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtens
        return AudioFileExtension("wav")
    if "audio/mp4" in content_type:
        return AudioFileExtension("mp4")
+    if "audio/webm" in content_type or "video/webm" in content_type:
+        return AudioFileExtension("webm")

    raise ValueError(
        f"Unsupported audio format for URL: {url}. "
@@ -315,6 +324,11 @@ class TranscriberWhisperFile:
        import numpy as np
        from silero_vad import VADIterator

+        # IMPORTANT: This VAD segment logic is duplicated in multiple files for deployment isolation.
+        # If you modify this function, you MUST update all copies:
+        #   - gpu/modal_deployments/reflector_transcriber.py (this file)
+        #   - gpu/modal_deployments/reflector_transcriber_parakeet.py
+        #   - gpu/self_hosted/app/services/transcriber.py
        def vad_segments(
            audio_array,
            sample_rate: int = SAMPLERATE,
@@ -322,6 +336,7 @@ class TranscriberWhisperFile:
        ) -> Generator[TimeSegment, None, None]:
            """Generate speech segments as TimeSegment using Silero VAD."""
            iterator = VADIterator(self.vad_model, sampling_rate=sample_rate)
+            audio_duration = len(audio_array) / float(SAMPLERATE)
            start = None
            for i in range(0, len(audio_array), window_size):
                chunk = audio_array[i : i + window_size]
@@ -341,6 +356,9 @@ class TranscriberWhisperFile:
                        start / float(SAMPLERATE), end / float(SAMPLERATE)
                    )
                    start = None
+            # Handle case where audio ends while speech is still active
+            if start is not None:
+                yield TimeSegment(start / float(SAMPLERATE), audio_duration)
            iterator.reset_states()

        upload_volume.reload()
@@ -406,6 +424,12 @@ class TranscriberWhisperFile:
        return {"text": " ".join(all_text), "words": all_words}


+# IMPORTANT: This function is duplicated in multiple files for deployment isolation.
+# If you modify the audio format detection logic, you MUST update all copies:
+#   - gpu/self_hosted/app/utils.py
+#   - gpu/modal_deployments/reflector_transcriber.py (this file - 2 copies!)
+#   - gpu/modal_deployments/reflector_transcriber_parakeet.py
+#   - gpu/modal_deployments/reflector_diarizer.py
 def detect_audio_format(url: str, headers: dict) -> str:
    from urllib.parse import urlparse

@@ -423,6 +447,8 @@ def detect_audio_format(url: str, headers: dict) -> str:
        return "wav"
    if "audio/mp4" in content_type:
        return "mp4"
+    if "audio/webm" in content_type or "video/webm" in content_type:
+        return "webm"

    raise HTTPException(
        status_code=400,
--- a/gpu/modal_deployments/reflector_transcriber_parakeet.py
+++ b/gpu/modal_deployments/reflector_transcriber_parakeet.py
@@ -90,6 +90,12 @@ image = (
 )


+# IMPORTANT: This function is duplicated in multiple files for deployment isolation.
+# If you modify the audio format detection logic, you MUST update all copies:
+#   - gpu/self_hosted/app/utils.py
+#   - gpu/modal_deployments/reflector_transcriber.py (2 copies)
+#   - gpu/modal_deployments/reflector_transcriber_parakeet.py (this file)
+#   - gpu/modal_deployments/reflector_diarizer.py
 def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtension:
    parsed_url = urlparse(url)
    url_path = parsed_url.path
@@ -105,6 +111,8 @@ def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtens
        return AudioFileExtension("wav")
    if "audio/mp4" in content_type:
        return AudioFileExtension("mp4")
+    if "audio/webm" in content_type or "video/webm" in content_type:
+        return AudioFileExtension("webm")

    raise ValueError(
        f"Unsupported audio format for URL: {url}. "
@@ -301,6 +309,11 @@ class TranscriberParakeetFile:
            audio_array, sample_rate = librosa.load(file_path, sr=SAMPLERATE, mono=True)
            return audio_array

+        # IMPORTANT: This VAD segment logic is duplicated in multiple files for deployment isolation.
+        # If you modify this function, you MUST update all copies:
+        #   - gpu/modal_deployments/reflector_transcriber.py
+        #   - gpu/modal_deployments/reflector_transcriber_parakeet.py (this file)
+        #   - gpu/self_hosted/app/services/transcriber.py
        def vad_segment_generator(
            audio_array,
        ) -> Generator[TimeSegment, None, None]:
--- a/gpu/modal_deployments/reflector_translator.py
+++ b/gpu/modal_deployments/reflector_translator.py
@@ -103,7 +103,7 @@ def configure_seamless_m4t():


 transcriber_image = (
-    Image.debian_slim(python_version="3.10.8")
+    Image.debian_slim(python_version="3.10")
    .apt_install("git")
    .apt_install("wget")
    .apt_install("libsndfile-dev")
@@ -119,6 +119,7 @@ transcriber_image = (
        "fairseq2",
        "pyyaml",
        "hf-transfer~=0.1",
+        "pydantic",
    )
    .run_function(install_seamless_communication)
    .run_function(download_seamlessm4t_model)