docs: docs website + installation (#778)

* feat: WIP doc (vibe started and iterated)

* install from scratch docs

* caddyfile.example

* gitignore

* authentik script

* authentik script

* authentik script

* llm doc

* authentik ongoing

* more daily setup logs

* doc website

* gpu self hosted setup guide (no-mistakes)

* doc review round

* doc review round

* doc review round

* update doc site sidebars

* feat(docs): add mermaid diagram support

* docs polishing

* live pipeline doc

* move pipeline dev docs to dev docs location

* doc pr review iteration

* dockerfile healthcheck

* docs/pr-comments

* remove jwt comment

* llm suggestion

* pr comments

* pr comments

* document auto migrations

* cleanup docs

---------

Co-authored-by: Mathieu Virbel <mat@meltingrocks.com>
Co-authored-by: Igor Loskutov <igor.loskutoff@gmail.com>
Committed by GitHub on 2026-01-06 17:25:02 -05:00
parent e644d6497b
commit 407c15299f
61 changed files with 32653 additions and 26 deletions


@@ -0,0 +1,150 @@
#!/bin/bash
set -e

# --- Usage ---
usage() {
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Options:"
    echo "  --hf-token TOKEN   HuggingFace token"
    echo "  --help             Show this help message"
    echo ""
    echo "Examples:"
    echo "  $0                        # Interactive mode"
    echo "  $0 --hf-token hf_xxxxx    # Non-interactive mode"
    echo ""
    exit 0
}

# --- Parse Arguments ---
HF_TOKEN=""
while [[ $# -gt 0 ]]; do
    case $1 in
        --hf-token)
            HF_TOKEN="$2"
            shift 2
            ;;
        --help)
            usage
            ;;
        *)
            echo "Unknown option: $1"
            usage
            ;;
    esac
done

echo "=========================================="
echo "Reflector GPU Functions Deployment"
echo "=========================================="
echo ""

# --- Check Dependencies ---
if ! command -v modal &> /dev/null; then
    echo "Error: Modal CLI not installed."
    echo "  Install with: pip install modal"
    exit 1
fi

if ! command -v openssl &> /dev/null; then
    echo "Error: openssl not found."
    echo "  Mac: brew install openssl"
    echo "  Ubuntu: sudo apt-get install openssl"
    exit 1
fi

# Check Modal authentication
if ! modal profile current &> /dev/null; then
    echo "Error: Not authenticated with Modal."
    echo "  Run: modal setup"
    exit 1
fi

# --- HuggingFace Token Setup ---
if [ -z "$HF_TOKEN" ]; then
    echo "HuggingFace token required for Pyannote diarization model."
    echo "1. Create account at https://huggingface.co"
    echo "2. Accept license at https://huggingface.co/pyannote/speaker-diarization-3.1"
    echo "3. Generate token at https://huggingface.co/settings/tokens"
    echo ""
    read -p "Enter your HuggingFace token: " HF_TOKEN
fi

if [ -z "$HF_TOKEN" ]; then
    echo "Error: HuggingFace token is required for diarization"
    exit 1
fi

# Basic token format validation
if [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
    echo "Warning: HuggingFace tokens usually start with 'hf_'"
    if [ -t 0 ]; then
        read -p "Continue anyway? (y/n): " confirm
        if [ "$confirm" != "y" ]; then
            exit 1
        fi
    else
        echo "Non-interactive mode: proceeding anyway"
    fi
fi

# --- Auto-generate reflector<->GPU API Key ---
echo ""
echo "Generating API key for GPU services..."
API_KEY=$(openssl rand -hex 32)

# --- Create Modal Secrets ---
echo "Creating Modal secrets..."

# Create or update hf_token secret (delete first if exists)
if modal secret list 2>/dev/null | grep -q "hf_token"; then
    echo "  -> Recreating secret: hf_token"
    modal secret delete hf_token --yes 2>/dev/null || true
fi
echo "  -> Creating secret: hf_token"
modal secret create hf_token HF_TOKEN="$HF_TOKEN"

# Create or update reflector-gpu secret (delete first if exists)
if modal secret list 2>/dev/null | grep -q "reflector-gpu"; then
    echo "  -> Recreating secret: reflector-gpu"
    modal secret delete reflector-gpu --yes 2>/dev/null || true
fi
echo "  -> Creating secret: reflector-gpu"
modal secret create reflector-gpu REFLECTOR_GPU_APIKEY="$API_KEY"

# --- Deploy Functions ---
echo ""
echo "Deploying transcriber (Whisper)..."
TRANSCRIBER_URL=$(modal deploy reflector_transcriber.py 2>&1 | grep -o 'https://[^ ]*web.modal.run' | head -1)
if [ -z "$TRANSCRIBER_URL" ]; then
    echo "Error: Failed to deploy transcriber. Check Modal dashboard for details."
    exit 1
fi
echo "  -> $TRANSCRIBER_URL"

echo ""
echo "Deploying diarizer (Pyannote)..."
DIARIZER_URL=$(modal deploy reflector_diarizer.py 2>&1 | grep -o 'https://[^ ]*web.modal.run' | head -1)
if [ -z "$DIARIZER_URL" ]; then
    echo "Error: Failed to deploy diarizer. Check Modal dashboard for details."
    exit 1
fi
echo "  -> $DIARIZER_URL"

# --- Output Configuration ---
echo ""
echo "=========================================="
echo "Deployment complete!"
echo "=========================================="
echo ""
echo "Copy these values to your server's server/.env file:"
echo ""
echo "# --- Modal GPU Configuration ---"
echo "TRANSCRIPT_BACKEND=modal"
echo "TRANSCRIPT_URL=$TRANSCRIBER_URL"
echo "TRANSCRIPT_MODAL_API_KEY=$API_KEY"
echo ""
echo "DIARIZATION_BACKEND=modal"
echo "DIARIZATION_URL=$DIARIZER_URL"
echo "DIARIZATION_MODAL_API_KEY=$API_KEY"
echo "# --- End Modal Configuration ---"
gpu/modal_deployments/reflector_diarizer.py

@@ -24,6 +24,12 @@ app = modal.App(name="reflector-diarizer")
 upload_volume = modal.Volume.from_name("diarizer-uploads", create_if_missing=True)
 
+# IMPORTANT: This function is duplicated in multiple files for deployment isolation.
+# If you modify the audio format detection logic, you MUST update all copies:
+# - gpu/self_hosted/app/utils.py
+# - gpu/modal_deployments/reflector_transcriber.py (2 copies)
+# - gpu/modal_deployments/reflector_transcriber_parakeet.py
+# - gpu/modal_deployments/reflector_diarizer.py (this file)
 def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtension:
     parsed_url = urlparse(url)
     url_path = parsed_url.path
@@ -39,6 +45,8 @@ def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtens
         return AudioFileExtension("wav")
     if "audio/mp4" in content_type:
         return AudioFileExtension("mp4")
+    if "audio/webm" in content_type or "video/webm" in content_type:
+        return AudioFileExtension("webm")
     raise ValueError(
         f"Unsupported audio format for URL: {url}. "
@@ -105,7 +113,7 @@ def download_pyannote_audio():
 
 diarizer_image = (
-    modal.Image.debian_slim(python_version="3.10.8")
+    modal.Image.debian_slim(python_version="3.10")
     .pip_install(
         "pyannote.audio==3.1.0",
         "requests",
@@ -116,7 +124,7 @@ diarizer_image = (
         "transformers==4.34.0",
         "sentencepiece",
         "protobuf",
-        "numpy",
+        "numpy<2",
         "huggingface_hub",
         "hf-transfer",
     )

gpu/modal_deployments/reflector_transcriber.py

@@ -89,6 +89,7 @@ image = (
         "torch==2.5.1",
         "faster-whisper==1.1.1",
         "fastapi==0.115.12",
+        "python-multipart",
         "requests",
         "librosa==0.10.1",
         "numpy<2",
@@ -98,6 +99,12 @@ image = (
 )
 
+# IMPORTANT: This function is duplicated in multiple files for deployment isolation.
+# If you modify the audio format detection logic, you MUST update all copies:
+# - gpu/self_hosted/app/utils.py
+# - gpu/modal_deployments/reflector_transcriber.py (this file - 2 copies!)
+# - gpu/modal_deployments/reflector_transcriber_parakeet.py
+# - gpu/modal_deployments/reflector_diarizer.py
 def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtension:
     parsed_url = urlparse(url)
     url_path = parsed_url.path
@@ -113,6 +120,8 @@ def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtens
         return AudioFileExtension("wav")
     if "audio/mp4" in content_type:
         return AudioFileExtension("mp4")
+    if "audio/webm" in content_type or "video/webm" in content_type:
+        return AudioFileExtension("webm")
     raise ValueError(
         f"Unsupported audio format for URL: {url}. "
@@ -315,6 +324,11 @@ class TranscriberWhisperFile:
         import numpy as np
         from silero_vad import VADIterator
 
+        # IMPORTANT: This VAD segment logic is duplicated in multiple files for deployment isolation.
+        # If you modify this function, you MUST update all copies:
+        # - gpu/modal_deployments/reflector_transcriber.py (this file)
+        # - gpu/modal_deployments/reflector_transcriber_parakeet.py
+        # - gpu/self_hosted/app/services/transcriber.py
         def vad_segments(
             audio_array,
             sample_rate: int = SAMPLERATE,
@@ -322,6 +336,7 @@
         ) -> Generator[TimeSegment, None, None]:
             """Generate speech segments as TimeSegment using Silero VAD."""
             iterator = VADIterator(self.vad_model, sampling_rate=sample_rate)
+            audio_duration = len(audio_array) / float(SAMPLERATE)
             start = None
             for i in range(0, len(audio_array), window_size):
                 chunk = audio_array[i : i + window_size]
@@ -341,6 +356,9 @@
                         start / float(SAMPLERATE), end / float(SAMPLERATE)
                     )
                     start = None
+            # Handle case where audio ends while speech is still active
+            if start is not None:
+                yield TimeSegment(start / float(SAMPLERATE), audio_duration)
 
             iterator.reset_states()
 
         upload_volume.reload()
@@ -406,6 +424,12 @@ class TranscriberWhisperFile:
         return {"text": " ".join(all_text), "words": all_words}
 
+# IMPORTANT: This function is duplicated in multiple files for deployment isolation.
+# If you modify the audio format detection logic, you MUST update all copies:
+# - gpu/self_hosted/app/utils.py
+# - gpu/modal_deployments/reflector_transcriber.py (this file - 2 copies!)
+# - gpu/modal_deployments/reflector_transcriber_parakeet.py
+# - gpu/modal_deployments/reflector_diarizer.py
 def detect_audio_format(url: str, headers: dict) -> str:
     from urllib.parse import urlparse
 
@@ -423,6 +447,8 @@ def detect_audio_format(url: str, headers: dict) -> str:
         return "wav"
     if "audio/mp4" in content_type:
         return "mp4"
+    if "audio/webm" in content_type or "video/webm" in content_type:
+        return "webm"
     raise HTTPException(
         status_code=400,

gpu/modal_deployments/reflector_transcriber_parakeet.py

@@ -90,6 +90,12 @@ image = (
 )
 
+# IMPORTANT: This function is duplicated in multiple files for deployment isolation.
+# If you modify the audio format detection logic, you MUST update all copies:
+# - gpu/self_hosted/app/utils.py
+# - gpu/modal_deployments/reflector_transcriber.py (2 copies)
+# - gpu/modal_deployments/reflector_transcriber_parakeet.py (this file)
+# - gpu/modal_deployments/reflector_diarizer.py
 def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtension:
     parsed_url = urlparse(url)
     url_path = parsed_url.path
@@ -105,6 +111,8 @@ def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtens
         return AudioFileExtension("wav")
     if "audio/mp4" in content_type:
         return AudioFileExtension("mp4")
+    if "audio/webm" in content_type or "video/webm" in content_type:
+        return AudioFileExtension("webm")
     raise ValueError(
         f"Unsupported audio format for URL: {url}. "
@@ -301,6 +309,11 @@ class TranscriberParakeetFile:
             audio_array, sample_rate = librosa.load(file_path, sr=SAMPLERATE, mono=True)
             return audio_array
 
+        # IMPORTANT: This VAD segment logic is duplicated in multiple files for deployment isolation.
+        # If you modify this function, you MUST update all copies:
+        # - gpu/modal_deployments/reflector_transcriber.py
+        # - gpu/modal_deployments/reflector_transcriber_parakeet.py (this file)
+        # - gpu/self_hosted/app/services/transcriber.py
         def vad_segment_generator(
             audio_array,
         ) -> Generator[TimeSegment, None, None]:


@@ -103,7 +103,7 @@ def configure_seamless_m4t():
 
 transcriber_image = (
-    Image.debian_slim(python_version="3.10.8")
+    Image.debian_slim(python_version="3.10")
     .apt_install("git")
     .apt_install("wget")
     .apt_install("libsndfile-dev")
@@ -119,6 +119,7 @@ transcriber_image = (
         "fairseq2",
         "pyyaml",
         "hf-transfer~=0.1",
+        "pydantic",
     )
     .run_function(install_seamless_communication)
     .run_function(download_seamlessm4t_model)


@@ -0,0 +1,137 @@
# Local Development GPU Setup

Run transcription and diarization locally for development/testing.

> **For production deployment**, see the [Self-Hosted GPU Setup Guide](../../docs/docs/installation/self-hosted-gpu-setup.md).

## Prerequisites

1. **Python 3.12+** and the **uv** package manager
2. **FFmpeg** installed and on PATH
3. **HuggingFace account** with access to the pyannote models

### Accept Pyannote Licenses (Required)

Before the first run, accept the licenses for these gated models while logged in to HuggingFace (a quick token check follows the list):

- https://hf.co/pyannote/speaker-diarization-3.1
- https://hf.co/pyannote/segmentation-3.0
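
To confirm your token actually has access to both gated repos before starting the service, you can query HuggingFace's public model API. This check is a suggestion of ours rather than part of the repo, and it assumes `HF_TOKEN` is exported in the current shell:

```bash
# Should print OK for both models once the licenses are accepted.
for model in pyannote/speaker-diarization-3.1 pyannote/segmentation-3.0; do
  if curl -sf -H "Authorization: Bearer $HF_TOKEN" \
      "https://huggingface.co/api/models/$model" > /dev/null; then
    echo "$model: OK"
  else
    echo "$model: no access (accept the license and check HF_TOKEN)"
  fi
done
```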
## Quick Start

### 1. Install dependencies

```bash
cd gpu/self_hosted
uv sync
```

### 2. Start the GPU service

```bash
cd gpu/self_hosted
HF_TOKEN=<your-huggingface-token> \
REFLECTOR_GPU_APIKEY=dev-key-12345 \
.venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000
```

Note: the `.env` file is NOT auto-loaded. Pass the env vars explicitly as above, or export them first:

```bash
export HF_TOKEN=<your-token>
export REFLECTOR_GPU_APIKEY=dev-key-12345
.venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000
```

### 3. Configure Reflector to use the local GPU service

Edit `server/.env`:

```bash
# Transcription - local GPU service
TRANSCRIPT_BACKEND=modal
TRANSCRIPT_URL=http://host.docker.internal:8000
TRANSCRIPT_MODAL_API_KEY=dev-key-12345

# Diarization - local GPU service
DIARIZATION_BACKEND=modal
DIARIZATION_URL=http://host.docker.internal:8000
DIARIZATION_MODAL_API_KEY=dev-key-12345
```

Note: use `host.docker.internal` because the Reflector server runs in Docker and must reach the service on the host.
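
On Linux, the Docker engine does not provide `host.docker.internal` by default (Docker Desktop on macOS/Windows does). A common fix is mapping it to the host gateway in the server's compose file; this is a sketch under that assumption, with the `server` service name taken from the restart command in the next step:

```yaml
# server/docker-compose.yml (Linux only): make host.docker.internal resolvable
services:
  server:
    extra_hosts:
      - "host.docker.internal:host-gateway"
```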
### 4. Restart Reflector server

```bash
cd server
docker compose restart server worker
```

## Testing

### Test transcription

```bash
curl -s -X POST http://localhost:8000/v1/audio/transcriptions \
  -H "Authorization: Bearer dev-key-12345" \
  -F "file=@/path/to/audio.wav" \
  -F "language=en"
```
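
A successful call returns JSON whose `text` and `words` fields mirror what the transcriber code above returns. With `jq` installed (optional, purely for readability) you can pull out a summary:

```bash
curl -s -X POST http://localhost:8000/v1/audio/transcriptions \
  -H "Authorization: Bearer dev-key-12345" \
  -F "file=@/path/to/audio.wav" \
  -F "language=en" | jq '{text: .text, first_words: .words[:3]}'
```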
### Test diarization

```bash
curl -s -X POST "http://localhost:8000/diarize?audio_file_url=<audio-url>" \
  -H "Authorization: Bearer dev-key-12345"
```

## Platform Notes

### macOS (ARM)

The Docker build fails because the CUDA packages are x86_64-only. Use a local Python environment instead:

```bash
uv sync
HF_TOKEN=xxx REFLECTOR_GPU_APIKEY=xxx .venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000
```

### Linux with NVIDIA GPU

Docker works with CUDA acceleration:

```bash
docker compose up -d
```

### CPU-only

Works on any platform, just slower. PyTorch auto-detects the hardware and falls back to CPU.
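
To see which device PyTorch will actually pick on your machine, here is a quick optional probe (it assumes the virtualenv from step 1):

```bash
cd gpu/self_hosted
.venv/bin/python -c "import torch; print('cuda' if torch.cuda.is_available() else 'cpu')"
```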
## Switching Back to Modal.com

Edit `server/.env`:

```bash
TRANSCRIPT_BACKEND=modal
TRANSCRIPT_URL=https://monadical-sas--reflector-transcriber-parakeet-web.modal.run
TRANSCRIPT_MODAL_API_KEY=<modal-api-key>

DIARIZATION_BACKEND=modal
DIARIZATION_URL=https://monadical-sas--reflector-diarizer-web.modal.run
DIARIZATION_MODAL_API_KEY=<modal-api-key>
```

## Troubleshooting

### "Could not download pyannote pipeline"

- Accept the model licenses at HuggingFace (see Prerequisites)
- Verify `HF_TOKEN` is set and valid

### Service won't start

- Check that port 8000 is free: `lsof -i :8000`
- Kill orphan processes if needed

### Transcription returns empty text

- Ensure the audio contains speech (not just tones/silence)
- Check that the audio format is supported (wav, mp3, etc.)

### Deprecation warnings from torchaudio/pyannote

- Safe to ignore; they don't affect functionality (an optional filter is shown below)
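
If you want a quieter console during development, Python's standard `PYTHONWARNINGS` filter can hide them at launch. This is an optional, blunt filter of our own, not project configuration (most of these warnings are `UserWarning`s; adjust the category if yours differ):

```bash
PYTHONWARNINGS="ignore::UserWarning" \
HF_TOKEN=<your-token> REFLECTOR_GPU_APIKEY=dev-key-12345 \
.venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000
```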


@@ -56,9 +56,13 @@ Docker
 - Not yet provided in this directory. A Dockerfile will be added later. For now, use Local run above
 
-Conformance tests
+# Setup
 
-# From this directory
+[SETUP.md](SETUP.md)
+
+# Conformance tests
+
+## From this directory
 
 TRANSCRIPT_URL=http://localhost:8000 \
 TRANSCRIPT_API_KEY=dev-key \

gpu/self_hosted/app/services/transcriber.py

@@ -129,6 +129,11 @@ class WhisperService:
             audio = np.frombuffer(proc.stdout, dtype=np.float32)
             return audio
 
+        # IMPORTANT: This VAD segment logic is duplicated in multiple files for deployment isolation.
+        # If you modify this function, you MUST update all copies:
+        # - gpu/modal_deployments/reflector_transcriber.py
+        # - gpu/modal_deployments/reflector_transcriber_parakeet.py
+        # - gpu/self_hosted/app/services/transcriber.py (this file)
         def vad_segments(
             audio_array,
             sample_rate: int = SAMPLE_RATE,
@@ -153,6 +158,10 @@
                 end = speech["end"]
                 yield (start / float(SAMPLE_RATE), end / float(SAMPLE_RATE))
                 start = None
+            # Handle case where audio ends while speech is still active
+            if start is not None:
+                audio_duration = len(audio_array) / float(sample_rate)
+                yield (start / float(SAMPLE_RATE), audio_duration)
 
             iterator.reset_states()
 
         audio_array = load_audio_via_ffmpeg(file_path, SAMPLE_RATE)

gpu/self_hosted/app/utils.py

@@ -34,6 +34,12 @@ def ensure_dirs():
     UPLOADS_PATH.mkdir(parents=True, exist_ok=True)
 
+# IMPORTANT: This function is duplicated in multiple files for deployment isolation.
+# If you modify the audio format detection logic, you MUST update all copies:
+# - gpu/self_hosted/app/utils.py (this file)
+# - gpu/modal_deployments/reflector_transcriber.py (2 copies)
+# - gpu/modal_deployments/reflector_transcriber_parakeet.py
+# - gpu/modal_deployments/reflector_diarizer.py
 def detect_audio_format(url: str, headers: Mapping[str, str]) -> str:
     url_path = urlparse(url).path
     for ext in SUPPORTED_FILE_EXTENSIONS:
@@ -47,6 +53,8 @@ def detect_audio_format(url: str, headers: Mapping[str, str]) -> str:
         return "wav"
     if "audio/mp4" in content_type:
         return "mp4"
+    if "audio/webm" in content_type or "video/webm" in content_type:
+        return "webm"
     raise HTTPException(
         status_code=400,


@@ -8,3 +8,11 @@ services:
       - .env
     volumes:
       - ./cache:/root/.cache
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    restart: unless-stopped
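
Once the stack is up, a quick way to verify that the container actually sees the GPU is to run `nvidia-smi` inside it. The service name below is a placeholder (use the one defined in this compose file), and the host needs the NVIDIA container toolkit installed:

```bash
# Lists the GPUs visible inside the container; fails if the reservation
# (or the NVIDIA container toolkit) is missing.
docker compose exec <service-name> nvidia-smi
```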