diff --git a/.gitleaksignore b/.gitleaksignore index a00500e1..8eb80bd5 100644 --- a/.gitleaksignore +++ b/.gitleaksignore @@ -1,3 +1,5 @@ b9d891d3424f371642cb032ecfd0e2564470a72c:server/tests/test_transcripts_recording_deletion.py:generic-api-key:15 docs/docs/installation/auth-setup.md:curl-auth-header:250 docs/docs/installation/daily-setup.md:curl-auth-header:277 +gpu/self_hosted/DEV_SETUP.md:curl-auth-header:74 +gpu/self_hosted/DEV_SETUP.md:curl-auth-header:83 diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index b1f968ed..45ff4402 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -2,7 +2,7 @@ # Usage: docker compose -f docker-compose.prod.yml up -d # # Prerequisites: -# 1. Copy env.example to .env and configure for both server/ and www/ +# 1. Copy .env.example to .env and configure for both server/ and www/ # 2. Copy Caddyfile.example to Caddyfile and edit with your domains # 3. Deploy Modal GPU functions (see gpu/modal_deployments/deploy-all.sh) diff --git a/docs/TODO.md b/docs/TODO.md index 1c03f6eb..5498a767 100644 --- a/docs/TODO.md +++ b/docs/TODO.md @@ -84,6 +84,7 @@ Please provide specific values for: - Exact UDP port range used (e.g., 10000-20000) - STUN server configuration (if any) - ICE candidate gathering timeout + - https://docs.daily.co/guides/privacy-and-security/corporate-firewalls-nats-allowed-ip-list - [ ] **Worker Configuration** - Default Celery worker count diff --git a/docs/docs/installation/overview.md b/docs/docs/installation/overview.md index 0f1a2e92..6813e989 100644 --- a/docs/docs/installation/overview.md +++ b/docs/docs/installation/overview.md @@ -9,11 +9,17 @@ This guide walks you through deploying Reflector from scratch. Follow these step ## What You'll Set Up -``` -User --> Caddy (auto-SSL) --> Frontend (Next.js) - --> Backend (FastAPI) --> PostgreSQL - --> Redis - --> Celery Workers --> Modal.com GPU +```mermaid +flowchart LR + User --> Caddy["Caddy (auto-SSL)"] + Caddy --> Frontend["Frontend (Next.js)"] + Caddy --> Backend["Backend (FastAPI)"] + Backend --> PostgreSQL + Backend --> Redis + Backend --> Workers["Celery Workers"] + Workers --> PostgreSQL + Workers --> Redis + Workers --> GPU["GPU Processing
(Modal.com OR Self-hosted)"] ``` ## Prerequisites @@ -22,7 +28,9 @@ Before starting, you need: - [ ] **Production server** - Ubuntu 22.04+, 4+ cores, 8GB+ RAM, public IP - [ ] **Two domain names** - e.g., `app.example.com` (frontend) and `api.example.com` (backend) -- [ ] **Modal.com account** - Free tier at https://modal.com +- [ ] **GPU processing** - Choose one: + - Modal.com account (free tier at https://modal.com), OR + - GPU server with NVIDIA GPU (8GB+ VRAM) - [ ] **HuggingFace account** - Free at https://huggingface.co - [ ] **OpenAI API key** - For summaries and topic detection at https://platform.openai.com/account/api-keys @@ -52,13 +60,24 @@ dig api.example.com +short --- -## Step 2: Deploy Modal GPU Functions +## Step 2: Deploy GPU Processing + +Reflector requires GPU processing for transcription (Whisper) and speaker diarization (Pyannote). Choose one option: + +| | **Modal.com (Cloud)** | **Self-Hosted GPU** | +|---|---|---| +| **Best for** | No GPU hardware, zero maintenance | Own GPU server, full control | +| **Pricing** | Pay-per-use (~$0.01-0.10/min audio) | Fixed infrastructure cost | +| **Setup** | Run from laptop (browser auth) | Run on GPU server | +| **Scaling** | Automatic | Manual | + +### Option A: Modal.com (Serverless Cloud GPU) **Location: YOUR LOCAL COMPUTER (laptop/desktop)** Modal requires browser authentication, so this runs locally - not on your server. -### Accept HuggingFace Licenses +#### Accept HuggingFace Licenses Visit both pages and click "Accept": - https://huggingface.co/pyannote/speaker-diarization-3.1 @@ -66,7 +85,7 @@ Visit both pages and click "Accept": Then generate a token at https://huggingface.co/settings/tokens -### Deploy to Modal +#### Deploy to Modal ```bash pip install modal @@ -77,10 +96,26 @@ cd reflector/gpu/modal_deployments ./deploy-all.sh --hf-token YOUR_HUGGINGFACE_TOKEN ``` -**Save the output** - copy the configuration block, you'll need it for Step 5. +**Save the output** - copy the configuration block, you'll need it for Step 4. See [Modal Setup](./modal-setup) for troubleshooting and details. +### Option B: Self-Hosted GPU + +**Location: YOUR GPU SERVER** + +Requires: NVIDIA GPU with 8GB+ VRAM, Ubuntu 22.04+, 40-50GB disk (Docker) or 25-30GB (Systemd). + +See [Self-Hosted GPU Setup](./self-hosted-gpu-setup) for complete instructions. Quick summary: + +1. Install NVIDIA drivers and Docker (or uv for systemd) +2. Clone repository: `git clone https://github.com/monadical-sas/reflector.git` +3. Configure `.env` with HuggingFace token +4. Start service (Docker compose or systemd) +5. Set up Caddy reverse proxy for HTTPS + +**Save your API key and HTTPS URL** - you'll need them for Step 4. + --- ## Step 3: Prepare Server @@ -102,12 +137,9 @@ ssh user@your-server-ip docker --version # verify ``` -### Open Firewall +### Firewall -```bash -sudo ufw allow 80/tcp -sudo ufw allow 443/tcp -``` +Ensure ports 80 (HTTP) and 443 (HTTPS) are open for inbound traffic. The method varies by cloud provider and OS configuration. 
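For example, on a host where `ufw` manages the firewall directly (rather than provider-level security groups), a minimal sketch:

```bash
# Allow inbound HTTP and HTTPS, then confirm the rules
sudo ufw allow 80/tcp
sudo ufw allow 443/tcp
sudo ufw status
```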
### Clone Repository @@ -120,7 +152,7 @@ cd reflector ## Step 4: Configure Environment -**Location: YOUR SERVER (via SSH)** +**Location: YOUR SERVER (via SSH, in the `reflector` directory)** Reflector has two env files: - `server/.env` - Backend configuration @@ -129,7 +161,7 @@ Reflector has two env files: ### Backend Configuration ```bash -cp server/env.example server/.env +cp server/.env.example server/.env nano server/.env ``` @@ -151,15 +183,24 @@ CORS_ALLOW_CREDENTIALS=true # Secret key - generate with: openssl rand -hex 32 SECRET_KEY= -# Modal GPU (paste from deploy-all.sh output) +# GPU Processing - choose ONE option from Step 2: + +# Option A: Modal.com (paste from deploy-all.sh output) TRANSCRIPT_BACKEND=modal TRANSCRIPT_URL=https://yourname--reflector-transcriber-web.modal.run TRANSCRIPT_MODAL_API_KEY= - DIARIZATION_BACKEND=modal DIARIZATION_URL=https://yourname--reflector-diarizer-web.modal.run DIARIZATION_MODAL_API_KEY= +# Option B: Self-hosted GPU (use your GPU server URL and API key) +# TRANSCRIPT_BACKEND=modal +# TRANSCRIPT_URL=https://gpu.example.com +# TRANSCRIPT_MODAL_API_KEY= +# DIARIZATION_BACKEND=modal +# DIARIZATION_URL=https://gpu.example.com +# DIARIZATION_MODAL_API_KEY= + # Storage - where to store audio files and transcripts TRANSCRIPT_STORAGE_BACKEND=local @@ -205,7 +246,8 @@ cp Caddyfile.example Caddyfile nano Caddyfile ``` -Replace `example.com` with your domains: +Replace `example.com` with your domains. The `{$VAR:default}` syntax uses Caddy's env var substitution - you can either edit the file directly or set `FRONTEND_DOMAIN` and `API_DOMAIN` environment variables. + ``` {$FRONTEND_DOMAIN:app.example.com} { reverse_proxy web:3000 @@ -226,9 +268,13 @@ Replace `example.com` with your domains: docker compose -f docker-compose.prod.yml up -d ``` -Wait for containers to start (~30 seconds), then run migrations: +Wait for PostgreSQL to be ready, then run migrations: ```bash +# Wait for postgres to be healthy (may take 30-60 seconds on first run) +docker compose -f docker-compose.prod.yml exec postgres pg_isready -U reflector + +# Run database migrations docker compose -f docker-compose.prod.yml exec server uv run alembic upgrade head ``` @@ -332,6 +378,7 @@ docker compose -f docker-compose.prod.yml logs ## Next Steps -- [Modal Setup](./modal-setup) - GPU processing details +- [Modal Setup](./modal-setup) - Cloud GPU processing details +- [Self-Hosted GPU Setup](./self-hosted-gpu-setup) - Own GPU server deployment - [Authentication Setup](./auth-setup) - Authentik OAuth - [System Requirements](./requirements) - Hardware specs diff --git a/docs/docs/installation/self-hosted-gpu-setup.md b/docs/docs/installation/self-hosted-gpu-setup.md new file mode 100644 index 00000000..288685e4 --- /dev/null +++ b/docs/docs/installation/self-hosted-gpu-setup.md @@ -0,0 +1,514 @@ +--- +sidebar_position: 5 +title: Self-Hosted GPU Setup +--- + +# Self-Hosted GPU Setup + +This guide covers deploying Reflector's GPU processing on your own server instead of Modal.com. For the complete deployment guide, see [Deployment Guide](./overview). 
+ +## When to Use Self-Hosted GPU + +**Choose self-hosted GPU if you:** +- Have GPU hardware available (NVIDIA required) +- Want full control over processing +- Prefer fixed infrastructure costs over pay-per-use +- Have privacy or data locality requirements +- Need to process audio without external API calls + +**Choose Modal.com instead if you:** +- Don't have GPU hardware +- Want zero infrastructure management +- Prefer pay-per-use pricing +- Need instant scaling for variable workloads + +See [Modal.com Setup](./modal-setup) for cloud GPU deployment. + +## What Gets Deployed + +The self-hosted GPU service provides the same API endpoints as Modal: +- `POST /v1/audio/transcriptions` - Whisper transcription +- `POST /diarize` - Pyannote speaker diarization + +Your main Reflector server connects to this service exactly like it connects to Modal - only the URL changes. + +## Prerequisites + +### Hardware +- **GPU**: NVIDIA GPU with 8GB+ VRAM (tested on Tesla T4 with 15GB) +- **CPU**: 4+ cores recommended +- **RAM**: 8GB minimum, 16GB recommended +- **Disk**: + - Docker method: 40-50GB minimum + - Systemd method: 25-30GB minimum + +### Software +- Ubuntu 22.04 or 24.04 +- Public IP address +- Domain name with DNS A record pointing to server + +### Accounts +- **HuggingFace account** with accepted Pyannote licenses: + - https://huggingface.co/pyannote/speaker-diarization-3.1 + - https://huggingface.co/pyannote/segmentation-3.0 +- **HuggingFace access token** from https://huggingface.co/settings/tokens + +## Choose Deployment Method + +### Docker Deployment (Recommended) + +**Pros:** +- Container isolation and reproducibility +- No manual library path configuration +- Easier to replicate across servers +- Built-in restart policies +- Simpler dependency management + +**Cons:** +- Higher disk usage (~15GB for container) +- Requires 40-50GB disk minimum + +**Best for:** Teams wanting reproducible deployments, multiple GPU servers + +### Systemd Deployment + +**Pros:** +- Lower disk usage (~8GB total) +- Direct GPU access (no container layer) +- Works on smaller disks (25-30GB) + +**Cons:** +- Manual `LD_LIBRARY_PATH` configuration +- Less portable across systems + +**Best for:** Single GPU server, limited disk space + +--- + +## Docker Deployment + +### Step 1: Install NVIDIA Driver + +```bash +sudo apt update +sudo apt install -y nvidia-driver-535 + +# Load kernel modules +sudo modprobe nvidia + +# Verify installation +nvidia-smi +``` + +Expected output: GPU details with driver version and CUDA version. 
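For a machine-readable check (handy in provisioning scripts), `nvidia-smi` can print just the fields of interest, for example:

```bash
# Print GPU name, driver version, and total VRAM as CSV
nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader
```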
+ +### Step 2: Install Docker + +```bash +curl -fsSL https://get.docker.com | sudo sh +sudo usermod -aG docker $USER + +# Log out and back in for group changes +exit +# SSH back in +``` + +### Step 3: Install NVIDIA Container Toolkit + +```bash +# Add NVIDIA repository +curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \ + sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + +curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list + +# Install toolkit +sudo apt-get update +sudo apt-get install -y nvidia-container-toolkit + +# Configure Docker runtime +sudo nvidia-ctk runtime configure --runtime=docker +sudo systemctl restart docker +``` + +### Step 4: Clone Repository and Configure + +```bash +git clone https://github.com/monadical-sas/reflector.git +cd reflector/gpu/self_hosted + +# Create environment file +cat > .env << EOF +REFLECTOR_GPU_APIKEY=$(openssl rand -hex 16) +HF_TOKEN=your_huggingface_token_here +EOF + +# Note the generated API key - you'll need it for main server config +cat .env +``` + +### Step 5: Create Docker Compose File + +```bash +cat > compose.yml << 'EOF' +services: + reflector_gpu: + build: + context: . + ports: + - "8000:8000" + env_file: + - .env + volumes: + - ./cache:/root/.cache + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + restart: unless-stopped +EOF +``` + +### Step 6: Build and Start + +```bash +# Build image (takes ~5 minutes, downloads ~10GB) +sudo docker compose build + +# Start service +sudo docker compose up -d + +# Wait for startup and verify +sleep 30 +sudo docker compose logs +``` + +Look for: `INFO: Application startup complete. Uvicorn running on http://0.0.0.0:8000` + +### Step 7: Verify GPU Access + +```bash +# Check GPU is accessible from container +sudo docker exec $(sudo docker ps -q) nvidia-smi +``` + +Should show GPU with ~3GB VRAM used (models loaded). 
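You can also confirm the API is responding on the local port before setting up HTTPS. A quick sketch - the key is the `REFLECTOR_GPU_APIKEY` you generated in Step 4, and any short WAV file will do:

```bash
# FastAPI docs should be reachable without authentication
curl -I http://localhost:8000/docs

# Transcription request against the local service
curl -s -X POST http://localhost:8000/v1/audio/transcriptions \
  -H "Authorization: Bearer YOUR_REFLECTOR_GPU_APIKEY" \
  -F "file=@/path/to/audio.wav" \
  -F "language=en"
```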
+ +--- + +## Systemd Deployment + +### Step 1: Install NVIDIA Driver + +```bash +sudo apt update +sudo apt install -y nvidia-driver-535 + +# Load kernel modules +sudo modprobe nvidia + +# Verify installation +nvidia-smi +``` + +### Step 2: Install Dependencies + +```bash +# Install ffmpeg +sudo apt install -y ffmpeg + +# Install uv package manager +curl -LsSf https://astral.sh/uv/install.sh | sh +source ~/.local/bin/env + +# Clone repository +git clone https://github.com/monadical-sas/reflector.git +cd reflector/gpu/self_hosted +``` + +### Step 3: Configure Environment + +```bash +# Create environment file +cat > .env << EOF +REFLECTOR_GPU_APIKEY=$(openssl rand -hex 16) +HF_TOKEN=your_huggingface_token_here +EOF + +# Note the generated API key +cat .env +``` + +### Step 4: Install Python Packages + +```bash +# Install dependencies (~3GB download) +uv sync +``` + +### Step 5: Create Systemd Service + +```bash +# Generate library paths for NVIDIA packages +export NVIDIA_LIBS=$(find ~/reflector/gpu/self_hosted/.venv/lib/python3.12/site-packages/nvidia -name lib -type d | tr '\n' ':') + +# Load environment variables +source ~/reflector/gpu/self_hosted/.env + +# Create service file +sudo tee /etc/systemd/system/reflector-gpu.service << EOFSVC +[Unit] +Description=Reflector GPU Service (Transcription & Diarization) +After=network.target + +[Service] +Type=simple +User=$USER +WorkingDirectory=$HOME/reflector/gpu/self_hosted +Environment="PATH=$HOME/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" +Environment="HF_TOKEN=${HF_TOKEN}" +Environment="REFLECTOR_GPU_APIKEY=${REFLECTOR_GPU_APIKEY}" +Environment="LD_LIBRARY_PATH=${NVIDIA_LIBS}" +ExecStart=$HOME/reflector/gpu/self_hosted/.venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000 +Restart=always +RestartSec=10 + +[Install] +WantedBy=multi-user.target +EOFSVC + +# Enable and start +sudo systemctl daemon-reload +sudo systemctl enable reflector-gpu +sudo systemctl start reflector-gpu +``` + +### Step 6: Verify Service + +```bash +# Check status +sudo systemctl status reflector-gpu + +# View logs +sudo journalctl -u reflector-gpu -f +``` + +Look for: `INFO: Application startup complete.` + +--- + +## Configure HTTPS with Caddy + +Both deployment methods need HTTPS for production. Caddy handles SSL automatically. + +### Install Caddy + +```bash +sudo apt install -y debian-keyring debian-archive-keyring apt-transport-https curl + +curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/gpg.key' | \ + sudo gpg --dearmor -o /usr/share/keyrings/caddy-stable-archive-keyring.gpg + +curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/debian.deb.txt' | \ + sudo tee /etc/apt/sources.list.d/caddy-stable.list + +sudo apt update +sudo apt install -y caddy +``` + +### Configure Reverse Proxy + +```bash +sudo tee /etc/caddy/Caddyfile << 'EOF' +gpu.example.com { + reverse_proxy localhost:8000 +} +EOF + +# Reload Caddy (auto-provisions SSL certificate) +sudo systemctl reload caddy +``` + +Replace `gpu.example.com` with your domain. 
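Optionally, to catch syntax mistakes before relying on the reload, Caddy can validate the config without applying it:

```bash
sudo caddy validate --config /etc/caddy/Caddyfile
```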
+ +### Verify HTTPS + +```bash +curl -I https://gpu.example.com/docs +# Should return HTTP/2 200 +``` + +--- + +## Configure Main Reflector Server + +On your main Reflector server, update `server/.env`: + +```env +# GPU Processing - Self-hosted +TRANSCRIPT_BACKEND=modal +TRANSCRIPT_URL=https://gpu.example.com +TRANSCRIPT_MODAL_API_KEY= + +DIARIZATION_BACKEND=modal +DIARIZATION_URL=https://gpu.example.com +DIARIZATION_MODAL_API_KEY= +``` + +**Note:** The backend type is `modal` because the self-hosted GPU service implements the same API contract as Modal.com. This allows you to switch between cloud and self-hosted GPU processing by only changing the URL and API key. + +Restart services to apply: + +```bash +docker compose -f docker-compose.prod.yml restart server worker +``` + +--- + +## Service Management + +All commands in this section assume you're in `~/reflector/gpu/self_hosted/`. + +### Docker + +```bash +# View logs +sudo docker compose logs -f + +# Restart service +sudo docker compose restart + +# Stop service +sudo docker compose down + +# Check status +sudo docker compose ps +``` + +### Systemd + +```bash +# View logs +sudo journalctl -u reflector-gpu -f + +# Restart service +sudo systemctl restart reflector-gpu + +# Stop service +sudo systemctl stop reflector-gpu + +# Check status +sudo systemctl status reflector-gpu +``` + +### Monitor GPU + +```bash +# Check GPU usage +nvidia-smi + +# Watch in real-time +watch -n 1 nvidia-smi +``` + +**Typical GPU memory usage:** +- Idle (models loaded): ~3GB VRAM +- During transcription: ~4-5GB VRAM + +--- + +## Performance Notes + +**Tesla T4 benchmarks:** +- Transcription: ~2-3x real-time (10 min audio in 3-5 min) +- Diarization: ~1.5x real-time +- Max concurrent requests: 2-3 (depends on audio length) +- First request warmup: ~10 seconds (model loading) + +--- + +## Troubleshooting + +### nvidia-smi fails after driver install + +```bash +# Manually load kernel modules +sudo modprobe nvidia +nvidia-smi +``` + +### Service fails with "Could not download pyannote pipeline" + +1. Verify HF_TOKEN is valid: `echo $HF_TOKEN` +2. Check model access at https://huggingface.co/pyannote/speaker-diarization-3.1 +3. Regenerate service/compose with correct token +4. Restart service + +### cuDNN library loading errors (Systemd only) + +Symptom: `Unable to load libcudnn_cnn.so` + +Regenerate the systemd service file - the `LD_LIBRARY_PATH` must include all NVIDIA package directories. + +### Cannot connect to HTTPS endpoint + +1. Verify DNS resolves: `dig +short gpu.example.com` +2. Check firewall: `sudo ufw status` (ports 80, 443 must be open) +3. Check Caddy: `sudo systemctl status caddy` +4. View Caddy logs: `sudo journalctl -u caddy -n 50` + +### SSL certificate not provisioning + +Requirements for Let's Encrypt: +- Ports 80 and 443 publicly accessible +- DNS resolves to server's public IP +- Valid domain (not localhost or private IP) + +### Docker container won't start + +```bash +# Check logs +sudo docker compose logs + +# Common issues: +# - Port 8000 already in use +# - GPU not accessible (nvidia-ctk not configured) +# - Missing .env file +``` + +--- + +## Security Considerations + +1. **API Key**: Keep `REFLECTOR_GPU_APIKEY` secret, rotate periodically +2. **HuggingFace Token**: Treat as password, never commit to git +3. **Firewall**: Only expose ports 80 and 443 publicly +4. **Updates**: Regularly update system packages +5. 
**Monitoring**: Set up alerts for service failures + +--- + +## Updating + +### Docker + +```bash +cd ~/reflector/gpu/self_hosted +git pull +sudo docker compose build +sudo docker compose up -d +``` + +### Systemd + +```bash +cd ~/reflector/gpu/self_hosted +git pull +uv sync +sudo systemctl restart reflector-gpu +``` diff --git a/gpu/modal_deployments/reflector_diarizer.py b/gpu/modal_deployments/reflector_diarizer.py index 9bcc8a4e..816e17cf 100644 --- a/gpu/modal_deployments/reflector_diarizer.py +++ b/gpu/modal_deployments/reflector_diarizer.py @@ -24,6 +24,12 @@ app = modal.App(name="reflector-diarizer") upload_volume = modal.Volume.from_name("diarizer-uploads", create_if_missing=True) +# IMPORTANT: This function is duplicated in multiple files for deployment isolation. +# If you modify the audio format detection logic, you MUST update all copies: +# - gpu/self_hosted/app/utils.py +# - gpu/modal_deployments/reflector_transcriber.py (2 copies) +# - gpu/modal_deployments/reflector_transcriber_parakeet.py +# - gpu/modal_deployments/reflector_diarizer.py (this file) def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtension: parsed_url = urlparse(url) url_path = parsed_url.path @@ -39,6 +45,8 @@ def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtens return AudioFileExtension("wav") if "audio/mp4" in content_type: return AudioFileExtension("mp4") + if "audio/webm" in content_type or "video/webm" in content_type: + return AudioFileExtension("webm") raise ValueError( f"Unsupported audio format for URL: {url}. " diff --git a/gpu/modal_deployments/reflector_transcriber.py b/gpu/modal_deployments/reflector_transcriber.py index 3f652de9..fbd41083 100644 --- a/gpu/modal_deployments/reflector_transcriber.py +++ b/gpu/modal_deployments/reflector_transcriber.py @@ -99,6 +99,12 @@ image = ( ) +# IMPORTANT: This function is duplicated in multiple files for deployment isolation. +# If you modify the audio format detection logic, you MUST update all copies: +# - gpu/self_hosted/app/utils.py +# - gpu/modal_deployments/reflector_transcriber.py (this file - 2 copies!) +# - gpu/modal_deployments/reflector_transcriber_parakeet.py +# - gpu/modal_deployments/reflector_diarizer.py def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtension: parsed_url = urlparse(url) url_path = parsed_url.path @@ -114,6 +120,8 @@ def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtens return AudioFileExtension("wav") if "audio/mp4" in content_type: return AudioFileExtension("mp4") + if "audio/webm" in content_type or "video/webm" in content_type: + return AudioFileExtension("webm") raise ValueError( f"Unsupported audio format for URL: {url}. " @@ -316,6 +324,11 @@ class TranscriberWhisperFile: import numpy as np from silero_vad import VADIterator + # IMPORTANT: This VAD segment logic is duplicated in multiple files for deployment isolation. 
+ # If you modify this function, you MUST update all copies: + # - gpu/modal_deployments/reflector_transcriber.py (this file) + # - gpu/modal_deployments/reflector_transcriber_parakeet.py + # - gpu/self_hosted/app/services/transcriber.py def vad_segments( audio_array, sample_rate: int = SAMPLERATE, @@ -323,6 +336,7 @@ class TranscriberWhisperFile: ) -> Generator[TimeSegment, None, None]: """Generate speech segments as TimeSegment using Silero VAD.""" iterator = VADIterator(self.vad_model, sampling_rate=sample_rate) + audio_duration = len(audio_array) / float(SAMPLERATE) start = None for i in range(0, len(audio_array), window_size): chunk = audio_array[i : i + window_size] @@ -342,6 +356,9 @@ class TranscriberWhisperFile: start / float(SAMPLERATE), end / float(SAMPLERATE) ) start = None + # Handle case where audio ends while speech is still active + if start is not None: + yield TimeSegment(start / float(SAMPLERATE), audio_duration) iterator.reset_states() upload_volume.reload() @@ -407,6 +424,12 @@ class TranscriberWhisperFile: return {"text": " ".join(all_text), "words": all_words} +# IMPORTANT: This function is duplicated in multiple files for deployment isolation. +# If you modify the audio format detection logic, you MUST update all copies: +# - gpu/self_hosted/app/utils.py +# - gpu/modal_deployments/reflector_transcriber.py (this file - 2 copies!) +# - gpu/modal_deployments/reflector_transcriber_parakeet.py +# - gpu/modal_deployments/reflector_diarizer.py def detect_audio_format(url: str, headers: dict) -> str: from urllib.parse import urlparse @@ -424,6 +447,8 @@ def detect_audio_format(url: str, headers: dict) -> str: return "wav" if "audio/mp4" in content_type: return "mp4" + if "audio/webm" in content_type or "video/webm" in content_type: + return "webm" raise HTTPException( status_code=400, diff --git a/gpu/modal_deployments/reflector_transcriber_parakeet.py b/gpu/modal_deployments/reflector_transcriber_parakeet.py index 5f326b77..046f1a7c 100644 --- a/gpu/modal_deployments/reflector_transcriber_parakeet.py +++ b/gpu/modal_deployments/reflector_transcriber_parakeet.py @@ -90,6 +90,12 @@ image = ( ) +# IMPORTANT: This function is duplicated in multiple files for deployment isolation. +# If you modify the audio format detection logic, you MUST update all copies: +# - gpu/self_hosted/app/utils.py +# - gpu/modal_deployments/reflector_transcriber.py (2 copies) +# - gpu/modal_deployments/reflector_transcriber_parakeet.py (this file) +# - gpu/modal_deployments/reflector_diarizer.py def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtension: parsed_url = urlparse(url) url_path = parsed_url.path @@ -105,6 +111,8 @@ def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtens return AudioFileExtension("wav") if "audio/mp4" in content_type: return AudioFileExtension("mp4") + if "audio/webm" in content_type or "video/webm" in content_type: + return AudioFileExtension("webm") raise ValueError( f"Unsupported audio format for URL: {url}. " @@ -301,6 +309,11 @@ class TranscriberParakeetFile: audio_array, sample_rate = librosa.load(file_path, sr=SAMPLERATE, mono=True) return audio_array + # IMPORTANT: This VAD segment logic is duplicated in multiple files for deployment isolation. 
+ # If you modify this function, you MUST update all copies: + # - gpu/modal_deployments/reflector_transcriber.py + # - gpu/modal_deployments/reflector_transcriber_parakeet.py (this file) + # - gpu/self_hosted/app/services/transcriber.py def vad_segment_generator( audio_array, ) -> Generator[TimeSegment, None, None]: diff --git a/gpu/self_hosted/DEV_SETUP.md b/gpu/self_hosted/DEV_SETUP.md new file mode 100644 index 00000000..9b76865c --- /dev/null +++ b/gpu/self_hosted/DEV_SETUP.md @@ -0,0 +1,137 @@ +# Local Development GPU Setup + +Run transcription and diarization locally for development/testing. + +> **For production deployment**, see the [Self-Hosted GPU Setup Guide](../../docs/docs/installation/self-hosted-gpu-setup.md). + +## Prerequisites + +1. **Python 3.12+** and **uv** package manager +2. **FFmpeg** installed and on PATH +3. **HuggingFace account** with access to pyannote models + +### Accept Pyannote Licenses (Required) + +Before first run, accept licenses for these gated models (logged into HuggingFace): +- https://hf.co/pyannote/speaker-diarization-3.1 +- https://hf.co/pyannote/segmentation-3.0 + +## Quick Start + +### 1. Install dependencies + +```bash +cd gpu/self_hosted +uv sync +``` + +### 2. Start the GPU service + +```bash +cd gpu/self_hosted +HF_TOKEN= \ +REFLECTOR_GPU_APIKEY=dev-key-12345 \ +.venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000 +``` + +Note: The `.env` file is NOT auto-loaded. Pass env vars explicitly or use: +```bash +export HF_TOKEN= +export REFLECTOR_GPU_APIKEY=dev-key-12345 +.venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000 +``` + +### 3. Configure Reflector to use local GPU + +Edit `server/.env`: + +```bash +# Transcription - local GPU service +TRANSCRIPT_BACKEND=modal +TRANSCRIPT_URL=http://host.docker.internal:8000 +TRANSCRIPT_MODAL_API_KEY=dev-key-12345 + +# Diarization - local GPU service +DIARIZATION_BACKEND=modal +DIARIZATION_URL=http://host.docker.internal:8000 +DIARIZATION_MODAL_API_KEY=dev-key-12345 +``` + +Note: Use `host.docker.internal` because Reflector server runs in Docker. + +### 4. Restart Reflector server + +```bash +cd server +docker compose restart server worker +``` + +## Testing + +### Test transcription + +```bash +curl -s -X POST http://localhost:8000/v1/audio/transcriptions \ + -H "Authorization: Bearer dev-key-12345" \ + -F "file=@/path/to/audio.wav" \ + -F "language=en" +``` + +### Test diarization + +```bash +curl -s -X POST "http://localhost:8000/diarize?audio_file_url=" \ + -H "Authorization: Bearer dev-key-12345" +``` + +## Platform Notes + +### macOS (ARM) + +Docker build fails - CUDA packages are x86_64 only. Use local Python instead: +```bash +uv sync +HF_TOKEN=xxx REFLECTOR_GPU_APIKEY=xxx .venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000 +``` + +### Linux with NVIDIA GPU + +Docker works with CUDA acceleration: +```bash +docker compose up -d +``` + +### CPU-only + +Works on any platform, just slower. PyTorch auto-detects and falls back to CPU. 
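To see which device PyTorch will actually use, a quick check from the service's virtualenv (assumes the dependencies installed by `uv sync`):

```bash
.venv/bin/python -c "import torch; print('cuda' if torch.cuda.is_available() else 'cpu')"
```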
+ +## Switching Back to Modal.com + +Edit `server/.env`: + +```bash +TRANSCRIPT_BACKEND=modal +TRANSCRIPT_URL=https://monadical-sas--reflector-transcriber-parakeet-web.modal.run +TRANSCRIPT_MODAL_API_KEY= + +DIARIZATION_BACKEND=modal +DIARIZATION_URL=https://monadical-sas--reflector-diarizer-web.modal.run +DIARIZATION_MODAL_API_KEY= +``` + +## Troubleshooting + +### "Could not download pyannote pipeline" +- Accept model licenses at HuggingFace (see Prerequisites) +- Verify HF_TOKEN is set and valid + +### Service won't start +- Check port 8000 is free: `lsof -i :8000` +- Kill orphan processes if needed + +### Transcription returns empty text +- Ensure audio contains speech (not just tones/silence) +- Check audio format is supported (wav, mp3, etc.) + +### Deprecation warnings from torchaudio/pyannote +- Safe to ignore - doesn't affect functionality diff --git a/gpu/self_hosted/README.md b/gpu/self_hosted/README.md index 0180a8ae..395e05a6 100644 --- a/gpu/self_hosted/README.md +++ b/gpu/self_hosted/README.md @@ -56,9 +56,13 @@ Docker - Not yet provided in this directory. A Dockerfile will be added later. For now, use Local run above -Conformance tests +# Setup -# From this directory +[SETUP.md](SETUP.md) + +# Conformance tests + +## From this directory TRANSCRIPT_URL=http://localhost:8000 \ TRANSCRIPT_API_KEY=dev-key \ diff --git a/gpu/self_hosted/app/services/transcriber.py b/gpu/self_hosted/app/services/transcriber.py index 26a313cc..21e55fe5 100644 --- a/gpu/self_hosted/app/services/transcriber.py +++ b/gpu/self_hosted/app/services/transcriber.py @@ -129,6 +129,11 @@ class WhisperService: audio = np.frombuffer(proc.stdout, dtype=np.float32) return audio + # IMPORTANT: This VAD segment logic is duplicated in multiple files for deployment isolation. + # If you modify this function, you MUST update all copies: + # - gpu/modal_deployments/reflector_transcriber.py + # - gpu/modal_deployments/reflector_transcriber_parakeet.py + # - gpu/self_hosted/app/services/transcriber.py (this file) def vad_segments( audio_array, sample_rate: int = SAMPLE_RATE, @@ -153,6 +158,10 @@ class WhisperService: end = speech["end"] yield (start / float(SAMPLE_RATE), end / float(SAMPLE_RATE)) start = None + # Handle case where audio ends while speech is still active + if start is not None: + audio_duration = len(audio_array) / float(sample_rate) + yield (start / float(SAMPLE_RATE), audio_duration) iterator.reset_states() audio_array = load_audio_via_ffmpeg(file_path, SAMPLE_RATE) diff --git a/gpu/self_hosted/app/utils.py b/gpu/self_hosted/app/utils.py index 679804cb..7200cfa2 100644 --- a/gpu/self_hosted/app/utils.py +++ b/gpu/self_hosted/app/utils.py @@ -34,6 +34,12 @@ def ensure_dirs(): UPLOADS_PATH.mkdir(parents=True, exist_ok=True) +# IMPORTANT: This function is duplicated in multiple files for deployment isolation. 
+# If you modify the audio format detection logic, you MUST update all copies: +# - gpu/self_hosted/app/utils.py (this file) +# - gpu/modal_deployments/reflector_transcriber.py (2 copies) +# - gpu/modal_deployments/reflector_transcriber_parakeet.py +# - gpu/modal_deployments/reflector_diarizer.py def detect_audio_format(url: str, headers: Mapping[str, str]) -> str: url_path = urlparse(url).path for ext in SUPPORTED_FILE_EXTENSIONS: @@ -47,6 +53,8 @@ def detect_audio_format(url: str, headers: Mapping[str, str]) -> str: return "wav" if "audio/mp4" in content_type: return "mp4" + if "audio/webm" in content_type or "video/webm" in content_type: + return "webm" raise HTTPException( status_code=400, diff --git a/server/env.example b/server/.env.example similarity index 100% rename from server/env.example rename to server/.env.example diff --git a/server/reflector/settings.py b/server/reflector/settings.py index 1ec46d94..3f78f393 100644 --- a/server/reflector/settings.py +++ b/server/reflector/settings.py @@ -30,7 +30,9 @@ class Settings(BaseSettings): AUDIO_CHUNKER_BACKEND: str = "frames" # Audio Transcription - # backends: whisper, modal + # backends: + # - whisper: in-process model loading (no HTTP, runs in same process) + # - modal: HTTP API client (works with Modal.com OR self-hosted gpu/self_hosted/) TRANSCRIPT_BACKEND: str = "whisper" TRANSCRIPT_URL: str | None = None TRANSCRIPT_TIMEOUT: int = 90 @@ -75,6 +77,9 @@ class Settings(BaseSettings): LLM_CONTEXT_WINDOW: int = 16000 # Diarization + # backends: + # - pyannote: in-process model loading (no HTTP, runs in same process) + # - modal: HTTP API client (works with Modal.com OR self-hosted gpu/self_hosted/) DIARIZATION_ENABLED: bool = True DIARIZATION_BACKEND: str = "modal" DIARIZATION_URL: str | None = None