Mirror of https://github.com/Monadical-SAS/reflector.git (synced 2025-12-21 12:49:06 +00:00)

Commit: gpu self hosted setup guide (no-mistakes)
@@ -1,3 +1,5 @@
 b9d891d3424f371642cb032ecfd0e2564470a72c:server/tests/test_transcripts_recording_deletion.py:generic-api-key:15
 docs/docs/installation/auth-setup.md:curl-auth-header:250
 docs/docs/installation/daily-setup.md:curl-auth-header:277
+gpu/self_hosted/DEV_SETUP.md:curl-auth-header:74
+gpu/self_hosted/DEV_SETUP.md:curl-auth-header:83
@@ -2,7 +2,7 @@
 # Usage: docker compose -f docker-compose.prod.yml up -d
 #
 # Prerequisites:
-# 1. Copy env.example to .env and configure for both server/ and www/
+# 1. Copy .env.example to .env and configure for both server/ and www/
 # 2. Copy Caddyfile.example to Caddyfile and edit with your domains
 # 3. Deploy Modal GPU functions (see gpu/modal_deployments/deploy-all.sh)

@@ -84,6 +84,7 @@ Please provide specific values for:
 - Exact UDP port range used (e.g., 10000-20000)
 - STUN server configuration (if any)
 - ICE candidate gathering timeout
+- https://docs.daily.co/guides/privacy-and-security/corporate-firewalls-nats-allowed-ip-list

 - [ ] **Worker Configuration**
 - Default Celery worker count
@@ -9,11 +9,17 @@ This guide walks you through deploying Reflector from scratch. Follow these step

 ## What You'll Set Up

-```
-User --> Caddy (auto-SSL) --> Frontend (Next.js)
-     --> Backend (FastAPI) --> PostgreSQL
-     --> Redis
-     --> Celery Workers --> Modal.com GPU
+```mermaid
+flowchart LR
+    User --> Caddy["Caddy (auto-SSL)"]
+    Caddy --> Frontend["Frontend (Next.js)"]
+    Caddy --> Backend["Backend (FastAPI)"]
+    Backend --> PostgreSQL
+    Backend --> Redis
+    Backend --> Workers["Celery Workers"]
+    Workers --> PostgreSQL
+    Workers --> Redis
+    Workers --> GPU["GPU Processing<br/>(Modal.com OR Self-hosted)"]
 ```

 ## Prerequisites
@@ -22,7 +28,9 @@ Before starting, you need:

 - [ ] **Production server** - Ubuntu 22.04+, 4+ cores, 8GB+ RAM, public IP
 - [ ] **Two domain names** - e.g., `app.example.com` (frontend) and `api.example.com` (backend)
-- [ ] **Modal.com account** - Free tier at https://modal.com
+- [ ] **GPU processing** - Choose one:
+  - Modal.com account (free tier at https://modal.com), OR
+  - GPU server with NVIDIA GPU (8GB+ VRAM)
 - [ ] **HuggingFace account** - Free at https://huggingface.co
 - [ ] **OpenAI API key** - For summaries and topic detection at https://platform.openai.com/account/api-keys

@@ -52,13 +60,24 @@ dig api.example.com +short

 ---

-## Step 2: Deploy Modal GPU Functions
+## Step 2: Deploy GPU Processing
+
+Reflector requires GPU processing for transcription (Whisper) and speaker diarization (Pyannote). Choose one option:
+
+| | **Modal.com (Cloud)** | **Self-Hosted GPU** |
+|---|---|---|
+| **Best for** | No GPU hardware, zero maintenance | Own GPU server, full control |
+| **Pricing** | Pay-per-use (~$0.01-0.10/min audio) | Fixed infrastructure cost |
+| **Setup** | Run from laptop (browser auth) | Run on GPU server |
+| **Scaling** | Automatic | Manual |
+
+### Option A: Modal.com (Serverless Cloud GPU)

 **Location: YOUR LOCAL COMPUTER (laptop/desktop)**

 Modal requires browser authentication, so this runs locally - not on your server.

-### Accept HuggingFace Licenses
+#### Accept HuggingFace Licenses

 Visit both pages and click "Accept":
 - https://huggingface.co/pyannote/speaker-diarization-3.1
@@ -66,7 +85,7 @@ Visit both pages and click "Accept":

 Then generate a token at https://huggingface.co/settings/tokens

-### Deploy to Modal
+#### Deploy to Modal

 ```bash
 pip install modal
@@ -77,10 +96,26 @@ cd reflector/gpu/modal_deployments
 ./deploy-all.sh --hf-token YOUR_HUGGINGFACE_TOKEN
 ```

-**Save the output** - copy the configuration block, you'll need it for Step 5.
+**Save the output** - copy the configuration block, you'll need it for Step 4.

 See [Modal Setup](./modal-setup) for troubleshooting and details.

+### Option B: Self-Hosted GPU
+
+**Location: YOUR GPU SERVER**
+
+Requires: NVIDIA GPU with 8GB+ VRAM, Ubuntu 22.04+, 40-50GB disk (Docker) or 25-30GB (Systemd).
+
+See [Self-Hosted GPU Setup](./self-hosted-gpu-setup) for complete instructions. Quick summary:
+
+1. Install NVIDIA drivers and Docker (or uv for systemd)
+2. Clone repository: `git clone https://github.com/monadical-sas/reflector.git`
+3. Configure `.env` with HuggingFace token
+4. Start service (Docker compose or systemd)
+5. Set up Caddy reverse proxy for HTTPS
+
+**Save your API key and HTTPS URL** - you'll need them for Step 4.
+
 ---

 ## Step 3: Prepare Server
@@ -102,12 +137,9 @@ ssh user@your-server-ip
 docker --version  # verify
 ```

-### Open Firewall
+### Firewall

-```bash
-sudo ufw allow 80/tcp
-sudo ufw allow 443/tcp
-```
+Ensure ports 80 (HTTP) and 443 (HTTPS) are open for inbound traffic. The method varies by cloud provider and OS configuration.

 ### Clone Repository

@@ -120,7 +152,7 @@ cd reflector

 ## Step 4: Configure Environment

-**Location: YOUR SERVER (via SSH)**
+**Location: YOUR SERVER (via SSH, in the `reflector` directory)**

 Reflector has two env files:
 - `server/.env` - Backend configuration
@@ -129,7 +161,7 @@ Reflector has two env files:
 ### Backend Configuration

 ```bash
-cp server/env.example server/.env
+cp server/.env.example server/.env
 nano server/.env
 ```

@@ -151,15 +183,24 @@ CORS_ALLOW_CREDENTIALS=true
 # Secret key - generate with: openssl rand -hex 32
 SECRET_KEY=<your-generated-secret>

-# Modal GPU (paste from deploy-all.sh output)
+# GPU Processing - choose ONE option from Step 2:
+
+# Option A: Modal.com (paste from deploy-all.sh output)
 TRANSCRIPT_BACKEND=modal
 TRANSCRIPT_URL=https://yourname--reflector-transcriber-web.modal.run
 TRANSCRIPT_MODAL_API_KEY=<from-deploy-all.sh-output>

 DIARIZATION_BACKEND=modal
 DIARIZATION_URL=https://yourname--reflector-diarizer-web.modal.run
 DIARIZATION_MODAL_API_KEY=<from-deploy-all.sh-output>

+# Option B: Self-hosted GPU (use your GPU server URL and API key)
+# TRANSCRIPT_BACKEND=modal
+# TRANSCRIPT_URL=https://gpu.example.com
+# TRANSCRIPT_MODAL_API_KEY=<your-generated-api-key>
+# DIARIZATION_BACKEND=modal
+# DIARIZATION_URL=https://gpu.example.com
+# DIARIZATION_MODAL_API_KEY=<your-generated-api-key>
+
 # Storage - where to store audio files and transcripts
 TRANSCRIPT_STORAGE_BACKEND=local
@@ -205,7 +246,8 @@ cp Caddyfile.example Caddyfile
 nano Caddyfile
 ```

-Replace `example.com` with your domains:
+Replace `example.com` with your domains. The `{$VAR:default}` syntax uses Caddy's env var substitution - you can either edit the file directly or set `FRONTEND_DOMAIN` and `API_DOMAIN` environment variables.

 ```
 {$FRONTEND_DOMAIN:app.example.com} {
     reverse_proxy web:3000
@@ -226,9 +268,13 @@ Replace `example.com` with your domains:
 docker compose -f docker-compose.prod.yml up -d
 ```

-Wait for containers to start (~30 seconds), then run migrations:
+Wait for PostgreSQL to be ready, then run migrations:

 ```bash
+# Wait for postgres to be healthy (may take 30-60 seconds on first run)
+docker compose -f docker-compose.prod.yml exec postgres pg_isready -U reflector
+
+# Run database migrations
 docker compose -f docker-compose.prod.yml exec server uv run alembic upgrade head
 ```

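If you script this step, a minimal wait loop is a common pattern - a sketch reusing the same `pg_isready` check and service names from this compose file:

```bash
# Poll until postgres accepts connections, then run migrations.
until docker compose -f docker-compose.prod.yml exec postgres pg_isready -U reflector; do
  sleep 2
done
docker compose -f docker-compose.prod.yml exec server uv run alembic upgrade head
```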
@@ -332,6 +378,7 @@ docker compose -f docker-compose.prod.yml logs

 ## Next Steps

-- [Modal Setup](./modal-setup) - GPU processing details
+- [Modal Setup](./modal-setup) - Cloud GPU processing details
+- [Self-Hosted GPU Setup](./self-hosted-gpu-setup) - Own GPU server deployment
 - [Authentication Setup](./auth-setup) - Authentik OAuth
 - [System Requirements](./requirements) - Hardware specs
docs/docs/installation/self-hosted-gpu-setup.md (new file, 514 lines)
@@ -0,0 +1,514 @@
---
sidebar_position: 5
title: Self-Hosted GPU Setup
---

# Self-Hosted GPU Setup

This guide covers deploying Reflector's GPU processing on your own server instead of Modal.com. For the complete deployment guide, see [Deployment Guide](./overview).

## When to Use Self-Hosted GPU

**Choose self-hosted GPU if you:**
- Have GPU hardware available (NVIDIA required)
- Want full control over processing
- Prefer fixed infrastructure costs over pay-per-use
- Have privacy or data locality requirements
- Need to process audio without external API calls

**Choose Modal.com instead if you:**
- Don't have GPU hardware
- Want zero infrastructure management
- Prefer pay-per-use pricing
- Need instant scaling for variable workloads

See [Modal.com Setup](./modal-setup) for cloud GPU deployment.

## What Gets Deployed

The self-hosted GPU service provides the same API endpoints as Modal:
- `POST /v1/audio/transcriptions` - Whisper transcription
- `POST /diarize` - Pyannote speaker diarization

Your main Reflector server connects to this service exactly like it connects to Modal - only the URL changes.
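For example, a transcription request looks the same against either backend - a sketch with placeholder URL and key, using the call shape documented in `gpu/self_hosted/DEV_SETUP.md`:

```bash
# Same endpoint and bearer auth whether the URL points at Modal or your server.
curl -s -X POST https://gpu.example.com/v1/audio/transcriptions \
  -H "Authorization: Bearer <your-api-key>" \
  -F "file=@/path/to/audio.wav" \
  -F "language=en"
```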
## Prerequisites

### Hardware
- **GPU**: NVIDIA GPU with 8GB+ VRAM (tested on Tesla T4 with 15GB)
- **CPU**: 4+ cores recommended
- **RAM**: 8GB minimum, 16GB recommended
- **Disk**:
  - Docker method: 40-50GB minimum
  - Systemd method: 25-30GB minimum

### Software
- Ubuntu 22.04 or 24.04
- Public IP address
- Domain name with DNS A record pointing to server

### Accounts
- **HuggingFace account** with accepted Pyannote licenses:
  - https://huggingface.co/pyannote/speaker-diarization-3.1
  - https://huggingface.co/pyannote/segmentation-3.0
- **HuggingFace access token** from https://huggingface.co/settings/tokens
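To confirm the token works before deploying, you can call the HuggingFace whoami endpoint - a quick check, assuming `HF_TOKEN` holds your token:

```bash
# A JSON payload with your username means the token is valid.
curl -s -H "Authorization: Bearer $HF_TOKEN" https://huggingface.co/api/whoami-v2
```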
## Choose Deployment Method

### Docker Deployment (Recommended)

**Pros:**
- Container isolation and reproducibility
- No manual library path configuration
- Easier to replicate across servers
- Built-in restart policies
- Simpler dependency management

**Cons:**
- Higher disk usage (~15GB for container)
- Requires 40-50GB disk minimum

**Best for:** Teams wanting reproducible deployments, multiple GPU servers

### Systemd Deployment

**Pros:**
- Lower disk usage (~8GB total)
- Direct GPU access (no container layer)
- Works on smaller disks (25-30GB)

**Cons:**
- Manual `LD_LIBRARY_PATH` configuration
- Less portable across systems

**Best for:** Single GPU server, limited disk space

---

## Docker Deployment

### Step 1: Install NVIDIA Driver

```bash
sudo apt update
sudo apt install -y nvidia-driver-535

# Load kernel modules
sudo modprobe nvidia

# Verify installation
nvidia-smi
```

Expected output: GPU details with driver version and CUDA version.

### Step 2: Install Docker

```bash
curl -fsSL https://get.docker.com | sudo sh
sudo usermod -aG docker $USER

# Log out and back in for group changes
exit
# SSH back in
```

### Step 3: Install NVIDIA Container Toolkit

```bash
# Add NVIDIA repository
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
  sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg

curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
  sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
  sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list

# Install toolkit
sudo apt-get update
sudo apt-get install -y nvidia-container-toolkit

# Configure Docker runtime
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker
```

### Step 4: Clone Repository and Configure

```bash
git clone https://github.com/monadical-sas/reflector.git
cd reflector/gpu/self_hosted

# Create environment file
cat > .env << EOF
REFLECTOR_GPU_APIKEY=$(openssl rand -hex 16)
HF_TOKEN=your_huggingface_token_here
EOF

# Note the generated API key - you'll need it for main server config
cat .env
```

### Step 5: Create Docker Compose File

```bash
cat > compose.yml << 'EOF'
services:
  reflector_gpu:
    build:
      context: .
    ports:
      - "8000:8000"
    env_file:
      - .env
    volumes:
      - ./cache:/root/.cache
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: unless-stopped
EOF
```

### Step 6: Build and Start

```bash
# Build image (takes ~5 minutes, downloads ~10GB)
sudo docker compose build

# Start service
sudo docker compose up -d

# Wait for startup and verify
sleep 30
sudo docker compose logs
```

Look for: `INFO: Application startup complete. Uvicorn running on http://0.0.0.0:8000`

### Step 7: Verify GPU Access

```bash
# Check GPU is accessible from container
sudo docker exec $(sudo docker ps -q) nvidia-smi
```

Should show GPU with ~3GB VRAM used (models loaded).
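As a final smoke test you can exercise the API itself - a sketch where `sample.wav` is any short speech clip you supply, with the key from the `.env` created in Step 4:

```bash
source .env
curl -s -X POST http://localhost:8000/v1/audio/transcriptions \
  -H "Authorization: Bearer $REFLECTOR_GPU_APIKEY" \
  -F "file=@sample.wav" \
  -F "language=en"
# Expect JSON with a "text" field containing the transcription.
```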
---

## Systemd Deployment

### Step 1: Install NVIDIA Driver

```bash
sudo apt update
sudo apt install -y nvidia-driver-535

# Load kernel modules
sudo modprobe nvidia

# Verify installation
nvidia-smi
```

### Step 2: Install Dependencies

```bash
# Install ffmpeg
sudo apt install -y ffmpeg

# Install uv package manager
curl -LsSf https://astral.sh/uv/install.sh | sh
source ~/.local/bin/env

# Clone repository
git clone https://github.com/monadical-sas/reflector.git
cd reflector/gpu/self_hosted
```

### Step 3: Configure Environment

```bash
# Create environment file
cat > .env << EOF
REFLECTOR_GPU_APIKEY=$(openssl rand -hex 16)
HF_TOKEN=your_huggingface_token_here
EOF

# Note the generated API key
cat .env
```

### Step 4: Install Python Packages

```bash
# Install dependencies (~3GB download)
uv sync
```

### Step 5: Create Systemd Service

```bash
# Generate library paths for NVIDIA packages
export NVIDIA_LIBS=$(find ~/reflector/gpu/self_hosted/.venv/lib/python3.12/site-packages/nvidia -name lib -type d | tr '\n' ':')

# Load environment variables
source ~/reflector/gpu/self_hosted/.env

# Create service file
sudo tee /etc/systemd/system/reflector-gpu.service << EOFSVC
[Unit]
Description=Reflector GPU Service (Transcription & Diarization)
After=network.target

[Service]
Type=simple
User=$USER
WorkingDirectory=$HOME/reflector/gpu/self_hosted
Environment="PATH=$HOME/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
Environment="HF_TOKEN=${HF_TOKEN}"
Environment="REFLECTOR_GPU_APIKEY=${REFLECTOR_GPU_APIKEY}"
Environment="LD_LIBRARY_PATH=${NVIDIA_LIBS}"
ExecStart=$HOME/reflector/gpu/self_hosted/.venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target
EOFSVC

# Enable and start
sudo systemctl daemon-reload
sudo systemctl enable reflector-gpu
sudo systemctl start reflector-gpu
```

### Step 6: Verify Service

```bash
# Check status
sudo systemctl status reflector-gpu

# View logs
sudo journalctl -u reflector-gpu -f
```

Look for: `INFO: Application startup complete.`

---

## Configure HTTPS with Caddy

Both deployment methods need HTTPS for production. Caddy handles SSL automatically.

### Install Caddy

```bash
sudo apt install -y debian-keyring debian-archive-keyring apt-transport-https curl

curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/gpg.key' | \
  sudo gpg --dearmor -o /usr/share/keyrings/caddy-stable-archive-keyring.gpg

curl -1sLf 'https://dl.cloudsmith.io/public/caddy/stable/debian.deb.txt' | \
  sudo tee /etc/apt/sources.list.d/caddy-stable.list

sudo apt update
sudo apt install -y caddy
```

### Configure Reverse Proxy

```bash
sudo tee /etc/caddy/Caddyfile << 'EOF'
gpu.example.com {
    reverse_proxy localhost:8000
}
EOF

# Reload Caddy (auto-provisions SSL certificate)
sudo systemctl reload caddy
```

Replace `gpu.example.com` with your domain.

### Verify HTTPS

```bash
curl -I https://gpu.example.com/docs
# Should return HTTP/2 200
```

---

## Configure Main Reflector Server

On your main Reflector server, update `server/.env`:

```env
# GPU Processing - Self-hosted
TRANSCRIPT_BACKEND=modal
TRANSCRIPT_URL=https://gpu.example.com
TRANSCRIPT_MODAL_API_KEY=<your-generated-api-key>

DIARIZATION_BACKEND=modal
DIARIZATION_URL=https://gpu.example.com
DIARIZATION_MODAL_API_KEY=<your-generated-api-key>
```

**Note:** The backend type is `modal` because the self-hosted GPU service implements the same API contract as Modal.com. This allows you to switch between cloud and self-hosted GPU processing by only changing the URL and API key.

Restart services to apply:

```bash
docker compose -f docker-compose.prod.yml restart server worker
```
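To confirm the workers are actually hitting your GPU server, tail their logs while a recording is processed - same compose commands as elsewhere in this guide:

```bash
docker compose -f docker-compose.prod.yml logs -f worker
```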
---

## Service Management

All commands in this section assume you're in `~/reflector/gpu/self_hosted/`.

### Docker

```bash
# View logs
sudo docker compose logs -f

# Restart service
sudo docker compose restart

# Stop service
sudo docker compose down

# Check status
sudo docker compose ps
```

### Systemd

```bash
# View logs
sudo journalctl -u reflector-gpu -f

# Restart service
sudo systemctl restart reflector-gpu

# Stop service
sudo systemctl stop reflector-gpu

# Check status
sudo systemctl status reflector-gpu
```

### Monitor GPU

```bash
# Check GPU usage
nvidia-smi

# Watch in real-time
watch -n 1 nvidia-smi
```

**Typical GPU memory usage:**
- Idle (models loaded): ~3GB VRAM
- During transcription: ~4-5GB VRAM

---

## Performance Notes

**Tesla T4 benchmarks:**
- Transcription: ~2-3x real-time (10 min audio in 3-5 min)
- Diarization: ~1.5x real-time
- Max concurrent requests: 2-3 (depends on audio length)
- First request warmup: ~10 seconds (model loading)

---

## Troubleshooting

### nvidia-smi fails after driver install

```bash
# Manually load kernel modules
sudo modprobe nvidia
nvidia-smi
```

### Service fails with "Could not download pyannote pipeline"

1. Verify HF_TOKEN is valid: `echo $HF_TOKEN`
2. Check model access at https://huggingface.co/pyannote/speaker-diarization-3.1
3. Regenerate service/compose with correct token
4. Restart service

### cuDNN library loading errors (Systemd only)

Symptom: `Unable to load libcudnn_cnn.so`

Regenerate the systemd service file - the `LD_LIBRARY_PATH` must include all NVIDIA package directories.
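To see which directories need to be on the path, rerun the same `find` used in Step 5:

```bash
find ~/reflector/gpu/self_hosted/.venv/lib/python3.12/site-packages/nvidia -name lib -type d
# Each printed directory must appear in the service's LD_LIBRARY_PATH.
```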
### Cannot connect to HTTPS endpoint

1. Verify DNS resolves: `dig +short gpu.example.com`
2. Check firewall: `sudo ufw status` (ports 80, 443 must be open)
3. Check Caddy: `sudo systemctl status caddy`
4. View Caddy logs: `sudo journalctl -u caddy -n 50`

### SSL certificate not provisioning

Requirements for Let's Encrypt:
- Ports 80 and 443 publicly accessible
- DNS resolves to server's public IP
- Valid domain (not localhost or private IP)

### Docker container won't start

```bash
# Check logs
sudo docker compose logs

# Common issues:
# - Port 8000 already in use
# - GPU not accessible (nvidia-ctk not configured)
# - Missing .env file
```

---

## Security Considerations

1. **API Key**: Keep `REFLECTOR_GPU_APIKEY` secret, rotate periodically
2. **HuggingFace Token**: Treat as password, never commit to git
3. **Firewall**: Only expose ports 80 and 443 publicly
4. **Updates**: Regularly update system packages
5. **Monitoring**: Set up alerts for service failures
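Rotation can be as simple as regenerating the key and updating both sides - a sketch reusing the commands from this guide (adjust paths for your layout; for systemd, regenerate the unit file instead since the key is baked into it):

```bash
# On the GPU server: generate a new key and recreate the container
NEW_KEY=$(openssl rand -hex 16)
sed -i "s/^REFLECTOR_GPU_APIKEY=.*/REFLECTOR_GPU_APIKEY=$NEW_KEY/" ~/reflector/gpu/self_hosted/.env
sudo docker compose up -d --force-recreate  # re-reads the .env file

# On the main server: set TRANSCRIPT_MODAL_API_KEY / DIARIZATION_MODAL_API_KEY
# in server/.env to the new key, then:
docker compose -f docker-compose.prod.yml restart server worker
```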
---

## Updating

### Docker

```bash
cd ~/reflector/gpu/self_hosted
git pull
sudo docker compose build
sudo docker compose up -d
```

### Systemd

```bash
cd ~/reflector/gpu/self_hosted
git pull
uv sync
sudo systemctl restart reflector-gpu
```
@@ -24,6 +24,12 @@ app = modal.App(name="reflector-diarizer")
 upload_volume = modal.Volume.from_name("diarizer-uploads", create_if_missing=True)


+# IMPORTANT: This function is duplicated in multiple files for deployment isolation.
+# If you modify the audio format detection logic, you MUST update all copies:
+# - gpu/self_hosted/app/utils.py
+# - gpu/modal_deployments/reflector_transcriber.py (2 copies)
+# - gpu/modal_deployments/reflector_transcriber_parakeet.py
+# - gpu/modal_deployments/reflector_diarizer.py (this file)
 def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtension:
     parsed_url = urlparse(url)
     url_path = parsed_url.path
@@ -39,6 +45,8 @@ def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtens
         return AudioFileExtension("wav")
     if "audio/mp4" in content_type:
         return AudioFileExtension("mp4")
+    if "audio/webm" in content_type or "video/webm" in content_type:
+        return AudioFileExtension("webm")

     raise ValueError(
         f"Unsupported audio format for URL: {url}. "
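For illustration, the effect of the added branch - a sketch against `detect_audio_format` as shown above, assuming the URL path carries no recognizable extension and the header lookup is case-normalized:

```python
# Hypothetical call: format comes from the Content-Type header, not the path.
fmt = detect_audio_format(
    "https://storage.example.com/recordings/abc123",
    {"content-type": "video/webm"},
)
assert fmt == AudioFileExtension("webm")  # previously this raised ValueError
```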
@@ -99,6 +99,12 @@ image = (
 )


+# IMPORTANT: This function is duplicated in multiple files for deployment isolation.
+# If you modify the audio format detection logic, you MUST update all copies:
+# - gpu/self_hosted/app/utils.py
+# - gpu/modal_deployments/reflector_transcriber.py (this file - 2 copies!)
+# - gpu/modal_deployments/reflector_transcriber_parakeet.py
+# - gpu/modal_deployments/reflector_diarizer.py
 def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtension:
     parsed_url = urlparse(url)
     url_path = parsed_url.path
@@ -114,6 +120,8 @@ def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtens
         return AudioFileExtension("wav")
     if "audio/mp4" in content_type:
         return AudioFileExtension("mp4")
+    if "audio/webm" in content_type or "video/webm" in content_type:
+        return AudioFileExtension("webm")

     raise ValueError(
         f"Unsupported audio format for URL: {url}. "
@@ -316,6 +324,11 @@ class TranscriberWhisperFile:
         import numpy as np
         from silero_vad import VADIterator

+        # IMPORTANT: This VAD segment logic is duplicated in multiple files for deployment isolation.
+        # If you modify this function, you MUST update all copies:
+        # - gpu/modal_deployments/reflector_transcriber.py (this file)
+        # - gpu/modal_deployments/reflector_transcriber_parakeet.py
+        # - gpu/self_hosted/app/services/transcriber.py
         def vad_segments(
             audio_array,
             sample_rate: int = SAMPLERATE,
@@ -323,6 +336,7 @@ class TranscriberWhisperFile:
         ) -> Generator[TimeSegment, None, None]:
             """Generate speech segments as TimeSegment using Silero VAD."""
             iterator = VADIterator(self.vad_model, sampling_rate=sample_rate)
+            audio_duration = len(audio_array) / float(SAMPLERATE)
             start = None
             for i in range(0, len(audio_array), window_size):
                 chunk = audio_array[i : i + window_size]
@@ -342,6 +356,9 @@ class TranscriberWhisperFile:
                         start / float(SAMPLERATE), end / float(SAMPLERATE)
                     )
                     start = None
+            # Handle case where audio ends while speech is still active
+            if start is not None:
+                yield TimeSegment(start / float(SAMPLERATE), audio_duration)
             iterator.reset_states()

         upload_volume.reload()
@@ -407,6 +424,12 @@ class TranscriberWhisperFile:
         return {"text": " ".join(all_text), "words": all_words}


+# IMPORTANT: This function is duplicated in multiple files for deployment isolation.
+# If you modify the audio format detection logic, you MUST update all copies:
+# - gpu/self_hosted/app/utils.py
+# - gpu/modal_deployments/reflector_transcriber.py (this file - 2 copies!)
+# - gpu/modal_deployments/reflector_transcriber_parakeet.py
+# - gpu/modal_deployments/reflector_diarizer.py
 def detect_audio_format(url: str, headers: dict) -> str:
     from urllib.parse import urlparse

@@ -424,6 +447,8 @@ def detect_audio_format(url: str, headers: dict) -> str:
         return "wav"
     if "audio/mp4" in content_type:
         return "mp4"
+    if "audio/webm" in content_type or "video/webm" in content_type:
+        return "webm"

     raise HTTPException(
         status_code=400,
@@ -90,6 +90,12 @@ image = (
 )


+# IMPORTANT: This function is duplicated in multiple files for deployment isolation.
+# If you modify the audio format detection logic, you MUST update all copies:
+# - gpu/self_hosted/app/utils.py
+# - gpu/modal_deployments/reflector_transcriber.py (2 copies)
+# - gpu/modal_deployments/reflector_transcriber_parakeet.py (this file)
+# - gpu/modal_deployments/reflector_diarizer.py
 def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtension:
     parsed_url = urlparse(url)
     url_path = parsed_url.path
@@ -105,6 +111,8 @@ def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtens
         return AudioFileExtension("wav")
     if "audio/mp4" in content_type:
         return AudioFileExtension("mp4")
+    if "audio/webm" in content_type or "video/webm" in content_type:
+        return AudioFileExtension("webm")

     raise ValueError(
         f"Unsupported audio format for URL: {url}. "
@@ -301,6 +309,11 @@ class TranscriberParakeetFile:
         audio_array, sample_rate = librosa.load(file_path, sr=SAMPLERATE, mono=True)
         return audio_array

+    # IMPORTANT: This VAD segment logic is duplicated in multiple files for deployment isolation.
+    # If you modify this function, you MUST update all copies:
+    # - gpu/modal_deployments/reflector_transcriber.py
+    # - gpu/modal_deployments/reflector_transcriber_parakeet.py (this file)
+    # - gpu/self_hosted/app/services/transcriber.py
     def vad_segment_generator(
         audio_array,
     ) -> Generator[TimeSegment, None, None]:
gpu/self_hosted/DEV_SETUP.md (new file, 137 lines)
@@ -0,0 +1,137 @@
# Local Development GPU Setup

Run transcription and diarization locally for development/testing.

> **For production deployment**, see the [Self-Hosted GPU Setup Guide](../../docs/docs/installation/self-hosted-gpu-setup.md).

## Prerequisites

1. **Python 3.12+** and **uv** package manager
2. **FFmpeg** installed and on PATH
3. **HuggingFace account** with access to pyannote models

### Accept Pyannote Licenses (Required)

Before first run, accept licenses for these gated models (logged into HuggingFace):
- https://hf.co/pyannote/speaker-diarization-3.1
- https://hf.co/pyannote/segmentation-3.0
## Quick Start

### 1. Install dependencies

```bash
cd gpu/self_hosted
uv sync
```

### 2. Start the GPU service

```bash
cd gpu/self_hosted
HF_TOKEN=<your-huggingface-token> \
REFLECTOR_GPU_APIKEY=dev-key-12345 \
.venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000
```

Note: The `.env` file is NOT auto-loaded. Pass env vars explicitly or use:
```bash
export HF_TOKEN=<your-token>
export REFLECTOR_GPU_APIKEY=dev-key-12345
.venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000
```

### 3. Configure Reflector to use local GPU

Edit `server/.env`:

```bash
# Transcription - local GPU service
TRANSCRIPT_BACKEND=modal
TRANSCRIPT_URL=http://host.docker.internal:8000
TRANSCRIPT_MODAL_API_KEY=dev-key-12345

# Diarization - local GPU service
DIARIZATION_BACKEND=modal
DIARIZATION_URL=http://host.docker.internal:8000
DIARIZATION_MODAL_API_KEY=dev-key-12345
```

Note: Use `host.docker.internal` because Reflector server runs in Docker.
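On Linux, `host.docker.internal` does not resolve by default; Docker 20.10+ can map it to the host via `host-gateway`. A hypothetical addition to the Reflector server's compose service (service name assumed), if you hit connection errors:

```yaml
services:
  server:
    extra_hosts:
      - "host.docker.internal:host-gateway"  # makes the hostname resolve on Linux
```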
### 4. Restart Reflector server

```bash
cd server
docker compose restart server worker
```

## Testing

### Test transcription

```bash
curl -s -X POST http://localhost:8000/v1/audio/transcriptions \
  -H "Authorization: Bearer dev-key-12345" \
  -F "file=@/path/to/audio.wav" \
  -F "language=en"
```

### Test diarization

```bash
curl -s -X POST "http://localhost:8000/diarize?audio_file_url=<audio-url>" \
  -H "Authorization: Bearer dev-key-12345"
```

## Platform Notes

### macOS (ARM)

Docker build fails - CUDA packages are x86_64 only. Use local Python instead:
```bash
uv sync
HF_TOKEN=xxx REFLECTOR_GPU_APIKEY=xxx .venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000
```

### Linux with NVIDIA GPU

Docker works with CUDA acceleration:
```bash
docker compose up -d
```

### CPU-only

Works on any platform, just slower. PyTorch auto-detects and falls back to CPU.

## Switching Back to Modal.com

Edit `server/.env`:

```bash
TRANSCRIPT_BACKEND=modal
TRANSCRIPT_URL=https://monadical-sas--reflector-transcriber-parakeet-web.modal.run
TRANSCRIPT_MODAL_API_KEY=<modal-api-key>

DIARIZATION_BACKEND=modal
DIARIZATION_URL=https://monadical-sas--reflector-diarizer-web.modal.run
DIARIZATION_MODAL_API_KEY=<modal-api-key>
```

## Troubleshooting

### "Could not download pyannote pipeline"
- Accept model licenses at HuggingFace (see Prerequisites)
- Verify HF_TOKEN is set and valid

### Service won't start
- Check port 8000 is free: `lsof -i :8000`
- Kill orphan processes if needed

### Transcription returns empty text
- Ensure audio contains speech (not just tones/silence)
- Check audio format is supported (wav, mp3, etc.)

### Deprecation warnings from torchaudio/pyannote
- Safe to ignore - doesn't affect functionality
@@ -56,9 +56,13 @@ Docker

 - Not yet provided in this directory. A Dockerfile will be added later. For now, use Local run above

-Conformance tests
+# Setup
+
+[SETUP.md](SETUP.md)
+
+# Conformance tests

-# From this directory
+## From this directory
+
 TRANSCRIPT_URL=http://localhost:8000 \
 TRANSCRIPT_API_KEY=dev-key \
@@ -129,6 +129,11 @@ class WhisperService:
         audio = np.frombuffer(proc.stdout, dtype=np.float32)
         return audio

+        # IMPORTANT: This VAD segment logic is duplicated in multiple files for deployment isolation.
+        # If you modify this function, you MUST update all copies:
+        # - gpu/modal_deployments/reflector_transcriber.py
+        # - gpu/modal_deployments/reflector_transcriber_parakeet.py
+        # - gpu/self_hosted/app/services/transcriber.py (this file)
         def vad_segments(
             audio_array,
             sample_rate: int = SAMPLE_RATE,
@@ -153,6 +158,10 @@ class WhisperService:
                 end = speech["end"]
                 yield (start / float(SAMPLE_RATE), end / float(SAMPLE_RATE))
                 start = None
+            # Handle case where audio ends while speech is still active
+            if start is not None:
+                audio_duration = len(audio_array) / float(sample_rate)
+                yield (start / float(SAMPLE_RATE), audio_duration)
             iterator.reset_states()

         audio_array = load_audio_via_ffmpeg(file_path, SAMPLE_RATE)
@@ -34,6 +34,12 @@ def ensure_dirs():
     UPLOADS_PATH.mkdir(parents=True, exist_ok=True)


+# IMPORTANT: This function is duplicated in multiple files for deployment isolation.
+# If you modify the audio format detection logic, you MUST update all copies:
+# - gpu/self_hosted/app/utils.py (this file)
+# - gpu/modal_deployments/reflector_transcriber.py (2 copies)
+# - gpu/modal_deployments/reflector_transcriber_parakeet.py
+# - gpu/modal_deployments/reflector_diarizer.py
 def detect_audio_format(url: str, headers: Mapping[str, str]) -> str:
     url_path = urlparse(url).path
     for ext in SUPPORTED_FILE_EXTENSIONS:
@@ -47,6 +53,8 @@ def detect_audio_format(url: str, headers: Mapping[str, str]) -> str:
         return "wav"
     if "audio/mp4" in content_type:
         return "mp4"
+    if "audio/webm" in content_type or "video/webm" in content_type:
+        return "webm"

     raise HTTPException(
         status_code=400,
@@ -30,7 +30,9 @@ class Settings(BaseSettings):
     AUDIO_CHUNKER_BACKEND: str = "frames"

     # Audio Transcription
-    # backends: whisper, modal
+    # backends:
+    # - whisper: in-process model loading (no HTTP, runs in same process)
+    # - modal: HTTP API client (works with Modal.com OR self-hosted gpu/self_hosted/)
     TRANSCRIPT_BACKEND: str = "whisper"
     TRANSCRIPT_URL: str | None = None
     TRANSCRIPT_TIMEOUT: int = 90
@@ -75,6 +77,9 @@ class Settings(BaseSettings):
     LLM_CONTEXT_WINDOW: int = 16000

     # Diarization
+    # backends:
+    # - pyannote: in-process model loading (no HTTP, runs in same process)
+    # - modal: HTTP API client (works with Modal.com OR self-hosted gpu/self_hosted/)
     DIARIZATION_ENABLED: bool = True
     DIARIZATION_BACKEND: str = "modal"
     DIARIZATION_URL: str | None = None
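In env-file terms, the backend comments above translate to choices like these - an illustrative sketch with placeholder URLs, matching the settings names in this file:

```env
# Option 1: in-process models (no separate GPU service)
# TRANSCRIPT_BACKEND=whisper
# DIARIZATION_BACKEND=pyannote

# Option 2: HTTP client mode (Modal.com or the self-hosted gpu/self_hosted/ service)
TRANSCRIPT_BACKEND=modal
TRANSCRIPT_URL=https://gpu.example.com
DIARIZATION_BACKEND=modal
DIARIZATION_URL=https://gpu.example.com
```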