gpu self hosted setup guide (no-mistakes)
@@ -24,6 +24,12 @@ app = modal.App(name="reflector-diarizer")
upload_volume = modal.Volume.from_name("diarizer-uploads", create_if_missing=True)


# IMPORTANT: This function is duplicated in multiple files for deployment isolation.
# If you modify the audio format detection logic, you MUST update all copies:
# - gpu/self_hosted/app/utils.py
# - gpu/modal_deployments/reflector_transcriber.py (2 copies)
# - gpu/modal_deployments/reflector_transcriber_parakeet.py
# - gpu/modal_deployments/reflector_diarizer.py (this file)
def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtension:
    parsed_url = urlparse(url)
    url_path = parsed_url.path
@@ -39,6 +45,8 @@ def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtens
        return AudioFileExtension("wav")
    if "audio/mp4" in content_type:
        return AudioFileExtension("mp4")
    if "audio/webm" in content_type or "video/webm" in content_type:
        return AudioFileExtension("webm")

    raise ValueError(
        f"Unsupported audio format for URL: {url}. "
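
Every copy of this helper follows the same two-step flow: trust an explicit file extension in the URL path first, then fall back to the response's Content-Type header. A standalone sketch of that shared flow (the extension list, header key, and the wav condition are illustrative, since they sit outside the hunks shown here; each deployment file keeps its own typed variant):

```python
from typing import Mapping
from urllib.parse import urlparse

# Illustrative list; the real copies define their own supported-extension constants.
SUPPORTED_FILE_EXTENSIONS = ("wav", "mp3", "mp4", "webm", "ogg", "flac")


def detect_audio_format_sketch(url: str, headers: Mapping[str, str]) -> str:
    # Step 1: an explicit extension in the URL path wins.
    url_path = urlparse(url).path.lower()
    for ext in SUPPORTED_FILE_EXTENSIONS:
        if url_path.endswith(f".{ext}"):
            return ext
    # Step 2: otherwise fall back to the Content-Type header.
    content_type = headers.get("content-type", "").lower()
    if "audio/wav" in content_type:
        return "wav"
    if "audio/mp4" in content_type:
        return "mp4"
    if "audio/webm" in content_type or "video/webm" in content_type:
        return "webm"
    raise ValueError(f"Unsupported audio format for URL: {url}")
```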
@@ -99,6 +99,12 @@ image = (
)


# IMPORTANT: This function is duplicated in multiple files for deployment isolation.
# If you modify the audio format detection logic, you MUST update all copies:
# - gpu/self_hosted/app/utils.py
# - gpu/modal_deployments/reflector_transcriber.py (this file - 2 copies!)
# - gpu/modal_deployments/reflector_transcriber_parakeet.py
# - gpu/modal_deployments/reflector_diarizer.py
def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtension:
    parsed_url = urlparse(url)
    url_path = parsed_url.path
@@ -114,6 +120,8 @@ def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtens
        return AudioFileExtension("wav")
    if "audio/mp4" in content_type:
        return AudioFileExtension("mp4")
    if "audio/webm" in content_type or "video/webm" in content_type:
        return AudioFileExtension("webm")

    raise ValueError(
        f"Unsupported audio format for URL: {url}. "
@@ -316,6 +324,11 @@ class TranscriberWhisperFile:
        import numpy as np
        from silero_vad import VADIterator

        # IMPORTANT: This VAD segment logic is duplicated in multiple files for deployment isolation.
        # If you modify this function, you MUST update all copies:
        # - gpu/modal_deployments/reflector_transcriber.py (this file)
        # - gpu/modal_deployments/reflector_transcriber_parakeet.py
        # - gpu/self_hosted/app/services/transcriber.py
        def vad_segments(
            audio_array,
            sample_rate: int = SAMPLERATE,
@@ -323,6 +336,7 @@ class TranscriberWhisperFile:
        ) -> Generator[TimeSegment, None, None]:
            """Generate speech segments as TimeSegment using Silero VAD."""
            iterator = VADIterator(self.vad_model, sampling_rate=sample_rate)
            audio_duration = len(audio_array) / float(SAMPLERATE)
            start = None
            for i in range(0, len(audio_array), window_size):
                chunk = audio_array[i : i + window_size]
@@ -342,6 +356,9 @@ class TranscriberWhisperFile:
                        start / float(SAMPLERATE), end / float(SAMPLERATE)
                    )
                    start = None
            # Handle case where audio ends while speech is still active
            if start is not None:
                yield TimeSegment(start / float(SAMPLERATE), audio_duration)
            iterator.reset_states()

        upload_volume.reload()
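
The behavioral change in this hunk: Silero's `VADIterator` reports speech as separate `{"start": ...}` and `{"end": ...}` events, so when the audio ends mid-speech no end event ever arrives and the final segment used to be dropped. A toy reproduction of the flush pattern with a stubbed event stream (no model or audio needed; the dicts only mimic the iterator's output shape):

```python
SAMPLERATE = 16_000  # same constant name the deployment files use


def vad_segments_sketch(events, total_samples):
    """Yield (start_s, end_s) speech segments, flushing a trailing open segment."""
    start = None
    for event in events:
        if "start" in event:
            start = event["start"]
        elif "end" in event and start is not None:
            yield (start / SAMPLERATE, event["end"] / SAMPLERATE)
            start = None
    # Audio ended while speech was still active: emit the open segment.
    if start is not None:
        yield (start / SAMPLERATE, total_samples / SAMPLERATE)


# Speech starts at 2.0 s and the file ends at 3.0 s without an "end" event.
print(list(vad_segments_sketch([{"start": 32_000}], total_samples=48_000)))  # [(2.0, 3.0)]
```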
@@ -407,6 +424,12 @@ class TranscriberWhisperFile:
        return {"text": " ".join(all_text), "words": all_words}


# IMPORTANT: This function is duplicated in multiple files for deployment isolation.
# If you modify the audio format detection logic, you MUST update all copies:
# - gpu/self_hosted/app/utils.py
# - gpu/modal_deployments/reflector_transcriber.py (this file - 2 copies!)
# - gpu/modal_deployments/reflector_transcriber_parakeet.py
# - gpu/modal_deployments/reflector_diarizer.py
def detect_audio_format(url: str, headers: dict) -> str:
    from urllib.parse import urlparse

@@ -424,6 +447,8 @@ def detect_audio_format(url: str, headers: dict) -> str:
        return "wav"
    if "audio/mp4" in content_type:
        return "mp4"
    if "audio/webm" in content_type or "video/webm" in content_type:
        return "webm"

    raise HTTPException(
        status_code=400,
@@ -90,6 +90,12 @@ image = (
)


# IMPORTANT: This function is duplicated in multiple files for deployment isolation.
# If you modify the audio format detection logic, you MUST update all copies:
# - gpu/self_hosted/app/utils.py
# - gpu/modal_deployments/reflector_transcriber.py (2 copies)
# - gpu/modal_deployments/reflector_transcriber_parakeet.py (this file)
# - gpu/modal_deployments/reflector_diarizer.py
def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtension:
    parsed_url = urlparse(url)
    url_path = parsed_url.path
@@ -105,6 +111,8 @@ def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtens
        return AudioFileExtension("wav")
    if "audio/mp4" in content_type:
        return AudioFileExtension("mp4")
    if "audio/webm" in content_type or "video/webm" in content_type:
        return AudioFileExtension("webm")

    raise ValueError(
        f"Unsupported audio format for URL: {url}. "
@@ -301,6 +309,11 @@ class TranscriberParakeetFile:
            audio_array, sample_rate = librosa.load(file_path, sr=SAMPLERATE, mono=True)
            return audio_array

        # IMPORTANT: This VAD segment logic is duplicated in multiple files for deployment isolation.
        # If you modify this function, you MUST update all copies:
        # - gpu/modal_deployments/reflector_transcriber.py
        # - gpu/modal_deployments/reflector_transcriber_parakeet.py (this file)
        # - gpu/self_hosted/app/services/transcriber.py
        def vad_segment_generator(
            audio_array,
        ) -> Generator[TimeSegment, None, None]:
gpu/self_hosted/DEV_SETUP.md (new file, 137 lines)
@@ -0,0 +1,137 @@
# Local Development GPU Setup

Run transcription and diarization locally for development/testing.

> **For production deployment**, see the [Self-Hosted GPU Setup Guide](../../docs/docs/installation/self-hosted-gpu-setup.md).

## Prerequisites

1. **Python 3.12+** and the **uv** package manager
2. **FFmpeg** installed and on PATH
3. **HuggingFace account** with access to the pyannote models

### Accept Pyannote Licenses (Required)

Before the first run, accept the licenses for these gated models (while logged into your HuggingFace account):

- https://hf.co/pyannote/speaker-diarization-3.1
- https://hf.co/pyannote/segmentation-3.0
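
A quick way to confirm both the token and the license grants before starting the service is to load the gated pipeline once from Python (a minimal check, assuming `pyannote.audio` is installed in the service's virtualenv):

```python
from pyannote.audio import Pipeline

# Fails with an authorization error if HF_TOKEN is invalid or the model
# licenses listed above have not been accepted for this account.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token="<your-huggingface-token>",
)
print(pipeline)
```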

## Quick Start

### 1. Install dependencies

```bash
cd gpu/self_hosted
uv sync
```

### 2. Start the GPU service

```bash
cd gpu/self_hosted
HF_TOKEN=<your-huggingface-token> \
REFLECTOR_GPU_APIKEY=dev-key-12345 \
.venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000
```

Note: The `.env` file is NOT auto-loaded. Pass env vars explicitly, or use:

```bash
export HF_TOKEN=<your-token>
export REFLECTOR_GPU_APIKEY=dev-key-12345
.venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000
```

### 3. Configure Reflector to use the local GPU

Edit `server/.env`:

```bash
# Transcription - local GPU service
TRANSCRIPT_BACKEND=modal
TRANSCRIPT_URL=http://host.docker.internal:8000
TRANSCRIPT_MODAL_API_KEY=dev-key-12345

# Diarization - local GPU service
DIARIZATION_BACKEND=modal
DIARIZATION_URL=http://host.docker.internal:8000
DIARIZATION_MODAL_API_KEY=dev-key-12345
```

Note: Use `host.docker.internal` because the Reflector server runs in Docker.

### 4. Restart the Reflector server

```bash
cd server
docker compose restart server worker
```

## Testing

### Test transcription

```bash
curl -s -X POST http://localhost:8000/v1/audio/transcriptions \
  -H "Authorization: Bearer dev-key-12345" \
  -F "file=@/path/to/audio.wav" \
  -F "language=en"
```
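
The same request from Python, if a scriptable test is handier (this mirrors only what the curl call shows: bearer auth, a multipart `file` field, and a `language` field; it assumes the `requests` package is available):

```python
import requests

with open("/path/to/audio.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",
        headers={"Authorization": "Bearer dev-key-12345"},
        files={"file": f},
        data={"language": "en"},
    )
resp.raise_for_status()
print(resp.json())
```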

### Test diarization

```bash
curl -s -X POST "http://localhost:8000/diarize?audio_file_url=<audio-url>" \
  -H "Authorization: Bearer dev-key-12345"
```

## Platform Notes

### macOS (ARM)

The Docker build fails because the CUDA packages are x86_64-only. Use local Python instead:

```bash
uv sync
HF_TOKEN=xxx REFLECTOR_GPU_APIKEY=xxx .venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000
```

### Linux with NVIDIA GPU

Docker works with CUDA acceleration:

```bash
docker compose up -d
```

### CPU-only

Works on any platform, just slower. PyTorch auto-detects the hardware and falls back to CPU.
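
The fallback relies on PyTorch's standard device check; a sketch of that selection pattern (illustrative only, not the service's exact code):

```python
import torch

# Prefer CUDA, then Apple's MPS backend, otherwise run on CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Running inference on: {device}")
```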

## Switching Back to Modal.com

Edit `server/.env`:

```bash
TRANSCRIPT_BACKEND=modal
TRANSCRIPT_URL=https://monadical-sas--reflector-transcriber-parakeet-web.modal.run
TRANSCRIPT_MODAL_API_KEY=<modal-api-key>

DIARIZATION_BACKEND=modal
DIARIZATION_URL=https://monadical-sas--reflector-diarizer-web.modal.run
DIARIZATION_MODAL_API_KEY=<modal-api-key>
```

## Troubleshooting

### "Could not download pyannote pipeline"

- Accept the model licenses on HuggingFace (see Prerequisites)
- Verify that `HF_TOKEN` is set and valid

### Service won't start

- Check that port 8000 is free: `lsof -i :8000`
- Kill orphan processes if needed

### Transcription returns empty text

- Ensure the audio contains speech (not just tones or silence)
- Check that the audio format is supported (wav, mp3, etc.)

### Deprecation warnings from torchaudio/pyannote

- Safe to ignore; they don't affect functionality
@@ -56,9 +56,13 @@ Docker

- Not yet provided in this directory. A Dockerfile will be added later. For now, use Local run above

Conformance tests
# Setup

# From this directory
[SETUP.md](SETUP.md)

# Conformance tests

## From this directory

TRANSCRIPT_URL=http://localhost:8000 \
TRANSCRIPT_API_KEY=dev-key \
@@ -129,6 +129,11 @@ class WhisperService:
            audio = np.frombuffer(proc.stdout, dtype=np.float32)
            return audio

        # IMPORTANT: This VAD segment logic is duplicated in multiple files for deployment isolation.
        # If you modify this function, you MUST update all copies:
        # - gpu/modal_deployments/reflector_transcriber.py
        # - gpu/modal_deployments/reflector_transcriber_parakeet.py
        # - gpu/self_hosted/app/services/transcriber.py (this file)
        def vad_segments(
            audio_array,
            sample_rate: int = SAMPLE_RATE,
@@ -153,6 +158,10 @@ class WhisperService:
                    end = speech["end"]
                    yield (start / float(SAMPLE_RATE), end / float(SAMPLE_RATE))
                    start = None
            # Handle case where audio ends while speech is still active
            if start is not None:
                audio_duration = len(audio_array) / float(sample_rate)
                yield (start / float(SAMPLE_RATE), audio_duration)
            iterator.reset_states()

        audio_array = load_audio_via_ffmpeg(file_path, SAMPLE_RATE)
@@ -34,6 +34,12 @@ def ensure_dirs():
    UPLOADS_PATH.mkdir(parents=True, exist_ok=True)


# IMPORTANT: This function is duplicated in multiple files for deployment isolation.
# If you modify the audio format detection logic, you MUST update all copies:
# - gpu/self_hosted/app/utils.py (this file)
# - gpu/modal_deployments/reflector_transcriber.py (2 copies)
# - gpu/modal_deployments/reflector_transcriber_parakeet.py
# - gpu/modal_deployments/reflector_diarizer.py
def detect_audio_format(url: str, headers: Mapping[str, str]) -> str:
    url_path = urlparse(url).path
    for ext in SUPPORTED_FILE_EXTENSIONS:
@@ -47,6 +53,8 @@ def detect_audio_format(url: str, headers: Mapping[str, str]) -> str:
        return "wav"
    if "audio/mp4" in content_type:
        return "mp4"
    if "audio/webm" in content_type or "video/webm" in content_type:
        return "webm"

    raise HTTPException(
        status_code=400,