diff --git a/server/docs/gpu/api-transcription.md b/server/docs/gpu/api-transcription.md
new file mode 100644
index 00000000..7a15d793
--- /dev/null
+++ b/server/docs/gpu/api-transcription.md
@@ -0,0 +1,194 @@
+## Reflector GPU Transcription API (Specification)
+
+This document defines the Reflector GPU transcription API that all implementations must adhere to. Current implementations include NVIDIA Parakeet (NeMo) and Whisper (faster-whisper), both deployed on Modal.com. The API surface and response shapes are OpenAI/Whisper-compatible, so clients can switch implementations by changing only the base URL.
+
+### Base URL and Authentication
+
+- Example base URLs (Modal web endpoints):
+
+  - Parakeet: `https://<workspace>--reflector-transcriber-parakeet-web.modal.run`
+  - Whisper: `https://<workspace>--reflector-transcriber-web.modal.run`
+
+- All endpoints are served under `/v1` and require a Bearer token:
+
+```
+Authorization: Bearer <api-key>
+```
+
+Note: To switch implementations, deploy the desired variant and point `TRANSCRIPT_URL` to its base URL. The API is identical.
+
+### Supported file types
+
+`mp3, mp4, mpeg, mpga, m4a, wav, webm`
+
+### Models and languages
+
+- Parakeet (NVIDIA NeMo): default `nvidia/parakeet-tdt-0.6b-v2`
+  - Language support: only `en`. Other languages return HTTP 400.
+- Whisper (faster-whisper): default `large-v2` (or deployment-specific)
+  - Language support: multilingual (per Whisper model capabilities).
+
+Note: The `model` parameter is accepted by all implementations for interface parity. Some backends may treat it as informational.
+
+### Endpoints
+
+#### POST /v1/audio/transcriptions
+
+Transcribe one or more uploaded audio files.
+
+Request: multipart/form-data
+
+- `file` (File) — optional. Single file to transcribe.
+- `files` (File[]) — optional. One or more files to transcribe.
+- `model` (string) — optional. Defaults to the implementation-specific model (see above).
+- `language` (string) — optional, defaults to `en`.
+  - Parakeet: only `en` is accepted; other values return HTTP 400.
+  - Whisper: model-dependent; typically multilingual.
+- `batch` (boolean) — optional, defaults to `false`.
+
+Notes:
+
+- Provide either `file` or `files`, not both. If neither is provided, HTTP 400.
+- `batch` requires `files`; using `batch=true` without `files` returns HTTP 400.
+- The response shape for multiple files is the same regardless of `batch`.
+- Files sent to this endpoint are processed in a single pass (no VAD/chunking). This is intended for short clips (roughly ≤ 30s; depends on GPU memory/model). For longer audio, prefer `/v1/audio/transcriptions-from-url`, which supports VAD-based chunking.
+
+Responses
+
+Single file response:
+
+```json
+{
+  "text": "transcribed text",
+  "words": [
+    { "word": "hello", "start": 0.0, "end": 0.5 },
+    { "word": "world", "start": 0.5, "end": 1.0 }
+  ],
+  "filename": "audio.mp3"
+}
+```
+
+Multiple files response:
+
+```json
+{
+  "results": [
+    { "filename": "a1.mp3", "text": "...", "words": [...] },
+    { "filename": "a2.mp3", "text": "...", "words": [...] }
+  ]
+}
+```
+
+Notes:
+
+- Word objects always include the keys `word`, `start`, and `end`.
+- Some implementations may include a trailing space in `word` to match Whisper tokenization behavior; clients should trim if needed.
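+
+In addition to curl, the endpoint can be called from any HTTP client. The following is a minimal Python sketch, not part of the specification; it assumes the `requests` library and the same `BASE_URL` and `REFLECTOR_GPU_APIKEY` values used in the curl examples below:
+
+```python
+import os
+
+import requests
+
+# Deployment-specific values, as in the curl examples.
+BASE_URL = os.environ["BASE_URL"]
+API_KEY = os.environ["REFLECTOR_GPU_APIKEY"]
+
+with open("audio.mp3", "rb") as f:
+    response = requests.post(
+        f"{BASE_URL}/v1/audio/transcriptions",
+        headers={"Authorization": f"Bearer {API_KEY}"},
+        files={"file": ("audio.mp3", f, "audio/mpeg")},
+        data={"language": "en"},
+    )
+response.raise_for_status()
+result = response.json()
+
+# Single-file responses contain `text`, `words`, and `filename` (see above).
+print(result["text"])
+for word in result["words"]:
+    # Some backends keep Whisper's leading space on tokens; strip before display.
+    print(word["word"].strip(), word["start"], word["end"])
+```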
+ +Example curl (single file): + +```bash +curl -X POST \ + -H "Authorization: Bearer $REFLECTOR_GPU_APIKEY" \ + -F "file=@/path/to/audio.mp3" \ + -F "language=en" \ + "$BASE_URL/v1/audio/transcriptions" +``` + +Example curl (multiple files, batch): + +```bash +curl -X POST \ + -H "Authorization: Bearer $REFLECTOR_GPU_APIKEY" \ + -F "files=@/path/a1.mp3" -F "files=@/path/a2.mp3" \ + -F "batch=true" -F "language=en" \ + "$BASE_URL/v1/audio/transcriptions" +``` + +#### POST /v1/audio/transcriptions-from-url + +Transcribe a single remote audio file by URL. + +Request: application/json + +Body parameters: + +- `audio_file_url` (string) — required. URL of the audio file to transcribe. +- `model` (string) — optional. Defaults to the implementation-specific model (see above). +- `language` (string) — optional, defaults to `en`. Parakeet only accepts `en`. +- `timestamp_offset` (number) — optional, defaults to `0.0`. Added to each word's `start`/`end` in the response. + +```json +{ + "audio_file_url": "https://example.com/audio.mp3", + "model": "nvidia/parakeet-tdt-0.6b-v2", + "language": "en", + "timestamp_offset": 0.0 +} +``` + +Response: + +```json +{ + "text": "transcribed text", + "words": [ + { "word": "hello", "start": 10.0, "end": 10.5 }, + { "word": "world", "start": 10.5, "end": 11.0 } + ] +} +``` + +Notes: + +- `timestamp_offset` is added to each word’s `start`/`end` in the response. +- Implementations may perform VAD-based chunking and batching for long-form audio; word timings are adjusted accordingly. + +Example curl: + +```bash +curl -X POST \ + -H "Authorization: Bearer $REFLECTOR_GPU_APIKEY" \ + -H "Content-Type: application/json" \ + -d '{ + "audio_file_url": "https://example.com/audio.mp3", + "language": "en", + "timestamp_offset": 0 + }' \ + "$BASE_URL/v1/audio/transcriptions-from-url" +``` + +### Error handling + +- 400 Bad Request + - Parakeet: `language` other than `en` + - Missing required parameters (`file`/`files` for upload; `audio_file_url` for URL endpoint) + - Unsupported file extension +- 401 Unauthorized + - Missing or invalid Bearer token +- 404 Not Found + - `audio_file_url` does not exist + +### Implementation details + +- GPUs: A10G for small-file/live, L40S for large-file URL transcription (subject to deployment) +- VAD chunking and segment batching; word timings adjusted and overlapping ends constrained +- Pads very short segments (< 0.5s) to avoid model crashes on some backends + +### Server configuration (Reflector API) + +Set the Reflector server to use the Modal backend and point `TRANSCRIPT_URL` to your chosen deployment: + +``` +TRANSCRIPT_BACKEND=modal +TRANSCRIPT_URL=https://--reflector-transcriber-parakeet-web.modal.run +TRANSCRIPT_MODAL_API_KEY= +``` + +### Conformance tests + +Use the pytest-based conformance tests to validate any new implementation (including self-hosted) against this spec: + +``` +TRANSCRIPT_URL=https:// \ +TRANSCRIPT_MODAL_API_KEY=your-api-key \ +uv run -m pytest -m gpu_modal --no-cov server/tests/test_gpu_modal_transcript.py +``` diff --git a/server/gpu/modal_deployments/reflector_transcriber.py b/server/gpu/modal_deployments/reflector_transcriber.py index 4bbbe512..3be25542 100644 --- a/server/gpu/modal_deployments/reflector_transcriber.py +++ b/server/gpu/modal_deployments/reflector_transcriber.py @@ -1,41 +1,78 @@ import os -import tempfile +import sys import threading +import uuid +from typing import Generator, Mapping, NamedTuple, NewType, TypedDict +from urllib.parse import urlparse import modal -from pydantic import 
BaseModel - -MODELS_DIR = "/models" MODEL_NAME = "large-v2" MODEL_COMPUTE_TYPE: str = "float16" MODEL_NUM_WORKERS: int = 1 - MINUTES = 60 # seconds +SAMPLERATE = 16000 +UPLOADS_PATH = "/uploads" +CACHE_PATH = "/models" +SUPPORTED_FILE_EXTENSIONS = ["mp3", "mp4", "mpeg", "mpga", "m4a", "wav", "webm"] +VAD_CONFIG = { + "batch_max_duration": 30.0, + "silence_padding": 0.5, + "window_size": 512, +} -volume = modal.Volume.from_name("models", create_if_missing=True) + +WhisperUniqFilename = NewType("WhisperUniqFilename", str) +AudioFileExtension = NewType("AudioFileExtension", str) app = modal.App("reflector-transcriber") +model_cache = modal.Volume.from_name("models", create_if_missing=True) +upload_volume = modal.Volume.from_name("whisper-uploads", create_if_missing=True) + + +class TimeSegment(NamedTuple): + """Represents a time segment with start and end times.""" + + start: float + end: float + + +class AudioSegment(NamedTuple): + """Represents an audio segment with timing and audio data.""" + + start: float + end: float + audio: any + + +class TranscriptResult(NamedTuple): + """Represents a transcription result with text and word timings.""" + + text: str + words: list["WordTiming"] + + +class WordTiming(TypedDict): + """Represents a word with its timing information.""" + + word: str + start: float + end: float + def download_model(): from faster_whisper import download_model - volume.reload() + model_cache.reload() - download_model(MODEL_NAME, cache_dir=MODELS_DIR) + download_model(MODEL_NAME, cache_dir=CACHE_PATH) - volume.commit() + model_cache.commit() image = ( modal.Image.debian_slim(python_version="3.12") - .pip_install( - "huggingface_hub==0.27.1", - "hf-transfer==0.1.9", - "torch==2.5.1", - "faster-whisper==1.1.1", - ) .env( { "HF_HUB_ENABLE_HF_TRANSFER": "1", @@ -45,19 +82,98 @@ image = ( ), } ) - .run_function(download_model, volumes={MODELS_DIR: volume}) + .apt_install("ffmpeg") + .pip_install( + "huggingface_hub==0.27.1", + "hf-transfer==0.1.9", + "torch==2.5.1", + "faster-whisper==1.1.1", + "fastapi==0.115.12", + "requests", + "librosa==0.10.1", + "numpy<2", + "silero-vad==5.1.0", + ) + .run_function(download_model, volumes={CACHE_PATH: model_cache}) ) +def detect_audio_format(url: str, headers: Mapping[str, str]) -> AudioFileExtension: + parsed_url = urlparse(url) + url_path = parsed_url.path + + for ext in SUPPORTED_FILE_EXTENSIONS: + if url_path.lower().endswith(f".{ext}"): + return AudioFileExtension(ext) + + content_type = headers.get("content-type", "").lower() + if "audio/mpeg" in content_type or "audio/mp3" in content_type: + return AudioFileExtension("mp3") + if "audio/wav" in content_type: + return AudioFileExtension("wav") + if "audio/mp4" in content_type: + return AudioFileExtension("mp4") + + raise ValueError( + f"Unsupported audio format for URL: {url}. 
" + f"Supported extensions: {', '.join(SUPPORTED_FILE_EXTENSIONS)}" + ) + + +def download_audio_to_volume( + audio_file_url: str, +) -> tuple[WhisperUniqFilename, AudioFileExtension]: + import requests + from fastapi import HTTPException + + response = requests.head(audio_file_url, allow_redirects=True) + if response.status_code == 404: + raise HTTPException(status_code=404, detail="Audio file not found") + + response = requests.get(audio_file_url, allow_redirects=True) + response.raise_for_status() + + audio_suffix = detect_audio_format(audio_file_url, response.headers) + unique_filename = WhisperUniqFilename(f"{uuid.uuid4()}.{audio_suffix}") + file_path = f"{UPLOADS_PATH}/{unique_filename}" + + with open(file_path, "wb") as f: + f.write(response.content) + + upload_volume.commit() + return unique_filename, audio_suffix + + +def pad_audio(audio_array, sample_rate: int = SAMPLERATE): + """Add 0.5s of silence if audio is shorter than the silence_padding window. + + Whisper does not require this strictly, but aligning behavior with Parakeet + avoids edge-case crashes on extremely short inputs and makes comparisons easier. + """ + import numpy as np + + audio_duration = len(audio_array) / sample_rate + if audio_duration < VAD_CONFIG["silence_padding"]: + silence_samples = int(sample_rate * VAD_CONFIG["silence_padding"]) + silence = np.zeros(silence_samples, dtype=np.float32) + return np.concatenate([audio_array, silence]) + return audio_array + + @app.cls( gpu="A10G", timeout=5 * MINUTES, scaledown_window=5 * MINUTES, - allow_concurrent_inputs=6, image=image, - volumes={MODELS_DIR: volume}, + volumes={CACHE_PATH: model_cache, UPLOADS_PATH: upload_volume}, ) -class Transcriber: +@modal.concurrent(max_inputs=10) +class TranscriberWhisperLive: + """Live transcriber class for small audio segments (A10G). + + Mirrors the Parakeet live class API but uses Faster-Whisper under the hood. 
+ """ + @modal.enter() def enter(self): import faster_whisper @@ -71,23 +187,200 @@ class Transcriber: device=self.device, compute_type=MODEL_COMPUTE_TYPE, num_workers=MODEL_NUM_WORKERS, - download_root=MODELS_DIR, + download_root=CACHE_PATH, local_files_only=True, ) + print(f"Model is on device: {self.device}") @modal.method() def transcribe_segment( self, - audio_data: str, - audio_suffix: str, - language: str, + filename: str, + language: str = "en", ): - with tempfile.NamedTemporaryFile("wb+", suffix=f".{audio_suffix}") as fp: - fp.write(audio_data) + """Transcribe a single uploaded audio file by filename.""" + upload_volume.reload() + + file_path = f"{UPLOADS_PATH}/{filename}" + if not os.path.exists(file_path): + raise FileNotFoundError(f"File not found: {file_path}") + + with self.lock: + with NoStdStreams(): + segments, _ = self.model.transcribe( + file_path, + language=language, + beam_size=5, + word_timestamps=True, + vad_filter=True, + vad_parameters={"min_silence_duration_ms": 500}, + ) + + segments = list(segments) + text = "".join(segment.text for segment in segments).strip() + words = [ + { + "word": word.word, + "start": round(float(word.start), 2), + "end": round(float(word.end), 2), + } + for segment in segments + for word in segment.words + ] + + return {"text": text, "words": words} + + @modal.method() + def transcribe_batch( + self, + filenames: list[str], + language: str = "en", + ): + """Transcribe multiple uploaded audio files and return per-file results.""" + upload_volume.reload() + + results = [] + for filename in filenames: + file_path = f"{UPLOADS_PATH}/{filename}" + if not os.path.exists(file_path): + raise FileNotFoundError(f"Batch file not found: {file_path}") + + with self.lock: + with NoStdStreams(): + segments, _ = self.model.transcribe( + file_path, + language=language, + beam_size=5, + word_timestamps=True, + vad_filter=True, + vad_parameters={"min_silence_duration_ms": 500}, + ) + + segments = list(segments) + text = "".join(seg.text for seg in segments).strip() + words = [ + { + "word": w.word, + "start": round(float(w.start), 2), + "end": round(float(w.end), 2), + } + for seg in segments + for w in seg.words + ] + + results.append( + { + "filename": filename, + "text": text, + "words": words, + } + ) + + return results + + +@app.cls( + gpu="L40S", + timeout=15 * MINUTES, + image=image, + volumes={CACHE_PATH: model_cache, UPLOADS_PATH: upload_volume}, +) +class TranscriberWhisperFile: + """File transcriber for larger/longer audio, using VAD-driven batching (L40S).""" + + @modal.enter() + def enter(self): + import faster_whisper + import torch + from silero_vad import load_silero_vad + + self.lock = threading.Lock() + self.use_gpu = torch.cuda.is_available() + self.device = "cuda" if self.use_gpu else "cpu" + self.model = faster_whisper.WhisperModel( + MODEL_NAME, + device=self.device, + compute_type=MODEL_COMPUTE_TYPE, + num_workers=MODEL_NUM_WORKERS, + download_root=CACHE_PATH, + local_files_only=True, + ) + self.vad_model = load_silero_vad(onnx=False) + + @modal.method() + def transcribe_segment( + self, filename: str, timestamp_offset: float = 0.0, language: str = "en" + ): + import librosa + import numpy as np + from silero_vad import VADIterator + + def vad_segments( + audio_array, + sample_rate: int = SAMPLERATE, + window_size: int = VAD_CONFIG["window_size"], + ) -> Generator[TimeSegment, None, None]: + """Generate speech segments as TimeSegment using Silero VAD.""" + iterator = VADIterator(self.vad_model, sampling_rate=sample_rate) + start = 
None + for i in range(0, len(audio_array), window_size): + chunk = audio_array[i : i + window_size] + if len(chunk) < window_size: + chunk = np.pad( + chunk, (0, window_size - len(chunk)), mode="constant" + ) + speech = iterator(chunk) + if not speech: + continue + if "start" in speech: + start = speech["start"] + continue + if "end" in speech and start is not None: + end = speech["end"] + yield TimeSegment( + start / float(SAMPLERATE), end / float(SAMPLERATE) + ) + start = None + iterator.reset_states() + + upload_volume.reload() + file_path = f"{UPLOADS_PATH}/{filename}" + if not os.path.exists(file_path): + raise FileNotFoundError(f"File not found: {file_path}") + + audio_array, _sr = librosa.load(file_path, sr=SAMPLERATE, mono=True) + + # Batch segments up to ~30s windows by merging contiguous VAD segments + merged_batches: list[TimeSegment] = [] + batch_start = None + batch_end = None + max_duration = VAD_CONFIG["batch_max_duration"] + for segment in vad_segments(audio_array): + seg_start, seg_end = segment.start, segment.end + if batch_start is None: + batch_start, batch_end = seg_start, seg_end + continue + if seg_end - batch_start <= max_duration: + batch_end = seg_end + else: + merged_batches.append(TimeSegment(batch_start, batch_end)) + batch_start, batch_end = seg_start, seg_end + if batch_start is not None and batch_end is not None: + merged_batches.append(TimeSegment(batch_start, batch_end)) + + all_text = [] + all_words = [] + + for segment in merged_batches: + start_time, end_time = segment.start, segment.end + s_idx = int(start_time * SAMPLERATE) + e_idx = int(end_time * SAMPLERATE) + segment = audio_array[s_idx:e_idx] + segment = pad_audio(segment, SAMPLERATE) with self.lock: segments, _ = self.model.transcribe( - fp.name, + segment, language=language, beam_size=5, word_timestamps=True, @@ -96,66 +389,220 @@ class Transcriber: ) segments = list(segments) - text = "".join(segment.text for segment in segments) + text = "".join(seg.text for seg in segments).strip() words = [ - {"word": word.word, "start": word.start, "end": word.end} - for segment in segments - for word in segment.words + { + "word": w.word, + "start": round(float(w.start) + start_time + timestamp_offset, 2), + "end": round(float(w.end) + start_time + timestamp_offset, 2), + } + for seg in segments + for w in seg.words ] + if text: + all_text.append(text) + all_words.extend(words) - return {"text": text, "words": words} + return {"text": " ".join(all_text), "words": all_words} + + +def detect_audio_format(url: str, headers: dict) -> str: + from urllib.parse import urlparse + + from fastapi import HTTPException + + url_path = urlparse(url).path + for ext in SUPPORTED_FILE_EXTENSIONS: + if url_path.lower().endswith(f".{ext}"): + return ext + + content_type = headers.get("content-type", "").lower() + if "audio/mpeg" in content_type or "audio/mp3" in content_type: + return "mp3" + if "audio/wav" in content_type: + return "wav" + if "audio/mp4" in content_type: + return "mp4" + + raise HTTPException( + status_code=400, + detail=( + f"Unsupported audio format for URL. 
Supported extensions: {', '.join(SUPPORTED_FILE_EXTENSIONS)}" + ), + ) + + +def download_audio_to_volume(audio_file_url: str) -> tuple[str, str]: + import requests + from fastapi import HTTPException + + response = requests.head(audio_file_url, allow_redirects=True) + if response.status_code == 404: + raise HTTPException(status_code=404, detail="Audio file not found") + + response = requests.get(audio_file_url, allow_redirects=True) + response.raise_for_status() + + audio_suffix = detect_audio_format(audio_file_url, response.headers) + unique_filename = f"{uuid.uuid4()}.{audio_suffix}" + file_path = f"{UPLOADS_PATH}/{unique_filename}" + + with open(file_path, "wb") as f: + f.write(response.content) + + upload_volume.commit() + return unique_filename, audio_suffix @app.function( scaledown_window=60, - timeout=60, - allow_concurrent_inputs=40, + timeout=600, secrets=[ modal.Secret.from_name("reflector-gpu"), ], - volumes={MODELS_DIR: volume}, + volumes={CACHE_PATH: model_cache, UPLOADS_PATH: upload_volume}, + image=image, ) +@modal.concurrent(max_inputs=40) @modal.asgi_app() def web(): - from fastapi import Body, Depends, FastAPI, HTTPException, UploadFile, status + from fastapi import ( + Body, + Depends, + FastAPI, + Form, + HTTPException, + UploadFile, + status, + ) from fastapi.security import OAuth2PasswordBearer - from typing_extensions import Annotated - transcriber = Transcriber() + transcriber_live = TranscriberWhisperLive() + transcriber_file = TranscriberWhisperFile() app = FastAPI() oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token") - supported_file_types = ["mp3", "mp4", "mpeg", "mpga", "m4a", "wav", "webm"] - def apikey_auth(apikey: str = Depends(oauth2_scheme)): - if apikey != os.environ["REFLECTOR_GPU_APIKEY"]: - raise HTTPException( - status_code=status.HTTP_401_UNAUTHORIZED, - detail="Invalid API key", - headers={"WWW-Authenticate": "Bearer"}, - ) + if apikey == os.environ["REFLECTOR_GPU_APIKEY"]: + return + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid API key", + headers={"WWW-Authenticate": "Bearer"}, + ) - class TranscriptResponse(BaseModel): - result: dict + class TranscriptResponse(dict): + pass @app.post("/v1/audio/transcriptions", dependencies=[Depends(apikey_auth)]) def transcribe( - file: UploadFile, - model: str = "whisper-1", - language: Annotated[str, Body(...)] = "en", - ) -> TranscriptResponse: - audio_data = file.file.read() - audio_suffix = file.filename.split(".")[-1] - assert audio_suffix in supported_file_types + file: UploadFile = None, + files: list[UploadFile] | None = None, + model: str = Form(MODEL_NAME), + language: str = Form("en"), + batch: bool = Form(False), + ): + if not file and not files: + raise HTTPException( + status_code=400, detail="Either 'file' or 'files' parameter is required" + ) + if batch and not files: + raise HTTPException( + status_code=400, detail="Batch transcription requires 'files'" + ) - func = transcriber.transcribe_segment.spawn( - audio_data=audio_data, - audio_suffix=audio_suffix, - language=language, - ) - result = func.get() - return result + upload_files = [file] if file else files + + uploaded_filenames: list[str] = [] + for upload_file in upload_files: + audio_suffix = upload_file.filename.split(".")[-1] + if audio_suffix not in SUPPORTED_FILE_EXTENSIONS: + raise HTTPException( + status_code=400, + detail=( + f"Unsupported audio format. 
Supported extensions: {', '.join(SUPPORTED_FILE_EXTENSIONS)}"
+                    ),
+                )
+
+            unique_filename = f"{uuid.uuid4()}.{audio_suffix}"
+            file_path = f"{UPLOADS_PATH}/{unique_filename}"
+            with open(file_path, "wb") as f:
+                content = upload_file.file.read()
+                f.write(content)
+            uploaded_filenames.append(unique_filename)
+
+        upload_volume.commit()
+
+        try:
+            if batch and len(upload_files) > 1:
+                func = transcriber_live.transcribe_batch.spawn(
+                    filenames=uploaded_filenames,
+                    language=language,
+                )
+                results = func.get()
+                return {"results": results}
+
+            results = []
+            for filename in uploaded_filenames:
+                func = transcriber_live.transcribe_segment.spawn(
+                    filename=filename,
+                    language=language,
+                )
+                result = func.get()
+                result["filename"] = filename
+                results.append(result)
+
+            return {"results": results} if len(results) > 1 else results[0]
+        finally:
+            # Clean up uploaded files from the shared volume once transcription is done.
+            for filename in uploaded_filenames:
+                try:
+                    file_path = f"{UPLOADS_PATH}/{filename}"
+                    os.remove(file_path)
+                except Exception:
+                    pass
+            upload_volume.commit()
+
+    @app.post("/v1/audio/transcriptions-from-url", dependencies=[Depends(apikey_auth)])
+    def transcribe_from_url(
+        audio_file_url: str = Body(
+            ..., description="URL of the audio file to transcribe"
+        ),
+        model: str = Body(MODEL_NAME),
+        language: str = Body("en"),
+        timestamp_offset: float = Body(0.0),
+    ):
+        unique_filename, _audio_suffix = download_audio_to_volume(audio_file_url)
+        try:
+            func = transcriber_file.transcribe_segment.spawn(
+                filename=unique_filename,
+                timestamp_offset=timestamp_offset,
+                language=language,
+            )
+            result = func.get()
+            return result
+        finally:
+            # Best-effort cleanup of the downloaded file.
+            try:
+                file_path = f"{UPLOADS_PATH}/{unique_filename}"
+                os.remove(file_path)
+                upload_volume.commit()
+            except Exception:
+                pass
     return app
+
+
+class NoStdStreams:
+    """Temporarily silence stdout/stderr (used around faster-whisper transcribe calls)."""
+
+    def __init__(self):
+        self.devnull = open(os.devnull, "w")
+
+    def __enter__(self):
+        self._stdout, self._stderr = sys.stdout, sys.stderr
+        self._stdout.flush()
+        self._stderr.flush()
+        sys.stdout, sys.stderr = self.devnull, self.devnull
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        sys.stdout, sys.stderr = self._stdout, self._stderr
+        self.devnull.close()
diff --git a/server/tests/test_gpu_modal_transcript.py b/server/tests/test_gpu_modal_transcript.py
index 9b37fbe6..9a152185 100644
--- a/server/tests/test_gpu_modal_transcript.py
+++ b/server/tests/test_gpu_modal_transcript.py
@@ -272,6 +272,9 @@ class TestGPUModalTranscript:
         for f in temp_files:
             Path(f).unlink(missing_ok=True)
 
+    @pytest.mark.skipif(
+        "parakeet" not in get_model_name(), reason="Parakeet only supports English"
+    )
     def test_transcriptions_error_handling(self):
         """Test error handling for invalid requests."""
         url = get_modal_transcript_url()