gpu: improve concurrency on modal - coauthored with Gokul (#286)
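The same pattern is applied to all three GPU backends (LLM, transcriber, translator): raise Modal's `allow_concurrent_inputs` so one container can hold several queued requests, switch the FastAPI handlers from `async def` to plain `def`, and serialize the actual GPU work behind a `threading.Lock`. A minimal sketch of the container-side pattern, using the Modal API of that era (`Stub`, `@stub.cls`) with hypothetical app/class names, not the project's actual code:

```python
import threading

import modal
from modal import method

stub = modal.Stub("concurrency-sketch")  # hypothetical app name

@stub.cls(
    gpu="A10G",
    timeout=60 * 5,
    container_idle_timeout=60 * 5,
    allow_concurrent_inputs=10,  # one container may hold 10 requests at once
)
class Model:
    def __enter__(self):
        # real model loading elided; the lock is the important part
        self.lock = threading.Lock()

    @method()
    def generate(self, prompt: str) -> str:
        # many inputs can be in flight, but only one runs on the GPU
        with self.lock:
            return prompt.upper()  # stand-in for the actual model call
```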
```diff
@@ -5,6 +5,7 @@ Reflector GPU backend - LLM
 """
 import json
 import os
+import threading
 from typing import Optional
 
 import modal
@@ -67,7 +68,7 @@ llm_image = (
     gpu="A100",
     timeout=60 * 5,
     container_idle_timeout=60 * 5,
-    concurrency_limit=2,
+    allow_concurrent_inputs=15,
     image=llm_image,
 )
 class LLM:
@@ -108,6 +109,8 @@ class LLM:
         self.gen_cfg = gen_cfg
         self.GenerationConfig = GenerationConfig
+
+        self.lock = threading.Lock()
 
     def __exit__(self, *args):
         print("Exit llm")
 
@@ -123,6 +126,7 @@ class LLM:
         gen_cfg = self.gen_cfg
 
         # If a gen_schema is given, conform to gen_schema
+        with self.lock:
             if gen_schema:
                 import jsonformer
 
@@ -158,6 +162,7 @@ class LLM:
 @stub.function(
     container_idle_timeout=60 * 10,
     timeout=60 * 5,
+    allow_concurrent_inputs=45,
     secrets=[
         Secret.from_name("reflector-gpu"),
     ],
@@ -187,7 +192,7 @@ def web():
         gen_cfg: Optional[dict] = None
 
     @app.post("/llm", dependencies=[Depends(apikey_auth)])
-    async def llm(
+    def llm(
         req: LLMRequest,
     ):
         gen_schema = json.dumps(req.gen_schema) if req.gen_schema else None
```
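Note the `async def llm(` → `def llm(` change above: FastAPI dispatches plain `def` endpoints to a threadpool, so a handler that blocks while waiting on the Modal call ties up only its worker thread instead of the shared event loop. A rough sketch of the difference (endpoint body and the `run_model` helper are assumptions for illustration, not the project's code):

```python
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

class LLMRequest(BaseModel):
    prompt: str

def run_model(prompt: str) -> str:
    # stand-in for spawning the Modal function and blocking on its result
    return prompt[::-1]

# Before: async def llm(...) ran on the event loop, so a blocking
# wait on the GPU call stalled every other request in the process.
# After: a plain def endpoint runs in FastAPI's threadpool, so each
# blocked request occupies only one worker thread.
@app.post("/llm")
def llm(req: LLMRequest):
    return {"text": run_model(req.prompt)}
```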
```diff
@@ -5,6 +5,7 @@ Reflector GPU backend - LLM
 """
 import json
 import os
+import threading
 from typing import Optional
 
 import modal
@@ -67,7 +68,7 @@ llm_image = (
     gpu="A10G",
     timeout=60 * 5,
     container_idle_timeout=60 * 5,
-    concurrency_limit=2,
+    allow_concurrent_inputs=10,
     image=llm_image,
 )
 class LLM:
@@ -111,6 +112,7 @@ class LLM:
         self.tokenizer = tokenizer
         self.gen_cfg = gen_cfg
         self.GenerationConfig = GenerationConfig
+        self.lock = threading.Lock()
 
     def __exit__(self, *args):
         print("Exit llm")
@@ -129,6 +131,7 @@ class LLM:
         gen_cfg = self.gen_cfg
 
         # If a gen_schema is given, conform to gen_schema
+        with self.lock:
             if gen_schema:
                 import jsonformer
 
@@ -167,6 +170,7 @@ class LLM:
 @stub.function(
     container_idle_timeout=60 * 10,
     timeout=60 * 5,
+    allow_concurrent_inputs=30,
     secrets=[
         Secret.from_name("reflector-gpu"),
     ],
@@ -196,7 +200,7 @@ def web():
         gen_cfg: Optional[dict] = None
 
     @app.post("/llm", dependencies=[Depends(apikey_auth)])
-    async def llm(
+    def llm(
        req: LLMRequest,
     ):
         gen_schema = json.dumps(req.gen_schema) if req.gen_schema else None
```
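These first two files are the same change applied to two LLM deployments: the A100-backed one takes `allow_concurrent_inputs=15` on the container and 45 on the web function, while the A10G-backed one is tuned down to 10 and 30, so the queue depth tracks the hardware.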
```diff
@@ -5,6 +5,7 @@ Reflector GPU backend - transcriber
 
 import os
 import tempfile
+import threading
 
 from modal import Image, Secret, Stub, asgi_app, method
 from pydantic import BaseModel
@@ -78,6 +79,7 @@ transcriber_image = (
     gpu="A10G",
     timeout=60 * 5,
     container_idle_timeout=60 * 5,
+    allow_concurrent_inputs=6,
     image=transcriber_image,
 )
 class Transcriber:
@@ -85,6 +87,7 @@ class Transcriber:
         import faster_whisper
         import torch
 
+        self.lock = threading.Lock()
         self.use_gpu = torch.cuda.is_available()
         self.device = "cuda" if self.use_gpu else "cpu"
         self.model = faster_whisper.WhisperModel(
@@ -106,6 +109,7 @@ class Transcriber:
         with tempfile.NamedTemporaryFile("wb+", suffix=f".{audio_suffix}") as fp:
             fp.write(audio_data)
 
+            with self.lock:
                 segments, _ = self.model.transcribe(
                     fp.name,
                     language=source_language,
@@ -147,6 +151,7 @@ class Transcriber:
 @stub.function(
     container_idle_timeout=60,
     timeout=60,
+    allow_concurrent_inputs=40,
     secrets=[
         Secret.from_name("reflector-gpu"),
     ],
@@ -176,12 +181,12 @@ def web():
         result: dict
 
     @app.post("/transcribe", dependencies=[Depends(apikey_auth)])
-    async def transcribe(
+    def transcribe(
         file: UploadFile,
         source_language: Annotated[str, Body(...)] = "en",
         timestamp: Annotated[float, Body()] = 0.0
     ) -> TranscriptResponse:
-        audio_data = await file.read()
+        audio_data = file.file.read()
         audio_suffix = file.filename.split(".")[-1]
         assert audio_suffix in supported_audio_file_types
 
```
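One more detail in the transcriber hunk: once `transcribe` is a plain `def`, `await file.read()` is no longer possible, so the handler reads from the file object underneath. `UploadFile.file` is the synchronous `SpooledTemporaryFile` beneath Starlette's async wrapper; a minimal standalone example:

```python
from fastapi import FastAPI, UploadFile

app = FastAPI()

@app.post("/upload")
def upload(file: UploadFile):
    # UploadFile.read() is a coroutine and can't be used in a sync
    # endpoint; file.file is the plain SpooledTemporaryFile underneath.
    data = file.file.read()
    return {"filename": file.filename, "size": len(data)}
```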
```diff
@@ -4,7 +4,7 @@ Reflector GPU backend - transcriber
 """
 
 import os
-import tempfile
+import threading
 
 from modal import Image, Secret, Stub, asgi_app, method
 from pydantic import BaseModel
@@ -129,6 +129,7 @@ transcriber_image = (
     gpu="A10G",
     timeout=60 * 5,
     container_idle_timeout=60 * 5,
+    allow_concurrent_inputs=4,
     image=transcriber_image,
 )
 class Translator:
@@ -136,6 +137,7 @@ class Translator:
         import torch
         from seamless_communication.models.inference.translator import Translator
 
+        self.lock = threading.Lock()
         self.use_gpu = torch.cuda.is_available()
         self.device = "cuda" if self.use_gpu else "cpu"
         self.translator = Translator(
@@ -168,6 +170,7 @@ class Translator:
         source_language: str,
         target_language: str
     ):
+        with self.lock:
             translated_text, _, _ = self.translator.predict(
                 text,
                 "t2tt",
@@ -189,6 +192,7 @@ class Translator:
 @stub.function(
     container_idle_timeout=60,
     timeout=60,
+    allow_concurrent_inputs=40,
     secrets=[
         Secret.from_name("reflector-gpu"),
     ],
@@ -217,7 +221,7 @@ def web():
         result: dict
 
     @app.post("/translate", dependencies=[Depends(apikey_auth)])
-    async def translate(
+    def translate(
         text: str,
         source_language: Annotated[str, Body(...)] = "en",
         target_language: Annotated[str, Body(...)] = "fr",
@@ -230,8 +234,4 @@ def web():
         result = func.get()
         return result
 
-    @app.post("/warmup", dependencies=[Depends(apikey_auth)])
-    async def warmup():
-        return translatorstub.warmup.spawn().get()
-
     return app
```
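Taken together: `allow_concurrent_inputs` lets several requests enter one container, and the `threading.Lock` added to each model class ensures only one of them touches the GPU at a time while the rest wait; the translator's `/warmup` endpoint is dropped in the last hunk. A toy demonstration of that serialization (plain Python, no Modal):

```python
import threading
import time

lock = threading.Lock()

def fake_inference(i: int) -> None:
    # stand-in for model.transcribe / translator.predict: requests
    # arrive concurrently but run the "GPU" section one at a time
    with lock:
        time.sleep(0.1)
        print(f"request {i} finished")

threads = [threading.Thread(target=fake_inference, args=(i,)) for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```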