gpu: improve concurrency on modal - coauthored with Gokul (#286)

2026-04-26 15:15:19 +00:00 · 2023-10-13 21:15:57 +02:00
parent 1d92d43fe0
commit 6c1869b79a
4 changed files with 86 additions and 72 deletions
--- a/server/gpu/modal/reflector_llm.py
+++ b/server/gpu/modal/reflector_llm.py
@@ -5,6 +5,7 @@ Reflector GPU backend - LLM
 """
 import json
 import os
 import threading
 from typing import Optional
 import modal
@@ -67,7 +68,7 @@ llm_image = (
    gpu="A100",
    timeout=60 * 5,
    container_idle_timeout=60 * 5,
-    concurrency_limit=2,
+    allow_concurrent_inputs=15,
    image=llm_image,
 )
 class LLM:
@@ -108,6 +109,8 @@ class LLM:
        self.gen_cfg = gen_cfg
        self.GenerationConfig = GenerationConfig
        self.lock = threading.Lock()
    def __exit__(self, *args):
        print("Exit llm")
@@ -123,6 +126,7 @@ class LLM:
            gen_cfg = self.gen_cfg
        # If a gen_schema is given, conform to gen_schema
        with self.lock:
            if gen_schema:
                import jsonformer
@@ -158,6 +162,7 @@ class LLM:
@stub.function(
    container_idle_timeout=60 * 10,
    timeout=60 * 5,
    allow_concurrent_inputs=45,
    secrets=[
        Secret.from_name("reflector-gpu"),
    ],
@@ -187,7 +192,7 @@ def web():
        gen_cfg: Optional[dict] = None
    @app.post("/llm", dependencies=[Depends(apikey_auth)])
-    async def llm(
+    def llm(
        req: LLMRequest,
    ):
        gen_schema = json.dumps(req.gen_schema) if req.gen_schema else None
--- a/server/gpu/modal/reflector_llm_zephyr.py
+++ b/server/gpu/modal/reflector_llm_zephyr.py
@@ -5,6 +5,7 @@ Reflector GPU backend - LLM
 """
 import json
 import os
 import threading
 from typing import Optional
 import modal
@@ -67,7 +68,7 @@ llm_image = (
    gpu="A10G",
    timeout=60 * 5,
    container_idle_timeout=60 * 5,
-    concurrency_limit=2,
+    allow_concurrent_inputs=10,
    image=llm_image,
 )
 class LLM:
@@ -111,6 +112,7 @@ class LLM:
        self.tokenizer = tokenizer
        self.gen_cfg = gen_cfg
        self.GenerationConfig = GenerationConfig
        self.lock = threading.Lock()
    def __exit__(self, *args):
        print("Exit llm")
@@ -129,6 +131,7 @@ class LLM:
            gen_cfg = self.gen_cfg
        # If a gen_schema is given, conform to gen_schema
        with self.lock:
            if gen_schema:
                import jsonformer
@@ -167,6 +170,7 @@ class LLM:
@stub.function(
    container_idle_timeout=60 * 10,
    timeout=60 * 5,
    allow_concurrent_inputs=30,
    secrets=[
        Secret.from_name("reflector-gpu"),
    ],
@@ -196,7 +200,7 @@ def web():
        gen_cfg: Optional[dict] = None
    @app.post("/llm", dependencies=[Depends(apikey_auth)])
-    async def llm(
+    def llm(
            req: LLMRequest,
    ):
        gen_schema = json.dumps(req.gen_schema) if req.gen_schema else None
--- a/server/gpu/modal/reflector_transcriber.py
+++ b/server/gpu/modal/reflector_transcriber.py
@@ -5,6 +5,7 @@ Reflector GPU backend - transcriber
 import os
 import tempfile
 import threading
 from modal import Image, Secret, Stub, asgi_app, method
 from pydantic import BaseModel
@@ -78,6 +79,7 @@ transcriber_image = (
    gpu="A10G",
    timeout=60 * 5,
    container_idle_timeout=60 * 5,
    allow_concurrent_inputs=6,
    image=transcriber_image,
 )
 class Transcriber:
@@ -85,6 +87,7 @@ class Transcriber:
        import faster_whisper
        import torch
        self.lock = threading.Lock()
        self.use_gpu = torch.cuda.is_available()
        self.device = "cuda" if self.use_gpu else "cpu"
        self.model = faster_whisper.WhisperModel(
@@ -106,6 +109,7 @@ class Transcriber:
        with tempfile.NamedTemporaryFile("wb+", suffix=f".{audio_suffix}") as fp:
            fp.write(audio_data)
            with self.lock:
                segments, _ = self.model.transcribe(
                    fp.name,
                    language=source_language,
@@ -147,6 +151,7 @@ class Transcriber:
@stub.function(
    container_idle_timeout=60,
    timeout=60,
    allow_concurrent_inputs=40,
    secrets=[
        Secret.from_name("reflector-gpu"),
    ],
@@ -176,12 +181,12 @@ def web():
        result: dict
    @app.post("/transcribe", dependencies=[Depends(apikey_auth)])
-    async def transcribe(
+    def transcribe(
        file: UploadFile,
        source_language: Annotated[str, Body(...)] = "en",
        timestamp: Annotated[float, Body()] = 0.0
    ) -> TranscriptResponse:
-        audio_data = await file.read()
+        audio_data = file.file.read()
        audio_suffix = file.filename.split(".")[-1]
        assert audio_suffix in supported_audio_file_types
--- a/server/gpu/modal/reflector_translator.py
+++ b/server/gpu/modal/reflector_translator.py
@@ -4,7 +4,7 @@ Reflector GPU backend - transcriber
 """
 import os
-import tempfile
+import threading
 from modal import Image, Secret, Stub, asgi_app, method
 from pydantic import BaseModel
@@ -129,6 +129,7 @@ transcriber_image = (
    gpu="A10G",
    timeout=60 * 5,
    container_idle_timeout=60 * 5,
    allow_concurrent_inputs=4,
    image=transcriber_image,
 )
 class Translator:
@@ -136,6 +137,7 @@ class Translator:
        import torch
        from seamless_communication.models.inference.translator import Translator
        self.lock = threading.Lock()
        self.use_gpu = torch.cuda.is_available()
        self.device = "cuda" if self.use_gpu else "cpu"
        self.translator = Translator(
@@ -168,6 +170,7 @@ class Translator:
            source_language: str,
            target_language: str
    ):
        with self.lock:
            translated_text, _, _ = self.translator.predict(
                text,
                "t2tt",
@@ -189,6 +192,7 @@ class Translator:
@stub.function(
    container_idle_timeout=60,
    timeout=60,
    allow_concurrent_inputs=40,
    secrets=[
        Secret.from_name("reflector-gpu"),
    ],
@@ -217,7 +221,7 @@ def web():
        result: dict
    @app.post("/translate", dependencies=[Depends(apikey_auth)])
-    async def translate(
+    def translate(
            text: str,
            source_language: Annotated[str, Body(...)] = "en",
            target_language: Annotated[str, Body(...)] = "fr",
@@ -230,8 +234,4 @@ def web():
        result = func.get()
        return result
    @app.post("/warmup", dependencies=[Depends(apikey_auth)])
    def warmup():
        """Trigger the translator's warmup function and return its result.

        Declared as a plain ``def`` rather than ``async def``: the body waits
        synchronously on ``.get()``, and a blocking wait inside a coroutine
        would hold up the event loop for every other in-flight request.
        FastAPI runs synchronous handlers in a worker thread instead, which
        is the same approach the ``/translate`` handler uses.

        Returns:
            Whatever ``translatorstub.warmup`` returns for the spawned call.
        """
        # spawn() starts the remote call; get() waits for it and yields its value.
        warmup_call = translatorstub.warmup.spawn()
        return warmup_call.get()
    return app