From 6c1869b79ac2487c3906a12e05b4d3eb1bba7e9d Mon Sep 17 00:00:00 2001
From: Mathieu Virbel
Date: Fri, 13 Oct 2023 21:15:57 +0200
Subject: [PATCH] gpu: improve concurrency on modal - coauthored with Gokul (#286)

---
 server/gpu/modal/reflector_llm.py         | 51 +++++++++++----------
 server/gpu/modal/reflector_llm_zephyr.py  | 56 ++++++++++++-----------
 server/gpu/modal/reflector_transcriber.py | 25 ++++++----
 server/gpu/modal/reflector_translator.py  | 26 +++++------
 4 files changed, 86 insertions(+), 72 deletions(-)

diff --git a/server/gpu/modal/reflector_llm.py b/server/gpu/modal/reflector_llm.py
index 7d7bb57d..02feedb7 100644
--- a/server/gpu/modal/reflector_llm.py
+++ b/server/gpu/modal/reflector_llm.py
@@ -5,6 +5,7 @@ Reflector GPU backend - LLM
 """
 import json
 import os
+import threading
 from typing import Optional
 
 import modal
@@ -67,7 +68,7 @@ llm_image = (
     gpu="A100",
     timeout=60 * 5,
     container_idle_timeout=60 * 5,
-    concurrency_limit=2,
+    allow_concurrent_inputs=15,
     image=llm_image,
 )
 class LLM:
@@ -108,6 +109,8 @@ class LLM:
         self.gen_cfg = gen_cfg
         self.GenerationConfig = GenerationConfig
 
+        self.lock = threading.Lock()
+
     def __exit__(self, *args):
         print("Exit llm")
 
@@ -123,30 +126,31 @@ class LLM:
         gen_cfg = self.gen_cfg
 
         # If a gen_schema is given, conform to gen_schema
-        if gen_schema:
-            import jsonformer
+        with self.lock:
+            if gen_schema:
+                import jsonformer
 
-            print(f"Schema {gen_schema=}")
-            jsonformer_llm = jsonformer.Jsonformer(
-                model=self.model,
-                tokenizer=self.tokenizer,
-                json_schema=json.loads(gen_schema),
-                prompt=prompt,
-                max_string_token_length=gen_cfg.max_new_tokens
-            )
-            response = jsonformer_llm()
-        else:
-            # If no gen_schema, perform prompt only generation
+                print(f"Schema {gen_schema=}")
+                jsonformer_llm = jsonformer.Jsonformer(
+                    model=self.model,
+                    tokenizer=self.tokenizer,
+                    json_schema=json.loads(gen_schema),
+                    prompt=prompt,
+                    max_string_token_length=gen_cfg.max_new_tokens
+                )
+                response = jsonformer_llm()
+            else:
+                # If no gen_schema, perform prompt only generation
 
-            # tokenize prompt
-            input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(
-                self.model.device
-            )
-            output = self.model.generate(input_ids, generation_config=gen_cfg)
+                # tokenize prompt
+                input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(
+                    self.model.device
+                )
+                output = self.model.generate(input_ids, generation_config=gen_cfg)
 
-            # decode output
-            response = self.tokenizer.decode(output[0].cpu(), skip_special_tokens=True)
-            response = response[len(prompt):]
+                # decode output
+                response = self.tokenizer.decode(output[0].cpu(), skip_special_tokens=True)
+                response = response[len(prompt):]
 
         print(f"Generated {response=}")
         return {"text": response}
@@ -158,6 +162,7 @@ class LLM:
 @stub.function(
     container_idle_timeout=60 * 10,
     timeout=60 * 5,
+    allow_concurrent_inputs=45,
     secrets=[
         Secret.from_name("reflector-gpu"),
     ],
@@ -187,7 +192,7 @@ def web():
         gen_cfg: Optional[dict] = None
 
     @app.post("/llm", dependencies=[Depends(apikey_auth)])
-    async def llm(
+    def llm(
         req: LLMRequest,
     ):
         gen_schema = json.dumps(req.gen_schema) if req.gen_schema else None
diff --git a/server/gpu/modal/reflector_llm_zephyr.py b/server/gpu/modal/reflector_llm_zephyr.py
index 1000de4e..cbb436b0 100644
--- a/server/gpu/modal/reflector_llm_zephyr.py
+++ b/server/gpu/modal/reflector_llm_zephyr.py
@@ -5,6 +5,7 @@ Reflector GPU backend - LLM
 """
 import json
 import os
+import threading
 from typing import Optional
 
 import modal
@@ -67,7 +68,7 @@ llm_image = (
     gpu="A10G",
     timeout=60 * 5,
     container_idle_timeout=60 * 5,
-    concurrency_limit=2,
+    allow_concurrent_inputs=10,
     image=llm_image,
 )
 class LLM:
@@ -111,6 +112,7 @@ class LLM:
         self.tokenizer = tokenizer
         self.gen_cfg = gen_cfg
         self.GenerationConfig = GenerationConfig
+        self.lock = threading.Lock()
 
     def __exit__(self, *args):
         print("Exit llm")
@@ -129,33 +131,34 @@ class LLM:
         gen_cfg = self.gen_cfg
 
         # If a gen_schema is given, conform to gen_schema
-        if gen_schema:
-            import jsonformer
+        with self.lock:
+            if gen_schema:
+                import jsonformer
 
-            print(f"Schema {gen_schema=}")
-            jsonformer_llm = jsonformer.Jsonformer(
-                model=self.model,
-                tokenizer=self.tokenizer,
-                json_schema=json.loads(gen_schema),
-                prompt=prompt,
-                max_string_token_length=gen_cfg.max_new_tokens
-            )
-            response = jsonformer_llm()
-        else:
-            # If no gen_schema, perform prompt only generation
+                print(f"Schema {gen_schema=}")
+                jsonformer_llm = jsonformer.Jsonformer(
+                    model=self.model,
+                    tokenizer=self.tokenizer,
+                    json_schema=json.loads(gen_schema),
+                    prompt=prompt,
+                    max_string_token_length=gen_cfg.max_new_tokens
+                )
+                response = jsonformer_llm()
+            else:
+                # If no gen_schema, perform prompt only generation
 
-            # tokenize prompt
-            input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(
-                self.model.device
-            )
-            output = self.model.generate(input_ids, generation_config=gen_cfg)
+                # tokenize prompt
+                input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(
+                    self.model.device
+                )
+                output = self.model.generate(input_ids, generation_config=gen_cfg)
 
-            # decode output
-            response = self.tokenizer.decode(output[0].cpu(), skip_special_tokens=True)
-            response = response[len(prompt):]
-            response = {
-                "long_summary": response
-            }
+                # decode output
+                response = self.tokenizer.decode(output[0].cpu(), skip_special_tokens=True)
+                response = response[len(prompt):]
+                response = {
+                    "long_summary": response
+                }
 
         print(f"Generated {response=}")
         return {"text": response}
@@ -167,6 +170,7 @@ class LLM:
 @stub.function(
     container_idle_timeout=60 * 10,
     timeout=60 * 5,
+    allow_concurrent_inputs=30,
     secrets=[
         Secret.from_name("reflector-gpu"),
     ],
@@ -196,7 +200,7 @@ def web():
         gen_cfg: Optional[dict] = None
 
     @app.post("/llm", dependencies=[Depends(apikey_auth)])
-    async def llm(
+    def llm(
         req: LLMRequest,
     ):
         gen_schema = json.dumps(req.gen_schema) if req.gen_schema else None
diff --git a/server/gpu/modal/reflector_transcriber.py b/server/gpu/modal/reflector_transcriber.py
index 69558c8e..bee9ccd1 100644
--- a/server/gpu/modal/reflector_transcriber.py
+++ b/server/gpu/modal/reflector_transcriber.py
@@ -5,6 +5,7 @@ Reflector GPU backend - transcriber
 
 import os
 import tempfile
+import threading
 
 from modal import Image, Secret, Stub, asgi_app, method
 from pydantic import BaseModel
@@ -78,6 +79,7 @@ transcriber_image = (
     gpu="A10G",
     timeout=60 * 5,
     container_idle_timeout=60 * 5,
+    allow_concurrent_inputs=6,
     image=transcriber_image,
 )
 class Transcriber:
@@ -85,6 +87,7 @@ class Transcriber:
         import faster_whisper
         import torch
 
+        self.lock = threading.Lock()
         self.use_gpu = torch.cuda.is_available()
         self.device = "cuda" if self.use_gpu else "cpu"
         self.model = faster_whisper.WhisperModel(
@@ -106,14 +109,15 @@ class Transcriber:
 
         with tempfile.NamedTemporaryFile("wb+", suffix=f".{audio_suffix}") as fp:
             fp.write(audio_data)
-            segments, _ = self.model.transcribe(
-                fp.name,
-                language=source_language,
-                beam_size=5,
-                word_timestamps=True,
-                vad_filter=True,
-                vad_parameters={"min_silence_duration_ms": 500},
-            )
+            with self.lock:
+                segments, _ = self.model.transcribe(
+                    fp.name,
+                    language=source_language,
+                    beam_size=5,
+                    word_timestamps=True,
+                    vad_filter=True,
+                    vad_parameters={"min_silence_duration_ms": 500},
+                )
 
         multilingual_transcript = {}
         transcript_source_lang = ""
@@ -147,6 +151,7 @@ class Transcriber:
 @stub.function(
     container_idle_timeout=60,
     timeout=60,
+    allow_concurrent_inputs=40,
     secrets=[
         Secret.from_name("reflector-gpu"),
     ],
@@ -176,12 +181,12 @@ def web():
         result: dict
 
     @app.post("/transcribe", dependencies=[Depends(apikey_auth)])
-    async def transcribe(
+    def transcribe(
         file: UploadFile,
         source_language: Annotated[str, Body(...)] = "en",
         timestamp: Annotated[float, Body()] = 0.0
     ) -> TranscriptResponse:
-        audio_data = await file.read()
+        audio_data = file.file.read()
         audio_suffix = file.filename.split(".")[-1]
         assert audio_suffix in supported_audio_file_types
 
diff --git a/server/gpu/modal/reflector_translator.py b/server/gpu/modal/reflector_translator.py
index 69ea719a..6b035174 100644
--- a/server/gpu/modal/reflector_translator.py
+++ b/server/gpu/modal/reflector_translator.py
@@ -4,7 +4,7 @@ Reflector GPU backend - transcriber
 """
 
 import os
-import tempfile
+import threading
 
 from modal import Image, Secret, Stub, asgi_app, method
 from pydantic import BaseModel
@@ -129,6 +129,7 @@ transcriber_image = (
     gpu="A10G",
     timeout=60 * 5,
     container_idle_timeout=60 * 5,
+    allow_concurrent_inputs=4,
     image=transcriber_image,
 )
 class Translator:
@@ -136,6 +137,7 @@ class Translator:
         import torch
         from seamless_communication.models.inference.translator import Translator
 
+        self.lock = threading.Lock()
         self.use_gpu = torch.cuda.is_available()
         self.device = "cuda" if self.use_gpu else "cpu"
         self.translator = Translator(
@@ -168,13 +170,14 @@ class Translator:
         source_language: str,
         target_language: str
     ):
-        translated_text, _, _ = self.translator.predict(
-            text,
-            "t2tt",
-            src_lang=self.get_seamless_lang_code(source_language),
-            tgt_lang=self.get_seamless_lang_code(target_language),
-            ngram_filtering=True
-        )
+        with self.lock:
+            translated_text, _, _ = self.translator.predict(
+                text,
+                "t2tt",
+                src_lang=self.get_seamless_lang_code(source_language),
+                tgt_lang=self.get_seamless_lang_code(target_language),
+                ngram_filtering=True
+            )
         return {
             "text": {
                 source_language: text,
@@ -189,6 +192,7 @@ class Translator:
 @stub.function(
     container_idle_timeout=60,
     timeout=60,
+    allow_concurrent_inputs=40,
     secrets=[
         Secret.from_name("reflector-gpu"),
     ],
@@ -217,7 +221,7 @@ def web():
         result: dict
 
     @app.post("/translate", dependencies=[Depends(apikey_auth)])
-    async def translate(
+    def translate(
         text: str,
         source_language: Annotated[str, Body(...)] = "en",
         target_language: Annotated[str, Body(...)] = "fr",
@@ -230,8 +234,4 @@ def web():
         result = func.get()
         return result
 
-    @app.post("/warmup", dependencies=[Depends(apikey_auth)])
-    async def warmup():
-        return translatorstub.warmup.spawn().get()
-
     return app