diff --git a/server/gpu/modal_deployments/reflector_diarizer.py b/server/gpu/modal_deployments/reflector_diarizer.py
index 0f9178f4..639b983e 100644
--- a/server/gpu/modal_deployments/reflector_diarizer.py
+++ b/server/gpu/modal_deployments/reflector_diarizer.py
@@ -72,7 +72,7 @@ diarizer_image = (
 @app.cls(
     gpu=modal.gpu.A100(size="40GB"),
     timeout=60 * 30,
-    container_idle_timeout=60,
+    scaledown_window=60,
     allow_concurrent_inputs=1,
     image=diarizer_image,
 )
@@ -126,7 +126,7 @@ class Diarizer:
 
 @app.function(
     timeout=60 * 10,
-    container_idle_timeout=60 * 3,
+    scaledown_window=60 * 3,
     allow_concurrent_inputs=40,
     secrets=[
         Secret.from_name("reflector-gpu"),
diff --git a/server/gpu/modal_deployments/reflector_llm.py b/server/gpu/modal_deployments/reflector_llm.py
index 8faf5909..ea36a3ea 100644
--- a/server/gpu/modal_deployments/reflector_llm.py
+++ b/server/gpu/modal_deployments/reflector_llm.py
@@ -3,13 +3,14 @@
 Reflector GPU backend - LLM
 ===========================
 """
+
 import json
 import os
 import threading
 from typing import Optional
 
 import modal
-from modal import Image, Secret, App, asgi_app, method, enter, exit
+from modal import App, Image, Secret, asgi_app, enter, exit, method
 
 # LLM
 LLM_MODEL: str = "lmsys/vicuna-13b-v1.5"
@@ -56,7 +57,7 @@ llm_image = (
         "accelerate==0.21.0",
         "einops==0.6.1",
         "hf-transfer~=0.1",
-        "huggingface_hub==0.16.4"
+        "huggingface_hub==0.16.4",
     )
     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
     .run_function(download_llm)
@@ -67,7 +68,7 @@
 @app.cls(
     gpu="A100",
     timeout=60 * 5,
-    container_idle_timeout=60 * 5,
+    scaledown_window=60 * 5,
     allow_concurrent_inputs=15,
     image=llm_image,
 )
@@ -83,7 +84,7 @@ class LLM:
             torch_dtype=getattr(torch, LLM_TORCH_DTYPE),
             low_cpu_mem_usage=LLM_LOW_CPU_MEM_USAGE,
             cache_dir=IMAGE_MODEL_DIR,
-            local_files_only=True
+            local_files_only=True,
         )
 
         # JSONFormer doesn't yet support generation configs
@@ -97,9 +98,7 @@ class LLM:
         # load tokenizer
         print("Instance llm tokenizer")
         tokenizer = AutoTokenizer.from_pretrained(
-            LLM_MODEL,
-            cache_dir=IMAGE_MODEL_DIR,
-            local_files_only=True
+            LLM_MODEL, cache_dir=IMAGE_MODEL_DIR, local_files_only=True
         )
 
         # move model to gpu
@@ -119,7 +118,9 @@ class LLM:
         print("Exit llm")
 
     @method()
-    def generate(self, prompt: str, gen_schema: str | None, gen_cfg: str | None) -> dict:
+    def generate(
+        self, prompt: str, gen_schema: str | None, gen_cfg: str | None
+    ) -> dict:
         """
         Perform a generation action using the LLM
         """
@@ -140,7 +141,7 @@ class LLM:
                 tokenizer=self.tokenizer,
                 json_schema=json.loads(gen_schema),
                 prompt=prompt,
-                max_string_token_length=gen_cfg.max_new_tokens
+                max_string_token_length=gen_cfg.max_new_tokens,
             )
             response = jsonformer_llm()
         else:
@@ -153,18 +154,21 @@ class LLM:
             output = self.model.generate(input_ids, generation_config=gen_cfg)
 
             # decode output
-            response = self.tokenizer.decode(output[0].cpu(), skip_special_tokens=True)
-            response = response[len(prompt):]
+            response = self.tokenizer.decode(
+                output[0].cpu(), skip_special_tokens=True
+            )
+            response = response[len(prompt) :]
 
         print(f"Generated {response=}")
         return {"text": response}
 
+
 # -------------------------------------------------------------------
 # Web API
 # -------------------------------------------------------------------
 
 @app.function(
-    container_idle_timeout=60 * 10,
+    scaledown_window=60 * 10,
     timeout=60 * 5,
     allow_concurrent_inputs=45,
     secrets=[
@@ -201,7 +205,9 @@ def web():
     ):
         gen_schema = json.dumps(req.gen_schema) if req.gen_schema else None
         gen_cfg = json.dumps(req.gen_cfg) if req.gen_cfg else None
-        func = llmstub.generate.spawn(prompt=req.prompt, gen_schema=gen_schema, gen_cfg=gen_cfg)
+        func = llmstub.generate.spawn(
+            prompt=req.prompt, gen_schema=gen_schema, gen_cfg=gen_cfg
+        )
         result = func.get()
         return result
 
diff --git a/server/gpu/modal_deployments/reflector_llm_zephyr.py b/server/gpu/modal_deployments/reflector_llm_zephyr.py
index 18608acd..f5771738 100644
--- a/server/gpu/modal_deployments/reflector_llm_zephyr.py
+++ b/server/gpu/modal_deployments/reflector_llm_zephyr.py
@@ -3,13 +3,14 @@
 Reflector GPU backend - LLM
 ===========================
 """
+
 import json
 import os
 import threading
 from typing import Optional
 
 import modal
-from modal import Image, Secret, App, asgi_app, method, enter, exit
+from modal import App, Image, Secret, asgi_app, enter, exit, method
 
 # LLM
 LLM_MODEL: str = "HuggingFaceH4/zephyr-7b-alpha"
@@ -56,7 +57,7 @@ llm_image = (
         "accelerate==0.21.0",
         "einops==0.6.1",
         "hf-transfer~=0.1",
-        "huggingface_hub==0.16.4"
+        "huggingface_hub==0.16.4",
     )
     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
     .run_function(download_llm)
@@ -67,7 +68,7 @@
 @app.cls(
     gpu="A10G",
     timeout=60 * 5,
-    container_idle_timeout=60 * 5,
+    scaledown_window=60 * 5,
     allow_concurrent_inputs=10,
     image=llm_image,
 )
@@ -83,7 +84,7 @@ class LLM:
             torch_dtype=getattr(torch, LLM_TORCH_DTYPE),
             low_cpu_mem_usage=LLM_LOW_CPU_MEM_USAGE,
             cache_dir=IMAGE_MODEL_DIR,
-            local_files_only=True
+            local_files_only=True,
         )
 
         # JSONFormer doesn't yet support generation configs
@@ -97,9 +98,7 @@ class LLM:
         # load tokenizer
         print("Instance llm tokenizer")
         tokenizer = AutoTokenizer.from_pretrained(
-            LLM_MODEL,
-            cache_dir=IMAGE_MODEL_DIR,
-            local_files_only=True
+            LLM_MODEL, cache_dir=IMAGE_MODEL_DIR, local_files_only=True
         )
         gen_cfg.pad_token_id = tokenizer.eos_token_id
         gen_cfg.eos_token_id = tokenizer.eos_token_id
@@ -122,7 +121,9 @@ class LLM:
         print("Exit llm")
 
     @method()
-    def generate(self, prompt: str, gen_schema: str | None, gen_cfg: str | None) -> dict:
+    def generate(
+        self, prompt: str, gen_schema: str | None, gen_cfg: str | None
+    ) -> dict:
         """
         Perform a generation action using the LLM
         """
@@ -145,7 +146,7 @@ class LLM:
                 tokenizer=self.tokenizer,
                 json_schema=json.loads(gen_schema),
                 prompt=prompt,
-                max_string_token_length=gen_cfg.max_new_tokens
+                max_string_token_length=gen_cfg.max_new_tokens,
             )
             response = jsonformer_llm()
         else:
@@ -158,21 +159,22 @@ class LLM:
             output = self.model.generate(input_ids, generation_config=gen_cfg)
 
             # decode output
-            response = self.tokenizer.decode(output[0].cpu(), skip_special_tokens=True)
-            response = response[len(prompt):]
-            response = {
-                "long_summary": response
-            }
+            response = self.tokenizer.decode(
+                output[0].cpu(), skip_special_tokens=True
+            )
+            response = response[len(prompt) :]
+            response = {"long_summary": response}
 
         print(f"Generated {response=}")
         return {"text": response}
 
+
 # -------------------------------------------------------------------
 # Web API
 # -------------------------------------------------------------------
 
 @app.function(
-    container_idle_timeout=60 * 10,
+    scaledown_window=60 * 10,
     timeout=60 * 5,
     allow_concurrent_inputs=30,
     secrets=[
@@ -205,11 +207,13 @@ def web():
     @app.post("/llm", dependencies=[Depends(apikey_auth)])
     def llm(
-            req: LLMRequest,
+        req: LLMRequest,
     ):
         gen_schema = json.dumps(req.gen_schema) if req.gen_schema else None
         gen_cfg = json.dumps(req.gen_cfg) if req.gen_cfg else None
-        func = llmstub.generate.spawn(prompt=req.prompt, gen_schema=gen_schema, gen_cfg=gen_cfg)
+        func = llmstub.generate.spawn(
+            prompt=req.prompt, gen_schema=gen_schema, gen_cfg=gen_cfg
+        )
         result = func.get()
         return result
 
     return app
diff --git a/server/gpu/modal_deployments/reflector_transcriber.py b/server/gpu/modal_deployments/reflector_transcriber.py
index d95bda52..4bbbe512 100644
--- a/server/gpu/modal_deployments/reflector_transcriber.py
+++ b/server/gpu/modal_deployments/reflector_transcriber.py
@@ -52,7 +52,7 @@ image = (
 @app.cls(
     gpu="A10G",
     timeout=5 * MINUTES,
-    container_idle_timeout=5 * MINUTES,
+    scaledown_window=5 * MINUTES,
     allow_concurrent_inputs=6,
     image=image,
     volumes={MODELS_DIR: volume},
@@ -107,7 +107,7 @@ class Transcriber:
 
 
 @app.function(
-    container_idle_timeout=60,
+    scaledown_window=60,
     timeout=60,
     allow_concurrent_inputs=40,
     secrets=[
diff --git a/server/gpu/modal_deployments/reflector_translator.py b/server/gpu/modal_deployments/reflector_translator.py
index a21c33fe..844f5094 100644
--- a/server/gpu/modal_deployments/reflector_translator.py
+++ b/server/gpu/modal_deployments/reflector_translator.py
@@ -6,7 +6,7 @@ Reflector GPU backend - transcriber
 import os
 import threading
 
-from modal import Image, Secret, App, asgi_app, method, enter
+from modal import App, Image, Secret, asgi_app, enter, method
 from pydantic import BaseModel
 
 # Seamless M4T
@@ -137,7 +137,7 @@ transcriber_image = (
 @app.cls(
     gpu="A10G",
     timeout=60 * 5,
-    container_idle_timeout=60 * 5,
+    scaledown_window=60 * 5,
     allow_concurrent_inputs=4,
     image=transcriber_image,
 )
@@ -169,195 +169,195 @@ class Translator:
         # TODO: Enhance with complete list of lang codes
         seamless_lang_code = {
             # Afrikaans
-            'af': 'afr',
+            "af": "afr",
             # Amharic
-            'am': 'amh',
+            "am": "amh",
             # Modern Standard Arabic
-            'ar': 'arb',
+            "ar": "arb",
             # Moroccan Arabic
-            'ary': 'ary',
+            "ary": "ary",
             # Egyptian Arabic
-            'arz': 'arz',
+            "arz": "arz",
             # Assamese
-            'as': 'asm',
+            "as": "asm",
             # North Azerbaijani
-            'az': 'azj',
+            "az": "azj",
             # Belarusian
-            'be': 'bel',
+            "be": "bel",
             # Bengali
-            'bn': 'ben',
+            "bn": "ben",
             # Bosnian
-            'bs': 'bos',
+            "bs": "bos",
             # Bulgarian
-            'bg': 'bul',
+            "bg": "bul",
             # Catalan
-            'ca': 'cat',
+            "ca": "cat",
             # Cebuano
-            'ceb': 'ceb',
+            "ceb": "ceb",
             # Czech
-            'cs': 'ces',
+            "cs": "ces",
             # Central Kurdish
-            'ku': 'ckb',
+            "ku": "ckb",
             # Mandarin Chinese
-            'cmn': 'cmn_Hant',
+            "cmn": "cmn_Hant",
             # Welsh
-            'cy': 'cym',
+            "cy": "cym",
             # Danish
-            'da': 'dan',
+            "da": "dan",
             # German
-            'de': 'deu',
+            "de": "deu",
             # Greek
-            'el': 'ell',
+            "el": "ell",
             # English
-            'en': 'eng',
+            "en": "eng",
             # Estonian
-            'et': 'est',
+            "et": "est",
             # Basque
-            'eu': 'eus',
+            "eu": "eus",
             # Finnish
-            'fi': 'fin',
+            "fi": "fin",
             # French
-            'fr': 'fra',
+            "fr": "fra",
             # Irish
-            'ga': 'gle',
+            "ga": "gle",
             # West Central Oromo,
-            'gaz': 'gaz',
+            "gaz": "gaz",
             # Galician
-            'gl': 'glg',
+            "gl": "glg",
             # Gujarati
-            'gu': 'guj',
+            "gu": "guj",
             # Hebrew
-            'he': 'heb',
+            "he": "heb",
             # Hindi
-            'hi': 'hin',
+            "hi": "hin",
             # Croatian
-            'hr': 'hrv',
+            "hr": "hrv",
             # Hungarian
-            'hu': 'hun',
+            "hu": "hun",
             # Armenian
-            'hy': 'hye',
+            "hy": "hye",
             # Igbo
-            'ig': 'ibo',
+            "ig": "ibo",
             # Indonesian
-            'id': 'ind',
+            "id": "ind",
             # Icelandic
-            'is': 'isl',
+            "is": "isl",
             # Italian
-            'it': 'ita',
+            "it": "ita",
             # Javanese
-            'jv': 'jav',
+            "jv": "jav",
             # Japanese
-            'ja': 'jpn',
+            "ja": "jpn",
             # Kannada
-            'kn': 'kan',
+            "kn": "kan",
             # Georgian
-            'ka': 'kat',
+            "ka": "kat",
             # Kazakh
-            'kk': 'kaz',
+            "kk": "kaz",
             # Halh Mongolian
-            'khk': 'khk',
+            "khk": "khk",
             # Khmer
-            'km': 'khm',
+            "km": "khm",
             # Kyrgyz
-            'ky': 'kir',
+            "ky": "kir",
             # Korean
-            'ko': 'kor',
+            "ko": "kor",
             # Lao
-            'lo': 'lao',
+            "lo": "lao",
             # Lithuanian
-            'lt': 'lit',
+            "lt": "lit",
             # Ganda
-            'lg': 'lug',
+            "lg": "lug",
             # Luo
-            'luo': 'luo',
+            "luo": "luo",
             # Standard Latvian
-            'lv': 'lvs',
+            "lv": "lvs",
             # Maithili
-            'mai': 'mai',
+            "mai": "mai",
             # Malayalam
-            'ml': 'mal',
+            "ml": "mal",
             # Marathi
-            'mr': 'mar',
+            "mr": "mar",
             # Macedonian
-            'mk': 'mkd',
+            "mk": "mkd",
             # Maltese
-            'mt': 'mlt',
+            "mt": "mlt",
             # Meitei
-            'mni': 'mni',
+            "mni": "mni",
             # Burmese
-            'my': 'mya',
+            "my": "mya",
             # Dutch
-            'nl': 'nld',
+            "nl": "nld",
             # Norwegian Nynorsk
-            'nn': 'nno',
+            "nn": "nno",
             # Norwegian Bokmål
-            'nb': 'nob',
+            "nb": "nob",
             # Nepali
-            'ne': 'npi',
+            "ne": "npi",
             # Nyanja
-            'ny': 'nya',
+            "ny": "nya",
             # Odia
-            'or': 'ory',
+            "or": "ory",
             # Punjabi
-            'pa': 'pan',
+            "pa": "pan",
             # Southern Pashto
-            'pbt': 'pbt',
+            "pbt": "pbt",
             # Western Persian
-            'pes': 'pes',
+            "pes": "pes",
             # Polish
-            'pl': 'pol',
+            "pl": "pol",
             # Portuguese
-            'pt': 'por',
+            "pt": "por",
             # Romanian
-            'ro': 'ron',
+            "ro": "ron",
             # Russian
-            'ru': 'rus',
+            "ru": "rus",
             # Slovak
-            'sk': 'slk',
+            "sk": "slk",
             # Slovenian
-            'sl': 'slv',
+            "sl": "slv",
             # Shona
-            'sn': 'sna',
+            "sn": "sna",
             # Sindhi
-            'sd': 'snd',
+            "sd": "snd",
             # Somali
-            'so': 'som',
+            "so": "som",
             # Spanish
-            'es': 'spa',
+            "es": "spa",
             # Serbian
-            'sr': 'srp',
+            "sr": "srp",
             # Swedish
-            'sv': 'swe',
+            "sv": "swe",
             # Swahili
-            'sw': 'swh',
+            "sw": "swh",
             # Tamil
-            'ta': 'tam',
+            "ta": "tam",
             # Telugu
-            'te': 'tel',
+            "te": "tel",
             # Tajik
-            'tg': 'tgk',
+            "tg": "tgk",
             # Tagalog
-            'tl': 'tgl',
+            "tl": "tgl",
             # Thai
-            'th': 'tha',
+            "th": "tha",
             # Turkish
-            'tr': 'tur',
+            "tr": "tur",
             # Ukrainian
-            'uk': 'ukr',
+            "uk": "ukr",
             # Urdu
-            'ur': 'urd',
+            "ur": "urd",
             # Northern Uzbek
-            'uz': 'uzn',
+            "uz": "uzn",
             # Vietnamese
-            'vi': 'vie',
+            "vi": "vie",
             # Yoruba
-            'yo': 'yor',
+            "yo": "yor",
             # Cantonese
-            'yue': 'yue',
+            "yue": "yue",
             # Standard Malay
-            'ms': 'zsm',
+            "ms": "zsm",
             # Zulu
-            'zu': 'zul'
+            "zu": "zul",
         }
 
         return seamless_lang_code.get(lang_code, "eng")
@@ -381,7 +381,7 @@
 
 
 @app.function(
-    container_idle_timeout=60,
+    scaledown_window=60,
     timeout=60,
     allow_concurrent_inputs=40,
     secrets=[
@@ -413,9 +413,9 @@ def web():
 
     @app.post("/translate", dependencies=[Depends(apikey_auth)])
     async def translate(
-            text: str,
-            source_language: Annotated[str, Body(...)] = "en",
-            target_language: Annotated[str, Body(...)] = "fr",
+        text: str,
+        source_language: Annotated[str, Body(...)] = "en",
+        target_language: Annotated[str, Body(...)] = "fr",
     ) -> TranslateResponse:
         func = translatorstub.translate_text.spawn(
             text=text,
diff --git a/server/gpu/modal_deployments/reflector_vllm_hermes3.py b/server/gpu/modal_deployments/reflector_vllm_hermes3.py
index d1c86be7..5eebf5c0 100644
--- a/server/gpu/modal_deployments/reflector_vllm_hermes3.py
+++ b/server/gpu/modal_deployments/reflector_vllm_hermes3.py
@@ -53,7 +53,7 @@ app = modal.App("reflector-vllm-hermes3")
     image=vllm_image,
     gpu=modal.gpu.A100(count=N_GPU, size="40GB"),
     timeout=60 * 5,
-    container_idle_timeout=60 * 5,
+    scaledown_window=60 * 5,
     allow_concurrent_inputs=100,
     secrets=[
         modal.Secret.from_name("reflector-gpu"),