Upgrade modal apps

commit cfb1b2f9bc
parent 821d7c0692
Date: 2025-03-25 11:09:01 +01:00

6 changed files with 146 additions and 136 deletions
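The substantive change, repeated across all six apps, is a single rename: the deprecated Modal parameter container_idle_timeout becomes scaledown_window on every @app.cls(...) and @app.function(...) decorator. The remaining hunks are formatter cleanup (sorted imports, double quotes, trailing commas, wrapped long lines). A minimal sketch of the upgraded decorator shape follows; the app name and resource values are placeholders, assuming a modal client recent enough to accept scaledown_window:

    # Minimal sketch of the upgrade pattern applied in this commit; the app
    # name and values below are placeholders, not taken from the repo.
    import modal

    app = modal.App("example-app")

    @app.cls(
        gpu="A10G",
        timeout=60 * 5,
        scaledown_window=60 * 5,  # was: container_idle_timeout=60 * 5
        allow_concurrent_inputs=10,
    )
    class Worker:
        @modal.enter()
        def load(self):
            # one-time per-container setup (e.g. loading model weights)
            print("container started")

        @modal.method()
        def run(self, prompt: str) -> dict:
            return {"text": prompt}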

View File

@@ -72,7 +72,7 @@ diarizer_image = (
 @app.cls(
     gpu=modal.gpu.A100(size="40GB"),
     timeout=60 * 30,
-    container_idle_timeout=60,
+    scaledown_window=60,
     allow_concurrent_inputs=1,
     image=diarizer_image,
 )
@@ -126,7 +126,7 @@ class Diarizer:
 @app.function(
     timeout=60 * 10,
-    container_idle_timeout=60 * 3,
+    scaledown_window=60 * 3,
     allow_concurrent_inputs=40,
     secrets=[
         Secret.from_name("reflector-gpu"),

View File

@@ -3,13 +3,14 @@ Reflector GPU backend - LLM
 ===========================
 """
 import json
 import os
 import threading
 from typing import Optional

 import modal
-from modal import Image, Secret, App, asgi_app, method, enter, exit
+from modal import App, Image, Secret, asgi_app, enter, exit, method

 # LLM
 LLM_MODEL: str = "lmsys/vicuna-13b-v1.5"
@@ -56,7 +57,7 @@ llm_image = (
         "accelerate==0.21.0",
         "einops==0.6.1",
         "hf-transfer~=0.1",
-        "huggingface_hub==0.16.4"
+        "huggingface_hub==0.16.4",
     )
     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
     .run_function(download_llm)
@@ -67,7 +68,7 @@ llm_image = (
 @app.cls(
     gpu="A100",
     timeout=60 * 5,
-    container_idle_timeout=60 * 5,
+    scaledown_window=60 * 5,
     allow_concurrent_inputs=15,
     image=llm_image,
 )
@@ -83,7 +84,7 @@ class LLM:
             torch_dtype=getattr(torch, LLM_TORCH_DTYPE),
             low_cpu_mem_usage=LLM_LOW_CPU_MEM_USAGE,
             cache_dir=IMAGE_MODEL_DIR,
-            local_files_only=True
+            local_files_only=True,
         )

         # JSONFormer doesn't yet support generation configs
@@ -97,9 +98,7 @@ class LLM:
         # load tokenizer
         print("Instance llm tokenizer")
-        tokenizer = AutoTokenizer.from_pretrained(
-            LLM_MODEL,
-            cache_dir=IMAGE_MODEL_DIR,
-            local_files_only=True
-        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            LLM_MODEL, cache_dir=IMAGE_MODEL_DIR, local_files_only=True
+        )

         # move model to gpu
@@ -119,7 +118,9 @@ class LLM:
         print("Exit llm")

     @method()
-    def generate(self, prompt: str, gen_schema: str | None, gen_cfg: str | None) -> dict:
+    def generate(
+        self, prompt: str, gen_schema: str | None, gen_cfg: str | None
+    ) -> dict:
         """
         Perform a generation action using the LLM
         """
@@ -140,7 +141,7 @@ class LLM:
                 tokenizer=self.tokenizer,
                 json_schema=json.loads(gen_schema),
                 prompt=prompt,
-                max_string_token_length=gen_cfg.max_new_tokens
+                max_string_token_length=gen_cfg.max_new_tokens,
             )
             response = jsonformer_llm()
         else:
@@ -153,18 +154,21 @@ class LLM:
             output = self.model.generate(input_ids, generation_config=gen_cfg)

             # decode output
-            response = self.tokenizer.decode(output[0].cpu(), skip_special_tokens=True)
-            response = response[len(prompt):]
+            response = self.tokenizer.decode(
+                output[0].cpu(), skip_special_tokens=True
+            )
+            response = response[len(prompt) :]
         print(f"Generated {response=}")
         return {"text": response}

 # -------------------------------------------------------------------
 # Web API
 # -------------------------------------------------------------------

 @app.function(
-    container_idle_timeout=60 * 10,
+    scaledown_window=60 * 10,
     timeout=60 * 5,
     allow_concurrent_inputs=45,
     secrets=[
@@ -201,7 +205,9 @@ def web():
     ):
         gen_schema = json.dumps(req.gen_schema) if req.gen_schema else None
         gen_cfg = json.dumps(req.gen_cfg) if req.gen_cfg else None
-        func = llmstub.generate.spawn(prompt=req.prompt, gen_schema=gen_schema, gen_cfg=gen_cfg)
+        func = llmstub.generate.spawn(
+            prompt=req.prompt, gen_schema=gen_schema, gen_cfg=gen_cfg
+        )
         result = func.get()
         return result

View File

@@ -3,13 +3,14 @@ Reflector GPU backend - LLM
 ===========================
 """
 import json
 import os
 import threading
 from typing import Optional

 import modal
-from modal import Image, Secret, App, asgi_app, method, enter, exit
+from modal import App, Image, Secret, asgi_app, enter, exit, method

 # LLM
 LLM_MODEL: str = "HuggingFaceH4/zephyr-7b-alpha"
@@ -56,7 +57,7 @@ llm_image = (
         "accelerate==0.21.0",
         "einops==0.6.1",
         "hf-transfer~=0.1",
-        "huggingface_hub==0.16.4"
+        "huggingface_hub==0.16.4",
     )
     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
     .run_function(download_llm)
@@ -67,7 +68,7 @@ llm_image = (
 @app.cls(
     gpu="A10G",
     timeout=60 * 5,
-    container_idle_timeout=60 * 5,
+    scaledown_window=60 * 5,
     allow_concurrent_inputs=10,
     image=llm_image,
 )
@@ -83,7 +84,7 @@ class LLM:
             torch_dtype=getattr(torch, LLM_TORCH_DTYPE),
             low_cpu_mem_usage=LLM_LOW_CPU_MEM_USAGE,
             cache_dir=IMAGE_MODEL_DIR,
-            local_files_only=True
+            local_files_only=True,
         )

         # JSONFormer doesn't yet support generation configs
@@ -97,9 +98,7 @@ class LLM:
         # load tokenizer
         print("Instance llm tokenizer")
-        tokenizer = AutoTokenizer.from_pretrained(
-            LLM_MODEL,
-            cache_dir=IMAGE_MODEL_DIR,
-            local_files_only=True
-        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            LLM_MODEL, cache_dir=IMAGE_MODEL_DIR, local_files_only=True
+        )
         gen_cfg.pad_token_id = tokenizer.eos_token_id
         gen_cfg.eos_token_id = tokenizer.eos_token_id
@@ -122,7 +121,9 @@ class LLM:
         print("Exit llm")

     @method()
-    def generate(self, prompt: str, gen_schema: str | None, gen_cfg: str | None) -> dict:
+    def generate(
+        self, prompt: str, gen_schema: str | None, gen_cfg: str | None
+    ) -> dict:
         """
         Perform a generation action using the LLM
         """
@@ -145,7 +146,7 @@ class LLM:
                 tokenizer=self.tokenizer,
                 json_schema=json.loads(gen_schema),
                 prompt=prompt,
-                max_string_token_length=gen_cfg.max_new_tokens
+                max_string_token_length=gen_cfg.max_new_tokens,
             )
             response = jsonformer_llm()
         else:
@@ -158,21 +159,22 @@ class LLM:
             output = self.model.generate(input_ids, generation_config=gen_cfg)

             # decode output
-            response = self.tokenizer.decode(output[0].cpu(), skip_special_tokens=True)
-            response = response[len(prompt):]
-            response = {
-                "long_summary": response
-            }
+            response = self.tokenizer.decode(
+                output[0].cpu(), skip_special_tokens=True
+            )
+            response = response[len(prompt) :]
+            response = {"long_summary": response}
         print(f"Generated {response=}")
         return {"text": response}

 # -------------------------------------------------------------------
 # Web API
 # -------------------------------------------------------------------

 @app.function(
-    container_idle_timeout=60 * 10,
+    scaledown_window=60 * 10,
     timeout=60 * 5,
     allow_concurrent_inputs=30,
     secrets=[
@@ -205,11 +207,13 @@ def web():
     @app.post("/llm", dependencies=[Depends(apikey_auth)])
     def llm(
         req: LLMRequest,
     ):
         gen_schema = json.dumps(req.gen_schema) if req.gen_schema else None
         gen_cfg = json.dumps(req.gen_cfg) if req.gen_cfg else None
-        func = llmstub.generate.spawn(prompt=req.prompt, gen_schema=gen_schema, gen_cfg=gen_cfg)
+        func = llmstub.generate.spawn(
+            prompt=req.prompt, gen_schema=gen_schema, gen_cfg=gen_cfg
+        )
         result = func.get()
         return result

View File

@@ -52,7 +52,7 @@ image = (
 @app.cls(
     gpu="A10G",
     timeout=5 * MINUTES,
-    container_idle_timeout=5 * MINUTES,
+    scaledown_window=5 * MINUTES,
     allow_concurrent_inputs=6,
     image=image,
     volumes={MODELS_DIR: volume},
@@ -107,7 +107,7 @@ class Transcriber:
 @app.function(
-    container_idle_timeout=60,
+    scaledown_window=60,
     timeout=60,
     allow_concurrent_inputs=40,
     secrets=[

View File

@@ -6,7 +6,7 @@ Reflector GPU backend - transcriber
import os import os
import threading import threading
from modal import Image, Secret, App, asgi_app, method, enter from modal import App, Image, Secret, asgi_app, enter, method
from pydantic import BaseModel from pydantic import BaseModel
# Seamless M4T # Seamless M4T
@@ -137,7 +137,7 @@ transcriber_image = (
@app.cls( @app.cls(
gpu="A10G", gpu="A10G",
timeout=60 * 5, timeout=60 * 5,
container_idle_timeout=60 * 5, scaledown_window=60 * 5,
allow_concurrent_inputs=4, allow_concurrent_inputs=4,
image=transcriber_image, image=transcriber_image,
) )
@@ -169,195 +169,195 @@ class Translator:
         # TODO: Enhance with complete list of lang codes
         seamless_lang_code = {
             # Afrikaans
-            'af': 'afr',
+            "af": "afr",
             # Amharic
-            'am': 'amh',
+            "am": "amh",
             # Modern Standard Arabic
-            'ar': 'arb',
+            "ar": "arb",
             # Moroccan Arabic
-            'ary': 'ary',
+            "ary": "ary",
             # Egyptian Arabic
-            'arz': 'arz',
+            "arz": "arz",
             # Assamese
-            'as': 'asm',
+            "as": "asm",
             # North Azerbaijani
-            'az': 'azj',
+            "az": "azj",
             # Belarusian
-            'be': 'bel',
+            "be": "bel",
             # Bengali
-            'bn': 'ben',
+            "bn": "ben",
             # Bosnian
-            'bs': 'bos',
+            "bs": "bos",
             # Bulgarian
-            'bg': 'bul',
+            "bg": "bul",
             # Catalan
-            'ca': 'cat',
+            "ca": "cat",
             # Cebuano
-            'ceb': 'ceb',
+            "ceb": "ceb",
             # Czech
-            'cs': 'ces',
+            "cs": "ces",
             # Central Kurdish
-            'ku': 'ckb',
+            "ku": "ckb",
             # Mandarin Chinese
-            'cmn': 'cmn_Hant',
+            "cmn": "cmn_Hant",
             # Welsh
-            'cy': 'cym',
+            "cy": "cym",
             # Danish
-            'da': 'dan',
+            "da": "dan",
             # German
-            'de': 'deu',
+            "de": "deu",
             # Greek
-            'el': 'ell',
+            "el": "ell",
             # English
-            'en': 'eng',
+            "en": "eng",
             # Estonian
-            'et': 'est',
+            "et": "est",
             # Basque
-            'eu': 'eus',
+            "eu": "eus",
             # Finnish
-            'fi': 'fin',
+            "fi": "fin",
             # French
-            'fr': 'fra',
+            "fr": "fra",
             # Irish
-            'ga': 'gle',
+            "ga": "gle",
             # West Central Oromo,
-            'gaz': 'gaz',
+            "gaz": "gaz",
             # Galician
-            'gl': 'glg',
+            "gl": "glg",
             # Gujarati
-            'gu': 'guj',
+            "gu": "guj",
             # Hebrew
-            'he': 'heb',
+            "he": "heb",
             # Hindi
-            'hi': 'hin',
+            "hi": "hin",
             # Croatian
-            'hr': 'hrv',
+            "hr": "hrv",
             # Hungarian
-            'hu': 'hun',
+            "hu": "hun",
             # Armenian
-            'hy': 'hye',
+            "hy": "hye",
             # Igbo
-            'ig': 'ibo',
+            "ig": "ibo",
             # Indonesian
-            'id': 'ind',
+            "id": "ind",
             # Icelandic
-            'is': 'isl',
+            "is": "isl",
             # Italian
-            'it': 'ita',
+            "it": "ita",
             # Javanese
-            'jv': 'jav',
+            "jv": "jav",
             # Japanese
-            'ja': 'jpn',
+            "ja": "jpn",
             # Kannada
-            'kn': 'kan',
+            "kn": "kan",
             # Georgian
-            'ka': 'kat',
+            "ka": "kat",
             # Kazakh
-            'kk': 'kaz',
+            "kk": "kaz",
             # Halh Mongolian
-            'khk': 'khk',
+            "khk": "khk",
             # Khmer
-            'km': 'khm',
+            "km": "khm",
             # Kyrgyz
-            'ky': 'kir',
+            "ky": "kir",
             # Korean
-            'ko': 'kor',
+            "ko": "kor",
             # Lao
-            'lo': 'lao',
+            "lo": "lao",
             # Lithuanian
-            'lt': 'lit',
+            "lt": "lit",
             # Ganda
-            'lg': 'lug',
+            "lg": "lug",
             # Luo
-            'luo': 'luo',
+            "luo": "luo",
             # Standard Latvian
-            'lv': 'lvs',
+            "lv": "lvs",
             # Maithili
-            'mai': 'mai',
+            "mai": "mai",
             # Malayalam
-            'ml': 'mal',
+            "ml": "mal",
             # Marathi
-            'mr': 'mar',
+            "mr": "mar",
             # Macedonian
-            'mk': 'mkd',
+            "mk": "mkd",
             # Maltese
-            'mt': 'mlt',
+            "mt": "mlt",
             # Meitei
-            'mni': 'mni',
+            "mni": "mni",
             # Burmese
-            'my': 'mya',
+            "my": "mya",
             # Dutch
-            'nl': 'nld',
+            "nl": "nld",
             # Norwegian Nynorsk
-            'nn': 'nno',
+            "nn": "nno",
             # Norwegian Bokmål
-            'nb': 'nob',
+            "nb": "nob",
             # Nepali
-            'ne': 'npi',
+            "ne": "npi",
             # Nyanja
-            'ny': 'nya',
+            "ny": "nya",
             # Odia
-            'or': 'ory',
+            "or": "ory",
             # Punjabi
-            'pa': 'pan',
+            "pa": "pan",
             # Southern Pashto
-            'pbt': 'pbt',
+            "pbt": "pbt",
             # Western Persian
-            'pes': 'pes',
+            "pes": "pes",
             # Polish
-            'pl': 'pol',
+            "pl": "pol",
             # Portuguese
-            'pt': 'por',
+            "pt": "por",
             # Romanian
-            'ro': 'ron',
+            "ro": "ron",
             # Russian
-            'ru': 'rus',
+            "ru": "rus",
             # Slovak
-            'sk': 'slk',
+            "sk": "slk",
             # Slovenian
-            'sl': 'slv',
+            "sl": "slv",
             # Shona
-            'sn': 'sna',
+            "sn": "sna",
             # Sindhi
-            'sd': 'snd',
+            "sd": "snd",
             # Somali
-            'so': 'som',
+            "so": "som",
             # Spanish
-            'es': 'spa',
+            "es": "spa",
             # Serbian
-            'sr': 'srp',
+            "sr": "srp",
             # Swedish
-            'sv': 'swe',
+            "sv": "swe",
             # Swahili
-            'sw': 'swh',
+            "sw": "swh",
             # Tamil
-            'ta': 'tam',
+            "ta": "tam",
             # Telugu
-            'te': 'tel',
+            "te": "tel",
             # Tajik
-            'tg': 'tgk',
+            "tg": "tgk",
             # Tagalog
-            'tl': 'tgl',
+            "tl": "tgl",
             # Thai
-            'th': 'tha',
+            "th": "tha",
             # Turkish
-            'tr': 'tur',
+            "tr": "tur",
             # Ukrainian
-            'uk': 'ukr',
+            "uk": "ukr",
             # Urdu
-            'ur': 'urd',
+            "ur": "urd",
             # Northern Uzbek
-            'uz': 'uzn',
+            "uz": "uzn",
             # Vietnamese
-            'vi': 'vie',
+            "vi": "vie",
             # Yoruba
-            'yo': 'yor',
+            "yo": "yor",
             # Cantonese
-            'yue': 'yue',
+            "yue": "yue",
             # Standard Malay
-            'ms': 'zsm',
+            "ms": "zsm",
             # Zulu
-            'zu': 'zul'
+            "zu": "zul",
         }
         return seamless_lang_code.get(lang_code, "eng")
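The only change in the mapping above is quote style, but its behaviour is worth noting: it translates short (mostly ISO 639-1) codes to Seamless M4T language codes, and any unknown code silently falls back to English. A self-contained sketch of that lookup follows; the enclosing method's name is not visible in this diff, so to_seamless is a hypothetical stand-in, and only an excerpt of the table is reproduced:

    # Hypothetical stand-in for the lookup above; only the fallback
    # behaviour and a few entries are reproduced here.
    SEAMLESS_LANG_CODE = {"fr": "fra", "nb": "nob", "yue": "yue"}  # excerpt

    def to_seamless(lang_code: str) -> str:
        # unknown or unsupported codes default to English, as in the source
        return SEAMLESS_LANG_CODE.get(lang_code, "eng")

    print(to_seamless("fr"))   # fra
    print(to_seamless("xx"))   # eng (fallback)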
@@ -381,7 +381,7 @@ class Translator:
 @app.function(
-    container_idle_timeout=60,
+    scaledown_window=60,
     timeout=60,
     allow_concurrent_inputs=40,
     secrets=[
@@ -413,9 +413,9 @@ def web():
     @app.post("/translate", dependencies=[Depends(apikey_auth)])
     async def translate(
         text: str,
         source_language: Annotated[str, Body(...)] = "en",
         target_language: Annotated[str, Body(...)] = "fr",
     ) -> TranslateResponse:
         func = translatorstub.translate_text.spawn(
             text=text,

View File

@@ -53,7 +53,7 @@ app = modal.App("reflector-vllm-hermes3")
     image=vllm_image,
     gpu=modal.gpu.A100(count=N_GPU, size="40GB"),
     timeout=60 * 5,
-    container_idle_timeout=60 * 5,
+    scaledown_window=60 * 5,
    allow_concurrent_inputs=100,
     secrets=[
         modal.Secret.from_name("reflector-gpu"),