Upgrade modal apps

2025-03-25 11:09:01 +01:00
parent 821d7c0692
commit cfb1b2f9bc
6 changed files with 146 additions and 136 deletions
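
The diff makes two kinds of changes across all six Modal apps: the deprecated `container_idle_timeout` parameter is renamed to its replacement, `scaledown_window` (same semantics: how long an idle container stays warm before scaling down), and the sources are reformatted in Black/isort style (sorted imports, trailing commas, double quotes, wrapped long calls). A minimal sketch of the rename, assuming a modal client version that accepts the new keyword; the app and class names here are illustrative:

import modal

app = modal.App("example")

@app.cls(
    gpu="A10G",
    timeout=60 * 5,
    scaledown_window=60 * 5,  # was container_idle_timeout=60 * 5
    allow_concurrent_inputs=10,
)
class Example:
    @modal.method()
    def ping(self) -> str:
        return "pong"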


@@ -72,7 +72,7 @@ diarizer_image = (
@app.cls(
gpu=modal.gpu.A100(size="40GB"),
timeout=60 * 30,
-container_idle_timeout=60,
+scaledown_window=60,
allow_concurrent_inputs=1,
image=diarizer_image,
)
@@ -126,7 +126,7 @@ class Diarizer:
@app.function(
timeout=60 * 10,
-container_idle_timeout=60 * 3,
+scaledown_window=60 * 3,
allow_concurrent_inputs=40,
secrets=[
Secret.from_name("reflector-gpu"),


@@ -3,13 +3,14 @@ Reflector GPU backend - LLM
===========================
"""
import json
import os
import threading
from typing import Optional
import modal
-from modal import Image, Secret, App, asgi_app, method, enter, exit
+from modal import App, Image, Secret, asgi_app, enter, exit, method
# LLM
LLM_MODEL: str = "lmsys/vicuna-13b-v1.5"
@@ -56,7 +57,7 @@ llm_image = (
"accelerate==0.21.0",
"einops==0.6.1",
"hf-transfer~=0.1",
"huggingface_hub==0.16.4"
"huggingface_hub==0.16.4",
)
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
.run_function(download_llm)
@@ -67,7 +68,7 @@ llm_image = (
@app.cls(
gpu="A100",
timeout=60 * 5,
-container_idle_timeout=60 * 5,
+scaledown_window=60 * 5,
allow_concurrent_inputs=15,
image=llm_image,
)
@@ -83,7 +84,7 @@ class LLM:
torch_dtype=getattr(torch, LLM_TORCH_DTYPE),
low_cpu_mem_usage=LLM_LOW_CPU_MEM_USAGE,
cache_dir=IMAGE_MODEL_DIR,
-local_files_only=True
+local_files_only=True,
)
# JSONFormer doesn't yet support generation configs
@@ -97,9 +98,7 @@ class LLM:
# load tokenizer
print("Instance llm tokenizer")
tokenizer = AutoTokenizer.from_pretrained(
-LLM_MODEL,
-cache_dir=IMAGE_MODEL_DIR,
-local_files_only=True
+LLM_MODEL, cache_dir=IMAGE_MODEL_DIR, local_files_only=True
)
# move model to gpu
@@ -119,7 +118,9 @@ class LLM:
print("Exit llm")
@method()
-def generate(self, prompt: str, gen_schema: str | None, gen_cfg: str | None) -> dict:
+def generate(
+self, prompt: str, gen_schema: str | None, gen_cfg: str | None
+) -> dict:
"""
Perform a generation action using the LLM
"""
@@ -140,7 +141,7 @@ class LLM:
tokenizer=self.tokenizer,
json_schema=json.loads(gen_schema),
prompt=prompt,
-max_string_token_length=gen_cfg.max_new_tokens
+max_string_token_length=gen_cfg.max_new_tokens,
)
response = jsonformer_llm()
else:
@@ -153,18 +154,21 @@ class LLM:
output = self.model.generate(input_ids, generation_config=gen_cfg)
# decode output
-response = self.tokenizer.decode(output[0].cpu(), skip_special_tokens=True)
-response = response[len(prompt):]
+response = self.tokenizer.decode(
+output[0].cpu(), skip_special_tokens=True
+)
+response = response[len(prompt) :]
print(f"Generated {response=}")
return {"text": response}
# -------------------------------------------------------------------
# Web API
# -------------------------------------------------------------------
@app.function(
-container_idle_timeout=60 * 10,
+scaledown_window=60 * 10,
timeout=60 * 5,
allow_concurrent_inputs=45,
secrets=[
@@ -201,7 +205,9 @@ def web():
):
gen_schema = json.dumps(req.gen_schema) if req.gen_schema else None
gen_cfg = json.dumps(req.gen_cfg) if req.gen_cfg else None
-func = llmstub.generate.spawn(prompt=req.prompt, gen_schema=gen_schema, gen_cfg=gen_cfg)
+func = llmstub.generate.spawn(
+prompt=req.prompt, gen_schema=gen_schema, gen_cfg=gen_cfg
+)
result = func.get()
return result


@@ -3,13 +3,14 @@ Reflector GPU backend - LLM
===========================
"""
import json
import os
import threading
from typing import Optional
import modal
-from modal import Image, Secret, App, asgi_app, method, enter, exit
+from modal import App, Image, Secret, asgi_app, enter, exit, method
# LLM
LLM_MODEL: str = "HuggingFaceH4/zephyr-7b-alpha"
@@ -56,7 +57,7 @@ llm_image = (
"accelerate==0.21.0",
"einops==0.6.1",
"hf-transfer~=0.1",
"huggingface_hub==0.16.4"
"huggingface_hub==0.16.4",
)
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
.run_function(download_llm)
@@ -67,7 +68,7 @@ llm_image = (
@app.cls(
gpu="A10G",
timeout=60 * 5,
-container_idle_timeout=60 * 5,
+scaledown_window=60 * 5,
allow_concurrent_inputs=10,
image=llm_image,
)
@@ -83,7 +84,7 @@ class LLM:
torch_dtype=getattr(torch, LLM_TORCH_DTYPE),
low_cpu_mem_usage=LLM_LOW_CPU_MEM_USAGE,
cache_dir=IMAGE_MODEL_DIR,
-local_files_only=True
+local_files_only=True,
)
# JSONFormer doesn't yet support generation configs
@@ -97,9 +98,7 @@ class LLM:
# load tokenizer
print("Instance llm tokenizer")
tokenizer = AutoTokenizer.from_pretrained(
-LLM_MODEL,
-cache_dir=IMAGE_MODEL_DIR,
-local_files_only=True
+LLM_MODEL, cache_dir=IMAGE_MODEL_DIR, local_files_only=True
)
gen_cfg.pad_token_id = tokenizer.eos_token_id
gen_cfg.eos_token_id = tokenizer.eos_token_id
@@ -122,7 +121,9 @@ class LLM:
print("Exit llm")
@method()
-def generate(self, prompt: str, gen_schema: str | None, gen_cfg: str | None) -> dict:
+def generate(
+self, prompt: str, gen_schema: str | None, gen_cfg: str | None
+) -> dict:
"""
Perform a generation action using the LLM
"""
@@ -145,7 +146,7 @@ class LLM:
tokenizer=self.tokenizer,
json_schema=json.loads(gen_schema),
prompt=prompt,
-max_string_token_length=gen_cfg.max_new_tokens
+max_string_token_length=gen_cfg.max_new_tokens,
)
response = jsonformer_llm()
else:
@@ -158,21 +159,22 @@ class LLM:
output = self.model.generate(input_ids, generation_config=gen_cfg)
# decode output
-response = self.tokenizer.decode(output[0].cpu(), skip_special_tokens=True)
-response = response[len(prompt):]
-response = {
-"long_summary": response
-}
+response = self.tokenizer.decode(
+output[0].cpu(), skip_special_tokens=True
+)
+response = response[len(prompt) :]
+response = {"long_summary": response}
print(f"Generated {response=}")
return {"text": response}
# -------------------------------------------------------------------
# Web API
# -------------------------------------------------------------------
@app.function(
-container_idle_timeout=60 * 10,
+scaledown_window=60 * 10,
timeout=60 * 5,
allow_concurrent_inputs=30,
secrets=[
@@ -209,7 +211,9 @@ def web():
):
gen_schema = json.dumps(req.gen_schema) if req.gen_schema else None
gen_cfg = json.dumps(req.gen_cfg) if req.gen_cfg else None
-func = llmstub.generate.spawn(prompt=req.prompt, gen_schema=gen_schema, gen_cfg=gen_cfg)
+func = llmstub.generate.spawn(
+prompt=req.prompt, gen_schema=gen_schema, gen_cfg=gen_cfg
+)
result = func.get()
return result


@@ -52,7 +52,7 @@ image = (
@app.cls(
gpu="A10G",
timeout=5 * MINUTES,
-container_idle_timeout=5 * MINUTES,
+scaledown_window=5 * MINUTES,
allow_concurrent_inputs=6,
image=image,
volumes={MODELS_DIR: volume},
@@ -107,7 +107,7 @@ class Transcriber:
@app.function(
-container_idle_timeout=60,
+scaledown_window=60,
timeout=60,
allow_concurrent_inputs=40,
secrets=[


@@ -6,7 +6,7 @@ Reflector GPU backend - transcriber
import os
import threading
-from modal import Image, Secret, App, asgi_app, method, enter
+from modal import App, Image, Secret, asgi_app, enter, method
from pydantic import BaseModel
# Seamless M4T
@@ -137,7 +137,7 @@ transcriber_image = (
@app.cls(
gpu="A10G",
timeout=60 * 5,
-container_idle_timeout=60 * 5,
+scaledown_window=60 * 5,
allow_concurrent_inputs=4,
image=transcriber_image,
)
@@ -169,195 +169,195 @@ class Translator:
# TODO: Enhance with complete list of lang codes
seamless_lang_code = {
# Afrikaans
-'af': 'afr',
+"af": "afr",
# Amharic
-'am': 'amh',
+"am": "amh",
# Modern Standard Arabic
-'ar': 'arb',
+"ar": "arb",
# Moroccan Arabic
-'ary': 'ary',
+"ary": "ary",
# Egyptian Arabic
-'arz': 'arz',
+"arz": "arz",
# Assamese
-'as': 'asm',
+"as": "asm",
# North Azerbaijani
-'az': 'azj',
+"az": "azj",
# Belarusian
-'be': 'bel',
+"be": "bel",
# Bengali
-'bn': 'ben',
+"bn": "ben",
# Bosnian
-'bs': 'bos',
+"bs": "bos",
# Bulgarian
-'bg': 'bul',
+"bg": "bul",
# Catalan
-'ca': 'cat',
+"ca": "cat",
# Cebuano
-'ceb': 'ceb',
+"ceb": "ceb",
# Czech
-'cs': 'ces',
+"cs": "ces",
# Central Kurdish
-'ku': 'ckb',
+"ku": "ckb",
# Mandarin Chinese
-'cmn': 'cmn_Hant',
+"cmn": "cmn_Hant",
# Welsh
-'cy': 'cym',
+"cy": "cym",
# Danish
-'da': 'dan',
+"da": "dan",
# German
-'de': 'deu',
+"de": "deu",
# Greek
-'el': 'ell',
+"el": "ell",
# English
-'en': 'eng',
+"en": "eng",
# Estonian
-'et': 'est',
+"et": "est",
# Basque
-'eu': 'eus',
+"eu": "eus",
# Finnish
-'fi': 'fin',
+"fi": "fin",
# French
-'fr': 'fra',
+"fr": "fra",
# Irish
-'ga': 'gle',
+"ga": "gle",
# West Central Oromo,
-'gaz': 'gaz',
+"gaz": "gaz",
# Galician
-'gl': 'glg',
+"gl": "glg",
# Gujarati
-'gu': 'guj',
+"gu": "guj",
# Hebrew
-'he': 'heb',
+"he": "heb",
# Hindi
-'hi': 'hin',
+"hi": "hin",
# Croatian
-'hr': 'hrv',
+"hr": "hrv",
# Hungarian
-'hu': 'hun',
+"hu": "hun",
# Armenian
-'hy': 'hye',
+"hy": "hye",
# Igbo
-'ig': 'ibo',
+"ig": "ibo",
# Indonesian
-'id': 'ind',
+"id": "ind",
# Icelandic
-'is': 'isl',
+"is": "isl",
# Italian
-'it': 'ita',
+"it": "ita",
# Javanese
-'jv': 'jav',
+"jv": "jav",
# Japanese
-'ja': 'jpn',
+"ja": "jpn",
# Kannada
-'kn': 'kan',
+"kn": "kan",
# Georgian
-'ka': 'kat',
+"ka": "kat",
# Kazakh
-'kk': 'kaz',
+"kk": "kaz",
# Halh Mongolian
-'khk': 'khk',
+"khk": "khk",
# Khmer
-'km': 'khm',
+"km": "khm",
# Kyrgyz
-'ky': 'kir',
+"ky": "kir",
# Korean
-'ko': 'kor',
+"ko": "kor",
# Lao
-'lo': 'lao',
+"lo": "lao",
# Lithuanian
-'lt': 'lit',
+"lt": "lit",
# Ganda
-'lg': 'lug',
+"lg": "lug",
# Luo
-'luo': 'luo',
+"luo": "luo",
# Standard Latvian
-'lv': 'lvs',
+"lv": "lvs",
# Maithili
-'mai': 'mai',
+"mai": "mai",
# Malayalam
-'ml': 'mal',
+"ml": "mal",
# Marathi
-'mr': 'mar',
+"mr": "mar",
# Macedonian
-'mk': 'mkd',
+"mk": "mkd",
# Maltese
-'mt': 'mlt',
+"mt": "mlt",
# Meitei
-'mni': 'mni',
+"mni": "mni",
# Burmese
-'my': 'mya',
+"my": "mya",
# Dutch
-'nl': 'nld',
+"nl": "nld",
# Norwegian Nynorsk
-'nn': 'nno',
+"nn": "nno",
# Norwegian Bokmål
-'nb': 'nob',
+"nb": "nob",
# Nepali
-'ne': 'npi',
+"ne": "npi",
# Nyanja
-'ny': 'nya',
+"ny": "nya",
# Odia
-'or': 'ory',
+"or": "ory",
# Punjabi
-'pa': 'pan',
+"pa": "pan",
# Southern Pashto
-'pbt': 'pbt',
+"pbt": "pbt",
# Western Persian
-'pes': 'pes',
+"pes": "pes",
# Polish
-'pl': 'pol',
+"pl": "pol",
# Portuguese
-'pt': 'por',
+"pt": "por",
# Romanian
-'ro': 'ron',
+"ro": "ron",
# Russian
-'ru': 'rus',
+"ru": "rus",
# Slovak
-'sk': 'slk',
+"sk": "slk",
# Slovenian
-'sl': 'slv',
+"sl": "slv",
# Shona
-'sn': 'sna',
+"sn": "sna",
# Sindhi
-'sd': 'snd',
+"sd": "snd",
# Somali
-'so': 'som',
+"so": "som",
# Spanish
-'es': 'spa',
+"es": "spa",
# Serbian
-'sr': 'srp',
+"sr": "srp",
# Swedish
-'sv': 'swe',
+"sv": "swe",
# Swahili
-'sw': 'swh',
+"sw": "swh",
# Tamil
-'ta': 'tam',
+"ta": "tam",
# Telugu
-'te': 'tel',
+"te": "tel",
# Tajik
-'tg': 'tgk',
+"tg": "tgk",
# Tagalog
-'tl': 'tgl',
+"tl": "tgl",
# Thai
-'th': 'tha',
+"th": "tha",
# Turkish
-'tr': 'tur',
+"tr": "tur",
# Ukrainian
-'uk': 'ukr',
+"uk": "ukr",
# Urdu
-'ur': 'urd',
+"ur": "urd",
# Northern Uzbek
-'uz': 'uzn',
+"uz": "uzn",
# Vietnamese
-'vi': 'vie',
+"vi": "vie",
# Yoruba
-'yo': 'yor',
+"yo": "yor",
# Cantonese
-'yue': 'yue',
+"yue": "yue",
# Standard Malay
-'ms': 'zsm',
+"ms": "zsm",
# Zulu
-'zu': 'zul'
+"zu": "zul",
}
return seamless_lang_code.get(lang_code, "eng")
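
The quoting change above runs through the translator's language table, which maps short (mostly ISO 639-1) codes to Seamless M4T codes and defaults to English for anything unmapped. Illustrative lookups, using an excerpt of the mapping:

# Excerpt of the mapping shown above; unknown codes fall back to "eng".
seamless_lang_code = {"fr": "fra", "ja": "jpn", "uk": "ukr"}

def to_seamless(lang_code: str) -> str:
    return seamless_lang_code.get(lang_code, "eng")

assert to_seamless("fr") == "fra"
assert to_seamless("xx") == "eng"  # unmapped code -> English fallback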
@@ -381,7 +381,7 @@ class Translator:
@app.function(
-container_idle_timeout=60,
+scaledown_window=60,
timeout=60,
allow_concurrent_inputs=40,
secrets=[


@@ -53,7 +53,7 @@ app = modal.App("reflector-vllm-hermes3")
image=vllm_image,
gpu=modal.gpu.A100(count=N_GPU, size="40GB"),
timeout=60 * 5,
-container_idle_timeout=60 * 5,
+scaledown_window=60 * 5,
allow_concurrent_inputs=100,
secrets=[
modal.Secret.from_name("reflector-gpu"),
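
One note on scope: only container_idle_timeout is renamed in this commit; allow_concurrent_inputs is kept as-is, even though newer Modal releases move input concurrency to the @modal.concurrent decorator. A hedged sketch of that newer form, should a later upgrade touch it (assumes a modal version that ships @modal.concurrent; function name is illustrative):

import modal

app = modal.App("reflector-vllm-hermes3")

# Assumption: a modal release where concurrency is configured via a
# dedicated decorator rather than the allow_concurrent_inputs kwarg.
@app.function(timeout=60 * 5, scaledown_window=60 * 5)
@modal.concurrent(max_inputs=100)
def serve():
    ...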