diff --git a/server/gpu/modal_deployments/reflector_diarizer.py b/server/gpu/modal_deployments/reflector_diarizer.py
index ddab9950..db46b83c 100644
--- a/server/gpu/modal_deployments/reflector_diarizer.py
+++ b/server/gpu/modal_deployments/reflector_diarizer.py
@@ -6,12 +6,12 @@ Reflector GPU backend - diarizer
 import os
 import modal.gpu
-from modal import Image, Secret, Stub, asgi_app, method
+from modal import Image, Secret, App, asgi_app, method, enter
 from pydantic import BaseModel
 
-PYANNOTE_MODEL_NAME: str = "pyannote/speaker-diarization-3.0"
+PYANNOTE_MODEL_NAME: str = "pyannote/speaker-diarization-3.1"
 MODEL_DIR = "/root/diarization_models"
 
-stub = Stub(name="reflector-diarizer")
+app = App(name="reflector-diarizer")
 
 
 def migrate_cache_llm():
@@ -33,7 +33,6 @@ def download_pyannote_audio():
 
     Pipeline.from_pretrained(
-        "pyannote/speaker-diarization-3.0",
+        PYANNOTE_MODEL_NAME,
         cache_dir=MODEL_DIR,
-        use_auth_token=os.environ["HF_TOKEN"]
     )
 
@@ -54,7 +53,7 @@ diarizer_image = (
         "hf-transfer"
     )
     .run_function(migrate_cache_llm)
-    .run_function(download_pyannote_audio, secrets=[modal.Secret.from_name("my-huggingface-secret")])
+    .run_function(download_pyannote_audio)
     .env(
         {
             "LD_LIBRARY_PATH": (
@@ -66,16 +65,16 @@ diarizer_image = (
 )
 
 
-@stub.cls(
+@app.cls(
     gpu=modal.gpu.A100(memory=40),
     timeout=60 * 30,
     container_idle_timeout=60,
     allow_concurrent_inputs=1,
     image=diarizer_image,
-    secrets=[modal.Secret.from_name("my-huggingface-secret")],
 )
 class Diarizer:
-    def __enter__(self):
+    @enter()
+    def enter(self):
         import torch
         from pyannote.audio import Pipeline
 
@@ -124,7 +123,7 @@
 # -------------------------------------------------------------------
 
 
-@stub.function(
+@app.function(
     timeout=60 * 10,
     container_idle_timeout=60 * 3,
     allow_concurrent_inputs=40,
diff --git a/server/gpu/modal_deployments/reflector_llm.py b/server/gpu/modal_deployments/reflector_llm.py
index f1e9d166..8faf5909 100644
--- a/server/gpu/modal_deployments/reflector_llm.py
+++ b/server/gpu/modal_deployments/reflector_llm.py
@@ -9,7 +9,7 @@ import threading
 from typing import Optional
 
 import modal
-from modal import Image, Secret, Stub, asgi_app, method
+from modal import Image, Secret, App, asgi_app, method, enter, exit
 
 # LLM
 LLM_MODEL: str = "lmsys/vicuna-13b-v1.5"
@@ -19,7 +19,7 @@ LLM_MAX_NEW_TOKENS: int = 300
 
 IMAGE_MODEL_DIR = "/root/llm_models"
 
-stub = Stub(name="reflector-llm")
+app = App(name="reflector-llm")
 
 
 def download_llm():
@@ -64,7 +64,7 @@ llm_image = (
 )
 
 
-@stub.cls(
+@app.cls(
     gpu="A100",
     timeout=60 * 5,
     container_idle_timeout=60 * 5,
@@ -72,7 +72,8 @@ llm_image = (
     image=llm_image,
 )
 class LLM:
-    def __enter__(self):
+    @enter()
+    def enter(self):
         import torch
         from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
 
@@ -113,7 +114,8 @@
 
         self.lock = threading.Lock()
 
-    def __exit__(self, *args):
+    @exit()
+    def exit(self):
         print("Exit llm")
 
     @method()
@@ -161,7 +163,7 @@
 # -------------------------------------------------------------------
 
 
-@stub.function(
+@app.function(
     container_idle_timeout=60 * 10,
     timeout=60 * 5,
     allow_concurrent_inputs=45,
diff --git a/server/gpu/modal_deployments/reflector_llm_zephyr.py b/server/gpu/modal_deployments/reflector_llm_zephyr.py
index b101f5f2..18608acd 100644
--- a/server/gpu/modal_deployments/reflector_llm_zephyr.py
+++ b/server/gpu/modal_deployments/reflector_llm_zephyr.py
@@ -9,7 +9,7 @@ import threading
 from typing import Optional
 
 import modal
-from modal import Image, Secret, Stub, asgi_app, method
+from modal import Image, Secret, App, asgi_app, method, enter, exit
 
 # LLM
 LLM_MODEL: str = "HuggingFaceH4/zephyr-7b-alpha"
@@ -19,7 +19,7 @@ LLM_MAX_NEW_TOKENS: int = 300
 
 IMAGE_MODEL_DIR = "/root/llm_models/zephyr"
 
-stub = Stub(name="reflector-llm-zephyr")
+app = App(name="reflector-llm-zephyr")
 
 
 def download_llm():
@@ -64,7 +64,7 @@ llm_image = (
 )
 
 
-@stub.cls(
+@app.cls(
     gpu="A10G",
     timeout=60 * 5,
     container_idle_timeout=60 * 5,
@@ -72,7 +72,8 @@ llm_image = (
     image=llm_image,
 )
 class LLM:
-    def __enter__(self):
+    @enter()
+    def enter(self):
         import torch
         from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
 
@@ -116,7 +117,8 @@
         self.GenerationConfig = GenerationConfig
         self.lock = threading.Lock()
 
-    def __exit__(self, *args):
+    @exit()
+    def exit(self):
         print("Exit llm")
 
     @method()
@@ -169,7 +171,7 @@
 # -------------------------------------------------------------------
 
 
-@stub.function(
+@app.function(
     container_idle_timeout=60 * 10,
     timeout=60 * 5,
     allow_concurrent_inputs=30,
diff --git a/server/gpu/modal_deployments/reflector_transcriber.py b/server/gpu/modal_deployments/reflector_transcriber.py
index 4f746ded..5b7cb351 100644
--- a/server/gpu/modal_deployments/reflector_transcriber.py
+++ b/server/gpu/modal_deployments/reflector_transcriber.py
@@ -7,7 +7,7 @@ import os
 import tempfile
 import threading
 
-from modal import Image, Secret, Stub, asgi_app, method
+from modal import Image, Secret, App, asgi_app, method, enter
 from pydantic import BaseModel
 
 # Whisper
@@ -18,7 +18,7 @@ WHISPER_NUM_WORKERS: int = 1
 
 WHISPER_MODEL_DIR = "/root/transcription_models"
 
-stub = Stub(name="reflector-transcriber")
+app = App(name="reflector-transcriber")
 
 
 def download_whisper():
@@ -75,7 +75,7 @@ transcriber_image = (
 )
 
 
-@stub.cls(
+@app.cls(
     gpu="A10G",
     timeout=60 * 5,
     container_idle_timeout=60 * 5,
@@ -83,7 +83,8 @@ transcriber_image = (
     image=transcriber_image,
 )
 class Transcriber:
-    def __enter__(self):
+    @enter()
+    def enter(self):
         import faster_whisper
         import torch
 
@@ -149,7 +150,7 @@
 # -------------------------------------------------------------------
 
 
-@stub.function(
+@app.function(
     container_idle_timeout=60,
     timeout=60,
     allow_concurrent_inputs=40,
diff --git a/server/gpu/modal_deployments/reflector_translator.py b/server/gpu/modal_deployments/reflector_translator.py
index 8e920a5a..a21c33fe 100644
--- a/server/gpu/modal_deployments/reflector_translator.py
+++ b/server/gpu/modal_deployments/reflector_translator.py
@@ -6,7 +6,7 @@ Reflector GPU backend - transcriber
 import os
 import threading
 
-from modal import Image, Secret, Stub, asgi_app, method
+from modal import Image, Secret, App, asgi_app, method, enter
 from pydantic import BaseModel
 
 # Seamless M4T
@@ -20,7 +20,7 @@ HF_SEAMLESS_M4T_VOCODEREPO: str = "facebook/seamless-m4t-vocoder"
 SEAMLESS_GITEPO: str = "https://github.com/facebookresearch/seamless_communication.git"
 SEAMLESS_MODEL_DIR: str = "m4t"
 
-stub = Stub(name="reflector-translator")
+app = App(name="reflector-translator")
 
 
 def install_seamless_communication():
@@ -134,7 +134,7 @@ transcriber_image = (
 )
 
 
-@stub.cls(
+@app.cls(
     gpu="A10G",
     timeout=60 * 5,
     container_idle_timeout=60 * 5,
@@ -142,7 +142,8 @@ transcriber_image = (
     image=transcriber_image,
 )
 class Translator:
-    def __enter__(self):
+    @enter()
+    def enter(self):
         import torch
 
         from seamless_communication.inference.translator import Translator
@@ -379,7 +380,7 @@
 # -------------------------------------------------------------------
 
 
-@stub.function(
+@app.function(
     container_idle_timeout=60,
     timeout=60,
     allow_concurrent_inputs=40,
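
Every file above gets the same mechanical migration: modal.Stub becomes modal.App, and the __enter__/__exit__ container-lifecycle methods become regular methods decorated with @enter()/@exit(). For reference, a minimal sketch of the pattern, assuming a modal release where Stub has been renamed to App and the lifecycle decorators are available; the app name, image, and method bodies below are illustrative, not part of this diff:

    from modal import App, Image, enter, exit, method

    app = App(name="example-app")  # was: stub = Stub(name="example-app")


    @app.cls(
        timeout=60 * 5,
        container_idle_timeout=60 * 5,
        image=Image.debian_slim(),
    )
    class Model:
        @enter()  # was: def __enter__(self):
        def load(self):
            # Runs once when the container boots; load models/state here.
            self.ready = True

        @method()
        def predict(self, text: str) -> str:
            # Hypothetical remote method, invoked as Model().predict.remote(...).
            return text.upper()

        @exit()  # was: def __exit__(self, *args):
        def teardown(self):
            # Runs when the container shuts down.
            print("Exit")

Deployment via `modal deploy` is unchanged; only the object name and the lifecycle hooks move.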