From 012390d0aac7fe5bc88f390a7ee358a3c2d42132 Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Wed, 30 Aug 2023 10:43:51 +0530 Subject: [PATCH 1/6] backup --- server/gpu/modal/reflector_llm.py | 2 +- server/gpu/modal/reflector_transcriber.py | 42 +++++++++++++++++++---- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/server/gpu/modal/reflector_llm.py b/server/gpu/modal/reflector_llm.py index 1a3f77d6..89580466 100644 --- a/server/gpu/modal/reflector_llm.py +++ b/server/gpu/modal/reflector_llm.py @@ -38,7 +38,7 @@ def migrate_cache_llm(): from transformers.utils.hub import move_cache print("Moving LLM cache") - move_cache() + move_cache(cache_dir=IMAGE_MODEL_DIR) print("LLM cache moved") diff --git a/server/gpu/modal/reflector_transcriber.py b/server/gpu/modal/reflector_transcriber.py index f06706c8..84b24bb7 100644 --- a/server/gpu/modal/reflector_transcriber.py +++ b/server/gpu/modal/reflector_transcriber.py @@ -13,19 +13,40 @@ from pydantic import BaseModel WHISPER_MODEL: str = "large-v2" WHISPER_COMPUTE_TYPE: str = "float16" WHISPER_NUM_WORKERS: int = 1 -WHISPER_CACHE_DIR: str = "/cache/whisper" + +MODEL_DIR = "/model" # Translation Model TRANSLATION_MODEL = "facebook/m2m100_418M" -stub = Stub(name="reflector-transcriber") +stub = Stub(name="reflector-transtest") -def download_whisper(): +def download_models(): from faster_whisper.utils import download_model + from huggingface_hub import snapshot_download - download_model(WHISPER_MODEL, local_files_only=False) + print("Downloading Whisper model") + download_model(WHISPER_MODEL) + print("Whisper model downloaded") + print("Downloading Translation model") + ignore_patterns = ["*.ot"] + snapshot_download(TRANSLATION_MODEL, cache_dir=MODEL_DIR, ignore_patterns=ignore_patterns) + print("Translation model downloaded") + +def migrate_cache_llm(): + """ + XXX The cache for model files in Transformers v4.22.0 has been updated. + Migrating your old cache. This is a one-time only operation. You can + interrupt this and resume the migration later on by calling + `transformers.utils.move_cache()`. + """ + from transformers.utils.hub import move_cache + + print("Moving LLM cache") + move_cache() + print("LLM cache moved") whisper_image = ( Image.debian_slim(python_version="3.10.8") @@ -38,7 +59,8 @@ whisper_image = ( "sentencepiece", "protobuf", ) - .run_function(download_whisper) + .run_function(download_models) + .run_function(migrate_cache_llm) .env( { "LD_LIBRARY_PATH": ( @@ -69,8 +91,14 @@ class Whisper: compute_type=WHISPER_COMPUTE_TYPE, num_workers=WHISPER_NUM_WORKERS, ) - self.translation_model = M2M100ForConditionalGeneration.from_pretrained(TRANSLATION_MODEL).to(self.device) - self.translation_tokenizer = M2M100Tokenizer.from_pretrained(TRANSLATION_MODEL) + self.translation_model = M2M100ForConditionalGeneration.from_pretrained( + TRANSLATION_MODEL, + cache_dir=TRANSCRIPTION_MODEL_DIR + ).to(self.device) + self.translation_tokenizer = M2M100Tokenizer.from_pretrained( + TRANSLATION_MODEL, + cache_dir=TRANSCRIPTION_MODEL_DIR + ) @method() From 61e24969e42008d23fcf0128970244cd8e237cd7 Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Wed, 30 Aug 2023 13:00:42 +0530 Subject: [PATCH 2/6] change model download --- server/gpu/modal/reflector_transcriber.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/server/gpu/modal/reflector_transcriber.py b/server/gpu/modal/reflector_transcriber.py index 84b24bb7..8641f12f 100644 --- a/server/gpu/modal/reflector_transcriber.py +++ b/server/gpu/modal/reflector_transcriber.py @@ -18,6 +18,7 @@ MODEL_DIR = "/model" # Translation Model TRANSLATION_MODEL = "facebook/m2m100_418M" +TRANSLATION_MODEL_DIR = "translation" stub = Stub(name="reflector-transtest") @@ -30,11 +31,6 @@ def download_models(): download_model(WHISPER_MODEL) print("Whisper model downloaded") - print("Downloading Translation model") - ignore_patterns = ["*.ot"] - snapshot_download(TRANSLATION_MODEL, cache_dir=MODEL_DIR, ignore_patterns=ignore_patterns) - print("Translation model downloaded") - def migrate_cache_llm(): """ XXX The cache for model files in Transformers v4.22.0 has been updated. @@ -48,6 +44,14 @@ def migrate_cache_llm(): move_cache() print("LLM cache moved") +def download_translation_model(): + from huggingface_hub import snapshot_download + + print("Downloading Translation model") + ignore_patterns = ["*.ot"] + snapshot_download(TRANSLATION_MODEL, cache_dir=MODEL_DIR, ignore_patterns=ignore_patterns) + print("Translation model downloaded") + whisper_image = ( Image.debian_slim(python_version="3.10.8") .apt_install("git") @@ -58,6 +62,7 @@ whisper_image = ( "transformers", "sentencepiece", "protobuf", + "huggingface_hub==0.16.4", ) .run_function(download_models) .run_function(migrate_cache_llm) @@ -92,13 +97,9 @@ class Whisper: num_workers=WHISPER_NUM_WORKERS, ) self.translation_model = M2M100ForConditionalGeneration.from_pretrained( - TRANSLATION_MODEL, - cache_dir=TRANSCRIPTION_MODEL_DIR + TRANSLATION_MODEL_DIR ).to(self.device) - self.translation_tokenizer = M2M100Tokenizer.from_pretrained( - TRANSLATION_MODEL, - cache_dir=TRANSCRIPTION_MODEL_DIR - ) + self.translation_tokenizer = M2M100Tokenizer.from_pretrained(TRANSLATION_MODEL) @method() From 6b84bbb4f6265b1268809c85c48cd2598820d0c8 Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Tue, 5 Sep 2023 12:52:07 +0530 Subject: [PATCH 3/6] download transcriber model --- server/gpu/modal/reflector_transcriber.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/server/gpu/modal/reflector_transcriber.py b/server/gpu/modal/reflector_transcriber.py index 8641f12f..996a73d7 100644 --- a/server/gpu/modal/reflector_transcriber.py +++ b/server/gpu/modal/reflector_transcriber.py @@ -23,14 +23,23 @@ TRANSLATION_MODEL_DIR = "translation" stub = Stub(name="reflector-transtest") -def download_models(): +def download_whisper(): from faster_whisper.utils import download_model - from huggingface_hub import snapshot_download print("Downloading Whisper model") - download_model(WHISPER_MODEL) + download_model(WHISPER_MODEL, cache_dir=MODEL_DIR) print("Whisper model downloaded") + +def download_translation_model(): + from huggingface_hub import snapshot_download + + print("Downloading Translation model") + ignore_patterns = ["*.ot"] + snapshot_download(TRANSLATION_MODEL, local_dir=MODEL_DIR, ignore_patterns=ignore_patterns) + print("Translation model downloaded") + + def migrate_cache_llm(): """ XXX The cache for model files in Transformers v4.22.0 has been updated. @@ -44,13 +53,6 @@ def migrate_cache_llm(): move_cache() print("LLM cache moved") -def download_translation_model(): - from huggingface_hub import snapshot_download - - print("Downloading Translation model") - ignore_patterns = ["*.ot"] - snapshot_download(TRANSLATION_MODEL, cache_dir=MODEL_DIR, ignore_patterns=ignore_patterns) - print("Translation model downloaded") whisper_image = ( Image.debian_slim(python_version="3.10.8") From e613157fd64a78b94550a4d8753ec23899510f3d Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Tue, 5 Sep 2023 14:28:48 +0530 Subject: [PATCH 4/6] update to use cache dir --- server/gpu/modal/reflector_transcriber.py | 27 ++++++++++++++--------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/server/gpu/modal/reflector_transcriber.py b/server/gpu/modal/reflector_transcriber.py index 996a73d7..e97a90d4 100644 --- a/server/gpu/modal/reflector_transcriber.py +++ b/server/gpu/modal/reflector_transcriber.py @@ -14,32 +14,36 @@ WHISPER_MODEL: str = "large-v2" WHISPER_COMPUTE_TYPE: str = "float16" WHISPER_NUM_WORKERS: int = 1 -MODEL_DIR = "/model" - # Translation Model TRANSLATION_MODEL = "facebook/m2m100_418M" -TRANSLATION_MODEL_DIR = "translation" + +MODEL_DIR = "model" stub = Stub(name="reflector-transtest") -def download_whisper(): +def download_whisper(cache_dir: str = None): from faster_whisper.utils import download_model print("Downloading Whisper model") - download_model(WHISPER_MODEL, cache_dir=MODEL_DIR) + download_model(WHISPER_MODEL, cache_dir=cache_dir) print("Whisper model downloaded") -def download_translation_model(): +def download_translation_model(cache_dir: str = None): from huggingface_hub import snapshot_download print("Downloading Translation model") ignore_patterns = ["*.ot"] - snapshot_download(TRANSLATION_MODEL, local_dir=MODEL_DIR, ignore_patterns=ignore_patterns) + snapshot_download(TRANSLATION_MODEL, cache_dir=cache_dir, ignore_patterns=ignore_patterns) print("Translation model downloaded") +def download_models(): + download_whisper(cache_dir=MODEL_DIR) + download_translation_model(cache_dir=MODEL_DIR) + + def migrate_cache_llm(): """ XXX The cache for model files in Transformers v4.22.0 has been updated. @@ -99,10 +103,13 @@ class Whisper: num_workers=WHISPER_NUM_WORKERS, ) self.translation_model = M2M100ForConditionalGeneration.from_pretrained( - TRANSLATION_MODEL_DIR + TRANSLATION_MODEL, + cache_dir=MODEL_DIR ).to(self.device) - self.translation_tokenizer = M2M100Tokenizer.from_pretrained(TRANSLATION_MODEL) - + self.translation_tokenizer = M2M100Tokenizer.from_pretrained( + TRANSLATION_MODEL, + cache_dir=MODEL_DIR + ) @method() def warmup(self): From 2bed312e64997403da4737e8bbc70cf9893e2abf Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Fri, 8 Sep 2023 00:22:38 +0530 Subject: [PATCH 5/6] persistent model storage --- server/gpu/modal/reflector_llm.py | 20 +++++++---- server/gpu/modal/reflector_transcriber.py | 43 ++++++++++------------- 2 files changed, 32 insertions(+), 31 deletions(-) diff --git a/server/gpu/modal/reflector_llm.py b/server/gpu/modal/reflector_llm.py index 89580466..0299280d 100644 --- a/server/gpu/modal/reflector_llm.py +++ b/server/gpu/modal/reflector_llm.py @@ -7,6 +7,7 @@ import json import os from typing import Optional +import modal from modal import Image, Secret, Stub, asgi_app, method # LLM @@ -15,16 +16,17 @@ LLM_LOW_CPU_MEM_USAGE: bool = True LLM_TORCH_DTYPE: str = "bfloat16" LLM_MAX_NEW_TOKENS: int = 300 -IMAGE_MODEL_DIR = "/model" +IMAGE_MODEL_DIR = "/root/llm_models" +volume = modal.NetworkFileSystem.persisted("reflector-llm-models") -stub = Stub(name="reflector-llm") +stub = Stub(name="reflector-llmtest1") def download_llm(): from huggingface_hub import snapshot_download print("Downloading LLM model") - snapshot_download(LLM_MODEL, local_dir=IMAGE_MODEL_DIR) + snapshot_download(LLM_MODEL, cache_dir=IMAGE_MODEL_DIR) print("LLM model downloaded") @@ -38,7 +40,7 @@ def migrate_cache_llm(): from transformers.utils.hub import move_cache print("Moving LLM cache") - move_cache(cache_dir=IMAGE_MODEL_DIR) + move_cache() print("LLM cache moved") @@ -58,7 +60,6 @@ llm_image = ( ) .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) .run_function(download_llm) - .run_function(migrate_cache_llm) ) @@ -68,6 +69,7 @@ llm_image = ( container_idle_timeout=60 * 5, concurrency_limit=2, image=llm_image, + network_file_systems={IMAGE_MODEL_DIR: volume}, ) class LLM: def __enter__(self): @@ -77,9 +79,10 @@ class LLM: print("Instance llm model") model = AutoModelForCausalLM.from_pretrained( - IMAGE_MODEL_DIR, + LLM_MODEL, torch_dtype=getattr(torch, LLM_TORCH_DTYPE), low_cpu_mem_usage=LLM_LOW_CPU_MEM_USAGE, + cache_dir=IMAGE_MODEL_DIR ) # generation configuration @@ -91,7 +94,10 @@ class LLM: # load tokenizer print("Instance llm tokenizer") - tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL) + tokenizer = AutoTokenizer.from_pretrained( + LLM_MODEL, + cache_dir=IMAGE_MODEL_DIR + ) # move model to gpu print("Move llm model to GPU") diff --git a/server/gpu/modal/reflector_transcriber.py b/server/gpu/modal/reflector_transcriber.py index e97a90d4..ecfc8a3f 100644 --- a/server/gpu/modal/reflector_transcriber.py +++ b/server/gpu/modal/reflector_transcriber.py @@ -6,6 +6,7 @@ Reflector GPU backend - transcriber import os import tempfile +import modal from modal import Image, Secret, Stub, asgi_app, method from pydantic import BaseModel @@ -17,12 +18,13 @@ WHISPER_NUM_WORKERS: int = 1 # Translation Model TRANSLATION_MODEL = "facebook/m2m100_418M" -MODEL_DIR = "model" +IMAGE_MODEL_DIR = "/root/transcription_models" +volume = modal.NetworkFileSystem.persisted("reflector-transcribe-models") -stub = Stub(name="reflector-transtest") +stub = Stub(name="reflector-transtest1") -def download_whisper(cache_dir: str = None): +def download_whisper(cache_dir: str | None = None): from faster_whisper.utils import download_model print("Downloading Whisper model") @@ -30,32 +32,24 @@ def download_whisper(cache_dir: str = None): print("Whisper model downloaded") -def download_translation_model(cache_dir: str = None): +def download_translation_model(cache_dir: str | None = None): from huggingface_hub import snapshot_download print("Downloading Translation model") ignore_patterns = ["*.ot"] - snapshot_download(TRANSLATION_MODEL, cache_dir=cache_dir, ignore_patterns=ignore_patterns) + snapshot_download( + TRANSLATION_MODEL, + cache_dir=cache_dir, + ignore_patterns=ignore_patterns + ) print("Translation model downloaded") def download_models(): - download_whisper(cache_dir=MODEL_DIR) - download_translation_model(cache_dir=MODEL_DIR) - - -def migrate_cache_llm(): - """ - XXX The cache for model files in Transformers v4.22.0 has been updated. - Migrating your old cache. This is a one-time only operation. You can - interrupt this and resume the migration later on by calling - `transformers.utils.move_cache()`. - """ - from transformers.utils.hub import move_cache - - print("Moving LLM cache") - move_cache() - print("LLM cache moved") + print(f"Downloading models to {IMAGE_MODEL_DIR=}") + download_whisper(cache_dir=IMAGE_MODEL_DIR) + download_translation_model(cache_dir=IMAGE_MODEL_DIR) + print(f"Model downloads complete.") whisper_image = ( @@ -71,7 +65,6 @@ whisper_image = ( "huggingface_hub==0.16.4", ) .run_function(download_models) - .run_function(migrate_cache_llm) .env( { "LD_LIBRARY_PATH": ( @@ -87,6 +80,7 @@ whisper_image = ( gpu="A10G", container_idle_timeout=60, image=whisper_image, + network_file_systems={IMAGE_MODEL_DIR: volume}, ) class Whisper: def __enter__(self): @@ -101,14 +95,15 @@ class Whisper: device=self.device, compute_type=WHISPER_COMPUTE_TYPE, num_workers=WHISPER_NUM_WORKERS, + download_root=IMAGE_MODEL_DIR ) self.translation_model = M2M100ForConditionalGeneration.from_pretrained( TRANSLATION_MODEL, - cache_dir=MODEL_DIR + cache_dir=IMAGE_MODEL_DIR ).to(self.device) self.translation_tokenizer = M2M100Tokenizer.from_pretrained( TRANSLATION_MODEL, - cache_dir=MODEL_DIR + cache_dir=IMAGE_MODEL_DIR ) @method() From 9a7b89adaa353a600c16655290cc0c0acb759003 Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Fri, 8 Sep 2023 10:05:17 +0530 Subject: [PATCH 6/6] keep models in cache and load from cache --- server/gpu/modal/reflector_llm.py | 7 +++---- server/gpu/modal/reflector_transcriber.py | 19 ++++++++++++++++--- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/server/gpu/modal/reflector_llm.py b/server/gpu/modal/reflector_llm.py index 0299280d..9e20ff00 100644 --- a/server/gpu/modal/reflector_llm.py +++ b/server/gpu/modal/reflector_llm.py @@ -17,9 +17,8 @@ LLM_TORCH_DTYPE: str = "bfloat16" LLM_MAX_NEW_TOKENS: int = 300 IMAGE_MODEL_DIR = "/root/llm_models" -volume = modal.NetworkFileSystem.persisted("reflector-llm-models") -stub = Stub(name="reflector-llmtest1") +stub = Stub(name="reflector-llm") def download_llm(): @@ -40,7 +39,7 @@ def migrate_cache_llm(): from transformers.utils.hub import move_cache print("Moving LLM cache") - move_cache() + move_cache(cache_dir=IMAGE_MODEL_DIR, new_cache_dir=IMAGE_MODEL_DIR) print("LLM cache moved") @@ -60,6 +59,7 @@ llm_image = ( ) .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) .run_function(download_llm) + .run_function(migrate_cache_llm) ) @@ -69,7 +69,6 @@ llm_image = ( container_idle_timeout=60 * 5, concurrency_limit=2, image=llm_image, - network_file_systems={IMAGE_MODEL_DIR: volume}, ) class LLM: def __enter__(self): diff --git a/server/gpu/modal/reflector_transcriber.py b/server/gpu/modal/reflector_transcriber.py index ecfc8a3f..ff4caff7 100644 --- a/server/gpu/modal/reflector_transcriber.py +++ b/server/gpu/modal/reflector_transcriber.py @@ -19,9 +19,8 @@ WHISPER_NUM_WORKERS: int = 1 TRANSLATION_MODEL = "facebook/m2m100_418M" IMAGE_MODEL_DIR = "/root/transcription_models" -volume = modal.NetworkFileSystem.persisted("reflector-transcribe-models") -stub = Stub(name="reflector-transtest1") +stub = Stub(name="reflector-transcriber") def download_whisper(cache_dir: str | None = None): @@ -52,6 +51,20 @@ def download_models(): print(f"Model downloads complete.") +def migrate_cache_llm(): + """ + XXX The cache for model files in Transformers v4.22.0 has been updated. + Migrating your old cache. This is a one-time only operation. You can + interrupt this and resume the migration later on by calling + `transformers.utils.move_cache()`. + """ + from transformers.utils.hub import move_cache + + print("Moving LLM cache") + move_cache(cache_dir=IMAGE_MODEL_DIR, new_cache_dir=IMAGE_MODEL_DIR) + print("LLM cache moved") + + whisper_image = ( Image.debian_slim(python_version="3.10.8") .apt_install("git") @@ -65,6 +78,7 @@ whisper_image = ( "huggingface_hub==0.16.4", ) .run_function(download_models) + .run_function(migrate_cache_llm) .env( { "LD_LIBRARY_PATH": ( @@ -80,7 +94,6 @@ whisper_image = ( gpu="A10G", container_idle_timeout=60, image=whisper_image, - network_file_systems={IMAGE_MODEL_DIR: volume}, ) class Whisper: def __enter__(self):