From 5cb132cac7bc6a363e8e1ffce7c70841e9113111 Mon Sep 17 00:00:00 2001
From: projects-g <63178974+projects-g@users.noreply.github.com>
Date: Wed, 8 Nov 2023 22:02:48 +0530
Subject: [PATCH] fix loading shards from local cache (#313)

---
 server/gpu/modal/reflector_llm.py         | 6 ++++--
 server/gpu/modal/reflector_llm_zephyr.py  | 8 +++++---
 server/gpu/modal/reflector_transcriber.py | 3 ++-
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/server/gpu/modal/reflector_llm.py b/server/gpu/modal/reflector_llm.py
index 02feedb7..f1e9d166 100644
--- a/server/gpu/modal/reflector_llm.py
+++ b/server/gpu/modal/reflector_llm.py
@@ -81,7 +81,8 @@ class LLM:
             LLM_MODEL,
             torch_dtype=getattr(torch, LLM_TORCH_DTYPE),
             low_cpu_mem_usage=LLM_LOW_CPU_MEM_USAGE,
-            cache_dir=IMAGE_MODEL_DIR
+            cache_dir=IMAGE_MODEL_DIR,
+            local_files_only=True
         )
 
         # JSONFormer doesn't yet support generation configs
@@ -96,7 +97,8 @@ class LLM:
         print("Instance llm tokenizer")
         tokenizer = AutoTokenizer.from_pretrained(
             LLM_MODEL,
-            cache_dir=IMAGE_MODEL_DIR
+            cache_dir=IMAGE_MODEL_DIR,
+            local_files_only=True
         )
 
         # move model to gpu
diff --git a/server/gpu/modal/reflector_llm_zephyr.py b/server/gpu/modal/reflector_llm_zephyr.py
index cbb436b0..b101f5f2 100644
--- a/server/gpu/modal/reflector_llm_zephyr.py
+++ b/server/gpu/modal/reflector_llm_zephyr.py
@@ -17,7 +17,7 @@
 LLM_LOW_CPU_MEM_USAGE: bool = True
 LLM_TORCH_DTYPE: str = "bfloat16"
 LLM_MAX_NEW_TOKENS: int = 300
-IMAGE_MODEL_DIR = "/root/llm_models"
+IMAGE_MODEL_DIR = "/root/llm_models/zephyr"
 
 stub = Stub(name="reflector-llm-zephyr")
 
@@ -81,7 +81,8 @@ class LLM:
             LLM_MODEL,
             torch_dtype=getattr(torch, LLM_TORCH_DTYPE),
             low_cpu_mem_usage=LLM_LOW_CPU_MEM_USAGE,
-            cache_dir=IMAGE_MODEL_DIR
+            cache_dir=IMAGE_MODEL_DIR,
+            local_files_only=True
         )
 
         # JSONFormer doesn't yet support generation configs
@@ -96,7 +97,8 @@ class LLM:
         print("Instance llm tokenizer")
         tokenizer = AutoTokenizer.from_pretrained(
             LLM_MODEL,
-            cache_dir=IMAGE_MODEL_DIR
+            cache_dir=IMAGE_MODEL_DIR,
+            local_files_only=True
         )
         gen_cfg.pad_token_id = tokenizer.eos_token_id
         gen_cfg.eos_token_id = tokenizer.eos_token_id
diff --git a/server/gpu/modal/reflector_transcriber.py b/server/gpu/modal/reflector_transcriber.py
index bee9ccd1..4f746ded 100644
--- a/server/gpu/modal/reflector_transcriber.py
+++ b/server/gpu/modal/reflector_transcriber.py
@@ -95,7 +95,8 @@ class Transcriber:
             device=self.device,
             compute_type=WHISPER_COMPUTE_TYPE,
             num_workers=WHISPER_NUM_WORKERS,
-            download_root=WHISPER_MODEL_DIR
+            download_root=WHISPER_MODEL_DIR,
+            local_files_only=True
         )
 
     @method()
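
For context, a minimal sketch of the load pattern this patch relies on, assuming the Modal image populates IMAGE_MODEL_DIR with the model weights at image-build time (the download step, function names, and model id below are illustrative, not taken from the repo; local_files_only is the actual transformers keyword). With local_files_only=True, from_pretrained resolves every shard from the local cache and raises an error if one is missing, instead of reaching out to the Hugging Face Hub at container start:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    IMAGE_MODEL_DIR = "/root/llm_models"
    LLM_MODEL = "some-org/some-llm"  # hypothetical model id

    def download_model():
        # Image-build step: network is available, populate the baked-in cache.
        AutoModelForCausalLM.from_pretrained(LLM_MODEL, cache_dir=IMAGE_MODEL_DIR)
        AutoTokenizer.from_pretrained(LLM_MODEL, cache_dir=IMAGE_MODEL_DIR)

    def load_model():
        # Container start: read shards from the image-local cache only.
        # local_files_only=True fails fast rather than re-downloading.
        model = AutoModelForCausalLM.from_pretrained(
            LLM_MODEL,
            torch_dtype=torch.bfloat16,
            cache_dir=IMAGE_MODEL_DIR,
            local_files_only=True,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            LLM_MODEL,
            cache_dir=IMAGE_MODEL_DIR,
            local_files_only=True,
        )
        return model, tokenizer

faster-whisper's WhisperModel accepts the same local_files_only keyword (with download_root playing the role of cache_dir), which is why reflector_transcriber.py gets the analogous one-line change. The zephyr image additionally moves its cache to /root/llm_models/zephyr, presumably so its snapshot does not collide with the cache layout of the other LLM image.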