Fix loading of model shards from the local cache (#313)

2025-12-20 20:29:06 +00:00 · 2023-11-08 22:02:48 +05:30
parent 6282583d92
commit 5cb132cac7
3 changed files with 11 additions and 6 deletions
--- a/server/gpu/modal/reflector_llm_zephyr.py
+++ b/server/gpu/modal/reflector_llm_zephyr.py
@@ -17,7 +17,7 @@ LLM_LOW_CPU_MEM_USAGE: bool = True
 LLM_TORCH_DTYPE: str = "bfloat16"
 LLM_MAX_NEW_TOKENS: int = 300

-IMAGE_MODEL_DIR = "/root/llm_models"
+IMAGE_MODEL_DIR = "/root/llm_models/zephyr"

 stub = Stub(name="reflector-llm-zephyr")

@@ -81,7 +81,8 @@ class LLM:
            LLM_MODEL,
            torch_dtype=getattr(torch, LLM_TORCH_DTYPE),
            low_cpu_mem_usage=LLM_LOW_CPU_MEM_USAGE,
-            cache_dir=IMAGE_MODEL_DIR
+            cache_dir=IMAGE_MODEL_DIR,
+            local_files_only=True
        )

        # JSONFormer doesn't yet support generation configs
@@ -96,7 +97,8 @@ class LLM:
        print("Instance llm tokenizer")
        tokenizer = AutoTokenizer.from_pretrained(
            LLM_MODEL,
-            cache_dir=IMAGE_MODEL_DIR
+            cache_dir=IMAGE_MODEL_DIR,
+            local_files_only=True
        )
        gen_cfg.pad_token_id = tokenizer.eos_token_id
        gen_cfg.eos_token_id = tokenizer.eos_token_id