Mirror of https://github.com/Monadical-SAS/reflector.git, synced 2025-12-20 20:29:06 +00:00
feat: retake summary using NousResearch/Hermes-3-Llama-3.1-8B model (#415)
This feature adds a new Modal endpoint and a completely new way to build the summary.

## SummaryBuilder

The summary builder is based on a conversational model, where an exchange between the model and the user takes place. This allows more context to be included and better adherence to the rules (a sketch of this exchange follows this description). It requires an OpenAI-like chat completions endpoint (/v1/chat/completions).

## vLLM Hermes3

Unlike the previous deployment, this one uses vLLM, which provides an OpenAI-like completions endpoint out of the box. vLLM could also handle guided JSON generation, so jsonformer is not needed; in practice the model already follows a JSON schema quite well when it is simply asked to in the prompt.

## Conversion of long/short into summary builder

The builder identifies participants, finds key subjects, produces a summary for each, then produces a quick recap. The quick recap is used as the short_summary, while the markdown combining the quick recap, key subjects, and per-subject summaries is used for the long_summary. This is why the Next.js component has to be updated, to correctly style h1 and preserve the newlines in the markdown.
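To make the conversational flow concrete, here is a minimal client-side sketch, not the actual SummaryBuilder code from this PR: the deployment URL, prompts, and JSON schema are placeholders. It drives /v1/chat/completions in two turns, first asking for key subjects as prompt-enforced JSON, then asking for a quick recap in the same conversation.

```python
# Sketch of the conversational summary flow against an OpenAI-compatible
# /v1/chat/completions endpoint. URL, prompts, and schema are illustrative
# placeholders, not the SummaryBuilder implementation.
import json

from openai import OpenAI

client = OpenAI(
    base_url="https://example--reflector-vllm-hermes3-serve.modal.run/v1",  # placeholder
    api_key="REFLECTOR_GPU_APIKEY",  # placeholder token
)

messages = [
    {"role": "system", "content": "You summarize meeting transcripts."},
    {
        "role": "user",
        "content": (
            "Transcript:\n...\n\n"
            'Identify the key subjects and answer as JSON: {"subjects": ["..."]}'
        ),
    },
]

# First exchange: key subjects as JSON, with the schema stated in the prompt.
response = client.chat.completions.create(
    model="NousResearch/Hermes-3-Llama-3.1-8B",
    messages=messages,
    temperature=0.1,
)
subjects = json.loads(response.choices[0].message.content)["subjects"]

# Follow-up exchange in the same conversation: ask for a quick recap.
messages.append({"role": "assistant", "content": response.choices[0].message.content})
messages.append({"role": "user", "content": "Now write a one-paragraph quick recap."})
recap = client.chat.completions.create(
    model="NousResearch/Hermes-3-Llama-3.1-8B",
    messages=messages,
    temperature=0.1,
).choices[0].message.content
```

Carrying the conversation across turns is what lets later requests reuse the accumulated context, which is why an OpenAI-like chat endpoint is required rather than a single-shot completion.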
server/gpu/modal_deployments/reflector_vllm_hermes3.py (new file, 171 lines)
@@ -0,0 +1,171 @@
# # Run an OpenAI-Compatible vLLM Server

import modal

MODELS_DIR = "/llamas"
MODEL_NAME = "NousResearch/Hermes-3-Llama-3.1-8B"
N_GPU = 1


def download_llm():
    from huggingface_hub import snapshot_download

    print("Downloading LLM model")
    snapshot_download(
        MODEL_NAME,
        local_dir=f"{MODELS_DIR}/{MODEL_NAME}",
        ignore_patterns=[
            "*.pt",
            "*.bin",
            "*.pth",
            "original/*",
        ],  # Ensure safetensors
    )
    print("LLM model downloaded")


def move_cache():
    from transformers.utils import move_cache as transformers_move_cache

    transformers_move_cache()


vllm_image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install("vllm==0.5.3post1")
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .pip_install(
        # "accelerate==0.34.2",
        "einops==0.8.0",
        "hf-transfer~=0.1",
    )
    .run_function(download_llm)
    .run_function(move_cache)
    .pip_install(
        "bitsandbytes>=0.42.9",
    )
)

app = modal.App("reflector-vllm-hermes3")


@app.function(
    image=vllm_image,
    gpu=modal.gpu.A100(count=N_GPU, size="40GB"),
    timeout=60 * 5,
    container_idle_timeout=60 * 5,
    allow_concurrent_inputs=100,
    secrets=[
        modal.Secret.from_name("reflector-gpu"),
    ],
)
@modal.asgi_app()
def serve():
    import os

    import fastapi
    import vllm.entrypoints.openai.api_server as api_server
    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.engine.async_llm_engine import AsyncLLMEngine
    from vllm.entrypoints.logger import RequestLogger
    from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
    from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
    from vllm.usage.usage_lib import UsageContext

    TOKEN = os.environ["REFLECTOR_GPU_APIKEY"]

    # create a fastAPI app that uses vLLM's OpenAI-compatible router
    web_app = fastapi.FastAPI(
        title=f"OpenAI-compatible {MODEL_NAME} server",
        description="Run an OpenAI-compatible LLM server with vLLM on modal.com",
        version="0.0.1",
        docs_url="/docs",
    )

    # security: CORS middleware for external requests
    http_bearer = fastapi.security.HTTPBearer(
        scheme_name="Bearer Token",
        description="See code for authentication details.",
    )
    web_app.add_middleware(
        fastapi.middleware.cors.CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # security: inject dependency on authed routes
    async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
        if api_key.credentials != TOKEN:
            raise fastapi.HTTPException(
                status_code=fastapi.status.HTTP_401_UNAUTHORIZED,
                detail="Invalid authentication credentials",
            )
        return {"username": "authenticated_user"}

    router = fastapi.APIRouter(dependencies=[fastapi.Depends(is_authenticated)])

    # wrap vllm's router in auth router
    router.include_router(api_server.router)
    # add authed vllm to our fastAPI app
    web_app.include_router(router)

    engine_args = AsyncEngineArgs(
        model=MODELS_DIR + "/" + MODEL_NAME,
        tensor_parallel_size=N_GPU,
        gpu_memory_utilization=0.90,
        # max_model_len=8096,
        enforce_eager=False,  # capture the graph for faster inference, but slower cold starts (30s > 20s)
        # --- 4 bits load
        # quantization="bitsandbytes",
        # load_format="bitsandbytes",
    )

    engine = AsyncLLMEngine.from_engine_args(
        engine_args, usage_context=UsageContext.OPENAI_API_SERVER
    )

    model_config = get_model_config(engine)

    request_logger = RequestLogger(max_log_len=2048)

    api_server.openai_serving_chat = OpenAIServingChat(
        engine,
        model_config=model_config,
        served_model_names=[MODEL_NAME],
        chat_template=None,
        response_role="assistant",
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )
    api_server.openai_serving_completion = OpenAIServingCompletion(
        engine,
        model_config=model_config,
        served_model_names=[MODEL_NAME],
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )

    return web_app


def get_model_config(engine):
    import asyncio

    try:  # adapted from vLLM source -- https://github.com/vllm-project/vllm/blob/507ef787d85dec24490069ffceacbd6b161f4f72/vllm/entrypoints/openai/api_server.py#L235C1-L247C1
        event_loop = asyncio.get_running_loop()
    except RuntimeError:
        event_loop = None

    if event_loop is not None and event_loop.is_running():
        # If the current is instanced by Ray Serve,
        # there is already a running event loop
        model_config = event_loop.run_until_complete(engine.get_model_config())
    else:
        # When using single vLLM without engine_use_ray
        model_config = asyncio.run(engine.get_model_config())

    return model_config
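For reference, a hedged sketch of how the deployed app could be queried once it is live on Modal; the base URL is a workspace-specific placeholder and the payload is illustrative only. It shows the bearer-token authentication that serve() enforces in front of vLLM's OpenAI-compatible router.

```python
# Sketch of a raw HTTP call with the bearer-token auth the serve() app expects.
# The Modal URL below is a placeholder for the workspace-specific one.
import os

import requests

BASE_URL = "https://example--reflector-vllm-hermes3-serve.modal.run"  # placeholder
headers = {"Authorization": f"Bearer {os.environ['REFLECTOR_GPU_APIKEY']}"}

payload = {
    "model": "NousResearch/Hermes-3-Llama-3.1-8B",
    "messages": [{"role": "user", "content": "Give a one-sentence summary of vLLM."}],
    "max_tokens": 64,
}
resp = requests.post(f"{BASE_URL}/v1/chat/completions", json=payload, headers=headers)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```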