reflector/server/reflector/llm/llm_modal.py
Mathieu Virbel 5267ab2d37 feat: retake summary using NousResearch/Hermes-3-Llama-3.1-8B model (#415)
This adds a new Modal endpoint and a completely new way to build the
summary.

## SummaryBuilder

The summary builder is based on a conversational model, where an
exchange takes place between the model and the user. This allows more
context to be included and better adherence to the rules.

It requires an OpenAI-compatible chat completions endpoint
(/v1/chat/completions).
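
A minimal sketch of such an exchange (the URL, API key, and prompts
below are placeholders, not the actual reflector configuration):

```python
# Minimal sketch of a multi-turn exchange against an OpenAI-compatible
# /v1/chat/completions endpoint. URL, key and prompts are placeholders.
import asyncio

import httpx

LLM_URL = "https://example.com/v1/chat/completions"  # placeholder
API_KEY = "<LLM_MODAL_API_KEY>"  # placeholder


async def ask(messages: list[dict]) -> str:
    async with httpx.AsyncClient() as client:
        response = await client.post(
            LLM_URL,
            headers={"Authorization": f"Bearer {API_KEY}"},
            json={
                "model": "NousResearch/Hermes-3-Llama-3.1-8B",
                "messages": messages,
                "temperature": 0.3,
            },
            timeout=120,
        )
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]


async def main():
    messages = [
        {"role": "system", "content": "You summarize meeting transcripts."},
        {"role": "user", "content": "Identify the participants: <transcript>"},
    ]
    reply = await ask(messages)
    # Feed the answer back so the next question keeps the full context
    messages += [
        {"role": "assistant", "content": reply},
        {"role": "user", "content": "Now list the key subjects discussed."},
    ]
    print(await ask(messages))


asyncio.run(main())
```

Each answer is appended back into the message list, which is what gives
the later questions their extra context.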

## vLLM Hermes3

Unlike the previous deployments, this one uses vLLM, which provides an
OpenAI-compatible completions endpoint out of the box. It can also
handle guided JSON generation, so jsonformer is not needed. That said,
the model is quite good at following a JSON schema when asked in the
prompt.
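
For reference, guided JSON with vLLM is requested through an extra
`guided_json` field in the request body (a vLLM extension, not part of
the OpenAI spec; the schema and prompt below are illustrative):

```python
# Illustrative payload for vLLM guided JSON generation. `guided_json`
# is a vLLM-specific extension; omitting it falls back to asking for
# the schema in the prompt, which Hermes-3 follows well.
schema = {
    "type": "object",
    "properties": {
        "participants": {"type": "array", "items": {"type": "string"}},
    },
    "required": ["participants"],
}

payload = {
    "model": "NousResearch/Hermes-3-Llama-3.1-8B",
    "messages": [
        {"role": "user", "content": "Extract the participants: <transcript>"},
    ],
    "guided_json": schema,  # vLLM-only field, not part of the OpenAI spec
}
```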

## Conversion of long/short into summary builder

The builder identifies the participants, finds key subjects, gets a
summary for each, then gets a quick recap.

The quick recap is used as the short_summary, while the markdown
combining the quick recap + key subjects + summaries is used for the
long_summary.
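
A hypothetical sketch of that assembly (function and variable names are
illustrative, not the actual builder API):

```python
# Hypothetical assembly of the two summary fields; the h1 headings and
# blank lines are what the updated Next.js component has to render.
def assemble(recap: str, subjects: list[tuple[str, str]]) -> tuple[str, str]:
    short_summary = recap
    parts = ["# Quick recap", "", recap, ""]
    for title, summary in subjects:
        parts += [f"# {title}", "", summary, ""]
    long_summary = "\n".join(parts)
    return short_summary, long_summary
```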

This is why the Next.js component has to be updated, to correctly style
h1 headings and keep the newlines of the markdown.
2024-09-14 02:28:38 +02:00

152 lines
5.1 KiB
Python

import httpx
from reflector.llm.base import LLM
from reflector.logger import logger as reflector_logger
from reflector.settings import settings
from reflector.utils.retry import retry
from transformers import AutoTokenizer, GenerationConfig


class ModalLLM(LLM):
    def __init__(self, model_name: str | None = None):
        super().__init__()
        self.timeout = settings.LLM_TIMEOUT
        self.llm_url = settings.LLM_URL + "/llm"
        self.headers = {
            "Authorization": f"Bearer {settings.LLM_MODAL_API_KEY}",
        }
        self._set_model_name(model_name if model_name else settings.DEFAULT_LLM)

    @property
    def supported_models(self):
        """
        List of currently supported models on this GPU platform
        """
        # TODO: Query the specific GPU platform
        # Replace this with a HTTP call
        return [
            "lmsys/vicuna-13b-v1.5",
            "HuggingFaceH4/zephyr-7b-alpha",
            "NousResearch/Hermes-3-Llama-3.1-8B",
        ]

    async def _generate(
        self, prompt: str, gen_schema: dict | None, gen_cfg: dict | None, **kwargs
    ):
        json_payload = {"prompt": prompt}
        if gen_schema:
            json_payload["gen_schema"] = gen_schema
        if gen_cfg:
            json_payload["gen_cfg"] = gen_cfg
        # Generation of the final summary is handed over to the Zephyr model,
        # but replacing the Vicuna model will happen after more testing
        # TODO: Create a mapping of model names and cloud deployments
        if self.model_name == "HuggingFaceH4/zephyr-7b-alpha":
            self.llm_url = settings.ZEPHYR_LLM_URL + "/llm"
        async with httpx.AsyncClient() as client:
            response = await retry(client.post)(
                self.llm_url,
                headers=self.headers,
                json=json_payload,
                timeout=self.timeout,
                retry_timeout=60 * 5,
                follow_redirects=True,
            )
            response.raise_for_status()
            text = response.json()["text"]
            return text

    async def _completion(self, messages: list, **kwargs) -> dict:
        kwargs.setdefault("temperature", 0.3)
        kwargs.setdefault("max_tokens", 2048)
        kwargs.setdefault("stream", False)
        kwargs.setdefault("repetition_penalty", 1)
        kwargs.setdefault("top_p", 1)
        kwargs.setdefault("top_k", -1)
        kwargs.setdefault("min_p", 0.05)
        data = {"messages": messages, "model": self.model_name, **kwargs}
        # The Hermes-3 deployment exposes an OpenAI-compatible chat
        # completions endpoint served by vLLM
        if self.model_name == "NousResearch/Hermes-3-Llama-3.1-8B":
            self.llm_url = settings.HERMES_3_8B_LLM_URL + "/v1/chat/completions"
        async with httpx.AsyncClient() as client:
            response = await retry(client.post)(
                self.llm_url,
                headers=self.headers,
                json=data,
                timeout=self.timeout,
                retry_timeout=60 * 5,
                follow_redirects=True,
            )
            response.raise_for_status()
            return response.json()

    def _set_model_name(self, model_name: str) -> bool:
        """
        Set the model name
        """
        # Abort if the model is not supported
        if model_name not in self.supported_models:
            reflector_logger.info(
                f"Attempted to change {model_name=}, but it is not supported. "
                f"Setting model and tokenizer failed!"
            )
            return False
        # Abort if the model is already set
        elif hasattr(self, "model_name") and model_name == self._get_model_name():
            reflector_logger.info("No change in model. Setting model skipped.")
            return False
        # Update model name and tokenizer
        self.model_name = model_name
        self.llm_tokenizer = AutoTokenizer.from_pretrained(
            self.model_name, cache_dir=settings.CACHE_DIR
        )
        reflector_logger.info(f"Model set to {model_name=}. Tokenizer updated.")
        return True

    def _get_tokenizer(self) -> AutoTokenizer:
        """
        Return the currently used LLM tokenizer
        """
        return self.llm_tokenizer

    def _get_model_name(self) -> str:
        """
        Return the current model name from the instance details
        """
        return self.model_name


LLM.register("modal", ModalLLM)


if __name__ == "__main__":
    from reflector.logger import logger

    async def main():
        llm = ModalLLM()
        prompt = llm.create_prompt(
            instruct="Complete the following task",
            text="Tell me a joke about programming.",
        )
        result = await llm.generate(prompt=prompt, logger=logger)
        print(result)

        gen_schema = {
            "type": "object",
            "properties": {"response": {"type": "string"}},
        }
        result = await llm.generate(prompt=prompt, gen_schema=gen_schema, logger=logger)
        print(result)

        gen_cfg = GenerationConfig(max_new_tokens=150)
        result = await llm.generate(
            prompt=prompt, gen_cfg=gen_cfg, gen_schema=gen_schema, logger=logger
        )
        print(result)

    import asyncio

    asyncio.run(main())