mirror of
https://github.com/Monadical-SAS/reflector.git
This introduces a new Modal endpoint and a completely new way to build the summary.

## SummaryBuilder

The summary builder is based on a conversational model, where an exchange takes place between the model and the user. This allows more context to be included and better adherence to the rules. It requires an endpoint exposing an OpenAI-like completions API (/v1/chat/completions).

## vLLM Hermes3

Unlike the previous deployment, this one uses vLLM, which provides an OpenAI-like completions endpoint out of the box. It can also handle guided JSON generation, so jsonformer is no longer needed; in practice the model follows a JSON schema quite well when it is asked for in the prompt.

## Conversion of long/short into summary builder

The builder identifies the participants, finds the key subjects, produces a summary for each, and then generates a quick recap. The quick recap is used as the short_summary, while the markdown combining the quick recap, key subjects, and per-subject summaries is used for the long_summary. This is why the Next.js component has to be updated: to correctly style h1 elements and preserve the newlines in the markdown.
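To make the flow concrete, here is a minimal sketch of such a conversational summary build against a vLLM deployment that exposes /v1/chat/completions. The URL, JSON schema, prompts, and heading titles below are placeholders rather than the actual SummaryBuilder implementation; only the overall shape follows the description above: one conversation per transcript, the schema asked for in the prompt, and the markdown assembled from the answers.

```python
import asyncio
import json

import httpx

# Everything below (URL, schema, prompts, heading titles) is illustrative,
# not the actual SummaryBuilder code.
VLLM_CHAT_URL = "https://example.invalid/v1/chat/completions"
MODEL = "NousResearch/Hermes-3-Llama-3.1-8B"

SUBJECTS_SCHEMA = {
    "type": "object",
    "properties": {"key_subjects": {"type": "array", "items": {"type": "string"}}},
    "required": ["key_subjects"],
}


async def ask(client: httpx.AsyncClient, messages: list[dict]) -> str:
    """One exchange against the OpenAI-like chat completions endpoint."""
    payload = {
        "model": MODEL,
        "messages": messages,
        "temperature": 0.3,
        "max_tokens": 2048,
        # vLLM can also enforce a schema server-side (guided JSON generation),
        # but Hermes 3 already follows a schema well when it is in the prompt.
    }
    response = await client.post(VLLM_CHAT_URL, json=payload, timeout=120)
    response.raise_for_status()
    answer = response.json()["choices"][0]["message"]["content"]
    messages.append({"role": "assistant", "content": answer})
    return answer


async def build_summaries(transcript: str) -> tuple[str, str]:
    """Return (short_summary, long_summary) for a transcript."""
    # Participant identification is omitted to keep the sketch short.
    messages = [
        {"role": "system", "content": "You summarize meeting transcripts."},
        {
            "role": "user",
            "content": (
                "Find the key subjects of the transcript below. Answer with JSON "
                f"matching this schema: {json.dumps(SUBJECTS_SCHEMA)}\n\n{transcript}"
            ),
        },
    ]
    async with httpx.AsyncClient() as client:
        subjects = json.loads(await ask(client, messages))["key_subjects"]

        # One summary per subject, within the same conversation, so the model
        # still sees the transcript and all earlier answers.
        sections = []
        for subject in subjects:
            messages.append(
                {"role": "user", "content": f"Summarize the discussion about: {subject}"}
            )
            sections.append(f"# {subject}\n\n{await ask(client, messages)}\n")

        # Finish with a quick recap of the whole exchange.
        messages.append(
            {"role": "user", "content": "Give a quick recap in a few sentences."}
        )
        quick_recap = await ask(client, messages)

    # The recap alone becomes short_summary; the markdown combining recap and
    # per-subject summaries becomes long_summary (h1 headings + newlines,
    # which is what the Next.js component has to render correctly).
    long_summary = "\n".join([f"# Quick recap\n\n{quick_recap}\n", *sections])
    return quick_recap, long_summary


if __name__ == "__main__":
    print(asyncio.run(build_summaries("Alice: let's ship Friday. Bob: agreed."))[1])
```

Keeping every exchange in the same `messages` list is what gives the builder its extra context: each subject summary is produced with the transcript and all earlier answers still in view.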
152 lines
5.1 KiB
Python
import httpx
from reflector.llm.base import LLM
from reflector.logger import logger as reflector_logger
from reflector.settings import settings
from reflector.utils.retry import retry
from transformers import AutoTokenizer, GenerationConfig


class ModalLLM(LLM):
    """LLM backend for Modal-hosted deployments, talking either to the plain
    /llm endpoint or to an OpenAI-like /v1/chat/completions endpoint (vLLM)."""

    def __init__(self, model_name: str | None = None):
        super().__init__()
        self.timeout = settings.LLM_TIMEOUT
        self.llm_url = settings.LLM_URL + "/llm"
        self.headers = {
            "Authorization": f"Bearer {settings.LLM_MODAL_API_KEY}",
        }
        self._set_model_name(model_name if model_name else settings.DEFAULT_LLM)

    @property
    def supported_models(self):
        """
        List of currently supported models on this GPU platform
        """
        # TODO: Query the specific GPU platform
        # Replace this with a HTTP call
        return [
            "lmsys/vicuna-13b-v1.5",
            "HuggingFaceH4/zephyr-7b-alpha",
            "NousResearch/Hermes-3-Llama-3.1-8B",
        ]

    async def _generate(
        self, prompt: str, gen_schema: dict | None, gen_cfg: dict | None, **kwargs
    ):
        """Generate text from a raw prompt via the /llm endpoint."""
        json_payload = {"prompt": prompt}
        if gen_schema:
            json_payload["gen_schema"] = gen_schema
        if gen_cfg:
            json_payload["gen_cfg"] = gen_cfg

        # Handing over generation of the final summary to the Zephyr model,
        # but replacing the Vicuna model will happen after more testing
        # TODO: Create a mapping of model names and cloud deployments
        if self.model_name == "HuggingFaceH4/zephyr-7b-alpha":
            self.llm_url = settings.ZEPHYR_LLM_URL + "/llm"

        async with httpx.AsyncClient() as client:
            response = await retry(client.post)(
                self.llm_url,
                headers=self.headers,
                json=json_payload,
                timeout=self.timeout,
                retry_timeout=60 * 5,
                follow_redirects=True,
            )
            response.raise_for_status()
            text = response.json()["text"]
            return text

    async def _completion(self, messages: list, **kwargs) -> dict:
        """Run a chat exchange against an OpenAI-like /v1/chat/completions
        endpoint and return the raw response payload."""
        kwargs.setdefault("temperature", 0.3)
        kwargs.setdefault("max_tokens", 2048)
        kwargs.setdefault("stream", False)
        kwargs.setdefault("repetition_penalty", 1)
        kwargs.setdefault("top_p", 1)
        kwargs.setdefault("top_k", -1)
        kwargs.setdefault("min_p", 0.05)
        data = {"messages": messages, "model": self.model_name, **kwargs}

        if self.model_name == "NousResearch/Hermes-3-Llama-3.1-8B":
            self.llm_url = settings.HERMES_3_8B_LLM_URL + "/v1/chat/completions"

        async with httpx.AsyncClient() as client:
            response = await retry(client.post)(
                self.llm_url,
                headers=self.headers,
                json=data,
                timeout=self.timeout,
                retry_timeout=60 * 5,
                follow_redirects=True,
            )
            response.raise_for_status()
            return response.json()

    def _set_model_name(self, model_name: str) -> bool:
        """
        Set the model name
        """
        # Abort, if the model is not supported
        if model_name not in self.supported_models:
            reflector_logger.info(
                f"Attempted to change {model_name=}, but it is not supported. "
                "Setting model and tokenizer failed!"
            )
            return False
        # Abort, if the model is already set
        elif hasattr(self, "model_name") and model_name == self._get_model_name():
            reflector_logger.info("No change in model. Setting model skipped.")
            return False
        # Update model name and tokenizer
        self.model_name = model_name
        self.llm_tokenizer = AutoTokenizer.from_pretrained(
            self.model_name, cache_dir=settings.CACHE_DIR
        )
        reflector_logger.info(f"Model set to {model_name=}. Tokenizer updated.")
        return True

    def _get_tokenizer(self) -> AutoTokenizer:
        """
        Return the currently used LLM tokenizer
        """
        return self.llm_tokenizer

    def _get_model_name(self) -> str:
        """
        Return the current model name from the instance details
        """
        return self.model_name


LLM.register("modal", ModalLLM)


if __name__ == "__main__":
    from reflector.logger import logger

    async def main():
        llm = ModalLLM()
        prompt = llm.create_prompt(
            instruct="Complete the following task",
            text="Tell me a joke about programming.",
        )
        result = await llm.generate(prompt=prompt, logger=logger)
        print(result)

        # Constrain the output with a JSON schema
        gen_schema = {
            "type": "object",
            "properties": {"response": {"type": "string"}},
        }

        result = await llm.generate(prompt=prompt, gen_schema=gen_schema, logger=logger)
        print(result)

        # Constrain the generation length as well
        gen_cfg = GenerationConfig(max_new_tokens=150)
        result = await llm.generate(
            prompt=prompt, gen_cfg=gen_cfg, gen_schema=gen_schema, logger=logger
        )
        print(result)

    import asyncio

    asyncio.run(main())
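As a complement to the built-in `__main__` demo above, which exercises the prompt-based `generate` path, the sketch below drives the new chat-completions path through `_completion`. It relies only on what is visible in this file; the import path is assumed, the settings (HERMES_3_8B_LLM_URL, LLM_MODAL_API_KEY, ...) must point at a live deployment, and calling the private `_completion` directly is for illustration only; in reflector itself the summary builder presumably goes through the `LLM` base class API.

```python
import asyncio

from reflector.llm.modal_llm import ModalLLM  # import path assumed


async def demo_completion():
    llm = ModalLLM(model_name="NousResearch/Hermes-3-Llama-3.1-8B")
    messages = [
        {"role": "system", "content": "You are a concise assistant."},
        {
            "role": "user",
            "content": "Give a one-sentence recap of: the team agreed to ship on Friday.",
        },
    ]
    # Posts to the OpenAI-like /v1/chat/completions endpoint and returns the
    # raw response body as a dict.
    response = await llm._completion(messages, temperature=0.2)
    print(response["choices"][0]["message"]["content"])  # standard OpenAI response shape


if __name__ == "__main__":
    asyncio.run(demo_completion())
```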