feat: retake summary using NousResearch/Hermes-3-Llama-3.1-8B model (#415)

This feature adds a new Modal endpoint and a completely new way to build the summary. ## SummaryBuilder The summary builder is based on a conversational model, where an exchange between the model and the user is made. This allows more context to be included and better adherence to the rules. It requires an endpoint exposing an OpenAI-like completions API (/v1/chat/completions). ## vLLM Hermes3 Unlike the previous deployment, this one uses vLLM, which provides an OpenAI-like completions endpoint out of the box. It can also handle guided JSON generation, so jsonformer is not needed. However, the model is quite good at following a JSON schema if asked in the prompt. ## Conversion of long/short into summary builder The builder identifies participants, finds key subjects, gets a summary for each, then gets a quick recap. The quick recap is used as the short_summary, while the markdown including the quick recap + key subjects + summaries is used for the long_summary. This is why the Next.js component has to be updated, to correctly style h1 and keep the newlines of the markdown.
2026-02-06 02:36:47 +00:00 · 2024-09-14 02:28:38 +02:00
parent 6c4eac04c1
commit 5267ab2d37
20 changed files with 1383 additions and 238 deletions
--- a/server/reflector/llm/base.py
+++ b/server/reflector/llm/base.py
@@ -156,6 +156,27 @@ class LLM:

        return result

+    async def completion(
+        self, messages: list, logger: reflector_logger, **kwargs
+    ) -> dict:
+        """
+        Send chat messages to the OpenAI-compatible /v1/chat/completions
+        endpoint and return the result unchanged.
+
+        The request itself is delegated to `_completion`, wrapped in `retry`;
+        extra keyword arguments are forwarded to it as-is.
+        It's up to the caller to validate or transform the result.
+
+        Any exception is logged, counted in `m_generate_failure`, and
+        re-raised.
+        """
+        logger.info("LLM completions", messages=messages)
+
+        try:
+            # The timer covers the whole retried call, not a single attempt.
+            with self.m_generate.time():
+                # `logger` is used for logging here only; it is not
+                # forwarded to `_completion`.
+                result = await retry(self._completion)(messages=messages, **kwargs)
+            self.m_generate_success.inc()
+        except Exception:
+            logger.exception("Failed to call llm after retrying")
+            self.m_generate_failure.inc()
+            raise
+
+        logger.debug("LLM completion result", result=repr(result))
+        return result
+
    def ensure_casing(self, title: str) -> str:
        """
        LLM takes care of word casing, but in rare cases this
@@ -234,6 +255,11 @@ class LLM:
    ) -> str:
        raise NotImplementedError

+    async def _completion(self, messages: list, **kwargs) -> dict:
+        """
+        Backend hook for `completion`; subclasses must override it.
+
+        `completion` invokes it with `messages` plus the caller's keyword
+        arguments only, so a logger is not part of the signature.
+        """
+        raise NotImplementedError
+
    def _parse_json(self, result: str) -> dict:
        result = result.strip()
        # try detecting code block if exist
--- a/server/reflector/llm/llm_modal.py
+++ b/server/reflector/llm/llm_modal.py
@@ -23,7 +23,11 @@ class ModalLLM(LLM):
        """
        # TODO: Query the specific GPU platform
        # Replace this with a HTTP call
-        return ["lmsys/vicuna-13b-v1.5", "HuggingFaceH4/zephyr-7b-alpha"]
+        return [
+            "lmsys/vicuna-13b-v1.5",
+            "HuggingFaceH4/zephyr-7b-alpha",
+            "NousResearch/Hermes-3-Llama-3.1-8B",
+        ]

    async def _generate(
        self, prompt: str, gen_schema: dict | None, gen_cfg: dict | None, **kwargs
@@ -53,6 +57,31 @@ class ModalLLM(LLM):
            text = response.json()["text"]
            return text

+    async def _completion(self, messages: list, **kwargs) -> dict:
+        """
+        POST `messages` to the chat completions endpoint and return the
+        decoded JSON response.
+
+        The sampling defaults below apply only to keys the caller did not
+        pass; every keyword argument ends up in the request payload.
+        `response.raise_for_status()` raises on an HTTP error status.
+        """
+        kwargs.setdefault("temperature", 0.3)
+        kwargs.setdefault("max_tokens", 2048)
+        kwargs.setdefault("stream", False)
+        kwargs.setdefault("repetition_penalty", 1)
+        kwargs.setdefault("top_p", 1)
+        kwargs.setdefault("top_k", -1)
+        kwargs.setdefault("min_p", 0.05)
+        data = {"messages": messages, "model": self.model_name, **kwargs}
+
+        # Resolve the endpoint per call instead of overwriting self.llm_url:
+        # rewriting the attribute would silently redirect every later
+        # request made through this instance.
+        url = self.llm_url
+        if self.model_name == "NousResearch/Hermes-3-Llama-3.1-8B":
+            # Tolerate a trailing slash in the configured base URL.
+            url = settings.HERMES_3_8B_LLM_URL.rstrip("/") + "/v1/chat/completions"
+
+        async with httpx.AsyncClient() as client:
+            # NOTE(review): `retry_timeout` is presumably consumed by the
+            # `retry` wrapper rather than passed to httpx; confirm.
+            response = await retry(client.post)(
+                url,
+                headers=self.headers,
+                json=data,
+                timeout=self.timeout,
+                retry_timeout=60 * 5,
+                follow_redirects=True,
+            )
+            response.raise_for_status()
+            return response.json()
+
    def _set_model_name(self, model_name: str) -> bool:
        """
        Set the model name