Mirror of https://github.com/Monadical-SAS/reflector.git, synced 2025-12-22 05:09:05 +00:00
Feature additions (#210)
* initial
* add LLM features
* update LLM logic
* update llm functions: change control flow
* add generation config
* update return types
* update processors and tests
* update rtc_offer
* revert new title processor change
* fix unit tests
* add comments and fix HTTP 500
* adjust prompt
* test with reflector app
* revert new event for final title
* update
* move onus onto processors
* move onus onto processors
* stash
* add provision for gen config
* dynamically pack the LLM input using context length
* tune final summary params
* update consolidated class structures
* update consolidated class structures
* update precommit
* add broadcast processors
* working baseline
* Organize LLMParams
* minor fixes
* minor fixes
* minor fixes
* fix unit tests
* fix unit tests
* fix unit tests
* update tests
* update tests
* edit pipeline response events
* update summary return types
* configure tests
* alembic db migration
* change LLM response flow
* edit main llm functions
* edit main llm functions
* change llm name and gen cf
* Update transcript_topic_detector.py
* PR review comments
* checkpoint before db event migration
* update DB migration of past events
* update DB migration of past events
* edit LLM classes
* Delete unwanted file
* remove List typing
* remove List typing
* update oobabooga API call
* topic enhancements
* update UI event handling
* move ensure_casing to llm base
* update tests
* update tests
@@ -1 +1,2 @@
from .base import LLM # noqa: F401
from .llm_params import LLMTaskParams # noqa: F401

@@ -2,12 +2,19 @@ import importlib
import json
import re
from time import monotonic
from typing import TypeVar

import nltk
from prometheus_client import Counter, Histogram
from transformers import GenerationConfig

from reflector.llm.llm_params import TaskParams
from reflector.logger import logger as reflector_logger
from reflector.settings import settings
from reflector.utils.retry import retry

T = TypeVar("T", bound="LLM")


class LLM:
_registry = {}
@@ -32,12 +39,25 @@ class LLM:
["backend"],
)

def __enter__(self):
self.ensure_nltk()

@classmethod
def ensure_nltk(cls):
"""
Make sure NLTK package is installed. Searches in the cache and
downloads only if needed.
"""
nltk.download("punkt", download_dir=settings.CACHE_DIR)
# For POS tagging
nltk.download("averaged_perceptron_tagger", download_dir=settings.CACHE_DIR)

@classmethod
def register(cls, name, klass):
cls._registry[name] = klass

@classmethod
def get_instance(cls, name=None):
def get_instance(cls, model_name: str | None = None, name: str = None) -> T:
"""
Return an instance depending on the settings.
Settings used:
@@ -50,7 +70,39 @@
if name not in cls._registry:
module_name = f"reflector.llm.llm_{name}"
importlib.import_module(module_name)
return cls._registry[name]()
return cls._registry[name](model_name)

def get_model_name(self) -> str:
"""
Get the currently set model name
"""
return self._get_model_name()

def _get_model_name(self) -> str:
pass

def set_model_name(self, model_name: str) -> bool:
"""
Update the model name with the provided model name
"""
return self._set_model_name(model_name)

def _set_model_name(self, model_name: str) -> bool:
raise NotImplementedError

@property
def template(self) -> str:
"""
Return the LLM Prompt template
"""
return """
### Human:
{instruct}

{text}

### Assistant:
"""

def __init__(self):
name = self.__class__.__name__
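For orientation, a minimal sketch of the backend registry and the new model_name plumbing in get_instance(); this is not part of the commit, and EchoLLM is a made-up backend used only for illustration:

```python
from reflector.llm.base import LLM


class EchoLLM(LLM):
    """Hypothetical backend used only to illustrate the registry flow."""

    def __init__(self, model_name: str | None = None):
        super().__init__()
        self.model_name = model_name or "echo-1"

    def _get_model_name(self) -> str:
        return self.model_name

    async def _generate(self, prompt, gen_schema, gen_cfg, **kwargs) -> str:
        return prompt  # echo the prompt back instead of calling a real model


LLM.register("echo", EchoLLM)
llm = LLM.get_instance(model_name="echo-1", name="echo")
assert llm.get_model_name() == "echo-1"
```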
@@ -73,21 +125,39 @@ class LLM:
async def _warmup(self, logger: reflector_logger):
pass

@property
def tokenizer(self):
"""
Return the tokenizer instance used by LLM
"""
return self._get_tokenizer()

def _get_tokenizer(self):
pass

async def generate(
self,
prompt: str,
logger: reflector_logger,
schema: dict | None = None,
gen_schema: dict | None = None,
gen_cfg: GenerationConfig | None = None,
**kwargs,
) -> dict:
logger.info("LLM generate", prompt=repr(prompt))

if gen_cfg:
gen_cfg = gen_cfg.to_dict()
self.m_generate_call.inc()
try:
with self.m_generate.time():
result = await retry(self._generate)(
prompt=prompt, schema=schema, **kwargs
prompt=prompt,
gen_schema=gen_schema,
gen_cfg=gen_cfg,
**kwargs,
)
self.m_generate_success.inc()

except Exception:
logger.exception("Failed to call llm after retrying")
self.m_generate_failure.inc()
@@ -100,7 +170,60 @@ class LLM:

return result

async def _generate(self, prompt: str, schema: dict | None, **kwargs) -> str:
def ensure_casing(self, title: str) -> str:
"""
LLM takes care of word casing, but in rare cases this
can falter. This is a fallback to ensure the casing of
topics is in a proper format.

We select nouns, verbs and adjectives and check if camel
casing is present and fix it, if not. Will not perform
any other changes.
"""
tokens = nltk.word_tokenize(title)
pos_tags = nltk.pos_tag(tokens)
camel_cased = []

whitelisted_pos_tags = [
"NN",
"NNS",
"NNP",
"NNPS", # Noun POS
"VB",
"VBD",
"VBG",
"VBN",
"VBP",
"VBZ", # Verb POS
"JJ",
"JJR",
"JJS", # Adjective POS
]

# If at all there is an exception, do not block other reflector
# processes. Return the LLM generated title, at the least.
try:
for word, pos in pos_tags:
if pos in whitelisted_pos_tags and word[0].islower():
camel_cased.append(word[0].upper() + word[1:])
else:
camel_cased.append(word)
modified_title = " ".join(camel_cased)

# The result can have words in braces with additional space.
# Change ( ABC ), [ ABC ], etc. ==> (ABC), [ABC], etc.
pattern = r"(?<=[\[\{\(])\s+|\s+(?=[\]\}\)])"
title = re.sub(pattern, "", modified_title)
except Exception as e:
reflector_logger.info(
f"Failed to ensure casing on {title=} " f"with exception : {str(e)}"
)

return title

async def _generate(
self, prompt: str, gen_schema: dict | None, gen_cfg: dict | None, **kwargs
) -> str:
raise NotImplementedError

def _parse_json(self, result: str) -> dict:
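A hedged illustration of the ensure_casing() fallback added above; the title and the expected output are invented, and the exact result depends on NLTK's POS tagger (punkt and averaged_perceptron_tagger must already be downloaded, see ensure_nltk()):

```python
from reflector.llm import LLM

llm = LLM.get_instance(name="modal")  # any registered backend works

# Nouns, verbs and adjectives get an initial capital; spaces inside
# brackets are collapsed. Output is indicative only.
print(llm.ensure_casing("planning the next sprint ( q3 retrospective )"))
# e.g. "Planning the Next Sprint (Q3 Retrospective)"
```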
@@ -122,3 +245,62 @@ class LLM:
result = result[:-3]

return json.loads(result.strip())

def text_token_threshold(self, task_params: TaskParams | None) -> int:
"""
Choose the token size to set as the threshold to pack the LLM calls
"""
buffer_token_size = 25
default_output_tokens = 1000
context_window = self.tokenizer.model_max_length
tokens = self.tokenizer.tokenize(
self.create_prompt(instruct=task_params.instruct, text="")
)
threshold = context_window - len(tokens) - buffer_token_size
if task_params.gen_cfg:
threshold -= task_params.gen_cfg.max_new_tokens
else:
threshold -= default_output_tokens
return threshold

def split_corpus(
self,
corpus: str,
task_params: TaskParams,
token_threshold: int | None = None,
) -> list[str]:
"""
Split the input to the LLM due to CUDA memory limitations and LLM context window
restrictions.

Accumulate tokens from full sentences till threshold and yield accumulated
tokens. Reset accumulation when threshold is reached and repeat process.
"""
if not token_threshold:
token_threshold = self.text_token_threshold(task_params=task_params)

accumulated_tokens = []
accumulated_sentences = []
accumulated_token_count = 0
corpus_sentences = nltk.sent_tokenize(corpus)

for sentence in corpus_sentences:
tokens = self.tokenizer.tokenize(sentence)
if accumulated_token_count + len(tokens) <= token_threshold:
accumulated_token_count += len(tokens)
accumulated_tokens.extend(tokens)
accumulated_sentences.append(sentence)
else:
yield "".join(accumulated_sentences)
accumulated_token_count = len(tokens)
accumulated_tokens = tokens
accumulated_sentences = [sentence]

if accumulated_tokens:
yield " ".join(accumulated_sentences)

def create_prompt(self, instruct: str, text: str) -> str:
"""
Create a consumable prompt based on the prompt template
"""
return self.template.format(instruct=instruct, text=text)

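The methods above are meant to work together: text_token_threshold() budgets the context window, split_corpus() packs whole sentences under that budget, and create_prompt() wraps each chunk before generate() is called. A minimal sketch of that flow, assuming the "modal" backend and the "topic" task registered later in this diff; summarize_topics() is an invented helper, not code from the commit:

```python
import asyncio

from reflector.llm import LLM, LLMTaskParams
from reflector.logger import logger


async def summarize_topics(transcript: str) -> list[dict]:
    llm = LLM.get_instance(name="modal")
    params = LLMTaskParams.get_instance("topic").task_params

    results = []
    # split_corpus() yields sentence-aligned chunks that fit the token budget
    for chunk in llm.split_corpus(corpus=transcript, task_params=params):
        prompt = llm.create_prompt(instruct=params.instruct, text=chunk)
        results.append(
            await llm.generate(
                prompt=prompt,
                gen_schema=params.gen_schema,
                gen_cfg=params.gen_cfg,
                logger=logger,
            )
        )
    return results


# topics = asyncio.run(summarize_topics(transcript_text))
```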
@@ -1,4 +1,5 @@
import httpx

from reflector.llm.base import LLM
from reflector.settings import settings
from reflector.utils.retry import retry
@@ -13,10 +14,14 @@ class BananaLLM(LLM):
"X-Banana-Model-Key": settings.LLM_BANANA_MODEL_KEY,
}

async def _generate(self, prompt: str, schema: dict | None, **kwargs):
async def _generate(
self, prompt: str, gen_schema: dict | None, gen_cfg: dict | None, **kwargs
):
json_payload = {"prompt": prompt}
if schema:
json_payload["schema"] = schema
if gen_schema:
json_payload["gen_schema"] = gen_schema
if gen_cfg:
json_payload["gen_cfg"] = gen_cfg
async with httpx.AsyncClient() as client:
response = await retry(client.post)(
settings.LLM_URL,
@@ -27,18 +32,21 @@ class BananaLLM(LLM):
)
response.raise_for_status()
text = response.json()["text"]
if not schema:
text = text[len(prompt) :]
return text


LLM.register("banana", BananaLLM)

if __name__ == "__main__":
from reflector.logger import logger

async def main():
llm = BananaLLM()
result = await llm.generate("Hello, my name is")
prompt = llm.create_prompt(
instruct="Complete the following task",
text="Tell me a joke about programming.",
)
result = await llm.generate(prompt=prompt, logger=logger)
print(result)

import asyncio

@@ -1,11 +1,14 @@
import httpx
from transformers import AutoTokenizer, GenerationConfig

from reflector.llm.base import LLM
from reflector.logger import logger as reflector_logger
from reflector.settings import settings
from reflector.utils.retry import retry


class ModalLLM(LLM):
def __init__(self):
def __init__(self, model_name: str | None = None):
super().__init__()
self.timeout = settings.LLM_TIMEOUT
self.llm_url = settings.LLM_URL + "/llm"
@@ -13,6 +16,16 @@ class ModalLLM(LLM):
self.headers = {
"Authorization": f"Bearer {settings.LLM_MODAL_API_KEY}",
}
self._set_model_name(model_name if model_name else settings.DEFAULT_LLM)

@property
def supported_models(self):
"""
List of currently supported models on this GPU platform
"""
# TODO: Query the specific GPU platform
# Replace this with a HTTP call
return ["lmsys/vicuna-13b-v1.5"]

async def _warmup(self, logger):
async with httpx.AsyncClient() as client:
@@ -23,10 +36,14 @@ class ModalLLM(LLM):
)
response.raise_for_status()

async def _generate(self, prompt: str, schema: dict | None, **kwargs):
async def _generate(
self, prompt: str, gen_schema: dict | None, gen_cfg: dict | None, **kwargs
):
json_payload = {"prompt": prompt}
if schema:
json_payload["schema"] = schema
if gen_schema:
json_payload["gen_schema"] = gen_schema
if gen_cfg:
json_payload["gen_cfg"] = gen_cfg
async with httpx.AsyncClient() as client:
response = await retry(client.post)(
self.llm_url,
@@ -37,10 +54,43 @@ class ModalLLM(LLM):
)
response.raise_for_status()
text = response.json()["text"]
if not schema:
text = text[len(prompt) :]
return text

def _set_model_name(self, model_name: str) -> bool:
"""
Set the model name
"""
# Abort, if the model is not supported
if model_name not in self.supported_models:
reflector_logger.info(
f"Attempted to change {model_name=}, but is not supported."
f"Setting model and tokenizer failed !"
)
return False
# Abort, if the model is already set
elif hasattr(self, "model_name") and model_name == self._get_model_name():
reflector_logger.info("No change in model. Setting model skipped.")
return False
# Update model name and tokenizer
self.model_name = model_name
self.llm_tokenizer = AutoTokenizer.from_pretrained(
self.model_name, cache_dir=settings.CACHE_DIR
)
reflector_logger.info(f"Model set to {model_name=}. Tokenizer updated.")
return True

def _get_tokenizer(self) -> AutoTokenizer:
"""
Return the currently used LLM tokenizer
"""
return self.llm_tokenizer

def _get_model_name(self) -> str:
"""
Return the current model name from the instance details
"""
return self.model_name


LLM.register("modal", ModalLLM)

@@ -49,15 +99,25 @@ if __name__ == "__main__":

async def main():
llm = ModalLLM()
result = await llm.generate("Hello, my name is", logger=logger)
prompt = llm.create_prompt(
instruct="Complete the following task",
text="Tell me a joke about programming.",
)
result = await llm.generate(prompt=prompt, logger=logger)
print(result)

schema = {
gen_schema = {
"type": "object",
"properties": {"name": {"type": "string"}},
"properties": {"response": {"type": "string"}},
}

result = await llm.generate("Hello, my name is", schema=schema, logger=logger)
result = await llm.generate(prompt=prompt, gen_schema=gen_schema, logger=logger)
print(result)

gen_cfg = GenerationConfig(max_new_tokens=150)
result = await llm.generate(
prompt=prompt, gen_cfg=gen_cfg, gen_schema=gen_schema, logger=logger
)
print(result)

import asyncio

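A hedged sketch of the model-selection guard added to ModalLLM: unsupported names are rejected, re-setting the current model is a no-op, and an actual change reloads the tokenizer. The "not-a-real-model" name is obviously invented:

```python
from reflector.llm.llm_modal import ModalLLM

llm = ModalLLM()  # falls back to settings.DEFAULT_LLM

llm.set_model_name("not-a-real-model")       # False: not in supported_models
llm.set_model_name("lmsys/vicuna-13b-v1.5")  # False if already set, True otherwise
print(llm.get_model_name(), llm.tokenizer.model_max_length)
```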
@@ -1,13 +1,21 @@
import httpx

from reflector.llm.base import LLM
from reflector.settings import settings


class OobaboogaLLM(LLM):
async def _generate(self, prompt: str, schema: dict | None, **kwargs):
def __init__(self, model_name: str | None = None):
super().__init__()

async def _generate(
self, prompt: str, gen_schema: dict | None, gen_cfg: dict | None, **kwargs
):
json_payload = {"prompt": prompt}
if schema:
json_payload["schema"] = schema
if gen_schema:
json_payload["gen_schema"] = gen_schema
if gen_cfg:
json_payload.update(gen_cfg)
async with httpx.AsyncClient() as client:
response = await client.post(
settings.LLM_URL,

@@ -1,11 +1,13 @@
import httpx
from transformers import GenerationConfig

from reflector.llm.base import LLM
from reflector.logger import logger
from reflector.settings import settings


class OpenAILLM(LLM):
def __init__(self, **kwargs):
def __init__(self, model_name: str | None = None, **kwargs):
super().__init__(**kwargs)
self.openai_key = settings.LLM_OPENAI_KEY
self.openai_url = settings.LLM_URL
@@ -15,7 +17,13 @@ class OpenAILLM(LLM):
self.max_tokens = settings.LLM_MAX_TOKENS
logger.info(f"LLM use openai backend at {self.openai_url}")

async def _generate(self, prompt: str, schema: dict | None, **kwargs) -> str:
async def _generate(
self,
prompt: str,
gen_schema: dict | None,
gen_cfg: GenerationConfig | None,
**kwargs,
) -> str:
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {self.openai_key}",

server/reflector/llm/llm_params.py (new file, 150 lines)
@@ -0,0 +1,150 @@
from typing import Optional, TypeVar

from pydantic import BaseModel
from transformers import GenerationConfig


class TaskParams(BaseModel, arbitrary_types_allowed=True):
instruct: str
gen_cfg: Optional[GenerationConfig] = None
gen_schema: Optional[dict] = None


T = TypeVar("T", bound="LLMTaskParams")


class LLMTaskParams:
_registry = {}

@classmethod
def register(cls, task, klass) -> None:
cls._registry[task] = klass

@classmethod
def get_instance(cls, task: str) -> T:
return cls._registry[task]()

@property
def task_params(self) -> TaskParams | None:
"""
Fetch the task related parameters
"""
return self._get_task_params()

def _get_task_params(self) -> None:
pass


class FinalLongSummaryParams(LLMTaskParams):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self._gen_cfg = GenerationConfig(
max_new_tokens=800, num_beams=3, do_sample=True, temperature=0.3
)
self._instruct = """
Take the key ideas and takeaways from the text and create a short
summary. Be sure to keep the length of the response to a minimum.
Do not include trivial information in the summary.
"""
self._schema = {
"type": "object",
"properties": {"long_summary": {"type": "string"}},
}
self._task_params = TaskParams(
instruct=self._instruct, gen_schema=self._schema, gen_cfg=self._gen_cfg
)

def _get_task_params(self) -> TaskParams:
"""gen_schema
Return the parameters associated with a specific LLM task
"""
return self._task_params


class FinalShortSummaryParams(LLMTaskParams):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self._gen_cfg = GenerationConfig(
max_new_tokens=1300, num_beams=3, do_sample=True, temperature=0.3
)
self._instruct = """
Take the key ideas and takeaways from the text and create a short
summary. Be sure to keep the length of the response to a minimum.
Do not include trivial information in the summary.
"""
self._schema = {
"type": "object",
"properties": {"short_summary": {"type": "string"}},
}
self._task_params = TaskParams(
instruct=self._instruct, gen_schema=self._schema, gen_cfg=self._gen_cfg
)

def _get_task_params(self) -> TaskParams:
"""
Return the parameters associated with a specific LLM task
"""
return self._task_params


class FinalTitleParams(LLMTaskParams):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self._gen_cfg = GenerationConfig(
max_new_tokens=200, num_beams=5, do_sample=True, temperature=0.5
)
self._instruct = """
Combine the following individual titles into one single short title that
condenses the essence of all titles.
"""
self._schema = {
"type": "object",
"properties": {"title": {"type": "string"}},
}
self._task_params = TaskParams(
instruct=self._instruct, gen_schema=self._schema, gen_cfg=self._gen_cfg
)

def _get_task_params(self) -> TaskParams:
"""
Return the parameters associated with a specific LLM task
"""
return self._task_params


class TopicParams(LLMTaskParams):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self._gen_cfg = GenerationConfig(
max_new_tokens=550, num_beams=6, do_sample=True, temperature=0.9
)
self._instruct = """
Create a JSON object as response.The JSON object must have 2 fields:
i) title and ii) summary.
For the title field, generate a very detailed and self-explanatory
title for the given text. Let the title be as descriptive as possible.
For the summary field, summarize the given text in a maximum of
three sentences.
"""
self._schema = {
"type": "object",
"properties": {
"title": {"type": "string"},
"summary": {"type": "string"},
},
}
self._task_params = TaskParams(
instruct=self._instruct, gen_schema=self._schema, gen_cfg=self._gen_cfg
)

def _get_task_params(self) -> TaskParams:
"""
Return the parameters associated with a specific LLM task
"""
return self._task_params


LLMTaskParams.register("topic", TopicParams)
LLMTaskParams.register("final_title", FinalTitleParams)
LLMTaskParams.register("final_short_summary", FinalShortSummaryParams)
LLMTaskParams.register("final_long_summary", FinalLongSummaryParams)
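The registry above allows new tasks to be added without touching existing processors. A hedged sketch of that pattern; the "action_items" task, its prompt, and its generation settings are invented for illustration:

```python
from transformers import GenerationConfig

from reflector.llm.llm_params import LLMTaskParams, TaskParams


class ActionItemParams(LLMTaskParams):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._task_params = TaskParams(
            instruct="List the action items mentioned in the text.",
            gen_schema={
                "type": "object",
                "properties": {"action_items": {"type": "string"}},
            },
            gen_cfg=GenerationConfig(max_new_tokens=300),
        )

    def _get_task_params(self) -> TaskParams:
        return self._task_params


LLMTaskParams.register("action_items", ActionItemParams)
params = LLMTaskParams.get_instance("action_items").task_params
```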