New summary (#283)

* hand over final summary to Zephyr deployment

* fix display error

* push new summary feature

* fix failing test case

* Added markdown support for final summary

* fix UI render issue

* retain sentence tokenizer call

---------

Co-authored-by: Koper <andreas@monadical.com>
projects-g committed (via GitHub) on 2023-10-13 22:53:29 +05:30
commit 1d92d43fe0 (parent 38cd0385b4)
13 changed files with 933 additions and 23 deletions


@@ -258,7 +258,7 @@ class LLM:
         """
         Choose the token size to set as the threshold to pack the LLM calls
         """
-        buffer_token_size = 25
+        buffer_token_size = 100
         default_output_tokens = 1000
         context_window = self.tokenizer.model_max_length
         tokens = self.tokenizer.tokenize(
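The hunk is cut off before the threshold itself is computed. A minimal sketch of the likely arithmetic, assuming the threshold simply reserves room for the expected output plus the safety buffer inside the context window (only the three values shown in the diff come from the source; everything else is illustrative):

# Hypothetical sketch: how a packing threshold could follow from the values above.
buffer_token_size = 100       # raised from 25 in this commit
default_output_tokens = 1000
context_window = 4096         # stand-in for self.tokenizer.model_max_length

# Tokens left for packing prompt text (assumption; the diff does not show this line)
packing_threshold = context_window - default_output_tokens - buffer_token_size
print(packing_threshold)  # 2996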


@@ -23,7 +23,7 @@ class ModalLLM(LLM):
         """
         # TODO: Query the specific GPU platform
         # Replace this with a HTTP call
-        return ["lmsys/vicuna-13b-v1.5"]
+        return ["lmsys/vicuna-13b-v1.5", "HuggingFaceH4/zephyr-7b-alpha"]

     async def _generate(
         self, prompt: str, gen_schema: dict | None, gen_cfg: dict | None, **kwargs

@@ -33,6 +33,13 @@ class ModalLLM(LLM):
             json_payload["gen_schema"] = gen_schema
         if gen_cfg:
             json_payload["gen_cfg"] = gen_cfg
+
+        # Hand over generation of the final summary to the Zephyr model;
+        # fully replacing the Vicuna model will happen after more testing.
+        # TODO: Create a mapping of model names and cloud deployments
+        if self.model_name == "HuggingFaceH4/zephyr-7b-alpha":
+            self.llm_url = settings.ZEPHYR_LLM_URL + "/llm"
+
         async with httpx.AsyncClient() as client:
             response = await retry(client.post)(
                 self.llm_url,
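The TODO above points at replacing the hard-coded check with a lookup table. A minimal sketch of such a mapping, assuming one base URL per model name; the dict, the helper, and the URLs are illustrative, only the model names and the "/llm" suffix come from the diff:

# Hypothetical sketch of the model-name -> deployment-URL mapping the TODO asks for.
zephyr_url = "https://zephyr.example.internal"  # stand-in for settings.ZEPHYR_LLM_URL

MODEL_DEPLOYMENTS = {
    "HuggingFaceH4/zephyr-7b-alpha": zephyr_url,
    # "lmsys/vicuna-13b-v1.5" keeps using the default Modal deployment for now
}

def resolve_llm_url(model_name: str, default_url: str) -> str:
    # Fall back to the existing deployment when the model has no dedicated entry
    base_url = MODEL_DEPLOYMENTS.get(model_name, default_url)
    return base_url + "/llm"

print(resolve_llm_url("HuggingFaceH4/zephyr-7b-alpha", "https://vicuna.example.internal"))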


@@ -144,7 +144,76 @@ class TopicParams(LLMTaskParams):
         return self._task_params


+class BulletedSummaryParams(LLMTaskParams):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self._gen_cfg = GenerationConfig(
+            max_new_tokens=800,
+            num_beams=1,
+            do_sample=True,
+            temperature=0.2,
+            early_stopping=True,
+        )
+        self._instruct = """
+        Given a meeting transcript, extract the key things discussed in the
+        form of a list.
+        While generating the response, follow the constraints mentioned below.
+        Summary constraints:
+        i) Do not add new content, except to fix spelling or punctuation.
+        ii) Do not add any prefixes or numbering in the response.
+        iii) The summarization should be as information dense as possible.
+        iv) Do not add any additional sections like Note, Conclusion, etc. in
+        the response.
+        Response format:
+        i) The response should be in the form of a bulleted list.
+        ii) Iteratively merge all the relevant paragraphs together to keep the
+        number of paragraphs to a minimum.
+        iii) Remove any unfinished sentences from the final response.
+        iv) Do not include narrative or reporting clauses.
+        v) Use "*" as the bullet icon.
+        """
+        self._task_params = TaskParams(
+            instruct=self._instruct, gen_schema=None, gen_cfg=self._gen_cfg
+        )
+
+    def _get_task_params(self) -> TaskParams:
+        """
+        Return the parameters associated with a specific LLM task
+        """
+        return self._task_params
+
+
+class MergedSummaryParams(LLMTaskParams):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self._gen_cfg = GenerationConfig(
+            max_new_tokens=600,
+            num_beams=1,
+            do_sample=True,
+            temperature=0.2,
+            early_stopping=True,
+        )
+        self._instruct = """
+        Given the key points of a meeting, summarize the points to describe the
+        meeting in the form of paragraphs.
+        """
+        self._task_params = TaskParams(
+            instruct=self._instruct, gen_schema=None, gen_cfg=self._gen_cfg
+        )
+
+    def _get_task_params(self) -> TaskParams:
+        """
+        Return the parameters associated with a specific LLM task
+        """
+        return self._task_params
LLMTaskParams.register("topic", TopicParams)
LLMTaskParams.register("final_title", FinalTitleParams)
LLMTaskParams.register("final_short_summary", FinalShortSummaryParams)
LLMTaskParams.register("final_long_summary", FinalLongSummaryParams)
LLMTaskParams.register("bullet_summary", BulletedSummaryParams)
LLMTaskParams.register("merged_summary", MergedSummaryParams)


@@ -1,3 +1,4 @@
+import nltk
 from reflector.llm import LLM, LLMTaskParams
 from reflector.processors.base import Processor
 from reflector.processors.types import FinalLongSummary, TitleSummary

@@ -10,36 +11,58 @@ class TranscriptFinalLongSummaryProcessor(Processor):
     INPUT_TYPE = TitleSummary
     OUTPUT_TYPE = FinalLongSummary
     TASK = "final_long_summary"

     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self.chunks: list[TitleSummary] = []
-        self.llm = LLM.get_instance()
         self.params = LLMTaskParams.get_instance(self.TASK).task_params
+        self.llm = LLM.get_instance(model_name="HuggingFaceH4/zephyr-7b-alpha")

     async def _push(self, data: TitleSummary):
         self.chunks.append(data)

+    async def get_bullet_summary(self, text: str) -> str:
+        params = LLMTaskParams.get_instance("bullet_summary").task_params
+        chunks = list(self.llm.split_corpus(corpus=text, task_params=params))
+        bullet_summary = ""
+        for chunk in chunks:
+            prompt = self.llm.create_prompt(instruct=params.instruct, text=chunk)
+            summary_result = await self.llm.generate(
+                prompt=prompt,
+                gen_schema=params.gen_schema,
+                gen_cfg=params.gen_cfg,
+                logger=self.logger,
+            )
+            bullet_summary += summary_result["long_summary"]
+        return bullet_summary
+
+    async def get_merged_summary(self, text: str) -> str:
+        params = LLMTaskParams.get_instance("merged_summary").task_params
+        chunks = list(self.llm.split_corpus(corpus=text, task_params=params))
+        merged_summary = ""
+        for chunk in chunks:
+            prompt = self.llm.create_prompt(instruct=params.instruct, text=chunk)
+            summary_result = await self.llm.generate(
+                prompt=prompt,
+                gen_schema=params.gen_schema,
+                gen_cfg=params.gen_cfg,
+                logger=self.logger,
+            )
+            merged_summary += summary_result["long_summary"]
+        return merged_summary
+
     async def get_long_summary(self, text: str) -> str:
         """
         Generate a long version of the final summary
         """
         self.logger.info(f"Smoothing out {len(text)} length summary to a long summary")
-        chunks = list(self.llm.split_corpus(corpus=text, task_params=self.params))
+        bullet_summary = await self.get_bullet_summary(text)
+        merged_summary = await self.get_merged_summary(bullet_summary)
-        accumulated_summaries = ""
-        for chunk in chunks:
-            prompt = self.llm.create_prompt(instruct=self.params.instruct, text=chunk)
-            summary_result = await self.llm.generate(
-                prompt=prompt,
-                gen_schema=self.params.gen_schema,
-                gen_cfg=self.params.gen_cfg,
-                logger=self.logger,
-            )
-            accumulated_summaries += summary_result["long_summary"]
+        return merged_summary
-        return accumulated_summaries
+
+    def sentence_tokenize(self, text: str) -> list[str]:
+        return nltk.sent_tokenize(text)

     async def _flush(self):
         if not self.chunks:
@@ -49,11 +72,25 @@ class TranscriptFinalLongSummaryProcessor(Processor):
         accumulated_summaries = " ".join([chunk.summary for chunk in self.chunks])
         long_summary = await self.get_long_summary(accumulated_summaries)
+
+        # Format the output as much as possible so it can be handled
+        # by the front-end for displaying
+        summary_sentences = []
+        for sentence in self.sentence_tokenize(long_summary):
+            sentence = str(sentence).strip()
+            if sentence.startswith("- "):
+                sentence = sentence.replace("- ", "* ")
+            else:
+                sentence = "* " + sentence
+            sentence += " \n"
+            summary_sentences.append(sentence)
+        formatted_long_summary = "".join(summary_sentences)
+
         last_chunk = self.chunks[-1]
         duration = last_chunk.timestamp + last_chunk.duration
         final_long_summary = FinalLongSummary(
-            long_summary=long_summary,
+            long_summary=formatted_long_summary,
             duration=duration,
         )
         await self.emit(final_long_summary)
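A standalone sketch of what the new formatting pass does to the merged summary. The helper below re-implements the loop from _flush for illustration only; it is not part of the diff, and nltk.sent_tokenize needs the punkt data to be downloaded first:

import nltk

def format_as_markdown_bullets(long_summary: str) -> str:
    # Mirror of the _flush formatting: one "* " bullet per sentence, normalizing
    # any "- " bullets the model produced, and adding trailing spaces + newlines
    # so the front-end renders markdown line breaks.
    summary_sentences = []
    for sentence in nltk.sent_tokenize(long_summary):
        sentence = sentence.strip()
        if sentence.startswith("- "):
            sentence = sentence.replace("- ", "* ")
        else:
            sentence = "* " + sentence
        summary_sentences.append(sentence + " \n")
    return "".join(summary_sentences)

print(format_as_markdown_bullets("The team shipped the summary feature. Testing continues."))
# * The team shipped the summary feature.
# * Testing continues.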


@@ -72,6 +72,7 @@ class Settings(BaseSettings):
     LLM_TIMEOUT: int = 60 * 5  # take cold start into account
     LLM_MAX_TOKENS: int = 1024
     LLM_TEMPERATURE: float = 0.7
+    ZEPHYR_LLM_URL: str | None = None

     # LLM Banana configuration
     LLM_BANANA_API_KEY: str | None = None
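A minimal sketch of wiring the new setting, assuming the usual pydantic BaseSettings behaviour of reading fields from the environment; the URL and the commented import path are placeholders, not from the diff:

import os

# ZEPHYR_LLM_URL points at the Zephyr cloud deployment; leaving it unset keeps the
# default of None, and ModalLLM then keeps using its existing llm_url.
os.environ["ZEPHYR_LLM_URL"] = "https://zephyr-deployment.example.run"

# from reflector.settings import settings   # assumed import path, not shown above
# assert settings.ZEPHYR_LLM_URL == "https://zephyr-deployment.example.run"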