New summary (#283)

* hand over final summary generation to the Zephyr deployment

* fix display error

* push new summary feature

* fix failing test case

* add markdown support for final summary

* fix UI render issue

* retain sentence tokenizer call

---------

Co-authored-by: Koper <andreas@monadical.com>
projects-g committed 2023-10-13 22:53:29 +05:30 (committed by GitHub)
parent 38cd0385b4 · commit 1d92d43fe0
13 changed files with 933 additions and 23 deletions
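The heart of the change replaces the single smoothing pass in `get_long_summary` with a two-stage pipeline: the accumulated per-chunk summaries are first condensed into bullets (the new `bullet_summary` task), and the bullets are then merged back into flowing prose (the new `merged_summary` task), with generation handed to Zephyr-7B. A minimal sketch of that flow, where `fake_generate` and `two_stage_summary` are hypothetical stand-ins, not names from this diff:

```python
import asyncio

# Hypothetical stand-in for LLM.generate: the real call returns a dict
# with a "long_summary" key, which is all this sketch relies on.
async def fake_generate(task: str, text: str) -> dict:
    return {"long_summary": f"[{task}] {text} "}

async def two_stage_summary(chunk_summaries: list[str]) -> str:
    corpus = " ".join(chunk_summaries)
    # Stage 1: condense the raw chunk summaries into bullet points.
    bullets = (await fake_generate("bullet_summary", corpus))["long_summary"]
    # Stage 2: merge the bullets into one coherent long summary.
    merged = (await fake_generate("merged_summary", bullets))["long_summary"]
    return merged

print(asyncio.run(two_stage_summary(["Alice gave a status update.", "Bob raised a blocker."])))
```

In the real processor, each stage also splits its input with `split_corpus` and concatenates the per-chunk results, so oversized inputs are processed piece by piece.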


@@ -1,3 +1,4 @@
+import nltk
 from reflector.llm import LLM, LLMTaskParams
 from reflector.processors.base import Processor
 from reflector.processors.types import FinalLongSummary, TitleSummary
@@ -10,36 +11,58 @@ class TranscriptFinalLongSummaryProcessor(Processor):
     INPUT_TYPE = TitleSummary
     OUTPUT_TYPE = FinalLongSummary
     TASK = "final_long_summary"
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self.chunks: list[TitleSummary] = []
-        self.llm = LLM.get_instance()
         self.params = LLMTaskParams.get_instance(self.TASK).task_params
+        self.llm = LLM.get_instance(model_name="HuggingFaceH4/zephyr-7b-alpha")
 
     async def _push(self, data: TitleSummary):
         self.chunks.append(data)
 
+    async def get_bullet_summary(self, text: str) -> str:
+        params = LLMTaskParams.get_instance("bullet_summary").task_params
+        chunks = list(self.llm.split_corpus(corpus=text, task_params=params))
+        bullet_summary = ""
+        for chunk in chunks:
+            prompt = self.llm.create_prompt(instruct=params.instruct, text=chunk)
+            summary_result = await self.llm.generate(
+                prompt=prompt,
+                gen_schema=params.gen_schema,
+                gen_cfg=params.gen_cfg,
+                logger=self.logger,
+            )
+            bullet_summary += summary_result["long_summary"]
+        return bullet_summary
+
+    async def get_merged_summary(self, text: str) -> str:
+        params = LLMTaskParams.get_instance("merged_summary").task_params
+        chunks = list(self.llm.split_corpus(corpus=text, task_params=params))
+        merged_summary = ""
+        for chunk in chunks:
+            prompt = self.llm.create_prompt(instruct=params.instruct, text=chunk)
+            summary_result = await self.llm.generate(
+                prompt=prompt,
+                gen_schema=params.gen_schema,
+                gen_cfg=params.gen_cfg,
+                logger=self.logger,
+            )
+            merged_summary += summary_result["long_summary"]
+        return merged_summary
+
     async def get_long_summary(self, text: str) -> str:
         """
         Generate a long version of the final summary
         """
         self.logger.info(f"Smoothing out {len(text)} length summary to a long summary")
-        chunks = list(self.llm.split_corpus(corpus=text, task_params=self.params))
-        accumulated_summaries = ""
-        for chunk in chunks:
-            prompt = self.llm.create_prompt(instruct=self.params.instruct, text=chunk)
-            summary_result = await self.llm.generate(
-                prompt=prompt,
-                gen_schema=self.params.gen_schema,
-                gen_cfg=self.params.gen_cfg,
-                logger=self.logger,
-            )
-            accumulated_summaries += summary_result["long_summary"]
-        return accumulated_summaries
+        bullet_summary = await self.get_bullet_summary(text)
+        merged_summary = await self.get_merged_summary(bullet_summary)
+        return merged_summary
+
+    def sentence_tokenize(self, text: str) -> list[str]:
+        return nltk.sent_tokenize(text)
 
     async def _flush(self):
         if not self.chunks:
@@ -49,11 +72,25 @@ class TranscriptFinalLongSummaryProcessor(Processor):
         accumulated_summaries = " ".join([chunk.summary for chunk in self.chunks])
         long_summary = await self.get_long_summary(accumulated_summaries)
+        # Format the output as much as possible so it can be handled
+        # by the front-end for display
+        summary_sentences = []
+        for sentence in self.sentence_tokenize(long_summary):
+            sentence = str(sentence).strip()
+            if sentence.startswith("- "):
+                sentence = sentence.replace("- ", "* ")
+            else:
+                sentence = "* " + sentence
+            sentence += " \n"
+            summary_sentences.append(sentence)
+        formatted_long_summary = "".join(summary_sentences)
         last_chunk = self.chunks[-1]
         duration = last_chunk.timestamp + last_chunk.duration
         final_long_summary = FinalLongSummary(
-            long_summary=long_summary,
+            long_summary=formatted_long_summary,
             duration=duration,
         )
         await self.emit(final_long_summary)
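
For reference, the formatting step added to `_flush` is easy to exercise in isolation. A minimal standalone sketch assuming only `nltk` with its Punkt model available; `format_summary_markdown` is a hypothetical name, not part of this diff:

```python
import nltk

# The Punkt sentence model must be present; download is a no-op when cached.
nltk.download("punkt", quiet=True)

def format_summary_markdown(long_summary: str) -> str:
    """Render a summary as a markdown bullet list, one sentence per line,
    mirroring the formatting added to _flush above."""
    lines = []
    for sentence in nltk.sent_tokenize(long_summary):
        sentence = sentence.strip()
        if sentence.startswith("- "):
            sentence = sentence.replace("- ", "* ")  # normalize dashes to asterisks
        else:
            sentence = "* " + sentence
        lines.append(sentence + " \n")
    return "".join(lines)

demo = "The team reviewed the roadmap. - Deployment moved to Zephyr."
print(format_summary_markdown(demo))
# Expected (given Punkt's segmentation):
# * The team reviewed the roadmap.
# * Deployment moved to Zephyr.
```

Emitting one bullet per sentence means the summary reaches the front-end already shaped as a markdown list, which is what the "markdown support for final summary" commit above refers to.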