Mirror of https://github.com/Monadical-SAS/reflector.git, synced 2025-12-21 20:59:05 +00:00
New summary (#283)

* handover final summary to Zephyr deployment
* fix display error
* push new summary feature
* fix failing test case
* add markdown support for final summary
* update UI render issue
* retain sentence tokenizer call

---------

Co-authored-by: Koper <andreas@monadical.com>
@@ -1,3 +1,4 @@
+import nltk
 from reflector.llm import LLM, LLMTaskParams
 from reflector.processors.base import Processor
 from reflector.processors.types import FinalLongSummary, TitleSummary
@@ -10,36 +11,58 @@ class TranscriptFinalLongSummaryProcessor(Processor):
 
     INPUT_TYPE = TitleSummary
     OUTPUT_TYPE = FinalLongSummary
     TASK = "final_long_summary"
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self.chunks: list[TitleSummary] = []
-        self.llm = LLM.get_instance()
         self.params = LLMTaskParams.get_instance(self.TASK).task_params
+        self.llm = LLM.get_instance(model_name="HuggingFaceH4/zephyr-7b-alpha")
 
     async def _push(self, data: TitleSummary):
         self.chunks.append(data)
 
+    async def get_bullet_summary(self, text: str) -> str:
+        params = LLMTaskParams.get_instance("bullet_summary").task_params
+        chunks = list(self.llm.split_corpus(corpus=text, task_params=params))
+
+        bullet_summary = ""
+        for chunk in chunks:
+            prompt = self.llm.create_prompt(instruct=params.instruct, text=chunk)
+            summary_result = await self.llm.generate(
+                prompt=prompt,
+                gen_schema=params.gen_schema,
+                gen_cfg=params.gen_cfg,
+                logger=self.logger,
+            )
+            bullet_summary += summary_result["long_summary"]
+        return bullet_summary
+
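The added method is the whole per-task pattern: split the corpus to fit the model's context budget, run one constrained generation per chunk, and concatenate the results. A minimal standalone sketch of that control flow, with a naive character-based splitter and a stubbed generate call standing in for the token-aware split_corpus and LLM.generate (both stubs are illustrative assumptions, not the library's API):

import asyncio

def split_corpus(corpus: str, size: int = 400) -> list[str]:
    # naive fixed-width chunking; the real split_corpus budgets by tokens
    return [corpus[i : i + size] for i in range(0, len(corpus), size)]

async def generate_stub(prompt: str) -> dict:
    # stand-in for LLM.generate; echoes the schema key the processor reads
    return {"long_summary": f"[bullets for {len(prompt)} chars] "}

async def bullet_summary(text: str) -> str:
    out = ""
    for chunk in split_corpus(text):
        result = await generate_stub(f"Summarize as bullet points:\n{chunk}")
        out += result["long_summary"]
    return out

print(asyncio.run(bullet_summary("transcript text " * 100)))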
+    async def get_merged_summary(self, text: str) -> str:
+        params = LLMTaskParams.get_instance("merged_summary").task_params
+        chunks = list(self.llm.split_corpus(corpus=text, task_params=params))
+
+        merged_summary = ""
+        for chunk in chunks:
+            prompt = self.llm.create_prompt(instruct=params.instruct, text=chunk)
+            summary_result = await self.llm.generate(
+                prompt=prompt,
+                gen_schema=params.gen_schema,
+                gen_cfg=params.gen_cfg,
+                logger=self.logger,
+            )
+            merged_summary += summary_result["long_summary"]
+        return merged_summary
+
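get_bullet_summary and get_merged_summary are identical except for the task name they load. If the duplication grows, they could collapse into one helper; a possible consolidation that slots into the class (the _run_summary_task name is hypothetical, not part of this commit, and it reuses only calls shown above):

    async def _run_summary_task(self, task: str, text: str) -> str:
        # hypothetical shared helper mirroring the two methods above
        params = LLMTaskParams.get_instance(task).task_params
        out = ""
        for chunk in self.llm.split_corpus(corpus=text, task_params=params):
            prompt = self.llm.create_prompt(instruct=params.instruct, text=chunk)
            result = await self.llm.generate(
                prompt=prompt,
                gen_schema=params.gen_schema,
                gen_cfg=params.gen_cfg,
                logger=self.logger,
            )
            out += result["long_summary"]
        return out

Both existing methods would then reduce to one-liners such as return await self._run_summary_task("bullet_summary", text).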
     async def get_long_summary(self, text: str) -> str:
         """
         Generate a long version of the final summary
         """
         self.logger.info(f"Smoothing out {len(text)} length summary to a long summary")
-        chunks = list(self.llm.split_corpus(corpus=text, task_params=self.params))
+        bullet_summary = await self.get_bullet_summary(text)
+        merged_summary = await self.get_merged_summary(bullet_summary)
 
-        accumulated_summaries = ""
-        for chunk in chunks:
-            prompt = self.llm.create_prompt(instruct=self.params.instruct, text=chunk)
-            summary_result = await self.llm.generate(
-                prompt=prompt,
-                gen_schema=self.params.gen_schema,
-                gen_cfg=self.params.gen_cfg,
-                logger=self.logger,
-            )
-            accumulated_summaries += summary_result["long_summary"]
 
-        return accumulated_summaries
+        return merged_summary
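The rewrite turns get_long_summary into a two-pass map-reduce: pass one maps chunk to bullet summary over the raw text, pass two reduces the concatenated bullets into a single narrative. A self-contained sketch of that staging (the fixed-width splitter and stubbed summarizer are illustrative assumptions):

import asyncio

def split(text: str, size: int = 300) -> list[str]:
    # illustrative fixed-width splitter
    return [text[i : i + size] for i in range(0, len(text), size)]

async def run_task(task: str, chunk: str) -> str:
    # stand-in for one chunked LLM call
    return f"[{task}/{len(chunk)}ch] "

async def long_summary(text: str) -> str:
    # pass 1 (map): bullet-summarize each chunk of the raw text
    bullets = "".join([await run_task("bullet_summary", c) for c in split(text)])
    # pass 2 (reduce): merge the bullet stream into one narrative
    return "".join([await run_task("merged_summary", c) for c in split(bullets)])

print(asyncio.run(long_summary("utterance " * 200)))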
 
     def sentence_tokenize(self, text: str) -> list[str]:
         return nltk.sent_tokenize(text)
 
     async def _flush(self):
         if not self.chunks:
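One operational note on the retained tokenizer: nltk.sent_tokenize lazily loads the pretrained Punkt sentence model, so that data must be available wherever the processor runs. A quick standalone check (nltk.download is NLTK's real API; recent NLTK releases fetch a "punkt_tab" package instead of "punkt"):

import nltk

nltk.download("punkt", quiet=True)  # no-op if the model is already cached
print(nltk.sent_tokenize("The team met today. Two action items were assigned."))
# expected: ['The team met today.', 'Two action items were assigned.']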
@@ -49,11 +72,25 @@ class TranscriptFinalLongSummaryProcessor(Processor):
         accumulated_summaries = " ".join([chunk.summary for chunk in self.chunks])
         long_summary = await self.get_long_summary(accumulated_summaries)
 
+        # Format the output as much as possible to be handled
+        # by front-end for displaying
+        summary_sentences = []
+        for sentence in self.sentence_tokenize(long_summary):
+            sentence = str(sentence).strip()
+            if sentence.startswith("- "):
+                sentence = sentence.replace("- ", "* ", 1)
+            else:
+                sentence = "* " + sentence
+            sentence += " \n"
+            summary_sentences.append(sentence)
+
+        formatted_long_summary = "".join(summary_sentences)
+
         last_chunk = self.chunks[-1]
         duration = last_chunk.timestamp + last_chunk.duration
 
         final_long_summary = FinalLongSummary(
-            long_summary=long_summary,
+            long_summary=formatted_long_summary,
             duration=duration,
         )
         await self.emit(final_long_summary)
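The formatting block normalizes every tokenized sentence into a markdown bullet (a leading "- " becomes "* ", anything else gets "* " prepended) so the frontend can render the long summary as a list. A standalone sketch of just that transformation, with the tokenizer factored out so it runs without NLTK data (the to_markdown_bullets helper name is hypothetical):

def to_markdown_bullets(sentences: list[str]) -> str:
    # mirrors the formatting loop in _flush above
    bullets = []
    for sentence in sentences:
        sentence = sentence.strip()
        if sentence.startswith("- "):
            # str.replace returns a new string, so the result must be assigned
            sentence = sentence.replace("- ", "* ", 1)
        else:
            sentence = "* " + sentence
        bullets.append(sentence + " \n")
    return "".join(bullets)

print(to_markdown_bullets(["The team met today.", "- Ship the new summary view."]))
# * The team met today.
# * Ship the new summary view.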