Feature additions (#210)

* initial

* add LLM features

* update LLM logic

* update llm functions: change control flow

* add generation config

* update return types

* update processors and tests

* update rtc_offer

* revert new title processor change

* fix unit tests

* add comments and fix HTTP 500

* adjust prompt

* test with reflector app

* revert new event for final title

* update

* move onus onto processors

* move onus onto processors

* stash

* add provision for gen config

* dynamically pack the LLM input using context length

* tune final summary params

* update consolidated class structures

* update consolidated class structures

* update precommit

* add broadcast processors

* working baseline

* Organize LLMParams

* minor fixes

* minor fixes

* minor fixes

* fix unit tests

* fix unit tests

* fix unit tests

* update tests

* update tests

* edit pipeline response events

* update summary return types

* configure tests

* alembic db migration

* change LLM response flow

* edit main llm functions

* edit main llm functions

* change llm name and gen cf

* Update transcript_topic_detector.py

* PR review comments

* checkpoint before db event migration

* update DB migration of past events

* update DB migration of past events

* edit LLM classes

* Delete unwanted file

* remove List typing

* remove List typing

* update oobabooga API call

* topic enhancements

* update UI event handling

* move ensure_casing to llm base

* update tests

* update tests
Author: projects-g
Date: 2023-09-13 11:26:08 +05:30
Committed by: GitHub
Parent: 762d7bfc3c
Commit: 9fe261406c
33 changed files with 1334 additions and 202 deletions

reflector/processors/__init__.py

@@ -4,7 +4,20 @@ from .audio_merge import AudioMergeProcessor  # noqa: F401
 from .audio_transcript import AudioTranscriptProcessor  # noqa: F401
 from .audio_transcript_auto import AudioTranscriptAutoProcessor  # noqa: F401
 from .base import Pipeline, PipelineEvent, Processor, ThreadedProcessor  # noqa: F401
-from .transcript_final_summary import TranscriptFinalSummaryProcessor  # noqa: F401
+from .transcript_final_long_summary import (  # noqa: F401
+    TranscriptFinalLongSummaryProcessor,
+)
+from .transcript_final_short_summary import (  # noqa: F401
+    TranscriptFinalShortSummaryProcessor,
+)
+from .transcript_final_title import TranscriptFinalTitleProcessor  # noqa: F401
 from .transcript_liner import TranscriptLinerProcessor  # noqa: F401
 from .transcript_topic_detector import TranscriptTopicDetectorProcessor  # noqa: F401
-from .types import AudioFile, FinalSummary, TitleSummary, Transcript, Word  # noqa: F401
+from .types import (  # noqa: F401
+    AudioFile,
+    FinalLongSummary,
+    FinalShortSummary,
+    TitleSummary,
+    Transcript,
+    Word,
+)

reflector/processors/base.py

@@ -5,6 +5,7 @@ from uuid import uuid4
 from prometheus_client import Counter, Gauge, Histogram
 from pydantic import BaseModel
+from reflector.logger import logger

@@ -296,7 +297,7 @@ class BroadcastProcessor(Processor):
     types of input.
     """

-    def __init__(self, processors: Processor):
+    def __init__(self, processors: list[Processor]):
        super().__init__()
         self.processors = processors
         self.INPUT_TYPE = processors[0].INPUT_TYPE
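
The constructor now takes a list of child processors and fans each input out to all of them. A minimal, runnable sketch of that fan-out behavior; the Processor base class and push contract here are simplified stand-ins, not the real reflector.processors.base implementation:

import asyncio


class Processor:
    INPUT_TYPE = str

    async def push(self, data):
        raise NotImplementedError


class Upper(Processor):
    async def push(self, data: str):
        print("upper:", data.upper())


class Lower(Processor):
    async def push(self, data: str):
        print("lower:", data.lower())


class BroadcastProcessor(Processor):
    """Forward each input to every child processor."""

    def __init__(self, processors: list[Processor]):
        self.processors = processors
        # All children are assumed to consume the same type,
        # so the broadcast borrows its INPUT_TYPE from the first child.
        self.INPUT_TYPE = processors[0].INPUT_TYPE

    async def push(self, data):
        # Fan the same payload out to every child, in order.
        for processor in self.processors:
            await processor.push(data)


asyncio.run(BroadcastProcessor([Upper(), Lower()]).push("Hello"))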

reflector/processors/transcript_final_long_summary.py (new file)

@@ -0,0 +1,59 @@
from reflector.llm import LLM, LLMTaskParams
from reflector.processors.base import Processor
from reflector.processors.types import FinalLongSummary, TitleSummary


class TranscriptFinalLongSummaryProcessor(Processor):
    """
    Get the final long summary
    """

    INPUT_TYPE = TitleSummary
    OUTPUT_TYPE = FinalLongSummary
    TASK = "final_long_summary"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.chunks: list[TitleSummary] = []
        self.llm = LLM.get_instance()
        self.params = LLMTaskParams.get_instance(self.TASK).task_params

    async def _push(self, data: TitleSummary):
        self.chunks.append(data)

    async def get_long_summary(self, text: str) -> str:
        """
        Generate a long version of the final summary
        """
        self.logger.info(f"Smoothing out {len(text)} length summary to a long summary")
        chunks = list(self.llm.split_corpus(corpus=text, task_params=self.params))
        accumulated_summaries = ""
        for chunk in chunks:
            prompt = self.llm.create_prompt(instruct=self.params.instruct, text=chunk)
            summary_result = await self.llm.generate(
                prompt=prompt,
                gen_schema=self.params.gen_schema,
                gen_cfg=self.params.gen_cfg,
                logger=self.logger,
            )
            accumulated_summaries += summary_result["long_summary"]
        return accumulated_summaries

    async def _flush(self):
        if not self.chunks:
            self.logger.warning("No summary to output")
            return
        accumulated_summaries = " ".join([chunk.summary for chunk in self.chunks])
        long_summary = await self.get_long_summary(accumulated_summaries)
        last_chunk = self.chunks[-1]
        duration = last_chunk.timestamp + last_chunk.duration
        final_long_summary = FinalLongSummary(
            long_summary=long_summary,
            duration=duration,
        )
        await self.emit(final_long_summary)
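
This processor leans on LLM.split_corpus, which per the commit log above "dynamically pack[s] the LLM input using context length". Its implementation is not part of this diff, so the following is only a hypothetical sketch of such a greedy packer, using a toy whitespace token counter:

# Hypothetical sketch of a context-length-aware corpus splitter; the real
# reflector.llm.split_corpus is not shown in this diff and may differ.
def split_corpus(corpus: str, max_tokens: int, count_tokens=lambda s: len(s.split())):
    """Greedily pack sentences into chunks that fit the model context."""
    chunk, chunks, used = [], [], 0
    for sentence in corpus.split(". "):
        needed = count_tokens(sentence)
        # Start a new chunk when the next sentence would overflow the budget.
        if chunk and used + needed > max_tokens:
            chunks.append(". ".join(chunk))
            chunk, used = [], 0
        chunk.append(sentence)
        used += needed
    if chunk:
        chunks.append(". ".join(chunk))
    return chunks


print(split_corpus("One two. Three four five. Six.", max_tokens=4))
# -> ['One two', 'Three four five. Six.']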

reflector/processors/transcript_final_short_summary.py (new file)

@@ -0,0 +1,72 @@
from reflector.llm import LLM, LLMTaskParams
from reflector.processors.base import Processor
from reflector.processors.types import FinalShortSummary, TitleSummary


class TranscriptFinalShortSummaryProcessor(Processor):
    """
    Get the final summary using a tree summarizer
    """

    INPUT_TYPE = TitleSummary
    OUTPUT_TYPE = FinalShortSummary
    TASK = "final_short_summary"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.chunks: list[TitleSummary] = []
        self.llm = LLM.get_instance()
        self.params = LLMTaskParams.get_instance(self.TASK).task_params

    async def _push(self, data: TitleSummary):
        self.chunks.append(data)

    async def get_short_summary(self, text: str) -> dict:
        """
        Generate a short summary using a tree summarizer
        """
        self.logger.info(f"Smoothing out {len(text)} length summary to a short summary")
        chunks = list(self.llm.split_corpus(corpus=text, task_params=self.params))
        if len(chunks) == 1:
            chunk = chunks[0]
            prompt = self.llm.create_prompt(instruct=self.params.instruct, text=chunk)
            summary_result = await self.llm.generate(
                prompt=prompt,
                gen_schema=self.params.gen_schema,
                gen_cfg=self.params.gen_cfg,
                logger=self.logger,
            )
            return summary_result
        else:
            accumulated_summaries = ""
            for chunk in chunks:
                prompt = self.llm.create_prompt(
                    instruct=self.params.instruct, text=chunk
                )
                summary_result = await self.llm.generate(
                    prompt=prompt,
                    gen_schema=self.params.gen_schema,
                    gen_cfg=self.params.gen_cfg,
                    logger=self.logger,
                )
                accumulated_summaries += summary_result["short_summary"]
            return await self.get_short_summary(accumulated_summaries)

    async def _flush(self):
        if not self.chunks:
            self.logger.warning("No summary to output")
            return
        accumulated_summaries = " ".join([chunk.summary for chunk in self.chunks])
        short_summary_result = await self.get_short_summary(accumulated_summaries)
        last_chunk = self.chunks[-1]
        duration = last_chunk.timestamp + last_chunk.duration
        final_summary = FinalShortSummary(
            short_summary=short_summary_result["short_summary"],
            duration=duration,
        )
        await self.emit(final_summary)
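
The tree-summarizer recursion above works like this: summarize each chunk, concatenate the partial summaries, and recurse on the concatenation until the whole text fits a single chunk. A toy, runnable illustration of that control flow; summarize() stands in for the llm.generate call, and chunking is by character count rather than by model context length:

# Toy illustration of tree summarization; not the reflector implementation.
def summarize(text: str) -> str:
    return text[:20]  # pretend the model compresses the text


def tree_summary(text: str, chunk_size: int = 100) -> str:
    chunks = [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]
    if len(chunks) == 1:
        # Base case: everything fits in one chunk, summarize once and stop.
        return summarize(chunks[0])
    # Summarize each chunk, then recurse on the shorter, merged text.
    merged = "".join(summarize(chunk) for chunk in chunks)
    return tree_summary(merged, chunk_size)


print(tree_summary("lorem ipsum " * 50))

Termination relies on the summarizer producing output shorter than its input, so each recursion level shrinks the text until the base case is reached.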

reflector/processors/transcript_final_summary.py (deleted)

@@ -1,30 +0,0 @@
from reflector.processors.base import Processor
from reflector.processors.types import TitleSummary, FinalSummary


class TranscriptFinalSummaryProcessor(Processor):
    """
    Assemble all summary into a line-based json
    """

    INPUT_TYPE = TitleSummary
    OUTPUT_TYPE = FinalSummary

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.chunks: list[TitleSummary] = []

    async def _push(self, data: TitleSummary):
        self.chunks.append(data)

    async def _flush(self):
        if not self.chunks:
            self.logger.warning("No summary to output")
            return
        # FIXME improve final summary
        result = "\n".join([chunk.summary for chunk in self.chunks])
        last_chunk = self.chunks[-1]
        duration = last_chunk.timestamp + last_chunk.duration
        await self.emit(FinalSummary(summary=result, duration=duration))

reflector/processors/transcript_final_title.py (new file)

@@ -0,0 +1,65 @@
from reflector.llm import LLM, LLMTaskParams
from reflector.processors.base import Processor
from reflector.processors.types import FinalTitle, TitleSummary


class TranscriptFinalTitleProcessor(Processor):
    """
    Generate the final title from the accumulated chunk titles
    """

    INPUT_TYPE = TitleSummary
    OUTPUT_TYPE = FinalTitle
    TASK = "final_title"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.chunks: list[TitleSummary] = []
        self.llm = LLM.get_instance()
        self.params = LLMTaskParams.get_instance(self.TASK).task_params

    async def _push(self, data: TitleSummary):
        self.chunks.append(data)

    async def get_title(self, text: str) -> dict:
        """
        Generate a title for the whole recording
        """
        chunks = list(self.llm.split_corpus(corpus=text, task_params=self.params))
        if len(chunks) == 1:
            chunk = chunks[0]
            prompt = self.llm.create_prompt(instruct=self.params.instruct, text=chunk)
            title_result = await self.llm.generate(
                prompt=prompt,
                gen_schema=self.params.gen_schema,
                gen_cfg=self.params.gen_cfg,
                logger=self.logger,
            )
            return title_result
        else:
            accumulated_titles = ""
            for chunk in chunks:
                prompt = self.llm.create_prompt(
                    instruct=self.params.instruct, text=chunk
                )
                title_result = await self.llm.generate(
                    prompt=prompt,
                    gen_schema=self.params.gen_schema,
                    gen_cfg=self.params.gen_cfg,
                    logger=self.logger,
                )
                accumulated_titles += title_result["summary"]
            return await self.get_title(accumulated_titles)

    async def _flush(self):
        if not self.chunks:
            self.logger.warning("No summary to output")
            return
        accumulated_titles = ".".join([chunk.title for chunk in self.chunks])
        title_result = await self.get_title(accumulated_titles)
        final_title = FinalTitle(title=title_result["title"])
        await self.emit(final_title)

reflector/processors/transcript_topic_detector.py

@@ -1,7 +1,6 @@
-from reflector.llm import LLM
+from reflector.llm import LLM, LLMTaskParams
 from reflector.processors.base import Processor
 from reflector.processors.types import TitleSummary, Transcript
-from reflector.utils.retry import retry


 class TranscriptTopicDetectorProcessor(Processor):

@@ -11,34 +10,14 @@ class TranscriptTopicDetectorProcessor(Processor):
     INPUT_TYPE = Transcript
     OUTPUT_TYPE = TitleSummary
+    TASK = "topic"

-    PROMPT = """
-    ### Human:
-    Create a JSON object as response.The JSON object must have 2 fields:
-    i) title and ii) summary.
-    For the title field, generate a short title for the given text.
-    For the summary field, summarize the given text in a maximum of
-    three sentences.
-
-    {input_text}
-
-    ### Assistant:
-    """
-
-    def __init__(self, min_transcript_length=750, **kwargs):
+    def __init__(self, min_transcript_length: int = 750, **kwargs):
         super().__init__(**kwargs)
         self.transcript = None
         self.min_transcript_length = min_transcript_length
         self.llm = LLM.get_instance()
-        self.topic_detector_schema = {
-            "type": "object",
-            "properties": {
-                "title": {"type": "string"},
-                "summary": {"type": "string"},
-            },
-        }
+        self.params = LLMTaskParams.get_instance(self.TASK).task_params

     async def _warmup(self):
         await self.llm.warmup(logger=self.logger)

@@ -55,18 +34,30 @@ class TranscriptTopicDetectorProcessor(Processor):
             return
         await self.flush()

+    async def get_topic(self, text: str) -> dict:
+        """
+        Generate a topic and description for a transcription excerpt
+        """
+        prompt = self.llm.create_prompt(instruct=self.params.instruct, text=text)
+        topic_result = await self.llm.generate(
+            prompt=prompt,
+            gen_schema=self.params.gen_schema,
+            gen_cfg=self.params.gen_cfg,
+            logger=self.logger,
+        )
+        return topic_result
+
     async def _flush(self):
         if not self.transcript:
             return
         text = self.transcript.text
         self.logger.info(f"Topic detector got {len(text)} length transcript")
-        prompt = self.PROMPT.format(input_text=text)
-        result = await retry(self.llm.generate)(
-            prompt=prompt, schema=self.topic_detector_schema, logger=self.logger
-        )
+        topic_result = await self.get_topic(text=text)
         summary = TitleSummary(
-            title=result["title"],
-            summary=result["summary"],
+            title=self.llm.ensure_casing(topic_result["title"]),
+            summary=topic_result["summary"],
             timestamp=self.transcript.timestamp,
             duration=self.transcript.duration,
             transcript=self.transcript,
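
With this change the inline PROMPT string and topic_detector_schema are gone: each processor now looks up its prompt, JSON schema, and generation config by task name through LLMTaskParams.get_instance(TASK).task_params. The registry itself is not shown in this diff, so here is only a hypothetical sketch of its shape, reusing the schema removed above; the real LLMTaskParams class may be organized differently:

# Hypothetical task-parameter registry with the fields this diff references
# (instruct, gen_schema, gen_cfg); an assumption, not the reflector code.
from dataclasses import dataclass, field


@dataclass
class TaskParams:
    instruct: str
    gen_schema: dict
    gen_cfg: dict = field(default_factory=dict)


TASK_PARAMS = {
    "topic": TaskParams(
        instruct="Generate a short title and a three-sentence summary.",
        gen_schema={
            "type": "object",
            "properties": {
                "title": {"type": "string"},
                "summary": {"type": "string"},
            },
        },
    ),
}


def get_task_params(task: str) -> TaskParams:
    return TASK_PARAMS[task]

Centralizing the parameters this way lets the "tune final summary params" and "add generation config" commits adjust prompts and decoding settings without touching the processor control flow.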

reflector/processors/types.py

@@ -103,11 +103,20 @@ class TitleSummary(BaseModel):
         return f"{minutes:02d}:{seconds:02d}.{milliseconds:03d}"


-class FinalSummary(BaseModel):
-    summary: str
+class FinalLongSummary(BaseModel):
+    long_summary: str
+    duration: float
+
+
+class FinalShortSummary(BaseModel):
+    short_summary: str
     duration: float


+class FinalTitle(BaseModel):
+    title: str
+
+
 class TranslationLanguages(BaseModel):
     language_to_id_mapping: dict = {
         "Afrikaans": "af",