mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-20 20:29:06 +00:00
This features a new modal endpoint, and a completely new way to build the summary. ## SummaryBuilder The summary builder is based on a conversational model, where an exchange between the model and the user takes place. This allows more context to be included and better adherence to the rules. It requires a service with an OpenAI-like completions endpoint (/v1/chat/completions). ## vLLM Hermes3 Unlike previous deployments, this one uses vLLM, which provides an OpenAI-like completions endpoint out of the box. It can also handle guided JSON generation, so jsonformer is not needed; in practice, the model is quite good at following a JSON schema when asked in the prompt. ## Conversion of long/short into summary builder The builder identifies participants, finds key subjects, gets a summary for each, then produces a quick recap. The quick recap is used as the short_summary, while the markdown comprising the quick recap + key subjects + summaries is used for the long_summary. This is why the nextjs component has to be updated, to correctly style h1 and keep the new lines of the markdown.
232 lines
6.9 KiB
Python
232 lines
6.9 KiB
Python
from unittest.mock import patch
|
|
from tempfile import NamedTemporaryFile
|
|
|
|
import pytest
|
|
|
|
|
|
@pytest.fixture(scope="function", autouse=True)
async def setup_database():
    """Point the app at a throwaway SQLite database for every test.

    Autouse, function-scoped: each test gets a fresh temporary file,
    ``settings.DATABASE_URL`` is rewired to it and the schema is created
    before the test runs. The file (and the database) disappears when the
    ``NamedTemporaryFile`` context exits at teardown.

    Note: the previous version also carried ``@pytest.mark.asyncio`` —
    marks have no effect on fixtures (pytest warns, and newer versions
    error), so it has been removed.
    """
    from reflector.settings import settings

    with NamedTemporaryFile() as f:
        settings.DATABASE_URL = f"sqlite:///{f.name}"
        # Imported here, after DATABASE_URL is set, so the engine binds to
        # the temporary database rather than the real one.
        from reflector.db import engine, metadata

        metadata.create_all(bind=engine)
        yield
|
@pytest.fixture
def dummy_processors():
    """Replace every LLM-backed processor method with a canned response.

    Yields the mock objects as a tuple, in this order:
    (translate, topic, title, long_summary, short_summary).
    """
    # (patch target, canned return value) — order matters: it defines the
    # order of the tuple yielded to the test.
    specs = [
        (
            "reflector.processors.transcript_translator."
            "TranscriptTranslatorProcessor.get_translation",
            "Bonjour le monde",
        ),
        (
            "reflector.processors.transcript_topic_detector."
            "TranscriptTopicDetectorProcessor.get_topic",
            {"title": "LLM TITLE", "summary": "LLM SUMMARY"},
        ),
        (
            "reflector.processors.transcript_final_title."
            "TranscriptFinalTitleProcessor.get_title",
            {"title": "LLM TITLE"},
        ),
        (
            "reflector.processors.transcript_final_summary."
            "TranscriptFinalSummaryProcessor.get_long_summary",
            "LLM LONG SUMMARY",
        ),
        (
            "reflector.processors.transcript_final_summary."
            "TranscriptFinalSummaryProcessor.get_short_summary",
            "LLM SHORT SUMMARY",
        ),
    ]
    patchers = [patch(target) for target, _ in specs]
    mocks = []
    try:
        for patcher, (_, value) in zip(patchers, specs):
            mock = patcher.start()
            mock.return_value = value
            mocks.append(mock)
        yield tuple(mocks)  # noqa
    finally:
        for patcher in patchers:
            patcher.stop()
|
@pytest.fixture
async def dummy_transcript():
    """Swap the automatic audio transcriber for a deterministic fake.

    Every call to the fake transcriber yields the text "Hello world."
    with two one-second words attributed to speaker 0; the start time
    advances by two seconds on each invocation.
    """
    from reflector.processors.audio_transcript import AudioTranscriptProcessor
    from reflector.processors.types import AudioFile, Transcript, Word

    class FakeAudioTranscriptProcessor(AudioTranscriptProcessor):
        _time_idx = 0

        async def _transcript(self, data: AudioFile):
            start = self._time_idx
            self._time_idx += 2
            words = [
                Word(start=start, end=start + 1, text="Hello", speaker=0),
                Word(start=start + 1, end=start + 2, text=" world.", speaker=0),
            ]
            return Transcript(text="Hello world.", words=words)

    # Intercept construction so any AudioTranscriptAutoProcessor becomes
    # the fake above.
    with patch(
        "reflector.processors.audio_transcript_auto"
        ".AudioTranscriptAutoProcessor.__new__"
    ) as mock_new:
        mock_new.return_value = FakeAudioTranscriptProcessor()
        yield
|
@pytest.fixture
async def dummy_diarization():
    """Swap the automatic diarizer for a deterministic fake.

    Every call returns two one-second segments (speaker 0 then speaker 1);
    the start time advances by two seconds on each invocation.
    """
    from reflector.processors.audio_diarization import AudioDiarizationProcessor

    class FakeAudioDiarizationProcessor(AudioDiarizationProcessor):
        _time_idx = 0

        async def _diarize(self, data):
            start = self._time_idx
            self._time_idx += 2
            return [
                {"start": start, "end": start + 1, "speaker": 0},
                {"start": start + 1, "end": start + 2, "speaker": 1},
            ]

    # Intercept construction so any AudioDiarizationAutoProcessor becomes
    # the fake above.
    with patch(
        "reflector.processors.audio_diarization_auto"
        ".AudioDiarizationAutoProcessor.__new__"
    ) as mock_new:
        mock_new.return_value = FakeAudioDiarizationProcessor()
        yield
|
@pytest.fixture
async def dummy_llm():
    """Make ``LLM.get_instance`` hand back a stub model object."""
    from reflector.llm.base import LLM

    class StubLLM(LLM):
        # Skip the real constructor entirely: only set the attributes the
        # code under test reads.
        def __init__(self):
            self.model_name = "DUMMY MODEL"
            self.llm_tokenizer = "DUMMY TOKENIZER"

    with patch("reflector.llm.base.LLM.get_instance") as mock_get_instance:
        mock_get_instance.return_value = StubLLM()
        yield
|
@pytest.fixture
async def dummy_storage():
    """Make ``Storage.get_instance`` return a no-op storage backend.

    Puts and deletes do nothing; URL lookups always resolve to a fake
    audio address.
    """
    from reflector.storage.base import Storage

    class NullStorage(Storage):
        async def _put_file(self, *args, **kwargs):
            pass

        async def _delete_file(self, *args, **kwargs):
            pass

        async def _get_file_url(self, *args, **kwargs):
            return "http://fake_server/audio.mp3"

    with patch("reflector.storage.base.Storage.get_instance") as mock_get_instance:
        mock_get_instance.return_value = NullStorage()
        yield
|
@pytest.fixture
def nltk():
    """Stub ``LLM.ensure_nltk`` so tests never download NLTK data."""
    with patch("reflector.llm.base.LLM.ensure_nltk") as mock_ensure_nltk:
        mock_ensure_nltk.return_value = "NLTK PACKAGE"
        yield
|
@pytest.fixture
def ensure_casing():
    """Stub ``LLM.ensure_casing`` to return a fixed title."""
    with patch("reflector.llm.base.LLM.ensure_casing") as mock_ensure_casing:
        mock_ensure_casing.return_value = "LLM TITLE"
        yield
|
@pytest.fixture(scope="session")
def celery_enable_logging():
    """Tell the celery pytest plugin to keep worker logging enabled."""
    return True
|
@pytest.fixture(scope="session")
def celery_config():
    """Celery test configuration: in-memory broker, SQLite result backend.

    The backing SQLite file lives for the whole session and is removed
    when the temporary-file context exits at teardown.
    """
    with NamedTemporaryFile() as f:
        config = {
            "broker_url": "memory://",
            "result_backend": f"db+sqlite:///{f.name}",
        }
        yield config
|
@pytest.fixture(scope="session")
def celery_includes():
    """Modules the celery test worker must import at startup."""
    return ["reflector.pipelines.main_live_pipeline"]
|
@pytest.fixture(scope="session")
def fake_mp3_upload():
    """Pretend the mp3-to-storage upload always succeeds."""
    target = "reflector.db.transcripts.TranscriptController.move_mp3_to_storage"
    with patch(target) as mock_move:
        mock_move.return_value = True
        yield
|
@pytest.fixture
async def fake_transcript_with_topics(tmpdir):
    """Create a finished transcript backed by a real mp3 plus two topics.

    The transcript is created through the public HTTP API; the audio file
    and topics are then seeded directly via the controllers so that
    download/detail endpoints have something to serve. Yields the
    transcript object.
    """
    import shutil
    from pathlib import Path

    from httpx import AsyncClient

    from reflector.app import app
    from reflector.db.transcripts import TranscriptTopic
    from reflector.processors.types import Word
    from reflector.settings import settings
    from reflector.views.transcripts import transcripts_controller

    settings.DATA_DIR = Path(tmpdir)

    # Create the transcript through the API, as a client would.
    client = AsyncClient(app=app, base_url="http://test/v1")
    response = await client.post("/transcripts", json={"name": "Test audio download"})
    assert response.status_code == 200
    transcript_id = response.json()["id"]

    transcript = await transcripts_controller.get_by_id(transcript_id)
    assert transcript is not None

    await transcripts_controller.update(transcript, {"status": "finished"})

    # Manually place a real recording where the app expects the mp3.
    audio_filename = transcript.audio_mp3_filename
    source_mp3 = Path(__file__).parent / "records" / "test_mathieu_hello.mp3"
    audio_filename.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(source_mp3, audio_filename)

    # Seed two topics, two seconds apart, each covering "Hello world".
    for index, offset in ((1, 0), (2, 2)):
        await transcripts_controller.upsert_topic(
            transcript,
            TranscriptTopic(
                title=f"Topic {index}",
                summary=f"Topic {index} summary",
                timestamp=offset,
                transcript="Hello world",
                words=[
                    Word(text="Hello", start=offset, end=offset + 1, speaker=0),
                    Word(text="world", start=offset + 1, end=offset + 2, speaker=0),
                ],
            ),
        )

    yield transcript