"""reflector/server/server.py: WebRTC based server for Reflector."""
import argparse
import asyncio
import datetime
import json
import os
import uuid
import wave
from concurrent.futures import ThreadPoolExecutor
from typing import Union
import aiohttp_cors
import av
import requests
from aiohttp import web
from aiortc import MediaStreamTrack, RTCPeerConnection, RTCSessionDescription
from aiortc.contrib.media import MediaRelay
from faster_whisper import WhisperModel
from reflector_dataclasses import (
    BlackListedMessages,
    FinalSummaryResult,
    ParseLLMResult,
    TitleSummaryInput,
    TitleSummaryOutput,
    TranscriptionInput,
    TranscriptionOutput,
    TranscriptionContext,
)
from utils.log_utils import LOGGER
from utils.run_utils import CONFIG, run_in_executor, SECRETS
# WebRTC components
pcs = set()
relay = MediaRelay()
executor = ThreadPoolExecutor()
# Transcription model
model = WhisperModel("tiny", device="cpu", compute_type="float32", num_workers=12)
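# "tiny" is the smallest Whisper checkpoint; swapping in "base" or "small"
# (same WhisperModel call) would trade CPU time for accuracy.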
# Audio configurations
CHANNELS = int(CONFIG["AUDIO"]["CHANNELS"])
RATE = int(CONFIG["AUDIO"]["SAMPLING_RATE"])
AUDIO_BUFFER_SIZE = 256 * 960
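# 256 * 960 = 245,760 samples per batch. WebRTC audio frames typically carry
# 960 samples (20 ms at 48 kHz), so assuming a 48 kHz sampling rate this is
# roughly 245760 / 48000 = 5.12 s of audio per transcription call.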
# LLM
LLM_URL = os.environ.get("LLM_URL")
if LLM_URL:
    LOGGER.info(f"Using LLM from environment: {LLM_URL}")
else:
    LLM_MACHINE_IP = CONFIG["LLM"]["LLM_MACHINE_IP"]
    LLM_MACHINE_PORT = CONFIG["LLM"]["LLM_MACHINE_PORT"]
    LLM_URL = f"http://{LLM_MACHINE_IP}:{LLM_MACHINE_PORT}/api/v1/generate"


def parse_llm_output(
    param: TitleSummaryInput, response: requests.Response
) -> Union[None, ParseLLMResult]:
    """
    Parse the raw LLM response.
    :param param: the input that produced this response
    :param response: HTTP response returned by the LLM endpoint
    :return: the parsed result, or None if the payload could not be decoded
    """
    try:
        output = json.loads(response.json()["results"][0]["text"])
        return ParseLLMResult(param, output)
    except Exception as e:
        LOGGER.info("Exception: %s", e)
        return None
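

# parse_llm_output expects the endpoint to return a payload shaped like
# {"results": [{"text": "<json string>"}]}, where the inner "text" is itself
# a JSON document describing the generated topic and summary (this shape is
# inferred from the parsing above, not from the LLM's documentation).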
def get_title_and_summary(
    ctx: TranscriptionContext, param: TitleSummaryInput
) -> Union[None, TitleSummaryOutput]:
    """
    From the input provided (transcript), query the LLM to generate
    topics and summaries.
    :param ctx: per-connection transcription context
    :param param: prompt headers and request payload for the LLM
    :return: all incremental responses so far, or None on failure
    """
    LOGGER.info("Generating title and summary")
    # TODO: Handle unexpected output formats from the model
    try:
        response = requests.post(LLM_URL, headers=param.headers, json=param.data)
        output = parse_llm_output(param, response)
        if output:
            result = output.get_result()
            ctx.incremental_responses.append(result)
            return TitleSummaryOutput(ctx.incremental_responses)
    except Exception:
        LOGGER.exception("Exception while generating title and summary")
    return None


def channel_log(channel, t: str, message: str) -> None:
    """
    Log a data-channel event.
    :param channel: the WebRTC data channel
    :param t: direction marker, e.g. "<" or ">"
    :param message: the message to log
    """
    LOGGER.info("channel(%s) %s %s", channel.label, t, message)


def channel_send(channel, message: str) -> None:
    """
    Send a text message via the data channel.
    :param channel: the WebRTC data channel
    :param message: the message to send
    """
    if channel:
        channel.send(message)


def channel_send_increment(
    channel, param: Union[FinalSummaryResult, TitleSummaryOutput]
) -> None:
    """
    Send the incremental topics and summaries via the data channel.
    :param channel: the WebRTC data channel
    :param param: result object whose payload is JSON-serialised and sent
    """
    if channel and param:
        message = param.get_result()
        channel.send(json.dumps(message))
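

# ctx.sorted_transcripts is an ordered mapping keyed by the start time of each
# audio batch: a slot is reserved when a batch is submitted, so finished
# transcripts are emitted to the client in capture order even though jobs on
# the thread pool may complete out of order.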
def channel_send_transcript(ctx: TranscriptionContext) -> None:
    """
    Send the oldest pending transcription result via the data channel.
    :param ctx: per-connection transcription context
    """
    if not ctx.data_channel:
        return
    try:
        least_time = next(iter(ctx.sorted_transcripts))
        message = ctx.sorted_transcripts[least_time].get_result()
        if message:
            del ctx.sorted_transcripts[least_time]
            if message["text"] not in BlackListedMessages.messages:
                ctx.data_channel.send(json.dumps(message))
        # Due to exceptions, one of the earlier batches may never return a
        # transcript, and we don't want to be stuck waiting for its result.
        # At the threshold size of 3, pop the first (lost) element.
        else:
            if len(ctx.sorted_transcripts) >= 3:
                del ctx.sorted_transcripts[least_time]
    except Exception as exception:
        LOGGER.info("Exception: %s", exception)


def get_transcription(
    ctx: TranscriptionContext, input_frames: TranscriptionInput
) -> Union[None, TranscriptionOutput]:
    """
    From the collected audio frames, create a transcription by inferring
    from the chosen transcription model.
    :param ctx: per-connection transcription context
    :param input_frames: batch of audio frames to transcribe
    :return: the transcription result for this batch
    """
    LOGGER.info("Transcribing...")
    # Reserve this batch's slot so results are sent in capture order
    ctx.sorted_transcripts[input_frames.frames[0].time] = None
    # TODO: Find a cleaner way; watch the "no transcription" issue below.
    # Passing IO objects instead of temporary files throws an error, and
    # passing an ndarray (type cast to float) does not give any
    # transcription. Refer to the issue:
    # https://github.com/guillaumekln/faster-whisper/issues/369
    audio_file = "test" + str(datetime.datetime.now())
    wf = wave.open(audio_file, "wb")
    wf.setnchannels(CHANNELS)
    wf.setframerate(RATE)
    wf.setsampwidth(2)  # 16-bit PCM
    for frame in input_frames.frames:
        wf.writeframes(b"".join(frame.to_ndarray()))
    wf.close()
    result_text = ""
    try:
        segments, _ = model.transcribe(
            audio_file,
            language="en",
            beam_size=5,
            vad_filter=True,
            vad_parameters={"min_silence_duration_ms": 500},
        )
        os.remove(audio_file)
        segments = list(segments)
        duration = 0.0
        for segment in segments:
            result_text += segment.text
            start_time = segment.start
            end_time = segment.end
            if not segment.start:
                start_time = 0.0
            if not segment.end:
                end_time = 5.5
            duration += end_time - start_time
        ctx.last_transcribed_time += duration
        ctx.transcription_text += result_text
    except Exception as exception:
        LOGGER.info("Exception: %s", exception)
    result = TranscriptionOutput(result_text)
    ctx.sorted_transcripts[input_frames.frames[0].time] = result
    return result


def get_final_summary_response(ctx: TranscriptionContext) -> FinalSummaryResult:
    """
    Collate the incremental summaries generated so far and return them as
    the final summary.
    :param ctx: per-connection transcription context
    :return: the collated final summary
    """
    final_summary = ""
    # Collate incremental summaries
    for topic in ctx.incremental_responses:
        final_summary += topic["description"]
    response = FinalSummaryResult(final_summary, ctx.last_transcribed_time)
    with open(
        "./artefacts/meeting_titles_and_summaries.txt", "a", encoding="utf-8"
    ) as file:
        file.write(json.dumps(ctx.incremental_responses))
    return response
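
# Each entry of ctx.incremental_responses is a dict carrying at least a
# "description" key, e.g. {"title": "...", "description": "..."}; the
# "title" field is an assumption here, only "description" is read above.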


class AudioStreamTrack(MediaStreamTrack):
    """
    An audio stream track that buffers incoming frames and feeds them to
    the transcription and summarisation pipeline.
    """

    kind = "audio"

    def __init__(self, ctx: TranscriptionContext, track):
        super().__init__()
        self.ctx = ctx
        self.track = track
        self.audio_buffer = av.AudioFifo()

    async def recv(self) -> av.audio.frame.AudioFrame:
        ctx = self.ctx
        frame = await self.track.recv()
        self.audio_buffer.write(frame)
        if local_frames := self.audio_buffer.read_many(
            AUDIO_BUFFER_SIZE, partial=False
        ):
            # Offload transcription to the thread pool; send the result
            # over the data channel when it completes.
            whisper_result = run_in_executor(
                get_transcription,
                ctx,
                TranscriptionInput(local_frames),
                executor=executor,
            )
            whisper_result.add_done_callback(
                lambda f: channel_send_transcript(ctx) if f.result() else None
            )
            if len(ctx.transcription_text) > 25:
                llm_input_text = ctx.transcription_text
                ctx.transcription_text = ""
                param = TitleSummaryInput(
                    input_text=llm_input_text,
                    transcribed_time=ctx.last_transcribed_time,
                )
                llm_result = run_in_executor(
                    get_title_and_summary, ctx, param, executor=executor
                )
                llm_result.add_done_callback(
                    lambda f: channel_send_increment(ctx.data_channel, f.result())
                    if f.result()
                    else None
                )
        return frame
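

# Pipeline per connection: recv() accumulates AUDIO_BUFFER_SIZE samples,
# hands each batch to get_transcription() on the thread pool, and, once more
# than 25 characters of transcript have accumulated, asks the LLM for an
# incremental title and summary that is pushed over the data channel.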


async def offer(request: web.Request) -> web.Response:
    """
    Establish the WebRTC connection with the client.
    :param request: HTTP request carrying the client's SDP offer
    :return: JSON response carrying the server's SDP answer
    """
    params = await request.json()
    offer = RTCSessionDescription(sdp=params["sdp"], type=params["type"])
    ctx = TranscriptionContext()
    pc = RTCPeerConnection()
    pc_id = "PeerConnection(%s)" % uuid.uuid4()
    pcs.add(pc)

    def log_info(msg, *args) -> None:
        LOGGER.info(pc_id + " " + msg, *args)

    log_info("Created for " + request.remote)
@pc.on("datachannel")
def on_datachannel(channel) -> NoReturn:
ctx.data_channel = channel
channel_log(channel, "-", "created by remote party")
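
        # Expected client control messages: a JSON payload such as
        # {"cmd": "STOP"} requesting the final summary, or a plain
        # "ping..." string answered with "pong...".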
@channel.on("message")
def on_message(message: str) -> NoReturn:
channel_log(channel, "<", message)
if json.loads(message)["cmd"] == "STOP":
# Placeholder final summary
response = get_final_summary_response()
channel_send_increment(channel, response)
# To-do Add code to stop connection from server side here
# But have to handshake with client once
if isinstance(message, str) and message.startswith("ping"):
channel_send(channel, "pong" + message[4:])
@pc.on("connectionstatechange")
async def on_connectionstatechange() -> NoReturn:
log_info("Connection state is " + pc.connectionState)
if pc.connectionState == "failed":
await pc.close()
pcs.discard(pc)
@pc.on("track")
def on_track(track) -> NoReturn:
log_info("Track " + track.kind + " received")
pc.addTrack(AudioStreamTrack(ctx, relay.subscribe(track)))

    await pc.setRemoteDescription(offer)
    answer = await pc.createAnswer()
    await pc.setLocalDescription(answer)
    return web.Response(
        content_type="application/json",
        text=json.dumps(
            {"sdp": pc.localDescription.sdp, "type": pc.localDescription.type}
        ),
    )
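

# A minimal client-side sketch of the /offer exchange (hypothetical client,
# for illustration; assumes the aiortc and aiohttp client APIs and the
# default port below):
#
#     pc = RTCPeerConnection()
#     pc.addTrack(microphone_track)  # hypothetical audio source
#     await pc.setLocalDescription(await pc.createOffer())
#     async with aiohttp.ClientSession() as session:
#         async with session.post(
#             "http://localhost:1250/offer",
#             json={"sdp": pc.localDescription.sdp,
#                   "type": pc.localDescription.type},
#         ) as resp:
#             answer = await resp.json()
#     await pc.setRemoteDescription(
#         RTCSessionDescription(sdp=answer["sdp"], type=answer["type"])
#     )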


async def on_shutdown(application: web.Application) -> None:
    """
    On shutdown, run the coroutines that close the client connections.
    :param application: the aiohttp application being shut down
    """
    coroutines = [pc.close() for pc in pcs]
    await asyncio.gather(*coroutines)
    pcs.clear()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="WebRTC based server for Reflector")
    parser.add_argument(
        "--host", default="0.0.0.0", help="Server host IP (default: 0.0.0.0)"
    )
    parser.add_argument(
        "--port", type=int, default=1250, help="Server port (default: 1250)"
    )
    args = parser.parse_args()

    app = web.Application()
    cors = aiohttp_cors.setup(
        app,
        defaults={
            "*": aiohttp_cors.ResourceOptions(
                allow_credentials=True, expose_headers="*", allow_headers="*"
            )
        },
    )
    offer_resource = cors.add(app.router.add_resource("/offer"))
    cors.add(offer_resource.add_route("POST", offer))
    app.on_shutdown.append(on_shutdown)
    web.run_app(app, access_log=None, host=args.host, port=args.port)
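
# Example invocation (defaults shown):
#     python server.py --host 0.0.0.0 --port 1250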