mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-21 04:39:06 +00:00
organize imports
This commit is contained in:
0
__init__.py
Normal file
0
__init__.py
Normal file
@@ -1,30 +0,0 @@
|
|||||||
import json
|
|
||||||
|
|
||||||
# Read the JSON document of per-topic results; json.load parses directly
# from the file handle, so no intermediate string needs to be kept.
with open("meeting_titles_and_summaries.txt", "r") as f:
    outputs = json.load(f)

# Both outputs are opened in append mode inside one `with` so they are
# flushed and closed even if a topic lacks an expected key mid-loop
# (previously the handles were never closed at all).
with open("meeting_transcript.txt", "a") as transcript_file, \
        open("meeting_title_description.txt", "a") as title_description_file:
    for item in outputs["topics"]:
        # The raw transcript is appended as-is, with no separator added.
        transcript_file.write(item["transcript"])

        # Readable digest: one labelled section per field, followed by a
        # dashed rule separating consecutive topics.
        title_description_file.write("TITLE: \n")
        title_description_file.write(item["title"])
        title_description_file.write("\n")

        title_description_file.write("DESCRIPTION: \n")
        title_description_file.write(item["description"])
        title_description_file.write("\n")

        title_description_file.write("TRANSCRIPT: \n")
        title_description_file.write(item["transcript"])
        title_description_file.write("\n")

        title_description_file.write("---------------------------------------- \n\n")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
34
server.py
34
server.py
@@ -1,25 +1,23 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import datetime
|
import datetime
|
||||||
import os
|
|
||||||
import io
|
|
||||||
import numpy as np
|
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
import uuid
|
import uuid
|
||||||
import wave
|
import wave
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from faster_whisper import WhisperModel
|
|
||||||
import aiohttp_cors
|
import aiohttp_cors
|
||||||
import jax.numpy as jnp
|
|
||||||
import requests
|
import requests
|
||||||
from aiohttp import web
|
from aiohttp import web
|
||||||
from aiortc import MediaStreamTrack, RTCPeerConnection, RTCSessionDescription
|
from aiortc import MediaStreamTrack, RTCPeerConnection, RTCSessionDescription
|
||||||
from aiortc.contrib.media import MediaRelay
|
from aiortc.contrib.media import MediaRelay
|
||||||
from av import AudioFifo
|
from av import AudioFifo
|
||||||
|
from faster_whisper import WhisperModel
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from whisper_jax import FlaxWhisperPipline
|
|
||||||
from utils.run_utils import run_in_executor
|
|
||||||
from sortedcontainers import SortedDict
|
from sortedcontainers import SortedDict
|
||||||
|
|
||||||
|
from utils.run_utils import run_in_executor
|
||||||
|
|
||||||
pcs = set()
|
pcs = set()
|
||||||
relay = MediaRelay()
|
relay = MediaRelay()
|
||||||
data_channel = None
|
data_channel = None
|
||||||
@@ -45,7 +43,7 @@ blacklisted_messages = [" Thank you.", " See you next time!",
|
|||||||
|
|
||||||
|
|
||||||
def get_title_and_summary(llm_input_text, last_timestamp):
|
def get_title_and_summary(llm_input_text, last_timestamp):
|
||||||
print("Generating title and summary")
|
("Generating title and summary")
|
||||||
# output = llm.generate(prompt)
|
# output = llm.generate(prompt)
|
||||||
|
|
||||||
# Use monadical-ml to fire this query to an LLM and get result
|
# Use monadical-ml to fire this query to an LLM and get result
|
||||||
@@ -69,13 +67,13 @@ def get_title_and_summary(llm_input_text, last_timestamp):
|
|||||||
"prompt": prompt
|
"prompt": prompt
|
||||||
}
|
}
|
||||||
|
|
||||||
# To-do: Handle unexpected output formats from the model
|
# TODO : Handle unexpected output formats from the model
|
||||||
try:
|
try:
|
||||||
response = requests.post(LLM_URL, headers=headers, json=data)
|
response = requests.post(LLM_URL, headers=headers, json=data)
|
||||||
output = json.loads(response.json()["results"][0]["text"])
|
output = json.loads(response.json()["results"][0]["text"])
|
||||||
output["description"] = output.pop("summary")
|
output["description"] = output.pop("summary")
|
||||||
output["transcript"] = llm_input_text
|
output["transcript"] = llm_input_text
|
||||||
output["timestamp"] =\
|
output["timestamp"] = \
|
||||||
str(datetime.timedelta(seconds=round(last_timestamp)))
|
str(datetime.timedelta(seconds=round(last_timestamp)))
|
||||||
incremental_responses.append(output)
|
incremental_responses.append(output)
|
||||||
result = {
|
result = {
|
||||||
@@ -84,13 +82,13 @@ def get_title_and_summary(llm_input_text, last_timestamp):
|
|||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Exception" + str(e))
|
logger.info("Exception" + str(e))
|
||||||
result = None
|
result = None
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def channel_log(channel, t, message):
|
def channel_log(channel, t, message):
|
||||||
print("channel(%s) %s %s" % (channel.label, t, message))
|
logger.info("channel(%s) %s %s" % (channel.label, t, message))
|
||||||
|
|
||||||
|
|
||||||
def channel_send(channel, message):
|
def channel_send(channel, message):
|
||||||
@@ -120,17 +118,18 @@ def channel_send_transcript(channel):
|
|||||||
if len(sorted_transcripts) >= 3:
|
if len(sorted_transcripts) >= 3:
|
||||||
del sorted_transcripts[least_time]
|
del sorted_transcripts[least_time]
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Exception", str(e))
|
logger.info("Exception", str(e))
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def get_transcription(frames):
|
def get_transcription(frames):
|
||||||
print("Transcribing..")
|
logger.info("Transcribing..")
|
||||||
sorted_transcripts[frames[0].time] = None
|
sorted_transcripts[frames[0].time] = None
|
||||||
|
|
||||||
|
# TODO:
|
||||||
# Passing IO objects instead of temporary files throws an error
|
# Passing IO objects instead of temporary files throws an error
|
||||||
# Passing ndarrays (typecasted with float) does not give any
|
# Passing ndarrays (typecasted with float) does not give any
|
||||||
# transcription. Refer issue
|
# transcription. Refer issue,
|
||||||
# https://github.com/guillaumekln/faster-whisper/issues/369
|
# https://github.com/guillaumekln/faster-whisper/issues/369
|
||||||
audiofilename = "test" + str(datetime.datetime.now())
|
audiofilename = "test" + str(datetime.datetime.now())
|
||||||
wf = wave.open(audiofilename, "wb")
|
wf = wave.open(audiofilename, "wb")
|
||||||
@@ -170,7 +169,7 @@ def get_transcription(frames):
|
|||||||
transcription_text += result_text
|
transcription_text += result_text
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Exception" + str(e))
|
logger.info("Exception" + str(e))
|
||||||
pass
|
pass
|
||||||
|
|
||||||
result = {
|
result = {
|
||||||
@@ -195,7 +194,7 @@ def get_final_summary_response():
|
|||||||
"summary": final_summary
|
"summary": final_summary
|
||||||
}
|
}
|
||||||
|
|
||||||
with open("meeting_titles_and_summaries.txt", "a") as f:
|
with open("./artefacts/meeting_titles_and_summaries.txt", "a") as f:
|
||||||
f.write(json.dumps(incremental_responses))
|
f.write(json.dumps(incremental_responses))
|
||||||
return response
|
return response
|
||||||
|
|
||||||
@@ -275,7 +274,6 @@ async def offer(request):
|
|||||||
if isinstance(message, str) and message.startswith("ping"):
|
if isinstance(message, str) and message.startswith("ping"):
|
||||||
channel_send(channel, "pong" + message[4:])
|
channel_send(channel, "pong" + message[4:])
|
||||||
|
|
||||||
|
|
||||||
@pc.on("connectionstatechange")
|
@pc.on("connectionstatechange")
|
||||||
async def on_connectionstatechange():
|
async def on_connectionstatechange():
|
||||||
log_info("Connection state is " + pc.connectionState)
|
log_info("Connection state is " + pc.connectionState)
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ class StreamClient:
|
|||||||
self.channel_log(channel, "<", message)
|
self.channel_log(channel, "<", message)
|
||||||
|
|
||||||
if isinstance(message, str) and message.startswith("pong"):
|
if isinstance(message, str) and message.startswith("pong"):
|
||||||
elapsed_ms = (self.current_stamp() - int(message[5:]))\
|
elapsed_ms = (self.current_stamp() - int(message[5:])) \
|
||||||
/ 1000
|
/ 1000
|
||||||
print(" RTT %.2f ms" % elapsed_ms)
|
print(" RTT %.2f ms" % elapsed_ms)
|
||||||
|
|
||||||
|
|||||||
@@ -1,24 +1,24 @@
|
|||||||
[DEFAULT]
|
[DEFAULT]
|
||||||
# Set exception rule for OpenMP error to allow duplicate lib initialization
|
# Set exception rule for OpenMP error to allow duplicate lib initialization
|
||||||
KMP_DUPLICATE_LIB_OK=TRUE
|
KMP_DUPLICATE_LIB_OK = TRUE
|
||||||
# Export OpenAI API key
|
# Export OpenAI API key
|
||||||
OPENAI_APIKEY=
|
OPENAI_APIKEY =
|
||||||
# Export Whisper model size
|
# Export Whisper model size
|
||||||
WHISPER_MODEL_SIZE=tiny
|
WHISPER_MODEL_SIZE = tiny
|
||||||
WHISPER_REAL_TIME_MODEL_SIZE=tiny
|
WHISPER_REAL_TIME_MODEL_SIZE = tiny
|
||||||
# AWS config
|
# AWS config
|
||||||
AWS_ACCESS_KEY=***REMOVED***
|
AWS_ACCESS_KEY = ***REMOVED***
|
||||||
AWS_SECRET_KEY=***REMOVED***
|
AWS_SECRET_KEY = ***REMOVED***
|
||||||
BUCKET_NAME=reflector-bucket
|
BUCKET_NAME = reflector-bucket
|
||||||
# Summarizer config
|
# Summarizer config
|
||||||
SUMMARY_MODEL=facebook/bart-large-cnn
|
SUMMARY_MODEL = facebook/bart-large-cnn
|
||||||
INPUT_ENCODING_MAX_LENGTH=1024
|
INPUT_ENCODING_MAX_LENGTH = 1024
|
||||||
MAX_LENGTH=2048
|
MAX_LENGTH = 2048
|
||||||
BEAM_SIZE=6
|
BEAM_SIZE = 6
|
||||||
MAX_CHUNK_LENGTH=1024
|
MAX_CHUNK_LENGTH = 1024
|
||||||
SUMMARIZE_USING_CHUNKS=YES
|
SUMMARIZE_USING_CHUNKS = YES
|
||||||
# Audio device
|
# Audio device
|
||||||
BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME=aggregator
|
BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME = aggregator
|
||||||
AV_FOUNDATION_DEVICE_ID=1
|
AV_FOUNDATION_DEVICE_ID = 1
|
||||||
# LLM PATH
|
# LLM PATH
|
||||||
LLM_PATH=
|
LLM_PATH =
|
||||||
|
|||||||
32
utils/format_output.py
Normal file
32
utils/format_output.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
import json
|
||||||
|
|
||||||
|
# Read the JSON document of per-topic results; json.load parses directly
# from the file handle, so no intermediate string needs to be kept.
with open("../artefacts/meeting_titles_and_summaries.txt", "r") as f:
    outputs = json.load(f)

# All three append-mode outputs share one `with` so each is flushed and
# closed even if a topic lacks an expected key mid-loop (the explicit
# close() calls only ran when the whole loop succeeded).
with open("../artefacts/meeting_transcript.txt", "a") as transcript_file, \
        open("../artefacts/meeting_title_description.txt", "a") as title_desc_file, \
        open("../artefacts/meeting_summary.txt", "a") as summary_file:
    for item in outputs["topics"]:
        # Raw transcript and description are appended as-is to their own
        # files, with no separator added between topics.
        transcript_file.write(item["transcript"])
        summary_file.write(item["description"])

        # Readable digest: one labelled section per field, followed by a
        # dashed rule separating consecutive topics.
        title_desc_file.write("TITLE: \n")
        title_desc_file.write(item["title"])
        title_desc_file.write("\n")

        title_desc_file.write("DESCRIPTION: \n")
        title_desc_file.write(item["description"])
        title_desc_file.write("\n")

        title_desc_file.write("TRANSCRIPT: \n")
        title_desc_file.write(item["transcript"])
        title_desc_file.write("\n")

        title_desc_file.write("---------------------------------------- \n\n")
|
||||||
@@ -6,8 +6,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
|
|||||||
from sklearn.metrics.pairwise import cosine_similarity
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
from transformers import BartForConditionalGeneration, BartTokenizer
|
from transformers import BartForConditionalGeneration, BartTokenizer
|
||||||
|
|
||||||
from utils.log_utils import logger
|
from log_utils import logger
|
||||||
from utils.run_utils import config
|
from run_utils import config
|
||||||
|
|
||||||
nltk.download('punkt', quiet=True)
|
nltk.download('punkt', quiet=True)
|
||||||
|
|
||||||
@@ -171,7 +171,7 @@ def summarize(transcript_text, timestamp,
|
|||||||
output_filename = "real_time_" + output_filename
|
output_filename = "real_time_" + output_filename
|
||||||
|
|
||||||
if summarize_using_chunks != "YES":
|
if summarize_using_chunks != "YES":
|
||||||
inputs = tokenizer.\
|
inputs = tokenizer. \
|
||||||
batch_encode_plus([transcript_text], truncation=True,
|
batch_encode_plus([transcript_text], truncation=True,
|
||||||
padding='longest',
|
padding='longest',
|
||||||
max_length=int(config["DEFAULT"]["INPUT_ENCODING_MAX_LENGTH"]),
|
max_length=int(config["DEFAULT"]["INPUT_ENCODING_MAX_LENGTH"]),
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ from wordcloud import STOPWORDS, WordCloud
|
|||||||
en = spacy.load('en_core_web_md')
|
en = spacy.load('en_core_web_md')
|
||||||
spacy_stopwords = en.Defaults.stop_words
|
spacy_stopwords = en.Defaults.stop_words
|
||||||
|
|
||||||
STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))).\
|
STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))). \
|
||||||
union(set(spacy_stopwords))
|
union(set(spacy_stopwords))
|
||||||
|
|
||||||
|
|
||||||
@@ -24,7 +24,7 @@ def create_wordcloud(timestamp, real_time=False):
|
|||||||
"""
|
"""
|
||||||
filename = "transcript"
|
filename = "transcript"
|
||||||
if real_time:
|
if real_time:
|
||||||
filename = "real_time_" + filename + "_" +\
|
filename = "real_time_" + filename + "_" + \
|
||||||
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
|
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
|
||||||
else:
|
else:
|
||||||
filename += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
|
filename += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
|
||||||
@@ -47,7 +47,7 @@ def create_wordcloud(timestamp, real_time=False):
|
|||||||
|
|
||||||
wordcloud_name = "wordcloud"
|
wordcloud_name = "wordcloud"
|
||||||
if real_time:
|
if real_time:
|
||||||
wordcloud_name = "real_time_" + wordcloud_name + "_" +\
|
wordcloud_name = "real_time_" + wordcloud_name + "_" + \
|
||||||
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
|
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
|
||||||
else:
|
else:
|
||||||
wordcloud_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
|
wordcloud_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
|
||||||
@@ -57,12 +57,12 @@ def create_wordcloud(timestamp, real_time=False):
|
|||||||
|
|
||||||
def create_talk_diff_scatter_viz(timestamp, real_time=False):
|
def create_talk_diff_scatter_viz(timestamp, real_time=False):
|
||||||
"""
|
"""
|
||||||
Perform agenda vs transription diff to see covered topics.
|
Perform agenda vs transcription diff to see covered topics.
|
||||||
Create a scatter plot of words in topics.
|
Create a scatter plot of words in topics.
|
||||||
:return: None. Saved locally.
|
:return: None. Saved locally.
|
||||||
"""
|
"""
|
||||||
spaCy_model = "en_core_web_md"
|
spacy_model = "en_core_web_md"
|
||||||
nlp = spacy.load(spaCy_model)
|
nlp = spacy.load(spacy_model)
|
||||||
nlp.add_pipe('sentencizer')
|
nlp.add_pipe('sentencizer')
|
||||||
|
|
||||||
agenda_topics = []
|
agenda_topics = []
|
||||||
@@ -75,12 +75,11 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
|
|||||||
agenda_topics.append(line.split(":")[0])
|
agenda_topics.append(line.split(":")[0])
|
||||||
|
|
||||||
# Load the transcription with timestamp
|
# Load the transcription with timestamp
|
||||||
filename = ""
|
|
||||||
if real_time:
|
if real_time:
|
||||||
filename = "./artefacts/real_time_transcript_with_timestamp_" +\
|
filename = "./artefacts/real_time_transcript_with_timestamp_" + \
|
||||||
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
|
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
|
||||||
else:
|
else:
|
||||||
filename = "./artefacts/transcript_with_timestamp_" +\
|
filename = "./artefacts/transcript_with_timestamp_" + \
|
||||||
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
|
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
|
||||||
with open(filename) as f:
|
with open(filename) as f:
|
||||||
transcription_timestamp_text = f.read()
|
transcription_timestamp_text = f.read()
|
||||||
@@ -142,7 +141,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
|
|||||||
|
|
||||||
df = df.apply(create_new_columns, axis=1)
|
df = df.apply(create_new_columns, axis=1)
|
||||||
|
|
||||||
# Count the number of items covered and calculatre the percentage
|
# Count the number of items covered and calculate the percentage
|
||||||
num_covered_items = sum(covered_items.values())
|
num_covered_items = sum(covered_items.values())
|
||||||
percentage_covered = num_covered_items / len(agenda) * 100
|
percentage_covered = num_covered_items / len(agenda) * 100
|
||||||
|
|
||||||
@@ -158,7 +157,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
|
|||||||
# Save df, mappings for further experimentation
|
# Save df, mappings for further experimentation
|
||||||
df_name = "df"
|
df_name = "df"
|
||||||
if real_time:
|
if real_time:
|
||||||
df_name = "real_time_" + df_name + "_" +\
|
df_name = "real_time_" + df_name + "_" + \
|
||||||
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
|
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
|
||||||
else:
|
else:
|
||||||
df_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
|
df_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
|
||||||
@@ -169,7 +168,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
|
|||||||
|
|
||||||
mappings_name = "mappings"
|
mappings_name = "mappings"
|
||||||
if real_time:
|
if real_time:
|
||||||
mappings_name = "real_time_" + mappings_name + "_" +\
|
mappings_name = "real_time_" + mappings_name + "_" + \
|
||||||
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
|
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
|
||||||
else:
|
else:
|
||||||
mappings_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
|
mappings_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
|
||||||
|
|||||||
Reference in New Issue
Block a user