mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-20 20:29:06 +00:00
refactor
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -160,9 +160,6 @@ cython_debug/
|
||||
#.idea/
|
||||
|
||||
*.mp4
|
||||
summary.txt
|
||||
transcript.txt
|
||||
transcript_timestamps.txt
|
||||
*.html
|
||||
*.pkl
|
||||
transcript_*.txt
|
||||
@@ -176,3 +173,4 @@ test_samples/
|
||||
.DS_Store/
|
||||
.DS_Store
|
||||
.vscode/
|
||||
artefacts/
|
||||
|
||||
22
config.ini
22
config.ini
@@ -1,22 +0,0 @@
|
||||
[DEFAULT]
|
||||
# Set exception rule for OpenMP error to allow duplicate lib initialization
|
||||
KMP_DUPLICATE_LIB_OK = TRUE
|
||||
# Export OpenAI API Key
|
||||
OPENAI_APIKEY =
|
||||
# Export Whisper Model Size
|
||||
WHISPER_MODEL_SIZE = tiny
|
||||
WHISPER_REAL_TIME_MODEL_SIZE = tiny
|
||||
# AWS config
|
||||
AWS_ACCESS_KEY = ***REMOVED***
|
||||
AWS_SECRET_KEY = ***REMOVED***
|
||||
BUCKET_NAME = 'reflector-bucket'
|
||||
# Summarizer config
|
||||
SUMMARY_MODEL = facebook/bart-large-cnn
|
||||
INPUT_ENCODING_MAX_LENGTH = 1024
|
||||
MAX_LENGTH = 2048
|
||||
BEAM_SIZE = 6
|
||||
MAX_CHUNK_LENGTH = 1024
|
||||
SUMMARIZE_USING_CHUNKS = YES
|
||||
# Audio device
|
||||
BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME = aggregator
|
||||
AV_FOUNDATION_DEVICE_ID = 2
|
||||
@@ -56,5 +56,4 @@ cached_property==1.5.2
|
||||
stamina==23.1.0
|
||||
httpx==0.24.1
|
||||
sortedcontainers==2.4.0
|
||||
openai-whisper@ git+https://github.com/openai/whisper.git@248b6cb124225dd263bb9bd32d060b6517e067f8
|
||||
https://github.com/yt-dlp/yt-dlp/archive/master.tar.gz
|
||||
|
||||
@@ -1,15 +1,24 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Directory to search for generated artefact files
|
||||
directory="."
|
||||
cwd=$(pwd)
|
||||
last_component="${cwd##*/}"
|
||||
|
||||
if [ "$last_component" = "reflector" ]; then
|
||||
directory="./artefacts"
|
||||
elif [ "$last_component" = "scripts" ]; then
|
||||
directory="../artefacts"
|
||||
fi
|
||||
|
||||
# Patterns matching the generated artefact files to delete (transcripts, summaries, pickles, HTML, word-cloud PNGs)
|
||||
text_file_pattern="transcript_*.txt"
|
||||
transcript_file_pattern="transcript_*.txt"
|
||||
summary_file_pattern="summary_*.txt"
|
||||
pickle_file_pattern="*.pkl"
|
||||
html_file_pattern="*.html"
|
||||
png_file_pattern="wordcloud*.png"
|
||||
|
||||
find "$directory" -type f -name "$text_file_pattern" -delete
|
||||
find "$directory" -type f -name "$transcript_file_pattern" -delete
|
||||
find "$directory" -type f -name "$summary_file_pattern" -delete
|
||||
find "$directory" -type f -name "$pickle_file_pattern" -delete
|
||||
find "$directory" -type f -name "$html_file_pattern" -delete
|
||||
find "$directory" -type f -name "$png_file_pattern" -delete
|
||||
|
||||
@@ -65,6 +65,7 @@ def get_transcription():
|
||||
transcribe = True
|
||||
|
||||
if transcribe:
|
||||
print("Transcribing..")
|
||||
try:
|
||||
sorted_message_queue[frames[0].time] = None
|
||||
out_file = io.BytesIO()
|
||||
@@ -113,7 +114,7 @@ def start_messaging_thread():
|
||||
|
||||
def start_transcription_thread(max_threads: int):
|
||||
for i in range(max_threads):
|
||||
t_thread = threading.Thread(target=get_transcription, args=(i,))
|
||||
t_thread = threading.Thread(target=get_transcription)
|
||||
t_thread.start()
|
||||
|
||||
|
||||
@@ -128,7 +129,7 @@ async def offer(request: requests.Request):
|
||||
def log_info(msg: str, *args):
|
||||
logger.info(pc_id + " " + msg, *args)
|
||||
|
||||
log_info("Created for %s", request.remote)
|
||||
log_info("Created for " + request.remote)
|
||||
|
||||
@pc.on("datachannel")
|
||||
def on_datachannel(channel):
|
||||
@@ -146,14 +147,14 @@ async def offer(request: requests.Request):
|
||||
|
||||
@pc.on("connectionstatechange")
|
||||
async def on_connectionstatechange():
|
||||
log_info("Connection state is %s", pc.connectionState)
|
||||
log_info("Connection state is " + pc.connectionState)
|
||||
if pc.connectionState == "failed":
|
||||
await pc.close()
|
||||
pcs.discard(pc)
|
||||
|
||||
@pc.on("track")
|
||||
def on_track(track):
|
||||
log_info("Track %s received", track.kind)
|
||||
log_info("Track " + track.kind + " received")
|
||||
pc.addTrack(AudioStreamTrack(relay.subscribe(track)))
|
||||
|
||||
# handle offer
|
||||
|
||||
@@ -3,8 +3,8 @@ import sys
|
||||
import boto3
|
||||
import botocore
|
||||
|
||||
from log_utils import logger
|
||||
from run_utils import config
|
||||
from .log_utils import logger
|
||||
from .run_utils import config
|
||||
|
||||
BUCKET_NAME = config["DEFAULT"]["BUCKET_NAME"]
|
||||
|
||||
|
||||
@@ -6,18 +6,18 @@ from threading import Lock
|
||||
from typing import ContextManager, Generic, TypeVar
|
||||
|
||||
|
||||
class ConfigParser:
|
||||
__config = configparser.ConfigParser()
|
||||
|
||||
def __init__(self, config_file='../config.ini'):
|
||||
self.__config.read(config_file)
|
||||
class ReflectorConfig:
|
||||
__config = None
|
||||
|
||||
@staticmethod
|
||||
def get_config():
|
||||
return ConfigParser.__config
|
||||
if ReflectorConfig.__config is None:
|
||||
ReflectorConfig.__config = configparser.ConfigParser()
|
||||
ReflectorConfig.__config.read('utils/config.ini')
|
||||
return ReflectorConfig.__config
|
||||
|
||||
|
||||
config = ConfigParser.get_config()
|
||||
config = ReflectorConfig.get_config()
|
||||
|
||||
|
||||
def run_in_executor(func, *args, executor=None, **kwargs):
|
||||
|
||||
0
utils/test.py
Normal file
0
utils/test.py
Normal file
@@ -6,8 +6,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from transformers import BartForConditionalGeneration, BartTokenizer
|
||||
|
||||
from log_utils import logger
|
||||
from run_utils import config
|
||||
from utils.log_utils import logger
|
||||
from utils.run_utils import config
|
||||
|
||||
nltk.download('punkt', quiet=True)
|
||||
|
||||
@@ -186,7 +186,7 @@ def summarize(transcript_text, timestamp,
|
||||
decoded_summaries = [tokenizer.decode(summary, skip_special_tokens=True, clean_up_tokenization_spaces=False)
|
||||
for summary in summaries]
|
||||
summary = " ".join(decoded_summaries)
|
||||
with open(output_filename, 'w') as f:
|
||||
with open("./artefacts/" + output_filename, 'w') as f:
|
||||
f.write(summary.strip() + "\n")
|
||||
else:
|
||||
logger.info("Breaking transcript into smaller chunks")
|
||||
|
||||
@@ -52,7 +52,7 @@ def create_wordcloud(timestamp, real_time=False):
|
||||
else:
|
||||
wordcloud_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
|
||||
|
||||
plt.savefig(wordcloud_name)
|
||||
plt.savefig("./artefacts/" + wordcloud_name)
|
||||
|
||||
|
||||
def create_talk_diff_scatter_viz(timestamp, real_time=False):
|
||||
@@ -77,10 +77,10 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
|
||||
# Load the transcription with timestamp
|
||||
filename = ""
|
||||
if real_time:
|
||||
filename = "real_time_transcript_with_timestamp_" +\
|
||||
filename = "./artefacts/real_time_transcript_with_timestamp_" +\
|
||||
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
|
||||
else:
|
||||
filename = "transcript_with_timestamp_" +\
|
||||
filename = "./artefacts/transcript_with_timestamp_" +\
|
||||
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
|
||||
with open(filename) as f:
|
||||
transcription_timestamp_text = f.read()
|
||||
@@ -162,7 +162,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
|
||||
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
|
||||
else:
|
||||
df_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
|
||||
df.to_pickle(df_name)
|
||||
df.to_pickle("./artefacts/" + df_name)
|
||||
|
||||
my_mappings = [ts_to_topic_mapping_top_1, ts_to_topic_mapping_top_2,
|
||||
topic_to_ts_mapping_top_1, topic_to_ts_mapping_top_2]
|
||||
@@ -173,7 +173,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
|
||||
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
|
||||
else:
|
||||
mappings_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
|
||||
pickle.dump(my_mappings, open(mappings_name, "wb"))
|
||||
pickle.dump(my_mappings, open("./artefacts/" + mappings_name, "wb"))
|
||||
|
||||
# to load, my_mappings = pickle.load( open ("mappings.pkl", "rb") )
|
||||
|
||||
@@ -187,27 +187,28 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
|
||||
|
||||
topic_times = sorted(topic_times.items(), key=lambda x: x[1], reverse=True)
|
||||
|
||||
cat_1 = topic_times[0][0]
|
||||
cat_1_name = topic_times[0][0]
|
||||
cat_2_name = topic_times[1][0]
|
||||
if len(topic_times) > 1:
|
||||
cat_1 = topic_times[0][0]
|
||||
cat_1_name = topic_times[0][0]
|
||||
cat_2_name = topic_times[1][0]
|
||||
|
||||
# Scatter plot of topics
|
||||
df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
|
||||
corpus = st.CorpusFromParsedDocuments(
|
||||
df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'
|
||||
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
|
||||
html = st.produce_scattertext_explorer(
|
||||
corpus,
|
||||
category=cat_1,
|
||||
category_name=cat_1_name,
|
||||
not_category_name=cat_2_name,
|
||||
minimum_term_frequency=0, pmi_threshold_coefficient=0,
|
||||
width_in_pixels=1000,
|
||||
transform=st.Scalers.dense_rank
|
||||
)
|
||||
if real_time:
|
||||
open('./artefacts/real_time_scatter_' +
|
||||
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
|
||||
else:
|
||||
open('./artefacts/scatter_' +
|
||||
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
|
||||
# Scatter plot of topics
|
||||
df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
|
||||
corpus = st.CorpusFromParsedDocuments(
|
||||
df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'
|
||||
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
|
||||
html = st.produce_scattertext_explorer(
|
||||
corpus,
|
||||
category=cat_1,
|
||||
category_name=cat_1_name,
|
||||
not_category_name=cat_2_name,
|
||||
minimum_term_frequency=0, pmi_threshold_coefficient=0,
|
||||
width_in_pixels=1000,
|
||||
transform=st.Scalers.dense_rank
|
||||
)
|
||||
if real_time:
|
||||
open('./artefacts/real_time_scatter_' +
|
||||
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
|
||||
else:
|
||||
open('./artefacts/scatter_' +
|
||||
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
|
||||
|
||||
15
whisjax.py
15
whisjax.py
@@ -127,7 +127,7 @@ def main():
|
||||
audio_filename = media_file
|
||||
|
||||
logger.info("Finished extracting audio")
|
||||
|
||||
logger.info("Transcribing")
|
||||
# Convert the audio to text using the OpenAI Whisper model
|
||||
pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE,
|
||||
dtype=jnp.float16,
|
||||
@@ -157,13 +157,14 @@ def main():
|
||||
create_talk_diff_scatter_viz(NOW)
|
||||
|
||||
# S3 : Push artefacts to S3 bucket
|
||||
prefix = "./artefacts/"
|
||||
suffix = NOW.strftime("%m-%d-%Y_%H:%M:%S")
|
||||
files_to_upload = ["transcript_" + suffix + ".txt",
|
||||
"transcript_with_timestamp_" + suffix + ".txt",
|
||||
"df_" + suffix + ".pkl",
|
||||
"wordcloud_" + suffix + ".png",
|
||||
"mappings_" + suffix + ".pkl",
|
||||
"scatter_" + suffix + ".html"]
|
||||
files_to_upload = [prefix + "transcript_" + suffix + ".txt",
|
||||
prefix + "transcript_with_timestamp_" + suffix + ".txt",
|
||||
prefix + "df_" + suffix + ".pkl",
|
||||
prefix + "wordcloud_" + suffix + ".png",
|
||||
prefix + "mappings_" + suffix + ".pkl",
|
||||
prefix + "scatter_" + suffix + ".html"]
|
||||
upload_files(files_to_upload)
|
||||
|
||||
summarize(transcript_text, NOW, False, False)
|
||||
|
||||
Reference in New Issue
Block a user