mirror of https://github.com/Monadical-SAS/reflector.git
synced 2025-12-21 04:39:06 +00:00
refactor
.gitignore  (4 changes, vendored)
@@ -160,9 +160,6 @@ cython_debug/
 #.idea/
 
 *.mp4
-summary.txt
-transcript.txt
-transcript_timestamps.txt
 *.html
 *.pkl
 transcript_*.txt
@@ -176,3 +173,4 @@ test_samples/
 .DS_Store/
 .DS_Store
 .vscode/
+artefacts/
config.ini  (22 changes, file deleted)
@@ -1,22 +0,0 @@
-[DEFAULT]
-# Set exception rule for OpenMP error to allow duplicate lib initialization
-KMP_DUPLICATE_LIB_OK = TRUE
-# Export OpenAI API Key
-OPENAI_APIKEY =
-# Export Whisper Model Size
-WHISPER_MODEL_SIZE = tiny
-WHISPER_REAL_TIME_MODEL_SIZE = tiny
-# AWS config
-AWS_ACCESS_KEY = ***REMOVED***
-AWS_SECRET_KEY = ***REMOVED***
-BUCKET_NAME = 'reflector-bucket'
-# Summarizer config
-SUMMARY_MODEL = facebook/bart-large-cnn
-INPUT_ENCODING_MAX_LENGTH = 1024
-MAX_LENGTH = 2048
-BEAM_SIZE = 6
-MAX_CHUNK_LENGTH = 1024
-SUMMARIZE_USING_CHUNKS = YES
-# Audio device
-BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME = aggregator
-AV_FOUNDATION_DEVICE_ID = 2
[file header missing in mirrored diff]
@@ -56,5 +56,4 @@ cached_property==1.5.2
 stamina==23.1.0
 httpx==0.24.1
 sortedcontainers==2.4.0
-openai-whisper@ git+https://github.com/openai/whisper.git@248b6cb124225dd263bb9bd32d060b6517e067f8
 https://github.com/yt-dlp/yt-dlp/archive/master.tar.gz
[file header missing in mirrored diff]
@@ -1,15 +1,24 @@
 #!/bin/bash
 
 # Directory to search for Python files
-directory="."
+cwd=$(pwd)
+last_component="${cwd##*/}"
+
+if [ "$last_component" = "reflector" ]; then
+    directory="./artefacts"
+elif [ "$last_component" = "scripts" ]; then
+    directory="../artefacts"
+fi
 
 # Pattern to match Python files (e.g., "*.py" for all .py files)
-text_file_pattern="transcript_*.txt"
+transcript_file_pattern="transcript_*.txt"
+summary_file_pattern="summary_*.txt"
 pickle_file_pattern="*.pkl"
 html_file_pattern="*.html"
 png_file_pattern="wordcloud*.png"
 
-find "$directory" -type f -name "$text_file_pattern" -delete
+find "$directory" -type f -name "$transcript_file_pattern" -delete
+find "$directory" -type f -name "$summary_file_pattern" -delete
 find "$directory" -type f -name "$pickle_file_pattern" -delete
 find "$directory" -type f -name "$html_file_pattern" -delete
 find "$directory" -type f -name "$png_file_pattern" -delete
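In the rewritten script, ${cwd##*/} strips everything up to the last "/", leaving only the final directory name. Note that the old default directory="." is gone: launched from any directory not named reflector or scripts, $directory stays unset and the find calls receive an empty string. For comparison, a rough Python sketch of the same resolution logic (directory names taken from the script; the fallback branch is an assumption the script itself no longer has):

    from pathlib import Path

    # Resolve the artefacts directory from the current directory's name,
    # mirroring the bash logic above.
    cwd = Path.cwd()
    if cwd.name == "reflector":        # repo root
        directory = Path("artefacts")
    elif cwd.name == "scripts":        # scripts/ subdirectory
        directory = Path("..") / "artefacts"
    else:
        directory = Path(".")          # assumed fallback; the script has none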
[file header missing in mirrored diff]
@@ -65,6 +65,7 @@ def get_transcription():
     transcribe = True
 
     if transcribe:
+        print("Transcribing..")
         try:
             sorted_message_queue[frames[0].time] = None
             out_file = io.BytesIO()
@@ -113,7 +114,7 @@ def start_messaging_thread():
 
 def start_transcription_thread(max_threads: int):
     for i in range(max_threads):
-        t_thread = threading.Thread(target=get_transcription, args=(i,))
+        t_thread = threading.Thread(target=get_transcription)
         t_thread.start()
 
 
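Dropping args=(i,) matches a get_transcription that no longer takes a worker index: threading.Thread forwards args positionally to target, so the tuple and the callable's signature must agree. A minimal sketch of that contract:

    import threading

    def get_transcription():  # zero-argument target, as after this commit
        pass

    threading.Thread(target=get_transcription).start()  # OK
    # threading.Thread(target=get_transcription, args=(0,)).start()
    # would raise TypeError when the thread runs: unexpected positional argument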
@@ -128,7 +129,7 @@ async def offer(request: requests.Request):
     def log_info(msg: str, *args):
         logger.info(pc_id + " " + msg, *args)
 
-    log_info("Created for %s", request.remote)
+    log_info("Created for " + request.remote)
 
     @pc.on("datachannel")
     def on_datachannel(channel):
@@ -146,14 +147,14 @@ async def offer(request: requests.Request):
 
     @pc.on("connectionstatechange")
     async def on_connectionstatechange():
-        log_info("Connection state is %s", pc.connectionState)
+        log_info("Connection state is " + pc.connectionState)
         if pc.connectionState == "failed":
             await pc.close()
             pcs.discard(pc)
 
     @pc.on("track")
     def on_track(track):
-        log_info("Track %s received", track.kind)
+        log_info("Track " + track.kind + " received")
         pc.addTrack(AudioStreamTrack(relay.subscribe(track)))
 
     # handle offer
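These hunks replace %s-style logging arguments with string concatenation. The output is the same here, but the semantics differ: logger.info("... %s", value) defers formatting until the record is actually emitted, while concatenation builds the string eagerly on every call (and raises TypeError if the value is not a string). A small sketch of the two forms with the standard logging module:

    import logging

    logger = logging.getLogger("reflector")
    state = "connected"

    logger.info("Connection state is %s", state)  # lazy: formatted only if emitted
    logger.info("Connection state is " + state)   # eager: built on every call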
[file header missing in mirrored diff]
@@ -3,8 +3,8 @@ import sys
 import boto3
 import botocore
 
-from log_utils import logger
-from run_utils import config
+from .log_utils import logger
+from .run_utils import config
 
 BUCKET_NAME = config["DEFAULT"]["BUCKET_NAME"]
 
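The leading dot makes these package-relative imports: .log_utils is resolved against the package containing this module rather than against sys.path. That only works when the file is imported as part of a package (presumably utils, given the other hunks in this commit) and fails when the file is run as a script. A minimal sketch of the implied layout; the storage.py name is a placeholder, not the repo's actual file name:

    # Implied layout (assumption based on the import changes):
    #   utils/
    #     __init__.py
    #     log_utils.py
    #     run_utils.py
    #     storage.py      <- hypothetical name for this boto3 module
    #
    # Inside utils/storage.py:
    from .log_utils import logger   # resolves to utils.log_utils
    from .run_utils import config   # resolves to utils.run_utils
    # Running "python utils/storage.py" directly would raise:
    # ImportError: attempted relative import with no known parent package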
[file header missing in mirrored diff]
@@ -6,18 +6,18 @@ from threading import Lock
 from typing import ContextManager, Generic, TypeVar
 
 
-class ConfigParser:
-    __config = configparser.ConfigParser()
+class ReflectorConfig:
+    __config = None
 
-    def __init__(self, config_file='../config.ini'):
-        self.__config.read(config_file)
-
     @staticmethod
     def get_config():
-        return ConfigParser.__config
+        if ReflectorConfig.__config is None:
+            ReflectorConfig.__config = configparser.ConfigParser()
+            ReflectorConfig.__config.read('utils/config.ini')
+        return ReflectorConfig.__config
 
 
-config = ConfigParser.get_config()
+config = ReflectorConfig.get_config()
 
 
 def run_in_executor(func, *args, executor=None, **kwargs):
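The renamed class is now a lazily initialized singleton: the configparser.ConfigParser is no longer built at class-definition time but on the first get_config() call, after which every importer shares the same parsed instance. The 'utils/config.ini' path also suggests the root config.ini deleted above was relocated under utils/; note that the hard-coded relative path is resolved against the process working directory, so it only loads when the program is started from the repository root. A usage sketch, with the WHISPER_MODEL_SIZE key taken from the config file shown earlier:

    from utils.run_utils import config  # triggers the lazy read on first use

    # configparser returns strings; .get() supplies a fallback if the key is absent
    model_size = config["DEFAULT"].get("WHISPER_MODEL_SIZE", "tiny")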
utils/test.py  (new empty file)
[file header missing in mirrored diff]
@@ -6,8 +6,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from transformers import BartForConditionalGeneration, BartTokenizer
 
-from log_utils import logger
-from run_utils import config
+from utils.log_utils import logger
+from utils.run_utils import config
 
 nltk.download('punkt', quiet=True)
 
@@ -186,7 +186,7 @@ def summarize(transcript_text, timestamp,
         decoded_summaries = [tokenizer.decode(summary, skip_special_tokens=True, clean_up_tokenization_spaces=False)
                              for summary in summaries]
         summary = " ".join(decoded_summaries)
-        with open(output_filename, 'w') as f:
+        with open("./artefacts/" + output_filename, 'w') as f:
             f.write(summary.strip() + "\n")
     else:
         logger.info("Breaking transcript into smaller chunks")
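This is the first of several hunks that prepend a hard-coded "./artefacts/" to an output path. If that convention holds, a tiny helper could centralize it; artefact_path below is not in the repo, just an illustration:

    import os

    ARTEFACTS_DIR = "./artefacts"

    def artefact_path(name: str) -> str:
        """Hypothetical helper: place an output file under the artefacts dir."""
        return os.path.join(ARTEFACTS_DIR, name)

    # e.g. open(artefact_path(output_filename), 'w') instead of
    #      open("./artefacts/" + output_filename, 'w')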
[file header missing in mirrored diff]
@@ -52,7 +52,7 @@ def create_wordcloud(timestamp, real_time=False):
     else:
         wordcloud_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
 
-    plt.savefig(wordcloud_name)
+    plt.savefig("./artefacts/" + wordcloud_name)
 
 
 def create_talk_diff_scatter_viz(timestamp, real_time=False):
@@ -77,10 +77,10 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
     # Load the transcription with timestamp
     filename = ""
     if real_time:
-        filename = "real_time_transcript_with_timestamp_" +\
+        filename = "./artefacts/real_time_transcript_with_timestamp_" +\
             timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
     else:
-        filename = "transcript_with_timestamp_" +\
+        filename = "./artefacts/transcript_with_timestamp_" +\
             timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
     with open(filename) as f:
         transcription_timestamp_text = f.read()
@@ -162,7 +162,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
             timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
     else:
         df_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
-    df.to_pickle(df_name)
+    df.to_pickle("./artefacts/" + df_name)
 
     my_mappings = [ts_to_topic_mapping_top_1, ts_to_topic_mapping_top_2,
                    topic_to_ts_mapping_top_1, topic_to_ts_mapping_top_2]
@@ -173,7 +173,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
             timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
     else:
         mappings_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
-    pickle.dump(my_mappings, open(mappings_name, "wb"))
+    pickle.dump(my_mappings, open("./artefacts/" + mappings_name, "wb"))
 
     # to load, my_mappings = pickle.load( open ("mappings.pkl", "rb") )
 
@@ -187,27 +187,28 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
 
     topic_times = sorted(topic_times.items(), key=lambda x: x[1], reverse=True)
 
-    cat_1 = topic_times[0][0]
-    cat_1_name = topic_times[0][0]
-    cat_2_name = topic_times[1][0]
+    if len(topic_times) > 1:
+        cat_1 = topic_times[0][0]
+        cat_1_name = topic_times[0][0]
+        cat_2_name = topic_times[1][0]
 
     # Scatter plot of topics
     df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
     corpus = st.CorpusFromParsedDocuments(
         df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'
     ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
     html = st.produce_scattertext_explorer(
         corpus,
         category=cat_1,
         category_name=cat_1_name,
         not_category_name=cat_2_name,
         minimum_term_frequency=0, pmi_threshold_coefficient=0,
         width_in_pixels=1000,
         transform=st.Scalers.dense_rank
     )
     if real_time:
         open('./artefacts/real_time_scatter_' +
              timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
     else:
         open('./artefacts/scatter_' +
              timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
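The new len(topic_times) > 1 guard avoids the old IndexError from topic_times[1][0] when only one topic is found, but cat_1, cat_1_name, and cat_2_name are then never bound, and the unconditional st.produce_scattertext_explorer(...) call below would raise NameError instead. A defensive sketch of one way to close that hole (an assumption about intent, not code from the repo):

    if len(topic_times) > 1:
        cat_1 = topic_times[0][0]
        cat_1_name = topic_times[0][0]
        cat_2_name = topic_times[1][0]
    else:
        # Fewer than two topics: a two-category scatter plot is impossible
        logger.info("Need at least two topics for the scatter plot; skipping")
        return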
whisjax.py  (15 changes)
@@ -127,7 +127,7 @@ def main():
         audio_filename = media_file
 
     logger.info("Finished extracting audio")
-
+    logger.info("Transcribing")
     # Convert the audio to text using the OpenAI Whisper model
     pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE,
                                   dtype=jnp.float16,
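FlaxWhisperPipline (spelled that way) comes from the whisper-jax package and, once constructed, is called directly on an audio file. A minimal usage sketch, assuming the call signature from the whisper-jax README; "tiny" mirrors WHISPER_MODEL_SIZE from the config shown earlier and "meeting.mp3" is a placeholder file name:

    import jax.numpy as jnp
    from whisper_jax import FlaxWhisperPipline

    pipeline = FlaxWhisperPipline("openai/whisper-tiny", dtype=jnp.float16)
    outputs = pipeline("meeting.mp3", task="transcribe")  # returns a dict
    print(outputs["text"])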
@@ -157,13 +157,14 @@ def main():
     create_talk_diff_scatter_viz(NOW)
 
     # S3 : Push artefacts to S3 bucket
+    prefix = "./artefacts/"
     suffix = NOW.strftime("%m-%d-%Y_%H:%M:%S")
-    files_to_upload = ["transcript_" + suffix + ".txt",
-                       "transcript_with_timestamp_" + suffix + ".txt",
-                       "df_" + suffix + ".pkl",
-                       "wordcloud_" + suffix + ".png",
-                       "mappings_" + suffix + ".pkl",
-                       "scatter_" + suffix + ".html"]
+    files_to_upload = [prefix + "transcript_" + suffix + ".txt",
+                       prefix + "transcript_with_timestamp_" + suffix + ".txt",
+                       prefix + "df_" + suffix + ".pkl",
+                       prefix + "wordcloud_" + suffix + ".png",
+                       prefix + "mappings_" + suffix + ".pkl",
+                       prefix + "scatter_" + suffix + ".html"]
     upload_files(files_to_upload)
 
     summarize(transcript_text, NOW, False, False)
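Prefixing each entry keeps upload_files unchanged while pointing it into the artefacts directory. The same list can be built a little more tightly with a comprehension (a sketch, not the repo's code). Incidentally, the %H:%M:%S timestamps put colons into file names, which is fine on Linux and macOS but not portable to Windows.

    prefix = "./artefacts/"
    suffix = NOW.strftime("%m-%d-%Y_%H:%M:%S")
    names = ["transcript_", "transcript_with_timestamp_", "df_",
             "wordcloud_", "mappings_", "scatter_"]
    exts = [".txt", ".txt", ".pkl", ".png", ".pkl", ".html"]
    files_to_upload = [prefix + n + suffix + e for n, e in zip(names, exts)]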