diff --git a/.gitignore b/.gitignore index fd3e8b20..c08eb9a3 100644 --- a/.gitignore +++ b/.gitignore @@ -160,9 +160,6 @@ cython_debug/ #.idea/ *.mp4 -summary.txt -transcript.txt -transcript_timestamps.txt *.html *.pkl transcript_*.txt @@ -176,3 +173,4 @@ test_samples/ .DS_Store/ .DS_Store .vscode/ +artefacts/ diff --git a/config.ini b/config.ini deleted file mode 100644 index 0092129f..00000000 --- a/config.ini +++ /dev/null @@ -1,22 +0,0 @@ -[DEFAULT] -# Set exception rule for OpenMP error to allow duplicate lib initialization -KMP_DUPLICATE_LIB_OK = TRUE -# Export OpenAI API Key -OPENAI_APIKEY = -# Export Whisper Model Size -WHISPER_MODEL_SIZE = tiny -WHISPER_REAL_TIME_MODEL_SIZE = tiny -# AWS config -AWS_ACCESS_KEY = ***REMOVED*** -AWS_SECRET_KEY = ***REMOVED*** -BUCKET_NAME = 'reflector-bucket' -# Summarizer config -SUMMARY_MODEL = facebook/bart-large-cnn -INPUT_ENCODING_MAX_LENGTH = 1024 -MAX_LENGTH = 2048 -BEAM_SIZE = 6 -MAX_CHUNK_LENGTH = 1024 -SUMMARIZE_USING_CHUNKS = YES -# Audio device -BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME = aggregator -AV_FOUNDATION_DEVICE_ID = 2 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 23b8e38d..21fdd61a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -56,5 +56,4 @@ cached_property==1.5.2 stamina==23.1.0 httpx==0.24.1 sortedcontainers==2.4.0 -openai-whisper@ git+https://github.com/openai/whisper.git@248b6cb124225dd263bb9bd32d060b6517e067f8 https://github.com/yt-dlp/yt-dlp/archive/master.tar.gz diff --git a/scripts/clear_artefacts.sh b/scripts/clear_artefacts.sh index c06c4c2c..d6e7722f 100755 --- a/scripts/clear_artefacts.sh +++ b/scripts/clear_artefacts.sh @@ -1,15 +1,24 @@ #!/bin/bash # Directory to search for Python files -directory="." +cwd=$(pwd) +last_component="${cwd##*/}" + +if [ "$last_component" = "reflector" ]; then + directory="./artefacts" +elif [ "$last_component" = "scripts" ]; then + directory="../artefacts" +fi # Pattern to match Python files (e.g., "*.py" for all .py files) -text_file_pattern="transcript_*.txt" +transcript_file_pattern="transcript_*.txt" +summary_file_pattern="summary_*.txt" pickle_file_pattern="*.pkl" html_file_pattern="*.html" png_file_pattern="wordcloud*.png" -find "$directory" -type f -name "$text_file_pattern" -delete +find "$directory" -type f -name "$transcript_file_pattern" -delete +find "$directory" -type f -name "$summary_file_pattern" -delete find "$directory" -type f -name "$pickle_file_pattern" -delete find "$directory" -type f -name "$html_file_pattern" -delete find "$directory" -type f -name "$png_file_pattern" -delete diff --git a/server_multithreaded.py b/server_multithreaded.py index 7382a654..2862fa36 100644 --- a/server_multithreaded.py +++ b/server_multithreaded.py @@ -65,6 +65,7 @@ def get_transcription(): transcribe = True if transcribe: + print("Transcribing..") try: sorted_message_queue[frames[0].time] = None out_file = io.BytesIO() @@ -113,7 +114,7 @@ def start_messaging_thread(): def start_transcription_thread(max_threads: int): for i in range(max_threads): - t_thread = threading.Thread(target=get_transcription, args=(i,)) + t_thread = threading.Thread(target=get_transcription) t_thread.start() @@ -128,7 +129,7 @@ async def offer(request: requests.Request): def log_info(msg: str, *args): logger.info(pc_id + " " + msg, *args) - log_info("Created for %s", request.remote) + log_info("Created for " + request.remote) @pc.on("datachannel") def on_datachannel(channel): @@ -146,14 +147,14 @@ async def offer(request: requests.Request): @pc.on("connectionstatechange") async def on_connectionstatechange(): - log_info("Connection state is %s", pc.connectionState) + log_info("Connection state is " + pc.connectionState) if pc.connectionState == "failed": await pc.close() pcs.discard(pc) @pc.on("track") def on_track(track): - log_info("Track %s received", track.kind) + log_info("Track " + track.kind + " received") pc.addTrack(AudioStreamTrack(relay.subscribe(track))) # handle offer diff --git a/utils/file_utils.py b/utils/file_utils.py index 2c14f00f..cc9a9ded 100644 --- a/utils/file_utils.py +++ b/utils/file_utils.py @@ -3,8 +3,8 @@ import sys import boto3 import botocore -from log_utils import logger -from run_utils import config +from .log_utils import logger +from .run_utils import config BUCKET_NAME = config["DEFAULT"]["BUCKET_NAME"] diff --git a/utils/run_utils.py b/utils/run_utils.py index dca09c87..bb2b6348 100644 --- a/utils/run_utils.py +++ b/utils/run_utils.py @@ -6,18 +6,18 @@ from threading import Lock from typing import ContextManager, Generic, TypeVar -class ConfigParser: - __config = configparser.ConfigParser() - - def __init__(self, config_file='../config.ini'): - self.__config.read(config_file) +class ReflectorConfig: + __config = None @staticmethod def get_config(): - return ConfigParser.__config + if ReflectorConfig.__config is None: + ReflectorConfig.__config = configparser.ConfigParser() + ReflectorConfig.__config.read('utils/config.ini') + return ReflectorConfig.__config -config = ConfigParser.get_config() +config = ReflectorConfig.get_config() def run_in_executor(func, *args, executor=None, **kwargs): diff --git a/utils/test.py b/utils/test.py new file mode 100644 index 00000000..e69de29b diff --git a/utils/text_utilities.py b/utils/text_utilities.py index 519990cb..ef15c7a3 100644 --- a/utils/text_utilities.py +++ b/utils/text_utilities.py @@ -6,8 +6,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity from transformers import BartForConditionalGeneration, BartTokenizer -from log_utils import logger -from run_utils import config +from utils.log_utils import logger +from utils.run_utils import config nltk.download('punkt', quiet=True) @@ -186,7 +186,7 @@ def summarize(transcript_text, timestamp, decoded_summaries = [tokenizer.decode(summary, skip_special_tokens=True, clean_up_tokenization_spaces=False) for summary in summaries] summary = " ".join(decoded_summaries) - with open(output_filename, 'w') as f: + with open("./artefacts/" + output_filename, 'w') as f: f.write(summary.strip() + "\n") else: logger.info("Breaking transcript into smaller chunks") diff --git a/utils/viz_utilities.py b/utils/viz_utilities.py index e1ab88c9..93a9b56f 100644 --- a/utils/viz_utilities.py +++ b/utils/viz_utilities.py @@ -52,7 +52,7 @@ def create_wordcloud(timestamp, real_time=False): else: wordcloud_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png" - plt.savefig(wordcloud_name) + plt.savefig("./artefacts/" + wordcloud_name) def create_talk_diff_scatter_viz(timestamp, real_time=False): @@ -77,10 +77,10 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False): # Load the transcription with timestamp filename = "" if real_time: - filename = "real_time_transcript_with_timestamp_" +\ + filename = "./artefacts/real_time_transcript_with_timestamp_" +\ timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt" else: - filename = "transcript_with_timestamp_" +\ + filename = "./artefacts/transcript_with_timestamp_" +\ timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt" with open(filename) as f: transcription_timestamp_text = f.read() @@ -162,7 +162,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False): timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl" else: df_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl" - df.to_pickle(df_name) + df.to_pickle("./artefacts/" + df_name) my_mappings = [ts_to_topic_mapping_top_1, ts_to_topic_mapping_top_2, topic_to_ts_mapping_top_1, topic_to_ts_mapping_top_2] @@ -173,7 +173,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False): timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl" else: mappings_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl" - pickle.dump(my_mappings, open(mappings_name, "wb")) + pickle.dump(my_mappings, open("./artefacts/" + mappings_name, "wb")) # to load, my_mappings = pickle.load( open ("mappings.pkl", "rb") ) @@ -187,27 +187,28 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False): topic_times = sorted(topic_times.items(), key=lambda x: x[1], reverse=True) - cat_1 = topic_times[0][0] - cat_1_name = topic_times[0][0] - cat_2_name = topic_times[1][0] + if len(topic_times) > 1: + cat_1 = topic_times[0][0] + cat_1_name = topic_times[0][0] + cat_2_name = topic_times[1][0] - # Scatter plot of topics - df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences)) - corpus = st.CorpusFromParsedDocuments( - df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse' - ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000)) - html = st.produce_scattertext_explorer( - corpus, - category=cat_1, - category_name=cat_1_name, - not_category_name=cat_2_name, - minimum_term_frequency=0, pmi_threshold_coefficient=0, - width_in_pixels=1000, - transform=st.Scalers.dense_rank - ) - if real_time: - open('./artefacts/real_time_scatter_' + - timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html) - else: - open('./artefacts/scatter_' + - timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html) + # Scatter plot of topics + df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences)) + corpus = st.CorpusFromParsedDocuments( + df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse' + ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000)) + html = st.produce_scattertext_explorer( + corpus, + category=cat_1, + category_name=cat_1_name, + not_category_name=cat_2_name, + minimum_term_frequency=0, pmi_threshold_coefficient=0, + width_in_pixels=1000, + transform=st.Scalers.dense_rank + ) + if real_time: + open('./artefacts/real_time_scatter_' + + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html) + else: + open('./artefacts/scatter_' + + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html) diff --git a/whisjax.py b/whisjax.py index 9e8ce4cf..53e16cd3 100644 --- a/whisjax.py +++ b/whisjax.py @@ -127,7 +127,7 @@ def main(): audio_filename = media_file logger.info("Finished extracting audio") - + logger.info("Transcribing") # Convert the audio to text using the OpenAI Whisper model pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE, dtype=jnp.float16, @@ -157,13 +157,14 @@ def main(): create_talk_diff_scatter_viz(NOW) # S3 : Push artefacts to S3 bucket + prefix = "./artefacts/" suffix = NOW.strftime("%m-%d-%Y_%H:%M:%S") - files_to_upload = ["transcript_" + suffix + ".txt", - "transcript_with_timestamp_" + suffix + ".txt", - "df_" + suffix + ".pkl", - "wordcloud_" + suffix + ".png", - "mappings_" + suffix + ".pkl", - "scatter_" + suffix + ".html"] + files_to_upload = [prefix + "transcript_" + suffix + ".txt", + prefix + "transcript_with_timestamp_" + suffix + ".txt", + prefix + "df_" + suffix + ".pkl", + prefix + "wordcloud_" + suffix + ".png", + prefix + "mappings_" + suffix + ".pkl", + prefix + "scatter_" + suffix + ".html"] upload_files(files_to_upload) summarize(transcript_text, NOW, False, False)