mirror of https://github.com/Monadical-SAS/reflector.git
synced 2025-12-21 04:39:06 +00:00
refactor
.gitignore  (4 changes, vendored)
@@ -160,9 +160,6 @@ cython_debug/
 #.idea/
 
 *.mp4
-summary.txt
-transcript.txt
-transcript_timestamps.txt
 *.html
 *.pkl
 transcript_*.txt
@@ -176,3 +173,4 @@ test_samples/
 .DS_Store/
 .DS_Store
 .vscode/
+artefacts/
config.ini  (22 changes, file deleted)
@@ -1,22 +0,0 @@
-[DEFAULT]
-# Set exception rule for OpenMP error to allow duplicate lib initialization
-KMP_DUPLICATE_LIB_OK = TRUE
-# Export OpenAI API Key
-OPENAI_APIKEY =
-# Export Whisper Model Size
-WHISPER_MODEL_SIZE = tiny
-WHISPER_REAL_TIME_MODEL_SIZE = tiny
-# AWS config
-AWS_ACCESS_KEY = ***REMOVED***
-AWS_SECRET_KEY = ***REMOVED***
-BUCKET_NAME = 'reflector-bucket'
-# Summarizer config
-SUMMARY_MODEL = facebook/bart-large-cnn
-INPUT_ENCODING_MAX_LENGTH = 1024
-MAX_LENGTH = 2048
-BEAM_SIZE = 6
-MAX_CHUNK_LENGTH = 1024
-SUMMARIZE_USING_CHUNKS = YES
-# Audio device
-BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME = aggregator
-AV_FOUNDATION_DEVICE_ID = 2
[file header missing in mirrored diff]
@@ -56,5 +56,4 @@ cached_property==1.5.2
 stamina==23.1.0
 httpx==0.24.1
 sortedcontainers==2.4.0
-openai-whisper@ git+https://github.com/openai/whisper.git@248b6cb124225dd263bb9bd32d060b6517e067f8
 https://github.com/yt-dlp/yt-dlp/archive/master.tar.gz
[file header missing in mirrored diff]
@@ -1,15 +1,24 @@
 #!/bin/bash
 
 # Directory to search for Python files
-directory="."
+cwd=$(pwd)
+last_component="${cwd##*/}"
+
+if [ "$last_component" = "reflector" ]; then
+    directory="./artefacts"
+elif [ "$last_component" = "scripts" ]; then
+    directory="../artefacts"
+fi
 
 # Pattern to match Python files (e.g., "*.py" for all .py files)
-text_file_pattern="transcript_*.txt"
+transcript_file_pattern="transcript_*.txt"
+summary_file_pattern="summary_*.txt"
 pickle_file_pattern="*.pkl"
 html_file_pattern="*.html"
 png_file_pattern="wordcloud*.png"
 
-find "$directory" -type f -name "$text_file_pattern" -delete
+find "$directory" -type f -name "$transcript_file_pattern" -delete
+find "$directory" -type f -name "$summary_file_pattern" -delete
 find "$directory" -type f -name "$pickle_file_pattern" -delete
 find "$directory" -type f -name "$html_file_pattern" -delete
 find "$directory" -type f -name "$png_file_pattern" -delete
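In the rewritten script, ${cwd##*/} strips everything up to the last "/", leaving only the final directory name. Note that the old default directory="." is gone: launched from any directory not named reflector or scripts, $directory stays unset and the find calls receive an empty string. For comparison, a rough Python sketch of the same resolution logic (directory names taken from the script; the fallback branch is an assumption the script itself no longer has):

    from pathlib import Path

    # Resolve the artefacts directory from the current directory's name,
    # mirroring the bash logic above.
    cwd = Path.cwd()
    if cwd.name == "reflector":        # repo root
        directory = Path("artefacts")
    elif cwd.name == "scripts":        # scripts/ subdirectory
        directory = Path("..") / "artefacts"
    else:
        directory = Path(".")          # assumed fallback; the script has none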
[file header missing in mirrored diff]
@@ -65,6 +65,7 @@ def get_transcription():
     transcribe = True
 
     if transcribe:
+        print("Transcribing..")
         try:
             sorted_message_queue[frames[0].time] = None
             out_file = io.BytesIO()
@@ -113,7 +114,7 @@ def start_messaging_thread():
 
 def start_transcription_thread(max_threads: int):
     for i in range(max_threads):
-        t_thread = threading.Thread(target=get_transcription, args=(i,))
+        t_thread = threading.Thread(target=get_transcription)
         t_thread.start()
 
 
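Dropping args=(i,) matches a get_transcription that no longer takes a worker index: threading.Thread forwards args positionally to target, so the tuple and the callable's signature must agree. A minimal sketch of that contract:

    import threading

    def get_transcription():  # zero-argument target, as after this commit
        pass

    threading.Thread(target=get_transcription).start()  # OK
    # threading.Thread(target=get_transcription, args=(0,)).start()
    # would raise TypeError when the thread runs: unexpected positional argument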
@@ -128,7 +129,7 @@ async def offer(request: requests.Request):
     def log_info(msg: str, *args):
         logger.info(pc_id + " " + msg, *args)
 
-    log_info("Created for %s", request.remote)
+    log_info("Created for " + request.remote)
 
     @pc.on("datachannel")
     def on_datachannel(channel):
@@ -146,14 +147,14 @@ async def offer(request: requests.Request):
 
     @pc.on("connectionstatechange")
     async def on_connectionstatechange():
-        log_info("Connection state is %s", pc.connectionState)
+        log_info("Connection state is " + pc.connectionState)
         if pc.connectionState == "failed":
             await pc.close()
             pcs.discard(pc)
 
     @pc.on("track")
     def on_track(track):
-        log_info("Track %s received", track.kind)
+        log_info("Track " + track.kind + " received")
         pc.addTrack(AudioStreamTrack(relay.subscribe(track)))
 
     # handle offer
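These hunks replace %s-style logging arguments with string concatenation. The output is the same here, but the semantics differ: logger.info("... %s", value) defers formatting until the record is actually emitted, while concatenation builds the string eagerly on every call (and raises TypeError if the value is not a string). A small sketch of the two forms with the standard logging module:

    import logging

    logger = logging.getLogger("reflector")
    state = "connected"

    logger.info("Connection state is %s", state)  # lazy: formatted only if emitted
    logger.info("Connection state is " + state)   # eager: built on every call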
[file header missing in mirrored diff]
@@ -3,8 +3,8 @@ import sys
 import boto3
 import botocore
 
-from log_utils import logger
-from run_utils import config
+from .log_utils import logger
+from .run_utils import config
 
 BUCKET_NAME = config["DEFAULT"]["BUCKET_NAME"]
 
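The leading dot makes these package-relative imports: .log_utils is resolved against the package containing this module rather than against sys.path. That only works when the file is imported as part of a package (presumably utils, given the other hunks in this commit) and fails when the file is run as a script. A minimal sketch of the implied layout; the storage.py name is a placeholder, not the repo's actual file name:

    # Implied layout (assumption based on the import changes):
    #   utils/
    #     __init__.py
    #     log_utils.py
    #     run_utils.py
    #     storage.py      <- hypothetical name for this boto3 module
    #
    # Inside utils/storage.py:
    from .log_utils import logger   # resolves to utils.log_utils
    from .run_utils import config   # resolves to utils.run_utils
    # Running "python utils/storage.py" directly would raise:
    # ImportError: attempted relative import with no known parent package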
[file header missing in mirrored diff]
@@ -6,18 +6,18 @@ from threading import Lock
 from typing import ContextManager, Generic, TypeVar
 
 
-class ConfigParser:
-    __config = configparser.ConfigParser()
+class ReflectorConfig:
+    __config = None
 
-    def __init__(self, config_file='../config.ini'):
-        self.__config.read(config_file)
-
     @staticmethod
     def get_config():
-        return ConfigParser.__config
+        if ReflectorConfig.__config is None:
+            ReflectorConfig.__config = configparser.ConfigParser()
+            ReflectorConfig.__config.read('utils/config.ini')
+        return ReflectorConfig.__config
 
 
-config = ConfigParser.get_config()
+config = ReflectorConfig.get_config()
 
 
 def run_in_executor(func, *args, executor=None, **kwargs):
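The renamed class is now a lazily initialized singleton: the configparser.ConfigParser is no longer built at class-definition time but on the first get_config() call, after which every importer shares the same parsed instance. The 'utils/config.ini' path also suggests the root config.ini deleted above was relocated under utils/; note that the hard-coded relative path is resolved against the process working directory, so it only loads when the program is started from the repository root. A usage sketch, with the WHISPER_MODEL_SIZE key taken from the config file shown earlier:

    from utils.run_utils import config  # triggers the lazy read on first use

    # configparser returns strings; .get() supplies a fallback if the key is absent
    model_size = config["DEFAULT"].get("WHISPER_MODEL_SIZE", "tiny")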
utils/test.py  (new empty file)
[file header missing in mirrored diff]
@@ -6,8 +6,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from transformers import BartForConditionalGeneration, BartTokenizer
 
-from log_utils import logger
-from run_utils import config
+from utils.log_utils import logger
+from utils.run_utils import config
 
 nltk.download('punkt', quiet=True)
 
@@ -186,7 +186,7 @@ def summarize(transcript_text, timestamp,
         decoded_summaries = [tokenizer.decode(summary, skip_special_tokens=True, clean_up_tokenization_spaces=False)
                              for summary in summaries]
         summary = " ".join(decoded_summaries)
-        with open(output_filename, 'w') as f:
+        with open("./artefacts/" + output_filename, 'w') as f:
             f.write(summary.strip() + "\n")
     else:
         logger.info("Breaking transcript into smaller chunks")
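This is the first of several hunks that prepend a hard-coded "./artefacts/" to an output path. If that convention holds, a tiny helper could centralize it; artefact_path below is not in the repo, just an illustration:

    import os

    ARTEFACTS_DIR = "./artefacts"

    def artefact_path(name: str) -> str:
        """Hypothetical helper: place an output file under the artefacts dir."""
        return os.path.join(ARTEFACTS_DIR, name)

    # e.g. open(artefact_path(output_filename), 'w') instead of
    #      open("./artefacts/" + output_filename, 'w')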
[file header missing in mirrored diff]
@@ -52,7 +52,7 @@ def create_wordcloud(timestamp, real_time=False):
     else:
         wordcloud_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
 
-    plt.savefig(wordcloud_name)
+    plt.savefig("./artefacts/" + wordcloud_name)
 
 
 def create_talk_diff_scatter_viz(timestamp, real_time=False):
@@ -77,10 +77,10 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
     # Load the transcription with timestamp
     filename = ""
     if real_time:
-        filename = "real_time_transcript_with_timestamp_" +\
+        filename = "./artefacts/real_time_transcript_with_timestamp_" +\
             timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
     else:
-        filename = "transcript_with_timestamp_" +\
+        filename = "./artefacts/transcript_with_timestamp_" +\
             timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
     with open(filename) as f:
         transcription_timestamp_text = f.read()
@@ -162,7 +162,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
             timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
     else:
         df_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
-    df.to_pickle(df_name)
+    df.to_pickle("./artefacts/" + df_name)
 
     my_mappings = [ts_to_topic_mapping_top_1, ts_to_topic_mapping_top_2,
                    topic_to_ts_mapping_top_1, topic_to_ts_mapping_top_2]
@@ -173,7 +173,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
             timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
     else:
         mappings_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
-    pickle.dump(my_mappings, open(mappings_name, "wb"))
+    pickle.dump(my_mappings, open("./artefacts/" + mappings_name, "wb"))
 
     # to load, my_mappings = pickle.load( open ("mappings.pkl", "rb") )
 
@@ -187,27 +187,28 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
 
     topic_times = sorted(topic_times.items(), key=lambda x: x[1], reverse=True)
 
-    cat_1 = topic_times[0][0]
-    cat_1_name = topic_times[0][0]
-    cat_2_name = topic_times[1][0]
+    if len(topic_times) > 1:
+        cat_1 = topic_times[0][0]
+        cat_1_name = topic_times[0][0]
+        cat_2_name = topic_times[1][0]
 
     # Scatter plot of topics
     df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
     corpus = st.CorpusFromParsedDocuments(
         df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'
     ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
     html = st.produce_scattertext_explorer(
         corpus,
         category=cat_1,
         category_name=cat_1_name,
         not_category_name=cat_2_name,
         minimum_term_frequency=0, pmi_threshold_coefficient=0,
         width_in_pixels=1000,
         transform=st.Scalers.dense_rank
     )
     if real_time:
         open('./artefacts/real_time_scatter_' +
              timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
     else:
         open('./artefacts/scatter_' +
              timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
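The new len(topic_times) > 1 guard avoids the old IndexError from topic_times[1][0] when only one topic is found, but cat_1, cat_1_name, and cat_2_name are then never bound, and the unconditional st.produce_scattertext_explorer(...) call below would raise NameError instead. A defensive sketch of one way to close that hole (an assumption about intent, not code from the repo):

    if len(topic_times) > 1:
        cat_1 = topic_times[0][0]
        cat_1_name = topic_times[0][0]
        cat_2_name = topic_times[1][0]
    else:
        # Fewer than two topics: a two-category scatter plot is impossible
        logger.info("Need at least two topics for the scatter plot; skipping")
        return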
whisjax.py  (15 changes)
@@ -127,7 +127,7 @@ def main():
         audio_filename = media_file
 
     logger.info("Finished extracting audio")
-
+    logger.info("Transcribing")
     # Convert the audio to text using the OpenAI Whisper model
     pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE,
                                   dtype=jnp.float16,
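FlaxWhisperPipline (spelled that way) comes from the whisper-jax package and, once constructed, is called directly on an audio file. A minimal usage sketch, assuming the call signature from the whisper-jax README; "tiny" mirrors WHISPER_MODEL_SIZE from the config shown earlier and "meeting.mp3" is a placeholder file name:

    import jax.numpy as jnp
    from whisper_jax import FlaxWhisperPipline

    pipeline = FlaxWhisperPipline("openai/whisper-tiny", dtype=jnp.float16)
    outputs = pipeline("meeting.mp3", task="transcribe")  # returns a dict
    print(outputs["text"])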
@@ -157,13 +157,14 @@ def main():
     create_talk_diff_scatter_viz(NOW)
 
     # S3 : Push artefacts to S3 bucket
+    prefix = "./artefacts/"
     suffix = NOW.strftime("%m-%d-%Y_%H:%M:%S")
-    files_to_upload = ["transcript_" + suffix + ".txt",
-                       "transcript_with_timestamp_" + suffix + ".txt",
-                       "df_" + suffix + ".pkl",
-                       "wordcloud_" + suffix + ".png",
-                       "mappings_" + suffix + ".pkl",
-                       "scatter_" + suffix + ".html"]
+    files_to_upload = [prefix + "transcript_" + suffix + ".txt",
+                       prefix + "transcript_with_timestamp_" + suffix + ".txt",
+                       prefix + "df_" + suffix + ".pkl",
+                       prefix + "wordcloud_" + suffix + ".png",
+                       prefix + "mappings_" + suffix + ".pkl",
+                       prefix + "scatter_" + suffix + ".html"]
     upload_files(files_to_upload)
 
     summarize(transcript_text, NOW, False, False)
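Prefixing each entry keeps upload_files unchanged while pointing it into the artefacts directory. The same list can be built a little more tightly with a comprehension (a sketch, not the repo's code). Incidentally, the %H:%M:%S timestamps put colons into file names, which is fine on Linux and macOS but not portable to Windows.

    prefix = "./artefacts/"
    suffix = NOW.strftime("%m-%d-%Y_%H:%M:%S")
    names = ["transcript_", "transcript_with_timestamp_", "df_",
             "wordcloud_", "mappings_", "scatter_"]
    exts = [".txt", ".txt", ".pkl", ".png", ".pkl", ".html"]
    files_to_upload = [prefix + n + suffix + e for n, e in zip(names, exts)]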