refactor

2026-02-04 09:56:47 +00:00 · 2023-07-11 18:47:21 +05:30
parent d962ff1712
commit 71eb277fd7
11 changed files with 67 additions and 80 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -160,9 +160,6 @@ cython_debug/
 #.idea/

 *.mp4
-summary.txt
-transcript.txt
-transcript_timestamps.txt
 *.html
 *.pkl
 transcript_*.txt
@@ -176,3 +173,4 @@ test_samples/
 .DS_Store/
 .DS_Store
 .vscode/
+artefacts/
--- a/config.ini
+++ b/config.ini
@@ -1,22 +0,0 @@
-[DEFAULT]
-# Set exception rule for OpenMP error to allow duplicate lib initialization
-KMP_DUPLICATE_LIB_OK = TRUE
-# Export OpenAI API Key
-OPENAI_APIKEY =
-# Export Whisper Model Size
-WHISPER_MODEL_SIZE = tiny
-WHISPER_REAL_TIME_MODEL_SIZE = tiny
-# AWS config
-AWS_ACCESS_KEY = ***REMOVED***
-AWS_SECRET_KEY = ***REMOVED***
-BUCKET_NAME = 'reflector-bucket'
-# Summarizer config
-SUMMARY_MODEL = facebook/bart-large-cnn
-INPUT_ENCODING_MAX_LENGTH = 1024
-MAX_LENGTH = 2048
-BEAM_SIZE = 6
-MAX_CHUNK_LENGTH = 1024
-SUMMARIZE_USING_CHUNKS = YES
-# Audio device
-BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME = aggregator
-AV_FOUNDATION_DEVICE_ID = 2
--- a/requirements.txt
+++ b/requirements.txt
@@ -56,5 +56,4 @@ cached_property==1.5.2
 stamina==23.1.0
 httpx==0.24.1
 sortedcontainers==2.4.0
-openai-whisper@ git+https://github.com/openai/whisper.git@248b6cb124225dd263bb9bd32d060b6517e067f8
 https://github.com/yt-dlp/yt-dlp/archive/master.tar.gz
--- a/scripts/clear_artefacts.sh
+++ b/scripts/clear_artefacts.sh
@@ -1,15 +1,24 @@
 #!/bin/bash

 # Directory containing the generated artefacts to delete
-directory="."
+cwd=$(pwd)
+last_component="${cwd##*/}"
+
+if [ "$last_component" = "reflector" ]; then
+    directory="./artefacts"
+elif [ "$last_component" = "scripts" ]; then
+    directory="../artefacts"
+fi

 # Filename patterns of the generated artefacts to delete (transcripts, summaries, pickles, HTML, word clouds)
-text_file_pattern="transcript_*.txt"
+transcript_file_pattern="transcript_*.txt"
+summary_file_pattern="summary_*.txt"
 pickle_file_pattern="*.pkl"
 html_file_pattern="*.html"
 png_file_pattern="wordcloud*.png"

-find "$directory" -type f -name "$text_file_pattern" -delete
+find "$directory" -type f -name "$transcript_file_pattern" -delete
+find "$directory" -type f -name "$summary_file_pattern" -delete
 find "$directory" -type f -name "$pickle_file_pattern" -delete
 find "$directory" -type f -name "$html_file_pattern" -delete
 find "$directory" -type f -name "$png_file_pattern" -delete
--- a/server_multithreaded.py
+++ b/server_multithreaded.py
@@ -65,6 +65,7 @@ def get_transcription():
                transcribe = True

        if transcribe:
+            print("Transcribing..")
            try:
                sorted_message_queue[frames[0].time] = None
                out_file = io.BytesIO()
@@ -113,7 +114,7 @@ def start_messaging_thread():

 def start_transcription_thread(max_threads: int):
    for i in range(max_threads):
-        t_thread = threading.Thread(target=get_transcription, args=(i,))
+        t_thread = threading.Thread(target=get_transcription)
        t_thread.start()


@@ -128,7 +129,7 @@ async def offer(request: requests.Request):
    def log_info(msg: str, *args):
        logger.info(pc_id + " " + msg, *args)

-    log_info("Created for %s", request.remote)
+    log_info("Created for " + request.remote)

    @pc.on("datachannel")
    def on_datachannel(channel):
@@ -146,14 +147,14 @@ async def offer(request: requests.Request):

    @pc.on("connectionstatechange")
    async def on_connectionstatechange():
-        log_info("Connection state is %s", pc.connectionState)
+        log_info("Connection state is " + pc.connectionState)
        if pc.connectionState == "failed":
            await pc.close()
            pcs.discard(pc)

    @pc.on("track")
    def on_track(track):
-        log_info("Track %s received", track.kind)
+        log_info("Track " + track.kind + " received")
        pc.addTrack(AudioStreamTrack(relay.subscribe(track)))

    # handle offer
--- a/utils/file_utils.py
+++ b/utils/file_utils.py
@@ -3,8 +3,8 @@ import sys
 import boto3
 import botocore

-from log_utils import logger
-from run_utils import config
+from .log_utils import logger
+from .run_utils import config

 BUCKET_NAME = config["DEFAULT"]["BUCKET_NAME"]

--- a/utils/run_utils.py
+++ b/utils/run_utils.py
@@ -6,18 +6,18 @@ from threading import Lock
 from typing import ContextManager, Generic, TypeVar


-class ConfigParser:
-    __config = configparser.ConfigParser()
-
-    def __init__(self, config_file='../config.ini'):
-        self.__config.read(config_file)
+class ReflectorConfig:
+    __config = None

    @staticmethod
    def get_config():
-        return ConfigParser.__config
+        if ReflectorConfig.__config is None:
+            ReflectorConfig.__config = configparser.ConfigParser()
+            ReflectorConfig.__config.read('utils/config.ini')
+        return ReflectorConfig.__config


-config = ConfigParser.get_config()
+config = ReflectorConfig.get_config()


 def run_in_executor(func, *args, executor=None, **kwargs):
--- a/utils/test.py
+++ b/utils/test.py
--- a/utils/text_utilities.py
+++ b/utils/text_utilities.py
@@ -6,8 +6,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from transformers import BartForConditionalGeneration, BartTokenizer

-from log_utils import logger
-from run_utils import config
+from utils.log_utils import logger
+from utils.run_utils import config

 nltk.download('punkt', quiet=True)

@@ -186,7 +186,7 @@ def summarize(transcript_text, timestamp,
        decoded_summaries = [tokenizer.decode(summary, skip_special_tokens=True, clean_up_tokenization_spaces=False)
                             for summary in summaries]
        summary = " ".join(decoded_summaries)
-        with open(output_filename, 'w') as f:
+        with open("./artefacts/" + output_filename, 'w') as f:
            f.write(summary.strip() + "\n")
    else:
        logger.info("Breaking transcript into smaller chunks")
--- a/utils/viz_utilities.py
+++ b/utils/viz_utilities.py
@@ -52,7 +52,7 @@ def create_wordcloud(timestamp, real_time=False):
    else:
        wordcloud_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"

-    plt.savefig(wordcloud_name)
+    plt.savefig("./artefacts/" + wordcloud_name)


 def create_talk_diff_scatter_viz(timestamp, real_time=False):
@@ -77,10 +77,10 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
    # Load the transcription with timestamp
    filename = ""
    if real_time:
-        filename = "real_time_transcript_with_timestamp_" +\
+        filename = "./artefacts/real_time_transcript_with_timestamp_" +\
                   timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
    else:
-        filename = "transcript_with_timestamp_" +\
+        filename = "./artefacts/transcript_with_timestamp_" +\
                   timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
    with open(filename) as f:
        transcription_timestamp_text = f.read()
@@ -162,7 +162,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
                  timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
    else:
        df_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
-    df.to_pickle(df_name)
+    df.to_pickle("./artefacts/" + df_name)

    my_mappings = [ts_to_topic_mapping_top_1, ts_to_topic_mapping_top_2,
                   topic_to_ts_mapping_top_1, topic_to_ts_mapping_top_2]
@@ -173,7 +173,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
                        timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
    else:
        mappings_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
-    pickle.dump(my_mappings, open(mappings_name, "wb"))
+    pickle.dump(my_mappings, open("./artefacts/" + mappings_name, "wb"))

    # to load,  my_mappings = pickle.load( open ("./artefacts/" + mappings_name, "rb") )

@@ -187,27 +187,28 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):

    topic_times = sorted(topic_times.items(), key=lambda x: x[1], reverse=True)

-    cat_1 = topic_times[0][0]
-    cat_1_name = topic_times[0][0]
-    cat_2_name = topic_times[1][0]
+    if len(topic_times) > 1:
+        cat_1 = topic_times[0][0]
+        cat_1_name = topic_times[0][0]
+        cat_2_name = topic_times[1][0]

-    # Scatter plot of topics
-    df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
-    corpus = st.CorpusFromParsedDocuments(
-            df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'
-    ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
-    html = st.produce_scattertext_explorer(
-            corpus,
-            category=cat_1,
-            category_name=cat_1_name,
-            not_category_name=cat_2_name,
-            minimum_term_frequency=0, pmi_threshold_coefficient=0,
-            width_in_pixels=1000,
-            transform=st.Scalers.dense_rank
-    )
-    if real_time:
-        open('./artefacts/real_time_scatter_' +
-             timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
-    else:
-        open('./artefacts/scatter_' +
-             timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
+        # Scatter plot of topics
+        df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
+        corpus = st.CorpusFromParsedDocuments(
+                df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'
+        ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
+        html = st.produce_scattertext_explorer(
+                corpus,
+                category=cat_1,
+                category_name=cat_1_name,
+                not_category_name=cat_2_name,
+                minimum_term_frequency=0, pmi_threshold_coefficient=0,
+                width_in_pixels=1000,
+                transform=st.Scalers.dense_rank
+        )
+        if real_time:
+            open('./artefacts/real_time_scatter_' +
+                 timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
+        else:
+            open('./artefacts/scatter_' +
+                 timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
--- a/whisjax.py
+++ b/whisjax.py
@@ -127,7 +127,7 @@ def main():
        audio_filename = media_file

    logger.info("Finished extracting audio")
-
+    logger.info("Transcribing")
    # Convert the audio to text using the OpenAI Whisper model
    pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE,
                                  dtype=jnp.float16,
@@ -157,13 +157,14 @@ def main():
    create_talk_diff_scatter_viz(NOW)

    # S3 : Push artefacts to S3 bucket
+    prefix = "./artefacts/"
    suffix = NOW.strftime("%m-%d-%Y_%H:%M:%S")
-    files_to_upload = ["transcript_" + suffix + ".txt",
-                       "transcript_with_timestamp_" + suffix + ".txt",
-                       "df_" + suffix + ".pkl",
-                       "wordcloud_" + suffix + ".png",
-                       "mappings_" + suffix + ".pkl",
-                       "scatter_" + suffix + ".html"]
+    files_to_upload = [prefix + "transcript_" + suffix + ".txt",
+                       prefix + "transcript_with_timestamp_" + suffix + ".txt",
+                       prefix + "df_" + suffix + ".pkl",
+                       prefix + "wordcloud_" + suffix + ".png",
+                       prefix + "mappings_" + suffix + ".pkl",
+                       prefix + "scatter_" + suffix + ".html"]
    upload_files(files_to_upload)

    summarize(transcript_text, NOW, False, False)