From 25f34bf9e5d1bc37316ebf263dfa6d2b5b0c2398 Mon Sep 17 00:00:00 2001
From: Gokul Mohanarangan
Date: Tue, 25 Jul 2023 10:02:25 +0530
Subject: [PATCH] organize imports

---
 __init__.py             |  0
 format_output.py        | 30 ------------------------------
 server.py               | 34 ++++++++++++++++------------------
 stream_client.py        |  2 +-
 utils/config.ini        | 32 ++++++++++++++++----------------
 utils/format_output.py  | 32 ++++++++++++++++++++++++++++++++
 utils/text_utilities.py |  6 +++---
 utils/viz_utilities.py  | 23 +++++++++++------------
 8 files changed, 79 insertions(+), 80 deletions(-)
 create mode 100644 __init__.py
 delete mode 100644 format_output.py
 create mode 100644 utils/format_output.py

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/format_output.py b/format_output.py
deleted file mode 100644
index 6cc3006c..00000000
--- a/format_output.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import json
-
-with open("meeting_titles_and_summaries.txt", "r") as f:
-    outputs = f.read()
-
-outputs = json.loads(outputs)
-
-transcript_file = open("meeting_transcript.txt", "a")
-title_description_file = open("meeting_title_description.txt", "a")
-
-for item in outputs["topics"]:
-    transcript_file.write(item["transcript"])
-
-    title_description_file.write("TITLE: \n")
-    title_description_file.write(item["title"])
-    title_description_file.write("\n")
-
-    title_description_file.write("DESCRIPTION: \n")
-    title_description_file.write(item["description"])
-    title_description_file.write("\n")
-
-    title_description_file.write("TRANSCRIPT: \n")
-    title_description_file.write(item["transcript"])
-    title_description_file.write("\n")
-
-    title_description_file.write("---------------------------------------- \n\n")
-
-
-
-
diff --git a/server.py b/server.py
index 2b9ffd4e..6ff68400 100644
--- a/server.py
+++ b/server.py
@@ -1,25 +1,23 @@
 import asyncio
 import datetime
-import os
-import io
-import numpy as np
 import json
+import os
 import uuid
 import wave
 from concurrent.futures import ThreadPoolExecutor
-from faster_whisper import WhisperModel
+
 import aiohttp_cors
-import jax.numpy as jnp
 import requests
 from aiohttp import web
 from aiortc import MediaStreamTrack, RTCPeerConnection, RTCSessionDescription
 from aiortc.contrib.media import MediaRelay
 from av import AudioFifo
+from faster_whisper import WhisperModel
 from loguru import logger
-from whisper_jax import FlaxWhisperPipline
-from utils.run_utils import run_in_executor
 from sortedcontainers import SortedDict
+from utils.run_utils import run_in_executor
+
 pcs = set()
 relay = MediaRelay()
 data_channel = None
@@ -45,7 +43,7 @@ blacklisted_messages = [" Thank you.", " See you next time!",
 
 
 def get_title_and_summary(llm_input_text, last_timestamp):
-    print("Generating title and summary")
+    logger.info("Generating title and summary")
 
     # output = llm.generate(prompt)
     # Use monadical-ml to fire this query to an LLM and get result
@@ -69,13 +67,13 @@ def get_title_and_summary(llm_input_text,
         "prompt": prompt
     }
 
-    # To-do: Handle unexpected output formats from the model
+    # TODO: Handle unexpected output formats from the model
     try:
         response = requests.post(LLM_URL, headers=headers, json=data)
         output = json.loads(response.json()["results"][0]["text"])
         output["description"] = output.pop("summary")
         output["transcript"] = llm_input_text
-        output["timestamp"] =\
+        output["timestamp"] = \
             str(datetime.timedelta(seconds=round(last_timestamp)))
         incremental_responses.append(output)
         result = {
@@ -84,13 +82,13 @@ def get_title_and_summary(llm_input_text, last_timestamp):
         }
     except Exception as e:
-        print("Exception" + str(e))
+        logger.info("Exception" + str(e))
         result = None
 
     return result
 
 
 def channel_log(channel, t, message):
-    print("channel(%s) %s %s" % (channel.label, t, message))
+    logger.info("channel(%s) %s %s" % (channel.label, t, message))
 
 
 def channel_send(channel, message):
@@ -120,17 +118,18 @@ def channel_send_transcript(channel):
             if len(sorted_transcripts) >= 3:
                 del sorted_transcripts[least_time]
     except Exception as e:
-        print("Exception", str(e))
+        logger.info("Exception" + str(e))
        pass
 
 
 def get_transcription(frames):
-    print("Transcribing..")
+    logger.info("Transcribing..")
     sorted_transcripts[frames[0].time] = None
 
+    # TODO:
     # Passing IO objects instead of temporary files throws an error
     # Passing ndarrays (typecasted with float) does not give any
-    # transcription. Refer issue
+    # transcription. Refer to the issue:
     # https://github.com/guillaumekln/faster-whisper/issues/369
     audiofilename = "test" + str(datetime.datetime.now())
     wf = wave.open(audiofilename, "wb")
@@ -170,7 +169,7 @@ def get_transcription(frames):
             transcription_text += result_text
 
     except Exception as e:
-        print("Exception" + str(e))
+        logger.info("Exception" + str(e))
         pass
 
     result = {
@@ -195,7 +194,7 @@ def get_final_summary_response():
         "summary": final_summary
     }
 
-    with open("meeting_titles_and_summaries.txt", "a") as f:
+    with open("./artefacts/meeting_titles_and_summaries.txt", "a") as f:
         f.write(json.dumps(incremental_responses))
 
     return response
@@ -275,7 +274,6 @@ async def offer(request):
             if isinstance(message, str) and message.startswith("ping"):
                 channel_send(channel, "pong" + message[4:])
 
-
     @pc.on("connectionstatechange")
     async def on_connectionstatechange():
         log_info("Connection state is " + pc.connectionState)
diff --git a/stream_client.py b/stream_client.py
index 124c734d..1ed9cf31 100644
--- a/stream_client.py
+++ b/stream_client.py
@@ -114,7 +114,7 @@ class StreamClient:
             self.channel_log(channel, "<", message)
 
             if isinstance(message, str) and message.startswith("pong"):
-                elapsed_ms = (self.current_stamp() - int(message[5:]))\
+                elapsed_ms = (self.current_stamp() - int(message[5:])) \
                     / 1000
                 print(" RTT %.2f ms" % elapsed_ms)
 
diff --git a/utils/config.ini b/utils/config.ini
index 976f4a32..9ba12959 100644
--- a/utils/config.ini
+++ b/utils/config.ini
@@ -1,24 +1,24 @@
 [DEFAULT]
 # Set exception rule for OpenMP error to allow duplicate lib initialization
-KMP_DUPLICATE_LIB_OK=TRUE
+KMP_DUPLICATE_LIB_OK = TRUE
 # Export OpenAI API Key
-OPENAI_APIKEY=
+OPENAI_APIKEY =
 # Export Whisper Model Size
-WHISPER_MODEL_SIZE=tiny
-WHISPER_REAL_TIME_MODEL_SIZE=tiny
+WHISPER_MODEL_SIZE = tiny
+WHISPER_REAL_TIME_MODEL_SIZE = tiny
 # AWS config
-AWS_ACCESS_KEY=***REMOVED***
-AWS_SECRET_KEY=***REMOVED***
-BUCKET_NAME=reflector-bucket
+AWS_ACCESS_KEY = ***REMOVED***
+AWS_SECRET_KEY = ***REMOVED***
+BUCKET_NAME = reflector-bucket
 # Summarizer config
-SUMMARY_MODEL=facebook/bart-large-cnn
-INPUT_ENCODING_MAX_LENGTH=1024
-MAX_LENGTH=2048
-BEAM_SIZE=6
-MAX_CHUNK_LENGTH=1024
-SUMMARIZE_USING_CHUNKS=YES
+SUMMARY_MODEL = facebook/bart-large-cnn
+INPUT_ENCODING_MAX_LENGTH = 1024
+MAX_LENGTH = 2048
+BEAM_SIZE = 6
+MAX_CHUNK_LENGTH = 1024
+SUMMARIZE_USING_CHUNKS = YES
 # Audio device
-BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME=aggregator
-AV_FOUNDATION_DEVICE_ID=1
+BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME = aggregator
+AV_FOUNDATION_DEVICE_ID = 1
 # LLM PATH
-LLM_PATH=
+LLM_PATH =
diff --git a/utils/format_output.py b/utils/format_output.py
new file mode 100644
index 00000000..4f026ce2
--- /dev/null
+++ b/utils/format_output.py
@@ -0,0 +1,32 @@
+import json
+
+with open("../artefacts/meeting_titles_and_summaries.txt", "r") as f:
+    outputs = f.read()
+
+outputs = json.loads(outputs)
+
+transcript_file = open("../artefacts/meeting_transcript.txt", "a")
+title_desc_file = open("../artefacts/meeting_title_description.txt", "a")
+summary_file = open("../artefacts/meeting_summary.txt", "a")
+
+for item in outputs["topics"]:
+    transcript_file.write(item["transcript"])
+    summary_file.write(item["description"])
+
+    title_desc_file.write("TITLE: \n")
+    title_desc_file.write(item["title"])
+    title_desc_file.write("\n")
+
+    title_desc_file.write("DESCRIPTION: \n")
+    title_desc_file.write(item["description"])
+    title_desc_file.write("\n")
+
+    title_desc_file.write("TRANSCRIPT: \n")
+    title_desc_file.write(item["transcript"])
+    title_desc_file.write("\n")
+
+    title_desc_file.write("---------------------------------------- \n\n")
+
+transcript_file.close()
+title_desc_file.close()
+summary_file.close()
diff --git a/utils/text_utilities.py b/utils/text_utilities.py
index ef15c7a3..6210e78e 100644
--- a/utils/text_utilities.py
+++ b/utils/text_utilities.py
@@ -6,8 +6,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from transformers import BartForConditionalGeneration, BartTokenizer
 
-from utils.log_utils import logger
-from utils.run_utils import config
+from log_utils import logger
+from run_utils import config
 
 nltk.download('punkt', quiet=True)
 
@@ -171,7 +171,7 @@ def summarize(transcript_text, timestamp,
         output_filename = "real_time_" + output_filename
 
     if summarize_using_chunks != "YES":
-        inputs = tokenizer.\
+        inputs = tokenizer. \
             batch_encode_plus([transcript_text],
                               truncation=True, padding='longest',
                               max_length=int(config["DEFAULT"]["INPUT_ENCODING_MAX_LENGTH"]),
diff --git a/utils/viz_utilities.py b/utils/viz_utilities.py
index 93a9b56f..6da24bb0 100644
--- a/utils/viz_utilities.py
+++ b/utils/viz_utilities.py
@@ -13,7 +13,7 @@ from wordcloud import STOPWORDS, WordCloud
 en = spacy.load('en_core_web_md')
 spacy_stopwords = en.Defaults.stop_words
 
-STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))).\
+STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))). \
     union(set(spacy_stopwords))
 
 
@@ -24,7 +24,7 @@ def create_wordcloud(timestamp, real_time=False):
     """
     filename = "transcript"
     if real_time:
-        filename = "real_time_" + filename + "_" +\
+        filename = "real_time_" + filename + "_" + \
             timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
     else:
         filename += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
@@ -47,7 +47,7 @@ def create_wordcloud(timestamp, real_time=False):
 
     wordcloud_name = "wordcloud"
     if real_time:
-        wordcloud_name = "real_time_" + wordcloud_name + "_" +\
+        wordcloud_name = "real_time_" + wordcloud_name + "_" + \
             timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
     else:
         wordcloud_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
@@ -57,12 +57,12 @@ def create_wordcloud(timestamp, real_time=False):
 
 def create_talk_diff_scatter_viz(timestamp, real_time=False):
     """
-    Perform agenda vs transription diff to see covered topics.
+    Perform agenda vs transcription diff to see covered topics.
     Create a scatter plot of words in topics.
     :return: None. Saved locally.
""" - spaCy_model = "en_core_web_md" - nlp = spacy.load(spaCy_model) + spacy_model = "en_core_web_md" + nlp = spacy.load(spacy_model) nlp.add_pipe('sentencizer') agenda_topics = [] @@ -75,12 +75,11 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False): agenda_topics.append(line.split(":")[0]) # Load the transcription with timestamp - filename = "" if real_time: - filename = "./artefacts/real_time_transcript_with_timestamp_" +\ + filename = "./artefacts/real_time_transcript_with_timestamp_" + \ timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt" else: - filename = "./artefacts/transcript_with_timestamp_" +\ + filename = "./artefacts/transcript_with_timestamp_" + \ timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt" with open(filename) as f: transcription_timestamp_text = f.read() @@ -142,7 +141,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False): df = df.apply(create_new_columns, axis=1) - # Count the number of items covered and calculatre the percentage + # Count the number of items covered and calculate the percentage num_covered_items = sum(covered_items.values()) percentage_covered = num_covered_items / len(agenda) * 100 @@ -158,7 +157,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False): # Save df, mappings for further experimentation df_name = "df" if real_time: - df_name = "real_time_" + df_name + "_" +\ + df_name = "real_time_" + df_name + "_" + \ timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl" else: df_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl" @@ -169,7 +168,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False): mappings_name = "mappings" if real_time: - mappings_name = "real_time_" + mappings_name + "_" +\ + mappings_name = "real_time_" + mappings_name + "_" + \ timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl" else: mappings_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"