From ed5cbf191ad581bf396d0ccac4bdd4dddfde5a7a Mon Sep 17 00:00:00 2001
From: Gokul Mohanarangan
Date: Mon, 26 Jun 2023 19:46:23 +0530
Subject: [PATCH] Set up pipeline on a new Mac and make changes to setup

---
 README.md                                  | 83 ++++++++++++----------
 config.ini                                 |  2 +-
 reflector-local/whisper_summarizer_bart.py |  2 +-
 requirements.txt                           |  3 +-
 setup_dependencies.sh                      | 10 +--
 text_utilities.py                          |  3 +-
 whisjax_realtime.py                        |  9 +--
 7 files changed, 58 insertions(+), 54 deletions(-)

diff --git a/README.md b/README.md
index dba2ec44..01cf5a87 100644
--- a/README.md
+++ b/README.md
@@ -4,41 +4,6 @@ This is the code base for the Reflector demo (formerly called agenda-talk-diff)
 
 The target deliverable is a local-first live transcription and visualization tool to compare a discussion's target agenda/objectives to the actual discussion live.
 
-To setup,
-
-1) Check values in config.ini file. Specifically add your OPENAI_APIKEY if you plan to use OpenAI API requests.
-2) Run ``` export KMP_DUPLICATE_LIB_OK=True``` in Terminal. [This is taken care of in code, but not reflecting, Will fix this issue later.]
-3) Run the script setup_depedencies.sh.
-
-   ``` chmod +x setup_dependencies.sh ```
-
-   ``` sh setup_dependencies.sh ```
-
-
-   ENV refers to the intended environment for JAX. JAX is available in several variants, [CPU | GPU | Colab TPU | Google Cloud TPU]
-
-   ```ENV``` is :
-
-   cpu -> JAX CPU installation
-
-   cuda11 -> JAX CUDA 11.x version
-
-   cuda12 -> JAX CUDA 12.x version (Core Weave has CUDA 12 version, can check with ```nvidia-smi```)
-
-   ```sh setup_dependencies.sh cuda12```
-
-
-4) Run the Whisper-JAX pipeline. Currently, the repo can take a Youtube video and transcribes/summarizes it.
-
-``` python3 whisjax.py "https://www.youtube.com/watch?v=ihf0S97oxuQ"```
-
-You can even run it on local file or a file in your configured S3 bucket.
-
-``` python3 whisjax.py "startup.mp4"```
-
-The script will take care of a few cases like youtube file, local file, video file, audio-only file,
-file in S3, etc. If local file is not present, it can automatically take the file from S3.
-
 **S3 bucket:**
@@ -74,9 +39,52 @@ Download:
 
 If you want to access the S3 artefacts, from another machine, you can either use the python file_util
 with the commands mentioned above or simply use the GUI of AWS Management Console.
 
-**WORKFLOW:**
-
-1) Specify the input source file from a local, youtube link or upload to S3 if needed and pass it as input to the script.If the source file is in
+To setup,
+
+1) Check values in the config.ini file. Specifically, add your OPENAI_APIKEY if you plan to use OpenAI API requests.
+2) Run ```export KMP_DUPLICATE_LIB_OK=True``` in Terminal. [This is taken care of in code but is not taking effect; this will be fixed later.]
+
+NOTE: If you don't already have portaudio installed, run ```brew install portaudio```
+
+3) Run the script setup_dependencies.sh.
+
+   ```chmod +x setup_dependencies.sh```
+
+   ```sh setup_dependencies.sh```
+
+   ENV refers to the intended environment for JAX. JAX is available in several variants: [CPU | GPU | Colab TPU | Google Cloud TPU]
+
+   ```ENV``` is one of:
+
+   cpu -> JAX CPU installation
+
+   cuda11 -> JAX CUDA 11.x version
+
+   cuda12 -> JAX CUDA 12.x version (CoreWeave has CUDA 12; you can check with ```nvidia-smi```)
+
+   ```sh setup_dependencies.sh cuda12```
+
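+NOTE: To check which JAX backend actually got installed, a quick sanity check
+like the following can help (an illustrative snippet, not part of the repo):
+
+```python
+import jax
+
+# Expect "cpu" for the cpu install and "gpu" for the cuda11/cuda12 installs.
+print(jax.default_backend())
+print(jax.devices())
+```
+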
+4) If not already done, install ffmpeg.
+```brew install ffmpeg```
+
+If you hit an NLTK SSL certificate error, see [here](https://stackoverflow.com/questions/38916452/nltk-download-ssl-certificate-verify-failed)
+
+5) Run the Whisper-JAX pipeline. Currently, the repo can take a YouTube video and transcribe/summarize it.
+
+```python3 whisjax.py "https://www.youtube.com/watch?v=ihf0S97oxuQ"```
+
+You can also run it on a local file or a file in your configured S3 bucket.
+
+```python3 whisjax.py "startup.mp4"```
+
+The script takes care of a few cases like a YouTube file, a local file, a video file, an audio-only file,
+a file in S3, etc. If the local file is not present, it can automatically fetch the file from S3.
+
+**OFFLINE WORKFLOW:**
+
+1) Specify the input source as a local file or a YouTube link, or upload it to S3 if needed, and pass it as input to the script. If the source file is in
 ```.m4a``` format, it will get converted to ```.mp4``` automatically.
 2) Keep the agenda header topics in a local file named ```agenda-headers.txt```. This needs to be present where the script is run.
 This version of the pipeline compares covered agenda topics using agenda headers in the following format.
@@ -101,7 +109,6 @@ HTML file, a sample word cloud and uploads them to the S3 bucket
 Other visualizations can be planned based on available artefacts or new ones can be created. Refer the section
 ```Viz-experiments```.
 
-
 **Visualization experiments:**
 
 This is a jupyter notebook playground with template instructions on handling the metadata and data artefacts generated from the
diff --git a/config.ini b/config.ini
index 62699b42..11e4ddcf 100644
--- a/config.ini
+++ b/config.ini
@@ -18,4 +18,4 @@ BEAM_SIZE=6
 MAX_CHUNK_LENGTH=1024
 SUMMARIZE_USING_CHUNKS=YES
 # Audio device
-BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME=ref-agg-input
\ No newline at end of file
+BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME=aggregator
\ No newline at end of file
diff --git a/reflector-local/whisper_summarizer_bart.py b/reflector-local/whisper_summarizer_bart.py
index 13b6c66a..b0de87f7 100644
--- a/reflector-local/whisper_summarizer_bart.py
+++ b/reflector-local/whisper_summarizer_bart.py
@@ -6,7 +6,7 @@ from loguru import logger
 from transformers import BartTokenizer, BartForConditionalGeneration
 import whisper
 import nltk
-nltk.download('punkt')
+nltk.download('punkt', quiet=True)
 
 WHISPER_MODEL_SIZE = "base"
 
diff --git a/requirements.txt b/requirements.txt
index 490ee05d..4bd15883 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -48,4 +48,5 @@ pandas
 jupyter
 seaborn
 matplotlib
-termcolor
\ No newline at end of file
+termcolor
+ffmpeg
\ No newline at end of file
diff --git a/setup_dependencies.sh b/setup_dependencies.sh
index 951dcb38..dab656a7 100755
--- a/setup_dependencies.sh
+++ b/setup_dependencies.sh
@@ -1,4 +1,6 @@
- Upgrade pip
+#!/bin/sh
+
+# Upgrade pip
 pip install --upgrade pip
 
 # Default to CPU Installation of JAX
@@ -27,7 +29,5 @@ pip install --upgrade --no-deps --force-reinstall git+https://github.com/sanchit
 pip install -r requirements.txt
 
 # download spacy models
-export KMP_DUPLICATE_LIB_OK=True
-python -m spacy download en_core_web_sm
-python -m spacy download en_core_web_md
-
+spacy download en_core_web_sm
+spacy download en_core_web_md
diff --git a/text_utilities.py b/text_utilities.py
index 1ff0a70c..0d523dd3 100644
--- a/text_utilities.py
+++ b/text_utilities.py
@@ -7,7 +7,7 @@ from nltk.corpus import stopwords
 from sklearn.feature_extraction.text import TfidfVectorizer
 from nltk.tokenize import word_tokenize
 from sklearn.metrics.pairwise import cosine_similarity
-
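+# Make the 'punkt' tokenizer data needed by word_tokenize available;
+# quiet=True suppresses the downloader's console output on repeat runs.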
+nltk.download('punkt', quiet=True) config = configparser.ConfigParser() config.read('config.ini') @@ -20,7 +20,6 @@ def preprocess_sentence(sentence): def compute_similarity(sent1, sent2): tfidf_vectorizer = TfidfVectorizer() - print("semt1", sent1, sent2) if sent1 is not None and sent2 is not None: tfidf_matrix = tfidf_vectorizer.fit_transform([sent1, sent2]) return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0] diff --git a/whisjax_realtime.py b/whisjax_realtime.py index 6bbe0f19..fb26c3b6 100644 --- a/whisjax_realtime.py +++ b/whisjax_realtime.py @@ -11,12 +11,9 @@ from file_utilities import upload_files from viz_utilities import create_wordcloud, create_talk_diff_scatter_viz from text_utilities import summarize, post_process_transcription from loguru import logger -import nltk import time from termcolor import colored -nltk.download('stopwords', quiet=True) - config = configparser.ConfigParser() config.read('config.ini') @@ -25,7 +22,7 @@ WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"] FRAMES_PER_BUFFER = 8000 FORMAT = pyaudio.paInt16 CHANNELS = 2 -RATE = 44100 +RATE = 96000 RECORD_SECONDS = 15 NOW = datetime.now() @@ -43,7 +40,7 @@ def main(): rate=RATE, input=True, frames_per_buffer=FRAMES_PER_BUFFER, - input_device_index=audio_devices['index'] + input_device_index=int(audio_devices['index']) ) pipeline = FlaxWhisperPipline("openai/whisper-" + config["DEFAULT"]["WHISPER_REAL_TIME_MODEL_SIZE"], @@ -124,7 +121,7 @@ def main(): # S3 : Push artefacts to S3 bucket suffix = NOW.strftime("%m-%d-%Y_%H:%M:%S") files_to_upload = ["real_time_transcript_" + suffix + ".txt", - "real_time_transcript_with_timestamp" + suffix + ".txt", + "real_time_transcript_with_timestamp_" + suffix + ".txt", "real_time_df_" + suffix + ".pkl", "real_time_wordcloud_" + suffix + ".png", "real_time_mappings_" + suffix + ".pkl",
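
A note on the final whisjax_realtime.py hunk: the patch wraps the looked-up device index in ```int(...)```, which suggests the lookup can hand back a non-integer value (the lookup code itself is not shown in this patch). Below is a minimal sketch of how such a device lookup could work with PyAudio; the helper ```find_input_device``` is illustrative, not the repo's actual code, and the device name and rate mirror the patched config.ini and the constants above:

```python
import pyaudio

# Hypothetical helper (the repo's real lookup is not shown in this patch):
# find the first input device whose name contains the configured fragment,
# e.g. BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME=aggregator from config.ini.
def find_input_device(pa: pyaudio.PyAudio, name_fragment: str) -> dict:
    for i in range(pa.get_device_count()):
        info = pa.get_device_info_by_index(i)
        if name_fragment in info["name"] and info["maxInputChannels"] > 0:
            return info
    raise RuntimeError(f"no input device matching {name_fragment!r}")

pa = pyaudio.PyAudio()
audio_devices = find_input_device(pa, "aggregator")

# Device-info fields are not guaranteed to be ints, hence the int() cast
# the patch adds at the call site.
stream = pa.open(
    format=pyaudio.paInt16,
    channels=2,
    rate=96000,                # matches the new RATE for the aggregate device
    input=True,
    frames_per_buffer=8000,
    input_device_index=int(audio_devices["index"]),
)

data = stream.read(8000)       # one buffer of 16-bit stereo samples
stream.stop_stream()
stream.close()
pa.terminate()
```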