From ed5cbf191ad581bf396d0ccac4bdd4dddfde5a7a Mon Sep 17 00:00:00 2001
From: Gokul Mohanarangan
Date: Mon, 26 Jun 2023 19:46:23 +0530
Subject: [PATCH] Set up pipeline on a new Mac and make changes to setup

---
 README.md                                  | 83 ++++++++++++----------
 config.ini                                 |  2 +-
 reflector-local/whisper_summarizer_bart.py |  2 +-
 requirements.txt                           |  3 +-
 setup_dependencies.sh                      | 10 +--
 text_utilities.py                          |  3 +-
 whisjax_realtime.py                        |  9 +--
 7 files changed, 58 insertions(+), 54 deletions(-)

diff --git a/README.md b/README.md
index dba2ec44..01cf5a87 100644
--- a/README.md
+++ b/README.md
@@ -4,41 +4,6 @@ This is the code base for the Reflector demo (formerly called agenda-talk-diff)
 
 The target deliverable is a local-first live transcription and visualization tool to compare a discussion's target agenda/objectives to the actual discussion live.
 
-To setup,
-
-1) Check values in config.ini file. Specifically add your OPENAI_APIKEY if you plan to use OpenAI API requests.
-2) Run ``` export KMP_DUPLICATE_LIB_OK=True``` in Terminal. [This is taken care of in code, but not reflecting, Will fix this issue later.]
-3) Run the script setup_depedencies.sh.
-
-   ``` chmod +x setup_dependencies.sh ```
-
-   ``` sh setup_dependencies.sh ```
-
-
-   ENV refers to the intended environment for JAX. JAX is available in several variants, [CPU | GPU | Colab TPU | Google Cloud TPU]
-
-   ```ENV``` is :
-
-   cpu -> JAX CPU installation
-
-   cuda11 -> JAX CUDA 11.x version
-
-   cuda12 -> JAX CUDA 12.x version (Core Weave has CUDA 12 version, can check with ```nvidia-smi```)
-
-   ```sh setup_dependencies.sh cuda12```
-
-
-4) Run the Whisper-JAX pipeline. Currently, the repo can take a Youtube video and transcribes/summarizes it.
-
-``` python3 whisjax.py "https://www.youtube.com/watch?v=ihf0S97oxuQ"```
-
-You can even run it on local file or a file in your configured S3 bucket.
-
-``` python3 whisjax.py "startup.mp4"```
-
-The script will take care of a few cases like youtube file, local file, video file, audio-only file,
-file in S3, etc. If local file is not present, it can automatically take the file from S3.
-
 **S3 bucket:**
@@ -74,9 +39,52 @@ Download:
 
 If you want to access the S3 artefacts, from another machine, you can either use the python file_util
 with the commands mentioned above or simply use the GUI of AWS Management Console.
 
-**WORKFLOW:**
-
-1) Specify the input source file from a local, youtube link or upload to S3 if needed and pass it as input to the script.If the source file is in
+To setup,
+
+1) Check values in the config.ini file. Specifically, add your OPENAI_APIKEY if you plan to use OpenAI API requests.
+2) Run ```export KMP_DUPLICATE_LIB_OK=True``` in Terminal. [This is taken care of in code but is not taking effect; this will be fixed later.]
+
+NOTE: If you don't already have portaudio installed, run ```brew install portaudio```
+
+3) Run the script setup_dependencies.sh.
+
+   ```chmod +x setup_dependencies.sh```
+
+   ```sh setup_dependencies.sh```
+
+   ENV refers to the intended environment for JAX. JAX is available in several variants: [CPU | GPU | Colab TPU | Google Cloud TPU]
+
+   ```ENV``` is one of:
+
+   cpu -> JAX CPU installation
+
+   cuda11 -> JAX CUDA 11.x version
+
+   cuda12 -> JAX CUDA 12.x version (CoreWeave has CUDA 12; you can check with ```nvidia-smi```)
+
+   ```sh setup_dependencies.sh cuda12```
+
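+NOTE: To check which JAX backend actually got installed, a quick sanity check
+like the following can help (an illustrative snippet, not part of the repo):
+
+```python
+import jax
+
+# Expect "cpu" for the cpu install and "gpu" for the cuda11/cuda12 installs.
+print(jax.default_backend())
+print(jax.devices())
+```
+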
+4) If not already done, install ffmpeg.
+```brew install ffmpeg```
+
+If you hit an NLTK SSL certificate error, see [here](https://stackoverflow.com/questions/38916452/nltk-download-ssl-certificate-verify-failed)
+
+5) Run the Whisper-JAX pipeline. Currently, the repo can take a YouTube video and transcribe/summarize it.
+
+```python3 whisjax.py "https://www.youtube.com/watch?v=ihf0S97oxuQ"```
+
+You can also run it on a local file or a file in your configured S3 bucket.
+
+```python3 whisjax.py "startup.mp4"```
+
+The script takes care of a few cases like a YouTube file, a local file, a video file, an audio-only file,
+a file in S3, etc. If the local file is not present, it can automatically fetch the file from S3.
+
+**OFFLINE WORKFLOW:**
+
+1) Specify the input source as a local file or a YouTube link, or upload it to S3 if needed, and pass it as input to the script. If the source file is in
 ```.m4a``` format, it will get converted to ```.mp4``` automatically.
 2) Keep the agenda header topics in a local file named ```agenda-headers.txt```. This needs to be present where the script is run.
 This version of the pipeline compares covered agenda topics using agenda headers in the following format.
@@ -101,7 +109,6 @@ HTML file, a sample word cloud and uploads them to the S3 bucket
 Other visualizations can be planned based on available artefacts or new ones can be created. Refer the section
 ```Viz-experiments```.
 
-
 **Visualization experiments:**
 
 This is a jupyter notebook playground with template instructions on handling the metadata and data artefacts generated from the
diff --git a/config.ini b/config.ini
index 62699b42..11e4ddcf 100644
--- a/config.ini
+++ b/config.ini
@@ -18,4 +18,4 @@ BEAM_SIZE=6
 MAX_CHUNK_LENGTH=1024
 SUMMARIZE_USING_CHUNKS=YES
 # Audio device
-BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME=ref-agg-input
\ No newline at end of file
+BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME=aggregator
\ No newline at end of file
diff --git a/reflector-local/whisper_summarizer_bart.py b/reflector-local/whisper_summarizer_bart.py
index 13b6c66a..b0de87f7 100644
--- a/reflector-local/whisper_summarizer_bart.py
+++ b/reflector-local/whisper_summarizer_bart.py
@@ -6,7 +6,7 @@ from loguru import logger
 from transformers import BartTokenizer, BartForConditionalGeneration
 import whisper
 import nltk
-nltk.download('punkt')
+nltk.download('punkt', quiet=True)
 
 WHISPER_MODEL_SIZE = "base"
 
diff --git a/requirements.txt b/requirements.txt
index 490ee05d..4bd15883 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -48,4 +48,5 @@ pandas
 jupyter
 seaborn
 matplotlib
-termcolor
\ No newline at end of file
+termcolor
+ffmpeg
\ No newline at end of file
diff --git a/setup_dependencies.sh b/setup_dependencies.sh
index 951dcb38..dab656a7 100755
--- a/setup_dependencies.sh
+++ b/setup_dependencies.sh
@@ -1,4 +1,6 @@
- Upgrade pip
+#!/bin/sh
+
+# Upgrade pip
 pip install --upgrade pip
 
 # Default to CPU Installation of JAX
@@ -27,7 +29,5 @@ pip install --upgrade --no-deps --force-reinstall git+https://github.com/sanchit
 pip install -r requirements.txt
 
 # download spacy models
-export KMP_DUPLICATE_LIB_OK=True
-python -m spacy download en_core_web_sm
-python -m spacy download en_core_web_md
-
+spacy download en_core_web_sm
+spacy download en_core_web_md
diff --git a/text_utilities.py b/text_utilities.py
index 1ff0a70c..0d523dd3 100644
--- a/text_utilities.py
+++ b/text_utilities.py
@@ -7,7 +7,7 @@ from nltk.corpus import stopwords
 from sklearn.feature_extraction.text import TfidfVectorizer
 from nltk.tokenize import word_tokenize
 from sklearn.metrics.pairwise import cosine_similarity
-
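+# Make the 'punkt' tokenizer data needed by word_tokenize available;
+# quiet=True suppresses the downloader's console output on repeat runs.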
+nltk.download('punkt', quiet=True) config = configparser.ConfigParser() config.read('config.ini') @@ -20,7 +20,6 @@ def preprocess_sentence(sentence): def compute_similarity(sent1, sent2): tfidf_vectorizer = TfidfVectorizer() - print("semt1", sent1, sent2) if sent1 is not None and sent2 is not None: tfidf_matrix = tfidf_vectorizer.fit_transform([sent1, sent2]) return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0] diff --git a/whisjax_realtime.py b/whisjax_realtime.py index 6bbe0f19..fb26c3b6 100644 --- a/whisjax_realtime.py +++ b/whisjax_realtime.py @@ -11,12 +11,9 @@ from file_utilities import upload_files from viz_utilities import create_wordcloud, create_talk_diff_scatter_viz from text_utilities import summarize, post_process_transcription from loguru import logger -import nltk import time from termcolor import colored -nltk.download('stopwords', quiet=True) - config = configparser.ConfigParser() config.read('config.ini') @@ -25,7 +22,7 @@ WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"] FRAMES_PER_BUFFER = 8000 FORMAT = pyaudio.paInt16 CHANNELS = 2 -RATE = 44100 +RATE = 96000 RECORD_SECONDS = 15 NOW = datetime.now() @@ -43,7 +40,7 @@ def main(): rate=RATE, input=True, frames_per_buffer=FRAMES_PER_BUFFER, - input_device_index=audio_devices['index'] + input_device_index=int(audio_devices['index']) ) pipeline = FlaxWhisperPipline("openai/whisper-" + config["DEFAULT"]["WHISPER_REAL_TIME_MODEL_SIZE"], @@ -124,7 +121,7 @@ def main(): # S3 : Push artefacts to S3 bucket suffix = NOW.strftime("%m-%d-%Y_%H:%M:%S") files_to_upload = ["real_time_transcript_" + suffix + ".txt", - "real_time_transcript_with_timestamp" + suffix + ".txt", + "real_time_transcript_with_timestamp_" + suffix + ".txt", "real_time_df_" + suffix + ".pkl", "real_time_wordcloud_" + suffix + ".png", "real_time_mappings_" + suffix + ".pkl",
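
A note on the final whisjax_realtime.py hunk: the patch wraps the looked-up device index in ```int(...)```, which suggests the lookup can hand back a non-integer value (the lookup code itself is not shown in this patch). Below is a minimal sketch of how such a device lookup could work with PyAudio; the helper ```find_input_device``` is illustrative, not the repo's actual code, and the device name and rate mirror the patched config.ini and the constants above:

```python
import pyaudio

# Hypothetical helper (the repo's real lookup is not shown in this patch):
# find the first input device whose name contains the configured fragment,
# e.g. BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME=aggregator from config.ini.
def find_input_device(pa: pyaudio.PyAudio, name_fragment: str) -> dict:
    for i in range(pa.get_device_count()):
        info = pa.get_device_info_by_index(i)
        if name_fragment in info["name"] and info["maxInputChannels"] > 0:
            return info
    raise RuntimeError(f"no input device matching {name_fragment!r}")

pa = pyaudio.PyAudio()
audio_devices = find_input_device(pa, "aggregator")

# Device-info fields are not guaranteed to be ints, hence the int() cast
# the patch adds at the call site.
stream = pa.open(
    format=pyaudio.paInt16,
    channels=2,
    rate=96000,                # matches the new RATE for the aggregate device
    input=True,
    frames_per_buffer=8000,
    input_device_index=int(audio_devices["index"]),
)

data = stream.read(8000)       # one buffer of 16-bit stereo samples
stream.stop_stream()
stream.close()
pa.terminate()
```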