mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-20 20:29:06 +00:00
Setup pipeline in new Mac and make changes to setup
This commit is contained in:
83
README.md
83
README.md
@@ -4,41 +4,6 @@ This is the code base for the Reflector demo (formerly called agenda-talk-diff)
|
||||
|
||||
The target deliverable is a local-first live transcription and visualization tool to compare a discussion's target agenda/objectives to the actual discussion live.
|
||||
|
||||
To setup,
|
||||
|
||||
1) Check values in config.ini file. Specifically add your OPENAI_APIKEY if you plan to use OpenAI API requests.
|
||||
2) Run ``` export KMP_DUPLICATE_LIB_OK=True``` in Terminal. [The code also sets this, but the setting is not taking effect yet; this will be fixed later.]
|
||||
3) Run the script setup_dependencies.sh.
|
||||
|
||||
``` chmod +x setup_dependencies.sh ```
|
||||
|
||||
``` sh setup_dependencies.sh <ENV>```
|
||||
|
||||
|
||||
ENV refers to the intended environment for JAX. JAX is available in several variants, [CPU | GPU | Colab TPU | Google Cloud TPU]
|
||||
|
||||
```ENV``` is :
|
||||
|
||||
cpu -> JAX CPU installation
|
||||
|
||||
cuda11 -> JAX CUDA 11.x version
|
||||
|
||||
cuda12 -> JAX CUDA 12.x version (Core Weave has CUDA 12 version, can check with ```nvidia-smi```)
|
||||
|
||||
```sh setup_dependencies.sh cuda12```
|
||||
|
||||
|
||||
4) Run the Whisper-JAX pipeline. Currently, the repo can take a YouTube video and transcribe/summarize it.
|
||||
|
||||
``` python3 whisjax.py "https://www.youtube.com/watch?v=ihf0S97oxuQ"```
|
||||
|
||||
You can even run it on a local file or a file in your configured S3 bucket.
|
||||
|
||||
``` python3 whisjax.py "startup.mp4"```
|
||||
|
||||
The script will take care of a few cases like youtube file, local file, video file, audio-only file,
|
||||
file in S3, etc. If the local file is not present, the script can automatically fetch the file from S3.
|
||||
|
||||
|
||||
**S3 bucket:**
|
||||
|
||||
@@ -74,9 +39,52 @@ Download:
|
||||
If you want to access the S3 artefacts, from another machine, you can either use the python file_util with the commands
|
||||
mentioned above or simply use the GUI of AWS Management Console.
|
||||
|
||||
**WORKFLOW:**
|
||||
|
||||
1) Specify the input source file from a local path or a YouTube link, or upload it to S3 if needed, and pass it as input to the script. If the source file is in
|
||||
To setup,
|
||||
|
||||
1) Check values in config.ini file. Specifically add your OPENAI_APIKEY if you plan to use OpenAI API requests.
|
||||
2) Run ``` export KMP_DUPLICATE_LIB_OK=True``` in Terminal. [The code also sets this, but the setting is not taking effect yet; this will be fixed later.]
|
||||
|
||||
NOTE: If you don't have portaudio installed already, run ```brew install portaudio```
|
||||
|
||||
3) Run the script setup_dependencies.sh.
|
||||
|
||||
``` chmod +x setup_dependencies.sh ```
|
||||
|
||||
``` sh setup_dependencies.sh <ENV>```
|
||||
|
||||
|
||||
ENV refers to the intended environment for JAX. JAX is available in several variants, [CPU | GPU | Colab TPU | Google Cloud TPU]
|
||||
|
||||
```ENV``` is :
|
||||
|
||||
cpu -> JAX CPU installation
|
||||
|
||||
cuda11 -> JAX CUDA 11.x version
|
||||
|
||||
cuda12 -> JAX CUDA 12.x version (Core Weave has CUDA 12 version, can check with ```nvidia-smi```)
|
||||
|
||||
```sh setup_dependencies.sh cuda12```
|
||||
|
||||
4) If not already done, install ffmpeg. ```brew install ffmpeg```
|
||||
|
||||
For NLTK SSL error, check [here](https://stackoverflow.com/questions/38916452/nltk-download-ssl-certificate-verify-failed)
|
||||
|
||||
|
||||
5) Run the Whisper-JAX pipeline. Currently, the repo can take a YouTube video and transcribe/summarize it.
|
||||
|
||||
``` python3 whisjax.py "https://www.youtube.com/watch?v=ihf0S97oxuQ"```
|
||||
|
||||
You can even run it on a local file or a file in your configured S3 bucket.
|
||||
|
||||
``` python3 whisjax.py "startup.mp4"```
|
||||
|
||||
The script will take care of a few cases like youtube file, local file, video file, audio-only file,
|
||||
file in S3, etc. If the local file is not present, the script can automatically fetch the file from S3.
|
||||
|
||||
**OFFLINE WORKFLOW:**
|
||||
|
||||
1) Specify the input source file from a local path or a YouTube link, or upload it to S3 if needed, and pass it as input to the script. If the source file is in
|
||||
```.m4a``` format, it will get converted to ```.mp4``` automatically.
|
||||
2) Keep the agenda header topics in a local file named ```agenda-headers.txt```. This needs to be present where the script is run.
|
||||
This version of the pipeline compares covered agenda topics using agenda headers in the following format.
|
||||
@@ -101,7 +109,6 @@ HTML file, a sample word cloud and uploads them to the S3 bucket
|
||||
Other visualizations can be planned based on available artefacts or new ones can be created. Refer to the section ```Viz-experiments```.
|
||||
|
||||
|
||||
|
||||
**Visualization experiments:**
|
||||
|
||||
This is a jupyter notebook playground with template instructions on handling the metadata and data artefacts generated from the
|
||||
|
||||
@@ -18,4 +18,4 @@ BEAM_SIZE=6
|
||||
MAX_CHUNK_LENGTH=1024
|
||||
SUMMARIZE_USING_CHUNKS=YES
|
||||
# Audio device
|
||||
BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME=ref-agg-input
|
||||
BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME=aggregator
|
||||
@@ -6,7 +6,7 @@ from loguru import logger
|
||||
from transformers import BartTokenizer, BartForConditionalGeneration
|
||||
import whisper
|
||||
import nltk
|
||||
nltk.download('punkt')
|
||||
nltk.download('punkt', quiet=True)
|
||||
|
||||
WHISPER_MODEL_SIZE = "base"
|
||||
|
||||
|
||||
@@ -48,4 +48,5 @@ pandas
|
||||
jupyter
|
||||
seaborn
|
||||
matplotlib
|
||||
termcolor
|
||||
termcolor
|
||||
ffmpeg
|
||||
@@ -1,4 +1,6 @@
|
||||
Upgrade pip
|
||||
#!/bin/sh
|
||||
|
||||
# Upgrade pip
|
||||
pip install --upgrade pip
|
||||
|
||||
# Default to CPU Installation of JAX
|
||||
@@ -27,7 +29,5 @@ pip install --upgrade --no-deps --force-reinstall git+https://github.com/sanchit
|
||||
pip install -r requirements.txt
|
||||
|
||||
# download spacy models
|
||||
export KMP_DUPLICATE_LIB_OK=True
|
||||
python -m spacy download en_core_web_sm
|
||||
python -m spacy download en_core_web_md
|
||||
|
||||
spacy download en_core_web_sm
|
||||
spacy download en_core_web_md
|
||||
|
||||
@@ -7,7 +7,7 @@ from nltk.corpus import stopwords
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from nltk.tokenize import word_tokenize
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
nltk.download('punkt', quiet=True)
|
||||
|
||||
config = configparser.ConfigParser()
|
||||
config.read('config.ini')
|
||||
@@ -20,7 +20,6 @@ def preprocess_sentence(sentence):
|
||||
|
||||
def compute_similarity(sent1, sent2):
|
||||
tfidf_vectorizer = TfidfVectorizer()
|
||||
print("semt1", sent1, sent2)
|
||||
if sent1 is not None and sent2 is not None:
|
||||
tfidf_matrix = tfidf_vectorizer.fit_transform([sent1, sent2])
|
||||
return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
|
||||
|
||||
@@ -11,12 +11,9 @@ from file_utilities import upload_files
|
||||
from viz_utilities import create_wordcloud, create_talk_diff_scatter_viz
|
||||
from text_utilities import summarize, post_process_transcription
|
||||
from loguru import logger
|
||||
import nltk
|
||||
import time
|
||||
from termcolor import colored
|
||||
|
||||
nltk.download('stopwords', quiet=True)
|
||||
|
||||
config = configparser.ConfigParser()
|
||||
config.read('config.ini')
|
||||
|
||||
@@ -25,7 +22,7 @@ WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"]
|
||||
FRAMES_PER_BUFFER = 8000
|
||||
FORMAT = pyaudio.paInt16
|
||||
CHANNELS = 2
|
||||
RATE = 44100
|
||||
RATE = 96000
|
||||
RECORD_SECONDS = 15
|
||||
NOW = datetime.now()
|
||||
|
||||
@@ -43,7 +40,7 @@ def main():
|
||||
rate=RATE,
|
||||
input=True,
|
||||
frames_per_buffer=FRAMES_PER_BUFFER,
|
||||
input_device_index=audio_devices['index']
|
||||
input_device_index=int(audio_devices['index'])
|
||||
)
|
||||
|
||||
pipeline = FlaxWhisperPipline("openai/whisper-" + config["DEFAULT"]["WHISPER_REAL_TIME_MODEL_SIZE"],
|
||||
@@ -124,7 +121,7 @@ def main():
|
||||
# S3 : Push artefacts to S3 bucket
|
||||
suffix = NOW.strftime("%m-%d-%Y_%H:%M:%S")
|
||||
files_to_upload = ["real_time_transcript_" + suffix + ".txt",
|
||||
"real_time_transcript_with_timestamp" + suffix + ".txt",
|
||||
"real_time_transcript_with_timestamp_" + suffix + ".txt",
|
||||
"real_time_df_" + suffix + ".pkl",
|
||||
"real_time_wordcloud_" + suffix + ".png",
|
||||
"real_time_mappings_" + suffix + ".pkl",
|
||||
|
||||
Reference in New Issue
Block a user