Fix stopword handling bug in wordcloud

This commit is contained in:
gokul
2023-06-23 15:35:06 +05:30
parent c8c2f6b6b7
commit 2fe02bb70b
6 changed files with 543 additions and 152 deletions

View File

@@ -27,10 +27,8 @@ To setup,
```sh setup_dependencies.sh cuda12``` ```sh setup_dependencies.sh cuda12```
4) ``` pip install -r requirements.txt```
4) Run the Whisper-JAX pipeline. Currently, the repo can take a YouTube video and transcribe/summarize it.
5) Run the Whisper-JAX pipeline. Currently, the repo can take a YouTube video and transcribe/summarize it.
``` python3 whisjax.py "https://www.youtube.com/watch?v=ihf0S97oxuQ"``` ``` python3 whisjax.py "https://www.youtube.com/watch?v=ihf0S97oxuQ"```

File diff suppressed because one or more lines are too long

View File

@@ -1,4 +1,4 @@
# Upgrade pip # Upgrade pip
pip install --upgrade pip pip install --upgrade pip
# Default to CPU Installation of JAX # Default to CPU Installation of JAX
@@ -24,3 +24,10 @@ pip install git+https://github.com/sanchit-gandhi/whisper-jax.git
# Update to latest version # Update to latest version
pip install --upgrade --no-deps --force-reinstall git+https://github.com/sanchit-gandhi/whisper-jax.git pip install --upgrade --no-deps --force-reinstall git+https://github.com/sanchit-gandhi/whisper-jax.git
pip install -r requirements.txt
# download spacy models
export KMP_DUPLICATE_LIB_OK=True
python -m spacy download en_core_web_sm
python -m spacy download en_core_web_md

View File

@@ -1,6 +1,6 @@
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords as nltk_stopwords from nltk.corpus import stopwords
import collections import collections
import spacy import spacy
import pickle import pickle
@@ -15,7 +15,7 @@ config.read('config.ini')
en = spacy.load('en_core_web_md') en = spacy.load('en_core_web_md')
spacy_stopwords = en.Defaults.stop_words spacy_stopwords = en.Defaults.stop_words
STOPWORDS = set(STOPWORDS).union(set(nltk_stopwords)).union(set(spacy_stopwords)) STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))).union(set(spacy_stopwords))
def create_wordcloud(timestamp, real_time=False): def create_wordcloud(timestamp, real_time=False):
""" """
@@ -195,4 +195,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
width_in_pixels=1000, width_in_pixels=1000,
transform=st.Scalers.dense_rank transform=st.Scalers.dense_rank
) )
open('./scatter_' + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html) if real_time:
open('./real_time_scatter_' + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
else:
open('./scatter_' + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)

View File

@@ -26,8 +26,8 @@ from file_utilities import upload_files, download_files
from viz_utilities import create_wordcloud, create_talk_diff_scatter_viz from viz_utilities import create_wordcloud, create_talk_diff_scatter_viz
from text_utilities import summarize, post_process_transcription from text_utilities import summarize, post_process_transcription
nltk.download('punkt') nltk.download('punkt', quiet=True)
nltk.download('stopwords') nltk.download('stopwords', quiet=True)
# Configurations can be found in config.ini. Set them properly before executing # Configurations can be found in config.ini. Set them properly before executing
config = configparser.ConfigParser() config = configparser.ConfigParser()
@@ -141,7 +141,8 @@ def main():
"transcript_with_timestamp_" + suffix + ".txt", "transcript_with_timestamp_" + suffix + ".txt",
"df_" + suffix + ".pkl", "df_" + suffix + ".pkl",
"wordcloud_" + suffix + ".png", "wordcloud_" + suffix + ".png",
"mappings_" + suffix + ".pkl"] "mappings_" + suffix + ".pkl",
"scatter_" + suffix + ".html"]
upload_files(files_to_upload) upload_files(files_to_upload)
summarize(transcript_text, NOW, False, False) summarize(transcript_text, NOW, False, False)

View File

@@ -12,7 +12,7 @@ from viz_utilities import create_wordcloud, create_talk_diff_scatter_viz
from text_utilities import summarize, post_process_transcription from text_utilities import summarize, post_process_transcription
from loguru import logger from loguru import logger
import nltk import nltk
nltk.download('stopwords') nltk.download('stopwords', quiet=True)
config = configparser.ConfigParser() config = configparser.ConfigParser()
config.read('config.ini') config.read('config.ini')
@@ -118,7 +118,8 @@ def main():
"real_time_transcript_with_timestamp" + suffix + ".txt", "real_time_transcript_with_timestamp" + suffix + ".txt",
"real_time_df_" + suffix + ".pkl", "real_time_df_" + suffix + ".pkl",
"real_time_wordcloud_" + suffix + ".png", "real_time_wordcloud_" + suffix + ".png",
"real_time_mappings_" + suffix + ".pkl"] "real_time_mappings_" + suffix + ".pkl",
"real_time_scatter_" + suffix + ".html"]
upload_files(files_to_upload) upload_files(files_to_upload)
summarize(transcript_with_timestamp["text"], NOW, True, True) summarize(transcript_with_timestamp["text"], NOW, True, True)