fix bug in wordcloud

2026-02-04 09:56:47 +00:00 · 2023-06-23 15:35:06 +05:30
parent c8c2f6b6b7
commit 2fe02bb70b
6 changed files with 543 additions and 152 deletions
--- a/README.md
+++ b/README.md
@@ -27,10 +27,8 @@ To setup,

    ```sh setup_dependencies.sh cuda12```

-4) ``` pip install -r requirements.txt```

-
-5) Run the Whisper-JAX pipeline. Currently, the repo can take a Youtube video and transcribes/summarizes it.
+4) Run the Whisper-JAX pipeline. Currently, the repo can take a YouTube video and transcribe/summarize it.

 ``` python3 whisjax.py "https://www.youtube.com/watch?v=ihf0S97oxuQ"```

--- a/Viz-experiments.ipynb
+++ b/Viz-experiments.ipynb
--- a/setup_dependencies.sh
+++ b/setup_dependencies.sh
@@ -1,4 +1,4 @@
-# Upgrade pip
+# Upgrade pip
 pip install --upgrade pip

 # Default to CPU Installation of JAX
@@ -24,3 +24,10 @@ pip install git+https://github.com/sanchit-gandhi/whisper-jax.git
 # Update to latest version
 pip install --upgrade --no-deps --force-reinstall git+https://github.com/sanchit-gandhi/whisper-jax.git

+pip install -r requirements.txt
+
+# download spacy models
+export KMP_DUPLICATE_LIB_OK=True
+python -m spacy download en_core_web_sm
+python -m spacy download en_core_web_md
+
--- a/viz_utilities.py
+++ b/viz_utilities.py
@@ -1,6 +1,6 @@
 import matplotlib.pyplot as plt
 from wordcloud import WordCloud, STOPWORDS
-from nltk.corpus import stopwords as nltk_stopwords
+from nltk.corpus import stopwords
 import collections
 import spacy
 import pickle
@@ -15,7 +15,7 @@ config.read('config.ini')
 en = spacy.load('en_core_web_md')
 spacy_stopwords = en.Defaults.stop_words

-STOPWORDS = set(STOPWORDS).union(set(nltk_stopwords)).union(set(spacy_stopwords))
+STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))).union(set(spacy_stopwords))

 def create_wordcloud(timestamp, real_time=False):
    """
@@ -195,4 +195,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
        width_in_pixels=1000,
        transform=st.Scalers.dense_rank
    )
-    open('./scatter_' + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
+    if real_time:
+        open('./real_time_scatter_' + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
+    else:
+        open('./scatter_' + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
--- a/whisjax.py
+++ b/whisjax.py
@@ -26,8 +26,8 @@ from file_utilities import upload_files, download_files
 from viz_utilities import create_wordcloud, create_talk_diff_scatter_viz
 from text_utilities import summarize, post_process_transcription

-nltk.download('punkt')
-nltk.download('stopwords')
+nltk.download('punkt', quiet=True)
+nltk.download('stopwords', quiet=True)

 # Configurations can be found in config.ini. Set them properly before executing
 config = configparser.ConfigParser()
@@ -141,7 +141,8 @@ def main():
                       "transcript_with_timestamp_" + suffix + ".txt",
                       "df_" + suffix + ".pkl",
                       "wordcloud_" + suffix + ".png",
-                       "mappings_" + suffix + ".pkl"]
+                       "mappings_" + suffix + ".pkl",
+                       "scatter_" + suffix + ".html"]
    upload_files(files_to_upload)

    summarize(transcript_text, NOW, False, False)
--- a/whisjax_realtime.py
+++ b/whisjax_realtime.py
@@ -12,7 +12,7 @@ from viz_utilities import create_wordcloud, create_talk_diff_scatter_viz
 from text_utilities import summarize, post_process_transcription
 from loguru import logger
 import nltk
-nltk.download('stopwords')
+nltk.download('stopwords', quiet=True)

 config = configparser.ConfigParser()
 config.read('config.ini')
@@ -118,7 +118,8 @@ def main():
                       "real_time_transcript_with_timestamp" + suffix + ".txt",
                       "real_time_df_" + suffix + ".pkl",
                       "real_time_wordcloud_" + suffix + ".png",
-                       "real_time_mappings_" + suffix + ".pkl"]
+                       "real_time_mappings_" + suffix + ".pkl",
+                       "real_time_scatter_" + suffix + ".html"]
    upload_files(files_to_upload)

    summarize(transcript_with_timestamp["text"], NOW, True, True)