Merge pull request #10 from Monadical-SAS/whisper-jax-gokul

Add new features
This commit is contained in:
projects-g
2023-06-14 19:38:03 +05:30
committed by GitHub
5 changed files with 510 additions and 17 deletions

View File

@@ -73,23 +73,42 @@ Download:
``` python3 file_util.py download <object_name_in_S3_bucket>```
If you want to access the S3 artefacts from another machine, you can either use the python file_util with the commands
mentioned above or simply use the GUI of the AWS Management Console.
**WORKFLOW:**
1) Specify the input source file from a local file or a YouTube link, or upload it to S3 if needed, and pass it as input to the script. If the source file is in
```.m4a``` format, it will get converted to ```.mp4``` automatically.
2) Keep the agenda header topics in a local file named ```agenda-headers.txt```. This needs to be present where the script is run.
This version of the pipeline compares covered agenda topics using agenda headers in the following format (see the example file below this list):
   1) ```agenda_topic : <short description>```
3) Check all the values in ```config.ini```. You need to predefine the 2 categories used for the default topic modelling scatter plot
visualization in the config file. From the dataframe artefact ```df.pkl```, you can load the df and choose different topics to plot,
filter the transcriptions by certain words, and see the top influencers and characteristics of each topic chosen to plot in the
interactive HTML document. I have added a new Jupyter notebook that gives a base template to play around with, named
```Viz-experiments.ipynb```; a starter sketch appears in the **Visualization experiments** section below.
4) Run the script. The script automatically transcribes, summarizes and creates a scatter plot of words & topics in the form of an interactive
HTML file and a sample word cloud, and uploads them to the S3 bucket.
5) Additional artefacts pushed to S3:
   1) HTML visualization file
   2) pandas df in pickle format for others to collaborate and make their own visualizations
   3) Summary, transcript and transcript with timestamps files in text format.
The script also creates 2 types of mappings:
   1) Timestamp -> the top 2 matched agenda topics
   2) Topic -> all matched timestamps in the transcription
Other visualizations can be planned based on available artefacts or new ones can be created. Refer to the **Visualization experiments** section below.
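For illustration, a minimal ```agenda-headers.txt``` in that format might look like the following. The topic names and descriptions are made up for this example; ```TAM``` and ```Churn``` simply mirror the categories referenced in ```config.ini```:

```
TAM : sizing the total addressable market for the product
Churn : reviewing customer churn numbers and retention ideas
next_steps : agreeing on action items and owners
```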
**Visualization experiments:**
This is a Jupyter notebook playground with template instructions on handling the metadata and data artefacts generated by the
pipeline. Follow the instructions given and tweak your own logic into it, or use it as a playground to experiment with libraries and
visualizations on top of the metadata.
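As a concrete starting point, here is a minimal sketch of the kind of exploration the notebook template supports. It assumes ```df.pkl``` and ```mappings.pkl``` have been downloaded from the S3 bucket; the ```text``` column and the order of the pickled mappings come from the pipeline script, while the search word ```pricing``` is just an example:

```python
import pickle

import pandas as pd

# Load the dataframe artefact produced by the pipeline
df = pd.read_pickle("df.pkl")

# Unpack the four mappings in the order the script pickles them
with open("mappings.pkl", "rb") as f:
    (ts_to_topic_top_1, ts_to_topic_top_2,
     topic_to_ts_top_1, topic_to_ts_top_2) = pickle.load(f)

# Filter transcript rows that mention a word of interest
hits = df[df["text"].str.contains("pricing", case=False, na=False)]
print(hits.head())

# Show every timestamp where each agenda topic was the best match
for topic, timestamps in topic_to_ts_top_1.items():
    print(topic, "->", timestamps)
```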
NEXT STEPS:

Viz-experiments.ipynb Normal file

File diff suppressed because one or more lines are too long

View File

@@ -7,4 +7,10 @@ OPENAI_APIKEY=
WHISPER_MODEL_SIZE=tiny
AWS_ACCESS_KEY=***REMOVED***
AWS_SECRET_KEY=***REMOVED***
BUCKET_NAME='reflector-bucket'
# For the topic modelling viz chart
CATEGORY_1=TAM
CATEGORY_1_NAME=TAM
CATEGORY_2_NAME=Churn
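These keys live in the ```DEFAULT``` section, so the script reads them through ```config["DEFAULT"]``` (as the script diff below shows); a minimal standalone read would look like this:

```python
import configparser

config = configparser.ConfigParser()
config.read("config.ini")

category = config["DEFAULT"]["CATEGORY_1"]                # "TAM"
category_name = config["DEFAULT"]["CATEGORY_1_NAME"]      # "TAM"
not_category_name = config["DEFAULT"]["CATEGORY_2_NAME"]  # "Churn"
```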

View File

@@ -44,4 +44,7 @@ nltk==3.8.1
wordcloud
spacy
scattertext
pandas
jupyter
seaborn
matplotlib

View File

@@ -6,6 +6,7 @@
import argparse
import ast
import collections
import configparser
import jax.numpy as jnp
import matplotlib.pyplot as plt
@@ -13,7 +14,9 @@ import moviepy.editor
import moviepy.editor
import nltk
import os
import subprocess
import pandas as pd
import pickle
import re
import scattertext as st
import spacy
@@ -35,6 +38,7 @@ config.read('config.ini')
WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"]
def init_argparse() -> argparse.ArgumentParser:
    """
    Parse the CLI arguments
@@ -156,8 +160,8 @@ def create_talk_diff_scatter_viz():
    ts_to_topic_mapping_top_2 = {}
    # Also create a mapping of the different timestamps in which each topic was covered
-   topic_to_ts_mapping_top_1 = {}
-   topic_to_ts_mapping_top_2 = {}
+   topic_to_ts_mapping_top_1 = collections.defaultdict(list)
+   topic_to_ts_mapping_top_2 = collections.defaultdict(list)
    similarity_threshold = 0.7
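For context, switching ```topic_to_ts_mapping_top_*``` from plain dicts to ```collections.defaultdict(list)``` is what lets a topic accumulate every matching timestamp instead of each new assignment overwriting the previous one; a small standalone illustration (the keys and timestamps here are invented):

```python
import collections

plain = {}
plain["intro"] = "00:01"
plain["intro"] = "07:42"        # plain dict: the second write overwrites the first
print(plain)                    # {'intro': '07:42'}

multi = collections.defaultdict(list)
multi["intro"].append("00:01")  # a missing key starts out as an empty list
multi["intro"].append("07:42")
print(dict(multi))              # {'intro': ['00:01', '07:42']}
```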
@@ -178,12 +182,11 @@ def create_talk_diff_scatter_viz():
            # top1 match
            if i == 0:
                ts_to_topic_mapping_top_1[c["timestamp"]] = agenda_topics[topic_similarities[i][0]]
-               topic_to_ts_mapping_top_1[agenda_topics[topic_similarities[i][0]]] = c["timestamp"]
+               topic_to_ts_mapping_top_1[agenda_topics[topic_similarities[i][0]]].append(c["timestamp"])
            # top2 match
            else:
                ts_to_topic_mapping_top_2[c["timestamp"]] = agenda_topics[topic_similarities[i][0]]
-               topic_to_ts_mapping_top_2[agenda_topics[topic_similarities[i][0]]] = c["timestamp"]
+               topic_to_ts_mapping_top_2[agenda_topics[topic_similarities[i][0]]].append(c["timestamp"])
    def create_new_columns(record):
        """
@@ -210,9 +213,15 @@ def create_talk_diff_scatter_viz():
print("", item) print("", item)
print("📊 Coverage: {:.2f}%".format(percentage_covered)) print("📊 Coverage: {:.2f}%".format(percentage_covered))
# Save df for further experimentation # Save df, mappings for further experimentation
df.to_pickle("df.pkl") df.to_pickle("df.pkl")
    my_mappings = [ts_to_topic_mapping_top_1, ts_to_topic_mapping_top_2,
                   topic_to_ts_mapping_top_1, topic_to_ts_mapping_top_2]
    pickle.dump(my_mappings, open("mappings.pkl", "wb"))
    # to load: my_mappings = pickle.load(open("mappings.pkl", "rb"))
    # Scatter plot of topics
    df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
    corpus = st.CorpusFromParsedDocuments(
@@ -220,13 +229,16 @@ def create_talk_diff_scatter_viz():
    ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
    html = st.produce_scattertext_explorer(
        corpus,
-       category='TAM', category_name='TAM', not_category_name='Churn',
+       category=config["DEFAULT"]["CATEGORY_1"],
+       category_name=config["DEFAULT"]["CATEGORY_1_NAME"],
+       not_category_name=config["DEFAULT"]["CATEGORY_2_NAME"],
        minimum_term_frequency=0, pmi_threshold_coefficient=0,
        width_in_pixels=1000,
        transform=st.Scalers.dense_rank
    )
    open('./demo_compact.html', 'w').write(html)
def main():
    parser = init_argparse()
    args = parser.parse_args()
@@ -261,6 +273,10 @@ def main():
        # If file is not present locally, take it from S3 bucket
        if not os.path.exists(media_file):
            download_files([media_file])
        # Convert .m4a audio to .mp4; ffmpeg appends the new extension to the original name
        if media_file.endswith(".m4a"):
            subprocess.run(["ffmpeg", "-i", media_file, f"{media_file}.mp4"])
            input_file = f"{media_file}.mp4"
    else:
        print("Unsupported URL scheme: " + url.scheme)
        quit()
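For reference, the ```subprocess``` call above is equivalent to running ffmpeg directly; note that the output name simply appends ```.mp4``` to the original filename (a hypothetical ```meeting.m4a``` becomes ```meeting.m4a.mp4```):

```
ffmpeg -i meeting.m4a meeting.m4a.mp4
```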
@@ -291,7 +307,7 @@ def main():
    if args.transcript:
        logger.info(f"Saving transcript to: {args.transcript}")
        transcript_file = open(args.transcript, "w")
        # Strip the ".txt" suffix and write a companion *_timestamps.txt file
        transcript_file_timestamps = open(args.transcript[0:len(args.transcript) - 4] + "_timestamps.txt", "w")
        transcript_file.write(whisper_result["text"])
        transcript_file_timestamps.write(str(whisper_result))
        transcript_file.close()
@@ -306,7 +322,7 @@ def main():
    # S3 : Push artefacts to S3 bucket
    files_to_upload = ["transcript.txt", "transcript_timestamps.txt",
                       "demo_compact.html", "df.pkl",
-                      "wordcloud.png"]
+                      "wordcloud.png", "mappings.pkl"]
    upload_files(files_to_upload)
    # Summarize the generated transcript using the BART model