mirror of https://github.com/Monadical-SAS/reflector.git
synced 2025-12-20 20:29:06 +00:00
Merge pull request #10 from Monadical-SAS/whisper-jax-gokul
Add new features
This commit is contained in:
README.md (31 lines changed)
@@ -73,23 +73,42 @@ Download:
``` python3 file_util.py download <object_name_in_S3_bucket>```
If you want to access the S3 artefacts from another machine, you can either use the python file_util with the commands
mentioned above or simply use the GUI of the AWS Management Console.
**WORKFLOW:**

-1) Specify the input source file from a local path or a youtube link, or upload it to S3 if needed, and pass it as input to the script.
-2) Keep the agenda header topics in a local file named "agenda-headers.txt". This needs to be present where the script is run.
-3) Run the script. The script automatically transcribes, summarizes and creates a scatter plot of words & topics in the form of an interactive
+1) Specify the input source file from a local path or a youtube link, or upload it to S3 if needed, and pass it as input to the script. If the source file is in
+```.m4a``` format, it will be converted to ```.mp4``` automatically.
+2) Keep the agenda header topics in a local file named ```agenda-headers.txt```. This needs to be present where the script is run.
+This version of the pipeline compares covered agenda topics using agenda headers in the following format (see the example file after this list):
+   1) ```agenda_topic : <short description>```
+3) Check all the values in ```config.ini```. You need to predefine the 2 categories for the scatter plot of the
+topic modelling visualization in the config file. This is the default visualization, but from the dataframe artefact called
+```df.pkl``` you can load the df and choose different topics to plot. You can filter the transcriptions by specific words,
+and you can see the top influencers and the characteristic terms of each plotted topic in the
+interactive HTML document. A new jupyter notebook named ```Viz-experiments.ipynb``` gives a base template to play around with.
+4) Run the script. The script automatically transcribes, summarizes and creates a scatter plot of words & topics in the form of an interactive
HTML file and a sample word cloud, and uploads them to the S3 bucket.
-4) Additional artefacts pushed to S3:
+5) Additional artefacts pushed to S3:
-  1) HTML visualiztion file
+  1) HTML visualization file
  2) pandas df in pickle format for others to collaborate and make their own visualizations
  3) Summary, transcript and transcript-with-timestamps files in text format.
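For illustration, step 2's ```agenda-headers.txt``` might look like the following; the topic names and descriptions are invented, and this assumes ```agenda_topic``` in the format above is a placeholder for the actual topic name:

```
TAM : sizing the total addressable market
Churn : monthly churn and retention numbers
Roadmap : feature priorities for next quarter
```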
The script also creates 2 types of mappings (a loading sketch follows the list):
1) Timestamp -> the top 2 matched agenda topics
2) Topic -> all matched timestamps in the transcription
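These four mappings are pickled to ```mappings.pkl``` (see the whisjax.py diff below). A minimal loading sketch; the unpacking order matches the list dumped in ```create_talk_diff_scatter_viz```, and the topic key is hypothetical:

```python
import pickle

# mappings.pkl holds a four-element list, in the order dumped by whisjax.py
with open("mappings.pkl", "rb") as f:
    (ts_to_topic_top_1, ts_to_topic_top_2,
     topic_to_ts_top_1, topic_to_ts_top_2) = pickle.load(f)

# All timestamps at which a given agenda topic was the best match
print(topic_to_ts_top_1["TAM"])
```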
-Other visualizations can be planned based on available artefacts or new ones can be created.
+Other visualizations can be planned based on available artefacts or new ones can be created. Refer to the section ```Viz-experiments```.
+**Visualization experiments:**
+
+This is a jupyter notebook playground with template instructions on handling the metadata and data artefacts generated from the
+pipeline. Follow the instructions given and tweak your own logic into it, or use it as a playground to experiment with libraries and
+visualizations on top of the metadata.
NEXT STEPS:
Viz-experiments.ipynb (new file, 449 lines)

File diff suppressed because one or more lines are too long
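The notebook itself is suppressed above, but the `df.pkl` experimentation it templates (workflow step 3) boils down to rebuilding the scattertext corpus with categories of your choice. A minimal sketch, assuming the pickled dataframe keeps the `text` column the script parses and a category column (the name `category` is a guess; inspect `df.columns` for the real one):

```python
import pandas as pd
import scattertext as st

df = pd.read_pickle("df.pkl")

# Re-parse the transcript text and rebuild the corpus, mirroring whisjax.py
df = df.assign(parse=df.text.apply(st.whitespace_nlp_with_sentences))
corpus = st.CorpusFromParsedDocuments(
    df, category_col="category", parsed_col="parse"
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))

# Plot any pair of categories present in the dataframe
html = st.produce_scattertext_explorer(
    corpus,
    category="TAM", category_name="TAM", not_category_name="Churn",
    minimum_term_frequency=0, pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    transform=st.Scalers.dense_rank,
)
open("my_viz.html", "w").write(html)
```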
config.ini

@@ -7,4 +7,10 @@ OPENAI_APIKEY=
WHISPER_MODEL_SIZE=tiny
AWS_ACCESS_KEY=***REMOVED***
AWS_SECRET_KEY=***REMOVED***
BUCKET_NAME='reflector-bucket'
+
+# For the topic modelling viz chart
+CATEGORY_1=TAM
+CATEGORY_1_NAME=TAM
+CATEGORY_2_NAME=Churn
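whisjax.py reads these keys through `configparser`; a minimal sketch of the lookup, assuming the file opens with a `[DEFAULT]` section header above the lines shown (configparser needs one, and the script indexes `config['DEFAULT']`):

```python
import configparser

config = configparser.ConfigParser()
config.read("config.ini")

# The two scatter-plot categories come straight from these keys
category = config["DEFAULT"]["CATEGORY_1"]                # "TAM"
category_name = config["DEFAULT"]["CATEGORY_1_NAME"]      # "TAM"
not_category_name = config["DEFAULT"]["CATEGORY_2_NAME"]  # "Churn"
```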
requirements.txt

@@ -44,4 +44,7 @@ nltk==3.8.1
wordcloud
spacy
scattertext
pandas
+jupyter
+seaborn
+matplotlib
whisjax.py (34 lines changed)
@@ -6,6 +6,7 @@
import argparse
import ast
+import collections
import configparser
import jax.numpy as jnp
import matplotlib.pyplot as plt
@@ -13,7 +14,9 @@ import moviepy.editor
import moviepy.editor
import nltk
import os
+import subprocess
import pandas as pd
+import pickle
import re
import scattertext as st
import spacy
@@ -35,6 +38,7 @@ config.read('config.ini')
WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"]


def init_argparse() -> argparse.ArgumentParser:
    """
    Parse the CLI arguments
@@ -156,8 +160,8 @@ def create_talk_diff_scatter_viz():
    ts_to_topic_mapping_top_2 = {}

    # Also create a mapping of the different timestamps in which each topic was covered
-    topic_to_ts_mapping_top_1 = {}
-    topic_to_ts_mapping_top_2 = {}
+    topic_to_ts_mapping_top_1 = collections.defaultdict(list)
+    topic_to_ts_mapping_top_2 = collections.defaultdict(list)

    similarity_threshold = 0.7
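The switch to `collections.defaultdict(list)` pairs with the `.append()` calls in the next hunk: the old plain-dict assignment kept only the last timestamp per topic, while the new code accumulates all of them. A toy illustration with an invented topic and timestamps:

```python
import collections

topic_to_ts = collections.defaultdict(list)
for ts in ["00:01:10", "00:07:42"]:
    # Missing keys start as [], so every matched timestamp is kept;
    # the old `topic_to_ts[topic] = ts` overwrote earlier matches.
    topic_to_ts["pricing"].append(ts)

print(dict(topic_to_ts))  # {'pricing': ['00:01:10', '00:07:42']}
```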
@@ -178,12 +182,11 @@ def create_talk_diff_scatter_viz():
            # top1 match
            if i == 0:
                ts_to_topic_mapping_top_1[c["timestamp"]] = agenda_topics[topic_similarities[i][0]]
-                topic_to_ts_mapping_top_1[agenda_topics[topic_similarities[i][0]]] = c["timestamp"]
+                topic_to_ts_mapping_top_1[agenda_topics[topic_similarities[i][0]]].append(c["timestamp"])
            # top2 match
            else:
                ts_to_topic_mapping_top_2[c["timestamp"]] = agenda_topics[topic_similarities[i][0]]
-                topic_to_ts_mapping_top_2[agenda_topics[topic_similarities[i][0]]] = c["timestamp"]
+                topic_to_ts_mapping_top_2[agenda_topics[topic_similarities[i][0]]].append(c["timestamp"])


    def create_new_columns(record):
        """
@@ -210,9 +213,15 @@ def create_talk_diff_scatter_viz():
print("❌ ", item)
|
print("❌ ", item)
|
||||||
print("📊 Coverage: {:.2f}%".format(percentage_covered))
|
print("📊 Coverage: {:.2f}%".format(percentage_covered))
|
||||||
|
|
||||||
# Save df for further experimentation
|
# Save df, mappings for further experimentation
|
||||||
df.to_pickle("df.pkl")
|
df.to_pickle("df.pkl")
|
||||||
|
|
||||||
|
my_mappings = [ts_to_topic_mapping_top_1, ts_to_topic_mapping_top_2,
|
||||||
|
topic_to_ts_mapping_top_1, topic_to_ts_mapping_top_2]
|
||||||
|
pickle.dump(my_mappings, open("mappings.pkl", "wb"))
|
||||||
|
|
||||||
|
# to load, my_mappings = pickle.load( open ("mappings.pkl", "rb") )
|
||||||
|
|
||||||
# Scatter plot of topics
|
# Scatter plot of topics
|
||||||
df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
|
df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
|
||||||
corpus = st.CorpusFromParsedDocuments(
|
corpus = st.CorpusFromParsedDocuments(
|
||||||
@@ -220,13 +229,16 @@ def create_talk_diff_scatter_viz():
    ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
    html = st.produce_scattertext_explorer(
        corpus,
-        category='TAM', category_name='TAM', not_category_name='Churn',
+        category=config["DEFAULT"]["CATEGORY_1"],
+        category_name=config["DEFAULT"]["CATEGORY_1_NAME"],
+        not_category_name=config["DEFAULT"]["CATEGORY_2_NAME"],
        minimum_term_frequency=0, pmi_threshold_coefficient=0,
        width_in_pixels=1000,
        transform=st.Scalers.dense_rank
    )
    open('./demo_compact.html', 'w').write(html)


def main():
    parser = init_argparse()
    args = parser.parse_args()
@@ -261,6 +273,10 @@ def main():
        # If file is not present locally, take it from S3 bucket
        if not os.path.exists(media_file):
            download_files([media_file])
+
+        if media_file.endswith(".m4a"):
+            subprocess.run(["ffmpeg", "-i", media_file, f"{media_file}.mp4"])
+            input_file = f"{media_file}.mp4"
    else:
        print("Unsupported URL scheme: " + url.scheme)
        quit()
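A standalone sketch of the new conversion step, with a hypothetical file name. `check=True` is an addition here (the diff does not pass it) so a failed ffmpeg run raises instead of handing a missing file downstream; note the output name simply appends `.mp4`, so `talk.m4a` becomes `talk.m4a.mp4`:

```python
import subprocess

media_file = "talk.m4a"  # hypothetical input
if media_file.endswith(".m4a"):
    # ffmpeg infers input/output formats from the file extensions
    subprocess.run(["ffmpeg", "-i", media_file, f"{media_file}.mp4"], check=True)
    input_file = f"{media_file}.mp4"  # -> "talk.m4a.mp4"
```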
@@ -291,7 +307,7 @@ def main():
    if args.transcript:
        logger.info(f"Saving transcript to: {args.transcript}")
        transcript_file = open(args.transcript, "w")
-        transcript_file_timestamps = open(args.transcript[0:len(args.transcript)-4] + "_timestamps.txt", "w")
+        transcript_file_timestamps = open(args.transcript[0:len(args.transcript) - 4] + "_timestamps.txt", "w")
        transcript_file.write(whisper_result["text"])
        transcript_file_timestamps.write(str(whisper_result))
        transcript_file.close()
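The slice above strips exactly 4 trailing characters, so it assumes a `.txt`-style extension. An alternative sketch (not what the script does) that derives the same name with `os.path.splitext`:

```python
import os

transcript_path = "transcript.txt"  # hypothetical --transcript value
stem, _ = os.path.splitext(transcript_path)  # "transcript"
timestamps_path = stem + "_timestamps.txt"   # "transcript_timestamps.txt"
```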
@@ -306,7 +322,7 @@ def main():
    # S3 : Push artefacts to S3 bucket
    files_to_upload = ["transcript.txt", "transcript_timestamps.txt",
                       "demo_compact.html", "df.pkl",
-                      "wordcloud.png"]
+                      "wordcloud.png", "mappings.pkl"]
    upload_files(files_to_upload)

    # Summarize the generated transcript using the BART model