Mirror of https://github.com/Monadical-SAS/reflector.git, synced 2025-12-20 20:29:06 +00:00
Merge pull request #10 from Monadical-SAS/whisper-jax-gokul
Add new features
31 README.md
@@ -73,23 +73,42 @@ Download:
``` python3 file_util.py download <object_name_in_S3_bucket>```

If you want to access the S3 artefacts from another machine, you can either use the Python file_util with the commands mentioned above or simply use the GUI of the AWS Management Console.
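For orientation, a minimal sketch of what a download helper along the lines of ```file_util.py download``` might do with boto3; the function name and CLI handling are assumptions, only the ```BUCKET_NAME``` key comes from this repo's ```config.ini```:

```python
# Hypothetical stand-in for file_util.py's download command (not the actual file_util).
import configparser
import sys

import boto3

config = configparser.ConfigParser()
config.read("config.ini")
# config.ini stores the value with quotes, so strip them before use
BUCKET = config["DEFAULT"]["BUCKET_NAME"].strip("'")

def download(object_name: str) -> None:
    """Fetch one object from the S3 bucket into the current directory."""
    s3 = boto3.client("s3")  # credentials come from the environment / AWS config
    s3.download_file(BUCKET, object_name, object_name)

if __name__ == "__main__":
    download(sys.argv[1])
```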
**WORKFLOW:**

-1) Specify the input source file from a local, youtube link or upload to S3 if needed and pass it as input to the script.
-2) Keep the agenda header topics in a local file named "agenda-headers.txt". This needs to be present where the script is run.
-3) Run the script. The script automatically transcribes, summarizes and creates a scatter plot of words & topics in the form of an interactive
+1) Specify the input source file from a local path, YouTube link, or upload to S3 if needed, and pass it as input to the script. If the source file is in
+```.m4a``` format, it will get converted to ```.mp4``` automatically.
+2) Keep the agenda header topics in a local file named ```agenda-headers.txt```. This needs to be present where the script is run.
+This version of the pipeline compares covered agenda topics using agenda headers in the following format:
+1) ```agenda_topic : <short description>```
+3) Check all the values in ```config.ini```. You need to predefine the 2 categories for which you want to scatter plot the
+topic modelling visualization in the config file. This is the default visualization, but from the dataframe artefact called
+```df.pkl``` you can load the df and choose different topics to plot. You can filter the transcriptions using search words, and you can see the top influencers and characteristics of each topic we have chosen to plot in the
+interactive HTML document. I have added a new jupyter notebook that gives the base template to play around with, named
+```Viz-experiments.ipynb```.
+4) Run the script. The script automatically transcribes, summarizes and creates a scatter plot of words & topics in the form of an interactive
HTML file, a sample word cloud and uploads them to the S3 bucket
-4) Additional artefacts pushed to S3:
-1) HTML visualiztion file
+5) Additional artefacts pushed to S3:
+1) HTML visualization file
2) pandas df in pickle format for others to collaborate and make their own visualizations
3) Summary, transcript and transcript with timestamps file in text format.
+
+The script also creates 2 types of mappings (a usage sketch follows below):
+1) Timestamp -> the top 2 matched agenda topics
+2) Topic -> all matched timestamps in the transcription

-Other visualizations can be planned based on available artefacts or new ones can be created.
+Other visualizations can be planned based on available artefacts or new ones can be created. Refer to the section ```Viz-experiments```.
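A minimal usage sketch for the pickled mappings, assuming the list order that ```whisjax.py``` pickles in the hunk further down; the timestamp key format is whatever the transcription produced:

```python
import pickle

# Order matches the my_mappings list pickled by whisjax.py:
# [ts_to_topic_top_1, ts_to_topic_top_2, topic_to_ts_top_1, topic_to_ts_top_2]
with open("mappings.pkl", "rb") as f:
    ts_to_topic_top_1, ts_to_topic_top_2, \
        topic_to_ts_top_1, topic_to_ts_top_2 = pickle.load(f)

# Timestamp -> best-matching agenda topic
for ts, topic in ts_to_topic_top_1.items():
    print(ts, "->", topic)

# Topic -> every timestamp where it was the top match
for topic, timestamps in topic_to_ts_top_1.items():
    print(topic, "covered at", timestamps)
```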
+**Visualization experiments:**
+
+This is a jupyter notebook playground with template instructions on handling the metadata and data artefacts generated by the
+pipeline. Follow the instructions given and tweak your own logic into it, or use it as a playground to experiment with libraries and
+visualizations on top of the metadata (a starter sketch follows below).
+
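As a starting point for the notebook, a re-plot sketch built from the same scattertext calls that appear in the ```whisjax.py``` hunk below; the ```text``` and ```category``` column names and the output filename are assumptions:

```python
import pandas as pd
import scattertext as st

# Load the dataframe artefact produced by the pipeline
df = pd.read_pickle("df.pkl")

# Rebuild the corpus and plot a different category pair than config.ini's defaults
df = df.assign(parse=df.text.apply(st.whitespace_nlp_with_sentences))
corpus = (st.CorpusFromParsedDocuments(df, category_col="category", parsed_col="parse")
          .build().get_unigram_corpus().compact(st.AssociationCompactor(2000)))
html = st.produce_scattertext_explorer(
    corpus,
    category="TAM", category_name="TAM", not_category_name="Churn",
    minimum_term_frequency=0, pmi_threshold_coefficient=0,
    width_in_pixels=1000, transform=st.Scalers.dense_rank,
)
open("my_viz.html", "w").write(html)
```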
NEXT STEPS:
449 Viz-experiments.ipynb Normal file
File diff suppressed because one or more lines are too long
config.ini
@@ -7,4 +7,10 @@ OPENAI_APIKEY=
WHISPER_MODEL_SIZE=tiny
AWS_ACCESS_KEY=***REMOVED***
AWS_SECRET_KEY=***REMOVED***
BUCKET_NAME='reflector-bucket'
+
+# For the topic modelling viz chart
+CATEGORY_1=TAM
+CATEGORY_1_NAME=TAM
+CATEGORY_2_NAME=Churn
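For reference, this is how ```whisjax.py``` reads these keys (per the hunks below); note that ```configparser``` keeps the quotes around ```BUCKET_NAME```, so the value needs stripping wherever a bare bucket name is expected:

```python
import configparser

config = configparser.ConfigParser()
config.read("config.ini")

# Keys live in the [DEFAULT] section, which is how whisjax.py reads them
model_size = config["DEFAULT"]["WHISPER_MODEL_SIZE"]   # "tiny"
category_1 = config["DEFAULT"]["CATEGORY_1"]           # "TAM"

# configparser does not unquote values: the raw value is "'reflector-bucket'"
bucket = config["DEFAULT"]["BUCKET_NAME"].strip("'")
```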
requirements.txt
@@ -44,4 +44,7 @@ nltk==3.8.1
wordcloud
spacy
scattertext
pandas
+jupyter
+seaborn
+matplotlib
34 whisjax.py
@@ -6,6 +6,7 @@

import argparse
import ast
+import collections
import configparser
import jax.numpy as jnp
import matplotlib.pyplot as plt
@@ -13,7 +14,9 @@
import moviepy.editor
import nltk
import os
+import subprocess
import pandas as pd
+import pickle
import re
import scattertext as st
import spacy
@@ -35,6 +38,7 @@ config.read('config.ini')

WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"]


def init_argparse() -> argparse.ArgumentParser:
    """
    Parse the CLI arguments
@@ -156,8 +160,8 @@ def create_talk_diff_scatter_viz():
    ts_to_topic_mapping_top_2 = {}

    # Also create a mapping of the different timestamps in which each topic was covered
-    topic_to_ts_mapping_top_1 = {}
-    topic_to_ts_mapping_top_2 = {}
+    topic_to_ts_mapping_top_1 = collections.defaultdict(list)
+    topic_to_ts_mapping_top_2 = collections.defaultdict(list)

    similarity_threshold = 0.7
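The switch to ```collections.defaultdict(list)``` plus ```append``` is what lets one topic keep every matched timestamp; with the previous plain-dict assignment, each new match silently overwrote the last. A tiny illustration (topic name and timestamps are made up):

```python
import collections

# Plain dict + assignment: a topic matched at two timestamps keeps only the last one
plain = {}
plain["pricing"] = "00:01:10"
plain["pricing"] = "00:07:42"      # overwrites the earlier match
print(plain["pricing"])            # 00:07:42

# defaultdict(list) + append: every matched timestamp is retained
topic_to_ts = collections.defaultdict(list)
topic_to_ts["pricing"].append("00:01:10")
topic_to_ts["pricing"].append("00:07:42")
print(topic_to_ts["pricing"])      # ['00:01:10', '00:07:42']
```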
@@ -178,12 +182,11 @@ def create_talk_diff_scatter_viz():
            # top1 match
            if i == 0:
                ts_to_topic_mapping_top_1[c["timestamp"]] = agenda_topics[topic_similarities[i][0]]
-                topic_to_ts_mapping_top_1[agenda_topics[topic_similarities[i][0]]] = c["timestamp"]
+                topic_to_ts_mapping_top_1[agenda_topics[topic_similarities[i][0]]].append(c["timestamp"])
            # top2 match
            else:
                ts_to_topic_mapping_top_2[c["timestamp"]] = agenda_topics[topic_similarities[i][0]]
-                topic_to_ts_mapping_top_2[agenda_topics[topic_similarities[i][0]]] = c["timestamp"]
+                topic_to_ts_mapping_top_2[agenda_topics[topic_similarities[i][0]]].append(c["timestamp"])

    def create_new_columns(record):
        """
@@ -210,9 +213,15 @@ def create_talk_diff_scatter_viz():
        print("❌ ", item)
    print("📊 Coverage: {:.2f}%".format(percentage_covered))

-    # Save df for further experimentation
+    # Save df, mappings for further experimentation
    df.to_pickle("df.pkl")

+    my_mappings = [ts_to_topic_mapping_top_1, ts_to_topic_mapping_top_2,
+                   topic_to_ts_mapping_top_1, topic_to_ts_mapping_top_2]
+    pickle.dump(my_mappings, open("mappings.pkl", "wb"))
+
+    # to load: my_mappings = pickle.load(open("mappings.pkl", "rb"))
+
    # Scatter plot of topics
    df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
    corpus = st.CorpusFromParsedDocuments(
@@ -220,13 +229,16 @@ def create_talk_diff_scatter_viz():
    ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
    html = st.produce_scattertext_explorer(
        corpus,
-        category='TAM', category_name='TAM', not_category_name='Churn',
+        category=config["DEFAULT"]["CATEGORY_1"],
+        category_name=config["DEFAULT"]["CATEGORY_1_NAME"],
+        not_category_name=config["DEFAULT"]["CATEGORY_2_NAME"],
        minimum_term_frequency=0, pmi_threshold_coefficient=0,
        width_in_pixels=1000,
        transform=st.Scalers.dense_rank
    )
    open('./demo_compact.html', 'w').write(html)


def main():
    parser = init_argparse()
    args = parser.parse_args()
@@ -261,6 +273,10 @@ def main():
        # If file is not present locally, take it from S3 bucket
        if not os.path.exists(media_file):
            download_files([media_file])
+
+        if media_file.endswith(".m4a"):
+            subprocess.run(["ffmpeg", "-i", media_file, f"{media_file}.mp4"])
+            input_file = f"{media_file}.mp4"
    else:
        print("Unsupported URL scheme: " + url.scheme)
        quit()
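The added block shells out to ffmpeg, producing e.g. ```talk.m4a.mp4``` from ```talk.m4a```. A hedged variant that fails loudly on conversion errors and remuxes instead of re-encoding (assuming the usual AAC-in-M4A case; the helper name is made up):

```python
import subprocess

def to_mp4(media_file: str) -> str:
    """Convert (remux) an .m4a file to .mp4, mirroring what whisjax.py does."""
    out = f"{media_file}.mp4"
    # -c copy avoids re-encoding; check=True raises CalledProcessError if ffmpeg fails
    subprocess.run(["ffmpeg", "-y", "-i", media_file, "-c", "copy", out], check=True)
    return out
```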
@@ -291,7 +307,7 @@ def main():
    if args.transcript:
        logger.info(f"Saving transcript to: {args.transcript}")
        transcript_file = open(args.transcript, "w")
-        transcript_file_timestamps = open(args.transcript[0:len(args.transcript)-4] + "_timestamps.txt", "w")
+        transcript_file_timestamps = open(args.transcript[0:len(args.transcript) - 4] + "_timestamps.txt", "w")
        transcript_file.write(whisper_result["text"])
        transcript_file_timestamps.write(str(whisper_result))
        transcript_file.close()
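The slice ```args.transcript[0:len(args.transcript) - 4]``` assumes a 4-character ```.txt``` suffix; ```os.path.splitext``` expresses the same intent without that assumption (the filename here is illustrative):

```python
import os

transcript_path = "transcript.txt"            # e.g. the value of args.transcript
stem, _ext = os.path.splitext(transcript_path)
timestamps_path = stem + "_timestamps.txt"    # -> "transcript_timestamps.txt"
```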
@@ -306,7 +322,7 @@ def main():
    # S3 : Push artefacts to S3 bucket
    files_to_upload = ["transcript.txt", "transcript_timestamps.txt",
                       "demo_compact.html", "df.pkl",
-                       "wordcloud.png"]
+                       "wordcloud.png", "mappings.pkl"]
    upload_files(files_to_upload)

    # Summarize the generated transcript using the BART model