Mirror of https://github.com/Monadical-SAS/reflector.git

Commit: Push new features
README.md (31 changed lines)
@@ -73,15 +73,26 @@ Download:
 ``` python3 file_util.py download <object_name_in_S3_bucket>```

 If you want to access the S3 artefacts from another machine, you can either use the python file_util with the commands
 mentioned above or simply use the GUI of the AWS Management Console.

 **WORKFLOW:**

-1) Specify the input source file from a local, youtube link or upload to S3 if needed and pass it as input to the script.
-2) Keep the agenda header topics in a local file named "agenda-headers.txt". This needs to be present where the script is run.
-3) Run the script. The script automatically transcribes, summarizes and creates a scatter plot of words & topics in the form of an interactive
+1) Specify the input source file from a local path, a youtube link, or an upload to S3 if needed, and pass it as input to the script. If the source file is in
+```.m4a``` format, it will get converted to ```.mp4``` automatically.
+2) Keep the agenda header topics in a local file named ```agenda-headers.txt```. This needs to be present where the script is run.
+This version of the pipeline compares covered agenda topics using agenda headers in the following format:
+1) ```agenda_topic : <short description>``` (a sample file follows below)
+3) Check all the values in ```config.ini```. You need to predefine the 2 categories for the scatter plot of the
+topic modelling visualization in the config file. This is the default visualization. But, from the dataframe artefact called
+```df.pkl```, you can load the df and choose different topics to plot. You can filter the transcriptions by searching for
+certain words, and you can see the top influencers and characteristics of each chosen topic in the
+interactive HTML document. I have added a new jupyter notebook named ```Viz-experiments.ipynb``` that gives a base template to play around with.
+4) Run the script. The script automatically transcribes, summarizes and creates a scatter plot of words & topics in the form of an interactive
 HTML file, a sample word cloud and uploads them to the S3 bucket
-4) Additional artefacts pushed to S3:
+5) Additional artefacts pushed to S3:
-1) HTML visualiztion file
+1) HTML visualization file
 2) pandas df in pickle format for others to collaborate and make their own visualizations
 3) Summary, transcript and transcript with timestamps file in text format.
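For reference, a hypothetical ```agenda-headers.txt``` in the format described above; the topic names are illustrative and match the default categories in ```config.ini```:

```
TAM : total addressable market estimates and sizing
Churn : customer churn and retention discussion
```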
@@ -89,7 +100,15 @@ HTML file, a sample word cloud and uploads them to the S3 bucket
 1) Timestamp -> The top 2 matched agenda topics
 2) Topic -> All matched timestamps in the transcription

-Other visualizations can be planned based on available artefacts or new ones can be created.
+Other visualizations can be planned based on available artefacts, or new ones can be created. Refer to the ```Visualization experiments``` section.
+
+**Visualization experiments:**
+
+This is a jupyter notebook playground with template instructions on handling the metadata and data artefacts generated from the
+pipeline. Follow the instructions given and tweak your own logic into it, or use it as a playground to experiment with libraries and
+visualizations on top of the metadata.

 NEXT STEPS:
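The ```df.pkl``` workflow from step 3 of the README, as a minimal sketch; the ```ts_to_topic_mapping_top_1``` column name comes from the scattertext call in whisjax.py, and the search word is just an example:

```python
import pandas as pd

# Load the dataframe artefact the pipeline uploads to S3
df = pd.read_pickle("df.pkl")

# List the agenda topics available to plot as scattertext categories
print(df["ts_to_topic_mapping_top_1"].unique())

# Filter transcription rows containing a given word
print(df[df.text.str.contains("churn", case=False)])
```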
Viz-experiments.ipynb (147 lines, new file)
@@ -0,0 +1,147 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f604fe38",
   "metadata": {},
   "source": [
    "# Visualization Experiments"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cad594ed",
   "metadata": {},
   "source": [
    "Let's load the data artefacts into local memory. These files are to be downloaded from S3, as the pipeline automatically uploads them to the pre-configured S3 bucket."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dbd7b93d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from file_util import download_files\n",
    "import pickle\n",
    "import pandas as pd\n",
    "\n",
    "# Download files from S3 bucket. You can download multiple files at a time by passing a list of names\n",
    "files_to_download = [\"df.pkl\", \"mappings.pkl\"]\n",
    "download_files(files_to_download)\n",
    "\n",
    "df = pd.read_pickle(\"df.pkl\")  # load the dataframe used by the plots below\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f59ff46b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download spacy model for the first time\n",
    "!spacy download en_core_web_md\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "61aee352",
   "metadata": {},
   "outputs": [],
   "source": [
    "import spacy\n",
    "\n",
    "spaCy_model = \"en_core_web_md\"\n",
    "nlp = spacy.load(spaCy_model)\n",
    "stopwords = nlp.Defaults.stop_words\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5584c887",
   "metadata": {},
   "source": [
    "## Scatter plot of transcription with Topic modelling"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5fae1776",
   "metadata": {},
   "source": [
    "Change the values of \"category\" and \"category_name\" to one agenda topic, change the value of \"not_category_name\", and see the different plots."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "43e01074",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import scattertext as st\n",
    "\n",
    "\n",
    "def plot_topic_modelling_and_word_to_sentence_search(df, cat_1, cat_1_name, cat_2_name):\n",
    "    df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))\n",
    "\n",
    "    corpus = st.CorpusFromParsedDocuments(\n",
    "        df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'\n",
    "    ).build().get_unigram_corpus().remove_terms(stopwords, ignore_absences=True).compact(st.AssociationCompactor(2000))\n",
    "\n",
    "    html = st.produce_scattertext_explorer(\n",
    "        corpus,\n",
    "        category=cat_1, category_name=cat_1_name, not_category_name=cat_2_name,\n",
    "        minimum_term_frequency=0, pmi_threshold_coefficient=0,\n",
    "        width_in_pixels=1000,\n",
    "        transform=st.Scalers.dense_rank\n",
    "    )\n",
    "    open('./demo_compact.html', 'w').write(html)\n",
    "\n",
    "\n",
    "plot_topic_modelling_and_word_to_sentence_search(df,\n",
    "                                                 cat_1=\"TAM\",\n",
    "                                                 cat_1_name=\"TAM\",\n",
    "                                                 cat_2_name=\"Churn\")\n",
    "\n",
    "# once you are done, check the generated HTML file\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e2d6ec49",
   "metadata": {},
   "source": [
    "## Timeline visualizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "08e83128",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
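The "Timeline visualizer" section of the notebook ends with an empty cell. A minimal sketch of one way to fill it, plotting each segment's top-matched agenda topic over time from ```mappings.pkl```; matplotlib is an assumption here, as it is not in the requirements shown below:

```python
import pickle

import matplotlib.pyplot as plt  # assumption: not in the project's requirements.txt

# Load the mapping dicts in the order whisjax.py saves them
with open("mappings.pkl", "rb") as f:
    ts_to_topic_top_1, _, _, _ = pickle.load(f)

# One point per transcript segment, in insertion (chronological) order
segments = list(ts_to_topic_top_1.items())
plt.figure(figsize=(12, 4))
plt.scatter(range(len(segments)), [topic for _, topic in segments], s=12)
plt.xlabel("segment index (chronological)")
plt.ylabel("top matched agenda topic")
plt.title("Agenda topic timeline")
plt.tight_layout()
plt.savefig("timeline.png")
```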
@@ -8,3 +8,9 @@ WHISPER_MODEL_SIZE=tiny
 AWS_ACCESS_KEY=***REMOVED***
 AWS_SECRET_KEY=***REMOVED***
 BUCKET_NAME='reflector-bucket'
+
+# For the topic modelling viz chart
+CATEGORY_1="TAM"
+CATEGORY_1_NAME="TAM"
+CATEGORY_2_NAME="Churn"
+
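A sketch of how these new keys are consumed, mirroring the ```config.ini``` read in whisjax.py. One caveat: configparser returns values verbatim, so a quoted entry like ```CATEGORY_1="TAM"``` comes back with the quotes included; stripping them here is a defensive assumption, not something the pipeline currently does:

```python
import configparser

config = configparser.ConfigParser()
config.read("config.ini")

# configparser does not strip quotes, so '"TAM"' is returned verbatim;
# strip them defensively before handing the values to scattertext
category_1 = config["DEFAULT"]["CATEGORY_1"].strip("\"'")
category_1_name = config["DEFAULT"]["CATEGORY_1_NAME"].strip("\"'")
category_2_name = config["DEFAULT"]["CATEGORY_2_NAME"].strip("\"'")
print(category_1, category_1_name, category_2_name)
```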
@@ -45,3 +45,4 @@ wordcloud
 spacy
 scattertext
 pandas
+jupyter
whisjax.py (25 changed lines)
@@ -13,7 +13,9 @@ import moviepy.editor
 import moviepy.editor
 import nltk
 import os
+import subprocess
 import pandas as pd
+import pickle
 import re
 import scattertext as st
 import spacy
@@ -35,6 +37,7 @@ config.read('config.ini')
 WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"]


 def init_argparse() -> argparse.ArgumentParser:
     """
     Parse the CLI arguments
@@ -184,7 +187,6 @@ def create_talk_diff_scatter_viz():
         ts_to_topic_mapping_top_2[c["timestamp"]] = agenda_topics[topic_similarities[i][0]]
         topic_to_ts_mapping_top_2[agenda_topics[topic_similarities[i][0]]] = c["timestamp"]


     def create_new_columns(record):
         """
         Accumulate the mapping information into the df
@@ -210,9 +212,15 @@ def create_talk_diff_scatter_viz():
             print("❌ ", item)
     print("📊 Coverage: {:.2f}%".format(percentage_covered))

-    # Save df for further experimentation
+    # Save df, mappings for further experimentation
     df.to_pickle("df.pkl")

+    my_mappings = [ts_to_topic_mapping_top_1, ts_to_topic_mapping_top_2,
+                   topic_to_ts_mapping_top_1, topic_to_ts_mapping_top_2]
+    pickle.dump(my_mappings, open("mappings.pkl", "wb"))
+
+    # to load: my_mappings = pickle.load(open("mappings.pkl", "rb"))
+
     # Scatter plot of topics
     df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
     corpus = st.CorpusFromParsedDocuments(
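For downstream consumers of ```mappings.pkl```, a minimal sketch of unpacking it; the list order follows the ```pickle.dump``` above:

```python
import pickle

# Unpack the four mapping dicts in the order whisjax.py saves them
with open("mappings.pkl", "rb") as f:
    (ts_to_topic_top_1, ts_to_topic_top_2,
     topic_to_ts_top_1, topic_to_ts_top_2) = pickle.load(f)

# e.g. the top-matched agenda topic for each transcript timestamp
for ts, topic in ts_to_topic_top_1.items():
    print(ts, "->", topic)
```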
@@ -220,13 +228,16 @@ def create_talk_diff_scatter_viz():
     ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
     html = st.produce_scattertext_explorer(
         corpus,
-        category='TAM', category_name='TAM', not_category_name='Churn',
+        category=config["DEFAULT"]["CATEGORY_1"],
+        category_name=config["DEFAULT"]["CATEGORY_1_NAME"],
+        not_category_name=config["DEFAULT"]["CATEGORY_2_NAME"],
         minimum_term_frequency=0, pmi_threshold_coefficient=0,
         width_in_pixels=1000,
         transform=st.Scalers.dense_rank
     )
     open('./demo_compact.html', 'w').write(html)


 def main():
     parser = init_argparse()
     args = parser.parse_args()
@@ -261,6 +272,10 @@ def main():
         # If file is not present locally, take it from S3 bucket
         if not os.path.exists(media_file):
             download_files([media_file])
+
+        if media_file.endswith(".m4a"):
+            subprocess.run(["ffmpeg", "-i", media_file, f"{media_file}.mp4"])
+            input_file = f"{media_file}.mp4"
     else:
         print("Unsupported URL scheme: " + url.scheme)
         quit()
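A note on the ```.m4a``` conversion added above: ffmpeg infers the output container from the ```.mp4``` suffix, so a hypothetical ```meeting.m4a``` becomes ```meeting.m4a.mp4```. A minimal standalone sketch of the same step, with ```check=True``` as a defensive addition the pipeline does not currently make:

```python
import subprocess

media_file = "meeting.m4a"  # hypothetical input name

# ffmpeg picks the output container from the ".mp4" suffix;
# check=True raises CalledProcessError if the conversion fails
subprocess.run(["ffmpeg", "-i", media_file, f"{media_file}.mp4"], check=True)
input_file = f"{media_file}.mp4"
```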
@@ -291,7 +306,7 @@ def main():
     if args.transcript:
         logger.info(f"Saving transcript to: {args.transcript}")
         transcript_file = open(args.transcript, "w")
-        transcript_file_timestamps = open(args.transcript[0:len(args.transcript)-4] + "_timestamps.txt", "w")
+        transcript_file_timestamps = open(args.transcript[0:len(args.transcript) - 4] + "_timestamps.txt", "w")
         transcript_file.write(whisper_result["text"])
         transcript_file_timestamps.write(str(whisper_result))
         transcript_file.close()
@@ -306,7 +321,7 @@ def main():
     # S3 : Push artefacts to S3 bucket
     files_to_upload = ["transcript.txt", "transcript_timestamps.txt",
                        "demo_compact.html", "df.pkl",
-                       "wordcloud.png"]
+                       "wordcloud.png", "mappings.pkl"]
     upload_files(files_to_upload)

     # Summarize the generated transcript using the BART model
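To pull these artefacts from another machine, a minimal sketch using the ```download_files``` helper that ```file_util``` already provides:

```python
from file_util import download_files

# Fetch every artefact the pipeline pushes (names per files_to_upload above)
download_files(["transcript.txt", "transcript_timestamps.txt",
                "demo_compact.html", "df.pkl",
                "wordcloud.png", "mappings.pkl"])
```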