Push new features

This commit is contained in:
gokul
2023-06-14 17:49:59 +05:30
parent 48ca4fce63
commit cfc91568fa
5 changed files with 201 additions and 13 deletions

View File

@@ -13,7 +13,9 @@ import moviepy.editor
import moviepy.editor
import nltk
import os
import subprocess
import pandas as pd
import pickle
import re
import scattertext as st
import spacy
@@ -35,6 +37,7 @@ config.read('config.ini')
WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"]
def init_argparse() -> argparse.ArgumentParser:
"""
Parse the CLI arguments
@@ -184,7 +187,6 @@ def create_talk_diff_scatter_viz():
ts_to_topic_mapping_top_2[c["timestamp"]] = agenda_topics[topic_similarities[i][0]]
topic_to_ts_mapping_top_2[agenda_topics[topic_similarities[i][0]]] = c["timestamp"]
def create_new_columns(record):
"""
Accumulate the mapping information into the df
@@ -210,9 +212,15 @@ def create_talk_diff_scatter_viz():
print("", item)
print("📊 Coverage: {:.2f}%".format(percentage_covered))
# Save df for further experimentation
# Save the DataFrame and the timestamp/topic mappings for further experimentation
df.to_pickle("df.pkl")
my_mappings = [ts_to_topic_mapping_top_1, ts_to_topic_mapping_top_2,
topic_to_ts_mapping_top_1, topic_to_ts_mapping_top_2]
pickle.dump(my_mappings, open("mappings.pkl", "wb"))
# To reload later: my_mappings = pickle.load(open("mappings.pkl", "rb"))
# Scatter plot of topics
df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
corpus = st.CorpusFromParsedDocuments(
@@ -220,13 +228,16 @@ def create_talk_diff_scatter_viz():
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
html = st.produce_scattertext_explorer(
corpus,
category='TAM', category_name='TAM', not_category_name='Churn',
category=config["DEFAULT"]["CATEGORY_1"],
category_name=config["DEFAULT"]["CATEGORY_1_NAME"],
not_category_name=config["DEFAULT"]["CATEGORY_2_NAME"],
minimum_term_frequency=0, pmi_threshold_coefficient=0,
width_in_pixels=1000,
transform=st.Scalers.dense_rank
)
open('./demo_compact.html', 'w').write(html)
def main():
parser = init_argparse()
args = parser.parse_args()
@@ -261,6 +272,10 @@ def main():
# If the file is not present locally, download it from the S3 bucket
if not os.path.exists(media_file):
download_files([media_file])
if media_file.endswith(".m4a"):
subprocess.run(["ffmpeg", "-i", media_file, f"{media_file}.mp4"])
input_file = f"{media_file}.mp4"
else:
print("Unsupported URL scheme: " + url.scheme)
quit()
@@ -291,7 +306,7 @@ def main():
if args.transcript:
logger.info(f"Saving transcript to: {args.transcript}")
transcript_file = open(args.transcript, "w")
transcript_file_timestamps = open(args.transcript[0:len(args.transcript)-4] + "_timestamps.txt", "w")
transcript_file_timestamps = open(args.transcript[0:len(args.transcript) - 4] + "_timestamps.txt", "w")
transcript_file.write(whisper_result["text"])
transcript_file_timestamps.write(str(whisper_result))
transcript_file.close()
@@ -306,7 +321,7 @@ def main():
# S3 : Push artefacts to S3 bucket
files_to_upload = ["transcript.txt", "transcript_timestamps.txt",
"demo_compact.html", "df.pkl",
"wordcloud.png"]
"wordcloud.png", "mappings.pkl"]
upload_files(files_to_upload)
# Summarize the generated transcript using the BART model