Push new features

2025-12-21 04:39:06 +00:00 · 2023-06-14 17:49:59 +05:30
parent 48ca4fce63
commit cfc91568fa
5 changed files with 201 additions and 13 deletions
--- a/whisjax.py
+++ b/whisjax.py
@@ -13,7 +13,9 @@ import moviepy.editor
 import moviepy.editor
 import nltk
 import os
+import subprocess
 import pandas as pd
+import pickle
 import re
 import scattertext as st
 import spacy
@@ -35,6 +37,7 @@ config.read('config.ini')

 WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"]

+
 def init_argparse() -> argparse.ArgumentParser:
    """
    Parse the CLI arguments
@@ -184,7 +187,6 @@ def create_talk_diff_scatter_viz():
                ts_to_topic_mapping_top_2[c["timestamp"]] = agenda_topics[topic_similarities[i][0]]
                topic_to_ts_mapping_top_2[agenda_topics[topic_similarities[i][0]]] = c["timestamp"]

-
    def create_new_columns(record):
        """
        Accumulate the mapping information into the df
@@ -210,9 +212,15 @@ def create_talk_diff_scatter_viz():
            print("❌ ", item)
    print("📊 Coverage: {:.2f}%".format(percentage_covered))

-    # Save df for further experimentation
+    # Save the df and the timestamp/topic mappings for further experimentation
    df.to_pickle("df.pkl")

+    my_mappings = [ts_to_topic_mapping_top_1, ts_to_topic_mapping_top_2,
+                   topic_to_ts_mapping_top_1, topic_to_ts_mapping_top_2]
+    pickle.dump(my_mappings, open("mappings.pkl", "wb"))
+
+    # To load them back: my_mappings = pickle.load(open("mappings.pkl", "rb"))
+
    # Scatter plot of topics
    df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
    corpus = st.CorpusFromParsedDocuments(
@@ -220,13 +228,16 @@ def create_talk_diff_scatter_viz():
    ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
    html = st.produce_scattertext_explorer(
        corpus,
-        category='TAM', category_name='TAM', not_category_name='Churn',
+        category=config["DEFAULT"]["CATEGORY_1"],
+        category_name=config["DEFAULT"]["CATEGORY_1_NAME"],
+        not_category_name=config["DEFAULT"]["CATEGORY_2_NAME"],
        minimum_term_frequency=0, pmi_threshold_coefficient=0,
        width_in_pixels=1000,
        transform=st.Scalers.dense_rank
    )
    open('./demo_compact.html', 'w').write(html)

+
 def main():
    parser = init_argparse()
    args = parser.parse_args()
@@ -261,6 +272,10 @@ def main():
        # If file is not present locally, take it from S3 bucket
        if not os.path.exists(media_file):
            download_files([media_file])
+
+        if media_file.endswith(".m4a"):
+            subprocess.run(["ffmpeg", "-i", media_file, f"{media_file}.mp4"])
+            input_file = f"{media_file}.mp4"
    else:
        print("Unsupported URL scheme: " + url.scheme)
        quit()
@@ -291,7 +306,7 @@ def main():
    if args.transcript:
        logger.info(f"Saving transcript to: {args.transcript}")
        transcript_file = open(args.transcript, "w")
-        transcript_file_timestamps = open(args.transcript[0:len(args.transcript)-4] + "_timestamps.txt", "w")
+        transcript_file_timestamps = open(args.transcript[0:len(args.transcript) - 4] + "_timestamps.txt", "w")
        transcript_file.write(whisper_result["text"])
        transcript_file_timestamps.write(str(whisper_result))
        transcript_file.close()
@@ -306,7 +321,7 @@ def main():
    # S3 : Push artefacts to S3 bucket
    files_to_upload = ["transcript.txt", "transcript_timestamps.txt",
                       "demo_compact.html", "df.pkl",
-                       "wordcloud.png"]
+                       "wordcloud.png", "mappings.pkl"]
    upload_files(files_to_upload)

    # Summarize the generated transcript using the BART model