mirror of https://github.com/Monadical-SAS/reflector.git
synced 2025-12-21 04:39:06 +00:00
Add more features to the reflector demo code repo
whisjax.py | 319
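
Based on init_argparse below, a plausible invocation of the updated script looks like this (file names are illustrative). Note that create_wordcloud() and create_talk_diff_scatter_viz() read transcript.txt and transcript_timestamps.txt from the working directory, so passing -t transcript.txt is effectively required for those steps:

    python whisjax.py --language english --model_name facebook/bart-base -t transcript.txt podcast.mp3 summary.txt
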
@@ -4,31 +4,42 @@
 # summarize https://www.sprocket.org/video/cheesemaking.mp4 summary.txt
 # summarize podcast.mp3 summary.txt

-from urllib.parse import urlparse
-from pytube import YouTube
-from loguru import logger
-from whisper_jax import FlaxWhisperPipline
-import jax.numpy as jnp
-import moviepy.editor
-import argparse
-import tempfile
-import whisper
-import openai
-import re
+import argparse
+import ast
+import configparser
+import jax.numpy as jnp
+import matplotlib.pyplot as plt
+import moviepy.editor
+import nltk
+import os
+import pandas as pd
+import re
+import scattertext as st
+import spacy
+import tempfile
+from loguru import logger
+from pytube import YouTube
+from transformers import BartTokenizer, BartForConditionalGeneration
+from urllib.parse import urlparse
+from whisper_jax import FlaxWhisperPipline
+from wordcloud import WordCloud, STOPWORDS
+
+from file_util import upload_files, download_files
+
+nltk.download('punkt')

 # Configurations can be found in config.ini. Set them properly before executing
 config = configparser.ConfigParser()
 config.read('config.ini')

 WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"]
 OPENAI_APIKEY = config['DEFAULT']["OPENAI_APIKEY"]

 MAX_WORDS_IN_CHUNK = 2500
 MAX_OUTPUT_TOKENS = 1000


 def init_argparse() -> argparse.ArgumentParser:
     """
     Parse the CLI arguments
     :return: parser object
     """
     parser = argparse.ArgumentParser(
         usage="%(prog)s [OPTIONS] <LOCATION> <OUTPUT>",
         description="Creates a transcript of a video or audio file, then summarizes it using ChatGPT."
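
The config.ini read above needs at least the two keys pulled from its DEFAULT section. A minimal sketch, with placeholder values (WHISPER_MODEL_SIZE is appended to "openai/whisper-", so it should be a checkpoint size such as tiny, small, or large-v2):

    [DEFAULT]
    WHISPER_MODEL_SIZE = small
    OPENAI_APIKEY = <your-openai-key>
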

@@ -37,43 +48,185 @@ def init_argparse() -> argparse.ArgumentParser:
     parser.add_argument("-l", "--language", help="Language that the summary should be written in", type=str,
                         default="english", choices=['english', 'spanish', 'french', 'german', 'romanian'])
     parser.add_argument("-t", "--transcript", help="Save a copy of the intermediary transcript file", type=str)
+    parser.add_argument(
+        "-m", "--model_name", help="Name or path of the BART model",
+        type=str, default="facebook/bart-base")
     parser.add_argument("location")
     parser.add_argument("output")

     return parser


-def chunk_text(txt):
-    sentences = re.split('[.!?]', txt)
-    chunks = []
-    chunk = ""
-    size = 0
-
-    for s in sentences:
-        # Get the number of words in this sentence.
-        n = len(re.findall(r'\w+', s))
-
-        # Skip over empty sentences.
-        if n == 0:
-            continue
-
-        # We need to break the text up into chunks so as not to exceed the max
-        # number of tokens accepted by the ChatGPT model.
-        if size + n > MAX_WORDS_IN_CHUNK:
-            chunks.append(chunk)
-            size = n
-            chunk = s
-        else:
-            chunk = chunk + s
-            size = size + n
-
-    if chunk:
-        chunks.append(chunk)
-
+def chunk_text(txt, max_chunk_length=500):
+    """
+    Split text into smaller chunks.
+    :param txt: Text to be chunked
+    :param max_chunk_length: maximum length of a chunk, in characters
+    :return: list of text chunks
+    """
+    sentences = nltk.sent_tokenize(txt)
+    chunks = []
+    current_chunk = ""
+    for sentence in sentences:
+        if len(current_chunk) + len(sentence) < max_chunk_length:
+            current_chunk += f" {sentence.strip()}"
+        else:
+            chunks.append(current_chunk.strip())
+            current_chunk = f"{sentence.strip()}"
+    chunks.append(current_chunk.strip())
     return chunks
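
A quick sketch of how the new chunk_text behaves, assuming nltk's punkt data is available. Note that max_chunk_length is measured in characters, not words; the "500 words" log message later in this diff carries a TODO for exactly that mismatch:

    text = "This is a filler sentence for the demo. " * 100
    chunks = chunk_text(text, max_chunk_length=500)
    # Whole sentences are packed greedily; a new chunk starts once the
    # current one would reach 500 characters.
    print(len(chunks), max(len(c) for c in chunks))
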
+
+
+def summarize_chunks(chunks, tokenizer, model):
+    """
+    Summarize each chunk using a summarizer model
+    :param chunks: list of text chunks to summarize
+    :param tokenizer: BART tokenizer
+    :param model: BART summarization model
+    :return: list of summaries, one per chunk
+    """
+    summaries = []
+    for c in chunks:
+        input_ids = tokenizer.encode(c, return_tensors='pt')
+        summary_ids = model.generate(
+            input_ids, num_beams=4, length_penalty=2.0, max_length=1024, no_repeat_ngram_size=3)
+        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+        summaries.append(summary)
+    return summaries
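
Minimal usage, mirroring how main() wires this up further down (the model name is the parser's default). One caveat: tokenizer.encode is called without truncation, so a chunk that exceeds BART's 1024-token input limit can fail at generation time; passing truncation=True would be the defensive choice:

    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
    summaries = summarize_chunks(["First transcript chunk...", "Second chunk..."], tokenizer, model)
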
+
+
+def create_wordcloud():
+    """
+    Create a basic word cloud visualization of transcribed text
+    :return: None. The wordcloud image is saved locally
+    """
+    with open("transcript.txt", "r") as f:
+        transcription_text = f.read()
+
+    stopwords = set(STOPWORDS)
+
+    # python_mask = np.array(PIL.Image.open("download1.png"))
+
+    wordcloud = WordCloud(height=800, width=800,
+                          background_color='white',
+                          stopwords=stopwords,
+                          min_font_size=8).generate(transcription_text)
+
+    # Plot wordcloud and save image
+    plt.figure(facecolor=None)
+    plt.imshow(wordcloud, interpolation="bilinear")
+    plt.axis("off")
+    plt.tight_layout(pad=0)
+    plt.savefig("wordcloud.png")
+
+
+def create_talk_diff_scatter_viz():
+    """
+    Perform an agenda vs. transcription diff to see which topics were covered.
+    Create a scatter plot of words in topics.
+    :return: None. The outputs are saved locally.
+    """
+    spaCy_model = "en_core_web_md"
+    nlp = spacy.load(spaCy_model)
+    nlp.add_pipe('sentencizer')
+
+    agenda_topics = []
+    agenda = []
+    # Load the agenda
+    with open("agenda-headers.txt", "r") as f:
+        for line in f.readlines():
+            if line.strip():
+                agenda.append(line.strip())
+                agenda_topics.append(line.split(":")[0])
+
+    # Load the transcription with timestamps
+    with open("transcript_timestamps.txt", "r") as f:
+        transcription_timestamp_text = f.read()
+
+    res = ast.literal_eval(transcription_timestamp_text)
+    chunks = res["chunks"]
+
+    # Create a DataFrame for processing
+    df = pd.DataFrame.from_dict(res["chunks"])
+
+    covered_items = {}
+    # ts: timestamp
+    # Map each timestamped chunk to its top-1 and top-2 matched agenda items
+    ts_to_topic_mapping_top_1 = {}
+    ts_to_topic_mapping_top_2 = {}
+
+    # Also create a mapping of the different timestamps in which each topic was covered
+    topic_to_ts_mapping_top_1 = {}
+    topic_to_ts_mapping_top_2 = {}
+
+    similarity_threshold = 0.7
+
+    for c in chunks:
+        doc_transcription = nlp(c["text"])
+        topic_similarities = []
+        for item in range(len(agenda)):
+            item_doc = nlp(agenda[item])
+            # if not doc_transcription or not all(token.has_vector for token in doc_transcription):
+            if not doc_transcription:
+                continue
+            similarity = doc_transcription.similarity(item_doc)
+            topic_similarities.append((item, similarity))
+        topic_similarities.sort(key=lambda x: x[1], reverse=True)
+        for i in range(2):
+            if topic_similarities[i][1] >= similarity_threshold:
+                covered_items[agenda[topic_similarities[i][0]]] = True
+                # top-1 match
+                if i == 0:
+                    ts_to_topic_mapping_top_1[c["timestamp"]] = agenda_topics[topic_similarities[i][0]]
+                    topic_to_ts_mapping_top_1[agenda_topics[topic_similarities[i][0]]] = c["timestamp"]
+                # top-2 match
+                else:
+                    ts_to_topic_mapping_top_2[c["timestamp"]] = agenda_topics[topic_similarities[i][0]]
+                    topic_to_ts_mapping_top_2[agenda_topics[topic_similarities[i][0]]] = c["timestamp"]
+
+    def create_new_columns(record):
+        """
+        Accumulate the mapping information into the DataFrame
+        :param record: a DataFrame row
+        :return: the row with the two topic-mapping columns added
+        """
+        record["ts_to_topic_mapping_top_1"] = ts_to_topic_mapping_top_1[record["timestamp"]]
+        record["ts_to_topic_mapping_top_2"] = ts_to_topic_mapping_top_2[record["timestamp"]]
+        return record
+
+    df = df.apply(create_new_columns, axis=1)
+
+    # Count the number of items covered and calculate the percentage
+    num_covered_items = sum(covered_items.values())
+    percentage_covered = num_covered_items / len(agenda) * 100
+
+    # Print the results
+    print("💬 Agenda items covered in the transcription:")
+    for item in agenda:
+        if item in covered_items and covered_items[item]:
+            print("✅ ", item)
+        else:
+            print("❌ ", item)
+    print("📊 Coverage: {:.2f}%".format(percentage_covered))
+
+    # Save df for further experimentation
+    df.to_pickle("df.pkl")
+
+    # Scatter plot of topics
+    df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
+    corpus = st.CorpusFromParsedDocuments(
+        df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'
+    ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
+    html = st.produce_scattertext_explorer(
+        corpus,
+        category='TAM', category_name='TAM', not_category_name='Churn',
+        minimum_term_frequency=0, pmi_threshold_coefficient=0,
+        width_in_pixels=1000,
+        transform=st.Scalers.dense_rank
+    )
+    open('./demo_compact.html', 'w').write(html)
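
The agenda matching above rests on spaCy vector similarity: en_core_web_md ships word vectors, and Doc.similarity returns the cosine similarity of the averaged vectors, which is what gets compared against similarity_threshold = 0.7. In isolation (sample strings invented):

    import spacy

    nlp = spacy.load("en_core_web_md")
    chunk = nlp("We went over the quarterly budget and the new hires.")
    agenda_item = nlp("Budget: review quarterly spending")
    print(chunk.similarity(agenda_item))  # roughly 0.7-0.9 for related text
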
+
+
 def main():
     parser = init_argparse()
     args = parser.parse_args()

@@ -83,6 +236,8 @@ def main():
     # audio or video file.
     url = urlparse(args.location)

+    # S3 : Pull artefacts to S3 bucket ?
+
     media_file = ""
     if url.scheme == 'http' or url.scheme == 'https':
         # Check if we're being asked to retrieve a YouTube URL, which is handled

@@ -103,65 +258,81 @@ def main():
             logger.info(" XXX - This method hasn't been implemented yet.")
     elif url.scheme == '':
         media_file = url.path
+        # If file is not present locally, take it from S3 bucket
+        if not os.path.exists(media_file):
+            download_files([media_file])
     else:
         print("Unsupported URL scheme: " + url.scheme)
         quit()

     # If the media file we just retrieved is a video, extract its audio stream.
     # XXX - We should be checking if we've downloaded an audio file (eg .mp3),
     # XXX - in which case we can skip this step. For now we'll assume that
     # XXX - everything is an mp4 video.
-    audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
-    logger.info(f"Extracting audio to: {audio_filename}")
-
-    video = moviepy.editor.VideoFileClip(media_file)
-    video.audio.write_audiofile(audio_filename, logger=None)
+    # Handle video
+    try:
+        video = moviepy.editor.VideoFileClip(media_file)
+        audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
+        video.audio.write_audiofile(audio_filename, logger=None)
+        logger.info(f"Extracting audio to: {audio_filename}")
+    # Handle audio-only file
+    except Exception:
+        audio = moviepy.editor.AudioFileClip(media_file)
+        audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
+        audio.write_audiofile(audio_filename, logger=None)

     logger.info("Finished extracting audio")

     # Convert the audio to text using the OpenAI Whisper model
-    pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE, dtype=jnp.float16, batch_size=16)
+    pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE,
+                                  dtype=jnp.float16,
+                                  batch_size=16)
     whisper_result = pipeline(audio_filename, return_timestamps=True)
     logger.info("Finished transcribing file")

-    # If we got the transcript parameter on the command line, save the transcript to the specified file.
+    # If we got the transcript parameter on the command line,
+    # save the transcript to the specified file.
     if args.transcript:
         logger.info(f"Saving transcript to: {args.transcript}")
         transcript_file = open(args.transcript, "w")
         transcript_file_timestamps = open(args.transcript[0:len(args.transcript)-4] + "_timestamps.txt", "w")
         transcript_file.write(whisper_result["text"])
         transcript_file_timestamps.write(str(whisper_result))
         transcript_file.close()
         transcript_file_timestamps.close()

-    # Summarize the generated transcript using OpenAI
-    openai.api_key = OPENAI_APIKEY
-
-    # Break the text up into smaller chunks for ChatGPT to summarize.
-    logger.info(f"Breaking transcript up into smaller chunks with MAX_WORDS_IN_CHUNK = {MAX_WORDS_IN_CHUNK}")
+    logger.info("Creating word cloud")
+    create_wordcloud()
+
+    logger.info("Performing talk-diff and talk-diff visualization")
+    create_talk_diff_scatter_viz()
+
+    # S3 : Push artefacts to S3 bucket
+    files_to_upload = ["transcript.txt", "transcript_timestamps.txt",
+                       "demo_compact.html", "df.pkl",
+                       "wordcloud.png"]
+    upload_files(files_to_upload)
+
+    # Summarize the generated transcript using the BART model
+    logger.info(f"Loading BART model: {args.model_name}")
+    tokenizer = BartTokenizer.from_pretrained(args.model_name)
+    model = BartForConditionalGeneration.from_pretrained(args.model_name)
+
+    logger.info("Breaking transcript into smaller chunks")
     chunks = chunk_text(whisper_result['text'])
-    logger.info(f"Transcript broken up into {len(chunks)} chunks")
-
-    language = args.language
-    logger.info(f"Writing summary text in {language} to: {args.output}")
+    logger.info(
+        f"Transcript broken into {len(chunks)} chunks of at most 500 words")  # TODO fix variable
+
+    logger.info(f"Writing summary text in {args.language} to: {args.output}")
     with open(args.output, 'w') as f:
         f.write('Summary of: ' + args.location + "\n\n")

-        for c in chunks:
-            response = openai.ChatCompletion.create(
-                frequency_penalty=0.0,
-                max_tokens=1000,
-                model="gpt-3.5-turbo",
-                presence_penalty=1.0,
-                temperature=0.2,
-                messages=[
-                    {"role": "system",
-                     "content": f"You are an assistant helping to summarize transcripts of an audio or video conversation. The summary should be written in the {language} language."},
-                    {"role": "user", "content": c}
-                ],
-            )
-            f.write(response['choices'][0]['message']['content'] + "\n\n")
+        summaries = summarize_chunks(chunks, tokenizer, model)
+        for summary in summaries:
+            f.write(summary.strip() + "\n\n")

+    logger.info("Summarization completed")
+
+    # Summarization takes a lot of time, so do this separately at the end
+    files_to_upload = ["summary.txt"]
+    upload_files(files_to_upload)
+

 if __name__ == "__main__":
     main()
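
file_util is not part of this diff; only its interface is visible here: upload_files and download_files each take a list of paths, used as both local file names and remote keys. A minimal boto3-backed sketch of what such a module might look like; the bucket name and everything else below are assumptions, not the repo's actual implementation:

    # file_util.py (hypothetical sketch)
    import boto3

    BUCKET = "reflector-artifacts"  # assumed bucket name
    s3 = boto3.client("s3")

    def upload_files(paths):
        for path in paths:
            s3.upload_file(path, BUCKET, path)  # local path doubles as the S3 key

    def download_files(paths):
        for path in paths:
            s3.download_file(BUCKET, path, path)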