Add more features to the reflector demo code repo

2026-04-19 11:46:55 +00:00 · 2023-06-13 22:21:35 +05:30
parent 3eab6db142
commit ed77fadd34
9 changed files with 402 additions and 85 deletions
--- a/42min-StartupsTechTalk-AGENDA-FULL.txt
+++ b/42min-StartupsTechTalk-AGENDA-FULL.txt
@@ -0,0 +1,47 @@
 AGENDA:  Most important things to look for in a start up
 TAM: Make sure the market is sufficiently large that once they win they can get rewarded
 - Medium sized markets that should be winner take all can work
 - TAM needs to be realistic of direct market size
 Product market fit: Being in a good market with a product that can satisfy that market
 - Solves a problem
 - Builds a solution a customer wants to buy
 - Either saves the customer something (time/money/pain) or gives them something (revenue/enjoyment)
 Unit economics: Profit for delivering all-in cost must be attractive (% or $ amount)
 - Revenue minus direct costs
 - Raw input costs (materials, variable labour), direct cost of delivering and servicing the sale
 - Attractive as a % of sales so it can contribute to fixed overhead
 - Look for high incremental contribution margin
 LTV CAC: Life-time value (revenue contribution) vs cost to acquire customer must be healthy
 - LTV = Purchase value x number of purchases x customer lifespan
 - CAC = All-in costs of sales + marketing over number of new customer additions
 - Strong reputation leads to referrals leads to lower CAC. Want customers evangelizing product/service
 - Rule of thumb higher than 3
 Churn: Fits into LTV, low churn leads to higher LTV and helps keep future CAC down
 - Selling to replenish revenue every year is hard
 - Can run through entire customer base over time
 - Low churn builds strong net dollar retention
 Business: Must have sufficient barriers to entry to ward off copy-cats once established
 - High switching costs (lock-in)
 - Addictive
 - Steep learning curve once adopted (form of switching cost)
 - Two sided liquidity
 - Patents, IP, Branding
 - No hyper-scaler who can roll over you quickly
 - Scale could be a barrier to entry but works against most start-ups, not for them
 - Once developed, answer question: Could a well funded competitor starting up today easily duplicate this business or is it cheaper to buy the start up?
 Founders: Must be religious about their product. Believe they will change the world against all odds.
 - Just money in the bank is not enough to build a successful company. Just good tech is not enough
 to build a successful company
 - Founders must be motivated to build something, not (all) about money. They would be doing
 this for free because they believe in it. Not looking for quick score
 - Founders must be persuasive. They will be asking others to sacrifice to make their dream come
 to life. They will need to convince investors this company can work and deserves funding.
 - Must understand who the customer is and what problem they are helping to solve.
 - Founders aren’t expected to know all the preceding points in this document, but they should understand most of them and be able to offer a vision.
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ To setup,
 2) Run ``` export KMP_DUPLICATE_LIB_OK=True``` in Terminal. [The code is supposed to set this, but the setting is not taking effect yet; this will be fixed later.]
 3) Run the script setup_dependencies.sh.
-    ``` chmod +x setup_dependecies.sh ```
+    ``` chmod +x setup_dependencies.sh ```
    ``` sh setup_dependencies.sh  <ENV>```
@@ -31,13 +31,7 @@ To setup,
 ``` python3 whisjax.py "https://www.youtube.com/watch?v=ihf0S97oxuQ" --transcript transcript.txt summary.txt ```
-
+5) ``` pip install -r requirements.txt```
 To run the current whisper-jax real time trial,
 You need to run one additional step which is 
 ``` pip install -r requirements.txt```
--- a/TWC.png
+++ b/TWC.png
--- a/agenda-headers.txt
+++ b/agenda-headers.txt
@@ -0,0 +1,8 @@
 AGENDA:  Most important things to look for in a start up
 TAM: Make sure the market is sufficiently large that once they win they can get rewarded
 Product market fit: Being in a good market with a product that can satisfy that market
 Unit economics: Profit for delivering all-in cost must be attractive (% or $ amount)
 LTV CAC: Life-time value (revenue contribution) vs cost to acquire customer must be healthy
 Churn: Fits into LTV, low churn leads to higher LTV and helps keep future CAC down
 Business: Must have sufficient barriers to entry to ward off copy-cats once established
 Founders: Must be religious about their product. Believe they will change the world against all odds.
--- a/config.ini
+++ b/config.ini
@@ -2,6 +2,9 @@
 # Set exception rule for OpenMP error to allow duplicate lib initialization
 KMP_DUPLICATE_LIB_OK=TRUE
 # Export OpenAI API Key
-OPENAI_APIKEY=API_KEY
+OPENAI_APIKEY=***REMOVED***
 # Export Whisper Model Size
-WHISPER_MODEL_SIZE=tiny
+WHISPER_MODEL_SIZE=tiny
 AWS_ACCESS_KEY=
 AWS_SECRET_KEY=
 BUCKET_NAME='reflector-bucket'
--- a/file_util.py
+++ b/file_util.py
@@ -0,0 +1,51 @@
 import boto3
 import botocore
 import configparser
 from loguru import logger
 # Read settings from config.ini (relative path, so resolved against the
 # current working directory).
 config = configparser.ConfigParser()
 config.read('config.ini')
 # NOTE(review): config.ini also defines BUCKET_NAME, but the bucket name is
 # hard-coded here and the config value is not read — confirm which is intended.
 BUCKET_NAME = 'reflector-bucket'
 # Module-level S3 client used by upload_files() and download_files(); the
 # credentials are taken from the DEFAULT section of config.ini.
 s3 = boto3.client('s3',
                  aws_access_key_id=config["DEFAULT"]["AWS_ACCESS_KEY"],
                  aws_secret_access_key=config["DEFAULT"]["AWS_SECRET_KEY"])
 def upload_files(files_to_upload):
    """
    Upload a list of files to the configured S3 bucket.

    Each entry is used both as the local file path and as the S3 object key.
    A failure reported by S3 is logged and the remaining files are still
    attempted.

    :param files_to_upload: iterable of local file paths to upload
    :return: None
    """
    for key in files_to_upload:
        logger.info("Uploading file " + key)
        try:
            s3.upload_file(key, BUCKET_NAME, key)
        except (botocore.exceptions.ClientError,
                boto3.exceptions.S3UploadFailedError) as e:
            # boto3's managed upload re-raises S3 client errors as
            # S3UploadFailedError, so both types have to be caught for the
            # log-and-continue behaviour to apply. S3UploadFailedError has
            # no ``response`` attribute, hence the exception itself is logged.
            logger.error(f"Failed to upload {key}: {e}")
 def download_files(files_to_download):
    """
    Fetch each listed object from the configured S3 bucket.

    The object key doubles as the local destination path. A missing object
    (error code "404") is reported on stdout and skipped; any other client
    error is re-raised.

    :param files_to_download: iterable of S3 object keys
    :return: None
    """
    for object_key in files_to_download:
        logger.info("Downloading file " + object_key)
        try:
            s3.download_file(BUCKET_NAME, object_key, object_key)
        except botocore.exceptions.ClientError as err:
            error_code = err.response['Error']['Code']
            if error_code != "404":
                raise
            print("The object does not exist.")
 if __name__ == "__main__":
    import sys

    # Command-line use: python file_util.py <download|upload> <file>
    actions = {"download": download_files, "upload": upload_files}
    # Fail with a usage message instead of an IndexError on missing
    # arguments, and instead of silently doing nothing on an unknown command.
    if len(sys.argv) < 3 or sys.argv[1] not in actions:
        sys.exit("usage: file_util.py {download|upload} <file>")
    actions[sys.argv[1]]([sys.argv[2]])
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,47 @@
 pyaudio==0.2.13
 keyboard==0.13.5
 pynput==1.7.6
-wave==0.0.2
+wave==0.0.2
 aiohttp==3.8.4
 aiosignal==1.3.1
 async-timeout==4.0.2
 attrs==23.1.0
 certifi==2023.5.7
 charset-normalizer==3.1.0
 decorator==4.4.2
 filelock==3.12.0
 frozenlist==1.3.3
 idna==3.4
 imageio==2.29.0
 imageio-ffmpeg==0.4.8
 Jinja2==3.1.2
 llvmlite==0.40.0
 loguru==0.7.0
 MarkupSafe==2.1.2
 more-itertools==9.1.0
 moviepy==1.0.3
 mpmath==1.3.0
 multidict==6.0.4
 networkx==3.1
 numba==0.57.0
 numpy==1.24.3
 openai==0.27.7
 openai-whisper @ git+https://github.com/openai/whisper.git@248b6cb124225dd263bb9bd32d060b6517e067f8
 Pillow==9.5.0
 proglog==0.1.10
 pytube==15.0.0
 regex==2023.5.5
 six==1.16.0
 sympy==1.12
 tiktoken==0.3.3
 torch==2.0.1
 tqdm==4.65.0
 typing_extensions==4.6.2
 urllib3
 yarl==1.9.2
 boto3==1.26.151
 nltk==3.8.1
 wordcloud
 spacy
 scattertext
 pandas
--- a/transcript_timestamps.txt
+++ b/transcript_timestamps.txt
--- a/whisjax.py
+++ b/whisjax.py
@@ -4,31 +4,42 @@
 # summarize https://www.sprocket.org/video/cheesemaking.mp4 summary.txt
 # summarize podcast.mp3 summary.txt
 from urllib.parse import urlparse
 from pytube import YouTube
 from loguru import logger
 from whisper_jax import FlaxWhisperPipline
 import jax.numpy as jnp
 import moviepy.editor
 import argparse
-import tempfile
+import ast
 import whisper
 import openai
 import re
 import configparser
 import jax.numpy as jnp
 import matplotlib.pyplot as plt
 import moviepy.editor
 import moviepy.editor
 import nltk
 import os
 import pandas as pd
 import re
 import scattertext as st
 import spacy
 import tempfile
 from loguru import logger
 from pytube import YouTube
 from transformers import BartTokenizer, BartForConditionalGeneration
 from urllib.parse import urlparse
 from whisper_jax import FlaxWhisperPipline
 from wordcloud import WordCloud, STOPWORDS
 from file_util import upload_files, download_files
 # Download the NLTK "punkt" data at import time; chunk_text() calls
 # nltk.sent_tokenize.
 nltk.download('punkt')
 # Configurations can be found in config.ini. Set them properly before executing
 config = configparser.ConfigParser()
 config.read('config.ini')
 # Suffix appended to "openai/whisper-" when the transcription pipeline is built
 WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"]
 OPENAI_APIKEY = config['DEFAULT']["OPENAI_APIKEY"]
 # NOTE(review): presumably limits from the ChatGPT-based summarizer — confirm
 # they are still needed now that summarization uses a BART model.
 MAX_WORDS_IN_CHUNK = 2500
 MAX_OUTPUT_TOKENS = 1000
 def init_argparse() -> argparse.ArgumentParser:
    """
    Parse the CLI arguments
    :return: parser object
    """
    parser = argparse.ArgumentParser(
        usage="%(prog)s [OPTIONS] <LOCATION> <OUTPUT>",
        description="Creates a transcript of a video or audio file, then summarizes it using ChatGPT."
@@ -37,43 +48,185 @@ def init_argparse() -> argparse.ArgumentParser:
    parser.add_argument("-l", "--language", help="Language that the summary should be written in", type=str,
                        default="english", choices=['english', 'spanish', 'french', 'german', 'romanian'])
    parser.add_argument("-t", "--transcript", help="Save a copy of the intermediary transcript file", type=str)
    parser.add_argument(
        "-m", "--model_name", help="Name or path of the BART model",
        type=str, default="facebook/bart-base")
    parser.add_argument("location")
    parser.add_argument("output")
    return parser
-def chunk_text(txt):
+def chunk_text(txt, max_chunk_length=500):
-    sentences = re.split('[.!?]', txt)
+    """
-
+    Split text into smaller chunks.
    :param txt: Text to be chunked
    :param max_chunk_length: length of chunk
    :return: chunked texts
    """
    sentences = nltk.sent_tokenize(txt)
    chunks = []
-    chunk = ""
+    current_chunk = ""
-    size = 0
+    for sentence in sentences:
-
+        if len(current_chunk) + len(sentence) < max_chunk_length:
-    for s in sentences:
+            current_chunk += f" {sentence.strip()}"
        # Get the number of words in this sentence.
        n = len(re.findall(r'\w+', s))
        # Skip over empty sentences.
        if n == 0:
            continue
        # We need to break the text up into chunks so as not to exceed the max
        # number of tokens accepted by the ChatGPT model.
        if size + n > MAX_WORDS_IN_CHUNK:
            chunks.append(chunk)
            size = n
            chunk = s
        else:
-            chunk = chunk + s
+            chunks.append(current_chunk.strip())
-            size = size + n
+            current_chunk = f"{sentence.strip()}"
-
+    chunks.append(current_chunk.strip())
    if chunk:
        chunks.append(chunk)
    return chunks
 def summarize_chunks(chunks, tokenizer, model):
    """
    Produce one summary per text chunk with a seq2seq summarizer.

    :param chunks: iterable of text chunks to summarize
    :param tokenizer: tokenizer providing ``encode`` and ``decode``
    :param model: model providing ``generate``
    :return: list of summary strings, in the same order as ``chunks``
    """
    def _summarize(text):
        # Encode, run beam-search generation, then decode the first sequence
        encoded = tokenizer.encode(text, return_tensors='pt')
        generated = model.generate(
            encoded,
            num_beams=4,
            length_penalty=2.0,
            max_length=1024,
            no_repeat_ngram_size=3,
        )
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    return [_summarize(text) for text in chunks]
 def create_wordcloud(transcript_path="transcript.txt",
                     output_path="wordcloud.png"):
    """
    Create a basic word cloud visualization of transcribed text
    :param transcript_path: text file containing the transcription
    :param output_path: image file the word cloud is saved to
    :return: None. The wordcloud image is saved locally
    """
    with open(transcript_path, "r") as f:
        transcription_text = f.read()

    stopwords = set(STOPWORDS)

    # python_mask = np.array(PIL.Image.open("download1.png"))

    wordcloud = WordCloud(height=800, width=800,
                          background_color='white',
                          stopwords=stopwords,
                          min_font_size=8).generate(transcription_text)

    # Plot wordcloud and save image
    figure = plt.figure(facecolor=None)
    try:
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        plt.tight_layout(pad=0)
        plt.savefig(output_path)
    finally:
        # pyplot keeps figures alive until they are closed explicitly
        plt.close(figure)
 def create_talk_diff_scatter_viz():
    """
    Perform agenda vs transcription diff to see covered topics.
    Create a scatter plot of words in topics.

    Reads "agenda-headers.txt" and "transcript_timestamps.txt", prints a
    coverage report for the agenda items, and writes "df.pkl" and
    "demo_compact.html".
    :return: None. Saved locally.
    """
    spaCy_model = "en_core_web_md"
    nlp = spacy.load(spaCy_model)
    nlp.add_pipe('sentencizer')

    agenda_topics = []
    agenda = []
    # Load the agenda; the text before the first ":" is used as topic label
    with open("agenda-headers.txt", "r") as f:
        for line in f:
            if line.strip():
                agenda.append(line.strip())
                agenda_topics.append(line.split(":")[0])
    # Parse every agenda item once instead of once per transcript chunk
    agenda_docs = [nlp(item) for item in agenda]

    # Load the transcription with timestamp
    with open("transcript_timestamps.txt", "r") as f:
        transcription_timestamp_text = f.read()
    # literal_eval only accepts Python literals, so the file content is
    # parsed without being executed
    res = ast.literal_eval(transcription_timestamp_text)
    chunks = res["chunks"]

    # create df for processing
    df = pd.DataFrame.from_dict(chunks)

    covered_items = {}
    # ts: timestamp
    # Map each timestamped chunk with top1 and top2 matched agenda
    ts_to_topic_mapping_top_1 = {}
    ts_to_topic_mapping_top_2 = {}

    # Also create a mapping of the different timestamps in which each topic
    # was covered (topic -> list of timestamps). Not consumed yet in this
    # function.
    topic_to_ts_mapping_top_1 = {}
    topic_to_ts_mapping_top_2 = {}

    similarity_threshold = 0.7

    for c in chunks:
        doc_transcription = nlp(c["text"])
        topic_similarities = []
        # A chunk without tokens cannot be compared against the agenda
        if doc_transcription:
            topic_similarities = [
                (idx, doc_transcription.similarity(item_doc))
                for idx, item_doc in enumerate(agenda_docs)
            ]
        topic_similarities.sort(key=lambda x: x[1], reverse=True)

        # Slicing tolerates fewer than two matches (token-less chunk or an
        # agenda with fewer than two items) where indexing would raise.
        for rank, (idx, similarity) in enumerate(topic_similarities[:2]):
            if similarity >= similarity_threshold:
                covered_items[agenda[idx]] = True
            topic = agenda_topics[idx]
            # top1 match
            if rank == 0:
                ts_to_topic_mapping_top_1[c["timestamp"]] = topic
                topic_to_ts_mapping_top_1.setdefault(topic, []).append(c["timestamp"])
            # top2 match
            else:
                ts_to_topic_mapping_top_2[c["timestamp"]] = topic
                topic_to_ts_mapping_top_2.setdefault(topic, []).append(c["timestamp"])

    def create_new_columns(record):
        """
        Accumulate the mapping information into the df
        :param record: one row of the chunk DataFrame
        :return: the row with the top-1 and top-2 topic columns added
                 (None where no match was recorded)
        """
        record["ts_to_topic_mapping_top_1"] = ts_to_topic_mapping_top_1.get(record["timestamp"])
        record["ts_to_topic_mapping_top_2"] = ts_to_topic_mapping_top_2.get(record["timestamp"])
        return record

    df = df.apply(create_new_columns, axis=1)

    # Count the number of items covered and calculate the percentage
    num_covered_items = sum(covered_items.values())
    percentage_covered = num_covered_items / len(agenda) * 100 if agenda else 0.0

    # Print the results
    print("💬 Agenda items covered in the transcription:")
    for item in agenda:
        if covered_items.get(item):
            print("✅ ", item)
        else:
            print("❌ ", item)
    print("📊 Coverage: {:.2f}%".format(percentage_covered))

    # Save df for further experimentation
    df.to_pickle("df.pkl")

    # Scatter plot of topics; rows without a top-1 topic have no category
    # and are left out of the corpus.
    scatter_df = df[df["ts_to_topic_mapping_top_1"].notna()]
    scatter_df = scatter_df.assign(
        parse=lambda frame: frame.text.apply(st.whitespace_nlp_with_sentences))
    corpus = st.CorpusFromParsedDocuments(
        scatter_df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'
    ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
    # NOTE(review): the 'TAM' / 'Churn' categories are hard-coded — confirm
    # they always occur in the agenda being compared.
    html = st.produce_scattertext_explorer(
        corpus,
        category='TAM', category_name='TAM', not_category_name='Churn',
        minimum_term_frequency=0, pmi_threshold_coefficient=0,
        width_in_pixels=1000,
        transform=st.Scalers.dense_rank
    )
    with open('./demo_compact.html', 'w') as html_file:
        html_file.write(html)
 def main():
    parser = init_argparse()
    args = parser.parse_args()
@@ -83,6 +236,8 @@ def main():
    # audio or video file.
    url = urlparse(args.location)
    # S3 : Pull artefacts to S3 bucket ?
    media_file = ""
    if url.scheme == 'http' or url.scheme == 'https':
        # Check if we're being asked to retreive a YouTube URL, which is handled
@@ -103,65 +258,81 @@ def main():
            logger.info("  XXX - This method hasn't been implemented yet.")
    elif url.scheme == '':
        media_file = url.path
        # If file is not present locally, take it from S3 bucket
        if not os.path.exists(media_file):
            download_files([media_file])
    else:
        print("Unsupported URL scheme: " + url.scheme)
        quit()
-    # If the media file we just retrieved is a video, extract its audio stream.
+    # Handle video
-    # XXX - We should be checking if we've downloaded an audio file (eg .mp3),
+    try:
-    # XXX - in which case we can skip this step.  For now we'll assume that
+        video = moviepy.editor.VideoFileClip(media_file)
-    # XXX - everything is an mp4 video.
+        audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
-    audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
+        video.audio.write_audiofile(audio_filename, logger=None)
-    logger.info(f"Extracting audio to: {audio_filename}")
+        logger.info(f"Extracting audio to: {audio_filename}")
-
+    # Handle audio only file
-    video = moviepy.editor.VideoFileClip(media_file)
+    except:
-    video.audio.write_audiofile(audio_filename, logger=None)
+        audio = moviepy.editor.AudioFileClip(media_file)
        audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
        audio.write_audiofile(audio_filename, logger=None)
    logger.info("Finished extracting audio")
    # Convert the audio to text using the OpenAI Whisper model
-    pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE, dtype=jnp.float16, batch_size=16)
+    pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE,
                                  dtype=jnp.float16,
                                  batch_size=16)
    whisper_result = pipeline(audio_filename, return_timestamps=True)
    logger.info("Finished transcribing file")
-    # If we got the transcript parameter on the command line, save the transcript to the specified file.
+    # If we got the transcript parameter on the command line,
    # save the transcript to the specified file.
    if args.transcript:
        logger.info(f"Saving transcript to: {args.transcript}")
        transcript_file = open(args.transcript, "w")
        transcript_file_timestamps = open(args.transcript[0:len(args.transcript)-4] + "_timestamps.txt", "w")
        transcript_file.write(whisper_result["text"])
        transcript_file_timestamps.write(str(whisper_result))
        transcript_file.close()
        transcript_file_timestamps.close()
-    # Summarize the generated transcript using OpenAI
+    logger.info("Creating word cloud")
-    openai.api_key = OPENAI_APIKEY
+    create_wordcloud()
-    # Break the text up into smaller chunks for ChatGPT to summarize.
+    logger.info("Performing talk-diff and talk-diff visualization")
-    logger.info(f"Breaking transcript up into smaller chunks with MAX_WORDS_IN_CHUNK = {MAX_WORDS_IN_CHUNK}")
+    create_talk_diff_scatter_viz()
    # S3 : Push artefacts to S3 bucket
    files_to_upload = ["transcript.txt", "transcript_timestamps.txt",
                       "demo_compact.html", "df.pkl",
                       "wordcloud.png"]
    upload_files(files_to_upload)
    # Summarize the generated transcript using the BART model
    logger.info(f"Loading BART model: {args.model_name}")
    tokenizer = BartTokenizer.from_pretrained(args.model_name)
    model = BartForConditionalGeneration.from_pretrained(args.model_name)
    logger.info("Breaking transcript into smaller chunks")
    chunks = chunk_text(whisper_result['text'])
    logger.info(f"Transcript broken up into {len(chunks)} chunks")
-    language = args.language
+    logger.info(
        f"Transcript broken into {len(chunks)} chunks of at most 500 words")  # TODO fix variable
-    logger.info(f"Writing summary text in {language} to: {args.output}")
+    logger.info(f"Writing summary text in {args.language} to: {args.output}")
    with open(args.output, 'w') as f:
        f.write('Summary of: ' + args.location + "\n\n")
-
+        summaries = summarize_chunks(chunks, tokenizer, model)
-        for c in chunks:
+        for summary in summaries:
-            response = openai.ChatCompletion.create(
+            f.write(summary.strip() + "\n\n")
                frequency_penalty=0.0,
                max_tokens=1000,
                model="gpt-3.5-turbo",
                presence_penalty=1.0,
                temperature=0.2,
                messages=[
                    {"role": "system",
                     "content": f"You are an assistant helping to summarize transcipts of an audio or video conversation.  The summary should be written in the {language} language."},
                    {"role": "user", "content": c}
                ],
            )
            f.write(response['choices'][0]['message']['content'] + "\n\n")
    logger.info("Summarization completed")
    # Summarization takes a lot of time, so do this separately at the end
    files_to_upload = ["summary.txt"]
    upload_files(files_to_upload)
 # Run the pipeline only when this file is executed as a script
 if __name__ == "__main__":
    main()