diff --git a/42min-StartupsTechTalk-AGENDA-FULL.txt b/42min-StartupsTechTalk-AGENDA-FULL.txt new file mode 100644 index 00000000..8ad3ff1c --- /dev/null +++ b/42min-StartupsTechTalk-AGENDA-FULL.txt @@ -0,0 +1,47 @@ +AGENDA: Most important things to look for in a start up + +TAM: Make sure the market is sufficiently large that once they win they can get rewarded +- Medium sized markets that should be winner take all can work +- TAM needs to be realistic of direct market size + +Product market fit: Being in a good market with a product that can satisfy that market +- Solves a problem +- Builds a solution a customer wants to buy +- Either saves the customer something (time/money/pain) or gives them something (revenue/enjoyment) + +Unit economics: Profit for delivering all-in cost must be attractive (% or $ amount) +- Revenue minus direct costs +- Raw input costs (materials, variable labour), direct cost of delivering and servicing the sale +- Attractive as a % of sales so it can contribute to fixed overhead +- Look for high incremental contribution margin + +LTV CAC: Life-time value (revenue contribution) vs cost to acquire customer must be healthy +- LTV = Purchase value x number of purchases x customer lifespan +- CAC = All-in costs of sales + marketing over number of new customer additions +- Strong reputation leads to referrals leads to lower CAC. 
Want customers evangelizing product/service +- Rule of thumb higher than 3 + +Churn: Fits into LTV, low churn leads to higher LTV and helps keep future CAC down +- Selling to replenish revenue every year is hard +- Can run through entire customer base over time +- Low churn builds strong net dollar retention + +Business: Must have sufficient barriers to entry to ward off copy-cats once established +- High switching costs (lock-in) +- Addictive +- Steep learning curve once adopted (form of switching cost) +- Two sided liquidity +- Patents, IP, Branding +- No hyper-scaler who can roll over you quickly +- Scale could be a barrier to entry but works against most start-ups, not for them +- Once developed, answer question: Could a well funded competitor starting up today easily duplicate this business or is it cheaper to buy the start up? + +Founders: Must be religious about their product. Believe they will change the world against all odds. +- Just money in the bank is not enough to build a successful company. Just good tech not enough +to build a successful company +- Founders must be motivated to build something, not (all) about money. They would be doing +this for free because they believe in it. Not looking for quick score +- Founders must be persuasive. They will be asking others to sacrifice to make their dream come +to life. They will need to convince investors this company can work and deserves funding. +- Must understand who the customer is and what problem they are helping to solve. +- Founders aren’t expected to know all the preceding points in this document but have an understanding of most of this, and be able to offer a vision. \ No newline at end of file diff --git a/README.md b/README.md index 49607dbf..f016e26f 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ To setup, 2) Run ``` export KMP_DUPLICATE_LIB_OK=True``` in Terminal. [This is taken care of in code, but not reflecting, Will fix this issue later.] 3) Run the script setup_depedencies.sh. 
- ``` chmod +x setup_dependecies.sh ``` + ``` chmod +x setup_dependencies.sh ``` ``` sh setup_dependencies.sh ``` @@ -31,13 +31,7 @@ To setup, ``` python3 whisjax.py "https://www.youtube.com/watch?v=ihf0S97oxuQ" --transcript transcript.txt summary.txt ``` - - -To run the current whisper-jax real time trial, - -You need to run one additional step which is - -``` pip install -r requirements.txt``` +5) ``` pip install -r requirements.txt``` diff --git a/TWC.png b/TWC.png new file mode 100644 index 00000000..3f6abc71 Binary files /dev/null and b/TWC.png differ diff --git a/agenda-headers.txt b/agenda-headers.txt new file mode 100644 index 00000000..fd8034a2 --- /dev/null +++ b/agenda-headers.txt @@ -0,0 +1,8 @@ +AGENDA: Most important things to look for in a start up +TAM: Make sure the market is sufficiently large that once they win they can get rewarded +Product market fit: Being in a good market with a product that can satisfy that market +Unit economics: Profit for delivering all-in cost must be attractive (% or $ amount) +LTV CAC: Life-time value (revenue contribution) vs cost to acquire customer must be healthy +Churn: Fits into LTV, low churn leads to higher LTV and helps keep future CAC down +Business: Must have sufficient barriers to entry to ward off copy-cats once established +Founders: Must be religious about their product. Believe they will change the world against all odds. 
\ No newline at end of file diff --git a/config.ini b/config.ini index ad40ac0b..027896ff 100644 --- a/config.ini +++ b/config.ini @@ -2,6 +2,9 @@ # Set exception rule for OpenMP error to allow duplicate lib initialization KMP_DUPLICATE_LIB_OK=TRUE # Export OpenAI API Key -OPENAI_APIKEY=API_KEY +OPENAI_APIKEY=***REMOVED*** # Export Whisper Model Size -WHISPER_MODEL_SIZE=tiny \ No newline at end of file +WHISPER_MODEL_SIZE=tiny +AWS_ACCESS_KEY= +AWS_SECRET_KEY= +BUCKET_NAME='reflector-bucket' \ No newline at end of file diff --git a/file_util.py b/file_util.py new file mode 100644 index 00000000..6a4a4e40 --- /dev/null +++ b/file_util.py @@ -0,0 +1,51 @@ +import boto3 +import botocore +import configparser +from loguru import logger + +config = configparser.ConfigParser() +config.read('config.ini') + +BUCKET_NAME = 'reflector-bucket' + +s3 = boto3.client('s3', + aws_access_key_id=config["DEFAULT"]["AWS_ACCESS_KEY"], + aws_secret_access_key=config["DEFAULT"]["AWS_SECRET_KEY"]) + +def upload_files(files_to_upload): + """ + Upload a list of files to the configured S3 bucket + :param files_to_upload: + :return: + """ + for KEY in files_to_upload: + logger.info("Uploading file " + KEY) + try: + s3.upload_file(KEY, BUCKET_NAME, KEY) + except botocore.exceptions.ClientError as e: + print(e.response) + + +def download_files(files_to_download): + """ + Download a list of files from the configured S3 bucket + :param files_to_download: + :return: + """ + for KEY in files_to_download: + logger.info("Downloading file " + KEY) + try: + s3.download_file(BUCKET_NAME, KEY, KEY) + except botocore.exceptions.ClientError as e: + if e.response['Error']['Code'] == "404": + print("The object does not exist.") + else: + raise + + +if __name__ == "__main__": + import sys + if sys.argv[1] == "download": + download_files([sys.argv[2]]) + elif sys.argv[1] == "upload": + upload_files([sys.argv[2]]) diff --git a/requirements.txt b/requirements.txt index 13799945..7e2fc07d 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,47 @@ pyaudio==0.2.13 keyboard==0.13.5 pynput==1.7.6 -wave==0.0.2 \ No newline at end of file +wave==0.0.2 +aiohttp==3.8.4 +aiosignal==1.3.1 +async-timeout==4.0.2 +attrs==23.1.0 +certifi==2023.5.7 +charset-normalizer==3.1.0 +decorator==4.4.2 +filelock==3.12.0 +frozenlist==1.3.3 +idna==3.4 +imageio==2.29.0 +imageio-ffmpeg==0.4.8 +Jinja2==3.1.2 +llvmlite==0.40.0 +loguru==0.7.0 +MarkupSafe==2.1.2 +more-itertools==9.1.0 +moviepy==1.0.3 +mpmath==1.3.0 +multidict==6.0.4 +networkx==3.1 +numba==0.57.0 +numpy==1.24.3 +openai==0.27.7 +openai-whisper @ git+https://github.com/openai/whisper.git@248b6cb124225dd263bb9bd32d060b6517e067f8 +Pillow==9.5.0 +proglog==0.1.10 +pytube==15.0.0 +regex==2023.5.5 +six==1.16.0 +sympy==1.12 +tiktoken==0.3.3 +torch==2.0.1 +tqdm==4.65.0 +typing_extensions==4.6.2 +urllib3 +yarl==1.9.2 +boto3==1.26.151 +nltk==3.8.1 +wordcloud +spacy +scattertext +pandas \ No newline at end of file diff --git a/transcript_timestamps.txt b/transcript_timestamps.txt new file mode 100644 index 00000000..8f8ff32c Binary files /dev/null and b/transcript_timestamps.txt differ diff --git a/whisjax.py b/whisjax.py index 31f2dc46..bbd51dad 100644 --- a/whisjax.py +++ b/whisjax.py @@ -4,31 +4,42 @@ # summarize https://www.sprocket.org/video/cheesemaking.mp4 summary.txt # summarize podcast.mp3 summary.txt -from urllib.parse import urlparse -from pytube import YouTube -from loguru import logger -from whisper_jax import FlaxWhisperPipline -import jax.numpy as jnp -import moviepy.editor import argparse -import tempfile -import whisper -import openai -import re +import ast import configparser +import jax.numpy as jnp +import matplotlib.pyplot as plt +import moviepy.editor +import moviepy.editor +import nltk import os +import pandas as pd +import re +import scattertext as st +import spacy +import tempfile +from loguru import logger +from pytube import YouTube +from transformers import BartTokenizer, BartForConditionalGeneration 
+from urllib.parse import urlparse +from whisper_jax import FlaxWhisperPipline +from wordcloud import WordCloud, STOPWORDS +from file_util import upload_files, download_files + +nltk.download('punkt') + +# Configurations can be found in config.ini. Set them properly before executing config = configparser.ConfigParser() config.read('config.ini') WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"] -OPENAI_APIKEY = config['DEFAULT']["OPENAI_APIKEY"] - -MAX_WORDS_IN_CHUNK = 2500 -MAX_OUTPUT_TOKENS = 1000 - def init_argparse() -> argparse.ArgumentParser: + """ + Parse the CLI arguments + :return: parser object + """ parser = argparse.ArgumentParser( usage="%(prog)s [OPTIONS] ", description="Creates a transcript of a video or audio file, then summarizes it using ChatGPT." @@ -37,43 +48,185 @@ def init_argparse() -> argparse.ArgumentParser: parser.add_argument("-l", "--language", help="Language that the summary should be written in", type=str, default="english", choices=['english', 'spanish', 'french', 'german', 'romanian']) parser.add_argument("-t", "--transcript", help="Save a copy of the intermediary transcript file", type=str) + parser.add_argument( + "-m", "--model_name", help="Name or path of the BART model", + type=str, default="facebook/bart-base") parser.add_argument("location") parser.add_argument("output") return parser -def chunk_text(txt): - sentences = re.split('[.!?]', txt) - +def chunk_text(txt, max_chunk_length=500): + """ + Split text into smaller chunks. + :param txt: Text to be chunked + :param max_chunk_length: length of chunk + :return: chunked texts + """ + sentences = nltk.sent_tokenize(txt) chunks = [] - chunk = "" - size = 0 - - for s in sentences: - # Get the number of words in this sentence. - n = len(re.findall(r'\w+', s)) - - # Skip over empty sentences. - if n == 0: - continue - - # We need to break the text up into chunks so as not to exceed the max - # number of tokens accepted by the ChatGPT model. 
- if size + n > MAX_WORDS_IN_CHUNK: - chunks.append(chunk) - size = n - chunk = s + current_chunk = "" + for sentence in sentences: + if len(current_chunk) + len(sentence) < max_chunk_length: + current_chunk += f" {sentence.strip()}" else: - chunk = chunk + s - size = size + n - - if chunk: - chunks.append(chunk) - + chunks.append(current_chunk.strip()) + current_chunk = f"{sentence.strip()}" + chunks.append(current_chunk.strip()) return chunks +def summarize_chunks(chunks, tokenizer, model): + """ + Summarize each chunk using a summarizer model + :param chunks: + :param tokenizer: + :param model: + :return: + """ + summaries = [] + for c in chunks: + input_ids = tokenizer.encode(c, return_tensors='pt') + summary_ids = model.generate( + input_ids, num_beams=4, length_penalty=2.0, max_length=1024, no_repeat_ngram_size=3) + summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True) + summaries.append(summary) + return summaries + + +def create_wordcloud(): + """ + Create a basic word cloud visualization of transcribed text + :return: None. The wordcloud image is saved locally + """ + with open("transcript.txt", "r") as f: + transcription_text = f.read() + + stopwords = set(STOPWORDS) + + # python_mask = np.array(PIL.Image.open("download1.png")) + + wordcloud = WordCloud(height=800, width=800, + background_color='white', + stopwords=stopwords, + min_font_size=8).generate(transcription_text) + + # Plot wordcloud and save image + plt.figure(facecolor=None) + plt.imshow(wordcloud, interpolation="bilinear") + plt.axis("off") + plt.tight_layout(pad=0) + plt.savefig("wordcloud.png") + + +def create_talk_diff_scatter_viz(): + """ + Perform agenda vs transription diff to see covered topics. + Create a scatter plot of words in topics. + :return: None. Saved locally. 
+ """ + spaCy_model = "en_core_web_md" + nlp = spacy.load(spaCy_model) + nlp.add_pipe('sentencizer') + + agenda_topics = [] + agenda = [] + # Load the agenda + with open("agenda-headers.txt", "r") as f: + for line in f.readlines(): + if line.strip(): + agenda.append(line.strip()) + agenda_topics.append(line.split(":")[0]) + + # Load the transcription with timestamp + with open("transcript_timestamps.txt", "r") as f: + transcription_timestamp_text = f.read() + + res = ast.literal_eval(transcription_timestamp_text) + chunks = res["chunks"] + + # create df for processing + df = pd.DataFrame.from_dict(res["chunks"]) + + covered_items = {} + # ts: timestamp + # Map each timestamped chunk with top1 and top2 matched agenda + ts_to_topic_mapping_top_1 = {} + ts_to_topic_mapping_top_2 = {} + + # Also create a mapping of the different timestamps in which each topic was covered + topic_to_ts_mapping_top_1 = {} + topic_to_ts_mapping_top_2 = {} + + similarity_threshold = 0.7 + + for c in chunks: + doc_transcription = nlp(c["text"]) + topic_similarities = [] + for item in range(len(agenda)): + item_doc = nlp(agenda[item]) + # if not doc_transcription or not all(token.has_vector for token in doc_transcription): + if not doc_transcription: + continue + similarity = doc_transcription.similarity(item_doc) + topic_similarities.append((item, similarity)) + topic_similarities.sort(key=lambda x: x[1], reverse=True) + for i in range(2): + if topic_similarities[i][1] >= similarity_threshold: + covered_items[agenda[topic_similarities[i][0]]] = True + # top1 match + if i == 0: + ts_to_topic_mapping_top_1[c["timestamp"]] = agenda_topics[topic_similarities[i][0]] + topic_to_ts_mapping_top_1[agenda_topics[topic_similarities[i][0]]] = c["timestamp"] + # top2 match + else: + ts_to_topic_mapping_top_2[c["timestamp"]] = agenda_topics[topic_similarities[i][0]] + topic_to_ts_mapping_top_2[agenda_topics[topic_similarities[i][0]]] = c["timestamp"] + + + def create_new_columns(record): + """ + 
Accumulate the mapping information into the df + :param record: + :return: + """ + record["ts_to_topic_mapping_top_1"] = ts_to_topic_mapping_top_1[record["timestamp"]] + record["ts_to_topic_mapping_top_2"] = ts_to_topic_mapping_top_2[record["timestamp"]] + return record + + df = df.apply(create_new_columns, axis=1) + + # Count the number of items covered and calculatre the percentage + num_covered_items = sum(covered_items.values()) + percentage_covered = num_covered_items / len(agenda) * 100 + + # Print the results + print("πŸ’¬ Agenda items covered in the transcription:") + for item in agenda: + if item in covered_items and covered_items[item]: + print("βœ… ", item) + else: + print("❌ ", item) + print("πŸ“Š Coverage: {:.2f}%".format(percentage_covered)) + + # Save df for further experimentation + df.to_pickle("df.pkl") + + # Scatter plot of topics + df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences)) + corpus = st.CorpusFromParsedDocuments( + df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse' + ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000)) + html = st.produce_scattertext_explorer( + corpus, + category='TAM', category_name='TAM', not_category_name='Churn', + minimum_term_frequency=0, pmi_threshold_coefficient=0, + width_in_pixels=1000, + transform=st.Scalers.dense_rank + ) + open('./demo_compact.html', 'w').write(html) + def main(): parser = init_argparse() args = parser.parse_args() @@ -83,6 +236,8 @@ def main(): # audio or video file. url = urlparse(args.location) + # S3 : Pull artefacts to S3 bucket ? 
+ media_file = "" if url.scheme == 'http' or url.scheme == 'https': # Check if we're being asked to retreive a YouTube URL, which is handled @@ -103,65 +258,81 @@ def main(): logger.info(" XXX - This method hasn't been implemented yet.") elif url.scheme == '': media_file = url.path + # If file is not present locally, take it from S3 bucket + if not os.path.exists(media_file): + download_files([media_file]) else: print("Unsupported URL scheme: " + url.scheme) quit() - # If the media file we just retrieved is a video, extract its audio stream. - # XXX - We should be checking if we've downloaded an audio file (eg .mp3), - # XXX - in which case we can skip this step. For now we'll assume that - # XXX - everything is an mp4 video. - audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name - logger.info(f"Extracting audio to: {audio_filename}") - - video = moviepy.editor.VideoFileClip(media_file) - video.audio.write_audiofile(audio_filename, logger=None) + # Handle video + try: + video = moviepy.editor.VideoFileClip(media_file) + audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name + video.audio.write_audiofile(audio_filename, logger=None) + logger.info(f"Extracting audio to: {audio_filename}") + # Handle audio only file + except: + audio = moviepy.editor.AudioFileClip(media_file) + audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name + audio.write_audiofile(audio_filename, logger=None) logger.info("Finished extracting audio") # Convert the audio to text using the OpenAI Whisper model - pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE, dtype=jnp.float16, batch_size=16) + pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE, + dtype=jnp.float16, + batch_size=16) whisper_result = pipeline(audio_filename, return_timestamps=True) logger.info("Finished transcribing file") - # If we got the transcript parameter on the command line, save the transcript to the specified 
file. + # If we got the transcript parameter on the command line, + # save the transcript to the specified file. if args.transcript: logger.info(f"Saving transcript to: {args.transcript}") transcript_file = open(args.transcript, "w") + transcript_file_timestamps = open(args.transcript[0:len(args.transcript)-4] + "_timestamps.txt", "w") transcript_file.write(whisper_result["text"]) + transcript_file_timestamps.write(str(whisper_result)) transcript_file.close() + transcript_file_timestamps.close() - # Summarize the generated transcript using OpenAI - openai.api_key = OPENAI_APIKEY + logger.info("Creating word cloud") + create_wordcloud() - # Break the text up into smaller chunks for ChatGPT to summarize. - logger.info(f"Breaking transcript up into smaller chunks with MAX_WORDS_IN_CHUNK = {MAX_WORDS_IN_CHUNK}") + logger.info("Performing talk-diff and talk-diff visualization") + create_talk_diff_scatter_viz() + + # S3 : Push artefacts to S3 bucket + files_to_upload = ["transcript.txt", "transcript_timestamps.txt", + "demo_compact.html", "df.pkl", + "wordcloud.png"] + upload_files(files_to_upload) + + # Summarize the generated transcript using the BART model + logger.info(f"Loading BART model: {args.model_name}") + tokenizer = BartTokenizer.from_pretrained(args.model_name) + model = BartForConditionalGeneration.from_pretrained(args.model_name) + + logger.info("Breaking transcript into smaller chunks") chunks = chunk_text(whisper_result['text']) - logger.info(f"Transcript broken up into {len(chunks)} chunks") - language = args.language + logger.info( + f"Transcript broken into {len(chunks)} chunks of at most 500 words") # TODO fix variable - logger.info(f"Writing summary text in {language} to: {args.output}") + logger.info(f"Writing summary text in {args.language} to: {args.output}") with open(args.output, 'w') as f: f.write('Summary of: ' + args.location + "\n\n") - - for c in chunks: - response = openai.ChatCompletion.create( - frequency_penalty=0.0, - max_tokens=1000, 
- model="gpt-3.5-turbo", - presence_penalty=1.0, - temperature=0.2, - messages=[ - {"role": "system", - "content": f"You are an assistant helping to summarize transcipts of an audio or video conversation. The summary should be written in the {language} language."}, - {"role": "user", "content": c} - ], - ) - f.write(response['choices'][0]['message']['content'] + "\n\n") + summaries = summarize_chunks(chunks, tokenizer, model) + for summary in summaries: + f.write(summary.strip() + "\n\n") logger.info("Summarization completed") + # Summarization takes a lot of time, so do this separately at the end + files_to_upload = ["summary.txt"] + upload_files(files_to_upload) + if __name__ == "__main__": main()