Add more features to the reflector demo code repo

This commit is contained in:
gokul
2023-06-13 22:21:35 +05:30
parent 3eab6db142
commit ed77fadd34
9 changed files with 402 additions and 85 deletions


@@ -4,31 +4,42 @@
# summarize https://www.sprocket.org/video/cheesemaking.mp4 summary.txt
# summarize podcast.mp3 summary.txt
import argparse
import ast
import configparser
import jax.numpy as jnp
import matplotlib.pyplot as plt
import moviepy.editor
import nltk
import os
import pandas as pd
import re
import scattertext as st
import spacy
import tempfile
from loguru import logger
from pytube import YouTube
from transformers import BartTokenizer, BartForConditionalGeneration
from urllib.parse import urlparse
from whisper_jax import FlaxWhisperPipline
from wordcloud import WordCloud, STOPWORDS
from file_util import upload_files, download_files
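# file_util is a local helper module (not shown in this diff). A minimal sketch of
# the interface used here, assuming boto3 and a hypothetical bucket name:
#
#   import boto3
#   BUCKET = "reflector-demo-artifacts"  # hypothetical
#   def upload_files(paths):
#       s3 = boto3.client("s3")
#       for p in paths:
#           s3.upload_file(p, BUCKET, p)
#   def download_files(paths):
#       s3 = boto3.client("s3")
#       for p in paths:
#           s3.download_file(BUCKET, p, p)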
# Download the sentence-tokenizer models used by nltk.sent_tokenize below
nltk.download('punkt')
# Configurations can be found in config.ini. Set them properly before executing
config = configparser.ConfigParser()
config.read('config.ini')
WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"]
OPENAI_APIKEY = config['DEFAULT']["OPENAI_APIKEY"]
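# A minimal config.ini sketch matching the keys read above (values are placeholders):
#
#   [DEFAULT]
#   WHISPER_MODEL_SIZE = base
#   OPENAI_APIKEY = sk-xxxx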
def init_argparse() -> argparse.ArgumentParser:
"""
Parse the CLI arguments
:return: parser object
"""
parser = argparse.ArgumentParser(
usage="%(prog)s [OPTIONS] <LOCATION> <OUTPUT>",
description="Creates a transcript of a video or audio file, then summarizes it using ChatGPT."
@@ -37,43 +48,185 @@ def init_argparse() -> argparse.ArgumentParser:
parser.add_argument("-l", "--language", help="Language that the summary should be written in", type=str,
default="english", choices=['english', 'spanish', 'french', 'german', 'romanian'])
parser.add_argument("-t", "--transcript", help="Save a copy of the intermediary transcript file", type=str)
parser.add_argument(
"-m", "--model_name", help="Name or path of the BART model",
type=str, default="facebook/bart-base")
parser.add_argument("location")
parser.add_argument("output")
return parser
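# Example invocations showing the new flags (file names are illustrative):
#   summarize -m facebook/bart-large-cnn -t transcript.txt talk.mp4 summary.txt
#   summarize -l french podcast.mp3 summary.txt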
def chunk_text(txt, max_chunk_length=500):
"""
Split text into smaller chunks along sentence boundaries.
:param txt: text to be chunked
:param max_chunk_length: maximum chunk length, in characters
:return: list of text chunks
"""
sentences = nltk.sent_tokenize(txt)
chunks = []
chunk = ""
size = 0
for s in sentences:
# Get the number of words in this sentence.
n = len(re.findall(r'\w+', s))
# Skip over empty sentences.
if n == 0:
continue
# We need to break the text up into chunks so as not to exceed the max
# number of tokens accepted by the ChatGPT model.
if size + n > MAX_WORDS_IN_CHUNK:
chunks.append(chunk)
size = n
chunk = s
current_chunk = ""
for sentence in sentences:
if len(current_chunk) + len(sentence) < max_chunk_length:
current_chunk += f" {sentence.strip()}"
else:
if current_chunk:
chunks.append(current_chunk.strip())
current_chunk = f"{sentence.strip()}"
if current_chunk:
chunks.append(current_chunk.strip())
return chunks
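# A quick sanity check of the chunking behavior (illustrative):
#   chunk_text("First sentence. Second sentence.", max_chunk_length=20)
#   -> ['First sentence.', 'Second sentence.']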
def summarize_chunks(chunks, tokenizer, model):
"""
Summarize each chunk with a BART summarization model
:param chunks: list of text chunks to summarize
:param tokenizer: BART tokenizer used to encode the chunks and decode the output
:param model: BART model used to generate the summaries
:return: list of summaries, one per chunk
"""
summaries = []
for c in chunks:
# Truncate to BART's 1024-token input limit to avoid overflowing the model
input_ids = tokenizer.encode(c, truncation=True, max_length=1024, return_tensors='pt')
summary_ids = model.generate(
input_ids, num_beams=4, length_penalty=2.0, max_length=1024, no_repeat_ngram_size=3)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
summaries.append(summary)
return summaries
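# Usage sketch with the default checkpoint (any BART checkpoint name works here):
#   tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
#   model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
#   summaries = summarize_chunks(chunks, tokenizer, model)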
def create_wordcloud():
"""
Create a basic word cloud visualization of transcribed text
:return: None. The wordcloud image is saved locally
"""
with open("transcript.txt", "r") as f:
transcription_text = f.read()
stopwords = set(STOPWORDS)
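# Domain-specific filler words could be added here as well,
# e.g. stopwords.update({"um", "uh", "okay"})  # hypothetical additions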
wordcloud = WordCloud(height=800, width=800,
background_color='white',
stopwords=stopwords,
min_font_size=8).generate(transcription_text)
# Plot wordcloud and save image
plt.figure(facecolor=None)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.tight_layout(pad=0)
plt.savefig("wordcloud.png")
def create_talk_diff_scatter_viz():
"""
Perform agenda vs transcription diff to see covered topics.
Create a scatter plot of words in topics.
:return: None. Saved locally.
"""
nlp = spacy.load("en_core_web_md")
nlp.add_pipe('sentencizer')
agenda_topics = []
agenda = []
# Load the agenda
with open("agenda-headers.txt", "r") as f:
for line in f.readlines():
if line.strip():
agenda.append(line.strip())
agenda_topics.append(line.split(":")[0])
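# agenda-headers.txt is expected to hold one "Topic: description" item per line,
# e.g. (hypothetical contents):
#   Introductions: welcome and speaker intro
#   Cultures: choosing a starter culture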
# Load the transcription with timestamp
with open("transcript_timestamps.txt", "r") as f:
transcription_timestamp_text = f.read()
res = ast.literal_eval(transcription_timestamp_text)
chunks = res["chunks"]
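# res is the stringified whisper-jax result, shaped roughly like:
#   {"text": "...", "chunks": [{"timestamp": (0.0, 5.2), "text": "Hello..."}, ...]}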
# create df for processing
df = pd.DataFrame.from_dict(res["chunks"])
covered_items = {}
# ts: timestamp
# Map each timestamped chunk with top1 and top2 matched agenda
ts_to_topic_mapping_top_1 = {}
ts_to_topic_mapping_top_2 = {}
# Also create a mapping of the different timestamps in which each topic was covered
topic_to_ts_mapping_top_1 = {}
topic_to_ts_mapping_top_2 = {}
similarity_threshold = 0.7
for c in chunks:
doc_transcription = nlp(c["text"])
# Skip empty transcription chunks
if not doc_transcription:
continue
topic_similarities = []
for idx, item in enumerate(agenda):
item_doc = nlp(item)
similarity = doc_transcription.similarity(item_doc)
topic_similarities.append((idx, similarity))
topic_similarities.sort(key=lambda x: x[1], reverse=True)
for i in range(min(2, len(topic_similarities))):
if topic_similarities[i][1] >= similarity_threshold:
covered_items[agenda[topic_similarities[i][0]]] = True
# top1 match
if i == 0:
ts_to_topic_mapping_top_1[c["timestamp"]] = agenda_topics[topic_similarities[i][0]]
topic_to_ts_mapping_top_1[agenda_topics[topic_similarities[i][0]]] = c["timestamp"]
# top2 match
else:
ts_to_topic_mapping_top_2[c["timestamp"]] = agenda_topics[topic_similarities[i][0]]
topic_to_ts_mapping_top_2[agenda_topics[topic_similarities[i][0]]] = c["timestamp"]
def create_new_columns(record):
"""
Accumulate the topic-mapping information into the df
:param record: a row of the chunks DataFrame
:return: the row with top-1 and top-2 topic columns added
"""
# Use .get so chunks below the similarity threshold map to None instead of raising
record["ts_to_topic_mapping_top_1"] = ts_to_topic_mapping_top_1.get(record["timestamp"])
record["ts_to_topic_mapping_top_2"] = ts_to_topic_mapping_top_2.get(record["timestamp"])
return record
df = df.apply(create_new_columns, axis=1)
# Count the number of items covered and calculate the percentage
num_covered_items = sum(covered_items.values())
percentage_covered = num_covered_items / len(agenda) * 100
# Print the results
print("💬 Agenda items covered in the transcription:")
for item in agenda:
if item in covered_items and covered_items[item]:
print("", item)
else:
print("", item)
print("📊 Coverage: {:.2f}%".format(percentage_covered))
# Save df for further experimentation
df.to_pickle("df.pkl")
# Scatter plot of topics
df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
corpus = st.CorpusFromParsedDocuments(
df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
# 'TAM'/'Churn' were leftovers from the scattertext demo; use the most frequent
# top-1 topic as the focal category instead
focal_topic = df['ts_to_topic_mapping_top_1'].mode()[0]
html = st.produce_scattertext_explorer(
corpus,
category=focal_topic, category_name=focal_topic,
not_category_name='Other topics',
minimum_term_frequency=0, pmi_threshold_coefficient=0,
width_in_pixels=1000,
transform=st.Scalers.dense_rank
)
with open('./demo_compact.html', 'w') as f:
f.write(html)
def main():
parser = init_argparse()
args = parser.parse_args()
@@ -83,6 +236,8 @@ def main():
# audio or video file.
url = urlparse(args.location)
# S3: the media file may need to be pulled down from the S3 bucket (see below)
media_file = ""
if url.scheme == 'http' or url.scheme == 'https':
# Check if we're being asked to retrieve a YouTube URL, which is handled
@@ -103,65 +258,81 @@ def main():
logger.info(" XXX - This method hasn't been implemented yet.")
elif url.scheme == '':
media_file = url.path
# If the file is not present locally, fetch it from the S3 bucket
if not os.path.exists(media_file):
download_files([media_file])
else:
print("Unsupported URL scheme: " + url.scheme)
quit()
# If the media file we just retrieved is a video, extract its audio stream.
# Try it as a video first, then fall back to audio-only files (e.g. .mp3).
try:
video = moviepy.editor.VideoFileClip(media_file)
audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
logger.info(f"Extracting audio to: {audio_filename}")
video.audio.write_audiofile(audio_filename, logger=None)
# Handle audio-only files, which have no video stream
except Exception:
audio = moviepy.editor.AudioFileClip(media_file)
audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
logger.info(f"Extracting audio to: {audio_filename}")
audio.write_audiofile(audio_filename, logger=None)
logger.info("Finished extracting audio")
# Convert the audio to text using the OpenAI Whisper model
pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE,
dtype=jnp.float16,
batch_size=16)
whisper_result = pipeline(audio_filename, return_timestamps=True)
logger.info("Finished transcribing file")
# If we got the transcript parameter on the command line,
# save the transcript to the specified file.
if args.transcript:
logger.info(f"Saving transcript to: {args.transcript}")
timestamps_filename = os.path.splitext(args.transcript)[0] + "_timestamps.txt"
with open(args.transcript, "w") as transcript_file:
transcript_file.write(whisper_result["text"])
with open(timestamps_filename, "w") as transcript_file_timestamps:
transcript_file_timestamps.write(str(whisper_result))
logger.info("Creating word cloud")
create_wordcloud()
logger.info("Performing talk-diff and talk-diff visualization")
create_talk_diff_scatter_viz()
# S3 : Push artefacts to S3 bucket
files_to_upload = ["transcript.txt", "transcript_timestamps.txt",
"demo_compact.html", "df.pkl",
"wordcloud.png"]
upload_files(files_to_upload)
# Summarize the generated transcript using the BART model
logger.info(f"Loading BART model: {args.model_name}")
tokenizer = BartTokenizer.from_pretrained(args.model_name)
model = BartForConditionalGeneration.from_pretrained(args.model_name)
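# Note: facebook/bart-base is not fine-tuned for summarization; a summarization
# checkpoint such as facebook/bart-large-cnn (passed via -m) should produce
# noticeably better summaries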
logger.info("Breaking transcript into smaller chunks")
max_chunk_length = 500
chunks = chunk_text(whisper_result['text'], max_chunk_length)
logger.info(f"Transcript broken into {len(chunks)} chunks of at most {max_chunk_length} characters")
logger.info(f"Writing summary text in {args.language} to: {args.output}")
with open(args.output, 'w') as f:
f.write('Summary of: ' + args.location + "\n\n")
summaries = summarize_chunks(chunks, tokenizer, model)
for summary in summaries:
f.write(summary.strip() + "\n\n")
logger.info("Summarization completed")
# Summarization takes a lot of time, so do this separately at the end
files_to_upload = ["summary.txt"]
upload_files(files_to_upload)
if __name__ == "__main__":
main()