mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-20 20:29:06 +00:00
Merge pull request #7 from Monadical-SAS/whisper-jax-gokul
Add more features to codebase
This commit is contained in:
47
42min-StartupsTechTalk-AGENDA-FULL.txt
Normal file
47
42min-StartupsTechTalk-AGENDA-FULL.txt
Normal file
@@ -0,0 +1,47 @@
|
||||
AGENDA: Most important things to look for in a start up
|
||||
|
||||
TAM: Make sure the market is sufficiently large that once they win they can get rewarded
|
||||
- Medium sized markets that should be winner take all can work
|
||||
- TAM needs to be realistic of direct market size
|
||||
|
||||
Product market fit: Being in a good market with a product that can satisfy that market
|
||||
- Solves a problem
|
||||
- Builds a solution a customer wants to buy
|
||||
- Either saves the customer something (time/money/pain) or gives them something (revenue/enjoyment)
|
||||
|
||||
Unit economics: Profit for delivering all-in cost must be attractive (% or $ amount)
|
||||
- Revenue minus direct costs
|
||||
- Raw input costs (materials, variable labour), direct cost of delivering and servicing the sale
|
||||
- Attractive as a % of sales so it can contribute to fixed overhead
|
||||
- Look for high incremental contribution margin
|
||||
|
||||
LTV CAC: Life-time value (revenue contribution) vs cost to acquire customer must be healthy
|
||||
- LTV = Purchase value x number of purchases x customer lifespan
|
||||
- CAC = All-in costs of sales + marketing over number of new customer additions
|
||||
- Strong reputation leads to referrals leads to lower CAC. Want customers evangelizing product/service
|
||||
- Rule of thumb higher than 3
|
||||
|
||||
Churn: Fits into LTV, low churn leads to higher LTV and helps keep future CAC down
|
||||
- Selling to replenish revenue every year is hard
|
||||
- Can run through entire customer base over time
|
||||
- Low churn builds strong net dollar retention
|
||||
|
||||
Business: Must have sufficient barriers to entry to ward off copy-cats once established
|
||||
- High switching costs (lock-in)
|
||||
- Addictive
|
||||
- Steep learning curve once adopted (form of switching cost)
|
||||
- Two sided liquidity
|
||||
- Patents, IP, Branding
|
||||
- No hyper-scaler who can roll over you quickly
|
||||
- Scale could be a barrier to entry but works against most start-ups, not for them
|
||||
- Once developed, answer question: Could a well funded competitor starting up today easily duplicate this business or is it cheaper to buy the start up?
|
||||
|
||||
Founders: Must be religious about their product. Believe they will change the world against all odds.
|
||||
- Just money in the bank is not enough to build a successful company. Just good tech not enough
|
||||
to build a successful company
|
||||
- Founders must be motivated to build something, not (all) about money. They would be doing
|
||||
this for free because they believe in it. Not looking for quick score
|
||||
- Founders must be persuasive. They will be asking others to sacrifice to make their dream come
|
||||
to life. They will need to convince investors this company can work and deserves funding.
|
||||
- Must understand who the customer is and what problem they are helping to solve.
|
||||
- Founders aren’t expected to know all the preceding points in this document, but should have an understanding of most of them and be able to offer a vision.
|
||||
@@ -10,7 +10,7 @@ To setup,
|
||||
2) Run ``` export KMP_DUPLICATE_LIB_OK=True``` in Terminal. [The code also sets this variable, but the setting is not currently taking effect; this will be fixed later.]
|
||||
3) Run the script setup_dependencies.sh.
|
||||
|
||||
``` chmod +x setup_dependecies.sh ```
|
||||
``` chmod +x setup_dependencies.sh ```
|
||||
|
||||
``` sh setup_dependencies.sh <ENV>```
|
||||
|
||||
@@ -31,13 +31,15 @@ To setup,
|
||||
|
||||
``` python3 whisjax.py "https://www.youtube.com/watch?v=ihf0S97oxuQ" --transcript transcript.txt summary.txt ```
|
||||
|
||||
5) ``` pip install -r requirements.txt```
|
||||
|
||||
|
||||
|
||||
NEXT STEPS:
|
||||
|
||||
1) Run this demo on a local Mac M1 to test flow and observe the performance
|
||||
2) Create a pipeline using microphone to listen to audio chunks to perform transcription realtime (and also efficiently
|
||||
summarize it as well)
|
||||
summarize it as well) -> *done as part of whisjax_realtime_trial.py*
|
||||
3) Create a RunPod setup for this feature (mentioned in 1 & 2) and test it end-to-end
|
||||
4) Perform Speaker Diarization using Whisper-JAX
|
||||
5) Based on feasibility of above points, explore suitable visualizations for transcription & summarization.
|
||||
|
||||
8
agenda-headers.txt
Normal file
8
agenda-headers.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
AGENDA: Most important things to look for in a start up
|
||||
TAM: Make sure the market is sufficiently large that once they win they can get rewarded
|
||||
Product market fit: Being in a good market with a product that can satisfy that market
|
||||
Unit economics: Profit for delivering all-in cost must be attractive (% or $ amount)
|
||||
LTV CAC: Life-time value (revenue contribution) vs cost to acquire customer must be healthy
|
||||
Churn: Fits into LTV, low churn leads to higher LTV and helps keep future CAC down
|
||||
Business: Must have sufficient barriers to entry to ward off copy-cats once established
|
||||
Founders: Must be religious about their product. Believe they will change the world against all odds.
|
||||
@@ -2,6 +2,9 @@
|
||||
# Set exception rule for OpenMP error to allow duplicate lib initialization
|
||||
KMP_DUPLICATE_LIB_OK=TRUE
|
||||
# Export OpenAI API Key
|
||||
OPENAI_APIKEY=API_KEY
|
||||
OPENAI_APIKEY=***REMOVED***
|
||||
# Export Whisper Model Size
|
||||
WHISPER_MODEL_SIZE=tiny
|
||||
WHISPER_MODEL_SIZE=tiny
|
||||
AWS_ACCESS_KEY=
|
||||
AWS_SECRET_KEY=
|
||||
BUCKET_NAME='reflector-bucket'
|
||||
51
file_util.py
Normal file
51
file_util.py
Normal file
@@ -0,0 +1,51 @@
|
||||
import boto3
|
||||
import botocore
|
||||
import configparser
|
||||
from loguru import logger
|
||||
|
||||
# Read the AWS credentials from config.ini in the current working directory.
config = configparser.ConfigParser()
config.read('config.ini')

# Bucket used by every upload and download in this module.
BUCKET_NAME = 'reflector-bucket'

# Module-wide S3 client, created once at import time with the credentials
# found in the DEFAULT section of config.ini.
s3 = boto3.client(
    's3',
    aws_access_key_id=config["DEFAULT"]["AWS_ACCESS_KEY"],
    aws_secret_access_key=config["DEFAULT"]["AWS_SECRET_KEY"],
)
|
||||
|
||||
def upload_files(files_to_upload):
    """
    Upload a list of files to the configured S3 bucket.

    Each local path is also used as the object key inside the bucket.
    A failed upload is logged and the remaining files are still attempted.

    :param files_to_upload: iterable of local file paths to upload
    :return: None
    """
    for key in files_to_upload:
        logger.info("Uploading file " + key)
        try:
            s3.upload_file(key, BUCKET_NAME, key)
        except (botocore.exceptions.ClientError,
                boto3.exceptions.S3UploadFailedError) as e:
            # boto3's managed upload reports failed transfers as
            # S3UploadFailedError rather than ClientError, so catching
            # ClientError alone would let upload failures escape.
            logger.error(f"Failed to upload {key}: {e}")
|
||||
|
||||
|
||||
def download_files(files_to_download):
    """
    Download a list of files from the configured S3 bucket.

    Each object key is also used as the local destination path.
    A missing object (error code "404") is logged and skipped; any other
    client error is re-raised to the caller.

    :param files_to_download: iterable of object keys to download
    :return: None
    :raises botocore.exceptions.ClientError: for any error other than 404
    """
    for key in files_to_download:
        logger.info("Downloading file " + key)
        try:
            s3.download_file(BUCKET_NAME, key, key)
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == "404":
                # Report through the logger (like the rest of this module)
                # and name the key so the message is actionable.
                logger.warning(f"The object does not exist: {key}")
            else:
                raise
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Expected invocation: file_util.py <download|upload> <file>
    # Validate up front so a missing argument or an unknown action produces
    # a usage message instead of an IndexError or a silent no-op.
    if len(sys.argv) < 3 or sys.argv[1] not in ("download", "upload"):
        sys.exit(f"Usage: {sys.argv[0]} <download|upload> <file>")

    if sys.argv[1] == "download":
        download_files([sys.argv[2]])
    else:
        upload_files([sys.argv[2]])
|
||||
@@ -1,4 +1,47 @@
|
||||
pyaudio==0.2.13
|
||||
keyboard==0.13.5
|
||||
pynput==1.7.6
|
||||
wave==0.0.2
|
||||
wave==0.0.2
|
||||
aiohttp==3.8.4
|
||||
aiosignal==1.3.1
|
||||
async-timeout==4.0.2
|
||||
attrs==23.1.0
|
||||
certifi==2023.5.7
|
||||
charset-normalizer==3.1.0
|
||||
decorator==4.4.2
|
||||
filelock==3.12.0
|
||||
frozenlist==1.3.3
|
||||
idna==3.4
|
||||
imageio==2.29.0
|
||||
imageio-ffmpeg==0.4.8
|
||||
Jinja2==3.1.2
|
||||
llvmlite==0.40.0
|
||||
loguru==0.7.0
|
||||
MarkupSafe==2.1.2
|
||||
more-itertools==9.1.0
|
||||
moviepy==1.0.3
|
||||
mpmath==1.3.0
|
||||
multidict==6.0.4
|
||||
networkx==3.1
|
||||
numba==0.57.0
|
||||
numpy==1.24.3
|
||||
openai==0.27.7
|
||||
openai-whisper @ git+https://github.com/openai/whisper.git@248b6cb124225dd263bb9bd32d060b6517e067f8
|
||||
Pillow==9.5.0
|
||||
proglog==0.1.10
|
||||
pytube==15.0.0
|
||||
regex==2023.5.5
|
||||
six==1.16.0
|
||||
sympy==1.12
|
||||
tiktoken==0.3.3
|
||||
torch==2.0.1
|
||||
tqdm==4.65.0
|
||||
typing_extensions==4.6.2
|
||||
urllib3
|
||||
yarl==1.9.2
|
||||
boto3==1.26.151
|
||||
nltk==3.8.1
|
||||
wordcloud
|
||||
spacy
|
||||
scattertext
|
||||
pandas
|
||||
BIN
transcript_timestamps.txt
Normal file
BIN
transcript_timestamps.txt
Normal file
Binary file not shown.
319
whisjax.py
319
whisjax.py
@@ -4,31 +4,42 @@
|
||||
# summarize https://www.sprocket.org/video/cheesemaking.mp4 summary.txt
|
||||
# summarize podcast.mp3 summary.txt
|
||||
|
||||
from urllib.parse import urlparse
|
||||
from pytube import YouTube
|
||||
from loguru import logger
|
||||
from whisper_jax import FlaxWhisperPipline
|
||||
import jax.numpy as jnp
|
||||
import moviepy.editor
|
||||
import argparse
|
||||
import tempfile
|
||||
import whisper
|
||||
import openai
|
||||
import re
|
||||
import ast
|
||||
import configparser
|
||||
import jax.numpy as jnp
|
||||
import matplotlib.pyplot as plt
|
||||
import moviepy.editor
|
||||
import moviepy.editor
|
||||
import nltk
|
||||
import os
|
||||
import pandas as pd
|
||||
import re
|
||||
import scattertext as st
|
||||
import spacy
|
||||
import tempfile
|
||||
from loguru import logger
|
||||
from pytube import YouTube
|
||||
from transformers import BartTokenizer, BartForConditionalGeneration
|
||||
from urllib.parse import urlparse
|
||||
from whisper_jax import FlaxWhisperPipline
|
||||
from wordcloud import WordCloud, STOPWORDS
|
||||
|
||||
from file_util import upload_files, download_files
|
||||
|
||||
nltk.download('punkt')
|
||||
|
||||
# Configurations can be found in config.ini. Set them properly before executing
|
||||
config = configparser.ConfigParser()
|
||||
config.read('config.ini')
|
||||
|
||||
WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"]
|
||||
OPENAI_APIKEY = config['DEFAULT']["OPENAI_APIKEY"]
|
||||
|
||||
MAX_WORDS_IN_CHUNK = 2500
|
||||
MAX_OUTPUT_TOKENS = 1000
|
||||
|
||||
|
||||
def init_argparse() -> argparse.ArgumentParser:
|
||||
"""
|
||||
Parse the CLI arguments
|
||||
:return: parser object
|
||||
"""
|
||||
parser = argparse.ArgumentParser(
|
||||
usage="%(prog)s [OPTIONS] <LOCATION> <OUTPUT>",
|
||||
description="Creates a transcript of a video or audio file, then summarizes it using ChatGPT."
|
||||
@@ -37,43 +48,185 @@ def init_argparse() -> argparse.ArgumentParser:
|
||||
parser.add_argument("-l", "--language", help="Language that the summary should be written in", type=str,
|
||||
default="english", choices=['english', 'spanish', 'french', 'german', 'romanian'])
|
||||
parser.add_argument("-t", "--transcript", help="Save a copy of the intermediary transcript file", type=str)
|
||||
parser.add_argument(
|
||||
"-m", "--model_name", help="Name or path of the BART model",
|
||||
type=str, default="facebook/bart-base")
|
||||
parser.add_argument("location")
|
||||
parser.add_argument("output")
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
def chunk_text(txt, max_chunk_length=500):
    """
    Split text into smaller chunks.

    Sentences are packed greedily: a sentence is appended to the current
    chunk while the combined character count stays below max_chunk_length,
    otherwise the current chunk is closed and a new one is started with
    that sentence. A sentence that alone reaches the limit therefore ends
    up in a chunk of its own.

    :param txt: Text to be chunked
    :param max_chunk_length: maximum length of a chunk, in characters
    :return: list of non-empty chunk strings (empty list for empty text)
    """
    sentences = nltk.sent_tokenize(txt)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_chunk_length:
            current_chunk += f" {sentence.strip()}"
        else:
            # Only flush real content: when the very first sentence already
            # exceeds the limit the current chunk is still empty.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = f"{sentence.strip()}"
    # Flush the trailing chunk, skipping it when there was no text at all.
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
|
||||
|
||||
|
||||
def summarize_chunks(chunks, tokenizer, model):
    """
    Produce one summary per text chunk with a summarizer model.

    :param chunks: iterable of text chunks to summarize
    :param tokenizer: object providing encode() and decode() for the model
    :param model: object providing generate() for summary token ids
    :return: list with the decoded summary of each chunk, in input order
    """
    # Generation settings shared by every chunk.
    generation_options = dict(
        num_beams=4,
        length_penalty=2.0,
        max_length=1024,
        no_repeat_ngram_size=3,
    )

    def _summarize(text):
        # Encode the chunk, generate summary ids, decode the first result.
        encoded = tokenizer.encode(text, return_tensors='pt')
        generated = model.generate(encoded, **generation_options)
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    return [_summarize(text) for text in chunks]
|
||||
|
||||
|
||||
def create_wordcloud(transcript_path="transcript.txt",
                     output_path="wordcloud.png"):
    """
    Create a basic word cloud visualization of transcribed text.

    :param transcript_path: path of the transcript text file to read
    :param output_path: path the word cloud image is written to
    :return: None. The wordcloud image is saved locally
    """
    with open(transcript_path, "r") as f:
        transcription_text = f.read()

    stopwords = set(STOPWORDS)

    wordcloud = WordCloud(height=800, width=800,
                          background_color='white',
                          stopwords=stopwords,
                          min_font_size=8).generate(transcription_text)

    # Plot wordcloud and save image
    fig = plt.figure(facecolor=None)
    try:
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        plt.tight_layout(pad=0)
        plt.savefig(output_path)
    finally:
        # Close the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
|
||||
|
||||
|
||||
def create_talk_diff_scatter_viz():
    """
    Perform agenda vs transcription diff to see covered topics.
    Create a scatter plot of words in topics.

    Reads "agenda-headers.txt" and "transcript_timestamps.txt" from the
    working directory, prints which agenda items were covered, and writes
    "df.pkl" and "demo_compact.html".

    :return: None. Saved locally.
    """
    spaCy_model = "en_core_web_md"
    nlp = spacy.load(spaCy_model)
    nlp.add_pipe('sentencizer')

    agenda_topics = []
    agenda = []
    # Load the agenda; the text before the first ":" is the topic name.
    with open("agenda-headers.txt", "r") as f:
        for line in f:
            if line.strip():
                agenda.append(line.strip())
                agenda_topics.append(line.split(":")[0])

    # Parse every agenda item once instead of once per transcript chunk.
    agenda_docs = [nlp(item) for item in agenda]

    # Load the transcription with timestamp
    with open("transcript_timestamps.txt", "r") as f:
        transcription_timestamp_text = f.read()

    res = ast.literal_eval(transcription_timestamp_text)
    chunks = res["chunks"]

    # create df for processing
    df = pd.DataFrame.from_dict(chunks)

    covered_items = {}
    # ts: timestamp
    # Map each timestamped chunk with top1 and top2 matched agenda
    ts_to_topic_mapping_top_1 = {}
    ts_to_topic_mapping_top_2 = {}

    # Also create a mapping of the different timestamps in which each topic
    # was covered (a list per topic, so earlier timestamps are not lost).
    topic_to_ts_mapping_top_1 = {}
    topic_to_ts_mapping_top_2 = {}

    similarity_threshold = 0.7

    for c in chunks:
        doc_transcription = nlp(c["text"])
        # A chunk without tokens cannot be compared against the agenda.
        if not doc_transcription:
            continue

        topic_similarities = [
            (index, doc_transcription.similarity(item_doc))
            for index, item_doc in enumerate(agenda_docs)
        ]
        topic_similarities.sort(key=lambda x: x[1], reverse=True)

        # Slicing keeps this safe when the agenda has fewer than two items.
        for rank, (index, similarity) in enumerate(topic_similarities[:2]):
            if similarity < similarity_threshold:
                continue
            covered_items[agenda[index]] = True
            topic = agenda_topics[index]
            if rank == 0:
                # top1 match
                ts_to_topic_mapping_top_1[c["timestamp"]] = topic
                topic_to_ts_mapping_top_1.setdefault(topic, []).append(c["timestamp"])
            else:
                # top2 match
                ts_to_topic_mapping_top_2[c["timestamp"]] = topic
                topic_to_ts_mapping_top_2.setdefault(topic, []).append(c["timestamp"])

    def create_new_columns(record):
        """
        Accumulate the mapping information into the df
        :param record: one row of the chunk dataframe
        :return: the row with the top-1 / top-2 topic columns added
        """
        # Chunks that matched no agenda item above the threshold get None
        # instead of raising KeyError.
        record["ts_to_topic_mapping_top_1"] = ts_to_topic_mapping_top_1.get(record["timestamp"])
        record["ts_to_topic_mapping_top_2"] = ts_to_topic_mapping_top_2.get(record["timestamp"])
        return record

    df = df.apply(create_new_columns, axis=1)

    # Count the number of items covered and calculate the percentage
    num_covered_items = sum(covered_items.values())
    percentage_covered = num_covered_items / len(agenda) * 100 if agenda else 0.0

    # Print the results
    print("💬 Agenda items covered in the transcription:")
    for item in agenda:
        if item in covered_items and covered_items[item]:
            print("✅ ", item)
        else:
            print("❌ ", item)
    print("📊 Coverage: {:.2f}%".format(percentage_covered))

    # Save df for further experimentation
    df.to_pickle("df.pkl")

    # Scatter plot of topics. Chunks without a top-1 topic have no category,
    # so they are left out of the corpus.
    plot_df = df[df["ts_to_topic_mapping_top_1"].notna()]
    plot_df = plot_df.assign(
        parse=lambda frame: frame.text.apply(st.whitespace_nlp_with_sentences))
    corpus = st.CorpusFromParsedDocuments(
        plot_df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'
    ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
    # NOTE(review): the 'TAM' category is hard-coded; presumably this
    # requires at least one chunk mapped to 'TAM' - confirm.
    html = st.produce_scattertext_explorer(
        corpus,
        category='TAM', category_name='TAM', not_category_name='Churn',
        minimum_term_frequency=0, pmi_threshold_coefficient=0,
        width_in_pixels=1000,
        transform=st.Scalers.dense_rank
    )
    # Use a context manager so the HTML file is flushed and closed.
    with open('./demo_compact.html', 'w') as html_file:
        html_file.write(html)
|
||||
|
||||
def main():
|
||||
parser = init_argparse()
|
||||
args = parser.parse_args()
|
||||
@@ -83,6 +236,8 @@ def main():
|
||||
# audio or video file.
|
||||
url = urlparse(args.location)
|
||||
|
||||
# S3 : Pull artefacts to S3 bucket ?
|
||||
|
||||
media_file = ""
|
||||
if url.scheme == 'http' or url.scheme == 'https':
|
||||
# Check if we're being asked to retreive a YouTube URL, which is handled
|
||||
@@ -103,65 +258,81 @@ def main():
|
||||
logger.info(" XXX - This method hasn't been implemented yet.")
|
||||
elif url.scheme == '':
|
||||
media_file = url.path
|
||||
# If file is not present locally, take it from S3 bucket
|
||||
if not os.path.exists(media_file):
|
||||
download_files([media_file])
|
||||
else:
|
||||
print("Unsupported URL scheme: " + url.scheme)
|
||||
quit()
|
||||
|
||||
# If the media file we just retrieved is a video, extract its audio stream.
|
||||
# XXX - We should be checking if we've downloaded an audio file (eg .mp3),
|
||||
# XXX - in which case we can skip this step. For now we'll assume that
|
||||
# XXX - everything is an mp4 video.
|
||||
audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
|
||||
logger.info(f"Extracting audio to: {audio_filename}")
|
||||
|
||||
video = moviepy.editor.VideoFileClip(media_file)
|
||||
video.audio.write_audiofile(audio_filename, logger=None)
|
||||
# Handle video
|
||||
try:
|
||||
video = moviepy.editor.VideoFileClip(media_file)
|
||||
audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
|
||||
video.audio.write_audiofile(audio_filename, logger=None)
|
||||
logger.info(f"Extracting audio to: {audio_filename}")
|
||||
# Handle audio only file
|
||||
except:
|
||||
audio = moviepy.editor.AudioFileClip(media_file)
|
||||
audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
|
||||
audio.write_audiofile(audio_filename, logger=None)
|
||||
|
||||
logger.info("Finished extracting audio")
|
||||
|
||||
# Convert the audio to text using the OpenAI Whisper model
|
||||
pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE, dtype=jnp.float16, batch_size=16)
|
||||
pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE,
|
||||
dtype=jnp.float16,
|
||||
batch_size=16)
|
||||
whisper_result = pipeline(audio_filename, return_timestamps=True)
|
||||
logger.info("Finished transcribing file")
|
||||
|
||||
# If we got the transcript parameter on the command line, save the transcript to the specified file.
|
||||
# If we got the transcript parameter on the command line,
|
||||
# save the transcript to the specified file.
|
||||
if args.transcript:
|
||||
logger.info(f"Saving transcript to: {args.transcript}")
|
||||
transcript_file = open(args.transcript, "w")
|
||||
transcript_file_timestamps = open(args.transcript[0:len(args.transcript)-4] + "_timestamps.txt", "w")
|
||||
transcript_file.write(whisper_result["text"])
|
||||
transcript_file_timestamps.write(str(whisper_result))
|
||||
transcript_file.close()
|
||||
transcript_file_timestamps.close()
|
||||
|
||||
# Summarize the generated transcript using OpenAI
|
||||
openai.api_key = OPENAI_APIKEY
|
||||
logger.info("Creating word cloud")
|
||||
create_wordcloud()
|
||||
|
||||
# Break the text up into smaller chunks for ChatGPT to summarize.
|
||||
logger.info(f"Breaking transcript up into smaller chunks with MAX_WORDS_IN_CHUNK = {MAX_WORDS_IN_CHUNK}")
|
||||
logger.info("Performing talk-diff and talk-diff visualization")
|
||||
create_talk_diff_scatter_viz()
|
||||
|
||||
# S3 : Push artefacts to S3 bucket
|
||||
files_to_upload = ["transcript.txt", "transcript_timestamps.txt",
|
||||
"demo_compact.html", "df.pkl",
|
||||
"wordcloud.png"]
|
||||
upload_files(files_to_upload)
|
||||
|
||||
# Summarize the generated transcript using the BART model
|
||||
logger.info(f"Loading BART model: {args.model_name}")
|
||||
tokenizer = BartTokenizer.from_pretrained(args.model_name)
|
||||
model = BartForConditionalGeneration.from_pretrained(args.model_name)
|
||||
|
||||
logger.info("Breaking transcript into smaller chunks")
|
||||
chunks = chunk_text(whisper_result['text'])
|
||||
logger.info(f"Transcript broken up into {len(chunks)} chunks")
|
||||
|
||||
language = args.language
|
||||
logger.info(
|
||||
f"Transcript broken into {len(chunks)} chunks of at most 500 words") # TODO fix variable
|
||||
|
||||
logger.info(f"Writing summary text in {language} to: {args.output}")
|
||||
logger.info(f"Writing summary text in {args.language} to: {args.output}")
|
||||
with open(args.output, 'w') as f:
|
||||
f.write('Summary of: ' + args.location + "\n\n")
|
||||
|
||||
for c in chunks:
|
||||
response = openai.ChatCompletion.create(
|
||||
frequency_penalty=0.0,
|
||||
max_tokens=1000,
|
||||
model="gpt-3.5-turbo",
|
||||
presence_penalty=1.0,
|
||||
temperature=0.2,
|
||||
messages=[
|
||||
{"role": "system",
|
||||
"content": f"You are an assistant helping to summarize transcipts of an audio or video conversation. The summary should be written in the {language} language."},
|
||||
{"role": "user", "content": c}
|
||||
],
|
||||
)
|
||||
f.write(response['choices'][0]['message']['content'] + "\n\n")
|
||||
summaries = summarize_chunks(chunks, tokenizer, model)
|
||||
for summary in summaries:
|
||||
f.write(summary.strip() + "\n\n")
|
||||
|
||||
logger.info("Summarization completed")
|
||||
|
||||
# Summarization takes a lot of time, so do this separately at the end
|
||||
files_to_upload = ["summary.txt"]
|
||||
upload_files(files_to_upload)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
Reference in New Issue
Block a user