From 02c928a7cfd6ab155539b536fae8b1b49af857b7 Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Mon, 24 Jul 2023 13:19:24 +0530 Subject: [PATCH 01/11] use faster-whisper pipeline --- format_output.py | 30 ++++++++++ server_executor_cleaned.py => server.py | 77 ++++++++++++++++++------- trials/api.py | 56 ++++++++++++++++++ 3 files changed, 142 insertions(+), 21 deletions(-) create mode 100644 format_output.py rename server_executor_cleaned.py => server.py (82%) diff --git a/format_output.py b/format_output.py new file mode 100644 index 00000000..6cc3006c --- /dev/null +++ b/format_output.py @@ -0,0 +1,30 @@ +import json + +with open("meeting_titles_and_summaries.txt", "r") as f: + outputs = f.read() + +outputs = json.loads(outputs) + +transcript_file = open("meeting_transcript.txt", "a") +title_description_file = open("meeting_title_description.txt", "a") + +for item in outputs["topics"]: + transcript_file.write(item["transcript"]) + + title_description_file.write("TITLE: \n") + title_description_file.write(item["title"]) + title_description_file.write("\n") + + title_description_file.write("DESCRIPTION: \n") + title_description_file.write(item["description"]) + title_description_file.write("\n") + + title_description_file.write("TRANSCRIPT: \n") + title_description_file.write(item["transcript"]) + title_description_file.write("\n") + + title_description_file.write("---------------------------------------- \n\n") + + + + diff --git a/server_executor_cleaned.py b/server.py similarity index 82% rename from server_executor_cleaned.py rename to server.py index 2d8f3747..52f98136 100644 --- a/server_executor_cleaned.py +++ b/server.py @@ -1,11 +1,13 @@ import asyncio import datetime +import os import io +import numpy as np import json import uuid import wave from concurrent.futures import ThreadPoolExecutor - +from faster_whisper import WhisperModel import aiohttp_cors import jax.numpy as jnp import requests @@ -21,9 +23,9 @@ from sortedcontainers import SortedDict pcs = set() relay = MediaRelay() data_channel = None -pipeline = FlaxWhisperPipline("openai/whisper-tiny", - dtype=jnp.float16, - batch_size=16) +model = WhisperModel("tiny", device="cpu", + compute_type="float32", + num_workers=12) CHANNELS = 2 RATE = 48000 @@ -80,6 +82,7 @@ def get_title_and_summary(llm_input_text, last_timestamp): "cmd": "UPDATE_TOPICS", "topics": incremental_responses, } + except Exception as e: print("Exception" + str(e)) result = None @@ -113,18 +116,21 @@ def channel_send_transcript(channel): # Due to exceptions if one of the earlier batches can't return # a transcript, we don't want to be stuck waiting for the result # With the threshold size of 3, we pop the first(lost) element - elif len(sorted_transcripts) >= 3: - del sorted_transcripts[least_time] + else: + if len(sorted_transcripts) >= 3: + del sorted_transcripts[least_time] except Exception as e: print("Exception", str(e)) pass def get_transcription(frames): + print(type(frames)) + print(type(frames[0])) print("Transcribing..") sorted_transcripts[frames[0].time] = None - out_file = io.BytesIO() - wf = wave.open(out_file, "wb") + audiofilename = "test" + str(datetime.datetime.now()) + wf = wave.open(audiofilename, "wb") wf.setnchannels(CHANNELS) wf.setframerate(RATE) wf.setsampwidth(2) @@ -133,22 +139,48 @@ def get_transcription(frames): wf.writeframes(b"".join(frame.to_ndarray())) wf.close() - # To-Do: Look into WhisperTimeStampLogitsProcessor exception - try: - whisper_result = pipeline(out_file.getvalue(), return_timestamps=True) - except 
Exception as e: - return + result_text = "" - global transcription_text, last_transcribed_time - transcription_text += whisper_result["text"] - duration = whisper_result["chunks"][0]["timestamp"][1] - if not duration: - duration = 5.0 - last_transcribed_time += duration + try: + segments, _ = model.transcribe(audiofilename, + language="en", + beam_size=5, + vad_filter=True, + vad_parameters=dict(min_silence_duration_ms=500) + ) + segments = list(segments) + result_text = "" + duration = 0.0 + for segment in segments: + result_text += segment.text + start_time = segment.start + end_time = segment.end + if not segment.start: + start_time = 0.0 + if not segment.end: + end_time = 5.5 + duration += (end_time - start_time) + + global last_transcribed_time + last_transcribed_time += duration + + except Exception as e: + print("Exception" + str(e)) + pass + + # + try: + os.remove(audiofilename) + except Exception as e: + print("Exception :", str(e)) + pass + + global transcription_text + transcription_text += result_text result = { "cmd": "SHOW_TRANSCRIPTION", - "text": whisper_result["text"] + "text": result_text } sorted_transcripts[frames[0].time] = result return result @@ -167,6 +199,9 @@ def get_final_summary_response(): seconds=round(last_transcribed_time))), "summary": final_summary } + + with open("meeting_titles_and_summaries.txt", "a") as f: + f.write(json.dumps(incremental_responses)) return response @@ -196,7 +231,7 @@ class AudioStreamTrack(MediaStreamTrack): else None ) - if len(transcription_text) > 500: + if len(transcription_text) > 750: llm_input_text = transcription_text transcription_text = "" llm_result = run_in_executor(get_title_and_summary, diff --git a/trials/api.py b/trials/api.py index e69de29b..5e25f4d1 100644 --- a/trials/api.py +++ b/trials/api.py @@ -0,0 +1,56 @@ +import requests +import spacy + +# This is the URL of text-generation-webui +URL = "http://216.153.52.83:5000/api/v1/generate" + +headers = { + "Content-Type": "application/json" +} + + +def split_text_file(filename, token_count): + nlp = spacy.load('en_core_web_md') + + with open(filename, 'r') as file: + text = file.read() + + doc = nlp(text) + total_tokens = len(doc) + + parts = [] + start_index = 0 + + while start_index < total_tokens: + end_index = start_index + token_count + part_tokens = doc[start_index:end_index-5] + part = ' '.join(token.text for token in part_tokens) + parts.append(part) + start_index = end_index + + return parts + + +final_summary = "" +parts = split_text_file("transcript.txt", 1600) +previous_summary = "" + +for part in parts: + prompt = f""" + ### Human: + Given the following text, distill the most important information + into a short summary: {part} + + ### Assistant: + """ + data = { + "prompt": prompt + } + try: + response = requests.post(URL, headers=headers, json=data) + print(response.json()) + except Exception as e: + print(str(e)) + +with open("sum.txt", "w") as sum: + sum.write(" ".join(final_summary)) \ No newline at end of file From ab42858ec835b237cd3e61850b154bf128caa089 Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Tue, 25 Jul 2023 09:52:13 +0530 Subject: [PATCH 02/11] update server to use faster whisper --- server.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/server.py b/server.py index 52f98136..2b9ffd4e 100644 --- a/server.py +++ b/server.py @@ -125,10 +125,13 @@ def channel_send_transcript(channel): def get_transcription(frames): - print(type(frames)) - print(type(frames[0])) print("Transcribing..") 
sorted_transcripts[frames[0].time] = None + + # Passing IO objects instead of temporary files throws an error + # Passing ndarrays (typecasted with float) does not give any + # transcription. Refer issue + # https://github.com/guillaumekln/faster-whisper/issues/369 audiofilename = "test" + str(datetime.datetime.now()) wf = wave.open(audiofilename, "wb") wf.setnchannels(CHANNELS) @@ -148,6 +151,7 @@ def get_transcription(frames): vad_filter=True, vad_parameters=dict(min_silence_duration_ms=500) ) + os.remove(audiofilename) segments = list(segments) result_text = "" duration = 0.0 @@ -161,23 +165,14 @@ def get_transcription(frames): end_time = 5.5 duration += (end_time - start_time) - global last_transcribed_time + global last_transcribed_time, transcription_text last_transcribed_time += duration + transcription_text += result_text except Exception as e: print("Exception" + str(e)) pass - # - try: - os.remove(audiofilename) - except Exception as e: - print("Exception :", str(e)) - pass - - global transcription_text - transcription_text += result_text - result = { "cmd": "SHOW_TRANSCRIPTION", "text": result_text From 25f34bf9e5d1bc37316ebf263dfa6d2b5b0c2398 Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Tue, 25 Jul 2023 10:02:25 +0530 Subject: [PATCH 03/11] organize imports --- __init__.py | 0 format_output.py | 30 ------------------------------ server.py | 34 ++++++++++++++++------------------ stream_client.py | 2 +- utils/config.ini | 32 ++++++++++++++++---------------- utils/format_output.py | 32 ++++++++++++++++++++++++++++++++ utils/text_utilities.py | 6 +++--- utils/viz_utilities.py | 23 +++++++++++------------ 8 files changed, 79 insertions(+), 80 deletions(-) create mode 100644 __init__.py delete mode 100644 format_output.py create mode 100644 utils/format_output.py diff --git a/__init__.py b/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/format_output.py b/format_output.py deleted file mode 100644 index 6cc3006c..00000000 --- a/format_output.py +++ /dev/null @@ -1,30 +0,0 @@ -import json - -with open("meeting_titles_and_summaries.txt", "r") as f: - outputs = f.read() - -outputs = json.loads(outputs) - -transcript_file = open("meeting_transcript.txt", "a") -title_description_file = open("meeting_title_description.txt", "a") - -for item in outputs["topics"]: - transcript_file.write(item["transcript"]) - - title_description_file.write("TITLE: \n") - title_description_file.write(item["title"]) - title_description_file.write("\n") - - title_description_file.write("DESCRIPTION: \n") - title_description_file.write(item["description"]) - title_description_file.write("\n") - - title_description_file.write("TRANSCRIPT: \n") - title_description_file.write(item["transcript"]) - title_description_file.write("\n") - - title_description_file.write("---------------------------------------- \n\n") - - - - diff --git a/server.py b/server.py index 2b9ffd4e..6ff68400 100644 --- a/server.py +++ b/server.py @@ -1,25 +1,23 @@ import asyncio import datetime -import os -import io -import numpy as np import json +import os import uuid import wave from concurrent.futures import ThreadPoolExecutor -from faster_whisper import WhisperModel + import aiohttp_cors -import jax.numpy as jnp import requests from aiohttp import web from aiortc import MediaStreamTrack, RTCPeerConnection, RTCSessionDescription from aiortc.contrib.media import MediaRelay from av import AudioFifo +from faster_whisper import WhisperModel from loguru import logger -from whisper_jax import FlaxWhisperPipline 
-from utils.run_utils import run_in_executor from sortedcontainers import SortedDict +from utils.run_utils import run_in_executor + pcs = set() relay = MediaRelay() data_channel = None @@ -45,7 +43,7 @@ blacklisted_messages = [" Thank you.", " See you next time!", def get_title_and_summary(llm_input_text, last_timestamp): - print("Generating title and summary") + ("Generating title and summary") # output = llm.generate(prompt) # Use monadical-ml to fire this query to an LLM and get result @@ -69,13 +67,13 @@ def get_title_and_summary(llm_input_text, last_timestamp): "prompt": prompt } - # To-do: Handle unexpected output formats from the model + # TODO : Handle unexpected output formats from the model try: response = requests.post(LLM_URL, headers=headers, json=data) output = json.loads(response.json()["results"][0]["text"]) output["description"] = output.pop("summary") output["transcript"] = llm_input_text - output["timestamp"] =\ + output["timestamp"] = \ str(datetime.timedelta(seconds=round(last_timestamp))) incremental_responses.append(output) result = { @@ -84,13 +82,13 @@ def get_title_and_summary(llm_input_text, last_timestamp): } except Exception as e: - print("Exception" + str(e)) + logger.info("Exception" + str(e)) result = None return result def channel_log(channel, t, message): - print("channel(%s) %s %s" % (channel.label, t, message)) + logger.info("channel(%s) %s %s" % (channel.label, t, message)) def channel_send(channel, message): @@ -120,17 +118,18 @@ def channel_send_transcript(channel): if len(sorted_transcripts) >= 3: del sorted_transcripts[least_time] except Exception as e: - print("Exception", str(e)) + logger.info("Exception", str(e)) pass def get_transcription(frames): - print("Transcribing..") + logger.info("Transcribing..") sorted_transcripts[frames[0].time] = None + # TODO: # Passing IO objects instead of temporary files throws an error # Passing ndarrays (typecasted with float) does not give any - # transcription. Refer issue + # transcription. 
Refer issue, # https://github.com/guillaumekln/faster-whisper/issues/369 audiofilename = "test" + str(datetime.datetime.now()) wf = wave.open(audiofilename, "wb") @@ -170,7 +169,7 @@ def get_transcription(frames): transcription_text += result_text except Exception as e: - print("Exception" + str(e)) + logger.info("Exception" + str(e)) pass result = { @@ -195,7 +194,7 @@ def get_final_summary_response(): "summary": final_summary } - with open("meeting_titles_and_summaries.txt", "a") as f: + with open("./artefacts/meeting_titles_and_summaries.txt", "a") as f: f.write(json.dumps(incremental_responses)) return response @@ -275,7 +274,6 @@ async def offer(request): if isinstance(message, str) and message.startswith("ping"): channel_send(channel, "pong" + message[4:]) - @pc.on("connectionstatechange") async def on_connectionstatechange(): log_info("Connection state is " + pc.connectionState) diff --git a/stream_client.py b/stream_client.py index 124c734d..1ed9cf31 100644 --- a/stream_client.py +++ b/stream_client.py @@ -114,7 +114,7 @@ class StreamClient: self.channel_log(channel, "<", message) if isinstance(message, str) and message.startswith("pong"): - elapsed_ms = (self.current_stamp() - int(message[5:]))\ + elapsed_ms = (self.current_stamp() - int(message[5:])) \ / 1000 print(" RTT %.2f ms" % elapsed_ms) diff --git a/utils/config.ini b/utils/config.ini index 976f4a32..9ba12959 100644 --- a/utils/config.ini +++ b/utils/config.ini @@ -1,24 +1,24 @@ [DEFAULT] #SetexceptionruleforOpenMPerrortoallowduplicatelibinitialization -KMP_DUPLICATE_LIB_OK=TRUE +KMP_DUPLICATE_LIB_OK = TRUE #ExportOpenAIAPIKey -OPENAI_APIKEY= +OPENAI_APIKEY = #ExportWhisperModelSize -WHISPER_MODEL_SIZE=tiny -WHISPER_REAL_TIME_MODEL_SIZE=tiny +WHISPER_MODEL_SIZE = tiny +WHISPER_REAL_TIME_MODEL_SIZE = tiny #AWSconfig -AWS_ACCESS_KEY=***REMOVED*** -AWS_SECRET_KEY=***REMOVED*** -BUCKET_NAME=reflector-bucket +AWS_ACCESS_KEY = ***REMOVED*** +AWS_SECRET_KEY = ***REMOVED*** +BUCKET_NAME = reflector-bucket #Summarizerconfig -SUMMARY_MODEL=facebook/bart-large-cnn -INPUT_ENCODING_MAX_LENGTH=1024 -MAX_LENGTH=2048 -BEAM_SIZE=6 -MAX_CHUNK_LENGTH=1024 -SUMMARIZE_USING_CHUNKS=YES +SUMMARY_MODEL = facebook/bart-large-cnn +INPUT_ENCODING_MAX_LENGTH = 1024 +MAX_LENGTH = 2048 +BEAM_SIZE = 6 +MAX_CHUNK_LENGTH = 1024 +SUMMARIZE_USING_CHUNKS = YES #Audiodevice -BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME=aggregator -AV_FOUNDATION_DEVICE_ID=1 +BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME = aggregator +AV_FOUNDATION_DEVICE_ID = 1 # LLM PATH -LLM_PATH= +LLM_PATH = diff --git a/utils/format_output.py b/utils/format_output.py new file mode 100644 index 00000000..4f026ce2 --- /dev/null +++ b/utils/format_output.py @@ -0,0 +1,32 @@ +import json + +with open("../artefacts/meeting_titles_and_summaries.txt", "r") as f: + outputs = f.read() + +outputs = json.loads(outputs) + +transcript_file = open("../artefacts/meeting_transcript.txt", "a") +title_desc_file = open("../artefacts/meeting_title_description.txt", "a") +summary_file = open("../artefacts/meeting_summary.txt", "a") + +for item in outputs["topics"]: + transcript_file.write(item["transcript"]) + summary_file.write(item["description"]) + + title_desc_file.write("TITLE: \n") + title_desc_file.write(item["title"]) + title_desc_file.write("\n") + + title_desc_file.write("DESCRIPTION: \n") + title_desc_file.write(item["description"]) + title_desc_file.write("\n") + + title_desc_file.write("TRANSCRIPT: \n") + title_desc_file.write(item["transcript"]) + title_desc_file.write("\n") + + 
title_desc_file.write("---------------------------------------- \n\n") + +transcript_file.close() +title_desc_file.close() +summary_file.close() diff --git a/utils/text_utilities.py b/utils/text_utilities.py index ef15c7a3..6210e78e 100644 --- a/utils/text_utilities.py +++ b/utils/text_utilities.py @@ -6,8 +6,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity from transformers import BartForConditionalGeneration, BartTokenizer -from utils.log_utils import logger -from utils.run_utils import config +from log_utils import logger +from run_utils import config nltk.download('punkt', quiet=True) @@ -171,7 +171,7 @@ def summarize(transcript_text, timestamp, output_filename = "real_time_" + output_filename if summarize_using_chunks != "YES": - inputs = tokenizer.\ + inputs = tokenizer. \ batch_encode_plus([transcript_text], truncation=True, padding='longest', max_length=int(config["DEFAULT"]["INPUT_ENCODING_MAX_LENGTH"]), diff --git a/utils/viz_utilities.py b/utils/viz_utilities.py index 93a9b56f..6da24bb0 100644 --- a/utils/viz_utilities.py +++ b/utils/viz_utilities.py @@ -13,7 +13,7 @@ from wordcloud import STOPWORDS, WordCloud en = spacy.load('en_core_web_md') spacy_stopwords = en.Defaults.stop_words -STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))).\ +STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))). \ union(set(spacy_stopwords)) @@ -24,7 +24,7 @@ def create_wordcloud(timestamp, real_time=False): """ filename = "transcript" if real_time: - filename = "real_time_" + filename + "_" +\ + filename = "real_time_" + filename + "_" + \ timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt" else: filename += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt" @@ -47,7 +47,7 @@ def create_wordcloud(timestamp, real_time=False): wordcloud_name = "wordcloud" if real_time: - wordcloud_name = "real_time_" + wordcloud_name + "_" +\ + wordcloud_name = "real_time_" + wordcloud_name + "_" + \ timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png" else: wordcloud_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png" @@ -57,12 +57,12 @@ def create_wordcloud(timestamp, real_time=False): def create_talk_diff_scatter_viz(timestamp, real_time=False): """ - Perform agenda vs transription diff to see covered topics. + Perform agenda vs transcription diff to see covered topics. Create a scatter plot of words in topics. :return: None. Saved locally. 
""" - spaCy_model = "en_core_web_md" - nlp = spacy.load(spaCy_model) + spacy_model = "en_core_web_md" + nlp = spacy.load(spacy_model) nlp.add_pipe('sentencizer') agenda_topics = [] @@ -75,12 +75,11 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False): agenda_topics.append(line.split(":")[0]) # Load the transcription with timestamp - filename = "" if real_time: - filename = "./artefacts/real_time_transcript_with_timestamp_" +\ + filename = "./artefacts/real_time_transcript_with_timestamp_" + \ timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt" else: - filename = "./artefacts/transcript_with_timestamp_" +\ + filename = "./artefacts/transcript_with_timestamp_" + \ timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt" with open(filename) as f: transcription_timestamp_text = f.read() @@ -142,7 +141,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False): df = df.apply(create_new_columns, axis=1) - # Count the number of items covered and calculatre the percentage + # Count the number of items covered and calculate the percentage num_covered_items = sum(covered_items.values()) percentage_covered = num_covered_items / len(agenda) * 100 @@ -158,7 +157,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False): # Save df, mappings for further experimentation df_name = "df" if real_time: - df_name = "real_time_" + df_name + "_" +\ + df_name = "real_time_" + df_name + "_" + \ timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl" else: df_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl" @@ -169,7 +168,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False): mappings_name = "mappings" if real_time: - mappings_name = "real_time_" + mappings_name + "_" +\ + mappings_name = "real_time_" + mappings_name + "_" + \ timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl" else: mappings_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl" From 1672be0383111b07c5d6b2e2602b433422cbb006 Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Tue, 25 Jul 2023 10:15:16 +0530 Subject: [PATCH 04/11] organize trails --- trials/__init__.py | 0 trials/api.py | 13 +- trials/gpt2.py | 125 +++++++++--------- trials/incsum.py | 29 ++-- trials/openai_endpoint.py | 13 +- trials/whisper-jax/__init__.py | 0 whisjax.py => trials/whisper-jax/whisjax.py | 16 +-- .../whisper-jax/whisjax_realtime.py | 10 +- 8 files changed, 105 insertions(+), 101 deletions(-) create mode 100644 trials/__init__.py create mode 100644 trials/whisper-jax/__init__.py rename whisjax.py => trials/whisper-jax/whisjax.py (93%) rename whisjax_realtime.py => trials/whisper-jax/whisjax_realtime.py (94%) diff --git a/trials/__init__.py b/trials/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/trials/api.py b/trials/api.py index 5e25f4d1..eb6a1fbb 100644 --- a/trials/api.py +++ b/trials/api.py @@ -1,11 +1,13 @@ import requests import spacy +# Enter the Machine where the LLM is hosted +LLM_MACHINE_IP = "" # This is the URL of text-generation-webui -URL = "http://216.153.52.83:5000/api/v1/generate" +URL = f"http://{LLM_MACHINE_IP}:5000/api/v1/generate" headers = { - "Content-Type": "application/json" + "Content-Type": "application/json" } @@ -23,7 +25,7 @@ def split_text_file(filename, token_count): while start_index < total_tokens: end_index = start_index + token_count - part_tokens = doc[start_index:end_index-5] + part_tokens = doc[start_index:end_index - 5] part = ' '.join(token.text for token in part_tokens) parts.append(part) start_index = end_index @@ -33,7 +35,6 @@ def split_text_file(filename, 
token_count): final_summary = "" parts = split_text_file("transcript.txt", 1600) -previous_summary = "" for part in parts: prompt = f""" @@ -52,5 +53,5 @@ for part in parts: except Exception as e: print(str(e)) -with open("sum.txt", "w") as sum: - sum.write(" ".join(final_summary)) \ No newline at end of file +with open("summary.txt", "w") as sum: + sum.write(" ".join(final_summary)) diff --git a/trials/gpt2.py b/trials/gpt2.py index d3917af2..1930a2d2 100644 --- a/trials/gpt2.py +++ b/trials/gpt2.py @@ -1,65 +1,66 @@ -# # Approach 1 -# from transformers import GPTNeoForCausalLM, GPT2Tokenizer -# -# model_name = 'EleutherAI/gpt-neo-1.3B' -# tokenizer = GPT2Tokenizer.from_pretrained(model_name) -# model = GPTNeoForCausalLM.from_pretrained(model_name) -# -# conversation = """ -# Summarize the following conversation in 3 key sentences: -# -# We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . -# Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . -# Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . -# Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . -# Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . -# Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas . -# """ -# -# input_ids = tokenizer.encode(conversation, return_tensors='pt') -# -# output = model.generate(input_ids, -# max_length=30, -# num_return_sequences=1) -# -# caption = tokenizer.decode(output[0], skip_special_tokens=True) -# print("Caption:", caption[len(input_ids):]) +# Approach 1 +from transformers import GPTNeoForCausalLM, GPT2Tokenizer -# -# # Approach 2 -# import torch -# from transformers import GPT2LMHeadModel, GPT2Tokenizer -# -# model_name = "gpt2" -# tokenizer = GPT2Tokenizer.from_pretrained(model_name) -# model = GPT2LMHeadModel.from_pretrained(model_name) -# -# model.eval() -# -# text = """ -# You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . 
And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . " -# """ -# -# tokenizer.pad_token = tokenizer.eos_token -# input_ids = tokenizer.encode(text, -# max_length=100, -# truncation=True, -# return_tensors="pt") -# attention_mask = torch.ones(input_ids.shape, dtype=torch.long) -# output = model.generate(input_ids, -# max_new_tokens=20, -# num_return_sequences=1, -# num_beams=2, -# attention_mask=attention_mask) -# -# chapter_titles = [tokenizer.decode(output[i], skip_special_tokens=True) for i in range(output.shape[0])] -# for i, title in enumerate(chapter_titles): -# print("Caption: ", title) +model_name = 'EleutherAI/gpt-neo-1.3B' +tokenizer = GPT2Tokenizer.from_pretrained(model_name) +model = GPTNeoForCausalLM.from_pretrained(model_name) + +conversation = """ +Summarize the following conversation in 3 key sentences: + +We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . +Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . +Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . +Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . +Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . +Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas . 
+""" + +input_ids = tokenizer.encode(conversation, return_tensors='pt') + +output = model.generate(input_ids, + max_length=30, + num_return_sequences=1) + +caption = tokenizer.decode(output[0], skip_special_tokens=True) +print("Caption:", caption[len(input_ids):]) + + +# Approach 2 +import torch +from transformers import GPT2LMHeadModel, GPT2Tokenizer + +model_name = "gpt2" +tokenizer = GPT2Tokenizer.from_pretrained(model_name) +model = GPT2LMHeadModel.from_pretrained(model_name) + +model.eval() + +text = """ +You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . 
" +""" + +tokenizer.pad_token = tokenizer.eos_token +input_ids = tokenizer.encode(text, + max_length=100, + truncation=True, + return_tensors="pt") +attention_mask = torch.ones(input_ids.shape, dtype=torch.long) +output = model.generate(input_ids, + max_new_tokens=20, + num_return_sequences=1, + num_beams=2, + attention_mask=attention_mask) + +chapter_titles = [tokenizer.decode(output[i], skip_special_tokens=True) for i in range(output.shape[0])] +for i, title in enumerate(chapter_titles): + print("Caption: ", title) # Approach 3 import torch -from transformers import GPT2Tokenizer, GPT2LMHeadModel +from transformers import GPT2LMHeadModel, GPT2Tokenizer + def generate_response(conversation, max_length=100): input_text = "" @@ -79,20 +80,22 @@ def generate_response(conversation, max_length=100): response = tokenizer.decode(output[0], skip_special_tokens=True) return response + if __name__ == "__main__": + + # Call appropriate approach from the main while experimenting model_name = "gpt2" model = GPT2LMHeadModel.from_pretrained(model_name) tokenizer = GPT2Tokenizer.from_pretrained(model_name) sample_chunks = [ - "You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . 
" + "You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . " ] conversation = [ - {"role": "system", "content": "Summarize this text" }, - {"role": "user", "content": " text : " + sample_chunks[0]}, + {"role": "system", "content": "Summarize this text"}, + {"role": "user", "content": " text : " + sample_chunks[0]}, ] response = generate_response(conversation) print("Response:", response) - diff --git a/trials/incsum.py b/trials/incsum.py index 41b3d500..5081d16c 100644 --- a/trials/incsum.py +++ b/trials/incsum.py @@ -1,9 +1,11 @@ +import spacy +import sys + + # Observe the incremental summaries by performing summaries in chunks with open("transcript.txt") as f: transcription = f.read() -import spacy - def split_text_file(filename, token_count): nlp = spacy.load('en_core_web_md') @@ -26,8 +28,9 @@ def split_text_file(filename, token_count): return parts + # Set the chunk length here to split the transcript and test -MAX_CHUNK_LENGTH=1000 +MAX_CHUNK_LENGTH = 1000 chunks = split_text_file("transcript.txt", MAX_CHUNK_LENGTH) print("Number of chunks", len(chunks)) @@ -41,19 +44,17 @@ with open("chunks" + str(MAX_CHUNK_LENGTH) + ".txt", "a") as f: # ex. 
python incsum.py 1 => will run approach 1 # If no input, will run all approaches -import sys try: index = sys.argv[1] except: index = None - # Approach 1 : facebook/bart-large-cnn if index == "1" or index is None: - SUMMARY_MODEL="facebook/bart-large-cnn" - MIN_LENGTH=5 - MAX_LENGTH=10 - BEAM_SIZE=2 + SUMMARY_MODEL = "facebook/bart-large-cnn" + MIN_LENGTH = 5 + MAX_LENGTH = 10 + BEAM_SIZE = 2 print("Performing chunk summary : " + SUMMARY_MODEL) @@ -81,7 +82,6 @@ if index == "1" or index is None: for summary in summaries: f.write(summary + "\n\n") - # Approach 2 if index == "2" or index is None: print("Performing chunk summary : " + "gpt-neo-1.3B") @@ -108,14 +108,14 @@ if index == "2" or index is None: max_length=max_length, attention_mask=attention_mask, pad_token_id=model.config.eos_token_id, - num_beams=4, - length_penalty=2.0, - early_stopping=True) + num_beams=4, + length_penalty=2.0, + early_stopping=True) summary_ids = output[0, input_length:] summary = tokenizer.decode(summary_ids, skip_special_tokens=True) summaries.append(summary) with open("gptneo1.3B-summaries.txt", "a") as f: - f.write(summary + "\n\n") + f.write(summary + "\n\n") # Approach 3 if index == "3" or index is None: @@ -155,4 +155,3 @@ if index == "3" or index is None: with open("mpt-7b-summaries.txt", "a") as f: for summary in summaries: f.write(summary + "\n\n") - diff --git a/trials/openai_endpoint.py b/trials/openai_endpoint.py index 30e6a900..7a572353 100644 --- a/trials/openai_endpoint.py +++ b/trials/openai_endpoint.py @@ -11,14 +11,15 @@ openai.api_key = "" # to incremental summarize, user prompt used : "summarize this conversation in a few sentences by taking key points" # max_tokens=300 -sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . 
Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . ", - " We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas ."] +sample_chunks = [ + "You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . 
Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . ", + " We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas ."] conversation = [ - {"role": "system", - "content": sample_chunks[1]}, - {"role": "user", - "content": "summarize this conversation in a few sentences by taking key points"} + {"role": "system", + "content": sample_chunks[1]}, + {"role": "user", + "content": "summarize this conversation in a few sentences by taking key points"} ] model = "gpt-3.5-turbo" diff --git a/trials/whisper-jax/__init__.py b/trials/whisper-jax/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/whisjax.py b/trials/whisper-jax/whisjax.py similarity index 93% rename from whisjax.py rename to trials/whisper-jax/whisjax.py index cfc95574..98f718f3 100644 --- a/whisjax.py +++ b/trials/whisper-jax/whisjax.py @@ -18,11 +18,11 @@ import nltk import yt_dlp as youtube_dl from whisper_jax import FlaxWhisperPipline -from utils.file_utils import download_files, upload_files -from utils.log_utils import logger -from utils.run_utils import config -from utils.text_utilities import post_process_transcription, summarize -from utils.viz_utilities import create_talk_diff_scatter_viz, create_wordcloud +from ...utils.file_utils import download_files, upload_files +from ...utils.log_utils import logger +from ...utils.run_utils import config +from ...utils.text_utilities import post_process_transcription, summarize +from ...utils.viz_utilities import create_talk_diff_scatter_viz, create_wordcloud nltk.download('punkt', quiet=True) nltk.download('stopwords', quiet=True) @@ -30,8 +30,8 @@ nltk.download('stopwords', quiet=True) WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"] NOW = datetime.now() -if not os.path.exists('./artefacts'): - os.makedirs('./artefacts') +if not os.path.exists('../../artefacts'): + os.makedirs('../../artefacts') def init_argparse() -> argparse.ArgumentParser: @@ -91,7 +91,7 @@ def main(): # Download the audio with youtube_dl.YoutubeDL(ydl_opts) as ydl: ydl.download([args.location]) - media_file = "./artefacts/audio.mp3" + media_file = "../artefacts/audio.mp3" logger.info("Saved downloaded YouTube video to: " + 
media_file) else: diff --git a/whisjax_realtime.py b/trials/whisper-jax/whisjax_realtime.py similarity index 94% rename from whisjax_realtime.py rename to trials/whisper-jax/whisjax_realtime.py index 63eab04d..d1ec1a82 100644 --- a/whisjax_realtime.py +++ b/trials/whisper-jax/whisjax_realtime.py @@ -10,11 +10,11 @@ from pynput import keyboard from termcolor import colored from whisper_jax import FlaxWhisperPipline -from utils.file_utils import upload_files -from utils.log_utils import logger -from utils.run_utils import config -from utils.text_utilities import post_process_transcription, summarize -from utils.viz_utilities import create_talk_diff_scatter_viz, create_wordcloud +from ...utils.file_utils import upload_files +from ...utils.log_utils import logger +from ...utils.run_utils import config +from ...utils.text_utilities import post_process_transcription, summarize +from ...utils.viz_utilities import create_talk_diff_scatter_viz, create_wordcloud WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"] From cec8bbcf6c6b70092de14ebbbd6f4c0ec2ddb884 Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Tue, 25 Jul 2023 10:22:46 +0530 Subject: [PATCH 05/11] move all experiments to trials --- trials/bert.py | 43 +++++++++++++++++ trials/pegasus.py | 33 +++++++++++++ trials/t5.py | 27 +++++++++++ trials/vicuna.py | 44 +++++++++++++++++ trials/youtube_scraping.py | 98 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 245 insertions(+) create mode 100644 trials/bert.py create mode 100644 trials/pegasus.py create mode 100644 trials/t5.py create mode 100644 trials/vicuna.py create mode 100644 trials/youtube_scraping.py diff --git a/trials/bert.py b/trials/bert.py new file mode 100644 index 00000000..a79bb76d --- /dev/null +++ b/trials/bert.py @@ -0,0 +1,43 @@ +import torch +from transformers import BertTokenizer, BertModel +from sentence_transformers import SentenceTransformer +from sklearn.metrics.pairwise import cosine_similarity + +# Load the pre-trained BERT model and tokenizer +model_name = "bert-base-uncased" +model = BertModel.from_pretrained(model_name) +tokenizer = BertTokenizer.from_pretrained(model_name) + +# Set the device to use +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model.to(device) + +# Load the SentenceTransformer model +sentence_transformer_model = SentenceTransformer('average_word_embeddings_glove.6B.300d') + +# Define the input text +text = "Your input text to be summarized goes here." + +# Tokenize the text +tokens = tokenizer.tokenize(text) +input_ids = tokenizer.convert_tokens_to_ids(tokens) +input_ids = torch.tensor([input_ids]).to(device) + +# Get the BERT model output +with torch.no_grad(): + outputs = model(input_ids)[0] # Extract the last hidden states + +# Calculate sentence embeddings +sentence_embeddings = outputs.mean(dim=1).squeeze().cpu().numpy() +input_text_embedding = sentence_transformer_model.encode([text])[0] + +# Calculate cosine similarity between sentences and input text +similarity_scores = cosine_similarity([input_text_embedding], sentence_embeddings) + +# Sort the sentences by similarity scores in descending order +sorted_sentences = [sent for _, sent in sorted(zip(similarity_scores[0], sentences), reverse=True)] + +# Choose the top sentences as the summary +num_summary_sentences = 2 # Adjust as needed +summary = ". 
".join(sorted_sentences[:num_summary_sentences]) +print("Summary:", summary) diff --git a/trials/pegasus.py b/trials/pegasus.py new file mode 100644 index 00000000..884ed3ee --- /dev/null +++ b/trials/pegasus.py @@ -0,0 +1,33 @@ +from transformers import PegasusForConditionalGeneration, PegasusTokenizer +import torch +# Load the Pegasus model and tokenizer +model_name = "google/pegasus-large" +model = PegasusForConditionalGeneration.from_pretrained(model_name) +tokenizer = PegasusTokenizer.from_pretrained(model_name) + +# Set the device to use +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model.to(device) + +sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . ", + " We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . 
Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas ."] + + +# Define the input text for summarization +text = sample_chunks[1] + +inputs = tokenizer(text, truncation=True, padding="longest", return_tensors="pt").to(device) + +# Generate the summary +summary_ids = model.generate( + inputs["input_ids"], + attention_mask=inputs["attention_mask"], + max_length=200, + num_beams=4, + length_penalty=2.0, + early_stopping=True, +) + +# Decode and print the summary +summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True) +print("Summary:", summary) diff --git a/trials/t5.py b/trials/t5.py new file mode 100644 index 00000000..0c366ac6 --- /dev/null +++ b/trials/t5.py @@ -0,0 +1,27 @@ +from transformers import T5ForConditionalGeneration, T5Tokenizer +import torch +# Load the T5 model and tokenizer +model_name = "t5-base" +model = T5ForConditionalGeneration.from_pretrained(model_name) +tokenizer = T5Tokenizer.from_pretrained(model_name) + +# Set the device to use +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model.to(device) + +sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . 
Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . ", + " We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas ."] + + +# Define the input text for summarization +text = "Summarize the following text in 3 key points. text : " + sample_chunks[1] + +# Tokenize the input text +inputs = tokenizer.encode(text, return_tensors="pt").to(device) + +# Generate the summary +summary_ids = model.generate(inputs, max_length=1000, num_beams=4, early_stopping=True) + +# Decode and print the summary +summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True) +print("Summary:", summary) diff --git a/trials/vicuna.py b/trials/vicuna.py new file mode 100644 index 00000000..588869c0 --- /dev/null +++ b/trials/vicuna.py @@ -0,0 +1,44 @@ +from gpt4all import GPT4All + +model = GPT4All("/Users/gokulmohanarangan/Library/Application Support/nomic.ai/GPT4All/ggml-vicuna-13b-1.1-q4_2.bin") + +import spacy + + +def split_text_file(filename, token_count): + nlp = spacy.load('en_core_web_md') + + with open(filename, 'r') as file: + text = file.read() + + doc = nlp(text) + total_tokens = len(doc) + + parts = [] + start_index = 0 + + while start_index < total_tokens: + end_index = start_index + token_count + part_tokens = doc[start_index:end_index] + part = ' '.join(token.text for token in part_tokens) + parts.append(part) + start_index = end_index + + return parts + +parts = split_text_file("transcript.txt", 1800) +final_summary = [] +for part in parts: + prompt = f""" + ### Human: + Summarize the following text without missing any key points and action items. 
+ + {part} + ### Assistant: + """ + output = model.generate(prompt) + final_summary.append(output) + + +with open("sum.txt", "w") as sum: + sum.write(" ".join(final_summary)) diff --git a/trials/youtube_scraping.py b/trials/youtube_scraping.py new file mode 100644 index 00000000..b0892f47 --- /dev/null +++ b/trials/youtube_scraping.py @@ -0,0 +1,98 @@ +import json +import yt_dlp as youtube_dl +from whisper_jax import FlaxWhisperPipline +import jax.numpy as jnp + +# Function to extract chapter information from a YouTube video URL +def get_youtube_chapters(video_id): + video_url = "https://www.youtube.com/watch?v=" + video_id + ydl_opts = { + 'extract_flat': 'in_playlist', + 'skip_download': True, + 'quiet': True, + } + + with youtube_dl.YoutubeDL(ydl_opts) as ydl: + video_info = ydl.extract_info(video_url, download=False) + + chapters = [] + + if 'chapters' in video_info: + for chapter in video_info['chapters']: + start_time = chapter['start_time'] + end_time = chapter['end_time'] + title = chapter['title'] + + chapters.append({ + 'start': start_time, + 'end': end_time, + 'title': title + }) + + return chapters + + +# Function to extract video transcription using yt_dlp +def get_youtube_transcription(video_id): + ydl_opts = { + 'format': 'bestaudio/best', + 'postprocessors': [{ + 'key': 'FFmpegExtractAudio', + 'preferredcodec': 'mp3', + 'preferredquality': '192', + }], + 'outtmpl': './artefacts/audio', # Specify output file path and name + } + + # Download the audio + with youtube_dl.YoutubeDL(ydl_opts) as ydl: + ydl.download(["https://www.youtube.com/watch?v=" + video_id]) + media_file = "./artefacts/audio.mp3" + + pipeline = FlaxWhisperPipline("openai/whisper-" + "tiny", + dtype=jnp.float16, + batch_size=16) + whisper_result = pipeline(media_file, return_timestamps=True) + return whisper_result["chunks"] + + + +# Function to scrape YouTube video transcripts and chapter information +def scrape_youtube_data(video_id): + transcript_text = get_youtube_transcription(video_id) + chapters = get_youtube_chapters(video_id) + print("transcript_text", transcript_text) + print("chapters", chapters) + return transcript_text, chapters + + +# Function to generate fine-tuning dataset from YouTube data +def generate_finetuning_dataset(video_ids): + prompt_completion_pairs = [] + for video_id in video_ids: + transcript_text, chapters = scrape_youtube_data(video_id) + if transcript_text is not None and chapters is not None: + for chapter in chapters: + start_time = chapter["start"] + end_time = chapter["end"] + chapter_text = chapter["title"] + + prompt = "" + for transcript in transcript_text: + if transcript["timestamp"][0] >= start_time and transcript["timestamp"][1] < end_time: + prompt += transcript["text"] + + if prompt is not None: + completion = chapter_text + prompt_completion_pairs.append({"prompt": prompt, "completion": completion}) + + return prompt_completion_pairs + + +# Add all the video ids here, the videos must have captions [chapters] +video_ids = ["yTnSEZIwnkU"] +dataset = generate_finetuning_dataset(video_ids) + +with open("finetuning_dataset.jsonl", "w") as f: + for example in dataset: + f.write(json.dumps(example) + "\n") From 8be41647febe69d38b18ba94c57c20bed79f9524 Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Tue, 25 Jul 2023 10:35:47 +0530 Subject: [PATCH 06/11] flake8 checks --- server.py | 20 ++++++------ .../server_multithreaded.py | 4 +-- trials/whisper-jax/whisjax.py | 4 +-- trials/whisper-jax/whisjax_realtime.py | 4 +-- utils/{text_utilities.py => text_utils.py} | 
32 ++++++++++++------- utils/{viz_utilities.py => viz_utils.py} | 8 ++--- 6 files changed, 40 insertions(+), 32 deletions(-) rename server_multithreaded.py => trials/server_multithreaded.py (98%) rename utils/{text_utilities.py => text_utils.py} (85%) rename utils/{viz_utilities.py => viz_utils.py} (97%) diff --git a/server.py b/server.py index 6ff68400..290e0456 100644 --- a/server.py +++ b/server.py @@ -53,11 +53,11 @@ def get_title_and_summary(llm_input_text, last_timestamp): prompt = f""" ### Human: - Create a JSON object as response. The JSON object must have 2 fields: - i) title and ii) summary. For the title field,generate a short title - for the given text. For the summary field, summarize the given text + Create a JSON object as response. The JSON object must have 2 fields: + i) title and ii) summary. For the title field,generate a short title + for the given text. For the summary field, summarize the given text in three sentences. - + {llm_input_text} ### Assistant: @@ -144,12 +144,12 @@ def get_transcription(frames): result_text = "" try: - segments, _ = model.transcribe(audiofilename, - language="en", - beam_size=5, - vad_filter=True, - vad_parameters=dict(min_silence_duration_ms=500) - ) + segments, _ = \ + model.transcribe(audiofilename, + language="en", + beam_size=5, + vad_filter=True, + vad_parameters=dict(min_silence_duration_ms=500)) os.remove(audiofilename) segments = list(segments) result_text = "" diff --git a/server_multithreaded.py b/trials/server_multithreaded.py similarity index 98% rename from server_multithreaded.py rename to trials/server_multithreaded.py index 2862fa36..1d27dfdb 100644 --- a/server_multithreaded.py +++ b/trials/server_multithreaded.py @@ -16,8 +16,8 @@ from av import AudioFifo from sortedcontainers import SortedDict from whisper_jax import FlaxWhisperPipline -from utils.log_utils import logger -from utils.run_utils import config, Mutex +from ..utils.log_utils import logger +from ..utils.run_utils import config, Mutex WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_REAL_TIME_MODEL_SIZE"] pcs = set() diff --git a/trials/whisper-jax/whisjax.py b/trials/whisper-jax/whisjax.py index 98f718f3..eb87629d 100644 --- a/trials/whisper-jax/whisjax.py +++ b/trials/whisper-jax/whisjax.py @@ -21,8 +21,8 @@ from whisper_jax import FlaxWhisperPipline from ...utils.file_utils import download_files, upload_files from ...utils.log_utils import logger from ...utils.run_utils import config -from ...utils.text_utilities import post_process_transcription, summarize -from ...utils.viz_utilities import create_talk_diff_scatter_viz, create_wordcloud +from ...utils.text_utils import post_process_transcription, summarize +from ...utils.viz_utils import create_talk_diff_scatter_viz, create_wordcloud nltk.download('punkt', quiet=True) nltk.download('stopwords', quiet=True) diff --git a/trials/whisper-jax/whisjax_realtime.py b/trials/whisper-jax/whisjax_realtime.py index d1ec1a82..efb39461 100644 --- a/trials/whisper-jax/whisjax_realtime.py +++ b/trials/whisper-jax/whisjax_realtime.py @@ -13,8 +13,8 @@ from whisper_jax import FlaxWhisperPipline from ...utils.file_utils import upload_files from ...utils.log_utils import logger from ...utils.run_utils import config -from ...utils.text_utilities import post_process_transcription, summarize -from ...utils.viz_utilities import create_talk_diff_scatter_viz, create_wordcloud +from ...utils.text_utils import post_process_transcription, summarize +from ...utils.viz_utils import create_talk_diff_scatter_viz, create_wordcloud 
WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"] diff --git a/utils/text_utilities.py b/utils/text_utils.py similarity index 85% rename from utils/text_utilities.py rename to utils/text_utils.py index 6210e78e..25126b34 100644 --- a/utils/text_utilities.py +++ b/utils/text_utils.py @@ -154,7 +154,7 @@ def chunk_text(text, def summarize(transcript_text, timestamp, real_time=False, - summarize_using_chunks=config["DEFAULT"]["SUMMARIZE_USING_CHUNKS"]): + chunk_summarize=config["DEFAULT"]["SUMMARIZE_USING_CHUNKS"]): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") summary_model = config["DEFAULT"]["SUMMARY_MODEL"] if not summary_model: @@ -166,27 +166,35 @@ def summarize(transcript_text, timestamp, model = BartForConditionalGeneration.from_pretrained(summary_model) model = model.to(device) - output_filename = "summary_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt" + output_file = "summary_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt" if real_time: - output_filename = "real_time_" + output_filename + output_file = "real_time_" + output_file - if summarize_using_chunks != "YES": + if chunk_summarize != "YES": + max_length = int(config["DEFAULT"]["INPUT_ENCODING_MAX_LENGTH"]) inputs = tokenizer. \ batch_encode_plus([transcript_text], truncation=True, padding='longest', - max_length=int(config["DEFAULT"]["INPUT_ENCODING_MAX_LENGTH"]), + max_length=max_length, return_tensors='pt') inputs = inputs.to(device) with torch.no_grad(): + num_beans = int(config["DEFAULT"]["BEAM_SIZE"]) + max_length = int(config["DEFAULT"]["MAX_LENGTH"]) summaries = model.generate(inputs['input_ids'], - num_beams=int(config["DEFAULT"]["BEAM_SIZE"]), length_penalty=2.0, - max_length=int(config["DEFAULT"]["MAX_LENGTH"]), early_stopping=True) + num_beams=num_beans, + length_penalty=2.0, + max_length=max_length, + early_stopping=True) - decoded_summaries = [tokenizer.decode(summary, skip_special_tokens=True, clean_up_tokenization_spaces=False) - for summary in summaries] + decoded_summaries = \ + [tokenizer.decode(summary, + skip_special_tokens=True, + clean_up_tokenization_spaces=False) + for summary in summaries] summary = " ".join(decoded_summaries) - with open("./artefacts/" + output_filename, 'w') as f: + with open("./artefacts/" + output_file, 'w') as f: f.write(summary.strip() + "\n") else: logger.info("Breaking transcript into smaller chunks") @@ -195,8 +203,8 @@ def summarize(transcript_text, timestamp, logger.info(f"Transcript broken into {len(chunks)} " f"chunks of at most 500 words") - logger.info(f"Writing summary text to: {output_filename}") - with open(output_filename, 'w') as f: + logger.info(f"Writing summary text to: {output_file}") + with open(output_file, 'w') as f: summaries = summarize_chunks(chunks, tokenizer, model) for summary in summaries: f.write(summary.strip() + " ") diff --git a/utils/viz_utilities.py b/utils/viz_utils.py similarity index 97% rename from utils/viz_utilities.py rename to utils/viz_utils.py index 6da24bb0..d7debd0c 100644 --- a/utils/viz_utilities.py +++ b/utils/viz_utils.py @@ -45,14 +45,14 @@ def create_wordcloud(timestamp, real_time=False): plt.axis("off") plt.tight_layout(pad=0) - wordcloud_name = "wordcloud" + wordcloud = "wordcloud" if real_time: - wordcloud_name = "real_time_" + wordcloud_name + "_" + \ + wordcloud = "real_time_" + wordcloud + "_" + \ timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png" else: - wordcloud_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png" + wordcloud += "_" + 
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png" - plt.savefig("./artefacts/" + wordcloud_name) + plt.savefig("./artefacts/" + wordcloud) def create_talk_diff_scatter_viz(timestamp, real_time=False): From b0b47cca831269f579e5522e310f7a194349df9a Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Tue, 25 Jul 2023 10:44:37 +0530 Subject: [PATCH 07/11] restructuring --- trials/finetuning/__init__.py | 0 trials/finetuning/inference_fine_tuned.py | 24 +++++++++++++++++++ trials/{ => finetuning}/youtube_scraping.py | 0 trials/server/__init__.py | 0 trials/{ => server}/server_multithreaded.py | 4 ++-- trials/title_summary/__init__.py | 0 trials/{ => title_summary}/api.py | 0 trials/{ => title_summary}/bert.py | 0 trials/{ => title_summary}/gpt2.py | 0 trials/{ => title_summary}/incsum.py | 0 trials/{ => title_summary}/openai_endpoint.py | 2 +- trials/{ => title_summary}/pegasus.py | 0 trials/{ => title_summary}/t5.py | 0 trials/{ => title_summary}/transcript.txt | 0 trials/{ => title_summary}/vicuna.py | 0 15 files changed, 27 insertions(+), 3 deletions(-) create mode 100644 trials/finetuning/__init__.py create mode 100644 trials/finetuning/inference_fine_tuned.py rename trials/{ => finetuning}/youtube_scraping.py (100%) create mode 100644 trials/server/__init__.py rename trials/{ => server}/server_multithreaded.py (98%) create mode 100644 trials/title_summary/__init__.py rename trials/{ => title_summary}/api.py (100%) rename trials/{ => title_summary}/bert.py (100%) rename trials/{ => title_summary}/gpt2.py (100%) rename trials/{ => title_summary}/incsum.py (100%) rename trials/{ => title_summary}/openai_endpoint.py (99%) rename trials/{ => title_summary}/pegasus.py (100%) rename trials/{ => title_summary}/t5.py (100%) rename trials/{ => title_summary}/transcript.txt (100%) rename trials/{ => title_summary}/vicuna.py (100%) diff --git a/trials/finetuning/__init__.py b/trials/finetuning/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/trials/finetuning/inference_fine_tuned.py b/trials/finetuning/inference_fine_tuned.py new file mode 100644 index 00000000..4a396071 --- /dev/null +++ b/trials/finetuning/inference_fine_tuned.py @@ -0,0 +1,24 @@ +# Steps to prepare data and submit/check OpenAI finetuning +# import subprocess +# subprocess.run("openai tools fine_tunes.prepare_data -f " + "finetuning_dataset.jsonl") +# export OPENAI_API_KEY= +# openai api fine_tunes.create -t -m +# openai api fine_tunes.list + + +import openai + +# Use your OpenAI API Key +openai.api_key = "" + +sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . 
And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . -> ", + " We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas . 
- > "] + +# Give your finetuned model name here +# "davinci:ft-personal-2023-07-14-10-43-51" +model_name = "" +response = openai.Completion.create( + model=model_name, + prompt=sample_chunks[0]) + +print(response) diff --git a/trials/youtube_scraping.py b/trials/finetuning/youtube_scraping.py similarity index 100% rename from trials/youtube_scraping.py rename to trials/finetuning/youtube_scraping.py diff --git a/trials/server/__init__.py b/trials/server/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/trials/server_multithreaded.py b/trials/server/server_multithreaded.py similarity index 98% rename from trials/server_multithreaded.py rename to trials/server/server_multithreaded.py index 1d27dfdb..1c5e75d7 100644 --- a/trials/server_multithreaded.py +++ b/trials/server/server_multithreaded.py @@ -16,8 +16,8 @@ from av import AudioFifo from sortedcontainers import SortedDict from whisper_jax import FlaxWhisperPipline -from ..utils.log_utils import logger -from ..utils.run_utils import config, Mutex +from reflector.utils.log_utils import logger +from reflector.utils.run_utils import config, Mutex WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_REAL_TIME_MODEL_SIZE"] pcs = set() diff --git a/trials/title_summary/__init__.py b/trials/title_summary/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/trials/api.py b/trials/title_summary/api.py similarity index 100% rename from trials/api.py rename to trials/title_summary/api.py diff --git a/trials/bert.py b/trials/title_summary/bert.py similarity index 100% rename from trials/bert.py rename to trials/title_summary/bert.py diff --git a/trials/gpt2.py b/trials/title_summary/gpt2.py similarity index 100% rename from trials/gpt2.py rename to trials/title_summary/gpt2.py diff --git a/trials/incsum.py b/trials/title_summary/incsum.py similarity index 100% rename from trials/incsum.py rename to trials/title_summary/incsum.py diff --git a/trials/openai_endpoint.py b/trials/title_summary/openai_endpoint.py similarity index 99% rename from trials/openai_endpoint.py rename to trials/title_summary/openai_endpoint.py index 7a572353..c92856c5 100644 --- a/trials/openai_endpoint.py +++ b/trials/title_summary/openai_endpoint.py @@ -28,7 +28,7 @@ response = openai.ChatCompletion.create(model=model, n=1, max_tokens=300) -# Try finetuned model +# Try fine tuned model # model = "davinci:ft-personal-2023-07-14-10-43-51" # response = openai.Completion.create(model=model, # prompt=sample_chunks[0] + " -> ") diff --git a/trials/pegasus.py b/trials/title_summary/pegasus.py similarity index 100% rename from trials/pegasus.py rename to trials/title_summary/pegasus.py diff --git a/trials/t5.py b/trials/title_summary/t5.py similarity index 100% rename from trials/t5.py rename to trials/title_summary/t5.py diff --git a/trials/transcript.txt b/trials/title_summary/transcript.txt similarity index 100% rename from trials/transcript.txt rename to trials/title_summary/transcript.txt diff --git a/trials/vicuna.py b/trials/title_summary/vicuna.py similarity index 100% rename from trials/vicuna.py rename to trials/title_summary/vicuna.py From 2d5c464d3b6dc4d17fbaa83bae52eed47cbd2da4 Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Tue, 25 Jul 2023 12:50:43 +0530 Subject: [PATCH 08/11] move client files --- .gitignore | 2 +- client-local/__init__.py | 6 ++++++ client.py => client-local/client.py | 4 ++-- .../stream_client.py | 6 +++--- server.py | 21 ++++++++++++++----- 5 files changed, 28 insertions(+), 11 deletions(-) create mode 
100644 client-local/__init__.py rename client.py => client-local/client.py (94%) rename stream_client.py => client-local/stream_client.py (97%) diff --git a/.gitignore b/.gitignore index c08eb9a3..c6d9edb2 100644 --- a/.gitignore +++ b/.gitignore @@ -165,7 +165,7 @@ cython_debug/ transcript_*.txt test_*.txt wordcloud*.png -*.ini +utils/config.ini test_samples/ *.wav *.mp3 diff --git a/client-local/__init__.py b/client-local/__init__.py new file mode 100644 index 00000000..c2897b64 --- /dev/null +++ b/client-local/__init__.py @@ -0,0 +1,6 @@ +import sys +import os + +parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), + os.pardir)) +sys.path.append(parent_dir) diff --git a/client.py b/client-local/client.py similarity index 94% rename from client.py rename to client-local/client.py index b0fa46a5..af1f4c7a 100644 --- a/client.py +++ b/client-local/client.py @@ -5,15 +5,15 @@ import signal from aiortc.contrib.signaling import (add_signaling_arguments, create_signaling) +from ..utils.log_utils import logger from stream_client import StreamClient -from utils.log_utils import logger async def main(): parser = argparse.ArgumentParser(description="Data channels ping/pong") parser.add_argument( - "--url", type=str, nargs="?", default="http://127.0.0.1:1250/offer" + "--url", type=str, nargs="?", default="http://0.0.0.0:1250/offer" ) parser.add_argument( diff --git a/stream_client.py b/client-local/stream_client.py similarity index 97% rename from stream_client.py rename to client-local/stream_client.py index 1ed9cf31..b044b1bb 100644 --- a/stream_client.py +++ b/client-local/stream_client.py @@ -9,15 +9,15 @@ import stamina from aiortc import (RTCPeerConnection, RTCSessionDescription) from aiortc.contrib.media import (MediaPlayer, MediaRelay) -from utils.log_utils import logger -from utils.run_utils import config +from ..utils.log_utils import logger +from ..utils.run_utils import config class StreamClient: def __init__( self, signaling, - url="http://127.0.0.1:1250", + url="http://0.0.0.0:1250", play_from=None, ping_pong=False ): diff --git a/server.py b/server.py index 290e0456..55066eef 100644 --- a/server.py +++ b/server.py @@ -1,3 +1,4 @@ +import argparse import asyncio import datetime import json @@ -16,7 +17,7 @@ from faster_whisper import WhisperModel from loguru import logger from sortedcontainers import SortedDict -from utils.run_utils import run_in_executor +from utils.run_utils import run_in_executor, config pcs = set() relay = MediaRelay() @@ -31,8 +32,8 @@ audio_buffer = AudioFifo() executor = ThreadPoolExecutor() transcription_text = "" last_transcribed_time = 0.0 -LLM_MACHINE_IP = "216.153.52.83" -LLM_MACHINE_PORT = "5000" +LLM_MACHINE_IP = config["DEFAULT"]["LLM_MACHINE_IP"] +LLM_MACHINE_PORT = config["DEFAULT"]["LLM_MACHINE_PORT"] LLM_URL = f"http://{LLM_MACHINE_IP}:{LLM_MACHINE_PORT}/api/v1/generate" incremental_responses = [] sorted_transcripts = SortedDict() @@ -43,7 +44,7 @@ blacklisted_messages = [" Thank you.", " See you next time!", def get_title_and_summary(llm_input_text, last_timestamp): - ("Generating title and summary") + logger.info("Generating title and summary") # output = llm.generate(prompt) # Use monadical-ml to fire this query to an LLM and get result @@ -306,6 +307,16 @@ async def on_shutdown(app): if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="WebRTC based server for Reflector" + ) + parser.add_argument( + "--host", default="0.0.0.0", help="Server host IP (def: 0.0.0.0)" + ) + parser.add_argument( + "--port", 
type=int, default=1250, help="Server port (def: 1250)" + ) + args = parser.parse_args() app = web.Application() cors = aiohttp_cors.setup( app, @@ -321,4 +332,4 @@ if __name__ == "__main__": offer_resource = cors.add(app.router.add_resource("/offer")) cors.add(offer_resource.add_route("POST", offer)) app.on_shutdown.append(on_shutdown) - web.run_app(app, access_log=None, host="127.0.0.1", port=1250) + web.run_app(app, access_log=None, host=args.host, port=args.port) From d96c9d6adfd5b6e5c391951ce0565e213fcefc1e Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Tue, 25 Jul 2023 13:18:19 +0530 Subject: [PATCH 09/11] repo update --- client-local/__init__.py | 6 ------ client-local/client.py => client.py | 2 +- client-local/stream_client.py => stream_client.py | 4 ++-- 3 files changed, 3 insertions(+), 9 deletions(-) delete mode 100644 client-local/__init__.py rename client-local/client.py => client.py (98%) rename client-local/stream_client.py => stream_client.py (98%) diff --git a/client-local/__init__.py b/client-local/__init__.py deleted file mode 100644 index c2897b64..00000000 --- a/client-local/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -import sys -import os - -parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), - os.pardir)) -sys.path.append(parent_dir) diff --git a/client-local/client.py b/client.py similarity index 98% rename from client-local/client.py rename to client.py index af1f4c7a..b2167d3b 100644 --- a/client-local/client.py +++ b/client.py @@ -5,7 +5,7 @@ import signal from aiortc.contrib.signaling import (add_signaling_arguments, create_signaling) -from ..utils.log_utils import logger +from utils.log_utils import logger from stream_client import StreamClient diff --git a/client-local/stream_client.py b/stream_client.py similarity index 98% rename from client-local/stream_client.py rename to stream_client.py index b044b1bb..c2238ee5 100644 --- a/client-local/stream_client.py +++ b/stream_client.py @@ -9,8 +9,8 @@ import stamina from aiortc import (RTCPeerConnection, RTCSessionDescription) from aiortc.contrib.media import (MediaPlayer, MediaRelay) -from ..utils.log_utils import logger -from ..utils.run_utils import config +from utils.log_utils import logger +from utils.run_utils import config class StreamClient: From ef9a6a2e665703a39532937d0f6477c20762524c Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Tue, 25 Jul 2023 13:47:23 +0530 Subject: [PATCH 10/11] separate requirements --- requirements.txt => pipeline-requirements.txt | 5 -- scripts/setup_dependencies.sh | 31 +----------- scripts/setup_pipeline_dependencies.sh | 33 ++++++++++++ server-requirements.txt | 50 +++++++++++++++++++ 4 files changed, 84 insertions(+), 35 deletions(-) rename requirements.txt => pipeline-requirements.txt (91%) create mode 100644 scripts/setup_pipeline_dependencies.sh create mode 100644 server-requirements.txt diff --git a/requirements.txt b/pipeline-requirements.txt similarity index 91% rename from requirements.txt rename to pipeline-requirements.txt index fb69c4bd..24e7a092 100644 --- a/requirements.txt +++ b/pipeline-requirements.txt @@ -2,8 +2,6 @@ pyaudio==0.2.13 keyboard==0.13.5 pynput==1.7.6 wave==0.0.2 -aiohttp==3.8.4 -aiosignal==1.3.1 async-timeout==4.0.2 attrs==23.1.0 certifi==2023.5.7 @@ -51,11 +49,8 @@ matplotlib==3.7.2 matplotlib-inline==0.1.6 termcolor==2.3.0 ffmpeg==1.4 -aiortc==1.5.0 cached_property==1.5.2 stamina==23.1.0 httpx==0.24.1 -sortedcontainers==2.4.0 https://github.com/yt-dlp/yt-dlp/archive/master.tar.gz gpt4all==1.0.5 
-aiohttp_cors==0.7.0 diff --git a/scripts/setup_dependencies.sh b/scripts/setup_dependencies.sh index b7dc6d77..50288d54 100755 --- a/scripts/setup_dependencies.sh +++ b/scripts/setup_dependencies.sh @@ -1,33 +1,4 @@ #!/bin/sh -# Upgrade pip pip install --upgrade pip - -# Default to CPU Installation of JAX -jax_mode="jax[cpu]" - -# Install JAX -if [ "$1" == "cpu" ] -then - jax_mode="jax[cpu]" -elif [ "$1" == "cuda11" ] -then - jax_mode="jax[cuda11_pip]" -elif [ "$1" == "cuda12" ] -then - jax_mode="jax[cuda12_pip]" -fi - -pip install --upgrade "$jax_mode" - -# Install Whisper-JAX base -pip install git+https://github.com/sanchit-gandhi/whisper-jax.git - -# Update to latest version -pip install --upgrade --no-deps --force-reinstall git+https://github.com/sanchit-gandhi/whisper-jax.git - -pip install -r ../requirements.txt - -# download spacy models -spacy download en_core_web_sm -spacy download en_core_web_md +pip install -r ../server-requirements.txt \ No newline at end of file diff --git a/scripts/setup_pipeline_dependencies.sh b/scripts/setup_pipeline_dependencies.sh new file mode 100644 index 00000000..95d5d41d --- /dev/null +++ b/scripts/setup_pipeline_dependencies.sh @@ -0,0 +1,33 @@ +#!/bin/sh + +# Upgrade pip +pip install --upgrade pip + +# Default to CPU Installation of JAX +jax_mode="jax[cpu]" + +# Install JAX +if [ "$1" == "cpu" ] +then + jax_mode="jax[cpu]" +elif [ "$1" == "cuda11" ] +then + jax_mode="jax[cuda11_pip]" +elif [ "$1" == "cuda12" ] +then + jax_mode="jax[cuda12_pip]" +fi + +pip install --upgrade "$jax_mode" + +# Install Whisper-JAX base +pip install git+https://github.com/sanchit-gandhi/whisper-jax.git + +# Update to latest version +pip install --upgrade --no-deps --force-reinstall git+https://github.com/sanchit-gandhi/whisper-jax.git + +pip install -r ../server-requirements.txt + +# download spacy models +spacy download en_core_web_sm +spacy download en_core_web_md diff --git a/server-requirements.txt b/server-requirements.txt new file mode 100644 index 00000000..01d7af38 --- /dev/null +++ b/server-requirements.txt @@ -0,0 +1,50 @@ +aiohttp==3.8.5 +aiohttp-cors==0.7.0 +aioice==0.9.0 +aiortc==1.5.0 +aiosignal==1.3.1 +anyio==3.7.1 +async-timeout==4.0.2 +attrs==23.1.0 +av==10.0.0 +certifi==2023.7.22 +cffi==1.15.1 +charset-normalizer==3.2.0 +coloredlogs==15.0.1 +cryptography==41.0.2 +ctranslate2==3.17.1 +dnspython==2.4.0 +faster-whisper==0.7.1 +filelock==3.12.2 +flatbuffers==23.5.26 +frozenlist==1.4.0 +fsspec==2023.6.0 +google-crc32c==1.5.0 +h11==0.14.0 +httpcore==0.17.3 +huggingface-hub==0.16.4 +humanfriendly==10.0 +idna==3.4 +ifaddr==0.2.0 +loguru==0.7.0 +mpmath==1.3.0 +multidict==6.0.4 +numpy==1.25.1 +onnxruntime==1.15.1 +packaging==23.1 +protobuf==4.23.4 +pycparser==2.21 +pyee==11.0.0 +pylibsrtp==0.8.0 +pyOpenSSL==23.2.0 +PyYAML==6.0.1 +requests==2.31.0 +sniffio==1.3.0 +sortedcontainers==2.4.0 +sympy==1.12 +tokenizers==0.13.3 +tqdm==4.65.0 +typing_extensions==4.7.1 +urllib3==2.0.4 +yarl==1.9.2 +wave==0.0.2 From b4303d6cd4628f938ec1bbaedf286ec0499b53a4 Mon Sep 17 00:00:00 2001 From: Gokul Mohanarangan Date: Tue, 25 Jul 2023 13:53:39 +0530 Subject: [PATCH 11/11] update --- ...encies.sh => setup_server_dependencies.sh} | 0 utils/config.ini | 40 ++++++++++--------- 2 files changed, 21 insertions(+), 19 deletions(-) rename scripts/{setup_dependencies.sh => setup_server_dependencies.sh} (100%) diff --git a/scripts/setup_dependencies.sh b/scripts/setup_server_dependencies.sh similarity index 100% rename from scripts/setup_dependencies.sh rename to 
scripts/setup_server_dependencies.sh diff --git a/utils/config.ini b/utils/config.ini index 9ba12959..001ed7c4 100644 --- a/utils/config.ini +++ b/utils/config.ini @@ -1,24 +1,26 @@ [DEFAULT] -#SetexceptionruleforOpenMPerrortoallowduplicatelibinitialization -KMP_DUPLICATE_LIB_OK = TRUE +#Set exception rule for OpenMP error +#to allow duplicate lib initialization +KMP_DUPLICATE_LIB_OK=TRUE #ExportOpenAIAPIKey -OPENAI_APIKEY = +OPENAI_APIKEY= #ExportWhisperModelSize -WHISPER_MODEL_SIZE = tiny -WHISPER_REAL_TIME_MODEL_SIZE = tiny +WHISPER_MODEL_SIZE=tiny +WHISPER_REAL_TIME_MODEL_SIZE=tiny #AWSconfig -AWS_ACCESS_KEY = ***REMOVED*** -AWS_SECRET_KEY = ***REMOVED*** -BUCKET_NAME = reflector-bucket +AWS_ACCESS_KEY= +AWS_SECRET_KEY= +BUCKET_NAME=reflector-bucket #Summarizerconfig -SUMMARY_MODEL = facebook/bart-large-cnn -INPUT_ENCODING_MAX_LENGTH = 1024 -MAX_LENGTH = 2048 -BEAM_SIZE = 6 -MAX_CHUNK_LENGTH = 1024 -SUMMARIZE_USING_CHUNKS = YES -#Audiodevice -BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME = aggregator -AV_FOUNDATION_DEVICE_ID = 1 -# LLM PATH -LLM_PATH = +SUMMARY_MODEL=facebook/bart-large-cnn +INPUT_ENCODING_MAX_LENGTH=1024 +MAX_LENGTH=2048 +BEAM_SIZE=6 +MAX_CHUNK_LENGTH=1024 +SUMMARIZE_USING_CHUNKS=YES +# Audiodevice +BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME=aggregator +AV_FOUNDATION_DEVICE_ID=1 +# LLM configs +LLM_MACHINE_IP= +LLM_MACHINE_PORT=
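
Taken together, patches 08 through 11 leave server.py reading its bind address from the new --host/--port flags (defaulting to 0.0.0.0:1250) and the LLM endpoint from the LLM_MACHINE_IP and LLM_MACHINE_PORT keys added to utils/config.ini. A minimal sketch of that wiring, assuming the config object exposed by utils.run_utils is an ordinary configparser.ConfigParser loaded from utils/config.ini (the values themselves are placeholders):

import configparser

# Assumption: utils/run_utils exposes a ConfigParser over utils/config.ini;
# this stand-alone sketch reads the same keys directly.
config = configparser.ConfigParser()
config.read("utils/config.ini")

llm_ip = config["DEFAULT"]["LLM_MACHINE_IP"]      # e.g. an internal GPU host
llm_port = config["DEFAULT"]["LLM_MACHINE_PORT"]  # e.g. 5000
llm_url = f"http://{llm_ip}:{llm_port}/api/v1/generate"

The server itself is then started with something like: python server.py --host 0.0.0.0 --port 1250, which matches the argparse defaults introduced in patch 08.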
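
The core change of the series is swapping the whisper-jax pipeline for faster-whisper in the real-time path. A minimal sketch of that transcription step, condensed from the server.py hunks above; the chunk file name and the wrapper function are illustrative only, while the model size, VAD settings, and timestamp fallbacks mirror the diff:

from faster_whisper import WhisperModel

# CPU "tiny" model in float32, as configured in server.py.
model = WhisperModel("tiny", device="cpu", compute_type="float32")


def transcribe_chunk(audio_path):
    # vad_filter skips long silences before decoding, matching the server hunk.
    segments, _info = model.transcribe(
        audio_path,
        language="en",
        beam_size=5,
        vad_filter=True,
        vad_parameters=dict(min_silence_duration_ms=500),
    )

    text = ""
    duration = 0.0
    for segment in segments:  # lazy generator; iterating runs the decode
        text += segment.text
        start = segment.start if segment.start else 0.0
        end = segment.end if segment.end else 5.5  # fallback used in the patch
        duration += end - start
    return text, duration


# text, seconds = transcribe_chunk("chunk.wav")  # audio path is a placeholder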
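
For the incremental titles and summaries, server.py builds a ### Human / ### Assistant prompt and posts it to the text-generation-webui endpoint at llm_url above. The sketch below shows that request; the payload key and the results[0]["text"] response layout are assumptions about the text-generation-webui API version in use, not something the patches pin down:

import json
import requests


def request_title_and_summary(llm_url, llm_input_text):
    # Prompt wording mirrors the get_title_and_summary prompt in server.py.
    prompt = f"""
    ### Human:
    Create a JSON object as response. The JSON object must have 2 fields:
    i) title and ii) summary. For the title field, generate a short title
    for the given text. For the summary field, summarize the given text
    in three sentences.

    {llm_input_text}

    ### Assistant:
    """
    # Request/response shape assumed from the text-generation-webui API;
    # adjust to the deployed version.
    resp = requests.post(llm_url, json={"prompt": prompt}, timeout=120)
    raw = resp.json()["results"][0]["text"]  # assumed response layout
    return json.loads(raw)  # expected to be {"title": ..., "summary": ...}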