Moved all server files to server/

This commit is contained in:
Koper
2023-07-26 15:13:46 +07:00
parent b02bce39f0
commit c0400b4232
65 changed files with 0 additions and 38 deletions

@@ -0,0 +1,24 @@
# Steps to prepare data and submit/check OpenAI finetuning
# import subprocess
# subprocess.run("openai tools fine_tunes.prepare_data -f " + "finetuning_dataset.jsonl")
# export OPENAI_API_KEY=
# openai api fine_tunes.create -t <TRAIN_FILE_ID_OR_PATH> -m <BASE_MODEL>
# openai api fine_tunes.list
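# A rough sketch of the same submission done from Python with the pre-1.0
# openai client (the prepared file name and base model below are assumptions,
# not values taken from this repo):
# training_file = openai.File.create(
#     file=open("finetuning_dataset_prepared.jsonl", "rb"),
#     purpose="fine-tune")
# fine_tune = openai.FineTune.create(training_file=training_file.id,
#                                    model="davinci")
# print(openai.FineTune.retrieve(fine_tune.id).status)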
import openai
# Use your OpenAI API Key
openai.api_key = ""
sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . -> ",
" We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas . - > "]
# Give your finetuned model name here
# "davinci:ft-personal-2023-07-14-10-43-51"
model_name = ""
response = openai.Completion.create(
model=model_name,
prompt=sample_chunks[0])
print(response)
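# To print just the generated completion text rather than the full response object:
# print(response["choices"][0]["text"])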

@@ -0,0 +1,98 @@
import json
import yt_dlp as youtube_dl
from whisper_jax import FlaxWhisperPipline
import jax.numpy as jnp
# Function to extract chapter information from a YouTube video URL
def get_youtube_chapters(video_id):
video_url = "https://www.youtube.com/watch?v=" + video_id
ydl_opts = {
'extract_flat': 'in_playlist',
'skip_download': True,
'quiet': True,
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
video_info = ydl.extract_info(video_url, download=False)
chapters = []
if 'chapters' in video_info:
for chapter in video_info['chapters']:
start_time = chapter['start_time']
end_time = chapter['end_time']
title = chapter['title']
chapters.append({
'start': start_time,
'end': end_time,
'title': title
})
return chapters
# Function to extract video transcription using yt_dlp
def get_youtube_transcription(video_id):
ydl_opts = {
'format': 'bestaudio/best',
'postprocessors': [{
'key': 'FFmpegExtractAudio',
'preferredcodec': 'mp3',
'preferredquality': '192',
}],
'outtmpl': './artefacts/audio', # Specify output file path and name
}
# Download the audio
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
ydl.download(["https://www.youtube.com/watch?v=" + video_id])
media_file = "./artefacts/audio.mp3"
pipeline = FlaxWhisperPipline("openai/whisper-" + "tiny",
dtype=jnp.float16,
batch_size=16)
whisper_result = pipeline(media_file, return_timestamps=True)
return whisper_result["chunks"]
# Function to scrape YouTube video transcripts and chapter information
def scrape_youtube_data(video_id):
transcript_text = get_youtube_transcription(video_id)
chapters = get_youtube_chapters(video_id)
print("transcript_text", transcript_text)
print("chapters", chapters)
return transcript_text, chapters
# Function to generate fine-tuning dataset from YouTube data
def generate_finetuning_dataset(video_ids):
prompt_completion_pairs = []
for video_id in video_ids:
transcript_text, chapters = scrape_youtube_data(video_id)
if transcript_text is not None and chapters is not None:
for chapter in chapters:
start_time = chapter["start"]
end_time = chapter["end"]
chapter_text = chapter["title"]
prompt = ""
for transcript in transcript_text:
if transcript["timestamp"][0] >= start_time and transcript["timestamp"][1] < end_time:
prompt += transcript["text"]
                # Only add a pair when the chapter actually has transcript text
                if prompt:
completion = chapter_text
prompt_completion_pairs.append({"prompt": prompt, "completion": completion})
return prompt_completion_pairs
# Add all the video ids here, the videos must have captions [chapters]
video_ids = ["yTnSEZIwnkU"]
dataset = generate_finetuning_dataset(video_ids)
with open("finetuning_dataset.jsonl", "w", encoding="utf-8") as file:
for example in dataset:
file.write(json.dumps(example) + "\n")
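# For illustration only: a single record written to finetuning_dataset.jsonl
# by this script looks roughly like the line below (prompt shortened, chapter
# title hypothetical); the `openai tools fine_tunes.prepare_data` step in the
# submission script is expected to add prompt/completion separators afterwards.
# {"prompt": "You all just came off of your incredible Google Cloud next conference ...",
#  "completion": "Google Cloud Next announcements"}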

@@ -0,0 +1,188 @@
import asyncio
import datetime
import io
import json
import threading
import uuid
import wave
from concurrent.futures import ThreadPoolExecutor
import jax.numpy as jnp
from aiohttp import web
from aiortc import MediaStreamTrack, RTCPeerConnection, RTCSessionDescription
from aiortc.contrib.media import MediaRelay
from av import AudioFifo
from sortedcontainers import SortedDict
from whisper_jax import FlaxWhisperPipline
from reflector.utils.log_utils import LOGGER
from reflector.utils.run_utils import CONFIG, Mutex
WHISPER_MODEL_SIZE = CONFIG['WHISPER']["WHISPER_REAL_TIME_MODEL_SIZE"]
pcs = set()
relay = MediaRelay()
data_channel = None
sorted_message_queue = SortedDict()
CHANNELS = 2
RATE = 44100
CHUNK_SIZE = 256
pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE,
dtype=jnp.float16,
batch_size=16)
start_time = datetime.datetime.now()
executor = ThreadPoolExecutor()
audio_buffer = AudioFifo()
frame_lock = Mutex(audio_buffer)
def channel_log(channel, t, message):
print("channel(%s) %s %s" % (channel.label, t, message))
def thread_queue_channel_send():
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
least_time = sorted_message_queue.keys()[0]
message = sorted_message_queue[least_time]
if message:
del sorted_message_queue[least_time]
data_channel.send(message)
except Exception as e:
print("Exception", str(e))
pass
loop.run_forever()
def get_transcription():
while True:
with frame_lock.lock() as audio_buffer:
frames = audio_buffer.read_many(CHUNK_SIZE * 960, partial=False)
if not frames:
transcribe = False
else:
transcribe = True
if transcribe:
print("Transcribing..")
try:
                # Reserve a slot keyed by the chunk start time so results are
                # delivered in order even if transcription finishes out of order
                sorted_message_queue[frames[0].time] = None
out_file = io.BytesIO()
wf = wave.open(out_file, "wb")
wf.setnchannels(CHANNELS)
wf.setframerate(RATE)
wf.setsampwidth(2)
for frame in frames:
                    wf.writeframes(frame.to_ndarray().tobytes())
wf.close()
whisper_result = pipeline(out_file.getvalue())
item = {
'text': whisper_result["text"],
'start_time': str(frames[0].time),
'time': str(datetime.datetime.now())
}
sorted_message_queue[frames[0].time] = str(item)
start_messaging_thread()
except Exception as e:
print("Exception -> ", str(e))
class AudioStreamTrack(MediaStreamTrack):
"""
An audio stream track to send audio frames.
"""
kind = "audio"
def __init__(self, track):
super().__init__() # don't forget this!
self.track = track
async def recv(self):
frame = await self.track.recv()
audio_buffer.write(frame)
return frame
def start_messaging_thread():
message_thread = threading.Thread(target=thread_queue_channel_send)
message_thread.start()
def start_transcription_thread(max_threads: int):
for i in range(max_threads):
t_thread = threading.Thread(target=get_transcription)
t_thread.start()
async def offer(request: web.Request):
params = await request.json()
offer = RTCSessionDescription(sdp=params["sdp"], type=params["type"])
pc = RTCPeerConnection()
pc_id = "PeerConnection(%s)" % uuid.uuid4()
pcs.add(pc)
def log_info(msg: str, *args):
LOGGER.info(pc_id + " " + msg, *args)
log_info("Created for " + request.remote)
@pc.on("datachannel")
def on_datachannel(channel):
global data_channel, start_time
data_channel = channel
channel_log(channel, "-", "created by remote party")
start_time = datetime.datetime.now()
@channel.on("message")
def on_message(message: str):
channel_log(channel, "<", message)
if isinstance(message, str) and message.startswith("ping"):
# reply
channel.send("pong" + message[4:])
@pc.on("connectionstatechange")
async def on_connectionstatechange():
log_info("Connection state is " + pc.connectionState)
if pc.connectionState == "failed":
await pc.close()
pcs.discard(pc)
@pc.on("track")
def on_track(track):
log_info("Track " + track.kind + " received")
pc.addTrack(AudioStreamTrack(relay.subscribe(track)))
# handle offer
await pc.setRemoteDescription(offer)
# send answer
answer = await pc.createAnswer()
await pc.setLocalDescription(answer)
return web.Response(
content_type="application/json",
text=json.dumps({
"sdp": pc.localDescription.sdp,
"type": pc.localDescription.type
}),
)
async def on_shutdown(app: web.Application):
coros = [pc.close() for pc in pcs]
await asyncio.gather(*coros)
pcs.clear()
if __name__ == "__main__":
app = web.Application()
app.on_shutdown.append(on_shutdown)
start_transcription_thread(6)
app.router.add_post("/offer", offer)
web.run_app(
app, access_log=None, host="127.0.0.1", port=1250
)
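# For reference, a minimal (hypothetical) aiortc client for exercising the
# /offer endpoint above might look like the sketch below; the audio source
# file is an assumption, the host and port match the server settings above:
#
# import asyncio
# import aiohttp
# from aiortc import RTCPeerConnection, RTCSessionDescription
# from aiortc.contrib.media import MediaPlayer
#
# async def run_client():
#     pc = RTCPeerConnection()
#     # Stream a local file in place of a live microphone
#     pc.addTrack(MediaPlayer("sample.wav").audio)
#     channel = pc.createDataChannel("transcripts")
#
#     @channel.on("message")
#     def on_message(message):
#         print("transcript chunk:", message)
#
#     await pc.setLocalDescription(await pc.createOffer())
#     async with aiohttp.ClientSession() as session:
#         async with session.post("http://127.0.0.1:1250/offer",
#                                 json={"sdp": pc.localDescription.sdp,
#                                       "type": pc.localDescription.type}) as resp:
#             answer = await resp.json()
#     await pc.setRemoteDescription(
#         RTCSessionDescription(sdp=answer["sdp"], type=answer["type"]))
#     await asyncio.sleep(30)
#     await pc.close()
#
# asyncio.run(run_client())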

@@ -0,0 +1,57 @@
import requests
import spacy
# Enter the Machine where the LLM is hosted
LLM_MACHINE_IP = ""
# This is the URL of text-generation-webui
URL = f"http://{LLM_MACHINE_IP}:5000/api/v1/generate"
headers = {
"Content-Type": "application/json"
}
def split_text_file(filename, token_count):
nlp = spacy.load('en_core_web_md')
with open(filename, 'r') as file:
text = file.read()
doc = nlp(text)
total_tokens = len(doc)
parts = []
start_index = 0
while start_index < total_tokens:
end_index = start_index + token_count
        part_tokens = doc[start_index:end_index]
part = ' '.join(token.text for token in part_tokens)
parts.append(part)
start_index = end_index
return parts
final_summary = ""
parts = split_text_file("transcript.txt", 1600)
for part in parts:
prompt = f"""
### Human:
Given the following text, distill the most important information
into a short summary: {part}
### Assistant:
"""
data = {
"prompt": prompt
}
try:
response = requests.post(URL, headers=headers, json=data)
print(response.json())
except Exception as e:
print(str(e))
with open("summary.txt", "w") as sum:
sum.write(" ".join(final_summary))

@@ -0,0 +1,43 @@
import torch
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Load the pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)
# Set the device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Load the SentenceTransformer model
sentence_transformer_model = SentenceTransformer('average_word_embeddings_glove.6B.300d')
# Define the input text
text = "Your input text to be summarized goes here."
# Tokenize the text
tokens = tokenizer.tokenize(text)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor([input_ids]).to(device)
# Get the BERT model output
with torch.no_grad():
outputs = model(input_ids)[0] # Extract the last hidden states
# Mean-pool the BERT hidden states into a single document-level vector
# (kept for reference; the ranking below uses SentenceTransformer embeddings
# so that sentence and document vectors share the same space)
bert_document_embedding = outputs.mean(dim=1).squeeze().cpu().numpy()
# Split the input text into sentences
sentences = [s.strip() for s in text.split(".") if s.strip()]
# Calculate sentence embeddings and the input text embedding
sentence_embeddings = sentence_transformer_model.encode(sentences)
input_text_embedding = sentence_transformer_model.encode([text])[0]
# Calculate cosine similarity between each sentence and the input text
similarity_scores = cosine_similarity([input_text_embedding], sentence_embeddings)
# Sort the sentences by similarity score in descending order
sorted_sentences = [sent for _, sent in
                    sorted(zip(similarity_scores[0], sentences), reverse=True)]
# Choose the top sentences as the summary
num_summary_sentences = 2  # Adjust as needed
summary = ". ".join(sorted_sentences[:num_summary_sentences])
print("Summary:", summary)

@@ -0,0 +1,101 @@
# Approach 1
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
model_name = 'EleutherAI/gpt-neo-1.3B'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPTNeoForCausalLM.from_pretrained(model_name)
conversation = """
Summarize the following conversation in 3 key sentences:
We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI .
Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development .
Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations .
Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude .
Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council .
Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas .
"""
input_ids = tokenizer.encode(conversation, return_tensors='pt')
output = model.generate(input_ids,
                        max_new_tokens=30,
                        num_return_sequences=1)
# Decode only the newly generated tokens, skipping the prompt
caption = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
print("Caption:", caption)
# Approach 2
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
model.eval()
text = """
You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . "
"""
tokenizer.pad_token = tokenizer.eos_token
input_ids = tokenizer.encode(text,
max_length=100,
truncation=True,
return_tensors="pt")
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
output = model.generate(input_ids,
max_new_tokens=20,
num_return_sequences=1,
num_beams=2,
attention_mask=attention_mask)
chapter_titles = [tokenizer.decode(output[i], skip_special_tokens=True) for i in range(output.shape[0])]
for i, title in enumerate(chapter_titles):
print("Caption: ", title)
# Approach 3
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
def generate_response(conversation, max_length=100):
input_text = ""
for entry in conversation:
role = entry["role"]
content = entry["content"]
input_text += f"{role}: {content}\n"
# Tokenize the entire conversation
input_ids = tokenizer.encode(input_text, return_tensors="pt")
    # Generate text based on the entire conversation
    with torch.no_grad():
        output = model.generate(input_ids,
                                max_new_tokens=max_length,
                                pad_token_id=tokenizer.eos_token_id)
# Decode the generated text and return it
response = tokenizer.decode(output[0], skip_special_tokens=True)
return response
if __name__ == "__main__":
# Call appropriate approach from the main while experimenting
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
sample_chunks = [
"You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . "
]
conversation = [
{"role": "system", "content": "Summarize this text"},
{"role": "user", "content": " text : " + sample_chunks[0]},
]
response = generate_response(conversation)
print("Response:", response)

@@ -0,0 +1,157 @@
import spacy
import sys
# Observe the incremental summaries by performing summaries in chunks
with open("transcript.txt", "r", encoding="utf-8") as file:
transcription = file.read()
def split_text_file(filename, token_count):
nlp = spacy.load('en_core_web_md')
with open(filename, 'r', encoding="utf-8") as file:
text = file.read()
doc = nlp(text)
total_tokens = len(doc)
parts = []
start_index = 0
while start_index < total_tokens:
end_index = start_index + token_count
part_tokens = doc[start_index:end_index]
part = ' '.join(token.text for token in part_tokens)
parts.append(part)
start_index = end_index
return parts
# Set the chunk length here to split the transcript and test
MAX_CHUNK_LENGTH = 1000
chunks = split_text_file("transcript.txt", MAX_CHUNK_LENGTH)
print("Number of chunks", len(chunks))
# Write chunks to file to refer to input vs output, separated by blank lines
with open("chunks" + str(MAX_CHUNK_LENGTH) + ".txt", "a", encoding="utf-8") as file:
for c in chunks:
file.write(c + "\n\n")
# If we want to run only a certain model, type the option while running
# ex. python incsum.py 1 => will run approach 1
# If no input, will run all approaches
try:
    index = sys.argv[1]
except IndexError:
    index = None
# Approach 1 : facebook/bart-large-cnn
if index == "1" or index is None:
SUMMARY_MODEL = "facebook/bart-large-cnn"
MIN_LENGTH = 5
MAX_LENGTH = 10
BEAM_SIZE = 2
print("Performing chunk summary : " + SUMMARY_MODEL)
from transformers import BartTokenizer, BartForConditionalGeneration
tokenizer = BartTokenizer.from_pretrained(SUMMARY_MODEL)
model = BartForConditionalGeneration.from_pretrained(SUMMARY_MODEL)
summaries = []
for c in chunks:
input_ids = tokenizer.encode(c,
truncation=True,
max_length=MAX_CHUNK_LENGTH,
padding="max_length",
return_tensors='pt')
summary_ids = model.generate(
input_ids,
num_beams=BEAM_SIZE,
max_length=56,
early_stopping=True,
length_penalty=1.0)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
summaries.append(summary)
with open("bart-summaries.txt", "a", encoding="utf-8") as file:
for summary in summaries:
file.write(summary + "\n\n")
# Approach 2
if index == "2" or index is None:
print("Performing chunk summary : " + "gpt-neo-1.3B")
import torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
summaries = []
for c in chunks:
input_ids = tokenizer.encode(c,
truncation=True,
return_tensors='pt')
input_length = input_ids.shape[1]
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
max_summary_length = 100
max_length = input_length + max_summary_length
output = model.generate(input_ids,
max_length=max_length,
attention_mask=attention_mask,
pad_token_id=model.config.eos_token_id,
num_beams=4,
length_penalty=2.0,
early_stopping=True)
summary_ids = output[0, input_length:]
summary = tokenizer.decode(summary_ids, skip_special_tokens=True)
summaries.append(summary)
with open("gptneo1.3B-summaries.txt", "a", encoding="utf-8") as file:
file.write(summary + "\n\n")
# Approach 3
if index == "3" or index is None:
print("Performing chunk summary : " + "mpt-7B")
import torch
import transformers
from transformers import AutoTokenizer
    config = transformers.AutoConfig.from_pretrained('mosaicml/mpt-7b',
                                                     trust_remote_code=True)
    config.attn_config['attn_impl'] = 'triton'
    config.max_seq_len = 1024
    # Pass the customised config so the attention implementation and
    # max_seq_len settings actually take effect
    model = transformers.AutoModelForCausalLM.from_pretrained(
        'mosaicml/mpt-7b',
        config=config,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16
    )
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')
summaries = []
for c in chunks:
input_ids = tokenizer.encode(c, return_tensors="pt")
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
output = model.generate(input_ids,
max_new_tokens=25,
attention_mask=attention_mask,
pad_token_id=model.config.eos_token_id,
num_return_sequences=1)
summary = tokenizer.decode(output[0],
skip_special_tokens=True)
summaries.append(summary)
with open("mpt-7b-summaries.txt", "a", encoding="utf-8") as file:
for summary in summaries:
file.write(summary + "\n\n")

@@ -0,0 +1,37 @@
# Use OpenAI API endpoint to send data to OpenAI
# along with prompts to caption/summarize the conversation
import openai
openai.api_key = ""
# to caption, user prompt used : "caption this conversation"
# max_tokens=20
# to incremental summarize, user prompt used : "summarize this conversation in a few sentences by taking key points"
# max_tokens=300
sample_chunks = [
"You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . ",
" We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas ."]
conversation = [
{"role": "system",
"content": sample_chunks[1]},
{"role": "user",
"content": "summarize this conversation in a few sentences by taking key points"}
]
model = "gpt-3.5-turbo"
response = openai.ChatCompletion.create(model=model,
messages=conversation,
n=1,
max_tokens=300)
# Try fine tuned model
# model = "davinci:ft-personal-2023-07-14-10-43-51"
# response = openai.Completion.create(model=model,
# prompt=sample_chunks[0] + " -> ")
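# The captioning variant mentioned above uses the same call with a shorter
# output budget, e.g.:
# response = openai.ChatCompletion.create(
#     model=model,
#     messages=[{"role": "system", "content": sample_chunks[1]},
#               {"role": "user", "content": "caption this conversation"}],
#     n=1,
#     max_tokens=20)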
caption = response["choices"][0]["message"]["content"]
print(caption)

@@ -0,0 +1,33 @@
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch
# Load the Pegasus model and tokenizer
model_name = "google/pegasus-large"
model = PegasusForConditionalGeneration.from_pretrained(model_name)
tokenizer = PegasusTokenizer.from_pretrained(model_name)
# Set the device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . ",
" We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas ."]
# Define the input text for summarization
text = sample_chunks[1]
inputs = tokenizer(text, truncation=True, padding="longest", return_tensors="pt").to(device)
# Generate the summary
summary_ids = model.generate(
inputs["input_ids"],
attention_mask=inputs["attention_mask"],
max_length=200,
num_beams=4,
length_penalty=2.0,
early_stopping=True,
)
# Decode and print the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Summary:", summary)

@@ -0,0 +1,27 @@
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
# Load the T5 model and tokenizer
model_name = "t5-base"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)
# Set the device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . ",
" We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas ."]
# Define the input text for summarization
text = "Summarize the following text in 3 key points. text : " + sample_chunks[1]
# Tokenize the input text
inputs = tokenizer.encode(text, return_tensors="pt").to(device)
# Generate the summary
summary_ids = model.generate(inputs, max_length=1000, num_beams=4, early_stopping=True)
# Decode and print the summary
summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
print("Summary:", summary)

File diff suppressed because one or more lines are too long

@@ -0,0 +1,44 @@
from gpt4all import GPT4All
model = GPT4All("/Users/gokulmohanarangan/Library/Application Support/nomic.ai/GPT4All/ggml-vicuna-13b-1.1-q4_2.bin")
import spacy
def split_text_file(filename, token_count):
nlp = spacy.load('en_core_web_md')
with open(filename, 'r') as file:
text = file.read()
doc = nlp(text)
total_tokens = len(doc)
parts = []
start_index = 0
while start_index < total_tokens:
end_index = start_index + token_count
part_tokens = doc[start_index:end_index]
part = ' '.join(token.text for token in part_tokens)
parts.append(part)
start_index = end_index
return parts
parts = split_text_file("transcript.txt", 1800)
final_summary = []
for part in parts:
prompt = f"""
### Human:
Summarize the following text without missing any key points and action items.
{part}
### Assistant:
"""
output = model.generate(prompt)
final_summary.append(output)
with open("sum.txt", "w") as sum:
sum.write(" ".join(final_summary))

@@ -0,0 +1,183 @@
#!/usr/bin/env python3
# summarize https://www.youtube.com/watch?v=imzTxoEDH_g
# summarize https://www.sprocket.org/video/cheesemaking.mp4 summary.txt
# summarize podcast.mp3 summary.txt
import argparse
import os
import re
import subprocess
import tempfile
from datetime import datetime
from urllib.parse import urlparse
import jax.numpy as jnp
import moviepy.editor
import nltk
import yt_dlp as youtube_dl
from whisper_jax import FlaxWhisperPipline
from ...utils.file_utils import download_files, upload_files
from ...utils.log_utils import LOGGER
from ...utils.run_utils import CONFIG
from ...utils.text_utils import post_process_transcription, summarize
from ...utils.viz_utils import create_talk_diff_scatter_viz, create_wordcloud
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
WHISPER_MODEL_SIZE = CONFIG['WHISPER']["WHISPER_MODEL_SIZE"]
NOW = datetime.now()
if not os.path.exists('../../artefacts'):
os.makedirs('../../artefacts')
def init_argparse() -> argparse.ArgumentParser:
"""
Parse the CLI arguments
:return: parser object
"""
parser = argparse.ArgumentParser(
usage="%(prog)s [OPTIONS] <LOCATION> <OUTPUT>",
description="Creates a transcript of a video or audio file, then"
" summarizes it using ChatGPT."
)
parser.add_argument("-l", "--language",
help="Language that the summary should be written in",
type=str,
default="english",
choices=['english', 'spanish', 'french', 'german',
'romanian'])
parser.add_argument("location")
return parser
def main():
parser = init_argparse()
args = parser.parse_args()
# Parse the location string that was given to us, and figure out if it's a
# local file (audio or video), a YouTube URL, or a URL referencing an
# audio or video file.
url = urlparse(args.location)
# S3 : Pull artefacts to S3 bucket ?
media_file = ""
if url.scheme == 'http' or url.scheme == 'https':
# Check if we're being asked to retreive a YouTube URL, which is
# handled differently, as we'll use a secondary site to download
# the video first.
if re.search('youtube.com', url.netloc, re.IGNORECASE):
# Download the lowest resolution YouTube video
# (since we're just interested in the audio).
# It will be saved to the current directory.
LOGGER.info("Downloading YouTube video at url: " + args.location)
# Create options for the download
ydl_opts = {
'format': 'bestaudio/best',
'postprocessors': [{
'key': 'FFmpegExtractAudio',
'preferredcodec': 'mp3',
'preferredquality': '192',
}],
'outtmpl': './artefacts/audio', # Specify output file path and name
}
# Download the audio
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
ydl.download([args.location])
media_file = "../artefacts/audio.mp3"
LOGGER.info("Saved downloaded YouTube video to: " + media_file)
else:
# XXX - Download file using urllib, check if file is
# audio/video using python-magic
LOGGER.info(f"Downloading file at url: {args.location}")
LOGGER.info(" XXX - This method hasn't been implemented yet.")
elif url.scheme == '':
media_file = url.path
# If file is not present locally, take it from S3 bucket
if not os.path.exists(media_file):
download_files([media_file])
if media_file.endswith(".m4a"):
subprocess.run(["ffmpeg", "-i", media_file, f"./artefacts/{media_file}.mp4"])
media_file = f"./artefacts/{media_file}.mp4"
else:
print("Unsupported URL scheme: " + url.scheme)
quit()
# Handle video
if not media_file.endswith(".mp3"):
try:
video = moviepy.editor.VideoFileClip(media_file)
audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3",
delete=False).name
video.audio.write_audiofile(audio_filename, logger=None)
LOGGER.info(f"Extracting audio to: {audio_filename}")
# Handle audio only file
except Exception:
audio = moviepy.editor.AudioFileClip(media_file)
audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3",
delete=False).name
audio.write_audiofile(audio_filename, logger=None)
else:
audio_filename = media_file
LOGGER.info("Finished extracting audio")
LOGGER.info("Transcribing")
# Convert the audio to text using the OpenAI Whisper model
pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE,
dtype=jnp.float16,
batch_size=16)
whisper_result = pipeline(audio_filename, return_timestamps=True)
LOGGER.info("Finished transcribing file")
whisper_result = post_process_transcription(whisper_result)
transcript_text = ""
for chunk in whisper_result["chunks"]:
transcript_text += chunk["text"]
with open("./artefacts/transcript_" + NOW.strftime("%m-%d-%Y_%H:%M:%S") +
".txt", "w") as transcript_file:
transcript_file.write(transcript_text)
with open("./artefacts/transcript_with_timestamp_" +
NOW.strftime("%m-%d-%Y_%H:%M:%S") + ".txt",
"w") as transcript_file_timestamps:
transcript_file_timestamps.write(str(whisper_result))
LOGGER.info("Creating word cloud")
create_wordcloud(NOW)
LOGGER.info("Performing talk-diff and talk-diff visualization")
create_talk_diff_scatter_viz(NOW)
# S3 : Push artefacts to S3 bucket
prefix = "./artefacts/"
suffix = NOW.strftime("%m-%d-%Y_%H:%M:%S")
files_to_upload = [prefix + "transcript_" + suffix + ".txt",
prefix + "transcript_with_timestamp_" + suffix + ".txt",
prefix + "df_" + suffix + ".pkl",
prefix + "wordcloud_" + suffix + ".png",
prefix + "mappings_" + suffix + ".pkl",
prefix + "scatter_" + suffix + ".html"]
upload_files(files_to_upload)
summarize(transcript_text, NOW, False, False)
LOGGER.info("Summarization completed")
# Summarization takes a lot of time, so do this separately at the end
files_to_upload = [prefix + "summary_" + suffix + ".txt"]
upload_files(files_to_upload)
if __name__ == "__main__":
main()

@@ -0,0 +1,151 @@
#!/usr/bin/env python3
import time
import wave
from datetime import datetime
import jax.numpy as jnp
import pyaudio
from pynput import keyboard
from termcolor import colored
from whisper_jax import FlaxWhisperPipline
from ...utils.file_utils import upload_files
from ...utils.log_utils import LOGGER
from ...utils.run_utils import CONFIG
from ...utils.text_utils import post_process_transcription, summarize
from ...utils.viz_utils import create_talk_diff_scatter_viz, create_wordcloud
WHISPER_MODEL_SIZE = CONFIG['WHISPER']["WHISPER_MODEL_SIZE"]
FRAMES_PER_BUFFER = 8000
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 96000
RECORD_SECONDS = 15
NOW = datetime.now()
def main():
p = pyaudio.PyAudio()
AUDIO_DEVICE_ID = -1
for i in range(p.get_device_count()):
if p.get_device_info_by_index(i)["name"] == \
CONFIG["AUDIO"]["BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME"]:
AUDIO_DEVICE_ID = i
audio_devices = p.get_device_info_by_index(AUDIO_DEVICE_ID)
stream = p.open(
format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=FRAMES_PER_BUFFER,
input_device_index=int(audio_devices['index'])
)
pipeline = FlaxWhisperPipline("openai/whisper-" +
CONFIG["WHISPER"]["WHISPER_REAL_TIME_MODEL_SIZE"],
dtype=jnp.float16,
batch_size=16)
transcription = ""
TEMP_AUDIO_FILE = "temp_audio.wav"
global proceed
proceed = True
def on_press(key):
if key == keyboard.Key.esc:
global proceed
proceed = False
transcript_with_timestamp = {"text": "", "chunks": []}
last_transcribed_time = 0.0
listener = keyboard.Listener(on_press=on_press)
listener.start()
print("Attempting real-time transcription.. Listening...")
try:
while proceed:
frames = []
start_time = time.time()
for i in range(0, int(RATE / FRAMES_PER_BUFFER * RECORD_SECONDS)):
data = stream.read(FRAMES_PER_BUFFER,
exception_on_overflow=False)
frames.append(data)
end_time = time.time()
wf = wave.open(TEMP_AUDIO_FILE, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(p.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
wf.close()
whisper_result = pipeline(TEMP_AUDIO_FILE, return_timestamps=True)
timestamp = whisper_result["chunks"][0]["timestamp"]
start = timestamp[0]
end = timestamp[1]
if end is None:
end = start + 15.0
duration = end - start
item = {'timestamp': (last_transcribed_time,
last_transcribed_time + duration),
'text': whisper_result['text'],
'stats': (str(end_time - start_time), str(duration))
}
last_transcribed_time = last_transcribed_time + duration
transcript_with_timestamp["chunks"].append(item)
transcription += whisper_result['text']
print(colored("<START>", "yellow"))
print(colored(whisper_result['text'], 'green'))
print(colored("<END> Recorded duration: " +
str(end_time - start_time) +
" | Transcribed duration: " +
str(duration), "yellow"))
except Exception as exception:
print(str(exception))
finally:
with open("real_time_transcript_" + NOW.strftime("%m-%d-%Y_%H:%M:%S")
+ ".txt", "w", encoding="utf-8") as file:
file.write(transcription)
with open("real_time_transcript_with_timestamp_" +
NOW.strftime("%m-%d-%Y_%H:%M:%S") + ".txt", "w",
encoding="utf-8") as file:
transcript_with_timestamp["text"] = transcription
file.write(str(transcript_with_timestamp))
transcript_with_timestamp = \
post_process_transcription(transcript_with_timestamp)
LOGGER.info("Creating word cloud")
create_wordcloud(NOW, True)
LOGGER.info("Performing talk-diff and talk-diff visualization")
create_talk_diff_scatter_viz(NOW, True)
# S3 : Push artefacts to S3 bucket
suffix = NOW.strftime("%m-%d-%Y_%H:%M:%S")
files_to_upload = ["real_time_transcript_" + suffix + ".txt",
"real_time_transcript_with_timestamp_" + suffix + ".txt",
"real_time_df_" + suffix + ".pkl",
"real_time_wordcloud_" + suffix + ".png",
"real_time_mappings_" + suffix + ".pkl",
"real_time_scatter_" + suffix + ".html"]
upload_files(files_to_upload)
summarize(transcript_with_timestamp["text"], NOW, True, True)
LOGGER.info("Summarization completed")
# Summarization takes a lot of time, so do this separately at the end
files_to_upload = ["real_time_summary_" + suffix + ".txt"]
upload_files(files_to_upload)
if __name__ == "__main__":
main()