clean up trials

This commit is contained in:
Gokul Mohanarangan
2023-07-17 20:01:31 +05:30
10 changed files with 2534 additions and 291 deletions

View File

@@ -1,43 +0,0 @@
import torch
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Extractive summarization sketch: embed every sentence of `text` and the
# whole `text` in the same BERT vector space, then keep the sentences whose
# embeddings are closest to the embedding of the whole text.

# Load the pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Set the device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the SentenceTransformer model
# NOTE(review): this model is no longer used for ranking. Its vectors come
# from a different model than the BERT sentence embeddings below, so a cosine
# similarity between the two is not meaningful (and the dimensions do not
# match). The load is kept only so the name stays defined; drop it if nothing
# else needs it.
sentence_transformer_model = SentenceTransformer('average_word_embeddings_glove.6B.300d')


def _mean_pooled_embeddings(texts):
    """Return one mean-pooled BERT embedding per string in *texts*.

    Args:
        texts: A non-empty list of strings.

    Returns:
        A 2-D NumPy array with one row per input string.
    """
    # Calling the tokenizer directly adds the special tokens, pads the batch
    # to a common length and truncates inputs that exceed the model limit.
    encoded = tokenizer(
        texts, padding=True, truncation=True, return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        hidden_states = model(**encoded)[0]  # Extract the last hidden states
    # Average over real tokens only, so padding does not dilute the result.
    mask = encoded["attention_mask"].unsqueeze(-1).type_as(hidden_states)
    token_counts = mask.sum(dim=1).clamp(min=1)
    return ((hidden_states * mask).sum(dim=1) / token_counts).cpu().numpy()


# Define the input text
text = "Your input text to be summarized goes here."

# Split the text into candidate sentences.
# NOTE: splitting on "." is deliberately naive (it breaks on abbreviations
# and decimals); it mirrors the ". " separator used to build the summary.
sentences = [part.strip() for part in text.split(".") if part.strip()]
if not sentences:
    raise ValueError("`text` contains no sentences to summarize.")

# Calculate sentence embeddings and the embedding of the whole input text
sentence_embeddings = _mean_pooled_embeddings(sentences)
input_text_embedding = _mean_pooled_embeddings([text])[0]

# Calculate cosine similarity between sentences and input text
similarity_scores = cosine_similarity([input_text_embedding], sentence_embeddings)

# Sort the sentences by similarity scores in descending order.
# Ranking indices (rather than (score, sentence) tuples) keeps ties from
# falling back to comparing the sentence strings.
ranked_indices = sorted(
    range(len(sentences)),
    key=lambda index: similarity_scores[0][index],
    reverse=True,
)
sorted_sentences = [sentences[index] for index in ranked_indices]

# Choose the top sentences as the summary
num_summary_sentences = 2  # Adjust as needed
summary = ". ".join(sorted_sentences[:num_summary_sentences])
print("Summary:", summary)

View File

@@ -1,47 +0,0 @@
import subprocess
# subprocess.run("openai tools fine_tunes.prepare_data -f " + "finetuning_dataset.jsonl")
#
# export OPENAI_API_KEY=
#
# openai api fine_tunes.create -t <TRAIN_FILE_ID_OR_PATH> -m <BASE_MODEL>
#
# openai api fine_tunes.list
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, Trainer, TrainingArguments
# Fine-tune GPT-2 as a causal language model on a list of training strings
# and save the result to ./fine_tuned_model.

# Load the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# GPT-2 ships without a padding token, so `padding=True` below would raise.
# Reuse the end-of-text token for padding; padded positions are excluded
# from the loss via the attention mask (see _CausalLMDataset).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load and preprocess your dataset
dataset = [...]  # Your dataset of transcriptions and corresponding titles
if not dataset or not all(isinstance(example, str) for example in dataset):
    # Fail early with a clear message instead of an opaque tokenizer error
    # when the placeholder above has not been replaced.
    raise ValueError(
        "`dataset` must be a non-empty list of strings; replace the "
        "placeholder with real training examples."
    )

# Tokenize and encode the dataset
encoded_dataset = tokenizer(
    dataset, truncation=True, padding=True, return_tensors="pt"
)


class _CausalLMDataset(torch.utils.data.Dataset):
    """Expose a tokenized batch as per-example dicts the Trainer can index.

    Each item carries `labels` copied from `input_ids`, which the model needs
    in order to return a loss. Padded positions are set to -100 so they are
    ignored by the loss.
    """

    def __init__(self, encodings):
        self._input_ids = encodings["input_ids"]
        self._attention_mask = encodings["attention_mask"]

    def __len__(self):
        return len(self._input_ids)

    def __getitem__(self, index):
        input_ids = self._input_ids[index]
        attention_mask = self._attention_mask[index]
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }


# Define the fine-tuning training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=1000,
    save_total_limit=2,
    prediction_loss_only=True,
)

# Define the fine-tuning trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=_CausalLMDataset(encoded_dataset),
)

# Fine-tune the GPT-2 model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./fine_tuned_model")

View File

@@ -1,6 +0,0 @@
{"prompt": " Welcome back to the show dozens of AI industry leaders, policymakers and academics are warning there's a risk that the use of artificial intelligence could lead to the annihilation of humanity. The statement says extinction by AI should be considered a top global threat and treated just as seriously as risks from pandemics or things like nuclear war. It's worth noting today's cutting edge chat bots don't think for themselves. In simple terms they produce outputs and answers based on data that they've been trained on. It's about the model. That's where scale AI sees it offers solutions. It provides software that in part helps label and refine the data sets used in this process, for example, their data can be used to teach AI in self-driving cars to spot the difference between a pedestrian and a pebble. That's very important. Now for over seven years, scale has been serving organisations from Fortune 500 companies to the U.S. Defense Department. And it's just launched two new platforms, scaled on of them, used in defense for AI decision making and scale EGP, a generative platform for enterprises.", "completion": "Intro"}
{"prompt": " and he joins us now. Alexander, most of my audience heads exploded there along with mine. I try to keep it simple, just in simple terms in your words, welcome and explain what you do at scale AI. Thanks so much for having me on. Scale is the data platform exploring the development of artificial intelligence. Like you mentioned, at the end of the day, all of these algorithms spoiled down to the data that they trained on. So whether it's a self-repearing car, or it's a chatchew PT, or it's any number of capabilities that people are working on with artificial intelligence, it always comes down to data. And we're working to unlock AI for every single industry from the high tech players like Meta, Microsoft, and others all the way to large enterprises and the US government and the Department of Defense, like you mentioned with our new platforms, scaled on of in scale, etc. Okay, so you take their data, you label it, you filter it, and you ensure that whatever the inputs are into the models that are created are the best they can be to ensure the most efficient. And I think smartest, can we call it that outputs? So that these chat box, for example, don't talk gobbledygook, just based on the data that the trained on being, romance novels, for example, was one of them that was used. And then you got weird outputs. Yeah, that's exactly right. I think one of the things that we're finding with these AI is that they're incredibly sensitive to the data that they're trained on. You know, it's possible to create AI that will blow your mind and be incredibly powerful, but it's also possible to create AI that just falls flat or returns go up a little bit, or just doesn't perform very well. 
Our view is that enterprises and anyone using these models, they need to know how to use their own data, their own know how their own expertise, their own trade secrets, to combine with these powerful algorithms, to actually build experiences that are new, refreshing, different, and powerful to provide a lot of value to their customers, provide a lot of value to their users. You know, enterprises can't use large language models or these AI completely off the shelf, they can be customized and meet strict requirements for security and performance to actually", "completion": "What is Scale AI"}
{"prompt": " How do you do it? Does it require human input in order to label and select and to identify which data is best? I guess I'm asking why you have an advantage in doing this versus anybody doing it in-house, for example, if they wanted to. Yeah, you know, we've been working in AI as you mentioned for the past seven years, all the way back to the very genesis of many of the technologies that they were seeing today. We've been working with OpenAI since 2019, for example, and work with them back on GPT2 before it was, before it was the incredibly powerful technology today. One of the things I like to say is that it was a four year overnight success. This is some of the work that we did with them. Today, we're seeing a lot of AI tourists pretending to be AI natives because there's just so much hype. There's a lot of companies that are not even selling solutions that they're selling vaporware. And so what we can bring is a lot of our expertise in working with these systems from the very beginning and have a ton of experience bringing this technology from, you know, prototype to production and actually help release the technology to the broader public. Our products, they're not coming soon. They're live with customers today, everywhere from the Fortune 500 to the U.S. Department of Defense. Our Donovan platform is the first large language model that's going to be deployed on a U.S. government class to find network. With Donovan, Warfighters can act in minutes instead of weeks and we're working with customers including the Department of Defense's joint all-demand community control as well as the Marine Corps University School of Advanced Warfighting. And on the enterprise side, our view is that kind of as I mentioned, enterprises can't use large language models off the shelf and that's why we created EGP. 
It's a full-stack solution for enterprises that is model-ignostic and enables businesses to leverage a suite of options from leading private models to open source models to test, deploy, monitor the best tech for their unique business models.", "completion": "Why Scale AI"}
{"prompt": " OK, I have a million questions for you. But the most important one was just how, sort of, unemotionally, you were discussing some of the defense contracts and the importance that your data and your filtering is in the selections that are one day going to be made. And if I tie that with the warning that we got from the industry about the risk of extinction from AI. And combine it with what you just said about Open AI and the difference I think between what we've already seen with chat GPT2 and chat GPT4, the sophistication is dramatically better. So is the darker side. Alexander, where's the off button here? Where's the control system in even the defense contracts that are being undertaken today? Yeah, no, no, this is a topic of incredible importance. And as we saw today with the statement, I think it's, you know, this is one of the reasons why at scale, we're working in partnership with the White House to perform a public evaluation of these AI systems. It's critical that the AI industry is doing this work in testing and evaluating these models in parallel alongside development and progress and foundation models and progress in AI, the capabilities needs to happen alongside progress and safety and model evaluation and understanding what the risk associated with the technology are. So, you know what we're doing is. Is that happening today in Malik Xander? I mean, we're asking these questions which is important, but I feel like chatGPT has been unleashed on the broader public before we really have any controls. Let's be clear. I think, you know, this is one of the things that we're working on and the White House recently released a statement. I believe it was two weeks ago, a fact sheet describing the efforts that they're taking in ensuring public evaluation and public forums in which we're understanding the importance of these models. I, you know, what do thing I would say is we need to act very quickly. It is an incredibly powerful technology. 
These are incredibly important conversations that we need to be having. And so we're trying to raise this quickly as we can to deliver these testing evaluation systems. But, you know, as we mentioned before, at scale, we have over seven years of expertise in the AI industry. And so we've been able to utilize our deep understanding and clear way to measure the risks associated with these models, to really accelerate our country's efforts and understanding the safety, mitigations and risks with these models.", "completion": "AI Defense Contracts"}
{"prompt": " in artificial intelligence terms. So when you're infected dinosaur, so we are sort of relying on you in many ways to be able to recognize the benefits which I think people can and do see, but also the downsides. If I had to ask you and just put you on the spot what you think the best way is to regulate this and then know it's a complex question. What is the best way to regulate it? It makes sense to pause Alexander. I know it's fundamental to your business but would pause make sense? You know this is a really important question and it's one that we and my I personally have spent a lot of time thinking about. I think as we've seen, the history of AI really tells us that the key to human-centric response way AI really comes down to a solid data foundation. And for a technology that's as transformative as for reaching and as potentially as ubiquitous as artificial intelligence, I unfortunately don't think there's a one-size-fits-all approach regulation. For example, there's a number of industries that are going to need to regulate industries that are impacted by AI, such as the FDA for medicine and health or the Department of Transportation for autonomous vehicles or the FAI for drones. And so ensuring that AI is employed for maximal benefit will require a wide participation from everywhere from policymakers, industry, civil society organizations, and everyone needs to bring together to sort of educate each other to ensure that AI is developed and is safe and trustworthy way for the American people. One thing is clear though, at per our previous conversation, I think it's absolutely critical that we have proper testing evaluation and understanding of the risks of these AI systems as we are developing them. So kind of as you mentioned how we've unleashed the activity on the world, potentially before there've been the right checks and balances put in place. 
I think for all future deployments of powerful AI technology, we need to ensure that we have the right public safeguards and public testing to make sure that", "completion": "Regulation"}
{"prompt": " I couldn't agree more with you. A human sensitive and centric AI system requires the right data set and to consider it. I've written article about your company in Forbes back in April and it said, you employ a company called remote asks, which employs around 240,000 humans, that are going through all this data and trying to make sure that the right data set for you guys is provided to the customers. Is that right Alexander? Is it 240,000 people to sort of cultivate the data sets that you're providing to customers? Because human-centric data also requires a lot of humans if that's true. to sort of cultivate the data sets that you're providing to customers because human-centric data also requires a lot of humans if that's true. Yeah, exactly as you mentioned in terms of a human-centric and a human-sensitive approach to artificial intelligence, we really believe that it requires sort of the collective expertise and collective knowledge of as many people in the world to really empower and enhance these models. We don't believe in walled gardens and we don't believe in sort of a small group of individuals or small group of engineers deciding how this technology should look for the entire world or the broader ecosystem. The use cases that we're considering, whether it's with the Department of Defense and working on defense use cases or it's with large enterprises or it's, you know, with the Department of Defense and working on defense use cases, or it's with large enterprises, or it's with OpenAI, deploying Apache PT. You know, these are use cases that are incredibly important for the future of humanity, writ large, not just for the future of the technology industry. 
And so, you know, our view is that we need to build a technology that enables the sort of collective wisdom and collective expertise of as many people as possible to bring into the data set that fuel these models so that they to reflect our collective knowledge or collective wisdom or collective values. Yeah, I mean, the more people that's checking this, the better. But I'm correct. Do you employ hundreds of thousands of people to check this data? And I guess if do you, actually, I should ask you the question directly first. Yeah, you know, I can't comment specifically on the number of people. But what we do with defense contracts, surely, where those people are based and who's checking the data, or doesn't it? Yes, so in our work with in scale donament, for example, one of the things that we do is in line with deploying the first a large-range model, the first AI system, to a large-range model to classified networks, we're also ensuring that the data to fuel these systems is powered by the most brilliant experts within the United States. So in this case, we bring in experts in defense data and in defense context to bring the data to be able to power these models. I'll examine that out of a thousand more questions for you, but we're going to have to reconvene. As you said, this is early days of this technology. We all need to be thinking about regulation and yeah we'll reconvene sir. Thank you for joining us on the show. I'll extend our angsee you and co-founder", "completion": "HumanCentric AI"}

View File

@@ -1,6 +0,0 @@
{"prompt":" Welcome back to the show dozens of AI industry leaders, policymakers and academics are warning there's a risk that the use of artificial intelligence could lead to the annihilation of humanity. The statement says extinction by AI should be considered a top global threat and treated just as seriously as risks from pandemics or things like nuclear war. It's worth noting today's cutting edge chat bots don't think for themselves. In simple terms they produce outputs and answers based on data that they've been trained on. It's about the model. That's where scale AI sees it offers solutions. It provides software that in part helps label and refine the data sets used in this process, for example, their data can be used to teach AI in self-driving cars to spot the difference between a pedestrian and a pebble. That's very important. Now for over seven years, scale has been serving organisations from Fortune 500 companies to the U.S. Defense Department. And it's just launched two new platforms, scaled on of them, used in defense for AI decision making and scale EGP, a generative platform for enterprises. ->","completion":" Intro\n"}
{"prompt":" and he joins us now. Alexander, most of my audience heads exploded there along with mine. I try to keep it simple, just in simple terms in your words, welcome and explain what you do at scale AI. Thanks so much for having me on. Scale is the data platform exploring the development of artificial intelligence. Like you mentioned, at the end of the day, all of these algorithms spoiled down to the data that they trained on. So whether it's a self-repearing car, or it's a chatchew PT, or it's any number of capabilities that people are working on with artificial intelligence, it always comes down to data. And we're working to unlock AI for every single industry from the high tech players like Meta, Microsoft, and others all the way to large enterprises and the US government and the Department of Defense, like you mentioned with our new platforms, scaled on of in scale, etc. Okay, so you take their data, you label it, you filter it, and you ensure that whatever the inputs are into the models that are created are the best they can be to ensure the most efficient. And I think smartest, can we call it that outputs? So that these chat box, for example, don't talk gobbledygook, just based on the data that the trained on being, romance novels, for example, was one of them that was used. And then you got weird outputs. Yeah, that's exactly right. I think one of the things that we're finding with these AI is that they're incredibly sensitive to the data that they're trained on. You know, it's possible to create AI that will blow your mind and be incredibly powerful, but it's also possible to create AI that just falls flat or returns go up a little bit, or just doesn't perform very well. 
Our view is that enterprises and anyone using these models, they need to know how to use their own data, their own know how their own expertise, their own trade secrets, to combine with these powerful algorithms, to actually build experiences that are new, refreshing, different, and powerful to provide a lot of value to their customers, provide a lot of value to their users. You know, enterprises can't use large language models or these AI completely off the shelf, they can be customized and meet strict requirements for security and performance to actually ->","completion":" What is Scale AI\n"}
{"prompt":" How do you do it? Does it require human input in order to label and select and to identify which data is best? I guess I'm asking why you have an advantage in doing this versus anybody doing it in-house, for example, if they wanted to. Yeah, you know, we've been working in AI as you mentioned for the past seven years, all the way back to the very genesis of many of the technologies that they were seeing today. We've been working with OpenAI since 2019, for example, and work with them back on GPT2 before it was, before it was the incredibly powerful technology today. One of the things I like to say is that it was a four year overnight success. This is some of the work that we did with them. Today, we're seeing a lot of AI tourists pretending to be AI natives because there's just so much hype. There's a lot of companies that are not even selling solutions that they're selling vaporware. And so what we can bring is a lot of our expertise in working with these systems from the very beginning and have a ton of experience bringing this technology from, you know, prototype to production and actually help release the technology to the broader public. Our products, they're not coming soon. They're live with customers today, everywhere from the Fortune 500 to the U.S. Department of Defense. Our Donovan platform is the first large language model that's going to be deployed on a U.S. government class to find network. With Donovan, Warfighters can act in minutes instead of weeks and we're working with customers including the Department of Defense's joint all-demand community control as well as the Marine Corps University School of Advanced Warfighting. And on the enterprise side, our view is that kind of as I mentioned, enterprises can't use large language models off the shelf and that's why we created EGP. 
It's a full-stack solution for enterprises that is model-ignostic and enables businesses to leverage a suite of options from leading private models to open source models to test, deploy, monitor the best tech for their unique business models. ->","completion":" Why Scale AI\n"}
{"prompt":" OK, I have a million questions for you. But the most important one was just how, sort of, unemotionally, you were discussing some of the defense contracts and the importance that your data and your filtering is in the selections that are one day going to be made. And if I tie that with the warning that we got from the industry about the risk of extinction from AI. And combine it with what you just said about Open AI and the difference I think between what we've already seen with chat GPT2 and chat GPT4, the sophistication is dramatically better. So is the darker side. Alexander, where's the off button here? Where's the control system in even the defense contracts that are being undertaken today? Yeah, no, no, this is a topic of incredible importance. And as we saw today with the statement, I think it's, you know, this is one of the reasons why at scale, we're working in partnership with the White House to perform a public evaluation of these AI systems. It's critical that the AI industry is doing this work in testing and evaluating these models in parallel alongside development and progress and foundation models and progress in AI, the capabilities needs to happen alongside progress and safety and model evaluation and understanding what the risk associated with the technology are. So, you know what we're doing is. Is that happening today in Malik Xander? I mean, we're asking these questions which is important, but I feel like chatGPT has been unleashed on the broader public before we really have any controls. Let's be clear. I think, you know, this is one of the things that we're working on and the White House recently released a statement. I believe it was two weeks ago, a fact sheet describing the efforts that they're taking in ensuring public evaluation and public forums in which we're understanding the importance of these models. I, you know, what do thing I would say is we need to act very quickly. It is an incredibly powerful technology. 
These are incredibly important conversations that we need to be having. And so we're trying to raise this quickly as we can to deliver these testing evaluation systems. But, you know, as we mentioned before, at scale, we have over seven years of expertise in the AI industry. And so we've been able to utilize our deep understanding and clear way to measure the risks associated with these models, to really accelerate our country's efforts and understanding the safety, mitigations and risks with these models. ->","completion":" AI Defense Contracts\n"}
{"prompt":" in artificial intelligence terms. So when you're infected dinosaur, so we are sort of relying on you in many ways to be able to recognize the benefits which I think people can and do see, but also the downsides. If I had to ask you and just put you on the spot what you think the best way is to regulate this and then know it's a complex question. What is the best way to regulate it? It makes sense to pause Alexander. I know it's fundamental to your business but would pause make sense? You know this is a really important question and it's one that we and my I personally have spent a lot of time thinking about. I think as we've seen, the history of AI really tells us that the key to human-centric response way AI really comes down to a solid data foundation. And for a technology that's as transformative as for reaching and as potentially as ubiquitous as artificial intelligence, I unfortunately don't think there's a one-size-fits-all approach regulation. For example, there's a number of industries that are going to need to regulate industries that are impacted by AI, such as the FDA for medicine and health or the Department of Transportation for autonomous vehicles or the FAI for drones. And so ensuring that AI is employed for maximal benefit will require a wide participation from everywhere from policymakers, industry, civil society organizations, and everyone needs to bring together to sort of educate each other to ensure that AI is developed and is safe and trustworthy way for the American people. One thing is clear though, at per our previous conversation, I think it's absolutely critical that we have proper testing evaluation and understanding of the risks of these AI systems as we are developing them. So kind of as you mentioned how we've unleashed the activity on the world, potentially before there've been the right checks and balances put in place. 
I think for all future deployments of powerful AI technology, we need to ensure that we have the right public safeguards and public testing to make sure that ->","completion":" Regulation\n"}
{"prompt":" I couldn't agree more with you. A human sensitive and centric AI system requires the right data set and to consider it. I've written article about your company in Forbes back in April and it said, you employ a company called remote asks, which employs around 240,000 humans, that are going through all this data and trying to make sure that the right data set for you guys is provided to the customers. Is that right Alexander? Is it 240,000 people to sort of cultivate the data sets that you're providing to customers? Because human-centric data also requires a lot of humans if that's true. to sort of cultivate the data sets that you're providing to customers because human-centric data also requires a lot of humans if that's true. Yeah, exactly as you mentioned in terms of a human-centric and a human-sensitive approach to artificial intelligence, we really believe that it requires sort of the collective expertise and collective knowledge of as many people in the world to really empower and enhance these models. We don't believe in walled gardens and we don't believe in sort of a small group of individuals or small group of engineers deciding how this technology should look for the entire world or the broader ecosystem. The use cases that we're considering, whether it's with the Department of Defense and working on defense use cases or it's with large enterprises or it's, you know, with the Department of Defense and working on defense use cases, or it's with large enterprises, or it's with OpenAI, deploying Apache PT. You know, these are use cases that are incredibly important for the future of humanity, writ large, not just for the future of the technology industry. 
And so, you know, our view is that we need to build a technology that enables the sort of collective wisdom and collective expertise of as many people as possible to bring into the data set that fuel these models so that they to reflect our collective knowledge or collective wisdom or collective values. Yeah, I mean, the more people that's checking this, the better. But I'm correct. Do you employ hundreds of thousands of people to check this data? And I guess if do you, actually, I should ask you the question directly first. Yeah, you know, I can't comment specifically on the number of people. But what we do with defense contracts, surely, where those people are based and who's checking the data, or doesn't it? Yes, so in our work with in scale donament, for example, one of the things that we do is in line with deploying the first a large-range model, the first AI system, to a large-range model to classified networks, we're also ensuring that the data to fuel these systems is powered by the most brilliant experts within the United States. So in this case, we bring in experts in defense data and in defense context to bring the data to be able to power these models. I'll examine that out of a thousand more questions for you, but we're going to have to reconvene. As you said, this is early days of this technology. We all need to be thinking about regulation and yeah we'll reconvene sir. Thank you for joining us on the show. I'll extend our angsee you and co-founder ->","completion":" HumanCentric AI\n"}

2534
trials/incsum.ipynb Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,12 +0,0 @@
import os

import openai

# Take the API key from the environment so no secret ever has to be committed
# to source. Falls back to the previous empty string when the variable is
# unset, so behaviour without the variable is unchanged.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . 
Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . -> ",
" We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas . - > "]
# Query the fine-tuned model with a prompt shaped like its training records.
# NOTE(review): assumes the model was trained on prompts ending in "->" with
# no trailing whitespace and completions ending in a newline -- confirm
# against the dataset used for this fine-tune.
response = openai.Completion.create(
    model="davinci:ft-personal-2023-07-14-10-43-51",
    # Drop the trailing space after the "->" separator.
    prompt=sample_chunks[0].rstrip(),
    # Stop at the end-of-completion marker instead of running on to the
    # token limit.
    stop=["\n"],
)
print(response)

View File

@@ -1,33 +0,0 @@
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch
# Use the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained Pegasus tokenizer and model, then move the model onto
# the selected device.
model_name = "google/pegasus-large"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model.to(device)
sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . 
Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . ",
" We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas ."]
# Summarize the second sample chunk.
text = sample_chunks[1]
encoded = tokenizer(text, truncation=True, padding="longest", return_tensors="pt")
inputs = encoded.to(device)

# Beam-search settings used to decode the summary.
generation_options = {
    "attention_mask": inputs["attention_mask"],
    "max_length": 200,
    "num_beams": 4,
    "length_penalty": 2.0,
    "early_stopping": True,
}
summary_ids = model.generate(inputs["input_ids"], **generation_options)

# Turn the first (only) generated sequence back into text and show it.
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Summary:", summary)

View File

@@ -1,27 +0,0 @@
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
# Use the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained T5 tokenizer and model, then move the model onto the
# selected device.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model.to(device)
sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . 
Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . ",
" We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas ."]
# t5-base is not instruction-tuned: it picks its task from the literal prefix
# it was trained with, which for summarization is "summarize: ". A free-form
# instruction is simply treated as part of the text to summarize.
text = "summarize: " + sample_chunks[1]
# Tokenize the input, truncating to the tokenizer's configured maximum length
# so an over-long chunk cannot exceed what the model was trained on.
inputs = tokenizer.encode(text, truncation=True, return_tensors="pt").to(device)
# Generate the summary with beam search.
summary_ids = model.generate(inputs, max_length=1000, num_beams=4, early_stopping=True)
# Decode the single generated sequence and print it.
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Summary:", summary)

View File

@@ -1,19 +0,0 @@
from transformers import AutoTokenizer, AutoModelForCausalLM
# Checkpoint used for this trial. An earlier attempt loaded
# "CarperAI/stable-vicuna-13b-delta" with the same two calls.
checkpoint = "lmsys/vicuna-13b-v1.3"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)
# Half-precision GPU placement is left disabled: model.half().cuda()
prompt = """\
Summarize the text in a subject line. text = "You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . 
Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . "
"""
# Tokenize the prompt. The tokenizer returns a dict-like encoding, not a
# tensor, so the token ids and attention mask are handed to generate()
# explicitly instead of passing the whole encoding as the input tensor.
inputs = tokenizer(prompt, return_tensors='pt')
tokens = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=256,
)
# The decoded sequence contains the prompt followed by the generated text.
print(tokenizer.decode(tokens[0], skip_special_tokens=True))

View File

@@ -1,98 +0,0 @@
import json
import yt_dlp as youtube_dl
from whisper_jax import FlaxWhisperPipline
import jax.numpy as jnp
def get_youtube_chapters(video_id):
    """Return the chapter markers of a YouTube video.

    Args:
        video_id: The YouTube video id (the ``v=`` value of the watch URL).

    Returns:
        A list of ``{'start', 'end', 'title'}`` dicts, one per chapter, in the
        order yt-dlp reports them. Empty when the video has no chapters.
    """
    video_url = "https://www.youtube.com/watch?v=" + video_id
    ydl_opts = {
        'extract_flat': 'in_playlist',
        'skip_download': True,
        'quiet': True,
    }

    # Metadata only: nothing is downloaded here.
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        video_info = ydl.extract_info(video_url, download=False)

    # yt-dlp may report "chapters" as None for a video without chapters, so a
    # plain membership test is not enough to make the value safe to iterate.
    chapters = video_info.get('chapters') or []
    return [
        {
            'start': chapter['start_time'],
            'end': chapter['end_time'],
            'title': chapter['title'],
        }
        for chapter in chapters
    ]
def get_youtube_transcription(video_id):
    """Download a video's audio as MP3 and transcribe it with Whisper.

    Args:
        video_id: The YouTube video id (the ``v=`` value of the watch URL).

    Returns:
        The ``"chunks"`` entry of the Whisper pipeline result, requested with
        ``return_timestamps=True``.
    """
    extract_mp3 = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [extract_mp3],
        'outtmpl': './artefacts/audio',  # Output path and name, no extension
    }
    watch_url = "https://www.youtube.com/watch?v=" + video_id
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([watch_url])

    # NOTE(review): relies on the audio post-processor writing the MP3 next to
    # the output template with an ".mp3" suffix -- confirm.
    media_file = "./artefacts/audio.mp3"
    pipeline = FlaxWhisperPipline(
        "openai/whisper-tiny",
        dtype=jnp.float16,
        batch_size=16,
    )
    whisper_result = pipeline(media_file, return_timestamps=True)
    return whisper_result["chunks"]
def scrape_youtube_data(video_id):
    """Collect the transcript and the chapter list for one video.

    Both results are echoed to stdout before being returned.

    Returns:
        A ``(transcript_text, chapters)`` tuple.
    """
    transcript_text = get_youtube_transcription(video_id)
    chapters = get_youtube_chapters(video_id)
    for label, value in (("transcript_text", transcript_text), ("chapters", chapters)):
        print(label, value)
    return transcript_text, chapters
def _segment_within(segment, start_time, end_time):
    """Return True when a transcript segment lies inside [start_time, end_time)."""
    segment_start, segment_end = segment["timestamp"]
    # NOTE(review): Whisper can leave the end timestamp of the final segment
    # as None; treat such a segment as ending where it starts rather than
    # crashing on the comparison -- confirm against the pipeline output.
    if segment_end is None:
        segment_end = segment_start
    return segment_start >= start_time and segment_end < end_time


def generate_finetuning_dataset(video_ids, scraper=None):
    """Build prompt/completion pairs mapping transcript text to chapter titles.

    For every chapter of every video, the prompt is the concatenated text of
    the transcript segments that fall entirely inside the chapter, and the
    completion is the chapter title. Chapters with no matching transcript
    text are skipped.

    Args:
        video_ids: Iterable of YouTube video ids.
        scraper: Optional callable ``video_id -> (transcript, chapters)``.
            Defaults to ``scrape_youtube_data``; supplying one allows the
            dataset to be built from already-fetched data.

    Returns:
        A list of ``{"prompt": str, "completion": str}`` dicts.
    """
    if scraper is None:
        scraper = scrape_youtube_data

    prompt_completion_pairs = []
    for video_id in video_ids:
        transcript_text, chapters = scraper(video_id)
        if transcript_text is None or chapters is None:
            continue
        for chapter in chapters:
            prompt = "".join(
                segment["text"]
                for segment in transcript_text
                if _segment_within(segment, chapter["start"], chapter["end"])
            )
            # An empty prompt carries no training signal; the previous
            # "is not None" check could never filter it out.
            if prompt:
                prompt_completion_pairs.append(
                    {"prompt": prompt, "completion": chapter["title"]}
                )
    return prompt_completion_pairs
# Ids of the videos to harvest; every video listed here must have chapters.
video_ids = ["yTnSEZIwnkU"]
dataset = generate_finetuning_dataset(video_ids)

# Write the pairs as JSON Lines: one JSON object per line.
with open("finetuning_dataset.jsonl", "w") as jsonl_file:
    jsonl_file.writelines(json.dumps(example) + "\n" for example in dataset)