Mirror of https://github.com/Monadical-SAS/reflector.git, synced 2025-12-22 05:09:05 +00:00

Commit: summary trials
@@ -0,0 +1,47 @@
import subprocess

# Legacy OpenAI CLI fine-tuning workflow (pre-v1 CLI), kept for reference.
#
# Prepare the dataset for fine-tuning (list form avoids the shell=True
# pitfall of passing subprocess.run a single command string):
# subprocess.run(["openai", "tools", "fine_tunes.prepare_data",
#                 "-f", "finetuning_dataset.jsonl"], check=True)
#
# Set the API key in the shell first:
# export OPENAI_API_KEY=
#
# Create the fine-tune job:
# openai api fine_tunes.create -t <TRAIN_FILE_ID_OR_PATH> -m <BASE_MODEL>
#
# List existing fine-tune jobs:
# openai api fine_tunes.list
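
# Hypothetical helper (not in the original commit): the same legacy CLI
# steps driven from Python via subprocess. Argument values are placeholders.
def run_legacy_finetune(train_file: str, base_model: str) -> None:
    subprocess.run(["openai", "api", "fine_tunes.create",
                    "-t", train_file, "-m", base_model], check=True)
    subprocess.run(["openai", "api", "fine_tunes.list"], check=True)
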
import torch
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)

# Load the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# GPT-2 ships without a padding token; reuse EOS so padding=True works below
tokenizer.pad_token = tokenizer.eos_token
# Load and preprocess your dataset
dataset = [...]  # Your dataset of transcriptions and corresponding titles
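
# Hypothetical illustration (not from the original): the tokenizer call
# below expects a list of strings, so one plausible format is a single
# "transcription -> title" string per example.
example_dataset = [
    "Transcription: the team reviewed the release checklist. Title: Release Checklist Review",
    "Transcription: new-hire onboarding docs were discussed. Title: Onboarding Docs Discussion",
]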
# Tokenize and encode the dataset
encodings = tokenizer(dataset, truncation=True, padding=True)

# A raw BatchEncoding is not indexable per-example the way Trainer needs;
# wrap it in a torch Dataset that yields one dict of tensors per example
class TitleDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

encoded_dataset = TitleDataset(encodings)
# Define the fine-tuning training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=1000,
    save_total_limit=2,
    prediction_loss_only=True,
)
# Define the fine-tuning trainer. The collator pads each batch and sets
# labels = input_ids, which is what causal-LM fine-tuning needs; without
# labels, the Trainer cannot compute a loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset,
    data_collator=data_collator,
)
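
# Note (an assumption, not in the original): with save_steps=1000 and
# save_total_limit=2, intermediate checkpoints land under output_dir, and
# an interrupted run can be resumed with:
#
#   trainer.train(resume_from_checkpoint=True)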
# Fine-tune the GPT-2 model
trainer.train()

# Save the fine-tuned model; saving the tokenizer too makes the output
# directory loadable with from_pretrained later
trainer.save_model("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
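
# Illustrative follow-up (hypothetical, not part of the original commit):
# load the saved model and generate a title for a new transcription. The
# prompt format mirrors the example_dataset sketch above and is an assumption.
def generate_title(transcription: str) -> str:
    tok = GPT2Tokenizer.from_pretrained("./fine_tuned_model")
    lm = GPT2LMHeadModel.from_pretrained("./fine_tuned_model")
    inputs = tok("Transcription: " + transcription + " Title:", return_tensors="pt")
    output_ids = lm.generate(**inputs, max_new_tokens=20, pad_token_id=tok.eos_token_id)
    new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
    return tok.decode(new_tokens, skip_special_tokens=True).strip()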