mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-21 04:39:06 +00:00
move all experiments to trials
This commit is contained in:
98
trials/youtube_scraping.py
Normal file
98
trials/youtube_scraping.py
Normal file
@@ -0,0 +1,98 @@
|
||||
import json
|
||||
import yt_dlp as youtube_dl
|
||||
from whisper_jax import FlaxWhisperPipline
|
||||
import jax.numpy as jnp
|
||||
|
||||
# Function to extract chapter information for a YouTube video, given its video ID
|
||||
def get_youtube_chapters(video_id):
    """Return the chapter list of a YouTube video.

    Only metadata is requested from yt_dlp (``skip_download`` is set and
    ``extract_info`` is called with ``download=False``).

    Args:
        video_id: YouTube video ID; it is appended to the standard
            ``https://www.youtube.com/watch?v=`` URL.

    Returns:
        A list of dicts with keys ``'start'``, ``'end'`` and ``'title'``,
        copied from each chapter's ``start_time``, ``end_time`` and
        ``title``. The list is empty when the video reports no chapters.
    """
    video_url = "https://www.youtube.com/watch?v=" + video_id
    ydl_opts = {
        'extract_flat': 'in_playlist',
        'skip_download': True,
        'quiet': True,
    }

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        video_info = ydl.extract_info(video_url, download=False)

    # The 'chapters' key may be present but set to None for videos without
    # chapters; a plain `in` check would then try to iterate None and raise
    # TypeError. `or []` covers both the missing key and the None value.
    raw_chapters = video_info.get('chapters') or []

    return [
        {
            'start': chapter['start_time'],
            'end': chapter['end_time'],
            'title': chapter['title'],
        }
        for chapter in raw_chapters
    ]
|
||||
|
||||
|
||||
# Function to download a video's audio with yt_dlp and transcribe it with Whisper
|
||||
def get_youtube_transcription(video_id):
    """Download a video's audio track and transcribe it with Whisper.

    The audio is fetched with yt_dlp using the ``FFmpegExtractAudio``
    post-processor (mp3, quality 192) and the output template
    ``./artefacts/audio``. The file ``./artefacts/audio.mp3`` is then passed
    to a ``FlaxWhisperPipline`` built for the ``openai/whisper-tiny``
    checkpoint with float16 dtype and a batch size of 16.

    Args:
        video_id: YouTube video ID; it is appended to the standard
            ``https://www.youtube.com/watch?v=`` URL.

    Returns:
        The ``"chunks"`` entry of the pipeline result, requested with
        ``return_timestamps=True``.
    """
    watch_url = "https://www.youtube.com/watch?v=" + video_id

    extract_audio = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }
    download_options = {
        'format': 'bestaudio/best',
        'postprocessors': [extract_audio],
        'outtmpl': './artefacts/audio',  # Output path and base file name
    }

    # Fetch the audio; the post-processor converts it to mp3.
    with youtube_dl.YoutubeDL(download_options) as downloader:
        downloader.download([watch_url])

    audio_path = "./artefacts/audio.mp3"

    # NOTE: a new pipeline is constructed on every call.
    transcriber = FlaxWhisperPipline(
        "openai/whisper-" + "tiny",
        dtype=jnp.float16,
        batch_size=16,
    )
    result = transcriber(audio_path, return_timestamps=True)
    return result["chunks"]
|
||||
|
||||
|
||||
|
||||
# Function to scrape YouTube video transcripts and chapter information
|
||||
def scrape_youtube_data(video_id):
    """Collect the transcript chunks and the chapter list for one video.

    Both results are printed to stdout (labelled ``transcript_text`` and
    ``chapters``) before being returned.

    Args:
        video_id: YouTube video ID, forwarded to both helper functions.

    Returns:
        A ``(transcript_chunks, chapters)`` tuple: the result of
        ``get_youtube_transcription`` followed by the result of
        ``get_youtube_chapters``.
    """
    # Transcription runs first, then the chapter lookup.
    chunks = get_youtube_transcription(video_id)
    chapter_list = get_youtube_chapters(video_id)

    for label, value in (("transcript_text", chunks), ("chapters", chapter_list)):
        print(label, value)

    return chunks, chapter_list
|
||||
|
||||
|
||||
# Function to generate fine-tuning dataset from YouTube data
|
||||
def generate_finetuning_dataset(video_ids):
    """Build prompt/completion pairs from YouTube chapters and transcripts.

    For every video, each chapter becomes one training example: the prompt is
    the concatenated text of the transcript chunks lying inside the chapter's
    time range, and the completion is the chapter title.

    Args:
        video_ids: Iterable of YouTube video IDs.

    Returns:
        A list of ``{"prompt": str, "completion": str}`` dicts. Chapters for
        which no transcript chunk falls inside the time range are skipped.
    """
    prompt_completion_pairs = []

    for video_id in video_ids:
        transcript_text, chapters = scrape_youtube_data(video_id)
        if transcript_text is None or chapters is None:
            continue

        for chapter in chapters:
            start_time = chapter["start"]
            end_time = chapter["end"]

            pieces = []
            for segment in transcript_text:
                seg_start = segment["timestamp"][0]
                seg_end = segment["timestamp"][1]
                # NOTE(review): speech pipelines can leave a chunk timestamp
                # as None (typically the end of the final chunk) - confirm
                # for whisper_jax. Comparing None with a number would raise
                # TypeError, so fall back to the start time / skip the chunk.
                if seg_start is None:
                    continue
                if seg_end is None:
                    seg_end = seg_start
                # A chunk belongs to the chapter only when it lies entirely
                # inside [start_time, end_time).
                if seg_start >= start_time and seg_end < end_time:
                    pieces.append(segment["text"])

            prompt = "".join(pieces)

            # The prompt is always a string, so an `is not None` check can
            # never filter anything; test for emptiness so chapters without
            # transcript text do not yield empty-prompt examples.
            if prompt:
                prompt_completion_pairs.append(
                    {"prompt": prompt, "completion": chapter["title"]}
                )

    return prompt_completion_pairs
|
||||
|
||||
|
||||
# Add all the video IDs here; each video must have chapters (chapter titles become the completions)
|
||||
video_ids = ["yTnSEZIwnkU"]
dataset = generate_finetuning_dataset(video_ids)

# Write the dataset as JSON Lines: one JSON object per line.
with open("finetuning_dataset.jsonl", "w") as f:
    for example in dataset:
        print(json.dumps(example), file=f)
|
||||
Reference in New Issue
Block a user