# reflector/trials/finetuning/youtube_scraping.py

import json
import yt_dlp as youtube_dl
from whisper_jax import FlaxWhisperPipline
import jax.numpy as jnp

# Function to extract chapter information from a YouTube video URL
def get_youtube_chapters(video_id):
    """Return the chapter markers yt_dlp reports for a YouTube video.

    Only metadata is requested (``skip_download`` / ``download=False``); no
    media is downloaded.

    Args:
        video_id: YouTube video id; it is appended to the standard
            ``https://www.youtube.com/watch?v=`` URL.

    Returns:
        A list of dicts with the keys ``start``, ``end`` and ``title``, copied
        from each chapter's ``start_time``, ``end_time`` and ``title``. The
        list is empty when the video has no chapters.
    """
    video_url = "https://www.youtube.com/watch?v=" + video_id
    ydl_opts = {
        'extract_flat': 'in_playlist',
        'skip_download': True,
        'quiet': True,
    }

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        video_info = ydl.extract_info(video_url, download=False)

    # The info dict may carry "chapters" with a None value for videos without
    # chapters, so a plain key-membership test is not enough: normalise both
    # "missing" and "None" to an empty list before iterating.
    raw_chapters = (video_info or {}).get('chapters') or []

    return [
        {
            'start': chapter['start_time'],
            'end': chapter['end_time'],
            'title': chapter['title'],
        }
        for chapter in raw_chapters
    ]


# Function to download a video's audio with yt_dlp and transcribe it with Whisper
def get_youtube_transcription(video_id, model_size="tiny"):
    """Download the audio track of a YouTube video and transcribe it.

    The audio is fetched with yt_dlp, converted to MP3 by the
    ``FFmpegExtractAudio`` post-processor, and then passed to a
    ``FlaxWhisperPipline`` with timestamps enabled.

    Args:
        video_id: YouTube video id; it is appended to the standard
            ``https://www.youtube.com/watch?v=`` URL.
        model_size: Suffix of the ``openai/whisper-*`` checkpoint to load.
            Defaults to ``"tiny"``, the size this function previously
            hard-coded.

    Returns:
        The ``"chunks"`` entry of the pipeline result. Elsewhere in this file
        each chunk is read as a dict with ``"timestamp"`` and ``"text"`` keys.
    """
    # Single source of truth for the audio location: yt_dlp writes to this
    # base path and the MP3 post-processor is expected to add ".mp3".
    audio_base = "./artefacts/audio"
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': audio_base,  # Specify output file path and name
    }

    # Download the audio
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download(["https://www.youtube.com/watch?v=" + video_id])
    media_file = audio_base + ".mp3"

    pipeline = FlaxWhisperPipline("openai/whisper-" + model_size,
                                  dtype=jnp.float16,
                                  batch_size=16)
    whisper_result = pipeline(media_file, return_timestamps=True)
    return whisper_result["chunks"]


# Function to collect the transcript chunks and chapter list for one video
def scrape_youtube_data(video_id):
    """Fetch the transcript and the chapters of a video and echo both.

    The transcription is requested first, then the chapters; each result is
    printed with its label before the pair is returned.

    Args:
        video_id: YouTube video id handed to both lookups.

    Returns:
        A ``(transcript, chapters)`` tuple holding the two lookup results.
    """
    transcript_chunks = get_youtube_transcription(video_id)
    chapter_list = get_youtube_chapters(video_id)

    # Debug output, in the same order the data was gathered.
    for label, value in (
        ("transcript_text", transcript_chunks),
        ("chapters", chapter_list),
    ):
        print(label, value)

    return transcript_chunks, chapter_list


# Function to generate fine-tuning dataset from YouTube data
def generate_finetuning_dataset(video_ids):
    """Build prompt/completion pairs from the chapters of the given videos.

    For every chapter of every video, the prompt is the concatenated text of
    the transcript chunks that lie inside the chapter and the completion is
    the chapter title. Chapters for which no transcript text was found are
    skipped, so the dataset never contains empty prompts.

    Args:
        video_ids: Iterable of YouTube video ids.

    Returns:
        A list of ``{"prompt": str, "completion": str}`` dicts.
    """
    prompt_completion_pairs = []
    for video_id in video_ids:
        transcript_text, chapters = scrape_youtube_data(video_id)
        if transcript_text is None or chapters is None:
            continue

        for chapter in chapters:
            prompt = _chapter_prompt(
                transcript_text, chapter["start"], chapter["end"]
            )
            # An empty (or whitespace-only) prompt carries no training signal.
            if not prompt.strip():
                continue
            prompt_completion_pairs.append(
                {"prompt": prompt, "completion": chapter["title"]}
            )

    return prompt_completion_pairs


def _chapter_prompt(transcript_chunks, start_time, end_time):
    """Join the text of the transcript chunks contained in one chapter.

    A chunk belongs to the chapter when it starts at or after ``start_time``
    and ends before ``end_time``.
    """
    parts = []
    for chunk in transcript_chunks:
        timestamp = chunk["timestamp"]
        chunk_start, chunk_end = timestamp[0], timestamp[1]
        # Timestamps can be None (e.g. an unknown end on the final chunk);
        # comparing None with a number would raise TypeError.
        if chunk_start is None:
            continue
        if chunk_end is None:
            # End unknown: place the chunk by where it begins.
            chunk_end = chunk_start
        if chunk_start >= start_time and chunk_end < end_time:
            parts.append(chunk["text"])
    return "".join(parts)


# Add all the video ids here; every video must have chapters, because the
# chapter titles are used as the completions.
video_ids = ["yTnSEZIwnkU"]
dataset = generate_finetuning_dataset(video_ids)

# Write the dataset as JSON Lines: one JSON object per line.
with open("finetuning_dataset.jsonl", "w", encoding="utf-8") as file:
    file.writelines(json.dumps(example) + "\n" for example in dataset)