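"""Build a prompt/completion fine-tuning dataset from YouTube videos.

For each video id, download the audio with yt_dlp, transcribe it with
whisper-jax, read the chapter markers from the video metadata, and pair the
transcript text of each chapter (prompt) with the chapter title (completion).
The pairs are written to finetuning_dataset.jsonl.
"""
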
import json

import jax.numpy as jnp
import yt_dlp as youtube_dl
# "Pipline" (sic) is the class name actually exported by whisper_jax.
from whisper_jax import FlaxWhisperPipline


# Function to extract chapter information from a YouTube video URL
def get_youtube_chapters(video_id):
    video_url = "https://www.youtube.com/watch?v=" + video_id
    ydl_opts = {
        'extract_flat': 'in_playlist',
        'skip_download': True,
        'quiet': True,
    }

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        video_info = ydl.extract_info(video_url, download=False)

    chapters = []

    # 'chapters' may be absent or None when the video has no chapter markers
    if video_info.get('chapters'):
        for chapter in video_info['chapters']:
            start_time = chapter['start_time']
            end_time = chapter['end_time']
            title = chapter['title']

            chapters.append({
                'start': start_time,
                'end': end_time,
                'title': title
            })

    return chapters

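
# For reference, get_youtube_chapters returns a list like the following
# (times in seconds; values here are illustrative, not from a real video):
#   [{'start': 0.0, 'end': 95.0, 'title': 'Intro'}, ...]
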
# Function to download a video's audio with yt_dlp and transcribe it with whisper-jax
def get_youtube_transcription(video_id):
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': './artefacts/audio',  # yt_dlp appends .mp3 after extraction
    }

    # Download the audio
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download(["https://www.youtube.com/watch?v=" + video_id])
    media_file = "./artefacts/audio.mp3"

    pipeline = FlaxWhisperPipline("openai/whisper-tiny",
                                  dtype=jnp.float16,
                                  batch_size=16)
    whisper_result = pipeline(media_file, return_timestamps=True)
    return whisper_result["chunks"]

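
# Each transcription chunk returned above has the form
#   {"text": "<segment text>", "timestamp": (start_seconds, end_seconds)},
# which is the shape generate_finetuning_dataset below depends on.
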
# Function to scrape YouTube video transcripts and chapter information
def scrape_youtube_data(video_id):
    transcript_text = get_youtube_transcription(video_id)
    chapters = get_youtube_chapters(video_id)
    print("transcript_text", transcript_text)
    print("chapters", chapters)
    return transcript_text, chapters


# Function to generate fine-tuning dataset from YouTube data
def generate_finetuning_dataset(video_ids):
    prompt_completion_pairs = []
    for video_id in video_ids:
        transcript_text, chapters = scrape_youtube_data(video_id)
        if transcript_text is not None and chapters is not None:
            for chapter in chapters:
                start_time = chapter["start"]
                end_time = chapter["end"]
                chapter_text = chapter["title"]

                # Concatenate every transcript chunk that falls inside the chapter
                prompt = ""
                for transcript in transcript_text:
                    if transcript["timestamp"][0] >= start_time and transcript["timestamp"][1] < end_time:
                        prompt += transcript["text"]

                # Skip chapters that matched no transcript text
                if prompt:
                    completion = chapter_text
                    prompt_completion_pairs.append({"prompt": prompt, "completion": completion})

    return prompt_completion_pairs

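
# Each dataset example pairs a chapter's transcript with its title, e.g.
# (illustrative values):
#   {"prompt": "<transcript text of one chapter>", "completion": "<chapter title>"}
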
# Add all the video ids here; the videos must have chapter markers
video_ids = ["yTnSEZIwnkU"]
dataset = generate_finetuning_dataset(video_ids)

# Write one JSON object per line (JSONL)
with open("finetuning_dataset.jsonl", "w") as f:
    for example in dataset:
        f.write(json.dumps(example) + "\n")
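
# Optional sanity check: reload the JSONL file and count the pairs written.
with open("finetuning_dataset.jsonl") as f:
    print("wrote", sum(1 for _ in f), "prompt/completion pairs")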