Mirror of https://github.com/Monadical-SAS/reflector.git (synced 2025-12-20 20:29:06 +00:00)
add local BART test script - 10min processing for 30min audio on M1 8GB
reflector-local/whisper_summarizer_bart.py  (new file, 118 lines added)
@@ -0,0 +1,118 @@
import argparse
import os
import tempfile

import moviepy.editor
import nltk
import whisper
from loguru import logger
from transformers import BartTokenizer, BartForConditionalGeneration

nltk.download('punkt')

WHISPER_MODEL_SIZE = "base"
MAX_CHUNK_LENGTH = 500


def init_argparse() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        usage="%(prog)s [OPTIONS] <LOCATION> <OUTPUT>",
        description="Creates a transcript of a video or audio file, then summarizes it using BART."
    )

    parser.add_argument("location", help="Location of the media file")
    parser.add_argument("output", help="Output file path")

    parser.add_argument(
        "-t", "--transcript", help="Save a copy of the intermediary transcript file", type=str)
    parser.add_argument(
        "-l", "--language", help="Language that the summary should be written in",
        type=str, default="english", choices=['english', 'spanish', 'french', 'german', 'romanian'])
    parser.add_argument(
        "-m", "--model_name", help="Name or path of the BART model",
        type=str, default="facebook/bart-large-cnn")

    return parser


# NLTK chunking function: split on sentence boundaries and pack sentences into
# chunks of at most max_chunk_length characters.
def chunk_text(txt, max_chunk_length=MAX_CHUNK_LENGTH):
    """Split text into smaller, sentence-aligned chunks."""
    sentences = nltk.sent_tokenize(txt)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_chunk_length:
            current_chunk += f" {sentence.strip()}"
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence.strip()
    chunks.append(current_chunk.strip())
    return chunks
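# Rough sizing (assumed figures): 30 minutes of speech transcribes to roughly
# 20,000-25,000 characters, so chunk_text yields on the order of 40-50
# sentence-aligned chunks, each under MAX_CHUNK_LENGTH characters and therefore
# well inside bart-large-cnn's 1024-token encoder limit.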

# BART summary function
def summarize_chunks(chunks, tokenizer, model):
    summaries = []
    for c in chunks:
        input_ids = tokenizer.encode(c, return_tensors='pt')
        summary_ids = model.generate(
            input_ids, num_beams=4, length_penalty=2.0, max_length=1024, no_repeat_ngram_size=3)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

def main():
    import sys
    sys.setrecursionlimit(10000)

    parser = init_argparse()
    args = parser.parse_args()

    media_file = args.location
    logger.info(f"Processing file: {media_file}")

    # If the media file we just retrieved is a video, extract its audio stream.
    if os.path.isfile(media_file) and media_file.endswith(('.mp4', '.avi', '.flv')):
        audio_filename = tempfile.NamedTemporaryFile(
            suffix=".mp3", delete=False).name
        logger.info(f"Extracting audio to: {audio_filename}")

        video = moviepy.editor.VideoFileClip(media_file)
        video.audio.write_audiofile(audio_filename, logger=None)

        logger.info("Finished extracting audio")
        media_file = audio_filename

    # Transcribe the audio file using the OpenAI Whisper model
    logger.info("Loading Whisper speech-to-text model")
    whisper_model = whisper.load_model(WHISPER_MODEL_SIZE)

    logger.info(f"Transcribing audio file: {media_file}")
    whisper_result = whisper_model.transcribe(media_file)

    logger.info("Finished transcribing file")

    # If we got the transcript parameter on the command line, save the transcript to the specified file.
    if args.transcript:
        logger.info(f"Saving transcript to: {args.transcript}")
        with open(args.transcript, "w") as transcript_file:
            transcript_file.write(whisper_result["text"])

    # Summarize the generated transcript using the BART model
    logger.info(f"Loading BART model: {args.model_name}")
    tokenizer = BartTokenizer.from_pretrained(args.model_name)
    model = BartForConditionalGeneration.from_pretrained(args.model_name)

    logger.info("Breaking transcript into smaller chunks")
    chunks = chunk_text(whisper_result['text'])

    logger.info(
        f"Transcript broken into {len(chunks)} chunks of at most {MAX_CHUNK_LENGTH} characters")

    logger.info(f"Writing summary text in {args.language} to: {args.output}")
    with open(args.output, 'w') as f:
        f.write('Summary of: ' + args.location + "\n\n")
        summaries = summarize_chunks(chunks, tokenizer, model)
        for summary in summaries:
            f.write(summary.strip() + "\n\n")

    logger.info("Summarization completed")


if __name__ == "__main__":
    main()
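For reference, the CLI defined by init_argparse above is invoked as python whisper_summarizer_bart.py <LOCATION> <OUTPUT> [-t TRANSCRIPT] [-l LANGUAGE] [-m MODEL_NAME]. Below is a minimal sketch of reusing the chunking and summarization helpers on an already-saved transcript; it assumes whisper_summarizer_bart.py is importable from the working directory and that sample_transcript.txt is a hypothetical plain-text transcript (importing the module also pulls in whisper and moviepy and triggers the nltk punkt download).

# Minimal sketch: reuse chunk_text and summarize_chunks without re-running Whisper.
# "sample_transcript.txt" is a hypothetical file name.
from transformers import BartForConditionalGeneration, BartTokenizer

import whisper_summarizer_bart as wsb

with open("sample_transcript.txt") as f:
    transcript = f.read()

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

chunks = wsb.chunk_text(transcript)          # sentence-aligned chunks of at most ~500 characters
summaries = wsb.summarize_chunks(chunks, tokenizer, model)
print("\n\n".join(s.strip() for s in summaries))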