This commit is contained in:
Gokul Mohanarangan
2023-07-11 11:06:27 +05:30
parent 58c9cdf676
commit b7fbfb2a54
13 changed files with 54 additions and 44 deletions

View File

@@ -1,15 +1,18 @@
import argparse
import os
import tempfile
import moviepy.editor
import nltk
import whisper
from loguru import logger
from transformers import BartTokenizer, BartForConditionalGeneration
import whisper
import nltk
nltk.download('punkt', quiet=True)
WHISPER_MODEL_SIZE = "base"
def init_argparse() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
usage="%(prog)s [OPTIONS] <LOCATION> <OUTPUT>",
@@ -30,6 +33,7 @@ def init_argparse() -> argparse.ArgumentParser:
return parser
# NLTK chunking function
def chunk_text(txt, max_chunk_length=500):
"Split text into smaller chunks."
@@ -45,6 +49,7 @@ def chunk_text(txt, max_chunk_length=500):
chunks.append(current_chunk.strip())
return chunks
# BART summary function
def summarize_chunks(chunks, tokenizer, model):
summaries = []
@@ -56,6 +61,7 @@ def summarize_chunks(chunks, tokenizer, model):
summaries.append(summary)
return summaries
def main():
import sys
sys.setrecursionlimit(10000)
@@ -103,7 +109,7 @@ def main():
chunks = chunk_text(whisper_result['text'])
logger.info(
f"Transcript broken into {len(chunks)} chunks of at most 500 words") # TODO fix variable
f"Transcript broken into {len(chunks)} chunks of at most 500 words") # TODO fix variable
logger.info(f"Writing summary text in {args.language} to: {args.output}")
with open(args.output, 'w') as f:
@@ -114,5 +120,6 @@ def main():
logger.info("Summarization completed")
if __name__ == "__main__":
main()