This commit is contained in:
Gokul Mohanarangan
2023-07-11 11:06:27 +05:30
parent 58c9cdf676
commit b7fbfb2a54
13 changed files with 54 additions and 44 deletions

View File

@@ -1,11 +1,14 @@
import argparse
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from heapq import nlargest
from loguru import logger
# Function to initialize the argument parser
def init_argparse():
parser = argparse.ArgumentParser(
@@ -17,12 +20,14 @@ def init_argparse():
parser.add_argument("--num_sentences", type=int, default=5, help="Number of sentences to include in the summary")
return parser
# Function to read the input transcript file
def read_transcript(file_path):
with open(file_path, "r") as file:
transcript = file.read()
return transcript
# Function to preprocess the text by removing stop words and special characters
def preprocess_text(text):
stop_words = set(stopwords.words('english'))
@@ -30,6 +35,7 @@ def preprocess_text(text):
words = [w.lower() for w in words if w.isalpha() and w.lower() not in stop_words]
return words
# Function to score each sentence based on the frequency of its words and return the top sentences
def summarize_text(text, num_sentences):
# Tokenize the text into sentences
@@ -61,6 +67,7 @@ def summarize_text(text, num_sentences):
return " ".join(summary)
def main():
# Initialize the argument parser and parse the arguments
parser = init_argparse()
@@ -82,5 +89,6 @@ def main():
logger.info("Summarization completed")
if __name__ == "__main__":
main()