Gokul Mohanarangan
2023-07-11 11:06:27 +05:30
parent 58c9cdf676
commit b7fbfb2a54
13 changed files with 54 additions and 44 deletions

View File

@@ -1,10 +1,11 @@
import os
import subprocess
import sys
from loguru import logger
# Get the input file name from the command line argument
input_file = sys.argv[1]
# example use: python 0-reflector-local.py input.m4a agenda.txt
# Get the agenda file name from the command line argument if provided
@@ -21,7 +22,7 @@ if not os.path.exists(agenda_file):
# Check if the input file is .m4a, if so convert to .mp4
if input_file.endswith(".m4a"):
subprocess.run(["ffmpeg", "-i", input_file, f"{input_file}.mp4"])
input_file = f"{input_file}.mp4"
# Run the first script to generate the transcript
subprocess.run(["python3", "1-transcript-generator.py", input_file, f"{input_file}_transcript.txt"])
@@ -30,4 +31,4 @@ subprocess.run(["python3", "1-transcript-generator.py", input_file, f"{input_fil
subprocess.run(["python3", "2-agenda-transcript-diff.py", agenda_file, f"{input_file}_transcript.txt"])
# Run the third script to summarize the transcript
subprocess.run(["python3", "3-transcript-summarizer.py", f"{input_file}_transcript.txt", f"{input_file}_summary.txt"])
subprocess.run(["python3", "3-transcript-summarizer.py", f"{input_file}_transcript.txt", f"{input_file}_summary.txt"])

View File

@@ -1,11 +1,13 @@
import argparse
import os
import moviepy.editor
import whisper
from loguru import logger
WHISPER_MODEL_SIZE = "base"
def init_argparse() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
usage="%(prog)s <LOCATION> <OUTPUT>",
@@ -15,6 +17,7 @@ def init_argparse() -> argparse.ArgumentParser:
parser.add_argument("output", help="Output file path")
return parser
def main():
import sys
sys.setrecursionlimit(10000)
@@ -26,10 +29,11 @@ def main():
logger.info(f"Processing file: {media_file}")
# Check if the media file is a valid audio or video file
if os.path.isfile(media_file) and not media_file.endswith(
('.mp3', '.wav', '.ogg', '.flac', '.mp4', '.avi', '.flv')):
logger.error(f"Invalid file format: {media_file}")
return
# If the media file we just retrieved is an audio file then skip extraction step
audio_filename = media_file
logger.info(f"Found audio-only file, skipping audio extraction")
@@ -53,5 +57,6 @@ def main():
transcript_file.write(whisper_result["text"])
transcript_file.close()
if __name__ == "__main__":
main()
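
For reference, a condensed sketch of the transcription path this script implements, using the moviepy and openai-whisper APIs imported above (the file names are illustrative):

import moviepy.editor
import whisper

media_file = "meeting.mp4"        # illustrative video input
audio_filename = "meeting.wav"    # illustrative intermediate file

# For a video container, pull out the audio track first; an audio-only input
# would skip this step, as the log message above notes.
moviepy.editor.VideoFileClip(media_file).audio.write_audiofile(audio_filename)

model = whisper.load_model("base")        # same size as WHISPER_MODEL_SIZE
result = model.transcribe(audio_filename)
with open("meeting_transcript.txt", "w") as transcript_file:
    transcript_file.write(result["text"])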

View File

@@ -1,7 +1,9 @@
import argparse
import spacy
from loguru import logger
# Define the paths for agenda and transcription files
def init_argparse() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
@@ -11,6 +13,8 @@ def init_argparse() -> argparse.ArgumentParser:
parser.add_argument("agenda", help="Location of the agenda file")
parser.add_argument("transcription", help="Location of the transcription file")
return parser
args = init_argparse().parse_args()
agenda_path = args.agenda
transcription_path = args.transcription
@@ -19,7 +23,7 @@ transcription_path = args.transcription
spaCy_model = "en_core_web_md"
nlp = spacy.load(spaCy_model)
nlp.add_pipe('sentencizer')
logger.info("Loaded spaCy model " + spaCy_model )
logger.info("Loaded spaCy model " + spaCy_model)
# Load the agenda
with open(agenda_path, "r") as f:
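
The hunk ends before the comparison logic, so the following shows only one way the agenda could be checked against the transcript with the en_core_web_md vectors and sentencizer loaded above; it is an illustrative sketch, not the script's actual method:

import spacy

nlp = spacy.load("en_core_web_md")
nlp.add_pipe("sentencizer")

with open("agenda.txt") as f:
    agenda_items = [nlp(line.strip()) for line in f if line.strip()]
with open("transcript.txt") as f:
    transcript_sents = list(nlp(f.read()).sents)

# Vector similarity between each agenda item and every transcript sentence;
# the best-matching sentence hints at whether the item was discussed.
for item in agenda_items:
    best = max(transcript_sents, key=lambda sent: item.similarity(sent))
    print(f"{item.text} -> {best.text}")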

View File

@@ -1,11 +1,14 @@
import argparse
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from heapq import nlargest
from loguru import logger
# Function to initialize the argument parser
def init_argparse():
parser = argparse.ArgumentParser(
@@ -17,12 +20,14 @@ def init_argparse():
parser.add_argument("--num_sentences", type=int, default=5, help="Number of sentences to include in the summary")
return parser
# Function to read the input transcript file
def read_transcript(file_path):
with open(file_path, "r") as file:
transcript = file.read()
return transcript
# Function to preprocess the text by removing stop words and special characters
def preprocess_text(text):
stop_words = set(stopwords.words('english'))
@@ -30,6 +35,7 @@ def preprocess_text(text):
words = [w.lower() for w in words if w.isalpha() and w.lower() not in stop_words]
return words
# Function to score each sentence based on the frequency of its words and return the top sentences
def summarize_text(text, num_sentences):
# Tokenize the text into sentences
@@ -61,6 +67,7 @@ def summarize_text(text, num_sentences):
return " ".join(summary)
def main():
# Initialize the argument parser and parse the arguments
parser = init_argparse()
@@ -82,5 +89,6 @@ def main():
logger.info("Summarization completed")
if __name__ == "__main__":
main()
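
The diff only shows fragments of summarize_text, so here is a minimal self-contained sketch of the frequency-scoring approach the comments describe (the function name and the FreqDist helper are illustrative; the 'punkt' and 'stopwords' corpora are assumed to be available):

from heapq import nlargest

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

def summarize(text, num_sentences=5):
    # Keep only alphabetic, non-stop-word tokens and count their frequency
    stop_words = set(stopwords.words("english"))
    words = [w.lower() for w in word_tokenize(text)
             if w.isalpha() and w.lower() not in stop_words]
    freq = nltk.FreqDist(words)
    # Score each sentence by the summed frequency of its content words
    sentences = sent_tokenize(text)
    scores = {
        s: sum(freq[w.lower()] for w in word_tokenize(s) if w.lower() in freq)
        for s in sentences
    }
    # Return the top-scoring sentences joined into a short summary
    return " ".join(nlargest(num_sentences, scores, key=scores.get))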

View File

@@ -1,15 +1,18 @@
import argparse
import os
import tempfile
import moviepy.editor
import nltk
import whisper
from loguru import logger
from transformers import BartTokenizer, BartForConditionalGeneration
nltk.download('punkt', quiet=True)
WHISPER_MODEL_SIZE = "base"
def init_argparse() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
usage="%(prog)s [OPTIONS] <LOCATION> <OUTPUT>",
@@ -30,6 +33,7 @@ def init_argparse() -> argparse.ArgumentParser:
return parser
# NLTK chunking function
def chunk_text(txt, max_chunk_length=500):
"Split text into smaller chunks."
@@ -45,6 +49,7 @@ def chunk_text(txt, max_chunk_length=500):
chunks.append(current_chunk.strip())
return chunks
# BART summary function
def summarize_chunks(chunks, tokenizer, model):
summaries = []
@@ -56,6 +61,7 @@ def summarize_chunks(chunks, tokenizer, model):
summaries.append(summary)
return summaries
def main():
import sys
sys.setrecursionlimit(10000)
@@ -103,7 +109,7 @@ def main():
chunks = chunk_text(whisper_result['text'])
logger.info(
f"Transcript broken into {len(chunks)} chunks of at most 500 words") # TODO fix variable
f"Transcript broken into {len(chunks)} chunks of at most 500 words") # TODO fix variable
logger.info(f"Writing summary text in {args.language} to: {args.output}")
with open(args.output, 'w') as f:
@@ -114,5 +120,6 @@ def main():
logger.info("Summarization completed")
if __name__ == "__main__":
main()
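
The hunks above show only the headers of chunk_text and summarize_chunks, so here is a compact sketch of the chunk-then-summarize pattern those comments name; the facebook/bart-large-cnn checkpoint and the generation settings are assumptions, not values taken from the file:

import nltk
from transformers import BartForConditionalGeneration, BartTokenizer

nltk.download("punkt", quiet=True)

def chunk_text(txt, max_chunk_length=500):
    # Greedily pack whole sentences into chunks of at most ~max_chunk_length words
    chunks, current = [], ""
    for sent in nltk.sent_tokenize(txt):
        if current and len((current + " " + sent).split()) > max_chunk_length:
            chunks.append(current.strip())
            current = sent
        else:
            current = (current + " " + sent).strip()
    if current:
        chunks.append(current)
    return chunks

def summarize_chunks(chunks, tokenizer, model):
    # Summarize each chunk independently with beam search, then collect the pieces
    summaries = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=1024)
        ids = model.generate(inputs["input_ids"], num_beams=4, max_length=150)
        summaries.append(tokenizer.decode(ids[0], skip_special_tokens=True))
    return summaries

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
with open("transcript.txt") as f:
    print("\n".join(summarize_chunks(chunk_text(f.read()), tokenizer, model)))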