Files
reflector/reflector-local/2-agenda-transcript-diff.py
Gokul Mohanarangan b7fbfb2a54 refactor
2023-07-11 11:06:27 +05:30

69 lines
2.3 KiB
Python

import argparse
import spacy
from loguru import logger
# Command-line interface: the agenda and transcription file locations
def init_argparse() -> argparse.ArgumentParser:
    """Build the command-line parser for the agenda/transcript comparison.

    Returns:
        A parser expecting two positional arguments, ``agenda`` and
        ``transcription``, each the location of the corresponding file.
    """
    parser = argparse.ArgumentParser(
        usage="%(prog)s <AGENDA> <TRANSCRIPTION>",
        description="Compares the transcript of a video or audio file to an agenda using the SpaCy model",
    )
    parser.add_argument("agenda", help="Location of the agenda file")
    parser.add_argument("transcription", help="Location of the transcription file")
    return parser
# Read the agenda and transcription file locations from the command line
args = init_argparse().parse_args()
agenda_path = args.agenda
transcription_path = args.transcription

# Similarity a sentence must exceed to count as covering an agenda item.
# Defined up front so the final log line can report it even when the agenda
# is empty or no sentence is comparable.
similarity_threshold = 0.7

# Load the spaCy model and add the sentencizer
spaCy_model = "en_core_web_md"
nlp = spacy.load(spaCy_model)
nlp.add_pipe('sentencizer')
logger.info(f"Loaded spaCy model {spaCy_model}")

# Load the agenda: one item per non-blank line, surrounding whitespace removed
with open(agenda_path, "r") as f:
    agenda = [line.strip() for line in f if line.strip()]
logger.info("Loaded agenda items")

# Load the transcription
with open(transcription_path, "r") as f:
    transcription = f.read()
logger.info("Loaded transcription")

# Tokenize the transcription using spaCy
doc_transcription = nlp(transcription)
logger.info("Tokenized transcription")

# Keep only the sentences that can be compared: skip empty spans and spans
# containing any token that has no word vector. This filter does not depend
# on the agenda item, so it is computed once instead of once per item.
comparable_sentences = [
    sent
    for sent in doc_transcription.sents
    if sent and all(token.has_vector for token in sent)
]

# Find the items covered in the transcription: an item is covered as soon as
# one sentence is more similar to it than the threshold
covered_items = {}
for item in agenda:
    item_doc = nlp(item)
    covered_items[item] = any(
        sent.similarity(item_doc) > similarity_threshold
        for sent in comparable_sentences
    )

# Count the number of items covered and calculate the percentage.
# Counting per agenda line (not per dict key) keeps repeated agenda lines
# consistent with the len(agenda) denominator; an empty agenda yields 0%
# rather than a division by zero.
num_covered_items = sum(1 for item in agenda if covered_items[item])
percentage_covered = num_covered_items / len(agenda) * 100 if agenda else 0.0

# Print the results, marking each item as covered or not covered
print("💬 Agenda items covered in the transcription:")
for item in agenda:
    marker = "✅" if covered_items[item] else "❌"
    print(marker, item)
print("📊 Coverage: {:.2f}%".format(percentage_covered))
logger.info(f"Finished comparing agenda to transcription with similarity threshold of {similarity_threshold}")