mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-20 12:19:06 +00:00
69 lines
2.3 KiB
Python
69 lines
2.3 KiB
Python
import argparse
|
|
|
|
import spacy
|
|
from loguru import logger
|
|
|
|
|
|
# Command-line interface: the agenda and transcription file paths are passed as positional arguments
|
|
def init_argparse() -> argparse.ArgumentParser:
    """Build the command-line parser for the agenda/transcription comparison.

    Returns:
        A parser accepting two required positional arguments, ``agenda`` and
        ``transcription``, each giving the location of the corresponding file.
    """
    parser = argparse.ArgumentParser(
        usage="%(prog)s <AGENDA> <TRANSCRIPTION>",
        description="Compares the transcript of a video or audio file to an agenda using the SpaCy model",
    )
    parser.add_argument("agenda", help="Location of the agenda file")
    parser.add_argument("transcription", help="Location of the transcription file")
    return parser
|
|
|
|
|
|
# Read the agenda and transcription file locations from the command line
args = init_argparse().parse_args()
agenda_path = args.agenda
transcription_path = args.transcription

# Minimum similarity score for a sentence to count as covering an agenda item.
# Defined up front so it is always bound when the final log line runs, even if
# the agenda is empty or no sentence is comparable.
similarity_threshold = 0.7

# Load the spaCy model and add the sentencizer
spaCy_model = "en_core_web_md"
nlp = spacy.load(spaCy_model)
nlp.add_pipe('sentencizer')
logger.info("Loaded spaCy model " + spaCy_model)

# Load the agenda: one item per non-blank line, surrounding whitespace removed
with open(agenda_path, "r", encoding="utf-8") as f:
    agenda = [line.strip() for line in f if line.strip()]
logger.info("Loaded agenda items")

# Load the transcription
with open(transcription_path, "r", encoding="utf-8") as f:
    transcription = f.read()
logger.info("Loaded transcription")

# Tokenize the transcription using spaCy
doc_transcription = nlp(transcription)
logger.info("Tokenized transcription")

# Keep only the sentences that can be compared: skip empty spans and spans
# containing any token without a word vector. This filter does not depend on
# the agenda item, so it is evaluated once rather than once per item.
comparable_sents = [
    sent
    for sent in doc_transcription.sents
    if sent and all(token.has_vector for token in sent)
]

# Find the items covered in the transcription. An item is covered as soon as
# one comparable sentence scores above the threshold (any() stops at the
# first match).
covered_items = {}
for item in agenda:
    item_doc = nlp(item)
    if any(sent.similarity(item_doc) > similarity_threshold for sent in comparable_sents):
        covered_items[item] = True

# Count the number of items covered and calculate the percentage.
# Counting over the agenda list (not the dict) keeps the numerator consistent
# with len(agenda) when the same item text appears on several lines.
num_covered_items = sum(1 for item in agenda if item in covered_items)
if agenda:
    percentage_covered = num_covered_items / len(agenda) * 100
else:
    # An empty agenda would otherwise divide by zero
    logger.warning("Agenda contains no items; reporting 0% coverage")
    percentage_covered = 0.0

# Print the results
print("💬 Agenda items covered in the transcription:")
for item in agenda:
    if item in covered_items:
        print("✅ ", item)
    else:
        print("❌ ", item)
print("📊 Coverage: {:.2f}%".format(percentage_covered))
logger.info("Finished comparing agenda to transcription with similarity threshold of " + str(similarity_threshold))
|