mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-20 20:29:06 +00:00
Moved all server files to server/
This commit is contained in:
68
server/reflector-local/2-agenda-transcript-diff.py
Normal file
68
server/reflector-local/2-agenda-transcript-diff.py
Normal file
@@ -0,0 +1,68 @@
|
||||
import argparse
|
||||
|
||||
import spacy
|
||||
from loguru import logger
|
||||
|
||||
|
||||
# Define the command-line arguments that supply the agenda and transcription file paths
|
||||
def init_argparse() -> argparse.ArgumentParser:
    """Build the command-line parser for the agenda/transcription comparison.

    Returns:
        An ``argparse.ArgumentParser`` with two required positional
        arguments, ``agenda`` and ``transcription``, each holding the
        location of the corresponding file.
    """
    parser = argparse.ArgumentParser(
        usage="%(prog)s <AGENDA> <TRANSCRIPTION>",
        description=(
            "Compares the transcript of a video or audio file "
            "to an agenda using the SpaCy model"
        ),
    )
    parser.add_argument("agenda", help="Location of the agenda file")
    parser.add_argument(
        "transcription", help="Location of the transcription file"
    )
    return parser
|
||||
|
||||
|
||||
# Read the agenda and transcription file locations from the command line.
args = init_argparse().parse_args()
agenda_path = args.agenda
transcription_path = args.transcription

# Load the spaCy model and add the sentencizer
spaCy_model = "en_core_web_md"
nlp = spacy.load(spaCy_model)
nlp.add_pipe('sentencizer')
logger.info("Loaded spaCy model " + spaCy_model)

# Load the agenda: one item per non-blank line, with surrounding whitespace
# stripped. The encoding is explicit so the result does not depend on the
# platform's locale default.
with open(agenda_path, "r", encoding="utf-8") as f:
    agenda = [line.strip() for line in f if line.strip()]
logger.info("Loaded agenda items")

# Load the transcription as a single string.
with open(transcription_path, "r", encoding="utf-8") as f:
    transcription = f.read()
logger.info("Loaded transcription")

# Tokenize the transcription using spaCy
doc_transcription = nlp(transcription)
logger.info("Tokenized transcription")
|
||||
|
||||
# Find the items covered in the transcription.
# The threshold is bound before any loop so that the final log line can always
# read it, even when the agenda is empty or no sentence is ever compared.
similarity_threshold = 0.7

# The sentence filter does not depend on the agenda item, so apply it once.
# Empty spans are dropped, and so is any span containing a token that has no
# word vector.
comparable_sents = [
    sent
    for sent in doc_transcription.sents
    if sent and all(token.has_vector for token in sent)
]

# Map each agenda item to whether at least one comparable sentence is more
# similar to it than the threshold.
covered_items = {}
for item in agenda:
    item_doc = nlp(item)
    covered_items[item] = any(
        sent.similarity(item_doc) > similarity_threshold
        for sent in comparable_sents
    )

# Count the number of items covered and calculate the percentage.
# Counting walks the agenda list (not the dict) so repeated agenda lines are
# each counted, keeping the numerator consistent with len(agenda).
num_covered_items = sum(covered_items[item] for item in agenda)
# An empty agenda has nothing to cover; report 0% instead of dividing by zero.
percentage_covered = num_covered_items / len(agenda) * 100 if agenda else 0.0

# Print the results
print("💬 Agenda items covered in the transcription:")
for item in agenda:
    if covered_items[item]:
        print("✅ ", item)
    else:
        print("❌ ", item)
print("📊 Coverage: {:.2f}%".format(percentage_covered))
logger.info(
    "Finished comparing agenda to transcription with similarity threshold of "
    + str(similarity_threshold)
)
|
||||
Reference in New Issue
Block a user