organize imports

This commit is contained in:
Gokul Mohanarangan
2023-07-25 10:02:25 +05:30
parent ab42858ec8
commit 25f34bf9e5
8 changed files with 79 additions and 80 deletions

View File

@@ -1,24 +1,24 @@
[DEFAULT]
# Set exception rule for OpenMP error to allow duplicate lib initialization
KMP_DUPLICATE_LIB_OK=TRUE
KMP_DUPLICATE_LIB_OK = TRUE
# Export OpenAI API key
OPENAI_APIKEY=
OPENAI_APIKEY =
# Export Whisper model size
WHISPER_MODEL_SIZE=tiny
WHISPER_REAL_TIME_MODEL_SIZE=tiny
WHISPER_MODEL_SIZE = tiny
WHISPER_REAL_TIME_MODEL_SIZE = tiny
# AWS config
AWS_ACCESS_KEY=***REMOVED***
AWS_SECRET_KEY=***REMOVED***
BUCKET_NAME=reflector-bucket
AWS_ACCESS_KEY = ***REMOVED***
AWS_SECRET_KEY = ***REMOVED***
BUCKET_NAME = reflector-bucket
# Summarizer config
SUMMARY_MODEL=facebook/bart-large-cnn
INPUT_ENCODING_MAX_LENGTH=1024
MAX_LENGTH=2048
BEAM_SIZE=6
MAX_CHUNK_LENGTH=1024
SUMMARIZE_USING_CHUNKS=YES
SUMMARY_MODEL = facebook/bart-large-cnn
INPUT_ENCODING_MAX_LENGTH = 1024
MAX_LENGTH = 2048
BEAM_SIZE = 6
MAX_CHUNK_LENGTH = 1024
SUMMARIZE_USING_CHUNKS = YES
# Audio device
BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME=aggregator
AV_FOUNDATION_DEVICE_ID=1
BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME = aggregator
AV_FOUNDATION_DEVICE_ID = 1
# LLM PATH
LLM_PATH=
LLM_PATH =

32
utils/format_output.py Normal file
View File

@@ -0,0 +1,32 @@
import json
# Split the combined titles/summaries JSON artefact into three plain-text
# artefacts: the concatenated transcript, the concatenated topic descriptions,
# and a per-topic title/description/transcript listing.
# All paths are relative, so they resolve against the current working
# directory the script is launched from.
with open("../artefacts/meeting_titles_and_summaries.txt", "r") as f:
    outputs = json.load(f)

# Context managers guarantee that all three files are flushed and closed even
# if a topic is missing a key or a write fails part-way through the loop.
# The files are opened in append mode, so re-running the script adds to any
# existing artefacts instead of overwriting them.
with open("../artefacts/meeting_transcript.txt", "a") as transcript_file, \
        open("../artefacts/meeting_title_description.txt", "a") as title_desc_file, \
        open("../artefacts/meeting_summary.txt", "a") as summary_file:
    for item in outputs["topics"]:
        # Transcript and description are written back-to-back with no
        # separator between consecutive topics.
        transcript_file.write(item["transcript"])
        summary_file.write(item["description"])

        # One labelled entry per topic, terminated by a rule line.
        title_desc_file.writelines([
            "TITLE: \n",
            item["title"],
            "\n",
            "DESCRIPTION: \n",
            item["description"],
            "\n",
            "TRANSCRIPT: \n",
            item["transcript"],
            "\n",
            "---------------------------------------- \n\n",
        ])

View File

@@ -6,8 +6,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BartForConditionalGeneration, BartTokenizer
from utils.log_utils import logger
from utils.run_utils import config
from log_utils import logger
from run_utils import config
nltk.download('punkt', quiet=True)
@@ -171,7 +171,7 @@ def summarize(transcript_text, timestamp,
output_filename = "real_time_" + output_filename
if summarize_using_chunks != "YES":
inputs = tokenizer.\
inputs = tokenizer. \
batch_encode_plus([transcript_text], truncation=True,
padding='longest',
max_length=int(config["DEFAULT"]["INPUT_ENCODING_MAX_LENGTH"]),

View File

@@ -13,7 +13,7 @@ from wordcloud import STOPWORDS, WordCloud
en = spacy.load('en_core_web_md')
spacy_stopwords = en.Defaults.stop_words
STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))).\
STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))). \
union(set(spacy_stopwords))
@@ -24,7 +24,7 @@ def create_wordcloud(timestamp, real_time=False):
"""
filename = "transcript"
if real_time:
filename = "real_time_" + filename + "_" +\
filename = "real_time_" + filename + "_" + \
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
else:
filename += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
@@ -47,7 +47,7 @@ def create_wordcloud(timestamp, real_time=False):
wordcloud_name = "wordcloud"
if real_time:
wordcloud_name = "real_time_" + wordcloud_name + "_" +\
wordcloud_name = "real_time_" + wordcloud_name + "_" + \
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
else:
wordcloud_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
@@ -57,12 +57,12 @@ def create_wordcloud(timestamp, real_time=False):
def create_talk_diff_scatter_viz(timestamp, real_time=False):
"""
Perform agenda vs transription diff to see covered topics.
Perform agenda vs transcription diff to see covered topics.
Create a scatter plot of words in topics.
:return: None. Saved locally.
"""
spaCy_model = "en_core_web_md"
nlp = spacy.load(spaCy_model)
spacy_model = "en_core_web_md"
nlp = spacy.load(spacy_model)
nlp.add_pipe('sentencizer')
agenda_topics = []
@@ -75,12 +75,11 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
agenda_topics.append(line.split(":")[0])
# Load the transcription with timestamp
filename = ""
if real_time:
filename = "./artefacts/real_time_transcript_with_timestamp_" +\
filename = "./artefacts/real_time_transcript_with_timestamp_" + \
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
else:
filename = "./artefacts/transcript_with_timestamp_" +\
filename = "./artefacts/transcript_with_timestamp_" + \
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
with open(filename) as f:
transcription_timestamp_text = f.read()
@@ -142,7 +141,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
df = df.apply(create_new_columns, axis=1)
# Count the number of items covered and calculatre the percentage
# Count the number of items covered and calculate the percentage
num_covered_items = sum(covered_items.values())
percentage_covered = num_covered_items / len(agenda) * 100
@@ -158,7 +157,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
# Save df, mappings for further experimentation
df_name = "df"
if real_time:
df_name = "real_time_" + df_name + "_" +\
df_name = "real_time_" + df_name + "_" + \
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
else:
df_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
@@ -169,7 +168,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
mappings_name = "mappings"
if real_time:
mappings_name = "real_time_" + mappings_name + "_" +\
mappings_name = "real_time_" + mappings_name + "_" + \
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
else:
mappings_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"