This commit is contained in:
Gokul Mohanarangan
2023-07-11 18:47:21 +05:30
parent d962ff1712
commit 71eb277fd7
11 changed files with 67 additions and 80 deletions

View File

@@ -3,8 +3,8 @@ import sys
import boto3
import botocore
from log_utils import logger
from run_utils import config
from .log_utils import logger
from .run_utils import config
BUCKET_NAME = config["DEFAULT"]["BUCKET_NAME"]

View File

@@ -6,18 +6,18 @@ from threading import Lock
from typing import ContextManager, Generic, TypeVar
class ReflectorConfig:
    """Lazily loaded, shared holder for the parsed ``config.ini``.

    The parser is created and the file is read the first time
    ``get_config()`` is called; every later call returns that same
    parser instance.
    """

    # Shared parser instance; stays None until the first get_config() call.
    __config = None

    @staticmethod
    def get_config():
        """Return the shared parser, reading ``utils/config.ini`` on first use.

        Returns:
            configparser.ConfigParser: the same instance on every call.
        """
        if ReflectorConfig.__config is None:
            ReflectorConfig.__config = configparser.ConfigParser()
            # The path is relative to the current working directory.
            # ConfigParser.read() skips files it cannot open, so running
            # from a different directory yields an empty parser rather
            # than an error.
            ReflectorConfig.__config.read('utils/config.ini')
        return ReflectorConfig.__config


# Module-level handle that other modules import (``import config``).
config = ReflectorConfig.get_config()
def run_in_executor(func, *args, executor=None, **kwargs):

0
utils/test.py Normal file
View File

View File

@@ -6,8 +6,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BartForConditionalGeneration, BartTokenizer
from log_utils import logger
from run_utils import config
from utils.log_utils import logger
from utils.run_utils import config
nltk.download('punkt', quiet=True)
@@ -186,7 +186,7 @@ def summarize(transcript_text, timestamp,
decoded_summaries = [tokenizer.decode(summary, skip_special_tokens=True, clean_up_tokenization_spaces=False)
for summary in summaries]
summary = " ".join(decoded_summaries)
with open(output_filename, 'w') as f:
with open("./artefacts/" + output_filename, 'w') as f:
f.write(summary.strip() + "\n")
else:
logger.info("Breaking transcript into smaller chunks")

View File

@@ -52,7 +52,7 @@ def create_wordcloud(timestamp, real_time=False):
else:
wordcloud_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
plt.savefig(wordcloud_name)
plt.savefig("./artefacts/" + wordcloud_name)
def create_talk_diff_scatter_viz(timestamp, real_time=False):
@@ -77,10 +77,10 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
# Load the transcription with timestamp
filename = ""
if real_time:
filename = "real_time_transcript_with_timestamp_" +\
filename = "./artefacts/real_time_transcript_with_timestamp_" +\
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
else:
filename = "transcript_with_timestamp_" +\
filename = "./artefacts/transcript_with_timestamp_" +\
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
with open(filename) as f:
transcription_timestamp_text = f.read()
@@ -162,7 +162,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
else:
df_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
df.to_pickle(df_name)
df.to_pickle("./artefacts/" + df_name)
my_mappings = [ts_to_topic_mapping_top_1, ts_to_topic_mapping_top_2,
topic_to_ts_mapping_top_1, topic_to_ts_mapping_top_2]
@@ -173,7 +173,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
else:
mappings_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
pickle.dump(my_mappings, open(mappings_name, "wb"))
pickle.dump(my_mappings, open("./artefacts/" + mappings_name, "wb"))
# to load, my_mappings = pickle.load( open ("mappings.pkl", "rb") )
@@ -187,27 +187,28 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
topic_times = sorted(topic_times.items(), key=lambda x: x[1], reverse=True)
cat_1 = topic_times[0][0]
cat_1_name = topic_times[0][0]
cat_2_name = topic_times[1][0]
if len(topic_times) > 1:
cat_1 = topic_times[0][0]
cat_1_name = topic_times[0][0]
cat_2_name = topic_times[1][0]
# Scatter plot of topics
df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
corpus = st.CorpusFromParsedDocuments(
df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
html = st.produce_scattertext_explorer(
corpus,
category=cat_1,
category_name=cat_1_name,
not_category_name=cat_2_name,
minimum_term_frequency=0, pmi_threshold_coefficient=0,
width_in_pixels=1000,
transform=st.Scalers.dense_rank
)
if real_time:
open('./artefacts/real_time_scatter_' +
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
else:
open('./artefacts/scatter_' +
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
# Scatter plot of topics
df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
corpus = st.CorpusFromParsedDocuments(
df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
html = st.produce_scattertext_explorer(
corpus,
category=cat_1,
category_name=cat_1_name,
not_category_name=cat_2_name,
minimum_term_frequency=0, pmi_threshold_coefficient=0,
width_in_pixels=1000,
transform=st.Scalers.dense_rank
)
if real_time:
open('./artefacts/real_time_scatter_' +
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
else:
open('./artefacts/scatter_' +
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)