code cleanup

This commit is contained in:
Gokul Mohanarangan
2023-07-11 12:09:30 +05:30
parent b7fbfb2a54
commit 8e9cd6c568
15 changed files with 249 additions and 215 deletions

View File

@@ -1,14 +1,12 @@
import configparser
import sys
import boto3
import botocore
from run_utils import config
from log_utils import logger
config = configparser.ConfigParser()
config.read('config.ini')
BUCKET_NAME = 'reflector-bucket'
BUCKET_NAME = config["DEFAULT"]["BUCKET_NAME"]
s3 = boto3.client('s3',
aws_access_key_id=config["DEFAULT"]["AWS_ACCESS_KEY"],
@@ -18,8 +16,8 @@ s3 = boto3.client('s3',
def upload_files(files_to_upload):
"""
Upload a list of files to the configured S3 bucket
:param files_to_upload:
:return:
:param files_to_upload: List of files to upload
:return: None
"""
for KEY in files_to_upload:
logger.info("Uploading file " + KEY)
@@ -32,8 +30,8 @@ def upload_files(files_to_upload):
def download_files(files_to_download):
"""
Download a list of files from the configured S3 bucket
:param files_to_download:
:return:
:param files_to_download: List of files to download
:return: None
"""
for KEY in files_to_download:
logger.info("Downloading file " + KEY)
@@ -47,8 +45,6 @@ def download_files(files_to_download):
if __name__ == "__main__":
import sys
if sys.argv[1] == "download":
download_files([sys.argv[2]])
elif sys.argv[1] == "upload":

View File

@@ -6,6 +6,10 @@ class SingletonLogger:
@staticmethod
def get_logger():
    """
    Hand out the logger shared through the SingletonLogger class
    The module-level ``logger`` object is stored on the class the first
    time this is called and the stored object is returned afterwards
    :return: The shared logger object
    """
    # Fast path: something has already been stored on the class
    if SingletonLogger.__instance:
        return SingletonLogger.__instance
    SingletonLogger.__instance = logger
    return SingletonLogger.__instance

66
utils/run_utils.py Normal file
View File

@@ -0,0 +1,66 @@
import asyncio
import configparser
import contextlib
from functools import partial
from threading import Lock
from typing import ContextManager, Generic, TypeVar
class ConfigParser:
    """
    Holder for a single configparser.ConfigParser shared by the whole process
    """
    # Shared parser handed out by get_config()
    __config = configparser.ConfigParser()
    # File read by get_config() when nothing has been loaded yet
    __default_config_file = '../config.ini'
    # Becomes True once at least one config file was read successfully
    __loaded = False

    def __init__(self, config_file='../config.ini'):
        """
        Read the given config file into the shared parser
        :param config_file: Path of the ini file to load
        """
        # read() returns the list of files it managed to parse; a missing
        # file yields an empty list and must not mark the config as loaded
        if self.__config.read(config_file):
            ConfigParser.__loaded = True

    @staticmethod
    def get_config():
        """
        Return the shared parser, reading the default config file first
        when no file has been read successfully so far
        :return: Shared configparser.ConfigParser instance
        """
        if not ConfigParser.__loaded:
            # Without this the parser stays empty unless some caller
            # happens to instantiate ConfigParser beforehand
            ConfigParser.__loaded = bool(
                ConfigParser.__config.read(ConfigParser.__default_config_file)
            )
        return ConfigParser.__config


config = ConfigParser.get_config()
def run_in_executor(func, *args, executor=None, **kwargs):
    """
    Run the function in an executor, unblocking the main loop
    :param func: Function to be run in executor
    :param args: Positional arguments passed on to func
    :param executor: Executor instance [Thread | Process]; None is passed
                     through to the event loop's run_in_executor
    :param kwargs: Keyword arguments passed on to func
    :return: Future of function result upon completion
    """
    # loop.run_in_executor only forwards positional arguments, so bind
    # everything (including keyword arguments) up front
    callback = partial(func, *args, **kwargs)
    loop = asyncio.get_event_loop()
    return loop.run_in_executor(executor, callback)
# Generic type template
T = TypeVar("T")


class Mutex(Generic[T]):
    """
    Pairs a shared value with a lock so the value is only
    handed out while the lock is held
    """

    def __init__(self, value: T):
        """
        Wrap the given resource together with a fresh lock
        :param value: Shared resource to be thread protected
        """
        self.__lock = Lock()
        self.__value = value

    @contextlib.contextmanager
    def lock(self) -> ContextManager[T]:
        """
        Hold the lock for the duration of a ``with`` block and expose
        the protected resource inside it
        The lock is released when the block exits, including on error
        :return: Shared resource
        """
        with self.__lock:
            yield self.__value

View File

@@ -1,28 +0,0 @@
import asyncio
import contextlib
from functools import partial
from threading import Lock
from typing import ContextManager, Generic, TypeVar
def run_in_executor(func, *args, executor=None, **kwargs):
    """
    Schedule func(*args, **kwargs) on an executor via the event loop
    :param func: Function to be run in the executor
    :param args: Positional arguments passed on to func
    :param executor: Executor handed to the loop's run_in_executor
    :param kwargs: Keyword arguments passed on to func
    :return: Future that resolves to the function result
    """
    # Keyword arguments cannot be forwarded by loop.run_in_executor,
    # so they are bound into the callable beforehand
    callback = partial(func, *args, **kwargs)
    loop = asyncio.get_event_loop()
    return loop.run_in_executor(executor, callback)
# Type of the value guarded by a Mutex
T = TypeVar("T")


class Mutex(Generic[T]):
    """
    Value guarded by a lock; the value is reachable only through lock()
    """

    def __init__(self, value: T):
        """
        Store the value next to a newly created lock
        :param value: Resource to guard
        """
        self.__lock = Lock()
        self.__value = value

    @contextlib.contextmanager
    def lock(self) -> ContextManager[T]:
        """
        Context manager that holds the lock while the block runs
        :return: The guarded value
        """
        with self.__lock:
            yield self.__value

View File

@@ -6,14 +6,12 @@ from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import BartForConditionalGeneration, BartTokenizer
from run_utils import config
from log_utils import logger
nltk.download('punkt', quiet=True)
config = configparser.ConfigParser()
config.read('config.ini')
def preprocess_sentence(sentence):
@@ -74,7 +72,7 @@ def remove_whisper_repetitive_hallucination(nonduplicate_sentences):
for sent in nonduplicate_sentences:
temp_result = ""
seen = {}
seen = { }
words = nltk.word_tokenize(sent)
n_gram_filter = 3
for i in range(len(words)):

View File

@@ -1,6 +1,5 @@
import ast
import collections
import configparser
import os
import pickle
from pathlib import Path
@@ -10,10 +9,7 @@ import pandas as pd
import scattertext as st
import spacy
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
config = configparser.ConfigParser()
config.read('config.ini')
from wordcloud import STOPWORDS, WordCloud
en = spacy.load('en_core_web_md')
spacy_stopwords = en.Defaults.stop_words
@@ -92,11 +88,11 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
# create df for processing
df = pd.DataFrame.from_dict(res["chunks"])
covered_items = {}
covered_items = { }
# ts: timestamp
# Map each timestamped chunk with top1 and top2 matched agenda
ts_to_topic_mapping_top_1 = {}
ts_to_topic_mapping_top_2 = {}
ts_to_topic_mapping_top_1 = { }
ts_to_topic_mapping_top_2 = { }
# Also create a mapping of the different timestamps in which each topic was covered
topic_to_ts_mapping_top_1 = collections.defaultdict(list)
@@ -189,16 +185,16 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
# Scatter plot of topics
df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
corpus = st.CorpusFromParsedDocuments(
df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'
df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
html = st.produce_scattertext_explorer(
corpus,
category=cat_1,
category_name=cat_1_name,
not_category_name=cat_2_name,
minimum_term_frequency=0, pmi_threshold_coefficient=0,
width_in_pixels=1000,
transform=st.Scalers.dense_rank
corpus,
category=cat_1,
category_name=cat_1_name,
not_category_name=cat_2_name,
minimum_term_frequency=0, pmi_threshold_coefficient=0,
width_in_pixels=1000,
transform=st.Scalers.dense_rank
)
if real_time:
open('./artefacts/real_time_scatter_' + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)