server: reformat whole project using black

Mathieu Virbel
2023-07-27 14:08:41 +02:00
parent 314321c603
commit 094ed696c4
12 changed files with 406 additions and 237 deletions
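Note: every hunk below is mechanical output of running black over the old source. As a minimal sketch (not part of this commit), the same transformation can be reproduced through black's Python API, assuming black is installed (pip install black):

import black

# Old-style source as it appears on the "-" side of the first hunk:
# hanging indents and single quotes.
old_source = (
    "s3 = boto3.client('s3',\n"
    "                  aws_access_key_id=SECRETS['AWS-S3']['AWS_ACCESS_KEY'],\n"
    "                  aws_secret_access_key=SECRETS['AWS-S3']['AWS_SECRET_KEY'])\n"
)

# format_str applies the same rules as the `black .` command line: double
# quotes, one argument per line past 88 columns, and a magic trailing comma.
print(black.format_str(old_source, mode=black.Mode()))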

View File

@@ -14,9 +14,11 @@ from .run_utils import SECRETS
 BUCKET_NAME = SECRETS["AWS-S3"]["BUCKET_NAME"]
 
-s3 = boto3.client('s3',
-                  aws_access_key_id=SECRETS["AWS-S3"]["AWS_ACCESS_KEY"],
-                  aws_secret_access_key=SECRETS["AWS-S3"]["AWS_SECRET_KEY"])
+s3 = boto3.client(
+    "s3",
+    aws_access_key_id=SECRETS["AWS-S3"]["AWS_ACCESS_KEY"],
+    aws_secret_access_key=SECRETS["AWS-S3"]["AWS_SECRET_KEY"],
+)
 
 
 def upload_files(files_to_upload: List[str]) -> NoReturn:
@@ -44,7 +46,7 @@ def download_files(files_to_download: List[str]) -> NoReturn:
         try:
             s3.download_file(BUCKET_NAME, key, key)
         except botocore.exceptions.ClientError as exception:
-            if exception.response['Error']['Code'] == "404":
+            if exception.response["Error"]["Code"] == "404":
                 print("The object does not exist.")
             else:
                 raise

View File

@@ -4,21 +4,16 @@ Utility function to format the artefacts created during Reflector run
 import json
 
-with open("../artefacts/meeting_titles_and_summaries.txt", "r",
-          encoding='utf-8') as f:
+with open("../artefacts/meeting_titles_and_summaries.txt", "r", encoding="utf-8") as f:
     outputs = f.read()
 
 outputs = json.loads(outputs)
 
-transcript_file = open("../artefacts/meeting_transcript.txt",
-                       "a",
-                       encoding='utf-8')
-title_desc_file = open("../artefacts/meeting_title_description.txt",
-                       "a",
-                       encoding='utf-8')
-summary_file = open("../artefacts/meeting_summary.txt",
-                    "a",
-                    encoding='utf-8')
+transcript_file = open("../artefacts/meeting_transcript.txt", "a", encoding="utf-8")
+title_desc_file = open(
+    "../artefacts/meeting_title_description.txt", "a", encoding="utf-8"
+)
+summary_file = open("../artefacts/meeting_summary.txt", "a", encoding="utf-8")
 
 for item in outputs["topics"]:
     transcript_file.write(item["transcript"])

View File

@@ -10,6 +10,7 @@ class SingletonLogger:
     Use Singleton design pattern to create a logger object and share it
     across the entire project
     """
+
     __instance = None
 
     @staticmethod

View File

@@ -14,6 +14,7 @@ class ReflectorConfig:
"""
Create a single config object to share across the project
"""
__config = None
__secrets = None
@@ -25,7 +26,7 @@ class ReflectorConfig:
"""
if ReflectorConfig.__config is None:
ReflectorConfig.__config = configparser.ConfigParser()
ReflectorConfig.__config.read('utils/config.ini')
ReflectorConfig.__config.read("utils/config.ini")
return ReflectorConfig.__config
@staticmethod
@@ -36,7 +37,7 @@ class ReflectorConfig:
"""
if ReflectorConfig.__secrets is None:
ReflectorConfig.__secrets = configparser.ConfigParser()
ReflectorConfig.__secrets.read('utils/secrets.ini')
ReflectorConfig.__secrets.read("utils/secrets.ini")
return ReflectorConfig.__secrets
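For context, the values read here feed the CONFIG and SECRETS globals used in the hunks below (e.g. CONFIG["SUMMARIZER"]["BEAM_SIZE"]). A minimal sketch of that consumption, with the module layout assumed rather than shown in this commit:

import configparser

config = configparser.ConfigParser()
config.read("utils/config.ini")

# configparser returns strings, hence the int(...) casts seen in the
# summarizer hunks below.
beam_size = int(config["SUMMARIZER"]["BEAM_SIZE"])
max_length = int(config["SUMMARIZER"]["MAX_LENGTH"])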

View File

@@ -15,7 +15,7 @@ from transformers import BartForConditionalGeneration, BartTokenizer
 from log_utils import LOGGER
 from run_utils import CONFIG
 
-nltk.download('punkt', quiet=True)
+nltk.download("punkt", quiet=True)
 
 
 def preprocess_sentence(sentence: str) -> str:
@@ -24,11 +24,10 @@ def preprocess_sentence(sentence: str) -> str:
     :param sentence:
     :return:
     """
-    stop_words = set(stopwords.words('english'))
+    stop_words = set(stopwords.words("english"))
     tokens = word_tokenize(sentence.lower())
-    tokens = [token for token in tokens
-              if token.isalnum() and token not in stop_words]
-    return ' '.join(tokens)
+    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
+    return " ".join(tokens)
 
 
 def compute_similarity(sent1: str, sent2: str) -> float:
@@ -67,14 +66,14 @@ def remove_almost_alike_sentences(sentences: List[str], threshold=0.7) -> List[s
             sentence1 = preprocess_sentence(sentences[i])
             sentence2 = preprocess_sentence(sentences[j])
             if len(sentence1) != 0 and len(sentence2) != 0:
-                similarity = compute_similarity(sentence1,
-                                                sentence2)
+                similarity = compute_similarity(sentence1, sentence2)
                 if similarity >= threshold:
                     removed_indices.add(max(i, j))
 
-    filtered_sentences = [sentences[i] for i in range(num_sentences)
-                          if i not in removed_indices]
+    filtered_sentences = [
+        sentences[i] for i in range(num_sentences) if i not in removed_indices
+    ]
     return filtered_sentences
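The loop above is an O(n²) pairwise sweep that always drops the later sentence of a too-similar pair. A self-contained sketch of the same idea follows; since compute_similarity's body is outside this diff, difflib's ratio() stands in for it here:

from difflib import SequenceMatcher
from typing import List


def remove_almost_alike(sentences: List[str], threshold: float = 0.7) -> List[str]:
    removed = set()
    for i in range(len(sentences)):
        for j in range(i + 1, len(sentences)):
            # Compare each pair once; keep the earlier sentence of a close pair.
            if SequenceMatcher(None, sentences[i], sentences[j]).ratio() >= threshold:
                removed.add(j)
    return [s for i, s in enumerate(sentences) if i not in removed]


print(remove_almost_alike(["we ship on friday", "we will ship on friday", "budget review"]))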
@@ -90,7 +89,9 @@ def remove_outright_duplicate_sentences_from_chunk(chunk: str) -> List[str]:
     return nonduplicate_sentences
 
 
-def remove_whisper_repetitive_hallucination(nonduplicate_sentences: List[str]) -> List[str]:
+def remove_whisper_repetitive_hallucination(
+    nonduplicate_sentences: List[str],
+) -> List[str]:
     """
     Remove sentences that are repeated as a result of Whisper
     hallucinations
@@ -105,13 +106,16 @@ def remove_whisper_repetitive_hallucination(nonduplicate_sentences: List[str]) -
         words = nltk.word_tokenize(sent)
         n_gram_filter = 3
         for i in range(len(words)):
-            if str(words[i:i + n_gram_filter]) in seen and \
-                    seen[str(words[i:i + n_gram_filter])] == \
-                    words[i + 1:i + n_gram_filter + 2]:
+            if (
+                str(words[i : i + n_gram_filter]) in seen
+                and seen[str(words[i : i + n_gram_filter])]
+                == words[i + 1 : i + n_gram_filter + 2]
+            ):
                 pass
             else:
-                seen[str(words[i:i + n_gram_filter])] = \
-                    words[i + 1:i + n_gram_filter + 2]
+                seen[str(words[i : i + n_gram_filter])] = words[
+                    i + 1 : i + n_gram_filter + 2
+                ]
                 temp_result += words[i]
                 temp_result += " "
         chunk_sentences.append(temp_result)
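Behaviorally, the block above keeps a word only when the 3-gram starting at it has not already been seen with the same continuation, which collapses Whisper's looping hallucinations. A standalone sketch, using str.split in place of nltk.word_tokenize to stay dependency-free:

from typing import List


def drop_repeated_ngrams(sentence: str, n_gram_filter: int = 3) -> str:
    words: List[str] = sentence.split()
    seen = {}
    kept = []
    for i in range(len(words)):
        key = str(words[i : i + n_gram_filter])
        continuation = words[i + 1 : i + n_gram_filter + 2]
        if key in seen and seen[key] == continuation:
            continue  # same n-gram, same continuation: treat as a hallucinated repeat
        seen[key] = continuation
        kept.append(words[i])
    return " ".join(kept)


# Most of the looped span is removed; the heuristic is approximate near the end.
print(drop_repeated_ngrams("thanks for watching " * 4))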
@@ -126,12 +130,11 @@ def post_process_transcription(whisper_result: dict) -> dict:
"""
transcript_text = ""
for chunk in whisper_result["chunks"]:
nonduplicate_sentences = \
remove_outright_duplicate_sentences_from_chunk(chunk)
chunk_sentences = \
remove_whisper_repetitive_hallucination(nonduplicate_sentences)
similarity_matched_sentences = \
remove_almost_alike_sentences(chunk_sentences)
nonduplicate_sentences = remove_outright_duplicate_sentences_from_chunk(chunk)
chunk_sentences = remove_whisper_repetitive_hallucination(
nonduplicate_sentences
)
similarity_matched_sentences = remove_almost_alike_sentences(chunk_sentences)
chunk["text"] = " ".join(similarity_matched_sentences)
transcript_text += chunk["text"]
whisper_result["text"] = transcript_text
@@ -149,23 +152,24 @@ def summarize_chunks(chunks: List[str], tokenizer, model) -> List[str]:
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     summaries = []
     for c in chunks:
-        input_ids = tokenizer.encode(c, return_tensors='pt')
+        input_ids = tokenizer.encode(c, return_tensors="pt")
         input_ids = input_ids.to(device)
         with torch.no_grad():
-            summary_ids = \
-                model.generate(input_ids,
-                               num_beams=int(CONFIG["SUMMARIZER"]["BEAM_SIZE"]),
-                               length_penalty=2.0,
-                               max_length=int(CONFIG["SUMMARIZER"]["MAX_LENGTH"]),
-                               early_stopping=True)
-        summary = tokenizer.decode(summary_ids[0],
-                                   skip_special_tokens=True)
+            summary_ids = model.generate(
+                input_ids,
+                num_beams=int(CONFIG["SUMMARIZER"]["BEAM_SIZE"]),
+                length_penalty=2.0,
+                max_length=int(CONFIG["SUMMARIZER"]["MAX_LENGTH"]),
+                early_stopping=True,
+            )
+        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
         summaries.append(summary)
     return summaries
 
 
-def chunk_text(text: str,
-               max_chunk_length: int = int(CONFIG["SUMMARIZER"]["MAX_CHUNK_LENGTH"])) -> List[str]:
+def chunk_text(
+    text: str, max_chunk_length: int = int(CONFIG["SUMMARIZER"]["MAX_CHUNK_LENGTH"])
+) -> List[str]:
     """
     Split text into smaller chunks.
     :param text: Text to be chunked
@@ -185,9 +189,12 @@ def chunk_text(text: str,
     return chunks
 
 
-def summarize(transcript_text: str, timestamp: datetime.datetime.timestamp,
-              real_time: bool = False,
-              chunk_summarize: str = CONFIG["SUMMARIZER"]["SUMMARIZE_USING_CHUNKS"]):
+def summarize(
+    transcript_text: str,
+    timestamp: datetime.datetime.timestamp,
+    real_time: bool = False,
+    chunk_summarize: str = CONFIG["SUMMARIZER"]["SUMMARIZE_USING_CHUNKS"],
+):
     """
     Summarize the given text either as a whole or as chunks as needed
     :param transcript_text:
@@ -213,39 +220,45 @@ def summarize(transcript_text: str, timestamp: datetime.datetime.timestamp,
     if chunk_summarize != "YES":
         max_length = int(CONFIG["SUMMARIZER"]["INPUT_ENCODING_MAX_LENGTH"])
-        inputs = tokenizer. \
-            batch_encode_plus([transcript_text], truncation=True,
-                              padding='longest',
-                              max_length=max_length,
-                              return_tensors='pt')
+        inputs = tokenizer.batch_encode_plus(
+            [transcript_text],
+            truncation=True,
+            padding="longest",
+            max_length=max_length,
+            return_tensors="pt",
+        )
         inputs = inputs.to(device)
         with torch.no_grad():
             num_beans = int(CONFIG["SUMMARIZER"]["BEAM_SIZE"])
             max_length = int(CONFIG["SUMMARIZER"]["MAX_LENGTH"])
-            summaries = model.generate(inputs['input_ids'],
-                                       num_beams=num_beans,
-                                       length_penalty=2.0,
-                                       max_length=max_length,
-                                       early_stopping=True)
+            summaries = model.generate(
+                inputs["input_ids"],
+                num_beams=num_beans,
+                length_penalty=2.0,
+                max_length=max_length,
+                early_stopping=True,
+            )
 
-        decoded_summaries = \
-            [tokenizer.decode(summary,
-                              skip_special_tokens=True,
-                              clean_up_tokenization_spaces=False)
-             for summary in summaries]
+        decoded_summaries = [
+            tokenizer.decode(
+                summary, skip_special_tokens=True, clean_up_tokenization_spaces=False
+            )
+            for summary in summaries
+        ]
 
         summary = " ".join(decoded_summaries)
-        with open("./artefacts/" + output_file, 'w', encoding="utf-8") as file:
+        with open("./artefacts/" + output_file, "w", encoding="utf-8") as file:
             file.write(summary.strip() + "\n")
     else:
         LOGGER.info("Breaking transcript into smaller chunks")
         chunks = chunk_text(transcript_text)
-        LOGGER.info(f"Transcript broken into {len(chunks)} "
-                    f"chunks of at most 500 words")
+        LOGGER.info(
+            f"Transcript broken into {len(chunks)} " f"chunks of at most 500 words"
+        )
         LOGGER.info(f"Writing summary text to: {output_file}")
-        with open(output_file, 'w') as f:
+        with open(output_file, "w") as f:
             summaries = summarize_chunks(chunks, tokenizer, model)
             for summary in summaries:
                 f.write(summary.strip() + " ")
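chunk_text's loop body falls outside these hunks; per the signature above, it splits a transcript into chunks of at most MAX_CHUNK_LENGTH words (500, per the log message). A rough sketch under that assumption, not the project's actual implementation:

from typing import List


def chunk_text(text: str, max_chunk_length: int = 500) -> List[str]:
    chunks: List[str] = []
    current: List[str] = []
    for word in text.split():
        current.append(word)
        if len(current) >= max_chunk_length:
            chunks.append(" ".join(current))
            current = []
    if current:  # flush the trailing partial chunk
        chunks.append(" ".join(current))
    return chunks


# Each chunk is then summarized independently by summarize_chunks above.
print(len(chunk_text("word " * 1200)))  # -> 3 chunks (500 + 500 + 200 words)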

View File

@@ -16,23 +16,30 @@ import spacy
 from nltk.corpus import stopwords
 from wordcloud import STOPWORDS, WordCloud
 
-en = spacy.load('en_core_web_md')
+en = spacy.load("en_core_web_md")
 spacy_stopwords = en.Defaults.stop_words
-STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))). \
-    union(set(spacy_stopwords))
+STOPWORDS = (
+    set(STOPWORDS).union(set(stopwords.words("english"))).union(set(spacy_stopwords))
+)
 
 
-def create_wordcloud(timestamp: datetime.datetime.timestamp,
-                     real_time: bool = False) -> NoReturn:
+def create_wordcloud(
+    timestamp: datetime.datetime.timestamp, real_time: bool = False
+) -> NoReturn:
     """
     Create a basic word cloud visualization of transcribed text
     :return: None. The wordcloud image is saved locally
     """
     filename = "transcript"
     if real_time:
-        filename = "real_time_" + filename + "_" + \
-                   timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
+        filename = (
+            "real_time_"
+            + filename
+            + "_"
+            + timestamp.strftime("%m-%d-%Y_%H:%M:%S")
+            + ".txt"
+        )
     else:
         filename += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
@@ -41,10 +48,13 @@ def create_wordcloud(timestamp: datetime.datetime.timestamp,
     # python_mask = np.array(PIL.Image.open("download1.png"))
 
-    wordcloud = WordCloud(height=800, width=800,
-                          background_color='white',
-                          stopwords=STOPWORDS,
-                          min_font_size=8).generate(transcription_text)
+    wordcloud = WordCloud(
+        height=800,
+        width=800,
+        background_color="white",
+        stopwords=STOPWORDS,
+        min_font_size=8,
+    ).generate(transcription_text)
 
     # Plot wordcloud and save image
     plt.figure(facecolor=None)
@@ -54,16 +64,22 @@ def create_wordcloud(timestamp: datetime.datetime.timestamp,
     wordcloud = "wordcloud"
     if real_time:
-        wordcloud = "real_time_" + wordcloud + "_" + \
-                    timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
+        wordcloud = (
+            "real_time_"
+            + wordcloud
+            + "_"
+            + timestamp.strftime("%m-%d-%Y_%H:%M:%S")
+            + ".png"
+        )
     else:
         wordcloud += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
     plt.savefig("./artefacts/" + wordcloud)
 
 
-def create_talk_diff_scatter_viz(timestamp: datetime.datetime.timestamp,
-                                 real_time: bool = False) -> NoReturn:
+def create_talk_diff_scatter_viz(
+    timestamp: datetime.datetime.timestamp, real_time: bool = False
+) -> NoReturn:
     """
     Perform agenda vs transcription diff to see covered topics.
     Create a scatter plot of words in topics.
@@ -71,7 +87,7 @@ def create_talk_diff_scatter_viz(timestamp: datetime.datetime.timestamp,
"""
spacy_model = "en_core_web_md"
nlp = spacy.load(spacy_model)
nlp.add_pipe('sentencizer')
nlp.add_pipe("sentencizer")
agenda_topics = []
agenda = []
@@ -84,11 +100,17 @@ def create_talk_diff_scatter_viz(timestamp: datetime.datetime.timestamp,
     # Load the transcription with timestamp
     if real_time:
-        filename = "./artefacts/real_time_transcript_with_timestamp_" + \
-                   timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
+        filename = (
+            "./artefacts/real_time_transcript_with_timestamp_"
+            + timestamp.strftime("%m-%d-%Y_%H:%M:%S")
+            + ".txt"
+        )
     else:
-        filename = "./artefacts/transcript_with_timestamp_" + \
-                   timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
+        filename = (
+            "./artefacts/transcript_with_timestamp_"
+            + timestamp.strftime("%m-%d-%Y_%H:%M:%S")
+            + ".txt"
+        )
 
     with open(filename) as file:
         transcription_timestamp_text = file.read()
@@ -128,14 +150,20 @@ def create_talk_diff_scatter_viz(timestamp: datetime.datetime.timestamp,
             covered_items[agenda[topic_similarities[i][0]]] = True
             # top1 match
             if i == 0:
-                ts_to_topic_mapping_top_1[c["timestamp"]] = \
-                    agenda_topics[topic_similarities[i][0]]
-                topic_to_ts_mapping_top_1[agenda_topics[topic_similarities[i][0]]].append(c["timestamp"])
+                ts_to_topic_mapping_top_1[c["timestamp"]] = agenda_topics[
+                    topic_similarities[i][0]
+                ]
+                topic_to_ts_mapping_top_1[
+                    agenda_topics[topic_similarities[i][0]]
+                ].append(c["timestamp"])
             # top2 match
             else:
-                ts_to_topic_mapping_top_2[c["timestamp"]] = \
-                    agenda_topics[topic_similarities[i][0]]
-                topic_to_ts_mapping_top_2[agenda_topics[topic_similarities[i][0]]].append(c["timestamp"])
+                ts_to_topic_mapping_top_2[c["timestamp"]] = agenda_topics[
+                    topic_similarities[i][0]
+                ]
+                topic_to_ts_mapping_top_2[
+                    agenda_topics[topic_similarities[i][0]]
+                ].append(c["timestamp"])
 
     def create_new_columns(record: dict) -> dict:
         """
@@ -143,10 +171,12 @@ def create_talk_diff_scatter_viz(timestamp: datetime.datetime.timestamp,
         :param record:
         :return:
         """
-        record["ts_to_topic_mapping_top_1"] = \
-            ts_to_topic_mapping_top_1[record["timestamp"]]
-        record["ts_to_topic_mapping_top_2"] = \
-            ts_to_topic_mapping_top_2[record["timestamp"]]
+        record["ts_to_topic_mapping_top_1"] = ts_to_topic_mapping_top_1[
+            record["timestamp"]
+        ]
+        record["ts_to_topic_mapping_top_2"] = ts_to_topic_mapping_top_2[
+            record["timestamp"]
+        ]
         return record
 
     df = df.apply(create_new_columns, axis=1)
@@ -167,19 +197,33 @@ def create_talk_diff_scatter_viz(timestamp: datetime.datetime.timestamp,
     # Save df, mappings for further experimentation
     df_name = "df"
     if real_time:
-        df_name = "real_time_" + df_name + "_" + \
-                  timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
+        df_name = (
+            "real_time_"
+            + df_name
+            + "_"
+            + timestamp.strftime("%m-%d-%Y_%H:%M:%S")
+            + ".pkl"
+        )
     else:
         df_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
     df.to_pickle("./artefacts/" + df_name)
 
-    my_mappings = [ts_to_topic_mapping_top_1, ts_to_topic_mapping_top_2,
-                   topic_to_ts_mapping_top_1, topic_to_ts_mapping_top_2]
+    my_mappings = [
+        ts_to_topic_mapping_top_1,
+        ts_to_topic_mapping_top_2,
+        topic_to_ts_mapping_top_1,
+        topic_to_ts_mapping_top_2,
+    ]
     mappings_name = "mappings"
     if real_time:
-        mappings_name = "real_time_" + mappings_name + "_" + \
-                        timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
+        mappings_name = (
+            "real_time_"
+            + mappings_name
+            + "_"
+            + timestamp.strftime("%m-%d-%Y_%H:%M:%S")
+            + ".pkl"
+        )
     else:
         mappings_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
     pickle.dump(my_mappings, open("./artefacts/" + mappings_name, "wb"))
@@ -203,23 +247,37 @@ def create_talk_diff_scatter_viz(timestamp: datetime.datetime.timestamp,
     # Scatter plot of topics
     df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
-    corpus = st.CorpusFromParsedDocuments(
-        df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'
-    ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
+    corpus = (
+        st.CorpusFromParsedDocuments(
+            df, category_col="ts_to_topic_mapping_top_1", parsed_col="parse"
+        )
+        .build()
+        .get_unigram_corpus()
+        .compact(st.AssociationCompactor(2000))
+    )
     html = st.produce_scattertext_explorer(
-        corpus,
-        category=cat_1,
-        category_name=cat_1_name,
-        not_category_name=cat_2_name,
-        minimum_term_frequency=0, pmi_threshold_coefficient=0,
-        width_in_pixels=1000,
-        transform=st.Scalers.dense_rank
+        corpus,
+        category=cat_1,
+        category_name=cat_1_name,
+        not_category_name=cat_2_name,
+        minimum_term_frequency=0,
+        pmi_threshold_coefficient=0,
+        width_in_pixels=1000,
+        transform=st.Scalers.dense_rank,
     )
     if real_time:
-        with open('./artefacts/real_time_scatter_' +
-                  timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w') as file:
+        with open(
+            "./artefacts/real_time_scatter_"
+            + timestamp.strftime("%m-%d-%Y_%H:%M:%S")
+            + ".html",
+            "w",
+        ) as file:
             file.write(html)
     else:
-        with open('./artefacts/scatter_' +
-                  timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w') as file:
+        with open(
+            "./artefacts/scatter_"
+            + timestamp.strftime("%m-%d-%Y_%H:%M:%S")
+            + ".html",
+            "w",
+        ) as file:
             file.write(html)