mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-20 20:29:06 +00:00
Issues 44, 46, 47
This commit is contained in:
@@ -4,6 +4,7 @@ uploads to cloud storage
|
||||
"""
|
||||
|
||||
import sys
|
||||
from typing import List, NoReturn
|
||||
|
||||
import boto3
|
||||
import botocore
|
||||
@@ -18,7 +19,7 @@ s3 = boto3.client('s3',
|
||||
aws_secret_access_key=CONFIG["AWS"]["AWS_SECRET_KEY"])
|
||||
|
||||
|
||||
def upload_files(files_to_upload):
|
||||
def upload_files(files_to_upload: List[str]) -> NoReturn:
|
||||
"""
|
||||
Upload a list of files to the configured S3 bucket
|
||||
:param files_to_upload: List of files to upload
|
||||
@@ -32,7 +33,7 @@ def upload_files(files_to_upload):
|
||||
print(exception.response)
|
||||
|
||||
|
||||
def download_files(files_to_download):
|
||||
def download_files(files_to_download: List[str]) -> NoReturn:
|
||||
"""
|
||||
Download a list of files from the configured S3 bucket
|
||||
:param files_to_download: List of files to download
|
||||
|
||||
@@ -18,6 +18,10 @@ class ReflectorConfig:
|
||||
|
||||
@staticmethod
|
||||
def get_config():
|
||||
"""
|
||||
Load the configurations from the local config.ini file
|
||||
:return:
|
||||
"""
|
||||
if ReflectorConfig.__config is None:
|
||||
ReflectorConfig.__config = configparser.ConfigParser()
|
||||
ReflectorConfig.__config.read('utils/config.ini')
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
"""
|
||||
Utility file for all text processing related functionalities
|
||||
"""
|
||||
import datetime
|
||||
from typing import List
|
||||
|
||||
import nltk
|
||||
import torch
|
||||
@@ -16,7 +18,12 @@ from run_utils import CONFIG
|
||||
nltk.download('punkt', quiet=True)
|
||||
|
||||
|
||||
def preprocess_sentence(sentence):
|
||||
def preprocess_sentence(sentence: str) -> str:
|
||||
"""
|
||||
Filter out undesirable tokens from thr sentence
|
||||
:param sentence:
|
||||
:return:
|
||||
"""
|
||||
stop_words = set(stopwords.words('english'))
|
||||
tokens = word_tokenize(sentence.lower())
|
||||
tokens = [token for token in tokens
|
||||
@@ -24,7 +31,7 @@ def preprocess_sentence(sentence):
|
||||
return ' '.join(tokens)
|
||||
|
||||
|
||||
def compute_similarity(sent1, sent2):
|
||||
def compute_similarity(sent1: str, sent2: str) -> float:
|
||||
"""
|
||||
Compute the similarity
|
||||
"""
|
||||
@@ -35,7 +42,7 @@ def compute_similarity(sent1, sent2):
|
||||
return 0.0
|
||||
|
||||
|
||||
def remove_almost_alike_sentences(sentences, threshold=0.7):
|
||||
def remove_almost_alike_sentences(sentences: List[str], threshold=0.7) -> List[str]:
|
||||
"""
|
||||
Filter sentences that are similar beyond a set threshold
|
||||
:param sentences:
|
||||
@@ -71,7 +78,7 @@ def remove_almost_alike_sentences(sentences, threshold=0.7):
|
||||
return filtered_sentences
|
||||
|
||||
|
||||
def remove_outright_duplicate_sentences_from_chunk(chunk):
|
||||
def remove_outright_duplicate_sentences_from_chunk(chunk: str) -> List[str]:
|
||||
"""
|
||||
Remove repetitive sentences
|
||||
:param chunk:
|
||||
@@ -83,7 +90,7 @@ def remove_outright_duplicate_sentences_from_chunk(chunk):
|
||||
return nonduplicate_sentences
|
||||
|
||||
|
||||
def remove_whisper_repetitive_hallucination(nonduplicate_sentences):
|
||||
def remove_whisper_repetitive_hallucination(nonduplicate_sentences: List[str]) -> List[str]:
|
||||
"""
|
||||
Remove sentences that are repeated as a result of Whisper
|
||||
hallucinations
|
||||
@@ -111,7 +118,7 @@ def remove_whisper_repetitive_hallucination(nonduplicate_sentences):
|
||||
return chunk_sentences
|
||||
|
||||
|
||||
def post_process_transcription(whisper_result):
|
||||
def post_process_transcription(whisper_result: dict) -> dict:
|
||||
"""
|
||||
Parent function to perform post-processing on the transcription result
|
||||
:param whisper_result:
|
||||
@@ -131,7 +138,7 @@ def post_process_transcription(whisper_result):
|
||||
return whisper_result
|
||||
|
||||
|
||||
def summarize_chunks(chunks, tokenizer, model):
|
||||
def summarize_chunks(chunks: List[str], tokenizer, model) -> List[str]:
|
||||
"""
|
||||
Summarize each chunk using a summarizer model
|
||||
:param chunks:
|
||||
@@ -157,8 +164,8 @@ def summarize_chunks(chunks, tokenizer, model):
|
||||
return summaries
|
||||
|
||||
|
||||
def chunk_text(text,
|
||||
max_chunk_length=int(CONFIG["SUMMARIZER"]["MAX_CHUNK_LENGTH"])):
|
||||
def chunk_text(text: str,
|
||||
max_chunk_length: int = int(CONFIG["SUMMARIZER"]["MAX_CHUNK_LENGTH"])) -> List[str]:
|
||||
"""
|
||||
Split text into smaller chunks.
|
||||
:param text: Text to be chunked
|
||||
@@ -178,9 +185,9 @@ def chunk_text(text,
|
||||
return chunks
|
||||
|
||||
|
||||
def summarize(transcript_text, timestamp,
|
||||
real_time=False,
|
||||
chunk_summarize=CONFIG["SUMMARIZER"]["SUMMARIZE_USING_CHUNKS"]):
|
||||
def summarize(transcript_text: str, timestamp: datetime.datetime.timestamp,
|
||||
real_time: bool = False,
|
||||
chunk_summarize: str = CONFIG["SUMMARIZER"]["SUMMARIZE_USING_CHUNKS"]):
|
||||
"""
|
||||
Summarize the given text either as a whole or as chunks as needed
|
||||
:param transcript_text:
|
||||
|
||||
@@ -4,8 +4,10 @@ Utility file for all visualization related functions
|
||||
|
||||
import ast
|
||||
import collections
|
||||
import datetime
|
||||
import os
|
||||
import pickle
|
||||
from typing import NoReturn
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
@@ -21,7 +23,8 @@ STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))). \
|
||||
union(set(spacy_stopwords))
|
||||
|
||||
|
||||
def create_wordcloud(timestamp, real_time=False):
|
||||
def create_wordcloud(timestamp: datetime.datetime.timestamp,
|
||||
real_time: bool = False) -> NoReturn:
|
||||
"""
|
||||
Create a basic word cloud visualization of transcribed text
|
||||
:return: None. The wordcloud image is saved locally
|
||||
@@ -52,14 +55,15 @@ def create_wordcloud(timestamp, real_time=False):
|
||||
wordcloud = "wordcloud"
|
||||
if real_time:
|
||||
wordcloud = "real_time_" + wordcloud + "_" + \
|
||||
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
|
||||
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
|
||||
else:
|
||||
wordcloud += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
|
||||
|
||||
plt.savefig("./artefacts/" + wordcloud)
|
||||
|
||||
|
||||
def create_talk_diff_scatter_viz(timestamp, real_time=False):
|
||||
def create_talk_diff_scatter_viz(timestamp: datetime.datetime.timestamp,
|
||||
real_time: bool = False) -> NoReturn:
|
||||
"""
|
||||
Perform agenda vs transcription diff to see covered topics.
|
||||
Create a scatter plot of words in topics.
|
||||
@@ -124,14 +128,16 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
|
||||
covered_items[agenda[topic_similarities[i][0]]] = True
|
||||
# top1 match
|
||||
if i == 0:
|
||||
ts_to_topic_mapping_top_1[c["timestamp"]] = agenda_topics[topic_similarities[i][0]]
|
||||
ts_to_topic_mapping_top_1[c["timestamp"]] = \
|
||||
agenda_topics[topic_similarities[i][0]]
|
||||
topic_to_ts_mapping_top_1[agenda_topics[topic_similarities[i][0]]].append(c["timestamp"])
|
||||
# top2 match
|
||||
else:
|
||||
ts_to_topic_mapping_top_2[c["timestamp"]] = agenda_topics[topic_similarities[i][0]]
|
||||
ts_to_topic_mapping_top_2[c["timestamp"]] = \
|
||||
agenda_topics[topic_similarities[i][0]]
|
||||
topic_to_ts_mapping_top_2[agenda_topics[topic_similarities[i][0]]].append(c["timestamp"])
|
||||
|
||||
def create_new_columns(record):
|
||||
def create_new_columns(record: dict) -> dict:
|
||||
"""
|
||||
Accumulate the mapping information into the df
|
||||
:param record:
|
||||
@@ -210,8 +216,10 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
|
||||
transform=st.Scalers.dense_rank
|
||||
)
|
||||
if real_time:
|
||||
open('./artefacts/real_time_scatter_' +
|
||||
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
|
||||
with open('./artefacts/real_time_scatter_' +
|
||||
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w') as file:
|
||||
file.write(html)
|
||||
else:
|
||||
open('./artefacts/scatter_' +
|
||||
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
|
||||
with open('./artefacts/scatter_' +
|
||||
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w') as file:
|
||||
file.write(html)
|
||||
|
||||
Reference in New Issue
Block a user