Issues 44, 46, 47

2026-02-04 18:06:48 +00:00 · 2023-07-27 11:54:24 +05:30
parent 499edd665b
commit 60ea3ac137
6 changed files with 141 additions and 40 deletions
--- a/server/utils/file_utils.py
+++ b/server/utils/file_utils.py
@@ -4,6 +4,7 @@ uploads to cloud storage
 """

 import sys
+from typing import List, NoReturn

 import boto3
 import botocore
@@ -18,7 +19,7 @@ s3 = boto3.client('s3',
                  aws_secret_access_key=CONFIG["AWS"]["AWS_SECRET_KEY"])


-def upload_files(files_to_upload):
+def upload_files(files_to_upload: List[str]) -> NoReturn:
    """
    Upload a list of files to the configured S3 bucket
    :param files_to_upload: List of files to upload
@@ -32,7 +33,7 @@ def upload_files(files_to_upload):
            print(exception.response)


-def download_files(files_to_download):
+def download_files(files_to_download: List[str]) -> NoReturn:
    """
    Download a list of files from the configured S3 bucket
    :param files_to_download: List of files to download
--- a/server/utils/run_utils.py
+++ b/server/utils/run_utils.py
@@ -18,6 +18,10 @@ class ReflectorConfig:

    @staticmethod
    def get_config():
+        """
+        Load the configurations from the local config.ini file
+        :return:
+        """
        if ReflectorConfig.__config is None:
            ReflectorConfig.__config = configparser.ConfigParser()
            ReflectorConfig.__config.read('utils/config.ini')
--- a/server/utils/text_utils.py
+++ b/server/utils/text_utils.py
@@ -1,6 +1,8 @@
 """
 Utility file for all text processing related functionalities
 """
+import datetime
+from typing import List

 import nltk
 import torch
@@ -16,7 +18,12 @@ from run_utils import CONFIG
 nltk.download('punkt', quiet=True)


-def preprocess_sentence(sentence):
+def preprocess_sentence(sentence: str) -> str:
+    """
+    Filter out undesirable tokens from the sentence
+    :param sentence:
+    :return:
+    """
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(sentence.lower())
    tokens = [token for token in tokens
@@ -24,7 +31,7 @@ def preprocess_sentence(sentence):
    return ' '.join(tokens)


-def compute_similarity(sent1, sent2):
+def compute_similarity(sent1: str, sent2: str) -> float:
    """
    Compute the similarity
    """
@@ -35,7 +42,7 @@ def compute_similarity(sent1, sent2):
    return 0.0


-def remove_almost_alike_sentences(sentences, threshold=0.7):
+def remove_almost_alike_sentences(sentences: List[str], threshold=0.7) -> List[str]:
    """
    Filter sentences that are similar beyond a set threshold
    :param sentences:
@@ -71,7 +78,7 @@ def remove_almost_alike_sentences(sentences, threshold=0.7):
    return filtered_sentences


-def remove_outright_duplicate_sentences_from_chunk(chunk):
+def remove_outright_duplicate_sentences_from_chunk(chunk: str) -> List[str]:
    """
    Remove repetitive sentences
    :param chunk:
@@ -83,7 +90,7 @@ def remove_outright_duplicate_sentences_from_chunk(chunk):
    return nonduplicate_sentences


-def remove_whisper_repetitive_hallucination(nonduplicate_sentences):
+def remove_whisper_repetitive_hallucination(nonduplicate_sentences: List[str]) -> List[str]:
    """
    Remove sentences that are repeated as a result of Whisper
    hallucinations
@@ -111,7 +118,7 @@ def remove_whisper_repetitive_hallucination(nonduplicate_sentences):
    return chunk_sentences


-def post_process_transcription(whisper_result):
+def post_process_transcription(whisper_result: dict) -> dict:
    """
    Parent function to perform post-processing on the transcription result
    :param whisper_result:
@@ -131,7 +138,7 @@ def post_process_transcription(whisper_result):
    return whisper_result


-def summarize_chunks(chunks, tokenizer, model):
+def summarize_chunks(chunks: List[str], tokenizer, model) -> List[str]:
    """
    Summarize each chunk using a summarizer model
    :param chunks:
@@ -157,8 +164,8 @@ def summarize_chunks(chunks, tokenizer, model):
    return summaries


-def chunk_text(text,
-               max_chunk_length=int(CONFIG["SUMMARIZER"]["MAX_CHUNK_LENGTH"])):
+def chunk_text(text: str,
+               max_chunk_length: int = int(CONFIG["SUMMARIZER"]["MAX_CHUNK_LENGTH"])) -> List[str]:
    """
    Split text into smaller chunks.
    :param text: Text to be chunked
@@ -178,9 +185,9 @@ def chunk_text(text,
    return chunks


-def summarize(transcript_text, timestamp,
-              real_time=False,
-              chunk_summarize=CONFIG["SUMMARIZER"]["SUMMARIZE_USING_CHUNKS"]):
+def summarize(transcript_text: str, timestamp: datetime.datetime.timestamp,
+              real_time: bool = False,
+              chunk_summarize: str = CONFIG["SUMMARIZER"]["SUMMARIZE_USING_CHUNKS"]):
    """
    Summarize the given text either as a whole or as chunks as needed
    :param transcript_text:
--- a/server/utils/viz_utils.py
+++ b/server/utils/viz_utils.py
@@ -4,8 +4,10 @@ Utility file for all visualization related functions

 import ast
 import collections
+import datetime
 import os
 import pickle
+from typing import NoReturn

 import matplotlib.pyplot as plt
 import pandas as pd
@@ -21,7 +23,8 @@ STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))). \
    union(set(spacy_stopwords))


-def create_wordcloud(timestamp, real_time=False):
+def create_wordcloud(timestamp: datetime.datetime.timestamp,
+                     real_time: bool = False) -> NoReturn:
    """
    Create a basic word cloud visualization of transcribed text
    :return: None. The wordcloud image is saved locally
@@ -52,14 +55,15 @@ def create_wordcloud(timestamp, real_time=False):
    wordcloud = "wordcloud"
    if real_time:
        wordcloud = "real_time_" + wordcloud + "_" + \
-                         timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
+                    timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
    else:
        wordcloud += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"

    plt.savefig("./artefacts/" + wordcloud)


-def create_talk_diff_scatter_viz(timestamp, real_time=False):
+def create_talk_diff_scatter_viz(timestamp: datetime.datetime.timestamp,
+                                 real_time: bool = False) -> NoReturn:
    """
    Perform agenda vs transcription diff to see covered topics.
    Create a scatter plot of words in topics.
@@ -124,14 +128,16 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
                covered_items[agenda[topic_similarities[i][0]]] = True
            # top1 match
            if i == 0:
-                ts_to_topic_mapping_top_1[c["timestamp"]] = agenda_topics[topic_similarities[i][0]]
+                ts_to_topic_mapping_top_1[c["timestamp"]] = \
+                    agenda_topics[topic_similarities[i][0]]
                topic_to_ts_mapping_top_1[agenda_topics[topic_similarities[i][0]]].append(c["timestamp"])
            # top2 match
            else:
-                ts_to_topic_mapping_top_2[c["timestamp"]] = agenda_topics[topic_similarities[i][0]]
+                ts_to_topic_mapping_top_2[c["timestamp"]] = \
+                    agenda_topics[topic_similarities[i][0]]
                topic_to_ts_mapping_top_2[agenda_topics[topic_similarities[i][0]]].append(c["timestamp"])

-    def create_new_columns(record):
+    def create_new_columns(record: dict) -> dict:
        """
        Accumulate the mapping information into the df
        :param record:
@@ -210,8 +216,10 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
                transform=st.Scalers.dense_rank
        )
        if real_time:
-            open('./artefacts/real_time_scatter_' +
-                 timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
+            with open('./artefacts/real_time_scatter_' +
+                      timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w') as file:
+                file.write(html)
        else:
-            open('./artefacts/scatter_' +
-                 timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
+            with open('./artefacts/scatter_' +
+                      timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w') as file:
+                file.write(html)