fix: remove unused settings and utils files (#522)

* fix: remove unused settings and utils files * fix: remove migration done * fix: remove outdated scripts * fix: removing deployment of hermes, not used anymore * fix: partially remove secret, still have to understand frontend.
2026-04-18 03:06:57 +00:00 · 2025-07-31 17:45:48 -06:00
parent 4ee19ed015
commit ad56165b54
17 changed files with 8 additions and 1046 deletions
--- a/server/reflector/utils/file_utils.py
+++ b/server/reflector/utils/file_utils.py
@@ -1,59 +0,0 @@
-"""
-Utility file for file handling related functions, including file downloads and
-uploads to cloud storage
-"""
-
-import sys
-from typing import List, NoReturn
-
-import boto3
-import botocore
-
-from .log_utils import LOGGER
-from .run_utils import SECRETS
-
-BUCKET_NAME = SECRETS["AWS-S3"]["BUCKET_NAME"]
-
-s3 = boto3.client(
-    "s3",
-    aws_access_key_id=SECRETS["AWS-S3"]["AWS_ACCESS_KEY"],
-    aws_secret_access_key=SECRETS["AWS-S3"]["AWS_SECRET_KEY"],
-)
-
-
-def upload_files(files_to_upload: List[str]) -> NoReturn:
-    """
-    Upload a list of files to the configured S3 bucket
-    :param files_to_upload: List of files to upload
-    :return: None
-    """
-    for key in files_to_upload:
-        LOGGER.info("Uploading file " + key)
-        try:
-            s3.upload_file(key, BUCKET_NAME, key)
-        except botocore.exceptions.ClientError as exception:
-            print(exception.response)
-
-
-def download_files(files_to_download: List[str]) -> NoReturn:
-    """
-    Download a list of files from the configured S3 bucket
-    :param files_to_download: List of files to download
-    :return: None
-    """
-    for key in files_to_download:
-        LOGGER.info("Downloading file " + key)
-        try:
-            s3.download_file(BUCKET_NAME, key, key)
-        except botocore.exceptions.ClientError as exception:
-            if exception.response["Error"]["Code"] == "404":
-                print("The object does not exist.")
-            else:
-                raise
-
-
-if __name__ == "__main__":
-    if sys.argv[1] == "download":
-        download_files([sys.argv[2]])
-    elif sys.argv[1] == "upload":
-        upload_files([sys.argv[2]])
--- a/server/reflector/utils/format_output.py
+++ b/server/reflector/utils/format_output.py
@@ -1,38 +0,0 @@
-"""
-Utility function to format the artefacts created during Reflector run
-"""
-
-import json
-
-with open("../artefacts/meeting_titles_and_summaries.txt", "r", encoding="utf-8") as f:
-    outputs = f.read()
-
-outputs = json.loads(outputs)
-
-transcript_file = open("../artefacts/meeting_transcript.txt", "a", encoding="utf-8")
-title_desc_file = open(
-    "../artefacts/meeting_title_description.txt", "a", encoding="utf-8"
-)
-summary_file = open("../artefacts/meeting_summary.txt", "a", encoding="utf-8")
-
-for item in outputs["topics"]:
-    transcript_file.write(item["transcript"])
-    summary_file.write(item["description"])
-
-    title_desc_file.write("TITLE: \n")
-    title_desc_file.write(item["title"])
-    title_desc_file.write("\n")
-
-    title_desc_file.write("DESCRIPTION: \n")
-    title_desc_file.write(item["description"])
-    title_desc_file.write("\n")
-
-    title_desc_file.write("TRANSCRIPT: \n")
-    title_desc_file.write(item["transcript"])
-    title_desc_file.write("\n")
-
-    title_desc_file.write("---------------------------------------- \n\n")
-
-transcript_file.close()
-title_desc_file.close()
-summary_file.close()
--- a/server/reflector/utils/run_utils.py
+++ b/server/reflector/utils/run_utils.py
@@ -1,55 +0,0 @@
-"""
-Utility file for server side asynchronous task running and config objects
-"""
-
-import asyncio
-import contextlib
-from functools import partial
-from threading import Lock
-from typing import ContextManager, Generic, TypeVar
-
-
-def run_in_executor(func, *args, executor=None, **kwargs):
-    """
-    Run the function in an executor, unblocking the main loop
-    :param func: Function to be run in executor
-    :param args: function parameters
-    :param executor: executor instance [Thread | Process]
-    :param kwargs: Additional parameters
-    :return: Future of function result upon completion
-    """
-    callback = partial(func, *args, **kwargs)
-    loop = asyncio.get_event_loop()
-    return loop.run_in_executor(executor, callback)
-
-
-# Genetic type template
-T = TypeVar("T")
-
-
-class Mutex(Generic[T]):
-    """
-    Mutex class to implement lock/release of a shared
-    protected variable
-    """
-
-    def __init__(self, value: T):
-        """
-        Create an instance of Mutex wrapper for the given resource
-        :param value: Shared resources to be thread protected
-        """
-        self.__value = value
-        self.__lock = Lock()
-
-    @contextlib.contextmanager
-    def lock(self) -> ContextManager[T]:
-        """
-        Lock the resource with a mutex to be used within a context block
-        The lock is automatically released on context exit
-        :return: Shared resource
-        """
-        self.__lock.acquire()
-        try:
-            yield self.__value
-        finally:
-            self.__lock.release()
--- a/server/reflector/utils/text_utils.py
+++ b/server/reflector/utils/text_utils.py
@@ -1,262 +0,0 @@
-"""
-Utility file for all text processing related functionalities
-"""
-
-import datetime
-from typing import List
-
-import nltk
-import torch
-from log_utils import LOGGER
-from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize
-from run_utils import CONFIG
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
-from transformers import BartForConditionalGeneration, BartTokenizer
-
-nltk.download("punkt", quiet=True)
-
-
-def preprocess_sentence(sentence: str) -> str:
-    """
-    Filter out undesirable tokens from thr sentence
-    :param sentence:
-    :return:
-    """
-    stop_words = set(stopwords.words("english"))
-    tokens = word_tokenize(sentence.lower())
-    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
-    return " ".join(tokens)
-
-
-def compute_similarity(sent1: str, sent2: str) -> float:
-    """
-    Compute the similarity
-    """
-    tfidf_vectorizer = TfidfVectorizer()
-    if sent1 is not None and sent2 is not None:
-        tfidf_matrix = tfidf_vectorizer.fit_transform([sent1, sent2])
-        return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
-    return 0.0
-
-
-def remove_almost_alike_sentences(sentences: List[str], threshold=0.7) -> List[str]:
-    """
-    Filter sentences that are similar beyond a set threshold
-    :param sentences:
-    :param threshold:
-    :return:
-    """
-    num_sentences = len(sentences)
-    removed_indices = set()
-
-    for i in range(num_sentences):
-        if i not in removed_indices:
-            for j in range(i + 1, num_sentences):
-                if j not in removed_indices:
-                    l_i = len(sentences[i])
-                    l_j = len(sentences[j])
-                    if l_i == 0 or l_j == 0:
-                        if l_i == 0:
-                            removed_indices.add(i)
-                        if l_j == 0:
-                            removed_indices.add(j)
-                    else:
-                        sentence1 = preprocess_sentence(sentences[i])
-                        sentence2 = preprocess_sentence(sentences[j])
-                        if len(sentence1) != 0 and len(sentence2) != 0:
-                            similarity = compute_similarity(sentence1, sentence2)
-
-                            if similarity >= threshold:
-                                removed_indices.add(max(i, j))
-
-    filtered_sentences = [
-        sentences[i] for i in range(num_sentences) if i not in removed_indices
-    ]
-    return filtered_sentences
-
-
-def remove_outright_duplicate_sentences_from_chunk(chunk: str) -> List[str]:
-    """
-    Remove repetitive sentences
-    :param chunk:
-    :return:
-    """
-    chunk_text = chunk["text"]
-    sentences = nltk.sent_tokenize(chunk_text)
-    nonduplicate_sentences = list(dict.fromkeys(sentences))
-    return nonduplicate_sentences
-
-
-def remove_whisper_repetitive_hallucination(
-    nonduplicate_sentences: List[str],
-) -> List[str]:
-    """
-    Remove sentences that are repeated as a result of Whisper
-    hallucinations
-    :param nonduplicate_sentences:
-    :return:
-    """
-    chunk_sentences = []
-
-    for sent in nonduplicate_sentences:
-        temp_result = ""
-        seen = {}
-        words = nltk.word_tokenize(sent)
-        n_gram_filter = 3
-        for i in range(len(words)):
-            if (
-                str(words[i : i + n_gram_filter]) in seen
-                and seen[str(words[i : i + n_gram_filter])]
-                == words[i + 1 : i + n_gram_filter + 2]
-            ):
-                pass
-            else:
-                seen[str(words[i : i + n_gram_filter])] = words[
-                    i + 1 : i + n_gram_filter + 2
-                ]
-                temp_result += words[i]
-                temp_result += " "
-        chunk_sentences.append(temp_result)
-    return chunk_sentences
-
-
-def post_process_transcription(whisper_result: dict) -> dict:
-    """
-    Parent function to perform post-processing on the transcription result
-    :param whisper_result:
-    :return:
-    """
-    transcript_text = ""
-    for chunk in whisper_result["chunks"]:
-        nonduplicate_sentences = remove_outright_duplicate_sentences_from_chunk(chunk)
-        chunk_sentences = remove_whisper_repetitive_hallucination(
-            nonduplicate_sentences
-        )
-        similarity_matched_sentences = remove_almost_alike_sentences(chunk_sentences)
-        chunk["text"] = " ".join(similarity_matched_sentences)
-        transcript_text += chunk["text"]
-    whisper_result["text"] = transcript_text
-    return whisper_result
-
-
-def summarize_chunks(chunks: List[str], tokenizer, model) -> List[str]:
-    """
-    Summarize each chunk using a summarizer model
-    :param chunks:
-    :param tokenizer:
-    :param model:
-    :return:
-    """
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    summaries = []
-    for c in chunks:
-        input_ids = tokenizer.encode(c, return_tensors="pt")
-        input_ids = input_ids.to(device)
-        with torch.no_grad():
-            summary_ids = model.generate(
-                input_ids,
-                num_beams=int(CONFIG["SUMMARIZER"]["BEAM_SIZE"]),
-                length_penalty=2.0,
-                max_length=int(CONFIG["SUMMARIZER"]["MAX_LENGTH"]),
-                early_stopping=True,
-            )
-            summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-            summaries.append(summary)
-    return summaries
-
-
-def chunk_text(
-    text: str, max_chunk_length: int = int(CONFIG["SUMMARIZER"]["MAX_CHUNK_LENGTH"])
-) -> List[str]:
-    """
-    Split text into smaller chunks.
-    :param text: Text to be chunked
-    :param max_chunk_length: length of chunk
-    :return: chunked texts
-    """
-    sentences = nltk.sent_tokenize(text)
-    chunks = []
-    current_chunk = ""
-    for sentence in sentences:
-        if len(current_chunk) + len(sentence) < max_chunk_length:
-            current_chunk += f" {sentence.strip()}"
-        else:
-            chunks.append(current_chunk.strip())
-            current_chunk = f"{sentence.strip()}"
-    chunks.append(current_chunk.strip())
-    return chunks
-
-
-def summarize(
-    transcript_text: str,
-    timestamp: datetime.datetime.timestamp,
-    real_time: bool = False,
-    chunk_summarize: str = CONFIG["SUMMARIZER"]["SUMMARIZE_USING_CHUNKS"],
-):
-    """
-    Summarize the given text either as a whole or as chunks as needed
-    :param transcript_text:
-    :param timestamp:
-    :param real_time:
-    :param chunk_summarize:
-    :return:
-    """
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    summary_model = CONFIG["SUMMARIZER"]["SUMMARY_MODEL"]
-    if not summary_model:
-        summary_model = "facebook/bart-large-cnn"
-
-    # Summarize the generated transcript using the BART model
-    LOGGER.info(f"Loading BART model: {summary_model}")
-    tokenizer = BartTokenizer.from_pretrained(summary_model)
-    model = BartForConditionalGeneration.from_pretrained(summary_model)
-    model = model.to(device)
-
-    output_file = "summary_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
-    if real_time:
-        output_file = "real_time_" + output_file
-
-    if chunk_summarize != "YES":
-        max_length = int(CONFIG["SUMMARIZER"]["INPUT_ENCODING_MAX_LENGTH"])
-        inputs = tokenizer.batch_encode_plus(
-            [transcript_text],
-            truncation=True,
-            padding="longest",
-            max_length=max_length,
-            return_tensors="pt",
-        )
-        inputs = inputs.to(device)
-
-        with torch.no_grad():
-            num_beans = int(CONFIG["SUMMARIZER"]["BEAM_SIZE"])
-            max_length = int(CONFIG["SUMMARIZER"]["MAX_LENGTH"])
-            summaries = model.generate(
-                inputs["input_ids"],
-                num_beams=num_beans,
-                length_penalty=2.0,
-                max_length=max_length,
-                early_stopping=True,
-            )
-
-        decoded_summaries = [
-            tokenizer.decode(
-                summary, skip_special_tokens=True, clean_up_tokenization_spaces=False
-            )
-            for summary in summaries
-        ]
-        summary = " ".join(decoded_summaries)
-        with open("./artefacts/" + output_file, "w", encoding="utf-8") as file:
-            file.write(summary.strip() + "\n")
-    else:
-        LOGGER.info("Breaking transcript into smaller chunks")
-        chunks = chunk_text(transcript_text)
-
-        LOGGER.info(f"Transcript broken into {len(chunks)} chunks of at most 500 words")
-
-        LOGGER.info(f"Writing summary text to: {output_file}")
-        with open(output_file, "w") as f:
-            summaries = summarize_chunks(chunks, tokenizer, model)
-            for summary in summaries:
-                f.write(summary.strip() + " ")
--- a/server/reflector/utils/viz_utils.py
+++ b/server/reflector/utils/viz_utils.py
@@ -1,283 +0,0 @@
-"""
-Utility file for all visualization related functions
-"""
-
-import ast
-import collections
-import datetime
-import os
-import pickle
-from typing import NoReturn
-
-import matplotlib.pyplot as plt
-import pandas as pd
-import scattertext as st
-import spacy
-from nltk.corpus import stopwords
-from wordcloud import STOPWORDS, WordCloud
-
-en = spacy.load("en_core_web_md")
-spacy_stopwords = en.Defaults.stop_words
-
-STOPWORDS = (
-    set(STOPWORDS).union(set(stopwords.words("english"))).union(set(spacy_stopwords))
-)
-
-
-def create_wordcloud(
-    timestamp: datetime.datetime.timestamp, real_time: bool = False
-) -> NoReturn:
-    """
-    Create a basic word cloud visualization of transcribed text
-    :return: None. The wordcloud image is saved locally
-    """
-    filename = "transcript"
-    if real_time:
-        filename = (
-            "real_time_"
-            + filename
-            + "_"
-            + timestamp.strftime("%m-%d-%Y_%H:%M:%S")
-            + ".txt"
-        )
-    else:
-        filename += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
-
-    with open("./artefacts/" + filename, "r") as f:
-        transcription_text = f.read()
-
-    # python_mask = np.array(PIL.Image.open("download1.png"))
-
-    wordcloud = WordCloud(
-        height=800,
-        width=800,
-        background_color="white",
-        stopwords=STOPWORDS,
-        min_font_size=8,
-    ).generate(transcription_text)
-
-    # Plot wordcloud and save image
-    plt.figure(facecolor=None)
-    plt.imshow(wordcloud, interpolation="bilinear")
-    plt.axis("off")
-    plt.tight_layout(pad=0)
-
-    wordcloud = "wordcloud"
-    if real_time:
-        wordcloud = (
-            "real_time_"
-            + wordcloud
-            + "_"
-            + timestamp.strftime("%m-%d-%Y_%H:%M:%S")
-            + ".png"
-        )
-    else:
-        wordcloud += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
-
-    plt.savefig("./artefacts/" + wordcloud)
-
-
-def create_talk_diff_scatter_viz(
-    timestamp: datetime.datetime.timestamp, real_time: bool = False
-) -> NoReturn:
-    """
-    Perform agenda vs transcription diff to see covered topics.
-    Create a scatter plot of words in topics.
-    :return: None. Saved locally.
-    """
-    spacy_model = "en_core_web_md"
-    nlp = spacy.load(spacy_model)
-    nlp.add_pipe("sentencizer")
-
-    agenda_topics = []
-    agenda = []
-    # Load the agenda
-    with open(os.path.join(os.getcwd(), "agenda-headers.txt"), "r") as f:
-        for line in f.readlines():
-            if line.strip():
-                agenda.append(line.strip())
-                agenda_topics.append(line.split(":")[0])
-
-    # Load the transcription with timestamp
-    if real_time:
-        filename = (
-            "./artefacts/real_time_transcript_with_timestamp_"
-            + timestamp.strftime("%m-%d-%Y_%H:%M:%S")
-            + ".txt"
-        )
-    else:
-        filename = (
-            "./artefacts/transcript_with_timestamp_"
-            + timestamp.strftime("%m-%d-%Y_%H:%M:%S")
-            + ".txt"
-        )
-    with open(filename) as file:
-        transcription_timestamp_text = file.read()
-
-    res = ast.literal_eval(transcription_timestamp_text)
-    chunks = res["chunks"]
-
-    # create df for processing
-    df = pd.DataFrame.from_dict(res["chunks"])
-
-    covered_items = {}
-    # ts: timestamp
-    # Map each timestamped chunk with top1 and top2 matched agenda
-    ts_to_topic_mapping_top_1 = {}
-    ts_to_topic_mapping_top_2 = {}
-
-    # Also create a mapping of the different timestamps
-    # in which each topic was covered
-    topic_to_ts_mapping_top_1 = collections.defaultdict(list)
-    topic_to_ts_mapping_top_2 = collections.defaultdict(list)
-
-    similarity_threshold = 0.7
-
-    for c in chunks:
-        doc_transcription = nlp(c["text"])
-        topic_similarities = []
-        for item in range(len(agenda)):
-            item_doc = nlp(agenda[item])
-            # if not doc_transcription or not all
-            # (token.has_vector for token in doc_transcription):
-            if not doc_transcription:
-                continue
-            similarity = doc_transcription.similarity(item_doc)
-            topic_similarities.append((item, similarity))
-        topic_similarities.sort(key=lambda x: x[1], reverse=True)
-        for i in range(2):
-            if topic_similarities[i][1] >= similarity_threshold:
-                covered_items[agenda[topic_similarities[i][0]]] = True
-            # top1 match
-            if i == 0:
-                ts_to_topic_mapping_top_1[c["timestamp"]] = agenda_topics[
-                    topic_similarities[i][0]
-                ]
-                topic_to_ts_mapping_top_1[
-                    agenda_topics[topic_similarities[i][0]]
-                ].append(c["timestamp"])
-            # top2 match
-            else:
-                ts_to_topic_mapping_top_2[c["timestamp"]] = agenda_topics[
-                    topic_similarities[i][0]
-                ]
-                topic_to_ts_mapping_top_2[
-                    agenda_topics[topic_similarities[i][0]]
-                ].append(c["timestamp"])
-
-    def create_new_columns(record: dict) -> dict:
-        """
-        Accumulate the mapping information into the df
-        :param record:
-        :return:
-        """
-        record["ts_to_topic_mapping_top_1"] = ts_to_topic_mapping_top_1[
-            record["timestamp"]
-        ]
-        record["ts_to_topic_mapping_top_2"] = ts_to_topic_mapping_top_2[
-            record["timestamp"]
-        ]
-        return record
-
-    df = df.apply(create_new_columns, axis=1)
-
-    # Count the number of items covered and calculate the percentage
-    num_covered_items = sum(covered_items.values())
-    percentage_covered = num_covered_items / len(agenda) * 100
-
-    # Print the results
-    print("💬 Agenda items covered in the transcription:")
-    for item in agenda:
-        if item in covered_items and covered_items[item]:
-            print("✅ ", item)
-        else:
-            print("❌ ", item)
-    print("📊 Coverage: {:.2f}%".format(percentage_covered))
-
-    # Save df, mappings for further experimentation
-    df_name = "df"
-    if real_time:
-        df_name = (
-            "real_time_"
-            + df_name
-            + "_"
-            + timestamp.strftime("%m-%d-%Y_%H:%M:%S")
-            + ".pkl"
-        )
-    else:
-        df_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
-    df.to_pickle("./artefacts/" + df_name)
-
-    my_mappings = [
-        ts_to_topic_mapping_top_1,
-        ts_to_topic_mapping_top_2,
-        topic_to_ts_mapping_top_1,
-        topic_to_ts_mapping_top_2,
-    ]
-
-    mappings_name = "mappings"
-    if real_time:
-        mappings_name = (
-            "real_time_"
-            + mappings_name
-            + "_"
-            + timestamp.strftime("%m-%d-%Y_%H:%M:%S")
-            + ".pkl"
-        )
-    else:
-        mappings_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
-    pickle.dump(my_mappings, open("./artefacts/" + mappings_name, "wb"))
-
-    # to load,  my_mappings = pickle.load( open ("mappings.pkl", "rb") )
-
-    # pick the 2 most matched topic to be used for plotting
-    topic_times = collections.defaultdict(int)
-    for key in ts_to_topic_mapping_top_1.keys():
-        if key[0] is None or key[1] is None:
-            continue
-        duration = key[1] - key[0]
-        topic_times[ts_to_topic_mapping_top_1[key]] += duration
-
-    topic_times = sorted(topic_times.items(), key=lambda x: x[1], reverse=True)
-
-    if len(topic_times) > 1:
-        cat_1 = topic_times[0][0]
-        cat_1_name = topic_times[0][0]
-        cat_2_name = topic_times[1][0]
-
-        # Scatter plot of topics
-        df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
-        corpus = (
-            st.CorpusFromParsedDocuments(
-                df, category_col="ts_to_topic_mapping_top_1", parsed_col="parse"
-            )
-            .build()
-            .get_unigram_corpus()
-            .compact(st.AssociationCompactor(2000))
-        )
-        html = st.produce_scattertext_explorer(
-            corpus,
-            category=cat_1,
-            category_name=cat_1_name,
-            not_category_name=cat_2_name,
-            minimum_term_frequency=0,
-            pmi_threshold_coefficient=0,
-            width_in_pixels=1000,
-            transform=st.Scalers.dense_rank,
-        )
-        if real_time:
-            with open(
-                "./artefacts/real_time_scatter_"
-                + timestamp.strftime("%m-%d-%Y_%H:%M:%S")
-                + ".html",
-                "w",
-            ) as file:
-                file.write(html)
-        else:
-            with open(
-                "./artefacts/scatter_"
-                + timestamp.strftime("%m-%d-%Y_%H:%M:%S")
-                + ".html",
-                "w",
-            ) as file:
-                file.write(html)