Minor refactor: regroup and sort imports, normalize blank lines and whitespace

2026-02-05 18:36:45 +00:00 · 2023-07-10 22:48:22 +05:30
parent 73c4270764
commit 3128813ca3
8 changed files with 82 additions and 85 deletions
--- a/utils/file_utilities.py
+++ b/utils/file_utilities.py
@@ -1,6 +1,7 @@
+import configparser
+
 import boto3
 import botocore
-import configparser
 from loguru import logger

 config = configparser.ConfigParser()
@@ -12,6 +13,7 @@ s3 = boto3.client('s3',
                  aws_access_key_id=config["DEFAULT"]["AWS_ACCESS_KEY"],
                  aws_secret_access_key=config["DEFAULT"]["AWS_SECRET_KEY"])

+
 def upload_files(files_to_upload):
    """
    Upload a list of files to the configured S3 bucket
@@ -45,6 +47,7 @@ def download_files(files_to_download):

 if __name__ == "__main__":
    import sys
+
    if sys.argv[1] == "download":
        download_files([sys.argv[2]])
    elif sys.argv[1] == "upload":
--- a/utils/server_utils.py
+++ b/utils/server_utils.py
@@ -1,9 +1,10 @@
 import asyncio
-from functools import partial
 import contextlib
+from functools import partial
 from threading import Lock
 from typing import ContextManager, Generic, TypeVar

+
 def run_in_executor(func, *args, executor=None, **kwargs):
    callback = partial(func, *args, **kwargs)
    loop = asyncio.get_event_loop()
@@ -11,6 +12,8 @@ def run_in_executor(func, *args, executor=None, **kwargs):


 T = TypeVar("T")
+
+
 class Mutex(Generic[T]):
    def __init__(self, value: T):
        self.__value = value
@@ -22,4 +25,4 @@ class Mutex(Generic[T]):
        try:
            yield self.__value
        finally:
-            self.__lock.release()
+            self.__lock.release()
--- a/utils/text_utilities.py
+++ b/utils/text_utilities.py
@@ -1,23 +1,27 @@
-import torch
 import configparser
+
 import nltk
-from transformers import BartTokenizer, BartForConditionalGeneration
+import torch
 from loguru import logger
 from nltk.corpus import stopwords
-from sklearn.feature_extraction.text import TfidfVectorizer
 from nltk.tokenize import word_tokenize
+from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+from transformers import BartTokenizer, BartForConditionalGeneration
+
 nltk.download('punkt', quiet=True)

 config = configparser.ConfigParser()
 config.read('config.ini')

+
 def preprocess_sentence(sentence):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(sentence.lower())
    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
    return ' '.join(tokens)

+
 def compute_similarity(sent1, sent2):
    """
    Compute the similarity
@@ -28,6 +32,7 @@ def compute_similarity(sent1, sent2):
        return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
    return 0.0

+
 def remove_almost_alike_sentences(sentences, threshold=0.7):
    num_sentences = len(sentences)
    removed_indices = set()
@@ -55,12 +60,14 @@ def remove_almost_alike_sentences(sentences, threshold=0.7):
    filtered_sentences = [sentences[i] for i in range(num_sentences) if i not in removed_indices]
    return filtered_sentences

+
 def remove_outright_duplicate_sentences_from_chunk(chunk):
    chunk_text = chunk["text"]
    sentences = nltk.sent_tokenize(chunk_text)
    nonduplicate_sentences = list(dict.fromkeys(sentences))
    return nonduplicate_sentences

+
 def remove_whisper_repetitive_hallucination(nonduplicate_sentences):
    chunk_sentences = []

@@ -80,6 +87,7 @@ def remove_whisper_repetitive_hallucination(nonduplicate_sentences):
        chunk_sentences.append(temp_result)
    return chunk_sentences

+
 def post_process_transcription(whisper_result):
    transcript_text = ""
    for chunk in whisper_result["chunks"]:
@@ -107,12 +115,13 @@ def summarize_chunks(chunks, tokenizer, model):
        input_ids = input_ids.to(device)
        with torch.no_grad():
            summary_ids = model.generate(input_ids,
-                               num_beams=int(config["DEFAULT"]["BEAM_SIZE"]), length_penalty=2.0,
-                               max_length=int(config["DEFAULT"]["MAX_LENGTH"]), early_stopping=True)
+                                         num_beams=int(config["DEFAULT"]["BEAM_SIZE"]), length_penalty=2.0,
+                                         max_length=int(config["DEFAULT"]["MAX_LENGTH"]), early_stopping=True)
            summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            summaries.append(summary)
    return summaries

+
 def chunk_text(text, max_chunk_length=int(config["DEFAULT"]["MAX_CHUNK_LENGTH"])):
    """
    Split text into smaller chunks.
@@ -132,6 +141,7 @@ def chunk_text(text, max_chunk_length=int(config["DEFAULT"]["MAX_CHUNK_LENGTH"])
    chunks.append(current_chunk.strip())
    return chunks

+
 def summarize(transcript_text, timestamp,
              real_time=False, summarize_using_chunks=config["DEFAULT"]["SUMMARIZE_USING_CHUNKS"]):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
--- a/utils/viz_utilities.py
+++ b/utils/viz_utilities.py
@@ -1,15 +1,16 @@
-import matplotlib.pyplot as plt
-from wordcloud import WordCloud, STOPWORDS
-from nltk.corpus import stopwords
-import collections
-import spacy
-import os
-from pathlib import Path
-import pickle
 import ast
+import collections
+import configparser
+import os
+import pickle
+from pathlib import Path
+
+import matplotlib.pyplot as plt
 import pandas as pd
 import scattertext as st
-import configparser
+import spacy
+from nltk.corpus import stopwords
+from wordcloud import WordCloud, STOPWORDS

 config = configparser.ConfigParser()
 config.read('config.ini')
@@ -29,7 +30,7 @@ def create_wordcloud(timestamp, real_time=False):
    if real_time:
        filename = "real_time_" + filename + "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
    else:
-        filename += "_" +  timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
+        filename += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"

    with open("./artefacts/" + filename, "r") as f:
        transcription_text = f.read()
@@ -202,4 +203,4 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
    if real_time:
        open('./artefacts/real_time_scatter_' + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
    else:
-        open('./artefacts/scatter_' + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
+        open('./artefacts/scatter_' + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)