minor refactor

This commit is contained in:
Gokul Mohanarangan
2023-07-10 22:48:22 +05:30
parent 73c4270764
commit 3128813ca3
8 changed files with 82 additions and 85 deletions

View File

@@ -1,6 +1,7 @@
import configparser
import boto3
import botocore
import configparser
from loguru import logger
config = configparser.ConfigParser()
@@ -12,6 +13,7 @@ s3 = boto3.client('s3',
aws_access_key_id=config["DEFAULT"]["AWS_ACCESS_KEY"],
aws_secret_access_key=config["DEFAULT"]["AWS_SECRET_KEY"])
def upload_files(files_to_upload):
"""
Upload a list of files to the configured S3 bucket
@@ -45,6 +47,7 @@ def download_files(files_to_download):
if __name__ == "__main__":
import sys
if sys.argv[1] == "download":
download_files([sys.argv[2]])
elif sys.argv[1] == "upload":

View File

@@ -1,9 +1,10 @@
import asyncio
from functools import partial
import contextlib
from functools import partial
from threading import Lock
from typing import ContextManager, Generic, TypeVar
def run_in_executor(func, *args, executor=None, **kwargs):
callback = partial(func, *args, **kwargs)
loop = asyncio.get_event_loop()
@@ -11,6 +12,8 @@ def run_in_executor(func, *args, executor=None, **kwargs):
T = TypeVar("T")
class Mutex(Generic[T]):
def __init__(self, value: T):
self.__value = value
@@ -22,4 +25,4 @@ class Mutex(Generic[T]):
try:
yield self.__value
finally:
self.__lock.release()
self.__lock.release()

View File

@@ -1,23 +1,27 @@
import torch
import configparser
import nltk
from transformers import BartTokenizer, BartForConditionalGeneration
import torch
from loguru import logger
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BartTokenizer, BartForConditionalGeneration
nltk.download('punkt', quiet=True)
config = configparser.ConfigParser()
config.read('config.ini')
def preprocess_sentence(sentence):
stop_words = set(stopwords.words('english'))
tokens = word_tokenize(sentence.lower())
tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
return ' '.join(tokens)
def compute_similarity(sent1, sent2):
"""
Compute the similarity
@@ -28,6 +32,7 @@ def compute_similarity(sent1, sent2):
return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
return 0.0
def remove_almost_alike_sentences(sentences, threshold=0.7):
num_sentences = len(sentences)
removed_indices = set()
@@ -55,12 +60,14 @@ def remove_almost_alike_sentences(sentences, threshold=0.7):
filtered_sentences = [sentences[i] for i in range(num_sentences) if i not in removed_indices]
return filtered_sentences
def remove_outright_duplicate_sentences_from_chunk(chunk):
chunk_text = chunk["text"]
sentences = nltk.sent_tokenize(chunk_text)
nonduplicate_sentences = list(dict.fromkeys(sentences))
return nonduplicate_sentences
def remove_whisper_repetitive_hallucination(nonduplicate_sentences):
chunk_sentences = []
@@ -80,6 +87,7 @@ def remove_whisper_repetitive_hallucination(nonduplicate_sentences):
chunk_sentences.append(temp_result)
return chunk_sentences
def post_process_transcription(whisper_result):
transcript_text = ""
for chunk in whisper_result["chunks"]:
@@ -107,12 +115,13 @@ def summarize_chunks(chunks, tokenizer, model):
input_ids = input_ids.to(device)
with torch.no_grad():
summary_ids = model.generate(input_ids,
num_beams=int(config["DEFAULT"]["BEAM_SIZE"]), length_penalty=2.0,
max_length=int(config["DEFAULT"]["MAX_LENGTH"]), early_stopping=True)
num_beams=int(config["DEFAULT"]["BEAM_SIZE"]), length_penalty=2.0,
max_length=int(config["DEFAULT"]["MAX_LENGTH"]), early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
summaries.append(summary)
return summaries
def chunk_text(text, max_chunk_length=int(config["DEFAULT"]["MAX_CHUNK_LENGTH"])):
"""
Split text into smaller chunks.
@@ -132,6 +141,7 @@ def chunk_text(text, max_chunk_length=int(config["DEFAULT"]["MAX_CHUNK_LENGTH"])
chunks.append(current_chunk.strip())
return chunks
def summarize(transcript_text, timestamp,
real_time=False, summarize_using_chunks=config["DEFAULT"]["SUMMARIZE_USING_CHUNKS"]):
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

View File

@@ -1,15 +1,16 @@
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
import collections
import spacy
import os
from pathlib import Path
import pickle
import ast
import collections
import configparser
import os
import pickle
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
import scattertext as st
import configparser
import spacy
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
config = configparser.ConfigParser()
config.read('config.ini')
@@ -29,7 +30,7 @@ def create_wordcloud(timestamp, real_time=False):
if real_time:
filename = "real_time_" + filename + "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
else:
filename += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
filename += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
with open("./artefacts/" + filename, "r") as f:
transcription_text = f.read()
@@ -202,4 +203,4 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
if real_time:
open('./artefacts/real_time_scatter_' + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
else:
open('./artefacts/scatter_' + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
open('./artefacts/scatter_' + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)