mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-22 13:19:05 +00:00
minor refactor
@@ -1,6 +1,7 @@
+import configparser
+
 import boto3
 import botocore
-import configparser
 from loguru import logger
 
 config = configparser.ConfigParser()
@@ -12,6 +13,7 @@ s3 = boto3.client('s3',
     aws_access_key_id=config["DEFAULT"]["AWS_ACCESS_KEY"],
     aws_secret_access_key=config["DEFAULT"]["AWS_SECRET_KEY"])
 
+
 def upload_files(files_to_upload):
     """
     Upload a list of files to the configured S3 bucket
@@ -45,6 +47,7 @@ def download_files(files_to_download):
+
 if __name__ == "__main__":
     import sys
 
     if sys.argv[1] == "download":
         download_files([sys.argv[2]])
     elif sys.argv[1] == "upload":
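Context note: this file wraps a boto3 S3 client configured from config.ini, as the hunk context above shows. A minimal sketch of how the upload half plausibly fits together; the AWS_BUCKET_NAME config key and the basename-as-object-key convention are assumptions, not something this diff shows.

import configparser
import os

import boto3

config = configparser.ConfigParser()
config.read("config.ini")

# Mirrors the client construction visible in the hunk context above.
s3 = boto3.client("s3",
                  aws_access_key_id=config["DEFAULT"]["AWS_ACCESS_KEY"],
                  aws_secret_access_key=config["DEFAULT"]["AWS_SECRET_KEY"])


def upload_files(files_to_upload):
    """Upload a list of local files to the configured S3 bucket."""
    bucket = config["DEFAULT"]["AWS_BUCKET_NAME"]  # assumed config key
    for path in files_to_upload:
        # upload_file(Filename, Bucket, Key) is boto3's managed upload helper.
        s3.upload_file(path, bucket, os.path.basename(path))

Per the __main__ block above, the script would then be driven as, e.g., `python <script> upload some_file.txt`.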
@@ -1,9 +1,10 @@
 import asyncio
-from functools import partial
 import contextlib
+from functools import partial
 from threading import Lock
 from typing import ContextManager, Generic, TypeVar
 
+
 def run_in_executor(func, *args, executor=None, **kwargs):
     callback = partial(func, *args, **kwargs)
     loop = asyncio.get_event_loop()
@@ -11,6 +12,8 @@ def run_in_executor(func, *args, executor=None, **kwargs):
 
 
+T = TypeVar("T")
+
 
 class Mutex(Generic[T]):
     def __init__(self, value: T):
         self.__value = value
@@ -22,4 +25,4 @@ class Mutex(Generic[T]):
         try:
             yield self.__value
         finally:
-                self.__lock.release()
+            self.__lock.release()
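Only fragments of this utilities module appear in the hunks. A runnable sketch of what the pieces plausibly assemble into; the context-manager name lock() is an assumption, since the diff shows only the try/finally body.

import asyncio
import contextlib
from functools import partial
from threading import Lock
from typing import Generic, TypeVar

T = TypeVar("T")


def run_in_executor(func, *args, executor=None, **kwargs):
    # Offload a blocking callable to a thread pool; the returned future
    # can be awaited from async code without blocking the event loop.
    callback = partial(func, *args, **kwargs)
    loop = asyncio.get_event_loop()
    return loop.run_in_executor(executor, callback)


class Mutex(Generic[T]):
    # Pairs a value with a threading.Lock so the value is only
    # reachable while the lock is held.
    def __init__(self, value: T):
        self.__value = value
        self.__lock = Lock()

    @contextlib.contextmanager
    def lock(self):  # accessor name is an assumption
        self.__lock.acquire()
        try:
            yield self.__value
        finally:
            self.__lock.release()


async def main():
    print(await run_in_executor(sum, [1, 2, 3]))  # prints 6
    counter = Mutex([0])
    with counter.lock() as value:
        value[0] += 1

asyncio.run(main())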
@@ -1,23 +1,27 @@
-import torch
 import configparser
+
 import nltk
-from transformers import BartTokenizer, BartForConditionalGeneration
+import torch
 from loguru import logger
 from nltk.corpus import stopwords
-from sklearn.feature_extraction.text import TfidfVectorizer
 from nltk.tokenize import word_tokenize
+from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+from transformers import BartTokenizer, BartForConditionalGeneration
+
 nltk.download('punkt', quiet=True)
 
 config = configparser.ConfigParser()
 config.read('config.ini')
 
+
 def preprocess_sentence(sentence):
     stop_words = set(stopwords.words('english'))
     tokens = word_tokenize(sentence.lower())
     tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
     return ' '.join(tokens)
 
+
 def compute_similarity(sent1, sent2):
     """
     Compute the similarity
@@ -28,6 +32,7 @@ def compute_similarity(sent1, sent2):
         return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
     return 0.0
 
+
 def remove_almost_alike_sentences(sentences, threshold=0.7):
     num_sentences = len(sentences)
     removed_indices = set()
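Context note: the returns above are the tail of a TF-IDF cosine-similarity check between two preprocessed sentences. A standalone sketch of the same idea, reconstructed around the two visible return statements rather than copied from the repository:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_similarity_sketch(sent1, sent2):
    # TfidfVectorizer raises on an empty vocabulary, which is the kind
    # of case a fallback "return 0.0" would cover.
    if not sent1.strip() or not sent2.strip():
        return 0.0
    tfidf_matrix = TfidfVectorizer().fit_transform([sent1, sent2])
    # Row 0 vs row 1 of the 2-row TF-IDF matrix; the result is a 1x1 array.
    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

print(compute_similarity_sketch("the cat sat", "the cat slept"))  # value in (0, 1)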
@@ -55,12 +60,14 @@ def remove_almost_alike_sentences(sentences, threshold=0.7):
     filtered_sentences = [sentences[i] for i in range(num_sentences) if i not in removed_indices]
     return filtered_sentences
 
+
 def remove_outright_duplicate_sentences_from_chunk(chunk):
     chunk_text = chunk["text"]
     sentences = nltk.sent_tokenize(chunk_text)
     nonduplicate_sentences = list(dict.fromkeys(sentences))
     return nonduplicate_sentences
 
+
 def remove_whisper_repetitive_hallucination(nonduplicate_sentences):
     chunk_sentences = []
 
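A small note on the exact-duplicate pass above: dict.fromkeys is an order-preserving dedup (insertion order is guaranteed for dict since Python 3.7), keeping the first occurrence of each sentence.

sentences = ["Hello.", "Hello.", "How are you?", "Hello."]
# Keys of a dict are unique; building one from the list and converting
# back drops repeats while preserving first-seen order.
print(list(dict.fromkeys(sentences)))  # ['Hello.', 'How are you?']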
@@ -80,6 +87,7 @@ def remove_whisper_repetitive_hallucination(nonduplicate_sentences):
         chunk_sentences.append(temp_result)
     return chunk_sentences
 
+
 def post_process_transcription(whisper_result):
     transcript_text = ""
     for chunk in whisper_result["chunks"]:
@@ -107,12 +115,13 @@ def summarize_chunks(chunks, tokenizer, model):
         input_ids = input_ids.to(device)
         with torch.no_grad():
             summary_ids = model.generate(input_ids,
-                num_beams=int(config["DEFAULT"]["BEAM_SIZE"]), length_penalty=2.0,
-                max_length=int(config["DEFAULT"]["MAX_LENGTH"]), early_stopping=True)
+                                         num_beams=int(config["DEFAULT"]["BEAM_SIZE"]), length_penalty=2.0,
+                                         max_length=int(config["DEFAULT"]["MAX_LENGTH"]), early_stopping=True)
         summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
         summaries.append(summary)
     return summaries
 
+
 def chunk_text(text, max_chunk_length=int(config["DEFAULT"]["MAX_CHUNK_LENGTH"])):
     """
     Split text into smaller chunks.
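Context note: the generate() call being re-indented above is standard Hugging Face beam-search summarization. A standalone sketch of the same call; the checkpoint name "facebook/bart-large-cnn" is an assumption, as the diff does not show which weights the repo loads.

import configparser

import torch
from transformers import BartForConditionalGeneration, BartTokenizer

config = configparser.ConfigParser()
config.read("config.ini")

# Assumed checkpoint; any BART summarization checkpoint exposes the same API.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

text = "Long transcript text to summarize ..."
input_ids = tokenizer(text, return_tensors="pt", truncation=True).input_ids
with torch.no_grad():
    summary_ids = model.generate(
        input_ids,
        num_beams=int(config["DEFAULT"]["BEAM_SIZE"]),
        length_penalty=2.0,
        max_length=int(config["DEFAULT"]["MAX_LENGTH"]),
        early_stopping=True,
    )
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))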
@@ -132,6 +141,7 @@ def chunk_text(text, max_chunk_length=int(config["DEFAULT"]["MAX_CHUNK_LENGTH"])
         chunks.append(current_chunk.strip())
     return chunks
 
+
 def summarize(transcript_text, timestamp,
               real_time=False, summarize_using_chunks=config["DEFAULT"]["SUMMARIZE_USING_CHUNKS"]):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
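Only the tail of chunk_text is visible in the hunk above. One plausible shape for it, sketched under the assumption that it accumulates whole sentences until the configured limit; the visible chunks.append(current_chunk.strip()) and return chunks are consistent with this, but the real body may differ.

import nltk

def chunk_text_sketch(text, max_chunk_length=500):
    chunks = []
    current_chunk = ""
    for sentence in nltk.sent_tokenize(text):
        # Start a new chunk once adding this sentence would exceed the limit.
        if current_chunk and len(current_chunk) + len(sentence) > max_chunk_length:
            chunks.append(current_chunk.strip())
            current_chunk = ""
        current_chunk += sentence + " "
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks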
@@ -1,15 +1,16 @@
-import matplotlib.pyplot as plt
-from wordcloud import WordCloud, STOPWORDS
-from nltk.corpus import stopwords
-import collections
-import spacy
-import os
-from pathlib import Path
-import pickle
 import ast
+import collections
+import configparser
+import os
+import pickle
+from pathlib import Path
+
+import matplotlib.pyplot as plt
 import pandas as pd
 import scattertext as st
-import configparser
+import spacy
+from nltk.corpus import stopwords
+from wordcloud import WordCloud, STOPWORDS
 
 config = configparser.ConfigParser()
 config.read('config.ini')
@@ -29,7 +30,7 @@ def create_wordcloud(timestamp, real_time=False):
     if real_time:
         filename = "real_time_" + filename + "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
     else:
-       filename += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
+        filename += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
 
     with open("./artefacts/" + filename, "r") as f:
         transcription_text = f.read()
@@ -202,4 +203,4 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
     if real_time:
         open('./artefacts/real_time_scatter_' + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
     else:
-       open('./artefacts/scatter_' + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
+        open('./artefacts/scatter_' + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
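Context note: both functions touched in this file render artefacts from a transcript, a word-cloud image and a scattertext HTML page. A minimal word-cloud sketch using the libraries imported above; the input path is hypothetical, since the real filenames are timestamped as in create_wordcloud.

import matplotlib.pyplot as plt
from wordcloud import STOPWORDS, WordCloud

# Hypothetical transcript path for illustration only.
text = open("./artefacts/example_transcript.txt").read()

# WordCloud sizes words by frequency; STOPWORDS filters filler words.
wc = WordCloud(width=800, height=400, stopwords=STOPWORDS,
               background_color="white").generate(text)

plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.savefig("./artefacts/wordcloud_example.png")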