fix pipeline bugs

2026-02-04 18:06:48 +00:00 · 2023-06-23 12:16:10 +05:30
parent 037ef8bc41
commit c8c2f6b6b7
4 changed files with 30 additions and 15 deletions
--- a/real_time_transcript_with_timestamp_06-21-2023_153233.txt
+++ b/real_time_transcript_with_timestamp_06-21-2023_153233.txt
--- a/text_utilities.py
+++ b/text_utilities.py
@@ -20,8 +20,11 @@ def preprocess_sentence(sentence):
 def compute_similarity(sent1, sent2):
    """
    Return the TF-IDF cosine similarity between two sentences.

    :param sent1: first sentence as a string, or None
    :param sent2: second sentence as a string, or None
    :return: cosine similarity of the two TF-IDF vectors, or 0.0 when either
             sentence is None or the pair produces no vocabulary at all
    """
    # Nothing to compare, so do not even build a vectorizer.
    if sent1 is None or sent2 is None:
        return 0.0
    tfidf_vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([sent1, sent2])
    except ValueError:
        # fit_transform raises ValueError ("empty vocabulary") when neither
        # sentence contains a token the vectorizer keeps, e.g. only
        # one-character words. Such a pair cannot be called similar.
        return 0.0
    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
 def remove_almost_alike_sentences(sentences, threshold=0.7):
    num_sentences = len(sentences)
@@ -31,8 +34,17 @@ def remove_almost_alike_sentences(sentences, threshold=0.7):
        if i not in removed_indices:
            for j in range(i + 1, num_sentences):
                if j not in removed_indices:
                    l_i = len(sentences[i])
                    l_j = len(sentences[j])
                    if l_i == 0 or l_j == 0:
                        if l_i == 0:
                            removed_indices.add(i)
                        if l_j == 0:
                            removed_indices.add(j)
                    else:
                        sentence1 = preprocess_sentence(sentences[i])
                        sentence2 = preprocess_sentence(sentences[j])
                        if len(sentence1) != 0 and len(sentence2) != 0:
                            similarity = compute_similarity(sentence1, sentence2)
                            if similarity >= threshold:
@@ -67,11 +79,14 @@ def remove_whisper_repetitive_hallucination(nonduplicate_sentences):
    return chunk_sentences
 def post_process_transcription(whisper_result):
    """
    Clean every chunk of a transcription result in place.

    Each chunk goes through duplicate removal, repetitive-hallucination
    removal and near-duplicate removal; the surviving sentences are joined
    with single spaces and written back to the chunk's "text". The cleaned
    chunk texts are then concatenated (no separator) into the result's
    top-level "text".

    :param whisper_result: mapping with a "chunks" list of chunk mappings
    :return: the same whisper_result object, updated in place
    """
    cleaned_texts = []
    for segment in whisper_result["chunks"]:
        deduplicated = remove_outright_duplicate_sentences_from_chunk(segment)
        without_hallucinations = remove_whisper_repetitive_hallucination(deduplicated)
        surviving_sentences = remove_almost_alike_sentences(without_hallucinations)
        segment["text"] = " ".join(surviving_sentences)
        cleaned_texts.append(segment["text"])
    whisper_result["text"] = "".join(cleaned_texts)
    return whisper_result
--- a/viz_utilities.py
+++ b/viz_utilities.py
@@ -1,5 +1,6 @@
 import matplotlib.pyplot as plt
 from wordcloud import WordCloud, STOPWORDS
 from nltk.corpus import stopwords as nltk_stopwords
 import collections
 import spacy
 import pickle
@@ -11,6 +12,10 @@ import configparser
 # Read runtime settings from config.ini (resolved against the working directory).
 config = configparser.ConfigParser()
 config.read('config.ini')
 # The spaCy English pipeline is loaded here for its built-in stop-word list.
 en = spacy.load('en_core_web_md')
 spacy_stopwords = en.Defaults.stop_words
 # nltk_stopwords is a corpus reader, not a collection of words: the English
 # list has to be requested through .words(), since the reader object itself
 # cannot be iterated. Assumes the NLTK "stopwords" corpus is installed.
 STOPWORDS = (
     set(STOPWORDS)
     .union(nltk_stopwords.words('english'))
     .union(spacy_stopwords)
 )
 def create_wordcloud(timestamp, real_time=False):
    """
@@ -26,13 +31,11 @@ def create_wordcloud(timestamp, real_time=False):
    with open(filename, "r") as f:
        transcription_text = f.read()
    stopwords = set(STOPWORDS)
    # python_mask = np.array(PIL.Image.open("download1.png"))
    wordcloud = WordCloud(height=800, width=800,
                          background_color='white',
-                          stopwords=stopwords,
+                          stopwords=STOPWORDS,
                          min_font_size=8).generate(transcription_text)
    # Plot wordcloud and save image
--- a/whisjax_realtime.py
+++ b/whisjax_realtime.py
@@ -106,10 +106,6 @@ def main():
    transcript_with_timestamp = post_process_transcription(transcript_with_timestamp)
    transcript_text = ""
    for chunk in transcript_with_timestamp["chunks"]:
        transcript_text += chunk["text"]
    logger.info("Creating word cloud")
    create_wordcloud(NOW, True)
@@ -125,7 +121,7 @@ def main():
                       "real_time_mappings_" + suffix + ".pkl"]
    upload_files(files_to_upload)
-    summarize(transcript_text, NOW, True, True)
+    summarize(transcript_with_timestamp["text"], NOW, True, True)
    logger.info("Summarization completed")