diff --git a/client.py b/client.py
index 5cf8d47d..b0fa46a5 100644
--- a/client.py
+++ b/client.py
@@ -47,7 +47,7 @@ async def main():
         logger.info(f"Cancelling {len(tasks)} outstanding tasks")
         await asyncio.gather(*tasks, return_exceptions=True)
-        logger.info(f"Flushing metrics")
+        logger.info("Flushing metrics")
         loop.stop()
 
     signals = (signal.SIGHUP, signal.SIGTERM, signal.SIGINT)
diff --git a/server_executor_cleaned.py b/server_executor_cleaned.py
index ecac6d48..e0fb4cc3 100644
--- a/server_executor_cleaned.py
+++ b/server_executor_cleaned.py
@@ -74,7 +74,10 @@ class AudioStreamTrack(MediaStreamTrack):
                 get_transcription, local_frames, executor=executor
             )
             whisper_result.add_done_callback(
-                lambda f: channel_send(data_channel, str(whisper_result.result()))
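+                # Once the transcription future completes, push the text
+                # over the data channel; empty results are dropped.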
+                lambda f: channel_send(data_channel,
+                                       str(f.result())) if f.result() else None
             )
@@ -126,7 +129,8 @@ async def offer(request):
     return web.Response(
         content_type="application/json",
         text=json.dumps(
-            {"sdp": pc.localDescription.sdp, "type": pc.localDescription.type}
+            {"sdp": pc.localDescription.sdp,
+             "type": pc.localDescription.type}
         ),
     )
diff --git a/stream_client.py b/stream_client.py
index d7791e5c..628ee69e 100644
--- a/stream_client.py
+++ b/stream_client.py
@@ -37,8 +37,10 @@ class StreamClient:
         self.pcs = set()
         self.time_start = None
         self.queue = asyncio.Queue()
-        self.player = MediaPlayer(':' + str(config['DEFAULT']["AV_FOUNDATION_DEVICE_ID"]),
-                                  format='avfoundation', options={'channels': '2'})
+        self.player = MediaPlayer(
+            ':' + str(config['DEFAULT']["AV_FOUNDATION_DEVICE_ID"]),
+            format='avfoundation',
+            options={'channels': '2'})
 
     def stop(self):
         self.loop.run_until_complete(self.signaling.close())
@@ -115,7 +117,8 @@ class StreamClient:
             self.channel_log(channel, "<", message)
 
             if isinstance(message, str) and message.startswith("pong"):
-                elapsed_ms = (self.current_stamp() - int(message[5:])) / 1000
+                elapsed_ms = (self.current_stamp()
+                              - int(message[5:])) / 1000
                 print(" RTT %.2f ms" % elapsed_ms)
 
         await pc.setLocalDescription(await pc.createOffer())
@@ -135,7 +138,7 @@ class StreamClient:
             answer = RTCSessionDescription(sdp=params["sdp"], type=params["type"])
             await pc.setRemoteDescription(answer)
 
-        self.reader = self.worker(f"worker", self.queue)
+        self.reader = self.worker("worker", self.queue)
 
     def get_reader(self):
         return self.reader
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/utils/log_utils.py b/utils/log_utils.py
index 0cdb30f4..f665f5da 100644
--- a/utils/log_utils.py
+++ b/utils/log_utils.py
@@ -1,4 +1,4 @@
-from loguru import logger
+import loguru
 
 
 class SingletonLogger:
@@ -11,7 +11,7 @@ class SingletonLogger:
         :return: SingletonLogger instance
         """
         if not SingletonLogger.__instance:
-            SingletonLogger.__instance = logger
+            SingletonLogger.__instance = loguru.logger
         return SingletonLogger.__instance
diff --git a/utils/run_utils.py b/utils/run_utils.py
index 0ccd6942..dca09c87 100644
--- a/utils/run_utils.py
+++ b/utils/run_utils.py
@@ -31,7 +31,8 @@ def run_in_executor(func, *args, executor=None, **kwargs):
     """
     callback = partial(func, *args, **kwargs)
     loop = asyncio.get_event_loop()
-    return asyncio.get_event_loop().run_in_executor(executor, callback)
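+    # Run the wrapped callable on the executor; yields an awaitable future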
+    return loop.run_in_executor(executor, callback)
 
 
 # Generic type template
diff --git a/utils/text_utilities.py b/utils/text_utilities.py
index 900f9194..519990cb 100644
--- a/utils/text_utilities.py
+++ b/utils/text_utilities.py
@@ -15,7 +15,8 @@ nltk.download('punkt', quiet=True)
 
 def preprocess_sentence(sentence):
     stop_words = set(stopwords.words('english'))
     tokens = word_tokenize(sentence.lower())
-    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
+    tokens = [token for token in tokens
+              if token.isalnum() and token not in stop_words]
     return ' '.join(tokens)
@@ -49,12 +50,14 @@ def remove_almost_alike_sentences(sentences, threshold=0.7):
             sentence1 = preprocess_sentence(sentences[i])
             sentence2 = preprocess_sentence(sentences[j])
             if len(sentence1) != 0 and len(sentence2) != 0:
-                similarity = compute_similarity(sentence1, sentence2)
+                similarity = compute_similarity(sentence1,
+                                                sentence2)
                 if similarity >= threshold:
                     removed_indices.add(max(i, j))
 
-    filtered_sentences = [sentences[i] for i in range(num_sentences) if i not in removed_indices]
+    filtered_sentences = [sentences[i] for i in range(num_sentences)
+                          if i not in removed_indices]
     return filtered_sentences
@@ -74,11 +77,14 @@ def remove_whisper_repetitive_hallucination(nonduplicate_sentences):
         words = nltk.word_tokenize(sent)
         n_gram_filter = 3
         for i in range(len(words)):
-            if str(words[i:i + n_gram_filter]) in seen and seen[str(words[i:i + n_gram_filter])] == words[
-                i + 1:i + n_gram_filter + 2]:
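+            # Skip a word whose n-gram context was already seen with the
+            # same continuation, the signature of a Whisper repetition loop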
+            key = str(words[i:i + n_gram_filter])
+            continuation = words[i + 1:i + n_gram_filter + 2]
+            if key in seen and seen[key] == continuation:
                 pass
             else:
-                seen[str(words[i:i + n_gram_filter])] = words[i + 1:i + n_gram_filter + 2]
+                seen[key] = continuation
                 temp_result += words[i]
                 temp_result += " "
         chunk_sentences.append(temp_result)
@@ -88,9 +94,12 @@ def post_process_transcription(whisper_result):
     transcript_text = ""
     for chunk in whisper_result["chunks"]:
-        nonduplicate_sentences = remove_outright_duplicate_sentences_from_chunk(chunk)
-        chunk_sentences = remove_whisper_repetitive_hallucination(nonduplicate_sentences)
-        similarity_matched_sentences = remove_almost_alike_sentences(chunk_sentences)
+        nonduplicate_sentences = \
+            remove_outright_duplicate_sentences_from_chunk(chunk)
+        chunk_sentences = \
+            remove_whisper_repetitive_hallucination(nonduplicate_sentences)
+        similarity_matched_sentences = \
+            remove_almost_alike_sentences(chunk_sentences)
         chunk["text"] = " ".join(similarity_matched_sentences)
         transcript_text += chunk["text"]
     whisper_result["text"] = transcript_text
@@ -111,18 +120,23 @@ def summarize_chunks(chunks, tokenizer, model):
         input_ids = tokenizer.encode(c, return_tensors='pt')
         input_ids = input_ids.to(device)
         with torch.no_grad():
-            summary_ids = model.generate(input_ids,
-                                         num_beams=int(config["DEFAULT"]["BEAM_SIZE"]), length_penalty=2.0,
-                                         max_length=int(config["DEFAULT"]["MAX_LENGTH"]), early_stopping=True)
-            summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+            summary_ids = model.generate(
+                input_ids,
+                num_beams=int(config["DEFAULT"]["BEAM_SIZE"]),
+                length_penalty=2.0,
+                max_length=int(config["DEFAULT"]["MAX_LENGTH"]),
+                early_stopping=True)
+            summary = tokenizer.decode(summary_ids[0],
+                                       skip_special_tokens=True)
         summaries.append(summary)
     return summaries
 
 
-def chunk_text(text, max_chunk_length=int(config["DEFAULT"]["MAX_CHUNK_LENGTH"])):
+def chunk_text(text,
+               max_chunk_length=int(config["DEFAULT"]["MAX_CHUNK_LENGTH"])):
     """
     Split text into smaller chunks.
 
-    :param txt: Text to be chunked
+    :param text: Text to be chunked
     :param max_chunk_length: length of chunk
     :return: chunked texts
     """
@@ -140,7 +154,8 @@ def chunk_text(text, max_chunk_length=int(config["DEFAULT"]["MAX_CHUNK_LENGTH"])
 
 
 def summarize(transcript_text, timestamp,
-              real_time=False, summarize_using_chunks=config["DEFAULT"]["SUMMARIZE_USING_CHUNKS"]):
+              real_time=False,
+              summarize_using_chunks=config["DEFAULT"]["SUMMARIZE_USING_CHUNKS"]):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     summary_model = config["DEFAULT"]["SUMMARY_MODEL"]
     if not summary_model:
@@ -157,9 +172,11 @@ def summarize(transcript_text, timestamp,
         output_filename = "real_time_" + output_filename
 
     if summarize_using_chunks != "YES":
-        inputs = tokenizer.batch_encode_plus([transcript_text], truncation=True, padding='longest',
-                                             max_length=int(config["DEFAULT"]["INPUT_ENCODING_MAX_LENGTH"]),
-                                             return_tensors='pt')
+        inputs = tokenizer.batch_encode_plus(
+            [transcript_text], truncation=True,
+            padding='longest',
+            max_length=int(config["DEFAULT"]["INPUT_ENCODING_MAX_LENGTH"]),
+            return_tensors='pt')
         inputs = inputs.to(device)
 
         with torch.no_grad():
@@ -167,8 +184,10 @@ def summarize(transcript_text, timestamp,
                                        num_beams=int(config["DEFAULT"]["BEAM_SIZE"]), length_penalty=2.0,
                                        max_length=int(config["DEFAULT"]["MAX_LENGTH"]),
                                        early_stopping=True)
-        decoded_summaries = [tokenizer.decode(summary, skip_special_tokens=True, clean_up_tokenization_spaces=False) for
-                             summary in summaries]
+        decoded_summaries = [
+            tokenizer.decode(summary, skip_special_tokens=True,
+                             clean_up_tokenization_spaces=False)
+            for summary in summaries]
         summary = " ".join(decoded_summaries)
         with open(output_filename, 'w') as f:
             f.write(summary.strip() + "\n")
@@ -176,7 +195,8 @@ def summarize(transcript_text, timestamp,
         logger.info("Breaking transcript into smaller chunks")
         chunks = chunk_text(transcript_text)
-        logger.info(f"Transcript broken into {len(chunks)} chunks of at most 500 words")  # TODO fix variable
+        logger.info(f"Transcript broken into {len(chunks)} chunks of at most "
+                    f"{config['DEFAULT']['MAX_CHUNK_LENGTH']} words")
 
         logger.info(f"Writing summary text to: {output_filename}")
         with open(output_filename, 'w') as f:
diff --git a/utils/viz_utilities.py b/utils/viz_utilities.py
index fa09144e..e1ab88c9 100644
--- a/utils/viz_utilities.py
+++ b/utils/viz_utilities.py
@@ -2,7 +2,6 @@
 import ast
 import collections
 import os
 import pickle
-from pathlib import Path
 
 import matplotlib.pyplot as plt
 import pandas as pd
@@ -14,7 +13,10 @@ from wordcloud import STOPWORDS, WordCloud
 en = spacy.load('en_core_web_md')
 spacy_stopwords = en.Defaults.stop_words
 
-STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))).union(set(spacy_stopwords))
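+# Merge the wordcloud, NLTK, and spaCy stopword lists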
+STOPWORDS = (set(STOPWORDS)
+             .union(set(stopwords.words("english")))
+             .union(set(spacy_stopwords)))
 
 
 def create_wordcloud(timestamp, real_time=False):
@@ -24,7 +26,8 @@ def create_wordcloud(timestamp, real_time=False):
     """
     filename = "transcript"
     if real_time:
-        filename = "real_time_" + filename + "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
+        filename = ("real_time_" + filename + "_"
+                    + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt")
     else:
         filename += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
@@ -46,7 +49,8 @@ def create_wordcloud(timestamp, real_time=False):
     wordcloud_name = "wordcloud"
     if real_time:
-        wordcloud_name = "real_time_" + wordcloud_name + "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
+        wordcloud_name = ("real_time_" + wordcloud_name + "_"
+                          + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png")
     else:
         wordcloud_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
@@ -66,7 +70,6 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
     agenda_topics = []
     agenda = []
     # Load the agenda
-    path = Path(__file__)
     with open(os.path.join(os.getcwd(), "agenda-headers.txt"), "r") as f:
         for line in f.readlines():
             if line.strip():
@@ -76,9 +79,11 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
     # Load the transcription with timestamp
     filename = ""
     if real_time:
-        filename = "real_time_transcript_with_timestamp_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
+        filename = ("real_time_transcript_with_timestamp_"
+                    + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt")
     else:
-        filename = "transcript_with_timestamp_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
+        filename = ("transcript_with_timestamp_"
+                    + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt")
 
     with open(filename) as f:
         transcription_timestamp_text = f.read()
@@ -94,7 +99,8 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
     ts_to_topic_mapping_top_1 = {}
     ts_to_topic_mapping_top_2 = {}
 
-    # Also create a mapping of the different timestamps in which each topic was covered
+    # Also create a mapping of the different timestamps
+    # in which each topic was covered
     topic_to_ts_mapping_top_1 = collections.defaultdict(list)
     topic_to_ts_mapping_top_2 = collections.defaultdict(list)
@@ -105,7 +111,8 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
         topic_similarities = []
         for item in range(len(agenda)):
             item_doc = nlp(agenda[item])
-            # if not doc_transcription or not all(token.has_vector for token in doc_transcription):
+            # if not doc_transcription or not all(
+            #         token.has_vector for token in doc_transcription):
             if not doc_transcription:
                 continue
             similarity = doc_transcription.similarity(item_doc)
@@ -129,8 +136,10 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
         :param record:
         :return:
         """
-        record["ts_to_topic_mapping_top_1"] = ts_to_topic_mapping_top_1[record["timestamp"]]
-        record["ts_to_topic_mapping_top_2"] = ts_to_topic_mapping_top_2[record["timestamp"]]
+        record["ts_to_topic_mapping_top_1"] = \
+            ts_to_topic_mapping_top_1[record["timestamp"]]
+        record["ts_to_topic_mapping_top_2"] = \
+            ts_to_topic_mapping_top_2[record["timestamp"]]
         return record
 
     df = df.apply(create_new_columns, axis=1)
@@ -151,7 +160,8 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
     # Save df, mappings for further experimentation
     df_name = "df"
     if real_time:
-        df_name = "real_time_" + df_name + "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
+        df_name = ("real_time_" + df_name + "_"
+                   + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl")
     else:
         df_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
     df.to_pickle(df_name)
@@ -161,7 +171,8 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
     mappings_name = "mappings"
     if real_time:
-        mappings_name = "real_time_" + mappings_name + "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
+        mappings_name = ("real_time_" + mappings_name + "_"
+                         + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl")
     else:
         mappings_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
     pickle.dump(my_mappings, open(mappings_name, "wb"))
@@ -197,6 +208,12 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
         transform=st.Scalers.dense_rank
     )
     if real_time:
-        open('./artefacts/real_time_scatter_' + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
+        with open('./artefacts/real_time_scatter_'
+                  + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html',
+                  'w') as f:
+            f.write(html)
     else:
-        open('./artefacts/scatter_' + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
+        with open('./artefacts/scatter_'
+                  + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html',
+                  'w') as f:
+            f.write(html)
diff --git a/whisjax.py b/whisjax.py
index 8946953f..9e8ce4cf 100644
--- a/whisjax.py
+++ b/whisjax.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# summarize https://www.youtube.com/watch?v=imzTxoEDH_g --transcript=transcript.txt summary.txt
+# summarize https://www.youtube.com/watch?v=imzTxoEDH_g
 # summarize https://www.sprocket.org/video/cheesemaking.mp4 summary.txt
 # summarize podcast.mp3 summary.txt
@@ -14,7 +14,6 @@
 from urllib.parse import urlparse
 
 import jax.numpy as jnp
 import moviepy.editor
-import moviepy.editor
 import nltk
 import yt_dlp as youtube_dl
 from whisper_jax import FlaxWhisperPipline
@@ -39,11 +38,16 @@ def init_argparse() -> argparse.ArgumentParser:
     """
     parser = argparse.ArgumentParser(
         usage="%(prog)s [OPTIONS] ",
-        description="Creates a transcript of a video or audio file, then summarizes it using ChatGPT."
+        description="Creates a transcript of a video or audio file, then"
+                    " summarizes it using ChatGPT."
     )
-    parser.add_argument("-l", "--language", help="Language that the summary should be written in", type=str,
-                        default="english", choices=['english', 'spanish', 'french', 'german', 'romanian'])
+    parser.add_argument("-l", "--language",
+                        help="Language that the summary should be written in",
+                        type=str,
+                        default="english",
+                        choices=['english', 'spanish', 'french', 'german',
+                                 'romanian'])
     parser.add_argument("location")
     return parser
@@ -61,10 +65,12 @@ def main():
     media_file = ""
     if url.scheme == 'http' or url.scheme == 'https':
-        # Check if we're being asked to retreive a YouTube URL, which is handled
-        # diffrently, as we'll use a secondary site to download the video first.
+        # Check if we're being asked to retrieve a YouTube URL, which is
+        # handled differently, as we'll use a secondary site to download
+        # the video first.
         if re.search('youtube.com', url.netloc, re.IGNORECASE):
-            # Download the lowest resolution YouTube video (since we're just interested in the audio).
+            # Download the lowest resolution YouTube video
+            # (since we're just interested in the audio).
             # It will be saved to the current directory.
logger.info("Downloading YouTube video at url: " + args.location) @@ -76,7 +82,7 @@ def main(): 'preferredcodec': 'mp3', 'preferredquality': '192', }], - 'outtmpl': 'audio', # Specify the output file path and name + 'outtmpl': 'audio', # Specify output file path and name } # Download the audio @@ -86,7 +92,8 @@ def main(): logger.info("Saved downloaded YouTube video to: " + media_file) else: - # XXX - Download file using urllib, check if file is audio/video using python-magic + # XXX - Download file using urllib, check if file is + # audio/video using python-magic logger.info(f"Downloading file at url: {args.location}") logger.info(" XXX - This method hasn't been implemented yet.") elif url.scheme == '': @@ -97,7 +104,7 @@ def main(): if media_file.endswith(".m4a"): subprocess.run(["ffmpeg", "-i", media_file, f"{media_file}.mp4"]) - input_file = f"{media_file}.mp4" + media_file = f"{media_file}.mp4" else: print("Unsupported URL scheme: " + url.scheme) quit() @@ -106,13 +113,15 @@ def main(): if not media_file.endswith(".mp3"): try: video = moviepy.editor.VideoFileClip(media_file) - audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name + audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3", + delete=False).name video.audio.write_audiofile(audio_filename, logger=None) logger.info(f"Extracting audio to: {audio_filename}") # Handle audio only file - except: + except Exception: audio = moviepy.editor.AudioFileClip(media_file) - audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name + audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3", + delete=False).name audio.write_audiofile(audio_filename, logger=None) else: audio_filename = media_file @@ -132,10 +141,12 @@ def main(): for chunk in whisper_result["chunks"]: transcript_text += chunk["text"] - with open("./artefacts/transcript_" + NOW.strftime("%m-%d-%Y_%H:%M:%S") + ".txt", "w") as transcript_file: + with open("./artefacts/transcript_" + NOW.strftime("%m-%d-%Y_%H:%M:%S") + + ".txt", "w") as transcript_file: transcript_file.write(transcript_text) - with open("./artefacts/transcript_with_timestamp_" + NOW.strftime("%m-%d-%Y_%H:%M:%S") + ".txt", + with open("./artefacts/transcript_with_timestamp_" + + NOW.strftime("%m-%d-%Y_%H:%M:%S") + ".txt", "w") as transcript_file_timestamps: transcript_file_timestamps.write(str(whisper_result)) diff --git a/whisjax_realtime.py b/whisjax_realtime.py index 68dc472a..63eab04d 100644 --- a/whisjax_realtime.py +++ b/whisjax_realtime.py @@ -30,7 +30,8 @@ def main(): p = pyaudio.PyAudio() AUDIO_DEVICE_ID = -1 for i in range(p.get_device_count()): - if p.get_device_info_by_index(i)["name"] == config["DEFAULT"]["BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME"]: + if p.get_device_info_by_index(i)["name"] == \ + config["DEFAULT"]["BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME"]: AUDIO_DEVICE_ID = i audio_devices = p.get_device_info_by_index(AUDIO_DEVICE_ID) stream = p.open( @@ -42,7 +43,8 @@ def main(): input_device_index=int(audio_devices['index']) ) - pipeline = FlaxWhisperPipline("openai/whisper-" + config["DEFAULT"]["WHISPER_REAL_TIME_MODEL_SIZE"], + pipeline = FlaxWhisperPipline("openai/whisper-" + + config["DEFAULT"]["WHISPER_REAL_TIME_MODEL_SIZE"], dtype=jnp.float16, batch_size=16) @@ -69,7 +71,8 @@ def main(): frames = [] start_time = time.time() for i in range(0, int(RATE / FRAMES_PER_BUFFER * RECORD_SECONDS)): - data = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False) + data = stream.read(FRAMES_PER_BUFFER, + exception_on_overflow=False) 
+            media_file = f"{media_file}.mp4"
     else:
         print("Unsupported URL scheme: " + url.scheme)
         quit()
@@ -106,13 +114,15 @@ def main():
     if not media_file.endswith(".mp3"):
         try:
             video = moviepy.editor.VideoFileClip(media_file)
-            audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
+            audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3",
+                                                         delete=False).name
             video.audio.write_audiofile(audio_filename, logger=None)
             logger.info(f"Extracting audio to: {audio_filename}")
         # Handle audio only file
-        except:
+        except Exception:
             audio = moviepy.editor.AudioFileClip(media_file)
-            audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
+            audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3",
+                                                         delete=False).name
             audio.write_audiofile(audio_filename, logger=None)
     else:
         audio_filename = media_file
@@ -132,10 +142,12 @@ def main():
     for chunk in whisper_result["chunks"]:
         transcript_text += chunk["text"]
 
-    with open("./artefacts/transcript_" + NOW.strftime("%m-%d-%Y_%H:%M:%S") + ".txt", "w") as transcript_file:
+    with open("./artefacts/transcript_" + NOW.strftime("%m-%d-%Y_%H:%M:%S")
+              + ".txt", "w") as transcript_file:
         transcript_file.write(transcript_text)
 
-    with open("./artefacts/transcript_with_timestamp_" + NOW.strftime("%m-%d-%Y_%H:%M:%S") + ".txt",
+    with open("./artefacts/transcript_with_timestamp_"
+              + NOW.strftime("%m-%d-%Y_%H:%M:%S") + ".txt",
               "w") as transcript_file_timestamps:
         transcript_file_timestamps.write(str(whisper_result))
diff --git a/whisjax_realtime.py b/whisjax_realtime.py
index 68dc472a..63eab04d 100644
--- a/whisjax_realtime.py
+++ b/whisjax_realtime.py
@@ -30,7 +30,8 @@ def main():
     p = pyaudio.PyAudio()
     AUDIO_DEVICE_ID = -1
     for i in range(p.get_device_count()):
-        if p.get_device_info_by_index(i)["name"] == config["DEFAULT"]["BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME"]:
+        if p.get_device_info_by_index(i)["name"] == \
+                config["DEFAULT"]["BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME"]:
             AUDIO_DEVICE_ID = i
     audio_devices = p.get_device_info_by_index(AUDIO_DEVICE_ID)
     stream = p.open(
@@ -42,7 +43,8 @@ def main():
         input_device_index=int(audio_devices['index'])
     )
 
-    pipeline = FlaxWhisperPipline("openai/whisper-" + config["DEFAULT"]["WHISPER_REAL_TIME_MODEL_SIZE"],
+    pipeline = FlaxWhisperPipline("openai/whisper-" +
+                                  config["DEFAULT"]["WHISPER_REAL_TIME_MODEL_SIZE"],
                                   dtype=jnp.float16,
                                   batch_size=16)
@@ -69,7 +71,9 @@ def main():
             frames = []
             start_time = time.time()
             for i in range(0, int(RATE / FRAMES_PER_BUFFER * RECORD_SECONDS)):
+                # Tolerate input overflow rather than raising mid-capture
-                data = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False)
+                data = stream.read(FRAMES_PER_BUFFER,
+                                   exception_on_overflow=False)
                 frames.append(data)
             end_time = time.time()
@@ -87,7 +91,8 @@ def main():
             if end is None:
                 end = start + 15.0
             duration = end - start
-            item = {'timestamp': (last_transcribed_time, last_transcribed_time + duration),
+            item = {'timestamp': (last_transcribed_time,
+                                  last_transcribed_time + duration),
                     'text': whisper_result['text'],
                     'stats': (str(end_time - start_time), str(duration))
                     }
@@ -97,15 +102,19 @@ def main():
 
             print(colored("", "yellow"))
             print(colored(whisper_result['text'], 'green'))
-            print(colored(" Recorded duration: " + str(end_time - start_time) + " | Transcribed duration: " +
+            print(colored(" Recorded duration: " +
+                          str(end_time - start_time) +
+                          " | Transcribed duration: " +
                           str(duration), "yellow"))
     except Exception as e:
         print(e)
     finally:
-        with open("real_time_transcript_" + NOW.strftime("%m-%d-%Y_%H:%M:%S") + ".txt", "w") as f:
+        with open("real_time_transcript_" +
+                  NOW.strftime("%m-%d-%Y_%H:%M:%S") + ".txt", "w") as f:
             f.write(transcription)
-        with open("real_time_transcript_with_timestamp_" + NOW.strftime("%m-%d-%Y_%H:%M:%S") + ".txt", "w") as f:
+        with open("real_time_transcript_with_timestamp_" +
+                  NOW.strftime("%m-%d-%Y_%H:%M:%S") + ".txt", "w") as f:
             transcript_with_timestamp["text"] = transcription
             f.write(str(transcript_with_timestamp))