#!/usr/bin/env python3
import configparser
import time
import wave
from datetime import datetime

import jax.numpy as jnp
import pyaudio
from loguru import logger
from pynput import keyboard
from termcolor import colored
from whisper_jax import FlaxWhisperPipline

from utils.file_utilities import upload_files
from utils.text_utilities import summarize, post_process_transcription
from utils.viz_utilities import create_wordcloud, create_talk_diff_scatter_viz

config = configparser.ConfigParser()
config.read('config.ini')

WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"]

# Audio capture parameters.
FRAMES_PER_BUFFER = 8000
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 96000
RECORD_SECONDS = 15

NOW = datetime.now()


def main():
    p = pyaudio.PyAudio()

    # Locate the BlackHole aggregate input device by name.
    AUDIO_DEVICE_ID = -1
    for i in range(p.get_device_count()):
        if p.get_device_info_by_index(i)["name"] == config["DEFAULT"]["BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME"]:
            AUDIO_DEVICE_ID = i
    if AUDIO_DEVICE_ID == -1:
        raise RuntimeError(
            "Audio input device not found: "
            + config["DEFAULT"]["BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME"])
    audio_devices = p.get_device_info_by_index(AUDIO_DEVICE_ID)

    stream = p.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        frames_per_buffer=FRAMES_PER_BUFFER,
        input_device_index=int(audio_devices['index'])
    )

    pipeline = FlaxWhisperPipline(
        "openai/whisper-" + config["DEFAULT"]["WHISPER_REAL_TIME_MODEL_SIZE"],
        dtype=jnp.float16,
        batch_size=16)

    transcription = ""
    TEMP_AUDIO_FILE = "temp_audio.wav"

    global proceed
    proceed = True

    def on_press(key):
        # Stop the capture loop when Esc is pressed.
        if key == keyboard.Key.esc:
            global proceed
            proceed = False

    transcript_with_timestamp = {"text": "", "chunks": []}
    last_transcribed_time = 0.0

    listener = keyboard.Listener(on_press=on_press)
    listener.start()

    print("Attempting real-time transcription... Listening...")
    try:
        while proceed:
            # Record RECORD_SECONDS of audio into an in-memory frame list.
            frames = []
            start_time = time.time()
            for i in range(0, int(RATE / FRAMES_PER_BUFFER * RECORD_SECONDS)):
                data = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False)
                frames.append(data)
            end_time = time.time()

            # Write the chunk to a temporary WAV file for the pipeline.
            wf = wave.open(TEMP_AUDIO_FILE, 'wb')
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(p.get_sample_size(FORMAT))
            wf.setframerate(RATE)
            wf.writeframes(b''.join(frames))
            wf.close()

            whisper_result = pipeline(TEMP_AUDIO_FILE, return_timestamps=True)

            timestamp = whisper_result["chunks"][0]["timestamp"]
            start = timestamp[0]
            end = timestamp[1]
            if end is None:
                # Whisper sometimes omits the end timestamp; fall back to the
                # nominal chunk length.
                end = start + 15.0
            duration = end - start
            item = {'timestamp': (last_transcribed_time, last_transcribed_time + duration),
                    'text': whisper_result['text'],
                    'stats': (str(end_time - start_time), str(duration))}
            last_transcribed_time = last_transcribed_time + duration
            transcript_with_timestamp["chunks"].append(item)
            transcription += whisper_result['text']

            print(colored("", "yellow"))
            print(colored(whisper_result['text'], 'green'))
            print(colored(" Recorded duration: " + str(end_time - start_time)
                          + " | Transcribed duration: " + str(duration), "yellow"))
    except Exception as e:
        print(e)
    finally:
        suffix = NOW.strftime("%m-%d-%Y_%H:%M:%S")

        with open("real_time_transcript_" + suffix + ".txt", "w") as f:
            f.write(transcription)
        with open("real_time_transcript_with_timestamp_" + suffix + ".txt", "w") as f:
            transcript_with_timestamp["text"] = transcription
            f.write(str(transcript_with_timestamp))

        transcript_with_timestamp = post_process_transcription(transcript_with_timestamp)

        logger.info("Creating word cloud")
        create_wordcloud(NOW, True)

        logger.info("Performing talk-diff and talk-diff visualization")
        create_talk_diff_scatter_viz(NOW, True)

        # S3: push artefacts to the S3 bucket.
        files_to_upload = ["real_time_transcript_" + suffix + ".txt",
                           "real_time_transcript_with_timestamp_" + suffix + ".txt",
                           "real_time_df_" + suffix + ".pkl",
                           "real_time_wordcloud_" + suffix + ".png",
                           "real_time_mappings_" + suffix + ".pkl",
                           "real_time_scatter_" + suffix + ".html"]
        upload_files(files_to_upload)

        # Summarization takes a lot of time, so do this separately at the end.
        summarize(transcript_with_timestamp["text"], NOW, True, True)
        logger.info("Summarization completed")
        files_to_upload = ["real_time_summary_" + suffix + ".txt"]
        upload_files(files_to_upload)


if __name__ == "__main__":
    main()