Push whisper-jax real time trial code

2026-04-26 07:05:19 +00:00 · 2023-06-09 16:02:07 +05:30
parent f13d912c84
commit 2d44024137
3 changed files with 93 additions and 3 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -160,6 +160,8 @@ cython_debug/
 #.idea/
 *.mp4
-*.txt
+summary.txt
-config.ini
+transcript.txt
-test_samples/
+*.ini
 test_samples/
 *.wav
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,4 @@
 pyaudio==0.2.13
 keyboard==0.13.5
 pynput==1.7.6
 wave==0.0.2
--- a/whisjax_realtime_trial.py
+++ b/whisjax_realtime_trial.py
@@ -0,0 +1,84 @@
 #!/usr/bin/env python3
 import configparser
 import pyaudio
 from whisper_jax import FlaxWhisperPipline
 from pynput import keyboard
 import jax.numpy as jnp
 import wave
 config = configparser.ConfigParser()
 config.read('config.ini')
 WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"]
 OPENAI_APIKEY = config['DEFAULT']["OPENAI_APIKEY"]
 FRAMES_PER_BUFFER = 8000
 FORMAT = pyaudio.paInt16
 CHANNELS = 1
 RATE = 44100
 RECORD_SECONDS = 10
 def main():
    p = pyaudio.PyAudio()
    stream = p.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        frames_per_buffer=FRAMES_PER_BUFFER
    )
    pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE,
                                  dtype=jnp.float16,
                                  batch_size=16)
    transcript_file = open("transcript.txt", "w+")
    transcription = ""
    TEMP_AUDIO_FILE = "temp_audio.wav"
    global proceed
    proceed = True
    def on_press(key):
        if key == keyboard.Key.esc:
            global proceed
            proceed = False
    listener = keyboard.Listener(on_press=on_press)
    listener.start()
    while proceed:
        try:
            frames = []
            for i in range(0, int(RATE / FRAMES_PER_BUFFER * RECORD_SECONDS)):
                data = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False)
                frames.append(data)
            print("Collected Input", len(frames))
            wf = wave.open(TEMP_AUDIO_FILE, 'wb')
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(p.get_sample_size(FORMAT))
            wf.setframerate(RATE)
            wf.writeframes(b''.join(frames))
            wf.close()
            whisper_result = pipeline(TEMP_AUDIO_FILE, return_timestamps=True)
            print(whisper_result['text'])
            transcription += whisper_result['text']
            if len(transcription) > 10:
                transcription += "\n"
                transcript_file.write(transcription)
                transcription = ""
        except Exception as e:
            print(e)
            break
 if __name__ == "__main__":
    main()