Mirror of https://github.com/Monadical-SAS/reflector.git (synced 2025-12-21 04:39:06 +00:00)
Update feature for real-time transcription locally
README.md (35 lines changed)
@@ -110,12 +110,35 @@ This is a jupyter notebook playground with template instructions on handling the
 pipeline. Follow the instructions given and tweak your own logic into it, or use it as a playground to experiment with libraries and
 visualizations on top of the metadata.
 
+**WHISPER-JAX REALTIME TRANSCRIPTION PIPELINE:**
+
+We also support real-time transcription using the whisper-jax pipeline, but there are
+a few prerequisites before you run it on your local machine. The instructions below are for
+configuring it on macOS.
+
+We need a way to route both the audio of an application opened via the browser (e.g. "Whereby") and the audio from the local
+microphone input you will be speaking into. We use [Blackhole](https://github.com/ExistentialAudio/BlackHole).
+
+1) Install Blackhole-2ch (2 channels are enough) using one of the two options listed.
+2) Set up an [Aggregate device](https://github.com/ExistentialAudio/BlackHole/wiki/Aggregate-Device) to route web audio and
+local microphone input. Be sure to mirror the settings shown in `images/aggregate_input.png` (including the name).
+3) Set up a [Multi-Output device](https://github.com/ExistentialAudio/BlackHole/wiki/Multi-Output-Device). Refer to
+`images/multi-output.png`.
+
+From the reflector root folder, run ```python3 whisjax_realtime_trial.py```
+
+**Permissions:**
+
+You may have to grant your terminal/code editor microphone access to record audio, and add it under
+```System Preferences -> Privacy & Security -> Accessibility``` as well.
+
 NEXT STEPS:
 
-1) Run this demo on a local Mac M1 to test flow and observe the performance
-2) Create a pipeline using a microphone to listen to audio chunks to perform transcription realtime (and also efficiently
-summarize it as well) -> *done as part of whisjax_realtime_trial.py*
-3) Create a RunPod setup for this feature (mentioned in 1 & 2) and test it end-to-end
-4) Perform Speaker Diarization using Whisper-JAX
-
-5) Based on the feasibility of the above points, explore suitable visualizations for transcription & summarization.
+1) Create a RunPod setup for this feature (mentioned in 1 & 2) and test it end-to-end
+2) Perform Speaker Diarization using Whisper-JAX
+3) Based on the feasibility of the above points, explore suitable visualizations for transcription & summarization.
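Before running the script, it helps to confirm that the aggregate device from step 2 is visible to PyAudio under the name the script expects ("ref-agg-input", per the code change further down). A minimal sanity-check sketch, assuming pyaudio is installed:

```python
# Quick sanity check: list the input devices PyAudio can see, so you can
# confirm the Blackhole aggregate device ("ref-agg-input") exists before
# running whisjax_realtime_trial.py.
import pyaudio

p = pyaudio.PyAudio()
for i in range(p.get_device_count()):
    info = p.get_device_info_by_index(i)
    if info["maxInputChannels"] > 0:
        print(i, info["name"], "inputs:", info["maxInputChannels"])
p.terminate()
```

If "ref-agg-input" does not appear in the output, revisit the Aggregate-device setup before launching the script.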
@@ -5,6 +5,7 @@ KMP_DUPLICATE_LIB_OK=TRUE
 OPENAI_APIKEY=
 # Export Whisper Model Size
 WHISPER_MODEL_SIZE=medium
+WHISPER_REAL_TIME_MODEL_SIZE=medium
 # AWS config
 AWS_ACCESS_KEY=***REMOVED***
 AWS_SECRET_KEY=***REMOVED***
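The whisjax_realtime_trial.py diff below reads these values as `config['DEFAULT'][...]`, which matches Python's configparser interface. A minimal sketch of that pattern; the config filename and the `[DEFAULT]` section header are assumptions here, not confirmed by the diff:

```python
# Sketch: load the model sizes the way the script's config['DEFAULT'][...]
# lookups imply. "config.ini" and the [DEFAULT] section header are
# assumptions; match them to the repo's actual config file.
import configparser

config = configparser.ConfigParser()
config.read("config.ini")

whisper_model = config["DEFAULT"]["WHISPER_MODEL_SIZE"]              # batch pipeline
realtime_model = config["DEFAULT"]["WHISPER_REAL_TIME_MODEL_SIZE"]   # new real-time pipeline

print(f"batch: whisper-{whisper_model}, real-time: whisper-{realtime_model}")
```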
BIN  images/aggregate_input.png (new file, 124 KiB; binary not shown)
BIN  images/multi-output.png (new file, 113 KiB; binary not shown)
@@ -14,23 +14,28 @@ WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"]
 
 FRAMES_PER_BUFFER = 8000
 FORMAT = pyaudio.paInt16
-CHANNELS = 1
+CHANNELS = 2
 RATE = 44100
-RECORD_SECONDS = 5
+RECORD_SECONDS = 15
 
 
 def main():
     p = pyaudio.PyAudio()
+
+    AUDIO_DEVICE_ID = -1
+    for i in range(p.get_device_count()):
+        if p.get_device_info_by_index(i)["name"] == "ref-agg-input":
+            AUDIO_DEVICE_ID = i
+    audio_devices = p.get_device_info_by_index(AUDIO_DEVICE_ID)
     stream = p.open(
         format=FORMAT,
         channels=CHANNELS,
         rate=RATE,
         input=True,
-        frames_per_buffer=FRAMES_PER_BUFFER
+        frames_per_buffer=FRAMES_PER_BUFFER,
+        input_device_index=audio_devices['index']
     )
 
-    pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE,
+    pipeline = FlaxWhisperPipline("openai/whisper-" + config["DEFAULT"]["WHISPER_REAL_TIME_MODEL_SIZE"],
                                   dtype=jnp.float16,
                                   batch_size=16)
 
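One caveat in the lookup added above: if no device named "ref-agg-input" exists, AUDIO_DEVICE_ID stays -1 and the subsequent get_device_info_by_index call fails. A more defensive variant, as a sketch (falling back to the system default input device is an assumption for illustration, not what the script does):

```python
# Sketch of a more defensive device lookup than the loop in the diff.
import pyaudio

def find_input_device(p: pyaudio.PyAudio, name: str = "ref-agg-input") -> int:
    """Return the index of the named input device, or the default input."""
    for i in range(p.get_device_count()):
        info = p.get_device_info_by_index(i)
        if info["name"] == name and info["maxInputChannels"] > 0:
            return i
    # Fall back to the default input device rather than indexing with -1.
    return p.get_default_input_device_info()["index"]

p = pyaudio.PyAudio()
device_index = find_input_device(p)
print("Recording from:", p.get_device_info_by_index(device_index)["name"])
```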
@@ -48,8 +53,7 @@ def main():
 
     listener = keyboard.Listener(on_press=on_press)
     listener.start()
-    print("Listening...")
+    print("Attempting real-time transcription.. Listening...")
 
     while proceed:
         try:
             frames = []
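The diff is truncated inside the `while proceed:` loop. For orientation, here is a minimal sketch of how such a chunked record-and-transcribe loop can continue, using the constants from the diff above; the frame handling and the pipeline input format are assumptions, not the file's verbatim code:

```python
# Sketch only: one plausible continuation of the truncated loop, assuming
# the constants from the diff (paInt16, CHANNELS=2, RATE=44100,
# RECORD_SECONDS=15). `stream` and `pipeline` come from main() above.
import numpy as np

FRAMES_PER_BUFFER = 8000
CHANNELS = 2
RATE = 44100
RECORD_SECONDS = 15

def transcribe_forever(stream, pipeline):
    """Read RECORD_SECONDS-long chunks from `stream` and transcribe each."""
    while True:
        frames = []
        # Collect RECORD_SECONDS worth of audio in FRAMES_PER_BUFFER chunks.
        for _ in range(int(RATE / FRAMES_PER_BUFFER * RECORD_SECONDS)):
            frames.append(stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False))
        # int16 interleaved stereo -> mono float32 in [-1, 1].
        audio = np.frombuffer(b"".join(frames), dtype=np.int16)
        audio = audio.reshape(-1, CHANNELS).mean(axis=1).astype(np.float32) / 32768.0
        # whisper-jax pipelines accept {"array", "sampling_rate"} inputs.
        result = pipeline({"array": audio, "sampling_rate": RATE})
        print(result["text"])
```

Averaging the two interleaved channels down to mono keeps the aggregate device's browser and microphone audio in a single stream for Whisper.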