Mirror of https://github.com/Monadical-SAS/reflector.git (synced 2025-12-20 20:29:06 +00:00)
Compare commits (1 commit): v0.17.0...mathieu/fi

Commit: 5aed513c47
```diff
@@ -11,7 +11,10 @@ from reflector.processors.audio_chunker_auto import AudioChunkerAutoProcessor
 
 class AudioChunkerSileroProcessor(AudioChunkerProcessor):
     """
-    Assemble audio frames into chunks with VAD-based speech detection using Silero VAD
+    Assemble audio frames into chunks with VAD-based speech detection using Silero VAD.
+
+    Expects input audio to be already downscaled to 16kHz mono s16 format
+    (handled by AudioDownscaleProcessor in the pipeline).
     """
 
     def __init__(
```
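The expanded docstring makes the pipeline contract explicit: frames arriving here are already 16kHz mono s16. AudioDownscaleProcessor itself is not part of this diff, so the following is only a hedged sketch of what such a step could look like with PyAV's `AudioResampler` (the exact return shape of `resample()` varies across PyAV versions):

```python
# Hypothetical sketch of the upstream downscale step the docstring relies on.
# AudioDownscaleProcessor's actual implementation is not shown in this diff;
# this only illustrates producing 16kHz mono s16 frames with PyAV.
import av

resampler = av.AudioResampler(format="s16", layout="mono", rate=16000)

def downscale(frame: av.AudioFrame) -> list[av.AudioFrame]:
    # Recent PyAV versions return a list of resampled frames
    return resampler.resample(frame)
```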
```diff
@@ -31,12 +34,13 @@ class AudioChunkerSileroProcessor(AudioChunkerProcessor):
         self._init_vad(use_onnx)
 
     def _init_vad(self, use_onnx=False):
-        """Initialize Silero VAD model"""
+        """Initialize Silero VAD model for 16kHz audio"""
         try:
             torch.set_num_threads(1)
             self.vad_model = load_silero_vad(onnx=use_onnx)
+            # VAD expects 16kHz audio (guaranteed by AudioDownscaleProcessor)
             self.vad_iterator = VADIterator(self.vad_model, sampling_rate=16000)
-            self.logger.info("Silero VAD initialized successfully")
+            self.logger.info("Silero VAD initialized for 16kHz audio")
 
         except Exception as e:
             self.logger.error(f"Failed to initialize Silero VAD: {e}")
```
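For context on the calls in this hunk, here is roughly how `load_silero_vad` and `VADIterator` behave standalone, assuming the pip-distributed `silero-vad` package that the imports point to; the 512-sample window is the size Silero VAD documents for 16kHz input.

```python
# Minimal standalone sketch of the silero-vad API used in the diff above.
# Assumes the `silero-vad` pip package; the 512-sample window is what
# Silero VAD documents for 16kHz input.
import numpy as np
import torch
from silero_vad import load_silero_vad, VADIterator

torch.set_num_threads(1)
model = load_silero_vad(onnx=False)
vad_iterator = VADIterator(model, sampling_rate=16000)

audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16kHz
window = 512
for i in range(0, len(audio) - window + 1, window):
    chunk = torch.from_numpy(audio[i : i + window])
    event = vad_iterator(chunk)
    if event is not None:
        print(event)  # e.g. {'start': sample_index} or {'end': sample_index}
vad_iterator.reset_states()
```

The `sampling_rate=16000` fixed at construction is why the pipeline-level guarantee matters: the iterator has no way to detect mis-sampled input on its own.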
```diff
@@ -75,7 +79,7 @@ class AudioChunkerSileroProcessor(AudioChunkerProcessor):
             return None
 
         # Processing block with current buffer size
-        print(f"Processing block: {len(self.frames)} frames in buffer")
+        # print(f"Processing block: {len(self.frames)} frames in buffer")
 
         try:
             # Convert frames to numpy array for VAD
```
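The commit silences the `print` by commenting it out rather than deleting it. A common alternative, sketched below with the stdlib `logging` module (whether `self.logger` here is a stdlib logger is an assumption), is a debug-level message that stays quiet at the default log level but can be re-enabled without editing code.

```python
# Sketch only: a debug-level log in place of the commented-out print.
# The logger name is hypothetical; reflector's actual logger setup is
# not shown in this diff.
import logging

logger = logging.getLogger("reflector.processors.audio_chunker_silero")
frames_in_buffer = 42  # stand-in for len(self.frames)
logger.debug("Processing block: %d frames in buffer", frames_in_buffer)
```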
```diff
@@ -189,38 +193,29 @@ class AudioChunkerSileroProcessor(AudioChunkerProcessor):
             return None
 
     def _frames_to_numpy(self, frames: list[av.AudioFrame]) -> Optional[np.ndarray]:
-        """Convert av.AudioFrame list to numpy array for VAD processing"""
+        """Convert av.AudioFrame list to numpy array for VAD processing
+
+        Input frames are already 16kHz mono s16 format from AudioDownscaleProcessor.
+        Only need to convert s16 to float32 for Silero VAD.
+        """
         if not frames:
             return None
 
         try:
-            audio_data = []
-            for frame in frames:
-                frame_array = frame.to_ndarray()
-
-                if len(frame_array.shape) == 2:
-                    frame_array = frame_array.flatten()
-
-                audio_data.append(frame_array)
-
-            if not audio_data:
+            # Concatenate all frame arrays
+            audio_arrays = [frame.to_ndarray().flatten() for frame in frames]
+            if not audio_arrays:
                 return None
 
-            combined_audio = np.concatenate(audio_data)
+            combined_audio = np.concatenate(audio_arrays)
 
-            # Ensure float32 format
-            if combined_audio.dtype == np.int16:
-                # Normalize int16 audio to float32 in range [-1.0, 1.0]
-                combined_audio = combined_audio.astype(np.float32) / 32768.0
-            elif combined_audio.dtype != np.float32:
-                combined_audio = combined_audio.astype(np.float32)
-
-            return combined_audio
+            # Convert s16 to float32 (Silero VAD requires float32 in range [-1.0, 1.0])
+            # Input is guaranteed to be s16 from AudioDownscaleProcessor
+            return combined_audio.astype(np.float32) / 32768.0
 
         except Exception as e:
             self.logger.error(f"Error converting frames to numpy: {e}")
             return None
 
     def _find_speech_segment_end(self, audio_array: np.ndarray) -> Optional[int]:
         """Find complete speech segments and return frame index at segment end"""
```
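With the dtype branching gone, correctness rests on one line of arithmetic: dividing s16 samples by 32768.0 maps the full int16 range into [-1.0, 1.0), the float32 range Silero VAD expects. A quick numpy spot-check:

```python
# Spot-check of the s16 -> float32 normalization kept in _frames_to_numpy:
# int16 spans [-32768, 32767], so dividing by 32768.0 yields [-1.0, 1.0).
import numpy as np

s16 = np.array([-32768, -1, 0, 1, 32767], dtype=np.int16)
f32 = s16.astype(np.float32) / 32768.0
assert f32.dtype == np.float32
assert f32.min() == -1.0 and f32.max() < 1.0
print(f32)  # [-1.0, -3.0517578e-05, 0.0, 3.0517578e-05, 0.99996948]
```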
|
|||||||
Reference in New Issue
Block a user