From 2cf61b191fe9f7b2162d07c35f88631938f39a47 Mon Sep 17 00:00:00 2001 From: Sara Date: Mon, 9 Oct 2023 15:52:11 +0200 Subject: [PATCH] fix waveform generation --- server/reflector/utils/audio_waveform.py | 17 ++++++++++------- server/reflector/views/transcripts.py | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/server/reflector/utils/audio_waveform.py b/server/reflector/utils/audio_waveform.py index f31e1748..7a6fdb3e 100644 --- a/server/reflector/utils/audio_waveform.py +++ b/server/reflector/utils/audio_waveform.py @@ -9,11 +9,12 @@ def get_audio_waveform(path: Path | str, segments_count: int = 256) -> list[int] path = path.as_posix() container = av.open(path) - stream = container.streams.get(audio=0)[0] + stream = container.streams.audio[0] duration = container.duration / av.time_base chunk_size_secs = duration / segments_count chunk_size = int(chunk_size_secs * stream.rate * stream.channels) + if chunk_size == 0: # there is not enough data to fill the chunks # so basically we use chunk_size of 1. @@ -22,7 +23,7 @@ def get_audio_waveform(path: Path | str, segments_count: int = 256) -> list[int] # 1.1 is a safety margin as it seems that pyav decode # does not always return the exact number of chunks # that we expect. - volumes = np.zeros(int(segments_count * 1.1), dtype=int) + volumes = np.zeros(int(segments_count * 1.1), dtype=float) current_chunk_idx = 0 current_chunk_size = 0 current_chunk_volume = 0 @@ -35,7 +36,6 @@ def get_audio_waveform(path: Path | str, segments_count: int = 256) -> list[int] count += len(data) frames += 1 samples += frame.samples - while len(data) > 0: datalen = len(data) @@ -53,13 +53,16 @@ def get_audio_waveform(path: Path | str, segments_count: int = 256) -> list[int] current_chunk_idx += 1 current_chunk_size = 0 current_chunk_volume = 0 - volumes = volumes[:current_chunk_idx] - # normalize the volumes 0-128 - volumes = volumes * 128 / volumes.max() + # number of decimals to use when rounding the peak value + digits = 2 + max_val = float(max(volumes)) + new_volumes = [] + for x in volumes: + new_volumes.append(round(x / max_val, digits)) - return volumes.astype("uint8").tolist() + return new_volumes if __name__ == "__main__": diff --git a/server/reflector/views/transcripts.py b/server/reflector/views/transcripts.py index 410839d7..f7ab0f40 100644 --- a/server/reflector/views/transcripts.py +++ b/server/reflector/views/transcripts.py @@ -41,7 +41,7 @@ def generate_transcript_name(): class AudioWaveform(BaseModel): - data: list[int] + data: list[float] class TranscriptText(BaseModel):