mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-20 20:29:06 +00:00
fix waveform generation
This commit is contained in:
@@ -9,11 +9,12 @@ def get_audio_waveform(path: Path | str, segments_count: int = 256) -> list[int]
|
|||||||
path = path.as_posix()
|
path = path.as_posix()
|
||||||
|
|
||||||
container = av.open(path)
|
container = av.open(path)
|
||||||
stream = container.streams.get(audio=0)[0]
|
stream = container.streams.audio[0]
|
||||||
duration = container.duration / av.time_base
|
duration = container.duration / av.time_base
|
||||||
|
|
||||||
chunk_size_secs = duration / segments_count
|
chunk_size_secs = duration / segments_count
|
||||||
chunk_size = int(chunk_size_secs * stream.rate * stream.channels)
|
chunk_size = int(chunk_size_secs * stream.rate * stream.channels)
|
||||||
|
|
||||||
if chunk_size == 0:
|
if chunk_size == 0:
|
||||||
# there is not enough data to fill the chunks
|
# there is not enough data to fill the chunks
|
||||||
# so basically we use chunk_size of 1.
|
# so basically we use chunk_size of 1.
|
||||||
@@ -22,7 +23,7 @@ def get_audio_waveform(path: Path | str, segments_count: int = 256) -> list[int]
|
|||||||
# 1.1 is a safety margin as it seems that pyav decode
|
# 1.1 is a safety margin as it seems that pyav decode
|
||||||
# does not always return the exact number of chunks
|
# does not always return the exact number of chunks
|
||||||
# that we expect.
|
# that we expect.
|
||||||
volumes = np.zeros(int(segments_count * 1.1), dtype=int)
|
volumes = np.zeros(int(segments_count * 1.1), dtype=float)
|
||||||
current_chunk_idx = 0
|
current_chunk_idx = 0
|
||||||
current_chunk_size = 0
|
current_chunk_size = 0
|
||||||
current_chunk_volume = 0
|
current_chunk_volume = 0
|
||||||
@@ -35,7 +36,6 @@ def get_audio_waveform(path: Path | str, segments_count: int = 256) -> list[int]
|
|||||||
count += len(data)
|
count += len(data)
|
||||||
frames += 1
|
frames += 1
|
||||||
samples += frame.samples
|
samples += frame.samples
|
||||||
|
|
||||||
while len(data) > 0:
|
while len(data) > 0:
|
||||||
datalen = len(data)
|
datalen = len(data)
|
||||||
|
|
||||||
@@ -53,13 +53,16 @@ def get_audio_waveform(path: Path | str, segments_count: int = 256) -> list[int]
|
|||||||
current_chunk_idx += 1
|
current_chunk_idx += 1
|
||||||
current_chunk_size = 0
|
current_chunk_size = 0
|
||||||
current_chunk_volume = 0
|
current_chunk_volume = 0
|
||||||
|
|
||||||
volumes = volumes[:current_chunk_idx]
|
volumes = volumes[:current_chunk_idx]
|
||||||
|
|
||||||
# normalize the volumes 0-128
|
# number of decimals to use when rounding the peak value
|
||||||
volumes = volumes * 128 / volumes.max()
|
digits = 2
|
||||||
|
max_val = float(max(volumes))
|
||||||
|
new_volumes = []
|
||||||
|
for x in volumes:
|
||||||
|
new_volumes.append(round(x / max_val, digits))
|
||||||
|
|
||||||
return volumes.astype("uint8").tolist()
|
return new_volumes
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ def generate_transcript_name():
|
|||||||
|
|
||||||
|
|
||||||
class AudioWaveform(BaseModel):
|
class AudioWaveform(BaseModel):
|
||||||
data: list[int]
|
data: list[float]
|
||||||
|
|
||||||
|
|
||||||
class TranscriptText(BaseModel):
|
class TranscriptText(BaseModel):
|
||||||
|
|||||||
Reference in New Issue
Block a user