Files
reflector/server/reflector/processors/audio_transcript_modal.py
Mathieu Virbel 9265d201b5 fix: restore previous behavior on live pipeline + audio downscaler (#561)
This commit restores the original behavior with frame cutting. While
Silero is used on our GPU for files, it does not seem to work well on
the live pipeline. That still has to be investigated, but for the moment,
what we keep is:

- refactored to extract the downscale step for further processing in the
pipeline (a rough sketch follows below)
- removed any downscale implementation from audio_chunker and audio_merge
- removed batching from audio_merge as well, for now
2025-08-22 10:49:26 -06:00
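The extracted downscale step itself does not appear on this page. Purely as a rough idea, a standalone resampler could look like the sketch below; the function name, the float mono PCM input, and the use of scipy are assumptions, not details from the commit.

import numpy as np
from scipy.signal import resample_poly


def downscale(samples: np.ndarray, src_rate: int = 48000, dst_rate: int = 16000) -> np.ndarray:
    # Hypothetical helper, not from the reflector codebase.
    # 48 kHz -> 16 kHz reduces to a clean 1:3 polyphase decimation,
    # which matches the input rate most ASR models expect.
    return resample_poly(samples.astype(np.float32), up=dst_rate, down=src_rate)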

71 lines
2.2 KiB
Python

"""
Implementation using the GPU service from modal.com
API will be a POST request to TRANSCRIPT_URL:
```form
"timestamp": 123.456
"source_language": "en"
"target_language": "en"
"file": <audio file>
```
"""
from openai import AsyncOpenAI

from reflector.processors.audio_transcript import AudioTranscriptProcessor
from reflector.processors.audio_transcript_auto import AudioTranscriptAutoProcessor
from reflector.processors.types import AudioFile, Transcript, Word
from reflector.settings import settings


class AudioTranscriptModalProcessor(AudioTranscriptProcessor):
    def __init__(
        self,
        modal_api_key: str | None = None,
        **kwargs,
    ):
        super().__init__()
        if not settings.TRANSCRIPT_URL:
            raise Exception(
                "TRANSCRIPT_URL required to use AudioTranscriptModalProcessor"
            )
        # The Modal service exposes an OpenAI-compatible API under /v1.
        self.transcript_url = settings.TRANSCRIPT_URL + "/v1"
        self.timeout = settings.TRANSCRIPT_TIMEOUT
        self.modal_api_key = modal_api_key

    async def _transcript(self, data: AudioFile):
        async with AsyncOpenAI(
            base_url=self.transcript_url,
            api_key=self.modal_api_key,
            timeout=self.timeout,
        ) as client:
            self.logger.debug(f"Trying to transcribe audio {data.name}")
            # Open the audio in a context manager so the handle is closed
            # after the upload (the original opened it without closing).
            with open(data.path, "rb") as audio_file:
                transcription = await client.audio.transcriptions.create(
                    file=audio_file,
                    model="whisper-1",
                    response_format="verbose_json",
                    language=self.get_pref("audio:source_language", "en"),
                    timestamp_granularities=["word"],
                )
            self.logger.debug(f"Transcription: {transcription}")
            transcript = Transcript(
                words=[
                    Word(
                        text=word.word,
                        start=word.start,
                        end=word.end,
                    )
                    for word in transcription.words
                ],
            )
            # Shift word timings from chunk-relative to recording-relative.
            transcript.add_offset(data.timestamp)
            return transcript


AudioTranscriptAutoProcessor.register("modal", AudioTranscriptModalProcessor)
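For context on the add_offset call near the end of the file, here is a minimal illustration of the intended timestamp shift. It assumes add_offset adds the chunk's absolute start time to every word's start and end; that behavior is inferred from the call site, not verified against reflector.processors.types.

from reflector.processors.types import Transcript, Word

# Hypothetical usage: a word detected at 0.0-0.4 s inside a chunk that
# began 12 s into the recording.
t = Transcript(words=[Word(text="hello", start=0.0, end=0.4)])
t.add_offset(12.0)
# The word should now span 12.0-12.4 s on the recording's global timeline.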