diff --git a/server/gpu/modal_deployments/reflector_transcriber.py b/server/gpu/modal_deployments/reflector_transcriber.py index 5b7cb351..bde22339 100644 --- a/server/gpu/modal_deployments/reflector_transcriber.py +++ b/server/gpu/modal_deployments/reflector_transcriber.py @@ -1,89 +1,64 @@ -""" -Reflector GPU backend - transcriber -=================================== -""" - import os import tempfile import threading -from modal import Image, Secret, App, asgi_app, method, enter +import modal from pydantic import BaseModel -# Whisper -WHISPER_MODEL: str = "large-v2" -WHISPER_COMPUTE_TYPE: str = "float16" -WHISPER_NUM_WORKERS: int = 1 +MODELS_DIR = "/models" + +MODEL_NAME = "large-v2" +MODEL_COMPUTE_TYPE: str = "float16" +MODEL_NUM_WORKERS: int = 1 + +MINUTES = 60 # seconds + +volume = modal.Volume.from_name("models", create_if_missing=True) + +app = modal.App("reflector-transcriber") -WHISPER_MODEL_DIR = "/root/transcription_models" +def download_model(): + from faster_whisper import download_model -app = App(name="reflector-transcriber") + volume.reload() + + download_model(MODEL_NAME, cache_dir=MODELS_DIR) + + volume.commit() -def download_whisper(): - from faster_whisper.utils import download_model - - print("Downloading Whisper model") - download_model(WHISPER_MODEL, cache_dir=WHISPER_MODEL_DIR) - print("Whisper model downloaded") - - -def migrate_cache_llm(): - """ - XXX The cache for model files in Transformers v4.22.0 has been updated. - Migrating your old cache. This is a one-time only operation. You can - interrupt this and resume the migration later on by calling - `transformers.utils.move_cache()`. - """ - from transformers.utils.hub import move_cache - - print("Moving LLM cache") - move_cache(cache_dir=WHISPER_MODEL_DIR, new_cache_dir=WHISPER_MODEL_DIR) - print("LLM cache moved") - - -transcriber_image = ( - Image.debian_slim(python_version="3.10.8") - .apt_install("git") - .apt_install("wget") - .apt_install("libsndfile-dev") +image = ( + modal.Image.debian_slim(python_version="3.12") .pip_install( - "faster-whisper", - "requests", - "torch", - "transformers==4.34.0", - "sentencepiece", - "protobuf", - "huggingface_hub==0.16.4", - "gitpython", - "torchaudio", - "fairseq2", - "pyyaml", - "hf-transfer~=0.1" + "huggingface_hub==0.27.1", + "hf-transfer==0.1.9", + "torch==2.5.1", + "faster-whisper==1.1.1", ) - .run_function(download_whisper) - .run_function(migrate_cache_llm) .env( { + "HF_HUB_ENABLE_HF_TRANSFER": "1", "LD_LIBRARY_PATH": ( - "/usr/local/lib/python3.10/site-packages/nvidia/cudnn/lib/:" - "/opt/conda/lib/python3.10/site-packages/nvidia/cublas/lib/" - ) + "/usr/local/lib/python3.12/site-packages/nvidia/cudnn/lib/:" + "/opt/conda/lib/python3.12/site-packages/nvidia/cublas/lib/" + ), } ) + .run_function(download_model, volumes={MODELS_DIR: volume}) ) @app.cls( gpu="A10G", - timeout=60 * 5, - container_idle_timeout=60 * 5, + timeout=5 * MINUTES, + container_idle_timeout=5 * MINUTES, allow_concurrent_inputs=6, - image=transcriber_image, + image=image, + volumes={MODELS_DIR: volume}, ) class Transcriber: - @enter() + @modal.enter() def enter(self): import faster_whisper import torch @@ -92,21 +67,20 @@ class Transcriber: self.use_gpu = torch.cuda.is_available() self.device = "cuda" if self.use_gpu else "cpu" self.model = faster_whisper.WhisperModel( - WHISPER_MODEL, + MODEL_NAME, device=self.device, - compute_type=WHISPER_COMPUTE_TYPE, - num_workers=WHISPER_NUM_WORKERS, - download_root=WHISPER_MODEL_DIR, - local_files_only=True + compute_type=MODEL_COMPUTE_TYPE, + num_workers=MODEL_NUM_WORKERS, + download_root=MODELS_DIR, + local_files_only=True, ) - @method() + @modal.method() def transcribe_segment( self, audio_data: str, audio_suffix: str, - source_language: str, - timestamp: float = 0 + language: str, ): with tempfile.NamedTemporaryFile("wb+", suffix=f".{audio_suffix}") as fp: fp.write(audio_data) @@ -114,40 +88,23 @@ class Transcriber: with self.lock: segments, _ = self.model.transcribe( fp.name, - language=source_language, + language=language, beam_size=5, word_timestamps=True, vad_filter=True, vad_parameters={"min_silence_duration_ms": 500}, ) - multilingual_transcript = {} - transcript_source_lang = "" + text = "" words = [] - if segments: - segments = list(segments) + for segment in segments: + text += segment.text + words.extend( + {"word": word.word, "start": word.start, "end": word.end} + for word in segment.words + ) - for segment in segments: - transcript_source_lang += segment.text - for word in segment.words: - words.append( - { - "text": word.word, - "start": round(timestamp + word.start, 3), - "end": round(timestamp + word.end, 3), - } - ) - - multilingual_transcript[source_language] = transcript_source_lang - - return { - "text": multilingual_transcript, - "words": words - } - -# ------------------------------------------------------------------- -# Web API -# ------------------------------------------------------------------- + return {"text": text, "words": words} @app.function( @@ -155,21 +112,23 @@ class Transcriber: timeout=60, allow_concurrent_inputs=40, secrets=[ - Secret.from_name("reflector-gpu"), + modal.Secret.from_name("reflector-gpu"), ], + volumes={MODELS_DIR: volume}, ) -@asgi_app() +@modal.asgi_app() def web(): from fastapi import Body, Depends, FastAPI, HTTPException, UploadFile, status from fastapi.security import OAuth2PasswordBearer from typing_extensions import Annotated - transcriberstub = Transcriber() + transcriber = Transcriber() app = FastAPI() oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token") - supported_audio_file_types = ["wav", "mp3", "ogg", "flac"] + + supported_file_types = ["mp3", "mp4", "mpeg", "mpga", "m4a", "wav", "webm"] def apikey_auth(apikey: str = Depends(oauth2_scheme)): if apikey != os.environ["REFLECTOR_GPU_APIKEY"]: @@ -182,21 +141,20 @@ def web(): class TranscriptResponse(BaseModel): result: dict - @app.post("/transcribe", dependencies=[Depends(apikey_auth)]) + @app.post("/v1/audio/transcriptions", dependencies=[Depends(apikey_auth)]) def transcribe( file: UploadFile, - source_language: Annotated[str, Body(...)] = "en", - timestamp: Annotated[float, Body()] = 0.0 + model: str = "whisper-1", + language: Annotated[str, Body(...)] = "en", ) -> TranscriptResponse: audio_data = file.file.read() audio_suffix = file.filename.split(".")[-1] - assert audio_suffix in supported_audio_file_types + assert audio_suffix in supported_file_types - func = transcriberstub.transcribe_segment.spawn( + func = transcriber.transcribe_segment.spawn( audio_data=audio_data, audio_suffix=audio_suffix, - source_language=source_language, - timestamp=timestamp + language=language, ) result = func.get() return result diff --git a/server/poetry.lock b/server/poetry.lock index 1d6cabeb..f114e15b 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -1068,6 +1068,17 @@ wrapt = ">=1.10,<2" [package.extras] dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"] +[[package]] +name = "distro" +version = "1.9.0" +description = "Distro - an OS platform information API" +optional = false +python-versions = ">=3.6" +files = [ + {file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"}, + {file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"}, +] + [[package]] name = "dnspython" version = "2.4.2" @@ -1696,6 +1707,91 @@ files = [ {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] +[[package]] +name = "jiter" +version = "0.8.2" +description = "Fast iterable JSON parser." +optional = false +python-versions = ">=3.8" +files = [ + {file = "jiter-0.8.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:ca8577f6a413abe29b079bc30f907894d7eb07a865c4df69475e868d73e71c7b"}, + {file = "jiter-0.8.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b25bd626bde7fb51534190c7e3cb97cee89ee76b76d7585580e22f34f5e3f393"}, + {file = "jiter-0.8.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5c826a221851a8dc028eb6d7d6429ba03184fa3c7e83ae01cd6d3bd1d4bd17d"}, + {file = "jiter-0.8.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d35c864c2dff13dfd79fb070fc4fc6235d7b9b359efe340e1261deb21b9fcb66"}, + {file = "jiter-0.8.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f557c55bc2b7676e74d39d19bcb8775ca295c7a028246175d6a8b431e70835e5"}, + {file = "jiter-0.8.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:580ccf358539153db147e40751a0b41688a5ceb275e6f3e93d91c9467f42b2e3"}, + {file = "jiter-0.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af102d3372e917cffce49b521e4c32c497515119dc7bd8a75665e90a718bbf08"}, + {file = "jiter-0.8.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cadcc978f82397d515bb2683fc0d50103acff2a180552654bb92d6045dec2c49"}, + {file = "jiter-0.8.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ba5bdf56969cad2019d4e8ffd3f879b5fdc792624129741d3d83fc832fef8c7d"}, + {file = "jiter-0.8.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:3b94a33a241bee9e34b8481cdcaa3d5c2116f575e0226e421bed3f7a6ea71cff"}, + {file = "jiter-0.8.2-cp310-cp310-win32.whl", hash = "sha256:6e5337bf454abddd91bd048ce0dca5134056fc99ca0205258766db35d0a2ea43"}, + {file = "jiter-0.8.2-cp310-cp310-win_amd64.whl", hash = "sha256:4a9220497ca0cb1fe94e3f334f65b9b5102a0b8147646118f020d8ce1de70105"}, + {file = "jiter-0.8.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:2dd61c5afc88a4fda7d8b2cf03ae5947c6ac7516d32b7a15bf4b49569a5c076b"}, + {file = "jiter-0.8.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a6c710d657c8d1d2adbbb5c0b0c6bfcec28fd35bd6b5f016395f9ac43e878a15"}, + {file = "jiter-0.8.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9584de0cd306072635fe4b89742bf26feae858a0683b399ad0c2509011b9dc0"}, + {file = "jiter-0.8.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5a90a923338531b7970abb063cfc087eebae6ef8ec8139762007188f6bc69a9f"}, + {file = "jiter-0.8.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d21974d246ed0181558087cd9f76e84e8321091ebfb3a93d4c341479a736f099"}, + {file = "jiter-0.8.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:32475a42b2ea7b344069dc1e81445cfc00b9d0e3ca837f0523072432332e9f74"}, + {file = "jiter-0.8.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b9931fd36ee513c26b5bf08c940b0ac875de175341cbdd4fa3be109f0492586"}, + {file = "jiter-0.8.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ce0820f4a3a59ddced7fce696d86a096d5cc48d32a4183483a17671a61edfddc"}, + {file = "jiter-0.8.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:8ffc86ae5e3e6a93765d49d1ab47b6075a9c978a2b3b80f0f32628f39caa0c88"}, + {file = "jiter-0.8.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5127dc1abd809431172bc3fbe8168d6b90556a30bb10acd5ded41c3cfd6f43b6"}, + {file = "jiter-0.8.2-cp311-cp311-win32.whl", hash = "sha256:66227a2c7b575720c1871c8800d3a0122bb8ee94edb43a5685aa9aceb2782d44"}, + {file = "jiter-0.8.2-cp311-cp311-win_amd64.whl", hash = "sha256:cde031d8413842a1e7501e9129b8e676e62a657f8ec8166e18a70d94d4682855"}, + {file = "jiter-0.8.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:e6ec2be506e7d6f9527dae9ff4b7f54e68ea44a0ef6b098256ddf895218a2f8f"}, + {file = "jiter-0.8.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:76e324da7b5da060287c54f2fabd3db5f76468006c811831f051942bf68c9d44"}, + {file = "jiter-0.8.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:180a8aea058f7535d1c84183c0362c710f4750bef66630c05f40c93c2b152a0f"}, + {file = "jiter-0.8.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:025337859077b41548bdcbabe38698bcd93cfe10b06ff66617a48ff92c9aec60"}, + {file = "jiter-0.8.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ecff0dc14f409599bbcafa7e470c00b80f17abc14d1405d38ab02e4b42e55b57"}, + {file = "jiter-0.8.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ffd9fee7d0775ebaba131f7ca2e2d83839a62ad65e8e02fe2bd8fc975cedeb9e"}, + {file = "jiter-0.8.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14601dcac4889e0a1c75ccf6a0e4baf70dbc75041e51bcf8d0e9274519df6887"}, + {file = "jiter-0.8.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:92249669925bc1c54fcd2ec73f70f2c1d6a817928480ee1c65af5f6b81cdf12d"}, + {file = "jiter-0.8.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e725edd0929fa79f8349ab4ec7f81c714df51dc4e991539a578e5018fa4a7152"}, + {file = "jiter-0.8.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:bf55846c7b7a680eebaf9c3c48d630e1bf51bdf76c68a5f654b8524335b0ad29"}, + {file = "jiter-0.8.2-cp312-cp312-win32.whl", hash = "sha256:7efe4853ecd3d6110301665a5178b9856be7e2a9485f49d91aa4d737ad2ae49e"}, + {file = "jiter-0.8.2-cp312-cp312-win_amd64.whl", hash = "sha256:83c0efd80b29695058d0fd2fa8a556490dbce9804eac3e281f373bbc99045f6c"}, + {file = "jiter-0.8.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:ca1f08b8e43dc3bd0594c992fb1fd2f7ce87f7bf0d44358198d6da8034afdf84"}, + {file = "jiter-0.8.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5672a86d55416ccd214c778efccf3266b84f87b89063b582167d803246354be4"}, + {file = "jiter-0.8.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58dc9bc9767a1101f4e5e22db1b652161a225874d66f0e5cb8e2c7d1c438b587"}, + {file = "jiter-0.8.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:37b2998606d6dadbb5ccda959a33d6a5e853252d921fec1792fc902351bb4e2c"}, + {file = "jiter-0.8.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4ab9a87f3784eb0e098f84a32670cfe4a79cb6512fd8f42ae3d0709f06405d18"}, + {file = "jiter-0.8.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:79aec8172b9e3c6d05fd4b219d5de1ac616bd8da934107325a6c0d0e866a21b6"}, + {file = "jiter-0.8.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:711e408732d4e9a0208008e5892c2966b485c783cd2d9a681f3eb147cf36c7ef"}, + {file = "jiter-0.8.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:653cf462db4e8c41995e33d865965e79641ef45369d8a11f54cd30888b7e6ff1"}, + {file = "jiter-0.8.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:9c63eaef32b7bebac8ebebf4dabebdbc6769a09c127294db6babee38e9f405b9"}, + {file = "jiter-0.8.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:eb21aaa9a200d0a80dacc7a81038d2e476ffe473ffdd9c91eb745d623561de05"}, + {file = "jiter-0.8.2-cp313-cp313-win32.whl", hash = "sha256:789361ed945d8d42850f919342a8665d2dc79e7e44ca1c97cc786966a21f627a"}, + {file = "jiter-0.8.2-cp313-cp313-win_amd64.whl", hash = "sha256:ab7f43235d71e03b941c1630f4b6e3055d46b6cb8728a17663eaac9d8e83a865"}, + {file = "jiter-0.8.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b426f72cd77da3fec300ed3bc990895e2dd6b49e3bfe6c438592a3ba660e41ca"}, + {file = "jiter-0.8.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2dd880785088ff2ad21ffee205e58a8c1ddabc63612444ae41e5e4b321b39c0"}, + {file = "jiter-0.8.2-cp313-cp313t-win_amd64.whl", hash = "sha256:3ac9f578c46f22405ff7f8b1f5848fb753cc4b8377fbec8470a7dc3997ca7566"}, + {file = "jiter-0.8.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:9e1fa156ee9454642adb7e7234a383884452532bc9d53d5af2d18d98ada1d79c"}, + {file = "jiter-0.8.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0cf5dfa9956d96ff2efb0f8e9c7d055904012c952539a774305aaaf3abdf3d6c"}, + {file = "jiter-0.8.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e52bf98c7e727dd44f7c4acb980cb988448faeafed8433c867888268899b298b"}, + {file = "jiter-0.8.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a2ecaa3c23e7a7cf86d00eda3390c232f4d533cd9ddea4b04f5d0644faf642c5"}, + {file = "jiter-0.8.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:08d4c92bf480e19fc3f2717c9ce2aa31dceaa9163839a311424b6862252c943e"}, + {file = "jiter-0.8.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:99d9a1eded738299ba8e106c6779ce5c3893cffa0e32e4485d680588adae6db8"}, + {file = "jiter-0.8.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d20be8b7f606df096e08b0b1b4a3c6f0515e8dac296881fe7461dfa0fb5ec817"}, + {file = "jiter-0.8.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d33f94615fcaf872f7fd8cd98ac3b429e435c77619777e8a449d9d27e01134d1"}, + {file = "jiter-0.8.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:317b25e98a35ffec5c67efe56a4e9970852632c810d35b34ecdd70cc0e47b3b6"}, + {file = "jiter-0.8.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fc9043259ee430ecd71d178fccabd8c332a3bf1e81e50cae43cc2b28d19e4cb7"}, + {file = "jiter-0.8.2-cp38-cp38-win32.whl", hash = "sha256:fc5adda618205bd4678b146612ce44c3cbfdee9697951f2c0ffdef1f26d72b63"}, + {file = "jiter-0.8.2-cp38-cp38-win_amd64.whl", hash = "sha256:cd646c827b4f85ef4a78e4e58f4f5854fae0caf3db91b59f0d73731448a970c6"}, + {file = "jiter-0.8.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:e41e75344acef3fc59ba4765df29f107f309ca9e8eace5baacabd9217e52a5ee"}, + {file = "jiter-0.8.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7f22b16b35d5c1df9dfd58843ab2cd25e6bf15191f5a236bed177afade507bfc"}, + {file = "jiter-0.8.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f7200b8f7619d36aa51c803fd52020a2dfbea36ffec1b5e22cab11fd34d95a6d"}, + {file = "jiter-0.8.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:70bf4c43652cc294040dbb62256c83c8718370c8b93dd93d934b9a7bf6c4f53c"}, + {file = "jiter-0.8.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f9d471356dc16f84ed48768b8ee79f29514295c7295cb41e1133ec0b2b8d637d"}, + {file = "jiter-0.8.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:859e8eb3507894093d01929e12e267f83b1d5f6221099d3ec976f0c995cb6bd9"}, + {file = "jiter-0.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eaa58399c01db555346647a907b4ef6d4f584b123943be6ed5588c3f2359c9f4"}, + {file = "jiter-0.8.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8f2d5ed877f089862f4c7aacf3a542627c1496f972a34d0474ce85ee7d939c27"}, + {file = "jiter-0.8.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:03c9df035d4f8d647f8c210ddc2ae0728387275340668fb30d2421e17d9a0841"}, + {file = "jiter-0.8.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8bd2a824d08d8977bb2794ea2682f898ad3d8837932e3a74937e93d62ecbb637"}, + {file = "jiter-0.8.2-cp39-cp39-win32.whl", hash = "sha256:ca29b6371ebc40e496995c94b988a101b9fbbed48a51190a4461fcb0a68b4a36"}, + {file = "jiter-0.8.2-cp39-cp39-win_amd64.whl", hash = "sha256:1c0dfbd1be3cbefc7510102370d86e35d1d53e5a93d48519688b1bf0f761160a"}, + {file = "jiter-0.8.2.tar.gz", hash = "sha256:cd73d3e740666d0e639f678adb176fad25c1bcbdae88d8d7b857e1783bb4212d"}, +] + [[package]] name = "jiwer" version = "3.0.3" @@ -2267,6 +2363,31 @@ packaging = "*" protobuf = "*" sympy = "*" +[[package]] +name = "openai" +version = "1.59.7" +description = "The official Python library for the openai API" +optional = false +python-versions = ">=3.8" +files = [ + {file = "openai-1.59.7-py3-none-any.whl", hash = "sha256:cfa806556226fa96df7380ab2e29814181d56fea44738c2b0e581b462c268692"}, + {file = "openai-1.59.7.tar.gz", hash = "sha256:043603def78c00befb857df9f0a16ee76a3af5984ba40cb7ee5e2f40db4646bf"}, +] + +[package.dependencies] +anyio = ">=3.5.0,<5" +distro = ">=1.7.0,<2" +httpx = ">=0.23.0,<1" +jiter = ">=0.4.0,<1" +pydantic = ">=1.9.0,<3" +sniffio = "*" +tqdm = ">4" +typing-extensions = ">=4.11,<5" + +[package.extras] +datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] +realtime = ["websockets (>=13,<15)"] + [[package]] name = "packaging" version = "23.2" @@ -4041,13 +4162,13 @@ vision = ["Pillow (>=10.0.1,<=15.0)"] [[package]] name = "typing-extensions" -version = "4.8.0" +version = "4.12.2" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" files = [ - {file = "typing_extensions-4.8.0-py3-none-any.whl", hash = "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0"}, - {file = "typing_extensions-4.8.0.tar.gz", hash = "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"}, + {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, + {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, ] [[package]] @@ -4483,4 +4604,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "3ba5402ab9fbec271f255c345c89c67385c722735b1d79a959e887c7c34a4047" +content-hash = "6f5213b520e038f396de4a6b6e742a54eec1e6e9ea67cb1fa36e134c8a946cbe" diff --git a/server/pyproject.toml b/server/pyproject.toml index caa72483..97d61d51 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -39,6 +39,7 @@ faster-whisper = "^0.10.0" transformers = "^4.36.2" black = "24.1.1" jsonschema = "^4.23.0" +openai = "^1.59.7" [tool.poetry.group.dev.dependencies] diff --git a/server/reflector/processors/audio_transcript_modal.py b/server/reflector/processors/audio_transcript_modal.py index 0ca4710f..ce2667c6 100644 --- a/server/reflector/processors/audio_transcript_modal.py +++ b/server/reflector/processors/audio_transcript_modal.py @@ -12,51 +12,46 @@ API will be a POST request to TRANSCRIPT_URL: """ -import httpx +from openai import AsyncOpenAI from reflector.processors.audio_transcript import AudioTranscriptProcessor from reflector.processors.audio_transcript_auto import AudioTranscriptAutoProcessor from reflector.processors.types import AudioFile, Transcript, Word from reflector.settings import settings -from reflector.utils.retry import retry class AudioTranscriptModalProcessor(AudioTranscriptProcessor): def __init__(self, modal_api_key: str): super().__init__() - self.transcript_url = settings.TRANSCRIPT_URL + "/transcribe" + self.transcript_url = settings.TRANSCRIPT_URL + "/v1" self.timeout = settings.TRANSCRIPT_TIMEOUT + self.api_key = settings.TRANSCRIPT_MODAL_API_KEY self.headers = {"Authorization": f"Bearer {modal_api_key}"} async def _transcript(self, data: AudioFile): - async with httpx.AsyncClient() as client: + async with AsyncOpenAI( + base_url=self.transcript_url, + api_key=self.api_key, + timeout=self.timeout, + ) as client: self.logger.debug(f"Try to transcribe audio {data.name}") - files = { - "file": (data.name, data.fd), - } - source_language = self.get_pref("audio:source_language", "en") - json_payload = {"source_language": source_language} - response = await retry(client.post)( - self.transcript_url, - files=files, - timeout=self.timeout, - headers=self.headers, - params=json_payload, - follow_redirects=True, - ) - self.logger.debug( - f"Transcript response: {response.status_code} {response.content}" + audio_file = open(data.path, "rb") + transcription = await client.audio.transcriptions.create( + file=audio_file, + model="whisper-1", + response_format="verbose_json", + language=self.get_pref("audio:source_language", "en"), + timestamp_granularities=["word"], ) - response.raise_for_status() - result = response.json() + self.logger.debug(f"Transcription: {transcription}") transcript = Transcript( words=[ Word( - text=word["text"], - start=word["start"], - end=word["end"], + text=word.word, + start=word.start, + end=word.end, ) - for word in result["words"] + for word in transcription.words ], ) transcript.add_offset(data.timestamp)