From acdd5f7dab56793e0e8236da39c6537dd853d9c0 Mon Sep 17 00:00:00 2001
From: Gokul Mohanarangan
Date: Mon, 21 Aug 2023 12:53:49 +0530
Subject: [PATCH] update

---
 server/gpu/modal/reflector_transcriber.py | 36 +++++++++----------
 .../processors/audio_transcript_modal.py  | 18 +++++-----
 2 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/server/gpu/modal/reflector_transcriber.py b/server/gpu/modal/reflector_transcriber.py
index f2db9225..335342b2 100644
--- a/server/gpu/modal/reflector_transcriber.py
+++ b/server/gpu/modal/reflector_transcriber.py
@@ -10,7 +10,7 @@ from modal import Image, Secret, Stub, asgi_app, method
 from pydantic import BaseModel
 
 # Whisper
-WHISPER_MODEL: str = "tiny"
+WHISPER_MODEL: str = "large-v2"
 WHISPER_COMPUTE_TYPE: str = "float16"
 WHISPER_NUM_WORKERS: int = 1
 WHISPER_CACHE_DIR: str = "/cache/whisper"
@@ -18,7 +18,7 @@ WHISPER_CACHE_DIR: str = "/cache/whisper"
 
 # Translation Model
 TRANSLATION_MODEL = "facebook/m2m100_418M"
 
-stub = Stub(name="reflector-translator")
+stub = Stub(name="reflector-transcriber")
 
 def download_whisper():
@@ -82,9 +82,9 @@ class Whisper:
         self,
         audio_data: str,
         audio_suffix: str,
-        timestamp: float = 0,
-        source_language: str = "en",
-        target_language: str = "fr"
+        source_language: str,
+        target_language: str,
+        timestamp: float = 0
     ):
         with tempfile.NamedTemporaryFile("wb+", suffix=f".{audio_suffix}") as fp:
             fp.write(audio_data)
@@ -126,7 +126,8 @@ class Whisper:
                 forced_bos_token_id=forced_bos_token_id
             )
             result = self.translation_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
-            multilingual_transcript[target_language] = result[0].strip()
+            translation = result[0].strip()
+            multilingual_transcript[target_language] = translation
 
         return {
             "text": multilingual_transcript,
@@ -167,29 +168,26 @@ def web():
                 headers={"WWW-Authenticate": "Bearer"},
             )
 
-    class TranscriptionRequest(BaseModel):
-        file: UploadFile
-        timestamp: Annotated[float, Form()] = 0
-        source_language: Annotated[str, Form()] = "en"
-        target_language: Annotated[str, Form()] = "en"
-
     class TranscriptResponse(BaseModel):
         result: dict
 
     @app.post("/transcribe", dependencies=[Depends(apikey_auth)])
     async def transcribe(
-        req
-    ):
-        print(req)
-        audio_data = await req.file.read()
-        audio_suffix = req.file.filename.split(".")[-1]
+        file: UploadFile,
+        timestamp: Annotated[float, Form()] = 0,
+        source_language: Annotated[str, Form()] = "en",
+        target_language: Annotated[str, Form()] = "en"
+    ) -> dict:
+        audio_data = await file.read()
+        audio_suffix = file.filename.split(".")[-1]
         assert audio_suffix in supported_audio_file_types
 
         func = transcriberstub.transcribe_segment.spawn(
             audio_data=audio_data,
             audio_suffix=audio_suffix,
-            source_language="en",
-            timestamp=req.timestamp
+            source_language=source_language,
+            target_language=target_language,
+            timestamp=timestamp
         )
         result = func.get()
         return result
diff --git a/server/reflector/processors/audio_transcript_modal.py b/server/reflector/processors/audio_transcript_modal.py
index 5d7a6b85..d9d8c3a7 100644
--- a/server/reflector/processors/audio_transcript_modal.py
+++ b/server/reflector/processors/audio_transcript_modal.py
@@ -55,28 +55,28 @@ class AudioTranscriptModalProcessor(AudioTranscriptProcessor):
         files = {
             "file": (data.name, data.fd),
         }
+
         # TODO: Get the source / target language from the UI preferences dynamically
-        # like context, session objects
+        # (i.e. extract them from the context/session objects).
+        # Update this code once that is possible.
         source_language = "en"
-        target_language = "fr"
+        target_language = "en"
         languages = TranslationLanguages()
         # Only way to set the target should be the UI element like dropdown.
         # Hence, this assert should never fail.
         assert languages.is_supported(target_language)
 
-        data = {
+        json_payload = {
            "source_language": source_language,
            "target_language": target_language,
        }
 
-        print("TRYING TO TRANSCRIBE")
-
         response = await retry(client.post)(
             self.transcript_url,
             files=files,
             timeout=self.timeout,
             headers=self.headers,
-            # data=data
+            data=json_payload,
         )
 
         self.logger.debug(
@@ -85,9 +85,9 @@
         response.raise_for_status()
         result = response.json()
 
-        # Sanity check for translation status in result
-        if "target_language" in result["text"]:
-            text = result["text"]["target_language"]
+        # Sanity check for translation status in the result
+        if target_language in result["text"]:
+            text = result["text"][target_language]
         else:
             text = result["text"]["en"]
         transcript = Transcript(
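-- 
Note on the processor change: the updated /transcribe endpoint reads its
parameters as multipart form fields next to the uploaded file, so a client
must send them via data= rather than json= (a JSON body would not populate
the Form() parameters, and a dict's .values() view is not serializable at
all). A minimal calling sketch; the endpoint URL, API key handling, and file
path below are placeholders, not taken from this patch:

    import httpx

    TRANSCRIBE_URL = "https://example.modal.run/transcribe"  # placeholder URL
    API_KEY = "changeme"  # placeholder; the route is guarded by apikey_auth

    def transcribe_file(path: str) -> dict:
        with open(path, "rb") as fp:
            response = httpx.post(
                TRANSCRIBE_URL,
                headers={"Authorization": f"Bearer {API_KEY}"},
                # the audio travels as the multipart "file" part...
                files={"file": (path, fp)},
                # ...and the remaining parameters as plain form fields,
                # matching the Annotated[..., Form()] signature of transcribe()
                data={
                    "timestamp": "0",
                    "source_language": "en",
                    "target_language": "fr",
                },
                timeout=60.0,
            )
        response.raise_for_status()
        # expected shape, per transcribe_segment():
        # {"text": {"en": "...", "fr": "..."}, ...}
        return response.json()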
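For the translation step itself, the forced_bos_token_id passed to
model.generate() is the token id of the target language, which pins the
decoder's first generated token to that language. A standalone sketch of the
standard transformers usage for facebook/m2m100_418M, independent of the
Whisper class wiring in this patch:

    from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

    tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
    model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")

    tokenizer.src_lang = "en"  # source_language
    encoded = tokenizer("Hello, world.", return_tensors="pt")
    generated_tokens = model.generate(
        **encoded,
        # force the first decoder token to the target language id
        forced_bos_token_id=tokenizer.get_lang_id("fr"),
    )
    result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    print(result[0].strip())  # the translated segment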