This commit is contained in:
Gokul Mohanarangan
2023-08-21 12:53:49 +05:30
parent 5b0883730f
commit acdd5f7dab
2 changed files with 26 additions and 28 deletions

View File

@@ -10,7 +10,7 @@ from modal import Image, Secret, Stub, asgi_app, method
from pydantic import BaseModel from pydantic import BaseModel
# Whisper # Whisper
WHISPER_MODEL: str = "tiny" WHISPER_MODEL: str = "large-v2"
WHISPER_COMPUTE_TYPE: str = "float16" WHISPER_COMPUTE_TYPE: str = "float16"
WHISPER_NUM_WORKERS: int = 1 WHISPER_NUM_WORKERS: int = 1
WHISPER_CACHE_DIR: str = "/cache/whisper" WHISPER_CACHE_DIR: str = "/cache/whisper"
@@ -18,7 +18,7 @@ WHISPER_CACHE_DIR: str = "/cache/whisper"
# Translation Model # Translation Model
TRANSLATION_MODEL = "facebook/m2m100_418M" TRANSLATION_MODEL = "facebook/m2m100_418M"
stub = Stub(name="reflector-translator") stub = Stub(name="reflector-transcriber")
def download_whisper(): def download_whisper():
@@ -82,9 +82,9 @@ class Whisper:
self, self,
audio_data: str, audio_data: str,
audio_suffix: str, audio_suffix: str,
timestamp: float = 0, source_language: str,
source_language: str = "en", target_language: str,
target_language: str = "fr" timestamp: float = 0
): ):
with tempfile.NamedTemporaryFile("wb+", suffix=f".{audio_suffix}") as fp: with tempfile.NamedTemporaryFile("wb+", suffix=f".{audio_suffix}") as fp:
fp.write(audio_data) fp.write(audio_data)
@@ -126,7 +126,8 @@ class Whisper:
forced_bos_token_id=forced_bos_token_id forced_bos_token_id=forced_bos_token_id
) )
result = self.translation_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) result = self.translation_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
multilingual_transcript[target_language] = result[0].strip() translation = result[0].strip()
multilingual_transcript[target_language] = translation
return { return {
"text": multilingual_transcript, "text": multilingual_transcript,
@@ -167,29 +168,26 @@ def web():
headers={"WWW-Authenticate": "Bearer"}, headers={"WWW-Authenticate": "Bearer"},
) )
class TranscriptionRequest(BaseModel):
file: UploadFile
timestamp: Annotated[float, Form()] = 0
source_language: Annotated[str, Form()] = "en"
target_language: Annotated[str, Form()] = "en"
class TranscriptResponse(BaseModel): class TranscriptResponse(BaseModel):
result: dict result: dict
@app.post("/transcribe", dependencies=[Depends(apikey_auth)]) @app.post("/transcribe", dependencies=[Depends(apikey_auth)])
async def transcribe( async def transcribe(
req file: UploadFile,
): timestamp: Annotated[float, Form()] = 0,
print(req) source_language: Annotated[str, Form()] = "en",
audio_data = await req.file.read() target_language: Annotated[str, Form()] = "en"
audio_suffix = req.file.filename.split(".")[-1] ) -> TranscriptResponse:
audio_data = await file.read()
audio_suffix = file.filename.split(".")[-1]
assert audio_suffix in supported_audio_file_types assert audio_suffix in supported_audio_file_types
func = transcriberstub.transcribe_segment.spawn( func = transcriberstub.transcribe_segment.spawn(
audio_data=audio_data, audio_data=audio_data,
audio_suffix=audio_suffix, audio_suffix=audio_suffix,
source_language="en", source_language=source_language,
timestamp=req.timestamp target_language=target_language,
timestamp=timestamp
) )
result = func.get() result = func.get()
return result return result

View File

@@ -55,28 +55,28 @@ class AudioTranscriptModalProcessor(AudioTranscriptProcessor):
files = { files = {
"file": (data.name, data.fd), "file": (data.name, data.fd),
} }
# TODO: Get the source / target language from the UI preferences dynamically # TODO: Get the source / target language from the UI preferences dynamically
# like context, session objects # Update code here once this is possible.
# i.e) extract from context/session objects
source_language = "en" source_language = "en"
target_language = "fr" target_language = "en"
languages = TranslationLanguages() languages = TranslationLanguages()
# Only way to set the target should be the UI element like dropdown. # Only way to set the target should be the UI element like dropdown.
# Hence, this assert should never fail. # Hence, this assert should never fail.
assert languages.is_supported(target_language) assert languages.is_supported(target_language)
data = { json_payload = {
"source_language": source_language, "source_language": source_language,
"target_language": target_language, "target_language": target_language,
} }
print("TRYING TO TRANSCRIBE")
response = await retry(client.post)( response = await retry(client.post)(
self.transcript_url, self.transcript_url,
files=files, files=files,
timeout=self.timeout, timeout=self.timeout,
headers=self.headers, headers=self.headers,
# data=data json=json_payload.values(),
) )
self.logger.debug( self.logger.debug(
@@ -85,9 +85,9 @@ class AudioTranscriptModalProcessor(AudioTranscriptProcessor):
response.raise_for_status() response.raise_for_status()
result = response.json() result = response.json()
# Sanity check for translation status in result # Sanity check for translation status in the result
if "target_language" in result["text"]: if target_language in result["text"]:
text = result["text"]["target_language"] text = result["text"][target_language]
else: else:
text = result["text"]["en"] text = result["text"]["en"]
transcript = Transcript( transcript = Transcript(