From 2f0e9a51f73b6dbf6c87eaab614bd305dfa44741 Mon Sep 17 00:00:00 2001
From: Gokul Mohanarangan
Date: Wed, 16 Aug 2023 13:28:23 +0530
Subject: [PATCH] integrate reflector-gpu-modal repo

---
 server/gpu/modal/README.md                |  92 ++++++++++++
 server/gpu/modal/reflector_llm.py         | 170 +++++++++++++++++++++
 server/gpu/modal/reflector_transcriber.py | 173 ++++++++++++++++++++++
 3 files changed, 435 insertions(+)
 create mode 100644 server/gpu/modal/README.md
 create mode 100644 server/gpu/modal/reflector_llm.py
 create mode 100644 server/gpu/modal/reflector_transcriber.py

diff --git a/server/gpu/modal/README.md b/server/gpu/modal/README.md
new file mode 100644
index 00000000..9491a00c
--- /dev/null
+++ b/server/gpu/modal/README.md
@@ -0,0 +1,92 @@
# Reflector GPU implementation - Transcription and LLM

This repository holds the API for the GPU implementation of the Reflector API
service, and uses [Modal.com](https://modal.com):

- `reflector_llm.py` - LLM API
- `reflector_transcriber.py` - Transcription API

## Modal.com deployment

Create a Modal secret and name it `reflector-gpu`.
It must contain a `REFLECTOR_GPU_APIKEY` environment variable holding the API key
that clients use to authenticate against the web endpoints.

The deployment is done using the [Modal.com](https://modal.com) service:

```
$ modal deploy reflector_transcriber.py
...
└── 🔨 Created web => https://xxxx--reflector-transcriber-web.modal.run

$ modal deploy reflector_llm.py
...
└── 🔨 Created web => https://xxxx--reflector-llm-web.modal.run
```

Then, in your reflector API configuration `.env`, set these keys:

```
TRANSCRIPT_BACKEND=modal
TRANSCRIPT_URL=https://xxxx--reflector-transcriber-web.modal.run
TRANSCRIPT_MODAL_API_KEY=<REFLECTOR_GPU_APIKEY value>

LLM_BACKEND=modal
LLM_URL=https://xxxx--reflector-llm-web.modal.run
LLM_MODAL_API_KEY=<REFLECTOR_GPU_APIKEY value>
```

## API

Authentication must be passed with the `Authorization` header, using the `bearer` scheme:

```
Authorization: bearer <REFLECTOR_GPU_APIKEY>
```

### Warmup (both services)

`POST /warmup`

**response**
```
{
  "status": "ok"
}
```

### LLM

`POST /llm`

**request**
```
{
  "prompt": "xxx"
}
```

**response**
```
{
  "text": "xxx completed"
}
```

### Transcription

`POST /transcribe`

**request** (multipart/form-data)

- `file` - audio file (`wav`, `mp3`, `ogg` or `flac`)
- `language` - language code (e.g. `en`), defaults to `en`
- `timestamp` - start offset in seconds added to each word timing, defaults to `0`

**response**
```
{
  "text": "xxx",
  "words": [
    {"text": "xxx", "start": 0.0, "end": 1.0}
  ]
}
```
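### Example client

A minimal client sketch for the endpoints above, assuming the placeholder deployment
URLs from this README, the `requests` library, and a local `audio.wav` file;
substitute your own URLs and API key value.

```
import requests

API_KEY = "xxx"  # the REFLECTOR_GPU_APIKEY value from the `reflector-gpu` secret
HEADERS = {"Authorization": f"bearer {API_KEY}"}
LLM_URL = "https://xxxx--reflector-llm-web.modal.run"
TRANSCRIBER_URL = "https://xxxx--reflector-transcriber-web.modal.run"

# warm up the LLM container, then request a completion
requests.post(f"{LLM_URL}/warmup", headers=HEADERS).raise_for_status()
resp = requests.post(f"{LLM_URL}/llm", json={"prompt": "xxx"}, headers=HEADERS)
resp.raise_for_status()
print(resp.json()["text"])

# transcribe a local audio file (multipart/form-data)
with open("audio.wav", "rb") as f:
    resp = requests.post(
        f"{TRANSCRIBER_URL}/transcribe",
        files={"file": ("audio.wav", f, "audio/wav")},
        data={"language": "en"},
        headers=HEADERS,
    )
resp.raise_for_status()
print(resp.json()["words"])
```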
diff --git a/server/gpu/modal/reflector_llm.py b/server/gpu/modal/reflector_llm.py
new file mode 100644
index 00000000..bf6f4cf5
--- /dev/null
+++ b/server/gpu/modal/reflector_llm.py
@@ -0,0 +1,170 @@
"""
Reflector GPU backend - LLM
===========================
"""

import os

from modal import Image, method, Stub, asgi_app, Secret

# LLM
LLM_MODEL: str = "lmsys/vicuna-13b-v1.5"
LLM_LOW_CPU_MEM_USAGE: bool = False
LLM_TORCH_DTYPE: str = "bfloat16"
LLM_MAX_NEW_TOKENS: int = 300

IMAGE_MODEL_DIR = "/model"

stub = Stub(name="reflector-llm")


def download_llm():
    from huggingface_hub import snapshot_download

    print("Downloading LLM model")
    snapshot_download(LLM_MODEL, local_dir=IMAGE_MODEL_DIR)
    print("LLM model downloaded")


def migrate_cache_llm():
    """
    The cache for model files in Transformers v4.22.0 has been updated.
    Migrating the old cache is a one-time operation; it can be interrupted
    and resumed later by calling `transformers.utils.move_cache()`.
    """
    from transformers.utils.hub import move_cache

    print("Moving LLM cache")
    move_cache()
    print("LLM cache moved")


llm_image = (
    Image.debian_slim(python_version="3.10.8")
    .apt_install("git")
    .pip_install(
        "transformers",
        "torch",
        "sentencepiece",
        "protobuf",
        "einops==0.6.1",
        "hf-transfer~=0.1",
        "huggingface_hub==0.16.4",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .run_function(download_llm)
    .run_function(migrate_cache_llm)
)


@stub.cls(
    gpu="A100",
    timeout=60 * 5,
    container_idle_timeout=60 * 5,
    concurrency_limit=2,
    image=llm_image,
)
class LLM:
    def __enter__(self):
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer
        from transformers.generation import GenerationConfig

        print("Instantiate llm model")
        model = AutoModelForCausalLM.from_pretrained(
            IMAGE_MODEL_DIR,
            torch_dtype=getattr(torch, LLM_TORCH_DTYPE),
            low_cpu_mem_usage=LLM_LOW_CPU_MEM_USAGE,
        )

        # generation configuration
        print("Instantiate llm generation config")
        model.config.max_new_tokens = LLM_MAX_NEW_TOKENS
        gen_cfg = GenerationConfig.from_model_config(model.config)
        gen_cfg.max_new_tokens = LLM_MAX_NEW_TOKENS

        # load tokenizer
        print("Instantiate llm tokenizer")
        tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)

        # move model to gpu
        print("Move llm model to GPU")
        model = model.cuda()

        print("Warmup llm done")
        self.model = model
        self.tokenizer = tokenizer
        self.gen_cfg = gen_cfg

    def __exit__(self, *args):
        print("Exit llm")

    @method()
    def warmup(self):
        print("Warmup ok")
        return {"status": "ok"}

    @method()
    def generate(self, prompt: str):
        print(f"Generate {prompt=}")
        # tokenize prompt
        input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(
            self.model.device
        )
        output = self.model.generate(input_ids, generation_config=self.gen_cfg)

        # decode output
        response = self.tokenizer.decode(output[0].cpu(), skip_special_tokens=True)
        print(f"Generated {response=}")
        return {"text": response}


# -------------------------------------------------------------------
# Web API
# -------------------------------------------------------------------


@stub.function(
    container_idle_timeout=60 * 10,
    timeout=60 * 5,
    secrets=[
        Secret.from_name("reflector-gpu"),
    ],
)
@asgi_app()
def web():
    from fastapi import FastAPI, HTTPException, status, Depends
    from fastapi.security import OAuth2PasswordBearer
    from pydantic import BaseModel

    llmstub = LLM()

    app = FastAPI()
    oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token")

    def apikey_auth(apikey: str = Depends(oauth2_scheme)):
        if apikey != os.environ["REFLECTOR_GPU_APIKEY"]:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Invalid API key",
                headers={"WWW-Authenticate": "Bearer"},
            )

    class LLMRequest(BaseModel):
        prompt: str

    @app.post("/llm", dependencies=[Depends(apikey_auth)])
    async def llm(
        req: LLMRequest,
    ):
        func = llmstub.generate.spawn(prompt=req.prompt)
        result = func.get()
        return result

    @app.post("/warmup", dependencies=[Depends(apikey_auth)])
    async def warmup():
        return llmstub.warmup.spawn().get()

    return app
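`LLM.generate` above is standard `transformers` generation wrapped in a Modal class:
encode the prompt, call `model.generate` with a `GenerationConfig`, decode the output.
Below is a rough local CPU smoke-test sketch of that same call sequence; the tiny
stand-in model (`sshleifer/tiny-gpt2`) and the 32-token budget are assumptions for a
quick local run, not part of the deployment, which bakes `lmsys/vicuna-13b-v1.5` into
the image.

```
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig

# small stand-in model so the smoke test runs on CPU (assumption, not the deployed model)
model_name = "sshleifer/tiny-gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)

# mirror the generation-config handling in LLM.__enter__
gen_cfg = GenerationConfig.from_model_config(model.config)
gen_cfg.max_new_tokens = 32

prompt = "Summarize: the meeting covered the Q3 roadmap."
input_ids = tokenizer.encode(prompt, return_tensors="pt")
output = model.generate(input_ids, generation_config=gen_cfg)

print(tokenizer.decode(output[0], skip_special_tokens=True))
```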
diff --git a/server/gpu/modal/reflector_transcriber.py b/server/gpu/modal/reflector_transcriber.py
new file mode 100644
index 00000000..631233cc
--- /dev/null
+++ b/server/gpu/modal/reflector_transcriber.py
@@ -0,0 +1,173 @@
"""
Reflector GPU backend - transcriber
===================================
"""

import tempfile
import os

from modal import Image, method, Stub, asgi_app, Secret
from pydantic import BaseModel

# Whisper
WHISPER_MODEL: str = "large-v2"
WHISPER_COMPUTE_TYPE: str = "float16"
WHISPER_NUM_WORKERS: int = 1
WHISPER_CACHE_DIR: str = "/cache/whisper"

stub = Stub(name="reflector-transcriber")


def download_whisper():
    from faster_whisper.utils import download_model

    download_model(WHISPER_MODEL, local_files_only=False)


whisper_image = (
    Image.debian_slim(python_version="3.10.8")
    .apt_install("git")
    .pip_install(
        "faster-whisper",
        "requests",
        "torch",
    )
    .run_function(download_whisper)
    .env(
        {
            "LD_LIBRARY_PATH": (
                "/usr/local/lib/python3.10/site-packages/nvidia/cudnn/lib/:"
                "/opt/conda/lib/python3.10/site-packages/nvidia/cublas/lib/"
            )
        }
    )
)


@stub.cls(
    gpu="A10G",
    container_idle_timeout=60,
    image=whisper_image,
)
class Whisper:
    def __enter__(self):
        import torch
        import faster_whisper

        self.use_gpu = torch.cuda.is_available()
        device = "cuda" if self.use_gpu else "cpu"
        self.model = faster_whisper.WhisperModel(
            WHISPER_MODEL,
            device=device,
            compute_type=WHISPER_COMPUTE_TYPE,
            num_workers=WHISPER_NUM_WORKERS,
        )

    @method()
    def warmup(self):
        return {"status": "ok"}

    @method()
    def transcribe_segment(
        self,
        audio_data: bytes,
        audio_suffix: str,
        timestamp: float = 0,
        language: str = "en",
    ):
        with tempfile.NamedTemporaryFile("wb+", suffix=f".{audio_suffix}") as fp:
            fp.write(audio_data)
            # make sure the audio is on disk before faster-whisper reads it by path
            fp.flush()

            segments, _ = self.model.transcribe(
                fp.name,
                language=language,
                beam_size=5,
                word_timestamps=True,
                vad_filter=True,
                vad_parameters={"min_silence_duration_ms": 500},
            )

            transcript = ""
            words = []
            if segments:
                segments = list(segments)

                for segment in segments:
                    transcript += segment.text
                    for word in segment.words:
                        words.append(
                            {
                                "text": word.word,
                                "start": round(timestamp + word.start, 3),
                                "end": round(timestamp + word.end, 3),
                            }
                        )
        return {
            "text": transcript,
            "words": words,
        }


# -------------------------------------------------------------------
# Web API
# -------------------------------------------------------------------


@stub.function(
    container_idle_timeout=60,
    timeout=60,
    secrets=[
        Secret.from_name("reflector-gpu"),
    ],
)
@asgi_app()
def web():
    from fastapi import FastAPI, UploadFile, Form, Depends, HTTPException, status
    from fastapi.security import OAuth2PasswordBearer
    from typing_extensions import Annotated

    transcriberstub = Whisper()

    app = FastAPI()

    oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token")

    def apikey_auth(apikey: str = Depends(oauth2_scheme)):
        if apikey != os.environ["REFLECTOR_GPU_APIKEY"]:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Invalid API key",
                headers={"WWW-Authenticate": "Bearer"},
            )

    class TranscriptionRequest(BaseModel):
        timestamp: float = 0
        language: str = "en"

    class TranscriptResponse(BaseModel):
        result: str

    @app.post("/transcribe", dependencies=[Depends(apikey_auth)])
    async def transcribe(
        file: UploadFile,
        timestamp: Annotated[float, Form()] = 0,
        language: Annotated[str, Form()] = "en",
    ):
        audio_data = await file.read()
        audio_suffix = file.filename.split(".")[-1]
        assert audio_suffix in ["wav", "mp3", "ogg", "flac"]

        func = transcriberstub.transcribe_segment.spawn(
            audio_data=audio_data,
            audio_suffix=audio_suffix,
            language=language,
            timestamp=timestamp,
        )
        result = func.get()
        return result

    @app.post("/warmup", dependencies=[Depends(apikey_auth)])
    async def warmup():
        return transcriberstub.warmup.spawn().get()

    return app
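The `timestamp` form field exists so that word timings stay global when a longer
recording is sent to `/transcribe` in pieces: the transcriber adds the offset to every
word's `start` and `end`. A rough client-side sketch of that pattern, assuming
fixed-length WAV chunks already prepared by the caller, the placeholder transcriber URL
from the README, and the `requests` library:

```
import requests

API_KEY = "xxx"  # the REFLECTOR_GPU_APIKEY value from the `reflector-gpu` secret
URL = "https://xxxx--reflector-transcriber-web.modal.run/transcribe"
HEADERS = {"Authorization": f"bearer {API_KEY}"}
CHUNK_SECONDS = 10.0  # assumed fixed chunk duration produced by the caller


def transcribe_chunks(chunk_paths):
    """Send consecutive audio chunks, offsetting word timings via `timestamp`."""
    words = []
    for i, path in enumerate(chunk_paths):
        with open(path, "rb") as f:
            resp = requests.post(
                URL,
                files={"file": (path, f, "audio/wav")},
                data={"language": "en", "timestamp": i * CHUNK_SECONDS},
                headers=HEADERS,
            )
        resp.raise_for_status()
        # start/end already include the offset, so they are global timings
        words.extend(resp.json()["words"])
    return words
```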