Merge pull request #171 from Monadical-SAS/modal

Use JSONFormer on Modal GPU
projects-g
2023-08-17 22:08:48 +05:30
committed by GitHub
12 changed files with 121 additions and 55 deletions
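
In short: the Modal GPU worker's LLM endpoint now accepts an optional JSON schema alongside the prompt, and when a schema is supplied it routes generation through JSONFormer so the model returns a dict conforming to that schema instead of free-form text that has to be regex-parsed afterwards. A minimal sketch of that flow, assuming a locally loaded Hugging Face model (the model name and prompt below are placeholders, not the deployed Vicuna setup):

import json

from jsonformer import Jsonformer
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder model for illustration; the Modal worker loads lmsys/vicuna-13b-v1.5.
MODEL_NAME = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

schema = {
    "type": "object",
    "properties": {"title": {"type": "string"}, "summary": {"type": "string"}},
}

# Jsonformer drives decoding field by field, so the result is always a dict
# matching the schema rather than raw text.
result = Jsonformer(
    model=model,
    tokenizer=tokenizer,
    json_schema=schema,
    prompt="Summarize the following meeting notes: ...",
    max_string_token_length=300,
)()
print(json.dumps(result))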


@@ -29,3 +29,10 @@ repos:
hooks:
- id: black
files: ^server/(reflector|tests)/
- repo: https://github.com/pycqa/isort
rev: 5.12.0
hooks:
- id: isort
name: isort (python)
files: ^server/(gpu|evaluate|reflector)/


@@ -45,8 +45,8 @@
## llm backend implementation
## =======================================================
## Use oobagooda (default)
#LLM_BACKEND=oobagooda
## Use oobabooga (default)
#LLM_BACKEND=oobabooga
#LLM_URL=http://xxx:7860/api/generate/v1
## Using serverless modal.com (require reflector-gpu-modal deployed)


@@ -3,14 +3,15 @@ Reflector GPU backend - LLM
===========================
"""
import json
import os
from modal import Image, method, Stub, asgi_app, Secret
from typing import Optional
from modal import Image, Secret, Stub, asgi_app, method
# LLM
LLM_MODEL: str = "lmsys/vicuna-13b-v1.5"
LLM_LOW_CPU_MEM_USAGE: bool = False
LLM_LOW_CPU_MEM_USAGE: bool = True
LLM_TORCH_DTYPE: str = "bfloat16"
LLM_MAX_NEW_TOKENS: int = 300
@@ -49,6 +50,8 @@ llm_image = (
"torch",
"sentencepiece",
"protobuf",
"jsonformer==0.12.0",
"accelerate==0.21.0",
"einops==0.6.1",
"hf-transfer~=0.1",
"huggingface_hub==0.16.4",
@@ -81,6 +84,7 @@ class LLM:
# generation configuration
print("Instance llm generation config")
# JSONFormer doesn't yet support generation configs, but keeping for future usage
model.config.max_new_tokens = LLM_MAX_NEW_TOKENS
gen_cfg = GenerationConfig.from_model_config(model.config)
gen_cfg.max_new_tokens = LLM_MAX_NEW_TOKENS
@@ -107,16 +111,30 @@ class LLM:
return {"status": "ok"}
@method()
def generate(self, prompt: str):
def generate(self, prompt: str, schema: str = None):
print(f"Generate {prompt=}")
# tokenize prompt
input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(
self.model.device
)
output = self.model.generate(input_ids, generation_config=self.gen_cfg)
# If a schema is given, conform to schema
if schema:
print(f"Schema {schema=}")
import jsonformer
# decode output
response = self.tokenizer.decode(output[0].cpu(), skip_special_tokens=True)
jsonformer_llm = jsonformer.Jsonformer(model=self.model,
tokenizer=self.tokenizer,
json_schema=json.loads(schema),
prompt=prompt,
max_string_token_length=self.gen_cfg.max_new_tokens)
response = jsonformer_llm()
else:
# If no schema, perform prompt only generation
# tokenize prompt
input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(
self.model.device
)
output = self.model.generate(input_ids, generation_config=self.gen_cfg)
# decode output
response = self.tokenizer.decode(output[0].cpu(), skip_special_tokens=True)
print(f"Generated {response=}")
return {"text": response}
@@ -135,7 +153,7 @@ class LLM:
)
@asgi_app()
def web():
from fastapi import FastAPI, HTTPException, status, Depends
from fastapi import Depends, FastAPI, HTTPException, status
from fastapi.security import OAuth2PasswordBearer
from pydantic import BaseModel
@@ -154,12 +172,16 @@ def web():
class LLMRequest(BaseModel):
prompt: str
schema: Optional[dict] = None
@app.post("/llm", dependencies=[Depends(apikey_auth)])
async def llm(
req: LLMRequest,
):
func = llmstub.generate.spawn(prompt=req.prompt)
if req.schema:
func = llmstub.generate.spawn(prompt=req.prompt, schema=json.dumps(req.schema))
else:
func = llmstub.generate.spawn(prompt=req.prompt)
result = func.get()
return result
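
For reference, a hedged sketch of how a client could call the updated endpoint; the URL and API key below are placeholders, only the request/response shape comes from the endpoint code above:

import httpx

payload = {
    "prompt": "### Human:\nCreate a JSON object...\n### Assistant:",
    "schema": {
        "type": "object",
        "properties": {"title": {"type": "string"}, "summary": {"type": "string"}},
    },
}
response = httpx.post(
    "https://example.modal.run/llm",  # placeholder for the deployed reflector-gpu-modal URL
    headers={"Authorization": "Bearer YOUR_API_KEY"},  # placeholder bearer key
    json=payload,
    timeout=300,
)
response.raise_for_status()
print(response.json()["text"])  # a schema-conforming dict when "schema" was sent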


@@ -1,10 +1,11 @@
from reflector.settings import settings
from reflector.utils.retry import retry
from reflector.logger import logger as reflector_logger
from time import monotonic
import importlib
import json
import re
from time import monotonic
from reflector.logger import logger as reflector_logger
from reflector.settings import settings
from reflector.utils.retry import retry
class LLM:
@@ -20,7 +21,7 @@ class LLM:
Return an instance depending on the settings.
Settings used:
- `LLM_BACKEND`: key of the backend, defaults to `oobagooda`
- `LLM_BACKEND`: key of the backend, defaults to `oobabooga`
- `LLM_URL`: url of the backend
"""
if name is None:
@@ -44,10 +45,16 @@ class LLM:
async def _warmup(self, logger: reflector_logger):
pass
async def generate(self, prompt: str, logger: reflector_logger, **kwargs) -> dict:
async def generate(
self,
prompt: str,
logger: reflector_logger,
schema: dict | None = None,
**kwargs,
) -> dict:
logger.info("LLM generate", prompt=repr(prompt))
try:
result = await retry(self._generate)(prompt=prompt, **kwargs)
result = await retry(self._generate)(prompt=prompt, schema=schema, **kwargs)
except Exception:
logger.exception("Failed to call llm after retrying")
raise
@@ -59,7 +66,7 @@ class LLM:
return result
async def _generate(self, prompt: str, **kwargs) -> str:
async def _generate(self, prompt: str, schema: dict | None, **kwargs) -> str:
raise NotImplementedError
def _parse_json(self, result: str) -> dict:


@@ -1,7 +1,7 @@
import httpx
from reflector.llm.base import LLM
from reflector.settings import settings
from reflector.utils.retry import retry
import httpx
class BananaLLM(LLM):
@@ -13,18 +13,22 @@ class BananaLLM(LLM):
"X-Banana-Model-Key": settings.LLM_BANANA_MODEL_KEY,
}
async def _generate(self, prompt: str, **kwargs):
async def _generate(self, prompt: str, schema: dict | None, **kwargs):
json_payload = {"prompt": prompt}
if schema:
json_payload["schema"] = schema
async with httpx.AsyncClient() as client:
response = await retry(client.post)(
settings.LLM_URL,
headers=self.headers,
json={"prompt": prompt},
json=json_payload,
timeout=self.timeout,
retry_timeout=300, # as per their sdk
)
response.raise_for_status()
text = response.json()["text"]
text = text[len(prompt) :] # remove prompt
if not schema:
text = text[len(prompt) :]
return text


@@ -1,7 +1,7 @@
import httpx
from reflector.llm.base import LLM
from reflector.settings import settings
from reflector.utils.retry import retry
import httpx
class ModalLLM(LLM):
@@ -23,18 +23,22 @@ class ModalLLM(LLM):
)
response.raise_for_status()
async def _generate(self, prompt: str, **kwargs):
async def _generate(self, prompt: str, schema: dict | None, **kwargs):
json_payload = {"prompt": prompt}
if schema:
json_payload["schema"] = schema
async with httpx.AsyncClient() as client:
response = await retry(client.post)(
self.llm_url,
headers=self.headers,
json={"prompt": prompt},
json=json_payload,
timeout=self.timeout,
retry_timeout=60 * 5,
)
response.raise_for_status()
text = response.json()["text"]
text = text[len(prompt) :] # remove prompt
if not schema:
text = text[len(prompt) :]
return text
@@ -48,6 +52,14 @@ if __name__ == "__main__":
result = await llm.generate("Hello, my name is", logger=logger)
print(result)
schema = {
"type": "object",
"properties": {"name": {"type": "string"}},
}
result = await llm.generate("Hello, my name is", schema=schema, logger=logger)
print(result)
import asyncio
asyncio.run(main())


@@ -1,18 +1,21 @@
import httpx
from reflector.llm.base import LLM
from reflector.settings import settings
import httpx
class OobagoodaLLM(LLM):
async def _generate(self, prompt: str, **kwargs):
class OobaboogaLLM(LLM):
async def _generate(self, prompt: str, schema: dict | None, **kwargs):
json_payload = {"prompt": prompt}
if schema:
json_payload["schema"] = schema
async with httpx.AsyncClient() as client:
response = await client.post(
settings.LLM_URL,
headers={"Content-Type": "application/json"},
json={"prompt": prompt},
json=json_payload,
)
response.raise_for_status()
return response.json()
LLM.register("oobagooda", OobagoodaLLM)
LLM.register("oobabooga", OobaboogaLLM)


@@ -1,7 +1,7 @@
import httpx
from reflector.llm.base import LLM
from reflector.logger import logger
from reflector.settings import settings
import httpx
class OpenAILLM(LLM):
@@ -15,7 +15,7 @@ class OpenAILLM(LLM):
self.max_tokens = settings.LLM_MAX_TOKENS
logger.info(f"LLM use openai backend at {self.openai_url}")
async def _generate(self, prompt: str, **kwargs) -> str:
async def _generate(self, prompt: str, schema: dict | None, **kwargs) -> str:
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {self.openai_key}",


@@ -1,7 +1,7 @@
from reflector.processors.base import Processor
from reflector.processors.types import Transcript, TitleSummary
from reflector.utils.retry import retry
from reflector.llm import LLM
from reflector.processors.base import Processor
from reflector.processors.types import TitleSummary, Transcript
from reflector.utils.retry import retry
class TranscriptTopicDetectorProcessor(Processor):
@@ -15,9 +15,11 @@ class TranscriptTopicDetectorProcessor(Processor):
PROMPT = """
### Human:
Create a JSON object as response.The JSON object must have 2 fields:
i) title and ii) summary.For the title field,generate a short title
for the given text. For the summary field, summarize the given text
in three sentences.
i) title and ii) summary.
For the title field, generate a short title for the given text.
For the summary field, summarize the given text in a maximum of
three sentences.
{input_text}
@@ -30,6 +32,13 @@ class TranscriptTopicDetectorProcessor(Processor):
self.transcript = None
self.min_transcript_length = min_transcript_length
self.llm = LLM.get_instance()
self.topic_detector_schema = {
"type": "object",
"properties": {
"title": {"type": "string"},
"summary": {"type": "string"},
},
}
async def _warmup(self):
await self.llm.warmup(logger=self.logger)
@@ -52,7 +61,9 @@ class TranscriptTopicDetectorProcessor(Processor):
text = self.transcript.text
self.logger.info(f"Topic detector got {len(text)} length transcript")
prompt = self.PROMPT.format(input_text=text)
result = await retry(self.llm.generate)(prompt=prompt, logger=self.logger)
result = await retry(self.llm.generate)(
prompt=prompt, schema=self.topic_detector_schema, logger=self.logger
)
summary = TitleSummary(
title=result["title"],
summary=result["summary"],


@@ -55,8 +55,8 @@ class Settings(BaseSettings):
TRANSCRIPT_STORAGE_AWS_SECRET_ACCESS_KEY: str | None = None
# LLM
# available backend: openai, banana, modal, oobagooda
LLM_BACKEND: str = "oobagooda"
# available backend: openai, banana, modal, oobabooga
LLM_BACKEND: str = "oobabooga"
# LLM common configuration
LLM_URL: str | None = None


@@ -15,7 +15,7 @@ async def test_basic_process(event_loop):
settings.TRANSCRIPT_BACKEND = "whisper"
class LLMTest(LLM):
async def _generate(self, prompt: str, **kwargs) -> str:
async def _generate(self, prompt: str, schema: dict | None, **kwargs) -> str:
return {
"title": "TITLE",
"summary": "SUMMARY",


@@ -3,17 +3,17 @@
# FIXME test websocket connection after RTC is finished still send the full events
# FIXME try with locked session, RTC should not work
import pytest
import asyncio
import json
import threading
from pathlib import Path
from unittest.mock import patch
from httpx import AsyncClient
import pytest
from httpx import AsyncClient
from httpx_ws import aconnect_ws
from reflector.app import app
from uvicorn import Config, Server
import threading
import asyncio
from pathlib import Path
from httpx_ws import aconnect_ws
class ThreadedUvicorn:
@@ -61,7 +61,7 @@ async def dummy_llm():
from reflector.llm.base import LLM
class TestLLM(LLM):
async def _generate(self, prompt: str, **kwargs):
async def _generate(self, prompt: str, schema: dict | None, **kwargs):
return json.dumps({"title": "LLM TITLE", "summary": "LLM SUMMARY"})
with patch("reflector.llm.base.LLM.get_instance") as mock_llm: