Merge pull request #137 from Monadical-SAS/serverless-gpu-modal

server: implement modal backend for llm and transcription
2026-05-13 22:45:18 +00:00 · 2023-08-11 16:19:01 +02:00
parent a822c9d482 802f2c248e
commit 77d07c8178
11 changed files with 204 additions and 4 deletions
--- a/server/reflector/llm/base.py
+++ b/server/reflector/llm/base.py
@@ -1,6 +1,7 @@
 from reflector.settings import settings
 from reflector.utils.retry import retry
 from reflector.logger import logger as reflector_logger
+from time import monotonic
 import importlib
 import json
 import re
@@ -29,6 +30,21 @@ class LLM:
            importlib.import_module(module_name)
        return cls._registry[name]()

+    async def warmup(self, logger: reflector_logger):
+        start = monotonic()
+        name = self.__class__.__name__
+        logger.info(f"LLM[{name}] warming up...")
+        try:
+            await self._warmup(logger=logger)
+            duration = monotonic() - start
+            logger.info(f"LLM[{name}] warmup took {duration:.2f} seconds")
+        except Exception:
+            logger.exception(f"LLM[{name}] warmup failed")
+            raise
+
+    async def _warmup(self, logger: reflector_logger):
+        pass
+
    async def generate(self, prompt: str, logger: reflector_logger, **kwargs) -> dict:
        logger.info("LLM generate", prompt=repr(prompt))
        try:
--- a/server/reflector/llm/llm_modal.py
+++ b/server/reflector/llm/llm_modal.py
@@ -0,0 +1,53 @@
+from reflector.llm.base import LLM
+from reflector.settings import settings
+from reflector.utils.retry import retry
+import httpx
+
+
+class ModalLLM(LLM):
+    def __init__(self):
+        super().__init__()
+        self.timeout = settings.LLM_TIMEOUT
+        self.llm_url = settings.LLM_URL + "/llm"
+        self.llm_warmup_url = settings.LLM_URL + "/warmup"
+        self.headers = {
+            "Authorization": f"Bearer {settings.LLM_MODAL_API_KEY}",
+        }
+
+    async def _warmup(self, logger):
+        async with httpx.AsyncClient() as client:
+            response = await client.post(
+                self.llm_warmup_url,
+                headers=self.headers,
+                timeout=60**5,
+            )
+            response.raise_for_status()
+
+    async def _generate(self, prompt: str, **kwargs):
+        async with httpx.AsyncClient() as client:
+            response = await retry(client.post)(
+                self.llm_url,
+                headers=self.headers,
+                json={"prompt": prompt},
+                timeout=self.timeout,
+                retry_timeout=60 * 5,
+            )
+            response.raise_for_status()
+            text = response.json()["text"]
+            text = text[len(prompt) :]  # remove prompt
+            return text
+
+
+LLM.register("modal", ModalLLM)
+
+if __name__ == "__main__":
+    from reflector.logger import logger
+
+    async def main():
+        llm = ModalLLM()
+        result = await llm.generate("Hello, my name is", logger=logger)
+        print(result)
+
+    import asyncio
+
+    asyncio.run(main())
--- a/server/reflector/processors/audio_transcript_auto.py
+++ b/server/reflector/processors/audio_transcript_auto.py
@@ -47,6 +47,9 @@ class AudioTranscriptAutoProcessor(AudioTranscriptProcessor):
    def off(self, callback):
        self.processor.off(callback)

+    async def _warmup(self):
+        return await self.processor._warmup()
+
    async def _push(self, data: AudioFile):
        return await self.processor._push(data)

--- a/server/reflector/processors/audio_transcript_modal.py
+++ b/server/reflector/processors/audio_transcript_modal.py
@@ -0,0 +1,83 @@
+"""
+Implementation using the GPU service from modal.com
+
+API will be a POST request to TRANSCRIPT_URL:
+
+```form
+"timestamp": 123.456
+"language": "en"
+"file": <audio file>
+```
+
+"""
+
+from reflector.processors.audio_transcript import AudioTranscriptProcessor
+from reflector.processors.audio_transcript_auto import AudioTranscriptAutoProcessor
+from reflector.processors.types import AudioFile, Transcript, Word
+from reflector.settings import settings
+from reflector.utils.retry import retry
+from time import monotonic
+import httpx
+
+
+class AudioTranscriptModalProcessor(AudioTranscriptProcessor):
+    def __init__(self, modal_api_key: str):
+        super().__init__()
+        self.transcript_url = settings.TRANSCRIPT_URL + "/transcribe"
+        self.warmup_url = settings.TRANSCRIPT_URL + "/warmup"
+        self.timeout = settings.TRANSCRIPT_TIMEOUT
+        self.headers = {
+            "Authorization": f"Bearer {modal_api_key}",
+        }
+
+    async def _warmup(self):
+        try:
+            async with httpx.AsyncClient() as client:
+                start = monotonic()
+                self.logger.debug("Transcribe modal: warming up...")
+                response = await client.post(
+                    self.warmup_url,
+                    headers=self.headers,
+                    timeout=self.timeout,
+                )
+                response.raise_for_status()
+                duration = monotonic() - start
+                self.logger.debug(f"Transcribe modal: warmup took {duration:.2f}s")
+        except Exception:
+            self.logger.exception("Transcribe modal: warmup failed")
+
+    async def _transcript(self, data: AudioFile):
+        async with httpx.AsyncClient() as client:
+            self.logger.debug(f"Try to transcribe audio {data.path.name}")
+            files = {
+                "file": (data.path.name, data.path.open("rb")),
+            }
+            response = await retry(client.post)(
+                self.transcript_url,
+                files=files,
+                timeout=self.timeout,
+                headers=self.headers,
+            )
+
+            self.logger.debug(
+                f"Transcript response: {response.status_code} {response.content}"
+            )
+            response.raise_for_status()
+            result = response.json()
+            transcript = Transcript(
+                text=result["text"],
+                words=[
+                    Word(
+                        text=word["text"],
+                        start=word["start"],
+                        end=word["end"],
+                    )
+                    for word in result["words"]
+                ],
+            )
+            transcript.add_offset(data.timestamp)
+
+        return transcript
+
+
+AudioTranscriptAutoProcessor.register("modal", AudioTranscriptModalProcessor)
--- a/server/reflector/processors/base.py
+++ b/server/reflector/processors/base.py
@@ -7,6 +7,7 @@ import asyncio
 class Processor:
    INPUT_TYPE: type = None
    OUTPUT_TYPE: type = None
+    WARMUP_EVENT: str = "WARMUP_EVENT"

    def __init__(self, callback=None, custom_logger=None):
        self._processors = []
@@ -85,12 +86,21 @@ class Processor:
    def describe(self, level=0):
        logger.info("  " * level + self.__class__.__name__)

+    async def warmup(self):
+        """
+        Warmup the processor
+        """
+        await self._warmup()
+
    async def _push(self, data):
        raise NotImplementedError

    async def _flush(self):
        pass

+    async def _warmup(self):
+        pass
+
    @classmethod
    def as_threaded(cls, *args, **kwargs):
        """
@@ -129,10 +139,17 @@ class ThreadedProcessor(Processor):
                if data is None:
                    await self.processor.flush()
                    break
+                if data == self.WARMUP_EVENT:
+                    self.logger.debug(f"Warming up {self.processor.__class__.__name__}")
+                    await self.processor.warmup()
+                    continue
                await self.processor.push(data)
            finally:
                self.queue.task_done()

+    async def _warmup(self):
+        await self.queue.put(self.WARMUP_EVENT)
+
    async def _push(self, data):
        await self.queue.put(data)

@@ -163,6 +180,7 @@ class Pipeline(Processor):
    OUTPUT_TYPE = None

    def __init__(self, *processors: Processor):
+        self._warmed_up = False
        super().__init__()
        self.logger = logger.bind(pipeline=self.uid)
        self.logger.info("Pipeline created")
@@ -178,6 +196,11 @@ class Pipeline(Processor):
        self.INPUT_TYPE = processors[0].INPUT_TYPE
        self.OUTPUT_TYPE = processors[-1].OUTPUT_TYPE

+    async def _warmup(self):
+        for processor in self.processors:
+            self.logger.debug(f"Warming up {processor.__class__.__name__}")
+            await processor.warmup()
+
    async def _push(self, data):
        await self.processors[0].push(data)

--- a/server/reflector/processors/transcript_topic_detector.py
+++ b/server/reflector/processors/transcript_topic_detector.py
@@ -31,6 +31,9 @@ class TranscriptTopicDetectorProcessor(Processor):
        self.min_transcript_length = min_transcript_length
        self.llm = LLM.get_instance()

+    async def _warmup(self):
+        await self.llm.warmup(logger=self.logger)
+
    async def _push(self, data: Transcript):
        if self.transcript is None:
            self.transcript = data
--- a/server/reflector/processors/types.py
+++ b/server/reflector/processors/types.py
@@ -49,6 +49,11 @@ class Transcript(BaseModel):
            self.words.extend(other.words)
        self.text += other.text

+    def add_offset(self, offset: float):
+        for word in self.words:
+            word.start += offset
+            word.end += offset
+
    def clone(self):
        words = [
            Word(text=word.text, start=word.start, end=word.end) for word in self.words
--- a/server/reflector/settings.py
+++ b/server/reflector/settings.py
@@ -36,6 +36,9 @@ class Settings(BaseSettings):
    TRANSCRIPT_BANANA_API_KEY: str | None = None
    TRANSCRIPT_BANANA_MODEL_KEY: str | None = None

+    # Audio transcription modal.com configuration
+    TRANSCRIPT_MODAL_API_KEY: str | None = None
+
    # Audio transcription storage
    TRANSCRIPT_STORAGE_BACKEND: str = "aws"

@@ -63,6 +66,9 @@ class Settings(BaseSettings):
    LLM_BANANA_API_KEY: str | None = None
    LLM_BANANA_MODEL_KEY: str | None = None

+    # LLM Modal configuration
+    LLM_MODAL_API_KEY: str | None = None
+
    # Sentry
    SENTRY_DSN: str | None = None

--- a/server/reflector/utils/retry.py
+++ b/server/reflector/utils/retry.py
@@ -58,12 +58,14 @@ def retry(fn):
                if result:
                    return result
            except HTTPStatusError as e:
+                logger.exception(e)
                status_code = e.response.status_code
                logger.debug(f"HTTP status {status_code} - {e}")
                if status_code in retry_httpx_status_stop:
                    message = f"HTTP status {status_code} is in retry_httpx_status_stop"
                    raise RetryHTTPException(message) from e
            except retry_ignore_exc_types as e:
+                logger.exception(e)
                last_exception = e

            logger.debug(
--- a/server/reflector/views/rtc_offer.py
+++ b/server/reflector/views/rtc_offer.py
@@ -159,6 +159,7 @@ async def rtc_offer_base(
        TranscriptTopicDetectorProcessor.as_threaded(callback=on_topic),
        TranscriptFinalSummaryProcessor.as_threaded(callback=on_final_summary),
    )
+    await ctx.pipeline.warmup()

    # handle RTC peer connection
    pc = RTCPeerConnection()
--- a/server/reflector/views/transcripts.py
+++ b/server/reflector/views/transcripts.py
@@ -1,11 +1,16 @@
-from fastapi import APIRouter, HTTPException, Request, WebSocket, WebSocketDisconnect
+from fastapi import (
+    APIRouter,
+    HTTPException,
+    Request,
+    WebSocket,
+    WebSocketDisconnect,
+)
 from pydantic import BaseModel, Field
 from uuid import UUID, uuid4
 from datetime import datetime
 from fastapi_pagination import Page, paginate
 from reflector.logger import logger
 from .rtc_offer import rtc_offer_base, RtcOffer, PipelineEvent
-import asyncio
 from typing import Optional


@@ -239,8 +244,8 @@ async def transcript_events_websocket(transcript_id: UUID, websocket: WebSocket)
    # endless loop to wait for new events
    try:
        while True:
-            await asyncio.sleep(42)
-    except WebSocketDisconnect:
+            await websocket.receive()
+    except (RuntimeError, WebSocketDisconnect):
        ws_manager.disconnect(transcript_id, websocket)