Mirror of https://github.com/nomic-ai/gpt4all.git, synced 2025-06-19 12:14:20 +00:00
GPU Inference Server (#1112)
* feat: local inference server
* fix: source to use bash + vars
* chore: isort and black
* fix: make file + inference mode
* chore: logging
* refactor: remove old links
* fix: add new env vars
* feat: hf inference server
* refactor: remove old links
* test: batch and single response
* chore: black + isort
* separate gpu and cpu dockerfiles
* moved gpu to separate dockerfile
* Fixed test endpoints
* Edits to API. server won't start due to failed instantiation error
* Method signature
* fix: gpu_infer
* tests: fix tests

Co-authored-by: Andriy Mulyar <andriy.mulyar@gmail.com>
This commit is contained in: parent 58f0fcab57, commit 8aba2c9009
gpt4all-api/.isort.cfg (new file, +7)
@@ -0,0 +1,7 @@
+[settings]
+known_third_party=geopy,nltk,np,numpy,pandas,pysbd,fire,torch
+
+line_length=120
+include_trailing_comma=True
+multi_line_output=3
+use_parentheses=True
@@ -17,6 +17,18 @@ Then, start the backend with:
 docker compose up --build
 ```
 
+This will run both the API and locally hosted GPU inference server. If you want to run the API without the GPU inference server, you can run:
+
+```bash
+docker compose up --build gpt4all_api
+```
+
+To run the API with the GPU inference server, you will need to include environment variables (like the `MODEL_ID`). Edit the `.env` file and run
+
+```bash
+docker compose --env-file .env up --build
+```
+
 #### Spinning up your app
 
 Run `docker compose up` to spin up the backend. Monitor the logs for errors in-case you forgot to set an environment variable above.
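For reference, a minimal sketch of what such a `.env` file might contain — the variable names are the ones referenced by docker-compose.gpu.yaml below; the values are placeholders and are not part of this commit:

```bash
# .env (illustrative only)
HUGGING_FACE_HUB_TOKEN=hf_xxx   # only needed for gated/private models
MODEL_ID=your-org/your-model    # Hugging Face model id served by text-generation-inference
NUM_SHARD=1                     # number of GPU shards to split the model across
```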
gpt4all-api/docker-compose.gpu.yaml (new file, +24)
@@ -0,0 +1,24 @@
+version: "3.8"
+
+services:
+  gpt4all_gpu:
+    image: ghcr.io/huggingface/text-generation-inference
+    container_name: gpt4all_gpu
+    restart: always #restart on error (usually code compilation from save during bad state)
+    environment:
+      - HUGGING_FACE_HUB_TOKEN=token
+      - USE_FLASH_ATTENTION=false
+      - MODEL_ID=''
+      - NUM_SHARD=1
+    command: --model-id $MODEL_ID --num-shard $NUM_SHARD
+    volumes:
+      - ./:/data
+    ports:
+      - "8080:80"
+    shm_size: 1g
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              capabilities: [gpu]
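Once this container is up, it can be sanity-checked directly on the mapped port. A hedged sketch, assuming the default 8080:80 mapping above; the payload shape (`inputs` plus `parameters` such as `max_new_tokens`) mirrors what the API's `gpu_infer` helper sends to the `/generate` route:

```bash
curl -s http://localhost:8080/generate \
  -H 'Content-Type: application/json' \
  -d '{"inputs": "Who is Michael Jordan?", "parameters": {"max_new_tokens": 50, "temperature": 0.28}}'
```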
@@ -1,4 +1,4 @@
-version: "3.5"
+version: "3.8"
 
 services:
   gpt4all_api:
@@ -13,6 +13,7 @@ services:
       - LOGLEVEL=debug
       - PORT=4891
      - model=ggml-mpt-7b-chat.bin
+      - inference_mode=cpu
     volumes:
       - './gpt4all_api/app:/app'
     command: ["/start-reload.sh"]
@@ -1,8 +1,10 @@
 import logging
 
+from api_v1.settings import settings
 from fastapi import HTTPException
 from fastapi.responses import JSONResponse
 from starlette.requests import Request
-from api_v1.settings import settings
 
 log = logging.getLogger(__name__)
+
+
@@ -19,8 +21,9 @@ async def on_startup(app):
     startup_msg = startup_msg_fmt.format(settings=settings)
     log.info(startup_msg)
 
+
 def startup_event_handler(app):
     async def start_app() -> None:
         await on_startup(app)
 
     return start_app
@@ -1,9 +1,10 @@
-from fastapi import APIRouter, Depends, Response, Security, status
-from pydantic import BaseModel, Field
-from typing import List, Dict
 import logging
 import time
+from typing import Dict, List
+
 from api_v1.settings import settings
+from fastapi import APIRouter, Depends, Response, Security, status
+from pydantic import BaseModel, Field
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
@@ -11,11 +12,11 @@ logger.setLevel(logging.DEBUG)
 ### This should follow https://github.com/openai/openai-openapi/blob/master/openapi.yaml
 
 
 class ChatCompletionMessage(BaseModel):
     role: str
     content: str
 
 
 class ChatCompletionRequest(BaseModel):
     model: str = Field(..., description='The model to generate a completion from.')
     messages: List[ChatCompletionMessage] = Field(..., description='The model to generate a completion from.')
@@ -26,11 +27,13 @@ class ChatCompletionChoice(BaseModel):
     index: int
     finish_reason: str
 
+
 class ChatCompletionUsage(BaseModel):
     prompt_tokens: int
     completion_tokens: int
     total_tokens: int
 
+
 class ChatCompletionResponse(BaseModel):
     id: str
     object: str = 'text_completion'
@@ -42,6 +45,7 @@ class ChatCompletionResponse(BaseModel):
 
 router = APIRouter(prefix="/chat", tags=["Completions Endpoints"])
 
+
 @router.post("/completions", response_model=ChatCompletionResponse)
 async def chat_completion(request: ChatCompletionRequest):
     '''
@@ -53,11 +57,5 @@ async def chat_completion(request: ChatCompletionRequest):
         created=time.time(),
         model=request.model,
         choices=[{}],
-        usage={
-            'prompt_tokens': 0,
-            'completion_tokens': 0,
-            'total_tokens': 0
-        }
+        usage={'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0},
     )
@@ -1,14 +1,16 @@
 import json
-
-from fastapi import APIRouter, Depends, Response, Security, status
-from fastapi.responses import StreamingResponse
-from pydantic import BaseModel, Field
 from typing import List, Dict, Iterable, AsyncIterable
 import logging
-from uuid import uuid4
-from api_v1.settings import settings
-from gpt4all import GPT4All
 import time
+from typing import Dict, List, Union
+from uuid import uuid4
+import aiohttp
+import asyncio
+from api_v1.settings import settings
+from fastapi import APIRouter, Depends, Response, Security, status, HTTPException
+from fastapi.responses import StreamingResponse
+from gpt4all import GPT4All
+from pydantic import BaseModel, Field
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
@@ -16,14 +18,17 @@ logger.setLevel(logging.DEBUG)
 
 ### This should follow https://github.com/openai/openai-openapi/blob/master/openapi.yaml
 
+
 class CompletionRequest(BaseModel):
-    model: str = Field(..., description='The model to generate a completion from.')
-    prompt: str = Field(..., description='The prompt to begin completing from.')
-    max_tokens: int = Field(7, description='Max tokens to generate')
-    temperature: float = Field(0, description='Model temperature')
-    top_p: float = Field(1.0, description='top_p')
-    n: int = Field(1, description='')
+    model: str = Field(settings.model, description='The model to generate a completion from.')
+    prompt: Union[List[str], str] = Field(..., description='The prompt to begin completing from.')
+    max_tokens: int = Field(None, description='Max tokens to generate')
+    temperature: float = Field(settings.temp, description='Model temperature')
+    top_p: float = Field(settings.top_k, description='top_p')
+    top_k: int = Field(settings.top_k, description='top_k')
+    n: int = Field(1, description='How many completions to generate for each prompt')
     stream: bool = Field(False, description='Stream responses')
+    repeat_penalty: float = Field(settings.repeat_penalty, description='Repeat penalty')
 
 
 class CompletionChoice(BaseModel):
@@ -58,7 +63,6 @@ class CompletionStreamResponse(BaseModel):
 
 router = APIRouter(prefix="/completions", tags=["Completion Endpoints"])
 
-
 def stream_completion(output: Iterable, base_response: CompletionStreamResponse):
     """
     Streams a GPT4All output to the client.
@@ -80,6 +84,27 @@ def stream_completion(output: Iterable, base_response: CompletionStreamResponse):
         ))]
         yield f"data: {json.dumps(dict(chunk))}\n\n"
 
+
+async def gpu_infer(payload, header):
+    async with aiohttp.ClientSession() as session:
+        try:
+            async with session.post(
+                settings.hf_inference_server_host, headers=header, data=json.dumps(payload)
+            ) as response:
+                resp = await response.json()
+                return resp
+
+        except aiohttp.ClientError as e:
+            # Handle client-side errors (e.g., connection error, invalid URL)
+            logger.error(f"Client error: {e}")
+        except aiohttp.ServerError as e:
+            # Handle server-side errors (e.g., internal server error)
+            logger.error(f"Server error: {e}")
+        except json.JSONDecodeError as e:
+            # Handle JSON decoding errors
+            logger.error(f"JSON decoding error: {e}")
+        except Exception as e:
+            # Handle other unexpected exceptions
+            logger.error(f"Unexpected error: {e}")
+
 
 @router.post("/", response_model=CompletionResponse)
 async def completions(request: CompletionRequest):
@@ -87,42 +112,104 @@ async def completions(request: CompletionRequest):
     Completes a GPT4All model response.
     '''
 
-    model = GPT4All(model_name=settings.model, model_path=settings.gpt4all_path)
-
-    output = model.generate(prompt=request.prompt,
-                            n_predict=request.max_tokens,
-                            streaming=request.stream,
-                            top_k=20,
-                            top_p=request.top_p,
-                            temp=request.temperature,
-                            n_batch=1024,
-                            repeat_penalty=1.2,
-                            repeat_last_n=10)
-
-    # If streaming, we need to return a StreamingResponse
-    if request.stream:
-        base_chunk = CompletionStreamResponse(
-            id=str(uuid4()),
-            created=time.time(),
-            model=request.model,
-            choices=[]
-        )
-        return StreamingResponse((response for response in stream_completion(output, base_chunk)),
-                                 media_type="text/event-stream")
-    else:
-        return CompletionResponse(
-            id=str(uuid4()),
-            created=time.time(),
-            model=request.model,
-            choices=[dict(CompletionChoice(
-                text=output,
-                index=0,
-                logprobs=-1,
-                finish_reason='stop'
-            ))],
-            usage={
-                'prompt_tokens': 0, #TODO how to compute this?
-                'completion_tokens': 0,
-                'total_tokens': 0
-            }
-        )
+    if request.model != settings.model:
+        raise HTTPException(status_code=400, detail=f"The GPT4All inference server is booted to only infer: `{settings.model}`")
+
+    if settings.inference_mode == "gpu":
+        params = request.dict(exclude={'model', 'prompt', 'max_tokens', 'n'})
+        params["max_new_tokens"] = request.max_tokens
+        params["num_return_sequences"] = request.n
+
+        header = {"Content-Type": "application/json"}
+        payload = {"parameters": params}
+        if isinstance(request.prompt, list):
+            tasks = []
+            for prompt in request.prompt:
+                payload["inputs"] = prompt
+                task = gpu_infer(payload, header)
+                tasks.append(task)
+
+            results = await asyncio.gather(*tasks)
+
+            choices = []
+            for response in results:
+                scores = response["scores"] if "scores" in response else -1.0
+                choices.append(
+                    dict(
+                        CompletionChoice(
+                            text=response["generated_text"], index=0, logprobs=scores, finish_reason='stop'
+                        )
+                    )
+                )
+
+            return CompletionResponse(
+                id=str(uuid4()),
+                created=time.time(),
+                model=request.model,
+                choices=choices,
+                usage={'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0},
+            )
+
+        else:
+            # If streaming, we need to return a StreamingResponse
+            payload["inputs"] = request.prompt
+
+            resp = await gpu_infer(payload, header)
+
+            output = resp["generated_text"]
+            # this returns all logprobs
+            scores = resp["scores"] if "scores" in resp else -1.0
+
+            return CompletionResponse(
+                id=str(uuid4()),
+                created=time.time(),
+                model=request.model,
+                choices=[dict(CompletionChoice(text=output, index=0, logprobs=scores, finish_reason='stop'))],
+                usage={'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0},
+            )
+
+    else:
+        if isinstance(request.prompt, list):
+            if len(request.prompt) > 1:
+                raise HTTPException(status_code=400, detail="Can only infer one inference per request in CPU mode.")
+            else:
+                request.prompt = request.prompt[0]
+
+        model = GPT4All(model_name=settings.model, model_path=settings.gpt4all_path)
+
+        output = model.generate(prompt=request.prompt,
+                                max_tokens=request.max_tokens,
+                                streaming=request.stream,
+                                top_k=request.top_k,
+                                top_p=request.top_p,
+                                temp=request.temperature,
+                                )
+
+        # If streaming, we need to return a StreamingResponse
+        if request.stream:
+            base_chunk = CompletionStreamResponse(
+                id=str(uuid4()),
+                created=time.time(),
+                model=request.model,
+                choices=[]
+            )
+            return StreamingResponse((response for response in stream_completion(output, base_chunk)),
+                                     media_type="text/event-stream")
+        else:
+            return CompletionResponse(
+                id=str(uuid4()),
+                created=time.time(),
+                model=request.model,
+                choices=[dict(CompletionChoice(
+                    text=output,
+                    index=0,
+                    logprobs=-1,
+                    finish_reason='stop'
+                ))],
+                usage={
+                    'prompt_tokens': 0,  # TODO how to compute this?
+                    'completion_tokens': 0,
+                    'total_tokens': 0
+                }
+            )
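As a usage sketch (not part of the commit), the reworked completions route can also be exercised with plain HTTP. The port and `/v1` prefix come from docker-compose.yaml and main.py; per the code above, a list-valued `prompt` with more than one entry is only accepted when `inference_mode` is `gpu`:

```bash
curl -s http://localhost:4891/v1/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "ggml-mpt-7b-chat.bin", "prompt": "Who is Michael Jordan?", "max_tokens": 50, "temperature": 0.28}'
```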
@@ -1,22 +1,27 @@
+import logging
+from typing import Dict, List
+
+from api_v1.settings import settings
 from fastapi import APIRouter, Depends, Response, Security, status
 from pydantic import BaseModel, Field
-from typing import List, Dict
-import logging
-from api_v1.settings import settings
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
 
 ### This should follow https://github.com/openai/openai-openapi/blob/master/openapi.yaml
 
+
 class ListEnginesResponse(BaseModel):
     data: List[Dict] = Field(..., description="All available models.")
 
+
 class EngineResponse(BaseModel):
     data: List[Dict] = Field(..., description="All available models.")
 
+
 router = APIRouter(prefix="/engines", tags=["Search Endpoints"])
 
+
 @router.get("/", response_model=ListEnginesResponse)
 async def list_engines():
     '''
@@ -29,10 +34,7 @@ async def list_engines():
 
 @router.get("/{engine_id}", response_model=EngineResponse)
 async def retrieve_engine(engine_id: str):
-    '''
-
-    '''
-
+    ''' '''
     raise NotImplementedError()
     return EngineResponse()
 
@@ -1,6 +1,7 @@
 import logging
+
 from fastapi import APIRouter
 from fastapi.responses import JSONResponse
 
 log = logging.getLogger(__name__)
 
 router = APIRouter(prefix="/health", tags=["Health"])
@@ -5,6 +5,14 @@ class Settings(BaseSettings):
     app_environment = 'dev'
     model: str = 'ggml-mpt-7b-chat.bin'
     gpt4all_path: str = '/models'
+    inference_mode: str = "cpu"
+    hf_inference_server_host: str = "http://gpt4all_gpu:80/generate"
+
+    temp: float = 0.18
+    top_p: float = 1.0
+    top_k: int = 50
+    repeat_penalty: float = 1.18
+
 
 
 settings = Settings()
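Because `Settings` is a pydantic `BaseSettings` class, each field above can presumably be overridden from the environment, which is how docker-compose.yaml already injects `inference_mode`. An illustrative sketch of the overrides a GPU deployment might export (names follow the fields above; values are examples only):

```bash
export inference_mode=gpu
export hf_inference_server_host="http://gpt4all_gpu:80/generate"
export model=ggml-mpt-7b-chat.bin
```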
@@ -1,19 +1,19 @@
-import os
-import docs
 import logging
-from fastapi import FastAPI, HTTPException, Request
-from starlette.middleware.cors import CORSMiddleware
-from fastapi.logger import logger as fastapi_logger
-from api_v1.settings import settings
-from api_v1.api import router as v1_router
-from api_v1 import events
 import os
+
+import docs
+from api_v1 import events
+from api_v1.api import router as v1_router
+from api_v1.settings import settings
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.logger import logger as fastapi_logger
+from starlette.middleware.cors import CORSMiddleware
 
 logger = logging.getLogger(__name__)
 
 app = FastAPI(title='GPT4All API', description=docs.desc)
 
-#CORS Configuration (in-case you want to deploy)
+# CORS Configuration (in-case you want to deploy)
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -29,14 +29,23 @@ app.include_router(v1_router, prefix='/v1')
 app.add_event_handler('startup', events.startup_event_handler(app))
 app.add_exception_handler(HTTPException, events.on_http_error)
 
+
 @app.on_event("startup")
 async def startup():
     global model
-    logger.info(f"Downloading/fetching model: {os.path.join(settings.gpt4all_path, settings.model)}")
-    from gpt4all import GPT4All
-    model = GPT4All(model_name=settings.model, model_path=settings.gpt4all_path)
+    if settings.inference_mode == "cpu":
+        logger.info(f"Downloading/fetching model: {os.path.join(settings.gpt4all_path, settings.model)}")
+        from gpt4all import GPT4All
 
-    logger.info("GPT4All API is ready.")
+        model = GPT4All(model_name=settings.model, model_path=settings.gpt4all_path)
+
+        logger.info(f"GPT4All API is ready to infer from {settings.model} on CPU.")
+
+    else:
+        # is it possible to do this once the server is up?
+        ## TODO block until HF inference server is up.
+        logger.info(f"GPT4All API is ready to infer from {settings.model} on CPU.")
+
 
 @app.on_event("shutdown")
 async def shutdown():
@@ -57,5 +66,7 @@ if "gunicorn" in os.environ.get("SERVER_SOFTWARE", ""):
     uvicorn_logger.handlers = gunicorn_error_logger.handlers
 else:
     # https://github.com/tiangolo/fastapi/issues/2019
-    LOG_FORMAT2 = "[%(asctime)s %(process)d:%(threadName)s] %(name)s - %(levelname)s - %(message)s | %(filename)s:%(lineno)d"
+    LOG_FORMAT2 = (
+        "[%(asctime)s %(process)d:%(threadName)s] %(name)s - %(levelname)s - %(message)s | %(filename)s:%(lineno)d"
+    )
     logging.basicConfig(level=logging.INFO, format=LOG_FORMAT2)
@@ -2,30 +2,22 @@
 Use the OpenAI python API to test gpt4all models.
 """
 import openai
 
 openai.api_base = "http://localhost:4891/v1"
 
 openai.api_key = "not needed for a local LLM"
 
 
 def test_completion():
-    model = "gpt4all-j-v1.3-groovy"
+    model = "ggml-mpt-7b-chat.bin"
     prompt = "Who is Michael Jordan?"
     response = openai.Completion.create(
-        model=model,
-        prompt=prompt,
-        max_tokens=50,
-        temperature=0.28,
-        top_p=0.95,
-        n=1,
-        echo=True,
-        stream=False
+        model=model, prompt=prompt, max_tokens=50, temperature=0.28, top_p=0.95, n=1, echo=True, stream=False
     )
     assert len(response['choices'][0]['text']) > len(prompt)
-    print(response)
 
 
 def test_streaming_completion():
-    model = "gpt4all-j-v1.3-groovy"
+    model = "ggml-mpt-7b-chat.bin"
     prompt = "Who is Michael Jordan?"
     tokens = []
     for resp in openai.Completion.create(
@@ -42,10 +34,12 @@ def test_streaming_completion():
     assert (len(tokens) > 0)
     assert (len("".join(tokens)) > len(prompt))
 
-# def test_chat_completions():
-#     model = "gpt4all-j-v1.3-groovy"
-#     prompt = "Who is Michael Jordan?"
-#     response = openai.ChatCompletion.create(
-#         model=model,
-#         messages=[]
-#     )
+
+def test_batched_completion():
+    model = "ggml-mpt-7b-chat.bin"
+    prompt = "Who is Michael Jordan?"
+    response = openai.Completion.create(
+        model=model, prompt=[prompt] * 3, max_tokens=50, temperature=0.28, top_p=0.95, n=1, echo=True, stream=False
+    )
+    assert len(response['choices'][0]['text']) > len(prompt)
+    assert len(response['choices']) == 3
@@ -1,10 +1,12 @@
 aiohttp>=3.6.2
 aiofiles
-pydantic>=1.4.0
+pydantic>=1.4.0,<2.0.0
 requests>=2.24.0
 ujson>=2.0.2
 fastapi>=0.95.0
 Jinja2>=3.0
-gpt4all==1.0.1
+gpt4all>=1.0.0
 pytest
 openai
+black
+isort
@@ -1,22 +1,26 @@
 ROOT_DIR:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
 APP_NAME:=gpt4all_api
 PYTHON:=python3.8
+SHELL := /bin/bash
 
 all: dependencies
 
 fresh: clean dependencies
 
 testenv: clean_testenv test_build
-	docker compose up --build
+	docker compose -f docker-compose.yaml up --build
+
+testenv_gpu: clean_testenv test_build
+	docker compose -f docker-compose.yaml -f docker-compose.gpu.yaml up --build
 
 testenv_d: clean_testenv test_build
 	docker compose up --build -d
 
 test:
-	docker compose exec gpt4all_api pytest -svv --disable-warnings -p no:cacheprovider /app/tests
+	docker compose exec $(APP_NAME) pytest -svv --disable-warnings -p no:cacheprovider /app/tests
 
 test_build:
-	DOCKER_BUILDKIT=1 docker build -t gpt4all_api --progress plain -f gpt4all_api/Dockerfile.buildkit .
+	DOCKER_BUILDKIT=1 docker build -t $(APP_NAME) --progress plain -f $(APP_NAME)/Dockerfile.buildkit .
 
 clean_testenv:
 	docker compose down -v
@@ -27,7 +31,7 @@ venv:
 	if [ ! -d $(ROOT_DIR)/env ]; then $(PYTHON) -m venv $(ROOT_DIR)/env; fi
 
 dependencies: venv
-	source $(ROOT_DIR)/env/bin/activate; yes w | python -m pip install -r $(ROOT_DIR)/gpt4all_api/requirements.txt
+	source $(ROOT_DIR)/env/bin/activate; $(PYTHON) -m pip install -r $(ROOT_DIR)/$(APP_NAME)/requirements.txt
 
 clean: clean_testenv
 	# Remove existing environment
@@ -35,3 +39,8 @@ clean: clean_testenv
 	rm -rf $(ROOT_DIR)/$(APP_NAME)/*.pyc;
 
 
+black:
+	source $(ROOT_DIR)/env/bin/activate; black -l 120 -S --target-version py38 $(APP_NAME)
+
+isort:
+	source $(ROOT_DIR)/env/bin/activate; isort --ignore-whitespace --atomic -w 120 $(APP_NAME)
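Taken together, the new targets suggest a rough local workflow; this is only a sketch and assumes a populated `.env` and an NVIDIA container runtime available to Docker:

```bash
make testenv_gpu   # build and start the API together with the text-generation-inference container
make test          # run the pytest suite inside the gpt4all_api container
make black isort   # apply the formatting and import ordering used in this commit
```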