GPU Inference Server (#1112)

* feat: local inference server

* fix: source to use bash + vars

* chore: isort and black

* fix: Makefile + inference mode

* chore: logging

* refactor: remove old links

* fix: add new env vars

* feat: hf inference server

* refactor: remove old links

* test: batch and single response

* chore: black + isort

* separate gpu and cpu dockerfiles

* moved gpu to separate dockerfile

* Fixed test endpoints

* Edits to API. Server won't start due to failed instantiation error

* Method signature

* fix: gpu_infer

* tests: fix tests

---------

Co-authored-by: Andriy Mulyar <andriy.mulyar@gmail.com>
Author: Zach Nussbaum
Date: 2023-07-21 14:13:29 -05:00
Committed by: GitHub
Parent: 58f0fcab57
Commit: 8aba2c9009
14 changed files with 271 additions and 112 deletions
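
The "test: batch and single response" entry above suggests the completions route is exercised with both a single prompt and a list of prompts. A rough sketch of what such requests could look like against a local deployment; the port, model name, and the list-of-prompts payload shape are assumptions, not confirmed by this diff:

import requests

BASE_URL = "http://localhost:4891/v1"  # assumed local port for the gpt4all-api container

# Single-prompt completion request (payload shape assumed to follow an OpenAI-style schema)
single = requests.post(
    f"{BASE_URL}/completions",
    json={"model": "ggml-mpt-7b-chat.bin", "prompt": "List three colors.", "max_tokens": 32},
)
print(single.json())

# Batched request: a list of prompts in one call (assumed to be what the GPU path batches over)
batch = requests.post(
    f"{BASE_URL}/completions",
    json={"model": "ggml-mpt-7b-chat.bin", "prompt": ["List three colors.", "List three fruits."], "max_tokens": 32},
)
print(batch.json())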


@@ -1,19 +1,19 @@
-import os
-import docs
 import logging
-from fastapi import FastAPI, HTTPException, Request
-from starlette.middleware.cors import CORSMiddleware
-from fastapi.logger import logger as fastapi_logger
-from api_v1.settings import settings
-from api_v1.api import router as v1_router
-from api_v1 import events
+import os
+import docs
+from api_v1 import events
+from api_v1.api import router as v1_router
+from api_v1.settings import settings
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.logger import logger as fastapi_logger
+from starlette.middleware.cors import CORSMiddleware
 logger = logging.getLogger(__name__)
 app = FastAPI(title='GPT4All API', description=docs.desc)
-#CORS Configuration (in-case you want to deploy)
+# CORS Configuration (in-case you want to deploy)
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -29,14 +29,23 @@ app.include_router(v1_router, prefix='/v1')
 app.add_event_handler('startup', events.startup_event_handler(app))
 app.add_exception_handler(HTTPException, events.on_http_error)
 @app.on_event("startup")
 async def startup():
     global model
-    logger.info(f"Downloading/fetching model: {os.path.join(settings.gpt4all_path, settings.model)}")
-    from gpt4all import GPT4All
-    model = GPT4All(model_name=settings.model, model_path=settings.gpt4all_path)
-    logger.info("GPT4All API is ready.")
+    if settings.inference_mode == "cpu":
+        logger.info(f"Downloading/fetching model: {os.path.join(settings.gpt4all_path, settings.model)}")
+        from gpt4all import GPT4All
+        model = GPT4All(model_name=settings.model, model_path=settings.gpt4all_path)
+        logger.info(f"GPT4All API is ready to infer from {settings.model} on CPU.")
+    else:
+        # is it possible to do this once the server is up?
+        ## TODO block until HF inference server is up.
+        logger.info(f"GPT4All API is ready to infer from {settings.model} on CPU.")
 @app.on_event("shutdown")
 async def shutdown():
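
The else branch above leaves a TODO to block until the Hugging Face inference server is up. A minimal sketch of such a readiness wait, assuming a hypothetical settings.hf_inference_server_host URL and a /health endpoint like the one exposed by text-generation-inference (neither is part of this diff):

import time
import requests

def wait_for_inference_server(base_url: str, timeout: float = 120.0, poll_interval: float = 1.0) -> None:
    """Poll the inference server's health endpoint until it responds, or raise after `timeout` seconds."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            if requests.get(f"{base_url}/health", timeout=2).status_code == 200:
                return
        except requests.RequestException:
            pass  # server not reachable yet; keep polling
        time.sleep(poll_interval)
    raise RuntimeError(f"Inference server at {base_url} did not become ready within {timeout}s")

The startup handler could then call wait_for_inference_server(settings.hf_inference_server_host) in the else branch before logging readiness.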
@@ -57,5 +66,7 @@ if "gunicorn" in os.environ.get("SERVER_SOFTWARE", ""):
     uvicorn_logger.handlers = gunicorn_error_logger.handlers
 else:
     # https://github.com/tiangolo/fastapi/issues/2019
-    LOG_FORMAT2 = "[%(asctime)s %(process)d:%(threadName)s] %(name)s - %(levelname)s - %(message)s | %(filename)s:%(lineno)d"
+    LOG_FORMAT2 = (
+        "[%(asctime)s %(process)d:%(threadName)s] %(name)s - %(levelname)s - %(message)s | %(filename)s:%(lineno)d"
+    )
     logging.basicConfig(level=logging.INFO, format=LOG_FORMAT2)
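
The startup handler reads settings.inference_mode, settings.model, and settings.gpt4all_path from api_v1.settings. A minimal sketch of what those fields could look like, assuming pydantic v1-style BaseSettings; the defaults below are illustrative, not taken from the repository:

from pydantic import BaseSettings

class Settings(BaseSettings):
    # Fields referenced by the startup handler in the diff above; defaults are illustrative.
    model: str = "ggml-mpt-7b-chat.bin"
    gpt4all_path: str = "/models"
    inference_mode: str = "cpu"  # "cpu" loads GPT4All in-process; anything else defers to the HF inference server

settings = Settings()

With BaseSettings, each field can be overridden through an environment variable of the same name (e.g. INFERENCE_MODE=gpu), which lines up with the "fix: add new env vars" note in the commit message.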