GPU Inference Server (#1112)

* feat: local inference server

* fix: source to use bash + vars

* chore: isort and black

* fix: Makefile + inference mode

* chore: logging

* refactor: remove old links

* fix: add new env vars

* feat: hf inference server

* refactor: remove old links

* test: batch and single response

* chore: black + isort

* separate gpu and cpu dockerfiles

* moved gpu to separate dockerfile

* Fixed test endpoints

* Edits to API. Server won't start due to failed instantiation error

* Method signature

* fix: gpu_infer

* tests: fix tests

---------

Co-authored-by: Andriy Mulyar <andriy.mulyar@gmail.com>
Author: Zach Nussbaum
Date: 2023-07-21 14:13:29 -05:00
Committed by: GitHub
Parent: 58f0fcab57
Commit: 8aba2c9009
14 changed files with 271 additions and 112 deletions
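
The "test: batch and single response" entry above suggests the completions route is exercised with both a single prompt and a list of prompts. A rough sketch of what such requests could look like against a local deployment; the port, model name, and the list-of-prompts payload shape are assumptions, not confirmed by this diff:

import requests

BASE_URL = "http://localhost:4891/v1"  # assumed local port for the gpt4all-api container

# Single-prompt completion request (payload shape assumed to follow an OpenAI-style schema)
single = requests.post(
    f"{BASE_URL}/completions",
    json={"model": "ggml-mpt-7b-chat.bin", "prompt": "List three colors.", "max_tokens": 32},
)
print(single.json())

# Batched request: a list of prompts in one call (assumed to be what the GPU path batches over)
batch = requests.post(
    f"{BASE_URL}/completions",
    json={"model": "ggml-mpt-7b-chat.bin", "prompt": ["List three colors.", "List three fruits."], "max_tokens": 32},
)
print(batch.json())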


@@ -1,19 +1,19 @@
-import os
-import docs
 import logging
-from fastapi import FastAPI, HTTPException, Request
-from starlette.middleware.cors import CORSMiddleware
-from fastapi.logger import logger as fastapi_logger
-from api_v1.settings import settings
-from api_v1.api import router as v1_router
-from api_v1 import events
+import os
+import docs
+from api_v1 import events
+from api_v1.api import router as v1_router
+from api_v1.settings import settings
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.logger import logger as fastapi_logger
+from starlette.middleware.cors import CORSMiddleware
 logger = logging.getLogger(__name__)
 app = FastAPI(title='GPT4All API', description=docs.desc)
-#CORS Configuration (in-case you want to deploy)
+# CORS Configuration (in-case you want to deploy)
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -29,14 +29,23 @@ app.include_router(v1_router, prefix='/v1')
 app.add_event_handler('startup', events.startup_event_handler(app))
 app.add_exception_handler(HTTPException, events.on_http_error)
 @app.on_event("startup")
 async def startup():
     global model
-    logger.info(f"Downloading/fetching model: {os.path.join(settings.gpt4all_path, settings.model)}")
-    from gpt4all import GPT4All
-    model = GPT4All(model_name=settings.model, model_path=settings.gpt4all_path)
-    logger.info("GPT4All API is ready.")
+    if settings.inference_mode == "cpu":
+        logger.info(f"Downloading/fetching model: {os.path.join(settings.gpt4all_path, settings.model)}")
+        from gpt4all import GPT4All
+        model = GPT4All(model_name=settings.model, model_path=settings.gpt4all_path)
+        logger.info(f"GPT4All API is ready to infer from {settings.model} on CPU.")
+    else:
+        # is it possible to do this once the server is up?
+        ## TODO block until HF inference server is up.
+        logger.info(f"GPT4All API is ready to infer from {settings.model} on CPU.")
 @app.on_event("shutdown")
 async def shutdown():
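
The else branch above leaves a TODO to block until the Hugging Face inference server is up. A minimal sketch of such a readiness wait, assuming a hypothetical settings.hf_inference_server_host URL and a /health endpoint like the one exposed by text-generation-inference (neither is part of this diff):

import time
import requests

def wait_for_inference_server(base_url: str, timeout: float = 120.0, poll_interval: float = 1.0) -> None:
    """Poll the inference server's health endpoint until it responds, or raise after `timeout` seconds."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            if requests.get(f"{base_url}/health", timeout=2).status_code == 200:
                return
        except requests.RequestException:
            pass  # server not reachable yet; keep polling
        time.sleep(poll_interval)
    raise RuntimeError(f"Inference server at {base_url} did not become ready within {timeout}s")

The startup handler could then call wait_for_inference_server(settings.hf_inference_server_host) in the else branch before logging readiness.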
@@ -57,5 +66,7 @@ if "gunicorn" in os.environ.get("SERVER_SOFTWARE", ""):
     uvicorn_logger.handlers = gunicorn_error_logger.handlers
 else:
     # https://github.com/tiangolo/fastapi/issues/2019
-    LOG_FORMAT2 = "[%(asctime)s %(process)d:%(threadName)s] %(name)s - %(levelname)s - %(message)s | %(filename)s:%(lineno)d"
+    LOG_FORMAT2 = (
+        "[%(asctime)s %(process)d:%(threadName)s] %(name)s - %(levelname)s - %(message)s | %(filename)s:%(lineno)d"
+    )
     logging.basicConfig(level=logging.INFO, format=LOG_FORMAT2)
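
The startup handler reads settings.inference_mode, settings.model, and settings.gpt4all_path from api_v1.settings. A minimal sketch of what those fields could look like, assuming pydantic v1-style BaseSettings; the defaults below are illustrative, not taken from the repository:

from pydantic import BaseSettings

class Settings(BaseSettings):
    # Fields referenced by the startup handler in the diff above; defaults are illustrative.
    model: str = "ggml-mpt-7b-chat.bin"
    gpt4all_path: str = "/models"
    inference_mode: str = "cpu"  # "cpu" loads GPT4All in-process; anything else defers to the HF inference server

settings = Settings()

With BaseSettings, each field can be overridden through an environment variable of the same name (e.g. INFERENCE_MODE=gpu), which lines up with the "fix: add new env vars" note in the commit message.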