GPU Inference Server (#1112)

* feat: local inference server

* fix: source to use bash + vars

* chore: isort and black

* fix: makefile + inference mode

* chore: logging

* refactor: remove old links

* fix: add new env vars

* feat: hf inference server

* refactor: remove old links

* test: batch and single response

* chore: black + isort

* separate gpu and cpu dockerfiles

* moved gpu to separate dockerfile

* Fixed test endpoints

* Edits to API; server won't start due to a failed instantiation error

* Method signature

* fix: gpu_infer

* tests: fix tests

---------

Co-authored-by: Andriy Mulyar <andriy.mulyar@gmail.com>
Author: Zach Nussbaum
Date: 2023-07-21 14:13:29 -05:00
Committed by: GitHub
Parent: 58f0fcab57
Commit: 8aba2c9009
14 changed files with 271 additions and 112 deletions
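
The diff below touches the OpenAI-compatible chat completions route. For orientation, a minimal client call against such a route might look like the following sketch; the base URL, port, and model name are assumptions for illustration, not values taken from this commit.

```python
import requests

# Assumed local address; point this at wherever the inference server runs.
BASE_URL = "http://localhost:4891"

payload = {
    "model": "ggml-mpt-7b-chat",  # illustrative model name
    "messages": [{"role": "user", "content": "Hello!"}],
}

# The router in the diff is mounted with prefix "/chat", so the route is
# /chat/completions (any extra prefix added by the app is an assumption left out here).
resp = requests.post(f"{BASE_URL}/chat/completions", json=payload)
resp.raise_for_status()
print(resp.json())
```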


@@ -1,9 +1,10 @@
-from fastapi import APIRouter, Depends, Response, Security, status
-from pydantic import BaseModel, Field
-from typing import List, Dict
 import logging
 import time
+from typing import Dict, List
+
 from api_v1.settings import settings
+from fastapi import APIRouter, Depends, Response, Security, status
+from pydantic import BaseModel, Field
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
@@ -11,11 +12,11 @@ logger.setLevel(logging.DEBUG)
 ### This should follow https://github.com/openai/openai-openapi/blob/master/openapi.yaml
 class ChatCompletionMessage(BaseModel):
     role: str
     content: str
 
 class ChatCompletionRequest(BaseModel):
     model: str = Field(..., description='The model to generate a completion from.')
     messages: List[ChatCompletionMessage] = Field(..., description='The model to generate a completion from.')
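
Given the two models above, a request body can be validated client-side before it is sent. A self-contained sketch (the class definitions are repeated from the diff so the snippet runs on its own; the model name is illustrative):

```python
from typing import List

from pydantic import BaseModel, Field


# Repeated from the diff above so this snippet is self-contained.
class ChatCompletionMessage(BaseModel):
    role: str
    content: str


class ChatCompletionRequest(BaseModel):
    model: str = Field(..., description='The model to generate a completion from.')
    messages: List[ChatCompletionMessage] = Field(..., description='The model to generate a completion from.')


# Build and serialize a request; invalid input raises pydantic.ValidationError.
request = ChatCompletionRequest(
    model='ggml-mpt-7b-chat',  # illustrative model name, not from this commit
    messages=[ChatCompletionMessage(role='user', content='What is a GPU?')],
)
print(request.json())
```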
@@ -26,11 +27,13 @@ class ChatCompletionChoice(BaseModel):
     index: int
     finish_reason: str
 
+
 class ChatCompletionUsage(BaseModel):
     prompt_tokens: int
     completion_tokens: int
     total_tokens: int
 
+
 class ChatCompletionResponse(BaseModel):
     id: str
     object: str = 'text_completion'
@@ -42,6 +45,7 @@ class ChatCompletionResponse(BaseModel):
 
 router = APIRouter(prefix="/chat", tags=["Completions Endpoints"])
 
+
 @router.post("/completions", response_model=ChatCompletionResponse)
 async def chat_completion(request: ChatCompletionRequest):
     '''
@@ -53,11 +57,5 @@ async def chat_completion(request: ChatCompletionRequest):
         created=time.time(),
         model=request.model,
         choices=[{}],
-        usage={
-            'prompt_tokens': 0,
-            'completion_tokens': 0,
-            'total_tokens': 0
-        }
+        usage={'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0},
     )
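
The commit message mentions tests for batch and single responses. A minimal single-response test against this route could look like the sketch below; the application import path is an assumption, and the assertions only check the response shape visible in the diff.

```python
from fastapi.testclient import TestClient

from main import app  # assumed entry point exposing the FastAPI app

client = TestClient(app)


def test_chat_completion_single():
    payload = {
        "model": "ggml-mpt-7b-chat",  # illustrative model name
        "messages": [{"role": "user", "content": "Hello!"}],
    }
    response = client.post("/chat/completions", json=payload)
    assert response.status_code == 200
    body = response.json()
    # Defaults visible in the diff above.
    assert body["object"] == "text_completion"
    assert {"prompt_tokens", "completion_tokens", "total_tokens"} <= body["usage"].keys()
```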