GPU Inference Server (#1112)

* feat: local inference server

* fix: source to use bash + vars

* chore: isort and black

* fix: makefile + inference mode

* chore: logging

* refactor: remove old links

* fix: add new env vars

* feat: hf inference server

* refactor: remove old links

* test: batch and single response

* chore: black + isort

* separate gpu and cpu dockerfiles

* moved gpu to separate dockerfile

* Fixed test endpoints

* Edits to API; server won't start due to a failed instantiation error

* Method signature

* fix: gpu_infer

* tests: fix tests

---------

Co-authored-by: Andriy Mulyar <andriy.mulyar@gmail.com>
Author: Zach Nussbaum
Date: 2023-07-21 14:13:29 -05:00
Committed by: GitHub
Parent: 58f0fcab57
Commit: 8aba2c9009
14 changed files with 271 additions and 112 deletions
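
The diff below touches the OpenAI-compatible chat completions route. For orientation, a minimal client call against such a route might look like the following sketch; the base URL, port, and model name are assumptions for illustration, not values taken from this commit.

```python
import requests

# Assumed local address; point this at wherever the inference server runs.
BASE_URL = "http://localhost:4891"

payload = {
    "model": "ggml-mpt-7b-chat",  # illustrative model name
    "messages": [{"role": "user", "content": "Hello!"}],
}

# The router in the diff is mounted with prefix "/chat", so the route is
# /chat/completions (any extra prefix added by the app is an assumption left out here).
resp = requests.post(f"{BASE_URL}/chat/completions", json=payload)
resp.raise_for_status()
print(resp.json())
```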


@@ -1,9 +1,10 @@
-from fastapi import APIRouter, Depends, Response, Security, status
-from pydantic import BaseModel, Field
-from typing import List, Dict
 import logging
 import time
+from typing import Dict, List
+
 from api_v1.settings import settings
+from fastapi import APIRouter, Depends, Response, Security, status
+from pydantic import BaseModel, Field
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
@@ -11,11 +12,11 @@ logger.setLevel(logging.DEBUG)
 ### This should follow https://github.com/openai/openai-openapi/blob/master/openapi.yaml
 class ChatCompletionMessage(BaseModel):
     role: str
     content: str
 
 class ChatCompletionRequest(BaseModel):
     model: str = Field(..., description='The model to generate a completion from.')
     messages: List[ChatCompletionMessage] = Field(..., description='The model to generate a completion from.')
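
Given the two models above, a request body can be validated client-side before it is sent. A self-contained sketch (the class definitions are repeated from the diff so the snippet runs on its own; the model name is illustrative):

```python
from typing import List

from pydantic import BaseModel, Field


# Repeated from the diff above so this snippet is self-contained.
class ChatCompletionMessage(BaseModel):
    role: str
    content: str


class ChatCompletionRequest(BaseModel):
    model: str = Field(..., description='The model to generate a completion from.')
    messages: List[ChatCompletionMessage] = Field(..., description='The model to generate a completion from.')


# Build and serialize a request; invalid input raises pydantic.ValidationError.
request = ChatCompletionRequest(
    model='ggml-mpt-7b-chat',  # illustrative model name, not from this commit
    messages=[ChatCompletionMessage(role='user', content='What is a GPU?')],
)
print(request.json())
```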
@@ -26,11 +27,13 @@ class ChatCompletionChoice(BaseModel):
     index: int
     finish_reason: str
 
+
 class ChatCompletionUsage(BaseModel):
     prompt_tokens: int
     completion_tokens: int
     total_tokens: int
 
+
 class ChatCompletionResponse(BaseModel):
     id: str
     object: str = 'text_completion'
@@ -42,6 +45,7 @@ class ChatCompletionResponse(BaseModel):
 
 router = APIRouter(prefix="/chat", tags=["Completions Endpoints"])
 
+
 @router.post("/completions", response_model=ChatCompletionResponse)
 async def chat_completion(request: ChatCompletionRequest):
     '''
@@ -53,11 +57,5 @@ async def chat_completion(request: ChatCompletionRequest):
         created=time.time(),
         model=request.model,
         choices=[{}],
-        usage={
-            'prompt_tokens': 0,
-            'completion_tokens': 0,
-            'total_tokens': 0
-        }
+        usage={'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0},
     )
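
The commit message mentions tests for batch and single responses. A minimal single-response test against this route could look like the sketch below; the application import path is an assumption, and the assertions only check the response shape visible in the diff.

```python
from fastapi.testclient import TestClient

from main import app  # assumed entry point exposing the FastAPI app

client = TestClient(app)


def test_chat_completion_single():
    payload = {
        "model": "ggml-mpt-7b-chat",  # illustrative model name
        "messages": [{"role": "user", "content": "Hello!"}],
    }
    response = client.post("/chat/completions", json=payload)
    assert response.status_code == 200
    body = response.json()
    # Defaults visible in the diff above.
    assert body["object"] == "text_completion"
    assert {"prompt_tokens", "completion_tokens", "total_tokens"} <= body["usage"].keys()
```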