Mirror of https://github.com/nomic-ai/gpt4all.git, synced 2025-08-09 03:47:25 +00:00
Implement /v1/chat/completions endpoint for CPU mode
Signed-off-by: Johannes Plötner <johannes.w.m.ploetner@gmail.com>
parent 61d6765361
commit 026ee4e46b
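For orientation, here is a minimal sketch of a request against the new endpoint over plain HTTP. The base URL http://localhost:4891/v1, the model file name, and the use of the requests library are assumptions for illustration; the request fields mirror the ChatCompletionRequest model introduced in this commit, and the response shape matches what the new test below asserts.

import requests

# Assumed address of a locally running gpt4all-api instance; adjust to your deployment.
BASE_URL = "http://localhost:4891/v1"

payload = {
    "model": "ggml-mpt-7b-chat.bin",  # placeholder; must match the model the server was booted with
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Knock knock."},
    ],
    "temperature": 0.28,
}

resp = requests.post(f"{BASE_URL}/chat/completions", json=payload, timeout=600)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])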
@@ -2,7 +2,8 @@ import logging
 import time
 from typing import List
 from uuid import uuid4
-from fastapi import APIRouter
+from fastapi import APIRouter, HTTPException
+from gpt4all import GPT4All
 from pydantic import BaseModel, Field
 from api_v1.settings import settings
 from fastapi.responses import StreamingResponse
@@ -18,6 +19,7 @@ class ChatCompletionMessage(BaseModel):
 class ChatCompletionRequest(BaseModel):
     model: str = Field(settings.model, description='The model to generate a completion from.')
     messages: List[ChatCompletionMessage] = Field(..., description='Messages for the chat completion.')
+    temperature: float = Field(settings.temp, description='Model temperature')

 class ChatCompletionChoice(BaseModel):
     message: ChatCompletionMessage
@@ -45,15 +47,41 @@ async def chat_completion(request: ChatCompletionRequest):
     '''
     Completes a GPT4All model response based on the last message in the chat.
     '''
-    # Example: Echo the last message content with some modification
+    # GPU is not implemented yet
+    if settings.inference_mode == "gpu":
+        raise HTTPException(status_code=400,
+                            detail="Not implemented yet: can only infer in CPU mode.")
+
+    # we only support the configured model
+    if request.model != settings.model:
+        raise HTTPException(status_code=400,
+                            detail=f"The GPT4All inference server is booted to only infer: `{settings.model}`")
+
+    # run only if we have a message
     if request.messages:
-        last_message = request.messages[-1].content
-        response_content = f"Echo: {last_message}"
+        model = GPT4All(model_name=settings.model, model_path=settings.gpt4all_path)
+
+        # format the system message and conversation history correctly
+        formatted_messages = ""
+        for message in request.messages:
+            formatted_messages += f"<|im_start|>{message.role}\n{message.content}<|im_end|>\n"
+
+        # the LLM will complete the response of the assistant
+        formatted_messages += "<|im_start|>assistant\n"
+        response = model.generate(
+            prompt=formatted_messages,
+            temp=request.temperature
+        )
+
+        # the LLM may continue to hallucinate the conversation, but we only want the first response,
+        # so cut off everything after the first <|im_end|>
+        index = response.find("<|im_end|>")
+        response_content = response[:index].strip()
     else:
         response_content = "No messages received."

     # Create a chat message for the response
-    response_message = ChatCompletionMessage(role="system", content=response_content)
+    response_message = ChatCompletionMessage(role="assistant", content=response_content)

     # Create a choice object with the response message
     response_choice = ChatCompletionChoice(
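For readers skimming the handler above, the prompt handling boils down to two small steps. The helper names below are illustrative only and do not appear in the commit; the logic mirrors the added code, except that the fallback for a missing <|im_end|> marker is an extra safeguard here, while the handler itself slices unconditionally.

def build_chatml_prompt(messages):
    # Wrap every chat message in <|im_start|>role ... <|im_end|> blocks,
    # then open an assistant block for the model to complete.
    prompt = ""
    for message in messages:
        prompt += f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n"
    return prompt + "<|im_start|>assistant\n"

def extract_first_reply(generated):
    # The model may keep hallucinating further conversation turns; keep only the text
    # before the first <|im_end|> marker (fall back to the full text if it is absent).
    index = generated.find("<|im_end|>")
    return generated[:index].strip() if index != -1 else generated.strip()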
@@ -51,7 +51,7 @@ def test_batched_completion():
     model = model_id  # replace with your specific model ID
     prompt = "Who is Michael Jordan?"
     responses = []

     # Loop to create completions one at a time
     for _ in range(3):
         response = openai.Completion.create(
@@ -62,7 +62,7 @@ def test_batched_completion():
     # Assertions to check the responses
     for response in responses:
         assert len(response['choices'][0]['text']) > len(prompt)

     assert len(responses) == 3

 def test_embedding():
@@ -74,4 +74,20 @@ def test_embedding():

     assert response["model"] == model
     assert isinstance(output, list)
     assert all(isinstance(x, args) for x in output)
+
+def test_chat_completion():
+    model = model_id
+
+    response = openai.ChatCompletion.create(
+        model=model,
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Knock knock."},
+            {"role": "assistant", "content": "Who's there?"},
+            {"role": "user", "content": "Orange."},
+        ]
+    )
+
+    assert response.choices[0].message.role == "assistant"
+    assert len(response.choices[0].message.content) > 0