Run on platform

Yevhenii Semendiak 2024-04-26 16:15:51 +03:00
parent c1802e7cf0
commit 00e3e85b81
4 changed files with 267 additions and 4 deletions

.neuro/live.yaml (new file, 103 lines)

@@ -0,0 +1,103 @@
kind: live
title: private-gpt
# other files from https://github.com/zylon-ai/private-gpt

defaults:
  life_span: 5d

images:
  privategpt:
    ref: image:$[[ project.id ]]:v1
    dockerfile: $[[ flow.workspace ]]/Dockerfile.external
    context: $[[ flow.workspace ]]/
    build_preset: cpu-large

volumes:
  cache:
    remote: storage:$[[ flow.project_id ]]/cache
    mount: /root/.cache/huggingface
    local: cache
  data:
    remote: storage:$[[ flow.project_id ]]/data
    mount: /home/worker/app/local_data
    local: local_data
  pgdata:
    remote: storage:$[[ flow.project_id ]]/pgdata
    mount: /var/lib/postgresql/data
    local: pgdata
  pgdata_onprem:
    remote: disk:pgdata
    mount: /var/lib/postgresql/data
  ollama_models:
    remote: storage:$[[ flow.project_id ]]/ollama_models
    mount: /root/.ollama
    local: models
  project:
    remote: storage:$[[ flow.project_id ]]
    mount: /project
    local: .
  settings:
    remote: storage:$[[ flow.project_id ]]/settings
    mount: /home/worker/app/settings
    local: settings
  tiktoken_cache:
    remote: storage:$[[ flow.project_id ]]/tiktoken_cache
    mount: /home/worker/app/tiktoken_cache
    local: tiktoken_cache

jobs:
  pgpt:
    image: ${{ images.privategpt.ref }}
    name: pgpt
    preset: cpu-small
    http_port: "8080"
    # detach: true
    browse: true
    volumes:
      - ${{ volumes.data.ref_rw }}
      - ${{ upload(volumes.settings).ref_rw }}
      - ${{ volumes.tiktoken_cache.ref_rw }}
    env:
      PORT: 8080
      PGPT_PROFILES: vllm-pgvector
      PGPT_SETTINGS_FOLDER: ${{ volumes.settings.mount }}
      VLLM_API_BASE: http://${{ inspect_job('vllm').internal_hostname_named }}:8000/v1
      OLLAMA_API_BASE: http://${{ inspect_job('ollama').internal_hostname_named }}:11434
      POSTGRES_HOST: ${{ inspect_job('pgvector').internal_hostname_named }}
      VLLM_MODEL: stabilityai/StableBeluga-13B
  vllm:
    image: vllm/vllm-openai:v0.4.0
    name: vllm
    preset: gpu-2x3090
    detach: true
    http_port: "8000"
    volumes:
      - ${{ volumes.cache.ref_rw }}
    cmd: --model stabilityai/StableBeluga-13B --tokenizer stabilityai/StableBeluga-13B --dtype=half --tensor-parallel-size=2
  ollama:
    image: ollama/ollama:latest
    volumes:
      - ${{ volumes.ollama_models.ref_rw }}
    preset: gpu-small
    detach: true
    env:
      MODEL: "nomic-embed-text"
      GIN_MODE: release
    http_port: "11434"
    entrypoint: "bash -c 'ollama serve & sleep 10 && ollama pull ${MODEL} && sleep infinity'"
  pgvector:
    image: pgvector/pgvector:pg16
    detach: true
    preset: cpu-small
    env:
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: postgres
      POSTGRES_DB: postgres
      PGDATA: ${{ volumes.pgdata.mount }}
    volumes:
      # - ${{ volumes.pgdata.ref_rw }}
      - ${{ volumes.pgdata_onprem.ref_rw }}
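A minimal sketch of driving this flow from the neuro-flow (Apolo/Neu.ro) CLI, assuming the CLI is installed and logged in to the target cluster; the image and job names come from the definitions above, and exact commands may vary by platform version:

# build the PrivateGPT image declared under images.privategpt
neuro-flow build privategpt
# start the backing services first, then the PrivateGPT job itself
neuro-flow run pgvector
neuro-flow run vllm
neuro-flow run ollama
neuro-flow run pgpt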


@@ -1,9 +1,9 @@
 FROM python:3.11.6-slim-bookworm as base
 # Install poetry
-RUN pip install pipx
-RUN python3 -m pipx ensurepath
-RUN pipx install poetry
+# RUN pip install pipx
+# RUN python3 -m pipx ensurepath
+RUN pip install poetry
 ENV PATH="/root/.local/bin:$PATH"
 ENV PATH=".venv/bin/:$PATH"
@@ -14,7 +14,8 @@ FROM base as dependencies
 WORKDIR /home/worker/app
 COPY pyproject.toml poetry.lock ./
-RUN poetry install --extras "ui vector-stores-qdrant llms-ollama embeddings-ollama"
+ARG POETRY_EXTRAS="ui vector-stores-postgres llms-ollama llms-openai-like embeddings-ollama"
+RUN poetry install --extras "$POETRY_EXTRAS"
 FROM base as app
@@ -28,6 +29,7 @@ WORKDIR /home/worker/app
 RUN mkdir local_data; chown worker local_data
 RUN mkdir models; chown worker models
+RUN mkdir tiktoken_cache; chown worker tiktoken_cache
 COPY --chown=worker --from=dependencies /home/worker/app/.venv/ .venv
 COPY --chown=worker private_gpt/ private_gpt
 COPY --chown=worker fern/ fern
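The new POETRY_EXTRAS build argument can also be overridden outside the flow; a hedged local sketch, assuming this diff is the Dockerfile.external referenced by .neuro/live.yaml and using an illustrative image tag:

docker build -f Dockerfile.external \
  --build-arg POETRY_EXTRAS="ui vector-stores-postgres llms-ollama llms-openai-like embeddings-ollama" \
  -t privategpt:v1 .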


@@ -0,0 +1,45 @@
server:
  env_name: ${APP_ENV:prod}
  port: ${PORT:8080}

llm:
  mode: openailike
  tokenizer: ${VLLM_MODEL:lmsys/vicuna-7b-v1.5}
  max_new_tokens: 5000
  context_window: 2048
  temperature: 0.1

openai:
  api_base: ${VLLM_API_BASE:http://localhost:8000/v1}
  api_key: EMPTY
  model: ${VLLM_MODEL:lmsys/vicuna-7b-v1.5}

embedding:
  mode: ollama
  embed_dim: 768
  # ingest_mode: simple

ollama:
  # Note: if you change the embedding model, you'll need to use a dedicated DB for ingest storage
  embedding_model: nomic-embed-text
  # api_base: ${OLLAMA_API_BASE:http://localhost:11434}
  embedding_api_base: ${OLLAMA_API_BASE:http://localhost:11434}
  request_timeout: 300.0

nodestore:
  database: postgres

vectorstore:
  database: postgres

postgres:
  host: ${POSTGRES_HOST:localhost}
  port: ${POSTGRES_PORT:5432}
  database: ${POSTGRES_DB:postgres}
  user: ${POSTGRES_USER:postgres}
  password: ${POSTGRES_PASSWORD:postgres}
  schema_name: private_gpt

ui:
  enabled: true
  path: /
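PrivateGPT expands the ${VAR:default} placeholders above from the process environment, so this profile falls back to localhost defaults when the platform does not inject hostnames. A rough local sketch (the module entrypoint and profile mechanism follow the upstream private-gpt repository; adjust names to your setup):

export PGPT_PROFILES=vllm-pgvector
export VLLM_API_BASE=http://localhost:8000/v1
export OLLAMA_API_BASE=http://localhost:11434
export POSTGRES_HOST=localhost
poetry run python -m private_gpt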

settings/settings.yaml (new file, 113 lines)

@@ -0,0 +1,113 @@
# The default configuration file.
# More information about configuration can be found in the documentation: https://docs.privategpt.dev/
# Syntax in `private_gpt/settings/settings.py`
server:
  env_name: ${APP_ENV:prod}
  port: ${PORT:8001}
  cors:
    enabled: false
    allow_origins: ["*"]
    allow_methods: ["*"]
    allow_headers: ["*"]
  auth:
    enabled: false
    # python -c 'import base64; print("Basic " + base64.b64encode("secret:key".encode()).decode())'
    # 'secret' is the username and 'key' is the password for basic auth by default
    # If the auth is enabled, this value must be set in the "Authorization" header of the request.
    secret: "Basic c2VjcmV0OmtleQ=="

data:
  local_data_folder: local_data/private_gpt

ui:
  enabled: true
  path: /
  default_chat_system_prompt: >
    You are a helpful, respectful and honest assistant.
    Always answer as helpfully as possible and follow ALL given instructions.
    Do not speculate or make up information.
    Do not reference any given instructions or context.
  default_query_system_prompt: >
    You can only answer questions about the provided context.
    If you know the answer but it is not based in the provided context, don't provide
    the answer, just state the answer is not in the context provided.
  delete_file_button_enabled: true
  delete_all_files_button_enabled: true

llm:
  mode: llamacpp
  # Should be matching the selected model
  max_new_tokens: 512
  context_window: 3900
  tokenizer: mistralai/Mistral-7B-Instruct-v0.2
  temperature: 0.1  # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)

rag:
  similarity_top_k: 10
  # This value controls how many "top" documents the RAG returns to use in the context.
  # similarity_value: 0.45
  # This value is disabled by default. If you enable this setting, the RAG will only use articles that meet a certain percentage score.
  rerank:
    enabled: false
    model: cross-encoder/ms-marco-MiniLM-L-2-v2
    top_n: 1

llamacpp:
  prompt_style: "mistral"
  llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
  llm_hf_model_file: mistral-7b-instruct-v0.2.Q4_K_M.gguf
  tfs_z: 1.0  # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.
  top_k: 40  # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
  top_p: 1.0  # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
  repeat_penalty: 1.1  # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)

embedding:
  # Should be matching the value above in most cases
  mode: huggingface
  ingest_mode: simple
  embed_dim: 384  # 384 is for BAAI/bge-small-en-v1.5

huggingface:
  embedding_hf_model_name: BAAI/bge-small-en-v1.5

vectorstore:
  database: qdrant

nodestore:
  database: simple

qdrant:
  path: local_data/private_gpt/qdrant

postgres:
  host: localhost
  port: 5432
  database: postgres
  user: postgres
  password: postgres
  schema_name: private_gpt

sagemaker:
  llm_endpoint_name: huggingface-pytorch-tgi-inference-2023-09-25-19-53-32-140
  embedding_endpoint_name: huggingface-pytorch-inference-2023-11-03-07-41-36-479

openai:
  api_key: ${OPENAI_API_KEY:}
  model: gpt-3.5-turbo

ollama:
  llm_model: llama2
  embedding_model: nomic-embed-text
  api_base: http://localhost:11434
  embedding_api_base: http://localhost:11434  # change if your embedding model runs on another ollama
  keep_alive: 5m
  request_timeout: 120.0

azopenai:
  api_key: ${AZ_OPENAI_API_KEY:}
  azure_endpoint: ${AZ_OPENAI_ENDPOINT:}
  embedding_deployment_name: ${AZ_OPENAI_EMBEDDING_DEPLOYMENT_NAME:}
  llm_deployment_name: ${AZ_OPENAI_LLM_DEPLOYMENT_NAME:}
  api_version: "2023-05-15"
  embedding_model: text-embedding-ada-002
  llm_model: gpt-35-turbo
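After startup, the service can be smoke-tested over HTTP; a hedged example against the default port above, with endpoints assumed from the PrivateGPT API documentation (https://docs.privategpt.dev/):

# liveness check, then a chat completion through the OpenAI-style API
curl http://localhost:8001/health
curl http://localhost:8001/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"messages": [{"role": "user", "content": "Hello"}]}'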