Run on platform

Yevhenii Semendiak 2024-04-26 16:15:51 +03:00
parent c1802e7cf0
commit 00e3e85b81
4 changed files with 267 additions and 4 deletions

.neuro/live.yaml (new file, 103 lines)

@@ -0,0 +1,103 @@
kind: live
title: private-gpt
# other files from https://github.com/zylon-ai/private-gpt

defaults:
  life_span: 5d

images:
  privategpt:
    ref: image:$[[ project.id ]]:v1
    dockerfile: $[[ flow.workspace ]]/Dockerfile.external
    context: $[[ flow.workspace ]]/
    build_preset: cpu-large

volumes:
  cache:
    remote: storage:$[[ flow.project_id ]]/cache
    mount: /root/.cache/huggingface
    local: cache
  data:
    remote: storage:$[[ flow.project_id ]]/data
    mount: /home/worker/app/local_data
    local: local_data
  pgdata:
    remote: storage:$[[ flow.project_id ]]/pgdata
    mount: /var/lib/postgresql/data
    local: pgdata
  pgdata_onprem:
    remote: disk:pgdata
    mount: /var/lib/postgresql/data
  ollama_models:
    remote: storage:$[[ flow.project_id ]]/ollama_models
    mount: /root/.ollama
    local: models
  project:
    remote: storage:$[[ flow.project_id ]]
    mount: /project
    local: .
  settings:
    remote: storage:$[[ flow.project_id ]]/settings
    mount: /home/worker/app/settings
    local: settings
  tiktoken_cache:
    remote: storage:$[[ flow.project_id ]]/tiktoken_cache
    mount: /home/worker/app/tiktoken_cache
    local: tiktoken_cache

jobs:
  pgpt:
    image: ${{ images.privategpt.ref }}
    name: pgpt
    preset: cpu-small
    http_port: "8080"
    # detach: true
    browse: true
    volumes:
      - ${{ volumes.data.ref_rw }}
      - ${{ upload(volumes.settings).ref_rw }}
      - ${{ volumes.tiktoken_cache.ref_rw }}
    env:
      PORT: 8080
      PGPT_PROFILES: vllm-pgvector
      PGPT_SETTINGS_FOLDER: ${{ volumes.settings.mount }}
      VLLM_API_BASE: http://${{ inspect_job('vllm').internal_hostname_named }}:8000/v1
      OLLAMA_API_BASE: http://${{ inspect_job('ollama').internal_hostname_named }}:11434
      POSTGRES_HOST: ${{ inspect_job('pgvector').internal_hostname_named }}
      VLLM_MODEL: stabilityai/StableBeluga-13B
  vllm:
    image: vllm/vllm-openai:v0.4.0
    name: vllm
    preset: gpu-2x3090
    detach: true
    http_port: "8000"
    volumes:
      - ${{ volumes.cache.ref_rw }}
    cmd: --model stabilityai/StableBeluga-13B --tokenizer stabilityai/StableBeluga-13B --dtype=half --tensor-parallel-size=2
  ollama:
    image: ollama/ollama:latest
    volumes:
      - ${{ volumes.ollama_models.ref_rw }}
    preset: gpu-small
    detach: true
    env:
      MODEL: "nomic-embed-text"
      GIN_MODE: release
    http_port: "11434"
    entrypoint: "bash -c 'ollama serve & sleep 10 && ollama pull ${MODEL} && sleep infinity'"
  pgvector:
    image: pgvector/pgvector:pg16
    detach: true
    preset: cpu-small
    env:
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: postgres
      POSTGRES_DB: postgres
      PGDATA: ${{ volumes.pgdata.mount }}
    volumes:
      # - ${{ volumes.pgdata.ref_rw }}
      - ${{ volumes.pgdata_onprem.ref_rw }}
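A minimal sketch of driving this flow from the neuro-flow (Apolo/Neu.ro) CLI, assuming the CLI is installed and logged in to the target cluster; the image and job names come from the definitions above, and exact commands may vary by platform version:

# build the PrivateGPT image declared under images.privategpt
neuro-flow build privategpt
# start the backing services first, then the PrivateGPT job itself
neuro-flow run pgvector
neuro-flow run vllm
neuro-flow run ollama
neuro-flow run pgpt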


@@ -1,9 +1,9 @@
 FROM python:3.11.6-slim-bookworm as base
 # Install poetry
-RUN pip install pipx
-RUN python3 -m pipx ensurepath
-RUN pipx install poetry
+# RUN pip install pipx
+# RUN python3 -m pipx ensurepath
+RUN pip install poetry
 ENV PATH="/root/.local/bin:$PATH"
 ENV PATH=".venv/bin/:$PATH"
@@ -14,7 +14,8 @@ FROM base as dependencies
 WORKDIR /home/worker/app
 COPY pyproject.toml poetry.lock ./
-RUN poetry install --extras "ui vector-stores-qdrant llms-ollama embeddings-ollama"
+ARG POETRY_EXTRAS="ui vector-stores-postgres llms-ollama llms-openai-like embeddings-ollama"
+RUN poetry install --extras "$POETRY_EXTRAS"
 FROM base as app
@@ -28,6 +29,7 @@ WORKDIR /home/worker/app
 RUN mkdir local_data; chown worker local_data
 RUN mkdir models; chown worker models
+RUN mkdir tiktoken_cache; chown worker tiktoken_cache
 COPY --chown=worker --from=dependencies /home/worker/app/.venv/ .venv
 COPY --chown=worker private_gpt/ private_gpt
 COPY --chown=worker fern/ fern
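The new POETRY_EXTRAS build argument can also be overridden outside the flow; a hedged local sketch, assuming this diff is the Dockerfile.external referenced by .neuro/live.yaml and using an illustrative image tag:

docker build -f Dockerfile.external \
  --build-arg POETRY_EXTRAS="ui vector-stores-postgres llms-ollama llms-openai-like embeddings-ollama" \
  -t privategpt:v1 .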


@@ -0,0 +1,45 @@
server:
  env_name: ${APP_ENV:prod}
  port: ${PORT:8080}

llm:
  mode: openailike
  tokenizer: ${VLLM_MODEL:lmsys/vicuna-7b-v1.5}
  max_new_tokens: 5000
  context_window: 2048
  temperature: 0.1

openai:
  api_base: ${VLLM_API_BASE:http://localhost:8000/v1}
  api_key: EMPTY
  model: ${VLLM_MODEL:lmsys/vicuna-7b-v1.5}

embedding:
  mode: ollama
  embed_dim: 768
  # ingest_mode: simple

ollama:
  # Note: if you change the embedding model, you'll need to use a dedicated DB for ingest storage
  embedding_model: nomic-embed-text
  # api_base: ${OLLAMA_API_BASE:http://localhost:11434}
  embedding_api_base: ${OLLAMA_API_BASE:http://localhost:11434}
  request_timeout: 300.0

nodestore:
  database: postgres

vectorstore:
  database: postgres

postgres:
  host: ${POSTGRES_HOST:localhost}
  port: ${POSTGRES_PORT:5432}
  database: ${POSTGRES_DB:postgres}
  user: ${POSTGRES_USER:postgres}
  password: ${POSTGRES_PASSWORD:postgres}
  schema_name: private_gpt

ui:
  enabled: true
  path: /
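PrivateGPT expands the ${VAR:default} placeholders above from the process environment, so this profile falls back to localhost defaults when the platform does not inject hostnames. A rough local sketch (the module entrypoint and profile mechanism follow the upstream private-gpt repository; adjust names to your setup):

export PGPT_PROFILES=vllm-pgvector
export VLLM_API_BASE=http://localhost:8000/v1
export OLLAMA_API_BASE=http://localhost:11434
export POSTGRES_HOST=localhost
poetry run python -m private_gpt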

settings/settings.yaml (new file, 113 lines)

@@ -0,0 +1,113 @@
# The default configuration file.
# More information about configuration can be found in the documentation: https://docs.privategpt.dev/
# Syntax in `private_gpt/settings/settings.py`
server:
  env_name: ${APP_ENV:prod}
  port: ${PORT:8001}
  cors:
    enabled: false
    allow_origins: ["*"]
    allow_methods: ["*"]
    allow_headers: ["*"]
  auth:
    enabled: false
    # python -c 'import base64; print("Basic " + base64.b64encode("secret:key".encode()).decode())'
    # 'secret' is the username and 'key' is the password for basic auth by default
    # If the auth is enabled, this value must be set in the "Authorization" header of the request.
    secret: "Basic c2VjcmV0OmtleQ=="

data:
  local_data_folder: local_data/private_gpt

ui:
  enabled: true
  path: /
  default_chat_system_prompt: >
    You are a helpful, respectful and honest assistant.
    Always answer as helpfully as possible and follow ALL given instructions.
    Do not speculate or make up information.
    Do not reference any given instructions or context.
  default_query_system_prompt: >
    You can only answer questions about the provided context.
    If you know the answer but it is not based in the provided context, don't provide
    the answer, just state the answer is not in the context provided.
  delete_file_button_enabled: true
  delete_all_files_button_enabled: true

llm:
  mode: llamacpp
  # Should be matching the selected model
  max_new_tokens: 512
  context_window: 3900
  tokenizer: mistralai/Mistral-7B-Instruct-v0.2
  temperature: 0.1  # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)

rag:
  similarity_top_k: 10
  # This value controls how many "top" documents the RAG returns to use in the context.
  # similarity_value: 0.45
  # This value is disabled by default. If you enable this setting, the RAG will only use articles that meet a certain percentage score.
  rerank:
    enabled: false
    model: cross-encoder/ms-marco-MiniLM-L-2-v2
    top_n: 1

llamacpp:
  prompt_style: "mistral"
  llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
  llm_hf_model_file: mistral-7b-instruct-v0.2.Q4_K_M.gguf
  tfs_z: 1.0  # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.
  top_k: 40  # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
  top_p: 1.0  # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
  repeat_penalty: 1.1  # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)

embedding:
  # Should be matching the value above in most cases
  mode: huggingface
  ingest_mode: simple
  embed_dim: 384  # 384 is for BAAI/bge-small-en-v1.5

huggingface:
  embedding_hf_model_name: BAAI/bge-small-en-v1.5

vectorstore:
  database: qdrant

nodestore:
  database: simple

qdrant:
  path: local_data/private_gpt/qdrant

postgres:
  host: localhost
  port: 5432
  database: postgres
  user: postgres
  password: postgres
  schema_name: private_gpt

sagemaker:
  llm_endpoint_name: huggingface-pytorch-tgi-inference-2023-09-25-19-53-32-140
  embedding_endpoint_name: huggingface-pytorch-inference-2023-11-03-07-41-36-479

openai:
  api_key: ${OPENAI_API_KEY:}
  model: gpt-3.5-turbo

ollama:
  llm_model: llama2
  embedding_model: nomic-embed-text
  api_base: http://localhost:11434
  embedding_api_base: http://localhost:11434  # change if your embedding model runs on another ollama
  keep_alive: 5m
  request_timeout: 120.0

azopenai:
  api_key: ${AZ_OPENAI_API_KEY:}
  azure_endpoint: ${AZ_OPENAI_ENDPOINT:}
  embedding_deployment_name: ${AZ_OPENAI_EMBEDDING_DEPLOYMENT_NAME:}
  llm_deployment_name: ${AZ_OPENAI_LLM_DEPLOYMENT_NAME:}
  api_version: "2023-05-15"
  embedding_model: text-embedding-ada-002
  llm_model: gpt-35-turbo
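After startup, the service can be smoke-tested over HTTP; a hedged example against the default port above, with endpoints assumed from the PrivateGPT API documentation (https://docs.privategpt.dev/):

# liveness check, then a chat completion through the OpenAI-style API
curl http://localhost:8001/health
curl http://localhost:8001/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"messages": [{"role": "user", "content": "Hello"}]}'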