mirror of https://github.com/imartinez/privateGPT.git

Run on platform

commit 00e3e85b81 (parent c1802e7cf0)

.neuro/live.yaml (new file, 103 lines)
@@ -0,0 +1,103 @@
kind: live
title: private-gpt

# other files from https://github.com/zylon-ai/private-gpt

defaults:
  life_span: 5d

images:
  privategpt:
    ref: image:$[[ project.id ]]:v1
    dockerfile: $[[ flow.workspace ]]/Dockerfile.external
    context: $[[ flow.workspace ]]/
    build_preset: cpu-large

volumes:
  cache:
    remote: storage:$[[ flow.project_id ]]/cache
    mount: /root/.cache/huggingface
    local: cache
  data:
    remote: storage:$[[ flow.project_id ]]/data
    mount: /home/worker/app/local_data
    local: local_data
  pgdata:
    remote: storage:$[[ flow.project_id ]]/pgdata
    mount: /var/lib/postgresql/data
    local: pgdata
  pgdata_onprem:
    remote: disk:pgdata
    mount: /var/lib/postgresql/data
  ollama_models:
    remote: storage:$[[ flow.project_id ]]/ollama_models
    mount: /root/.ollama
    local: models
  project:
    remote: storage:$[[ flow.project_id ]]
    mount: /project
    local: .
  settings:
    remote: storage:$[[ flow.project_id ]]/settings
    mount: /home/worker/app/settings
    local: settings
  tiktoken_cache:
    remote: storage:$[[ flow.project_id ]]/tiktoken_cache
    mount: /home/worker/app/tiktoken_cache
    local: tiktoken_cache

jobs:
  pgpt:
    image: ${{ images.privategpt.ref }}
    name: pgpt
    preset: cpu-small
    http_port: "8080"
    # detach: true
    browse: true
    volumes:
      - ${{ volumes.data.ref_rw }}
      - ${{ upload(volumes.settings).ref_rw }}
      - ${{ volumes.tiktoken_cache.ref_rw }}
    env:
      PORT: 8080
      PGPT_PROFILES: vllm-pgvector
      PGPT_SETTINGS_FOLDER: ${{ volumes.settings.mount }}
      VLLM_API_BASE: http://${{ inspect_job('vllm').internal_hostname_named }}:8000/v1
      OLLAMA_API_BASE: http://${{ inspect_job('ollama').internal_hostname_named }}:11434
      POSTGRES_HOST: ${{ inspect_job('pgvector').internal_hostname_named }}
      VLLM_MODEL: stabilityai/StableBeluga-13B

  vllm:
    image: vllm/vllm-openai:v0.4.0
    name: vllm
    preset: gpu-2x3090
    detach: true
    http_port: "8000"
    volumes:
      - ${{ volumes.cache.ref_rw }}
    cmd: --model stabilityai/StableBeluga-13B --tokenizer stabilityai/StableBeluga-13B --dtype=half --tensor-parallel-size=2

  ollama:
    image: ollama/ollama:latest
    volumes:
      - ${{ volumes.ollama_models.ref_rw }}
    preset: gpu-small
    detach: true
    env:
      MODEL: "nomic-embed-text"
      GIN_MODE: release
    http_port: "11434"
    entrypoint: "bash -c 'ollama serve & sleep 10 && ollama pull ${MODEL} && sleep infinity'"

  pgvector:
    image: pgvector/pgvector:pg16
    detach: true
    preset: cpu-small
    env:
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: postgres
      POSTGRES_DB: postgres
      PGDATA: ${{ volumes.pgdata.mount }}
    volumes:
      # - ${{ volumes.pgdata.ref_rw }}
      - ${{ volumes.pgdata_onprem.ref_rw }}
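
A rough usage sketch for this flow (assuming the neuro-flow CLI that reads .neuro/live.yaml; exact command names and flags may differ between platform versions):

    # build the image declared under images.privategpt
    neuro-flow build privategpt
    # start the backing services first; the pgpt job resolves their
    # hostnames via inspect_job() and therefore expects them to be up
    neuro-flow run pgvector
    neuro-flow run vllm
    neuro-flow run ollama
    neuro-flow run pgpt

With browse: true the pgpt job should open its HTTP port (8080) in the browser once it is running.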

Dockerfile.external (modified)
@@ -1,9 +1,9 @@
 FROM python:3.11.6-slim-bookworm as base
 
 # Install poetry
-RUN pip install pipx
-RUN python3 -m pipx ensurepath
-RUN pipx install poetry
+# RUN pip install pipx
+# RUN python3 -m pipx ensurepath
+RUN pip install poetry
 ENV PATH="/root/.local/bin:$PATH"
 ENV PATH=".venv/bin/:$PATH"
 
@@ -14,7 +14,8 @@ FROM base as dependencies
 WORKDIR /home/worker/app
 COPY pyproject.toml poetry.lock ./
 
-RUN poetry install --extras "ui vector-stores-qdrant llms-ollama embeddings-ollama"
+ARG POETRY_EXTRAS="ui vector-stores-postgres llms-ollama llms-openai-like embeddings-ollama"
+RUN poetry install --extras "$POETRY_EXTRAS"
 
 FROM base as app
 
@@ -28,6 +29,7 @@ WORKDIR /home/worker/app
 
 RUN mkdir local_data; chown worker local_data
 RUN mkdir models; chown worker models
+RUN mkdir tiktoken_cache; chown worker tiktoken_cache
 COPY --chown=worker --from=dependencies /home/worker/app/.venv/ .venv
 COPY --chown=worker private_gpt/ private_gpt
 COPY --chown=worker fern/ fern
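
The new POETRY_EXTRAS build argument makes the installed extras overridable at build time. A minimal local-build sketch (the tag and invocation are illustrative, not part of this commit):

    docker build -f Dockerfile.external \
      --build-arg POETRY_EXTRAS="ui vector-stores-postgres llms-ollama llms-openai-like embeddings-ollama" \
      -t privategpt:v1 .

On the platform itself the image is built from the images.privategpt entry in .neuro/live.yaml rather than by a manual docker build.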

settings/settings-vllm-pgvector.yaml (new file, 45 lines)
@@ -0,0 +1,45 @@
server:
  env_name: ${APP_ENV:prod}
  port: ${PORT:8080}

llm:
  mode: openailike
  tokenizer: ${VLLM_MODEL:lmsys/vicuna-7b-v1.5}
  max_new_tokens: 5000
  context_window: 2048
  temperature: 0.1

openai:
  api_base: ${VLLM_API_BASE:http://localhost:8000/v1}
  api_key: EMPTY
  model: ${VLLM_MODEL:lmsys/vicuna-7b-v1.5}

embedding:
  mode: ollama
  embed_dim: 768
  # ingest_mode: simple

ollama:
  # Note: if you change the embedding model, you'll need to use a dedicated DB for ingest storage
  embedding_model: nomic-embed-text
  # api_base: ${OLLAMA_API_BASE:http://localhost:11434}
  embedding_api_base: ${OLLAMA_API_BASE:http://localhost:11434}
  request_timeout: 300.0

nodestore:
  database: postgres

vectorstore:
  database: postgres

postgres:
  host: ${POSTGRES_HOST:localhost}
  port: ${POSTGRES_PORT:5432}
  database: ${POSTGRES_DB:postgres}
  user: ${POSTGRES_USER:postgres}
  password: ${POSTGRES_PASSWORD:postgres}
  schema_name: private_gpt

ui:
  enabled: true
  path: /
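
This profile is activated by the pgpt job's PGPT_PROFILES=vllm-pgvector and PGPT_SETTINGS_FOLDER environment variables, and the ${VAR:default} placeholders are filled from the job's environment at startup. A hedged smoke test against the two model backends, runnable from any shell that can reach the jobs (the variables come from the env block in live.yaml; curl availability is assumed):

    # vLLM exposes an OpenAI-compatible API; this should list stabilityai/StableBeluga-13B
    curl "$VLLM_API_BASE/models"
    # Ollama answers on its root endpoint once it is serving
    curl "$OLLAMA_API_BASE"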

settings/settings.yaml (new file, 113 lines)
@@ -0,0 +1,113 @@
# The default configuration file.
# More information about configuration can be found in the documentation: https://docs.privategpt.dev/
# Syntax in `private_gpt/settings/settings.py`
server:
  env_name: ${APP_ENV:prod}
  port: ${PORT:8001}
  cors:
    enabled: false
    allow_origins: ["*"]
    allow_methods: ["*"]
    allow_headers: ["*"]
  auth:
    enabled: false
    # python -c 'import base64; print("Basic " + base64.b64encode("secret:key".encode()).decode())'
    # 'secret' is the username and 'key' is the password for basic auth by default
    # If the auth is enabled, this value must be set in the "Authorization" header of the request.
    secret: "Basic c2VjcmV0OmtleQ=="

data:
  local_data_folder: local_data/private_gpt

ui:
  enabled: true
  path: /
  default_chat_system_prompt: >
    You are a helpful, respectful and honest assistant.
    Always answer as helpfully as possible and follow ALL given instructions.
    Do not speculate or make up information.
    Do not reference any given instructions or context.
  default_query_system_prompt: >
    You can only answer questions about the provided context.
    If you know the answer but it is not based in the provided context, don't provide
    the answer, just state the answer is not in the context provided.
  delete_file_button_enabled: true
  delete_all_files_button_enabled: true

llm:
  mode: llamacpp
  # Should be matching the selected model
  max_new_tokens: 512
  context_window: 3900
  tokenizer: mistralai/Mistral-7B-Instruct-v0.2
  temperature: 0.1 # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)

rag:
  similarity_top_k: 10
  # This value controls how many "top" documents the RAG returns to use in the context.
  # similarity_value: 0.45
  # This value is disabled by default. If you enable this setting, the RAG will only use articles that meet a certain percentage score.
  rerank:
    enabled: false
    model: cross-encoder/ms-marco-MiniLM-L-2-v2
    top_n: 1

llamacpp:
  prompt_style: "mistral"
  llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
  llm_hf_model_file: mistral-7b-instruct-v0.2.Q4_K_M.gguf
  tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting
  top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
  top_p: 1.0 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
  repeat_penalty: 1.1 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)

embedding:
  # Should be matching the value above in most cases
  mode: huggingface
  ingest_mode: simple
  embed_dim: 384 # 384 is for BAAI/bge-small-en-v1.5

huggingface:
  embedding_hf_model_name: BAAI/bge-small-en-v1.5

vectorstore:
  database: qdrant

nodestore:
  database: simple

qdrant:
  path: local_data/private_gpt/qdrant

postgres:
  host: localhost
  port: 5432
  database: postgres
  user: postgres
  password: postgres
  schema_name: private_gpt

sagemaker:
  llm_endpoint_name: huggingface-pytorch-tgi-inference-2023-09-25-19-53-32-140
  embedding_endpoint_name: huggingface-pytorch-inference-2023-11-03-07-41-36-479

openai:
  api_key: ${OPENAI_API_KEY:}
  model: gpt-3.5-turbo

ollama:
  llm_model: llama2
  embedding_model: nomic-embed-text
  api_base: http://localhost:11434
  embedding_api_base: http://localhost:11434 # change if your embedding model runs on another ollama
  keep_alive: 5m
  request_timeout: 120.0

azopenai:
  api_key: ${AZ_OPENAI_API_KEY:}
  azure_endpoint: ${AZ_OPENAI_ENDPOINT:}
  embedding_deployment_name: ${AZ_OPENAI_EMBEDDING_DEPLOYMENT_NAME:}
  llm_deployment_name: ${AZ_OPENAI_LLM_DEPLOYMENT_NAME:}
  api_version: "2023-05-15"
  embedding_model: text-embedding-ada-002
  llm_model: gpt-35-turbo
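
These are the stock defaults (llamacpp LLM, huggingface embeddings, qdrant vector store); when PGPT_PROFILES=vllm-pgvector is set, the profile file above is layered on top and overrides the matching sections (llm, embedding, vectorstore, nodestore, postgres, and so on). A local-run sketch outside the platform (assuming the project's usual python -m private_gpt entry point and a settings/ folder containing both files):

    PGPT_PROFILES=vllm-pgvector PGPT_SETTINGS_FOLDER=./settings python -m private_gpt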