diff --git a/.neuro/live.yaml b/.neuro/live.yaml
new file mode 100644
index 00000000..2bc9bf37
--- /dev/null
+++ b/.neuro/live.yaml
@@ -0,0 +1,103 @@
+kind: live
+title: private-gpt
+
+# Other files come from https://github.com/zylon-ai/private-gpt
+
+defaults:
+  life_span: 5d
+
+images:
+  privategpt:
+    ref: image:$[[ project.id ]]:v1
+    dockerfile: $[[ flow.workspace ]]/Dockerfile.external
+    context: $[[ flow.workspace ]]/
+    build_preset: cpu-large
+
+volumes:
+  cache:
+    remote: storage:$[[ flow.project_id ]]/cache
+    mount: /root/.cache/huggingface
+    local: cache
+  data:
+    remote: storage:$[[ flow.project_id ]]/data
+    mount: /home/worker/app/local_data
+    local: local_data
+  pgdata:
+    remote: storage:$[[ flow.project_id ]]/pgdata
+    mount: /var/lib/postgresql/data
+    local: pgdata
+  pgdata_onprem:
+    remote: disk:pgdata
+    mount: /var/lib/postgresql/data
+  ollama_models:
+    remote: storage:$[[ flow.project_id ]]/ollama_models
+    mount: /root/.ollama
+    local: models
+  project:
+    remote: storage:$[[ flow.project_id ]]
+    mount: /project
+    local: .
+  settings:
+    remote: storage:$[[ flow.project_id ]]/settings
+    mount: /home/worker/app/settings
+    local: settings
+  tiktoken_cache:
+    remote: storage:$[[ flow.project_id ]]/tiktoken_cache
+    mount: /home/worker/app/tiktoken_cache
+    local: tiktoken_cache
+
+jobs:
+  pgpt:
+    image: ${{ images.privategpt.ref }}
+    name: pgpt
+    preset: cpu-small
+    http_port: "8080"
+    # detach: true
+    browse: true
+    volumes:
+      - ${{ volumes.data.ref_rw }}
+      - ${{ upload(volumes.settings).ref_rw }}
+      - ${{ volumes.tiktoken_cache.ref_rw }}
+    env:
+      PORT: 8080
+      PGPT_PROFILES: vllm-pgvector
+      PGPT_SETTINGS_FOLDER: ${{ volumes.settings.mount }}
+      VLLM_API_BASE: http://${{ inspect_job('vllm').internal_hostname_named }}:8000/v1
+      OLLAMA_API_BASE: http://${{ inspect_job('ollama').internal_hostname_named }}:11434
+      POSTGRES_HOST: ${{ inspect_job('pgvector').internal_hostname_named }}
+      VLLM_MODEL: stabilityai/StableBeluga-13B
+
+  vllm:
+    image: vllm/vllm-openai:v0.4.0
+    name: vllm
+    preset: gpu-2x3090
+    detach: true
+    http_port: "8000"
+    volumes:
+      - ${{ volumes.cache.ref_rw }}
+    cmd: --model stabilityai/StableBeluga-13B --tokenizer stabilityai/StableBeluga-13B --dtype=half --tensor-parallel-size=2
+
+  ollama:
+    image: ollama/ollama:latest
+    volumes:
+      - ${{ volumes.ollama_models.ref_rw }}
+    preset: gpu-small
+    detach: true
+    env:
+      MODEL: "nomic-embed-text"
+      GIN_MODE: release
+    http_port: "11434"
+    entrypoint: "bash -c 'ollama serve & sleep 10 && ollama pull ${MODEL} && sleep infinity'"
+
+  pgvector:
+    image: pgvector/pgvector:pg16
+    detach: true
+    preset: cpu-small
+    env:
+      POSTGRES_USER: postgres
+      POSTGRES_PASSWORD: postgres
+      POSTGRES_DB: postgres
+      PGDATA: ${{ volumes.pgdata.mount }}
+    volumes:
+      # - ${{ volumes.pgdata.ref_rw }}
+      - ${{ volumes.pgdata_onprem.ref_rw }}
diff --git a/Dockerfile.external b/Dockerfile.external
index 3e2530e0..a6daaa7f 100644
--- a/Dockerfile.external
+++ b/Dockerfile.external
@@ -1,9 +1,9 @@
 FROM python:3.11.6-slim-bookworm as base
 
 # Install poetry
-RUN pip install pipx
-RUN python3 -m pipx ensurepath
-RUN pipx install poetry
+# RUN pip install pipx
+# RUN python3 -m pipx ensurepath
+RUN pip install poetry
 ENV PATH="/root/.local/bin:$PATH"
 ENV PATH=".venv/bin/:$PATH"
 
@@ -14,7 +14,8 @@ FROM base as dependencies
 WORKDIR /home/worker/app
 COPY pyproject.toml poetry.lock ./
 
-RUN poetry install --extras "ui vector-stores-qdrant llms-ollama embeddings-ollama"
+ARG POETRY_EXTRAS="ui vector-stores-postgres llms-ollama llms-openai-like embeddings-ollama"
+RUN poetry install --extras "$POETRY_EXTRAS"
 
 FROM base as app
 
@@ -28,6 +29,7 @@ WORKDIR /home/worker/app
 
 RUN mkdir local_data; chown worker local_data
 RUN mkdir models; chown worker models
+RUN mkdir tiktoken_cache; chown worker tiktoken_cache
 COPY --chown=worker --from=dependencies /home/worker/app/.venv/ .venv
 COPY --chown=worker private_gpt/ private_gpt
 COPY --chown=worker fern/ fern
diff --git a/settings/settings-vllm-pgvector.yaml b/settings/settings-vllm-pgvector.yaml
new file mode 100644
index 00000000..35001aa4
--- /dev/null
+++ b/settings/settings-vllm-pgvector.yaml
@@ -0,0 +1,45 @@
+server:
+  env_name: ${APP_ENV:prod}
+  port: ${PORT:8080}
+
+llm:
+  mode: openailike
+  tokenizer: ${VLLM_MODEL:lmsys/vicuna-7b-v1.5}
+  max_new_tokens: 5000
+  context_window: 2048
+  temperature: 0.1
+
+openai:
+  api_base: ${VLLM_API_BASE:http://localhost:8000/v1}
+  api_key: EMPTY
+  model: ${VLLM_MODEL:lmsys/vicuna-7b-v1.5}
+
+embedding:
+  mode: ollama
+  embed_dim: 768
+  # ingest_mode: simple
+
+ollama:
+  # Note: if you change the embedding model, you'll need to use a dedicated DB for ingest storage
+  embedding_model: nomic-embed-text
+  # api_base: ${OLLAMA_API_BASE:http://localhost:11434}
+  embedding_api_base: ${OLLAMA_API_BASE:http://localhost:11434}
+  request_timeout: 300.0
+
+nodestore:
+  database: postgres
+
+vectorstore:
+  database: postgres
+
+postgres:
+  host: ${POSTGRES_HOST:localhost}
+  port: ${POSTGRES_PORT:5432}
+  database: ${POSTGRES_DB:postgres}
+  user: ${POSTGRES_USER:postgres}
+  password: ${POSTGRES_PASSWORD:postgres}
+  schema_name: private_gpt
+
+ui:
+  enabled: true
+  path: /
diff --git a/settings/settings.yaml b/settings/settings.yaml
new file mode 100644
index 00000000..1b5be09d
--- /dev/null
+++ b/settings/settings.yaml
@@ -0,0 +1,113 @@
+# The default configuration file.
+# More information about configuration can be found in the documentation: https://docs.privategpt.dev/
+# Syntax in `private_gpt/settings/settings.py`
+server:
+  env_name: ${APP_ENV:prod}
+  port: ${PORT:8001}
+  cors:
+    enabled: false
+    allow_origins: ["*"]
+    allow_methods: ["*"]
+    allow_headers: ["*"]
+  auth:
+    enabled: false
+    # python -c 'import base64; print("Basic " + base64.b64encode("secret:key".encode()).decode())'
+    # 'secret' is the username and 'key' is the password for basic auth by default
+    # If auth is enabled, this value must be set in the "Authorization" header of the request.
+    secret: "Basic c2VjcmV0OmtleQ=="
+
+data:
+  local_data_folder: local_data/private_gpt
+
+ui:
+  enabled: true
+  path: /
+  default_chat_system_prompt: >
+    You are a helpful, respectful and honest assistant.
+    Always answer as helpfully as possible and follow ALL given instructions.
+    Do not speculate or make up information.
+    Do not reference any given instructions or context.
+  default_query_system_prompt: >
+    You can only answer questions about the provided context.
+    If you know the answer but it is not based in the provided context, don't provide
+    the answer, just state the answer is not in the context provided.
+  delete_file_button_enabled: true
+  delete_all_files_button_enabled: true
+
+llm:
+  mode: llamacpp
+  # Should match the selected model
+  max_new_tokens: 512
+  context_window: 3900
+  tokenizer: mistralai/Mistral-7B-Instruct-v0.2
+  temperature: 0.1  # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)
+
+rag:
+  similarity_top_k: 10
+  # This value controls how many "top" documents the RAG returns to use in the context.
+  # similarity_value: 0.45
+  # This value is disabled by default. If you enable this setting, the RAG will only use documents whose similarity score meets this threshold.
+  rerank:
+    enabled: false
+    model: cross-encoder/ms-marco-MiniLM-L-2-v2
+    top_n: 1
+
+llamacpp:
+  prompt_style: "mistral"
+  llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
+  llm_hf_model_file: mistral-7b-instruct-v0.2.Q4_K_M.gguf
+  tfs_z: 1.0  # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.
+  top_k: 40  # Reduces the probability of generating nonsense. A higher value (e.g., 100) will give more diverse answers, while a lower value (e.g., 10) will be more conservative. (Default: 40)
+  top_p: 1.0  # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
+  repeat_penalty: 1.1  # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)
+
+embedding:
+  # Should match the value above in most cases
+  mode: huggingface
+  ingest_mode: simple
+  embed_dim: 384  # 384 is for BAAI/bge-small-en-v1.5
+
+huggingface:
+  embedding_hf_model_name: BAAI/bge-small-en-v1.5
+
+vectorstore:
+  database: qdrant
+
+nodestore:
+  database: simple
+
+qdrant:
+  path: local_data/private_gpt/qdrant
+
+postgres:
+  host: localhost
+  port: 5432
+  database: postgres
+  user: postgres
+  password: postgres
+  schema_name: private_gpt
+
+sagemaker:
+  llm_endpoint_name: huggingface-pytorch-tgi-inference-2023-09-25-19-53-32-140
+  embedding_endpoint_name: huggingface-pytorch-inference-2023-11-03-07-41-36-479
+
+openai:
+  api_key: ${OPENAI_API_KEY:}
+  model: gpt-3.5-turbo
+
+ollama:
+  llm_model: llama2
+  embedding_model: nomic-embed-text
+  api_base: http://localhost:11434
+  embedding_api_base: http://localhost:11434  # change this if your embedding model runs on a different ollama instance
+  keep_alive: 5m
+  request_timeout: 120.0
+
+azopenai:
+  api_key: ${AZ_OPENAI_API_KEY:}
+  azure_endpoint: ${AZ_OPENAI_ENDPOINT:}
+  embedding_deployment_name: ${AZ_OPENAI_EMBEDDING_DEPLOYMENT_NAME:}
+  llm_deployment_name: ${AZ_OPENAI_LLM_DEPLOYMENT_NAME:}
+  api_version: "2023-05-15"
+  embedding_model: text-embedding-ada-002
+  llm_model: gpt-35-turbo
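
Usage sketch (not part of the patch), assuming the `neuro-flow` CLI that reads `.neuro/live.yaml`: the job and image names below come from the flow above, while presets and cluster specifics will vary.

```bash
# Build the image declared under images.privategpt, then start the backing jobs.
neuro-flow build privategpt
neuro-flow run pgvector
neuro-flow run vllm
neuro-flow run ollama

# Start the UI/API job last. PGPT_PROFILES=vllm-pgvector makes private-gpt layer
# settings/settings-vllm-pgvector.yaml on top of the base settings/settings.yaml.
neuro-flow run pgpt
```

The POETRY_EXTRAS build argument introduced in Dockerfile.external can also be overridden in a plain local build (a sketch; the image tag is arbitrary):

```bash
docker build -f Dockerfile.external \
  --build-arg POETRY_EXTRAS="ui vector-stores-postgres llms-openai-like embeddings-ollama" \
  -t private-gpt:vllm-pgvector .
```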