From 17f32a5f92635c7548a33eabe0b9a113c2136931 Mon Sep 17 00:00:00 2001
From: Yevhenii Semendiak
Date: Sat, 11 May 2024 11:33:11 +0300
Subject: [PATCH] Use Llama 3

---
 .neuro/live.yaml | 8 +++++---
 README.md        | 9 +++++----
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/.neuro/live.yaml b/.neuro/live.yaml
index 07f321ca..355d5bbb 100644
--- a/.neuro/live.yaml
+++ b/.neuro/live.yaml
@@ -65,8 +65,8 @@ jobs:
       VLLM_API_BASE: http://${{ inspect_job('vllm').internal_hostname_named }}:8000/v1
       OLLAMA_API_BASE: http://${{ inspect_job('ollama').internal_hostname_named }}:11434
       POSTGRES_HOST: ${{ inspect_job('pgvector').internal_hostname_named }}
-      VLLM_MODEL: stabilityai/StableBeluga-13B
-      VLLM_TOKENIZER: stabilityai/StableBeluga-13B
+      VLLM_MODEL: meta-llama/Meta-Llama-3-8B-Instruct
+      VLLM_TOKENIZER: meta-llama/Meta-Llama-3-8B-Instruct
 
   vllm:
     image: vllm/vllm-openai:v0.4.0
@@ -76,7 +76,9 @@
     http_port: "8000"
     volumes:
       - ${{ volumes.cache.ref_rw }}
-    cmd: --model stabilityai/StableBeluga-13B --tokenizer stabilityai/StableBeluga-13B --dtype=half --tensor-parallel-size=2
+    env:
+      HF_TOKEN: secret:HF_TOKEN
+    cmd: --model meta-llama/Meta-Llama-3-8B-Instruct --tokenizer meta-llama/Meta-Llama-3-8B-Instruct --dtype=half --tensor-parallel-size=2
 
   ollama:
     image: ollama/ollama:latest
diff --git a/README.md b/README.md
index f78caada..d24cb07f 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,11 @@ Note: this setup is mostly for POC purposes. For production-ready setup, you'll
 
 1. `git clone` this repo && `cd` into root of it. We assume you've installed CLIs for the platform and authorized to the cluster.
 1. Build image for web app with `neuro-flow build privategpt`
 2. Create block storage for PGVector with `neuro disk create --name pgdata 10G --timeout-unused 100d`
-3. `neuro-flow run pgvector` -- start vector store
-4. `neuro-flow run ollama` -- start embeddings server
-5. `neuro-flow run vllm` -- start LLM inference server. Note: if you want to change LLM hosted there, change it in bash command and in `env.VLLM_MODEL` of `pgpt` job.
-6. `neuro-flow run pgpt` -- start PrivateGPT web server.
+3. Create a secret with your HuggingFace token, needed to pull the gated Llama 3 weights: `neuro secret add HF_TOKEN <your-token>` (see https://huggingface.co/settings/tokens)
+4. `neuro-flow run pgvector` -- start vector store
+5. `neuro-flow run ollama` -- start embeddings server
+6. `neuro-flow run vllm` -- start LLM inference server. Note: to change the LLM hosted there, update it both in the `cmd` of the `vllm` job and in `env.VLLM_MODEL` of the `pgpt` job.
+7. `neuro-flow run pgpt` -- start PrivateGPT web server.
 
 ### Running PrivateGPT as stand-alone job
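
To sanity-check the model swap after `neuro-flow run vllm`, you can hit vLLM's
OpenAI-compatible API directly. A minimal sketch, assuming the job's port 8000
is forwarded to localhost (for example via `neuro port-forward <vllm-job>
8000:8000`, where `<vllm-job>` is the running job's name or id) and the
weights have finished downloading:

    # Should list meta-llama/Meta-Llama-3-8B-Instruct as the served model
    curl http://localhost:8000/v1/models

    # One chat request through the OpenAI-compatible API
    curl http://localhost:8000/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct",
           "messages": [{"role": "user", "content": "Say hello"}]}'

The model name in the request must match `VLLM_MODEL` exactly. Llama 3 is a
gated repository, so request access on its Hugging Face page first; otherwise
the `vllm` job will fail to download the weights even with `HF_TOKEN` set.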