From 17f32a5f92635c7548a33eabe0b9a113c2136931 Mon Sep 17 00:00:00 2001
From: Yevhenii Semendiak
Date: Sat, 11 May 2024 11:33:11 +0300
Subject: [PATCH] Use Llama 3

---
 .neuro/live.yaml | 8 +++++---
 README.md        | 9 +++++----
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/.neuro/live.yaml b/.neuro/live.yaml
index 07f321ca..355d5bbb 100644
--- a/.neuro/live.yaml
+++ b/.neuro/live.yaml
@@ -65,8 +65,8 @@ jobs:
       VLLM_API_BASE: http://${{ inspect_job('vllm').internal_hostname_named }}:8000/v1
       OLLAMA_API_BASE: http://${{ inspect_job('ollama').internal_hostname_named }}:11434
       POSTGRES_HOST: ${{ inspect_job('pgvector').internal_hostname_named }}
-      VLLM_MODEL: stabilityai/StableBeluga-13B
-      VLLM_TOKENIZER: stabilityai/StableBeluga-13B
+      VLLM_MODEL: meta-llama/Meta-Llama-3-8B-Instruct
+      VLLM_TOKENIZER: meta-llama/Meta-Llama-3-8B-Instruct
 
   vllm:
     image: vllm/vllm-openai:v0.4.0
@@ -76,7 +76,9 @@
     http_port: "8000"
     volumes:
       - ${{ volumes.cache.ref_rw }}
-    cmd: --model stabilityai/StableBeluga-13B --tokenizer stabilityai/StableBeluga-13B --dtype=half --tensor-parallel-size=2
+    env:
+      HF_TOKEN: secret:HF_TOKEN
+    cmd: --model meta-llama/Meta-Llama-3-8B-Instruct --tokenizer meta-llama/Meta-Llama-3-8B-Instruct --dtype=half --tensor-parallel-size=2
 
   ollama:
     image: ollama/ollama:latest
diff --git a/README.md b/README.md
index f78caada..d24cb07f 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,11 @@ Note: this setup is mostly for POC purposes. For production-ready setup, you'll
 
 1. `git clone` this repo && `cd` into root of it. We assume you've installed CLIs for the platform and authorized to the cluster.
 1. Build image for web app with `neuro-flow build privategpt`
 2. Create block storage for PGVector with `neuro disk create --name pgdata 10G --timeout-unused 100d`
-3. `neuro-flow run pgvector` -- start vector store
-4. `neuro-flow run ollama` -- start embeddings server
-5. `neuro-flow run vllm` -- start LLM inference server. Note: if you want to change LLM hosted there, change it in bash command and in `env.VLLM_MODEL` of `pgpt` job.
-6. `neuro-flow run pgpt` -- start PrivateGPT web server.
+3. Create a secret with your HuggingFace token, needed to pull the gated Llama 3 weights: `neuro secret add HF_TOKEN <your-token>` (see https://huggingface.co/settings/tokens)
+4. `neuro-flow run pgvector` -- start vector store
+5. `neuro-flow run ollama` -- start embeddings server
+6. `neuro-flow run vllm` -- start LLM inference server. Note: to change the LLM hosted there, update it both in the `cmd` of the `vllm` job and in `env.VLLM_MODEL` of the `pgpt` job.
+7. `neuro-flow run pgpt` -- start PrivateGPT web server.
 
 ### Running PrivateGPT as stand-alone job
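
To sanity-check the model swap after `neuro-flow run vllm`, you can hit vLLM's
OpenAI-compatible API directly. A minimal sketch, assuming the job's port 8000
is forwarded to localhost (for example via `neuro port-forward <vllm-job>
8000:8000`, where `<vllm-job>` is the running job's name or id) and the
weights have finished downloading:

    # Should list meta-llama/Meta-Llama-3-8B-Instruct as the served model
    curl http://localhost:8000/v1/models

    # One chat request through the OpenAI-compatible API
    curl http://localhost:8000/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct",
           "messages": [{"role": "user", "content": "Say hello"}]}'

The model name in the request must match `VLLM_MODEL` exactly. Llama 3 is a
gated repository, so request access on its Hugging Face page first; otherwise
the `vllm` job will fail to download the weights even with `HF_TOKEN` set.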