Mirror of https://github.com/imartinez/privateGPT.git (synced 2025-07-31 07:05:02 +00:00)
Update vLLM config to use llama 3.1 8B by default
Parent: 1938cac2c2
Commit: f8fdef0f38
@@ -69,22 +69,40 @@ jobs:
       VLLM_TOKENIZER: meta-llama/Meta-Llama-3-8B-Instruct

   vllm:
-    image: vllm/vllm-openai:v0.5.1
+    image: vllm/vllm-openai:v0.6.1.post2
     name: vllm
-    preset: H100x1
+    preset: gpu-medium
     detach: true
     http_port: "8000"
     volumes:
       - ${{ volumes.cache.ref_rw }}
     env:
       HF_TOKEN: secret:HF_TOKEN
-    cmd: --model meta-llama/Meta-Llama-3-8B-Instruct --tokenizer meta-llama/Meta-Llama-3-8B-Instruct --dtype=half
+    cmd: >
+      --model meta-llama/Meta-Llama-3.1-8B-Instruct
+      --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct
+      --dtype=half
+      --max-model-len=50000
+      --tensor-parallel-size=2
+    # cmd: >
+    #   --model meta-llama/Meta-Llama-3.1-8B-Instruct
+    #   --tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct
+    #   --dtype=half
+    # cmd: >
+    #   --model TechxGenus/Meta-Llama-3-70B-AWQ
+    #   --tokenizer TechxGenus/Meta-Llama-3-70B-AWQ
+    #   -q=awq
+    # cmd: >
+    #   --model mgoin/Meta-Llama-3-70B-Instruct-Marlin
+    #   --tokenizer mgoin/Meta-Llama-3-70B-Instruct-Marlin
+    #   --dtype=half
+    #   -q=marlin

   ollama:
     image: ollama/ollama:0.1.35
     volumes:
       - ${{ volumes.ollama_models.ref_rw }}
-    preset: H100x1
+    preset: gpu-small
     detach: true
     env:
       MODEL: "nomic-embed-text"
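Below is a minimal sketch (not part of this commit) of how a client could exercise the updated vllm job once it is running. It assumes the OpenAI-compatible server is reachable at http://localhost:8000 (matching the job's http_port) and that vLLM serves the model under its Hugging Face name, which is its default; adjust the host, port, and API key handling to your deployment.

    # Hypothetical client for the vllm job above; the host/port and the dummy API key are assumptions.
    from openai import OpenAI

    client = OpenAI(
        base_url="http://localhost:8000/v1",  # vLLM's OpenAI-compatible endpoint
        api_key="not-needed",                 # no API key is configured in the job above
    )

    response = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        messages=[{"role": "user", "content": "Summarize what privateGPT does in one sentence."}],
        max_tokens=128,
    )
    print(response.choices[0].message.content)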
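Similarly, a minimal sketch (also not part of this commit) of requesting an embedding from the ollama job, assuming Ollama listens on its default port 11434 and that the nomic-embed-text model named in the MODEL variable has already been pulled:

    # Hypothetical embedding request against the ollama job above; port 11434 is Ollama's default.
    import requests

    resp = requests.post(
        "http://localhost:11434/api/embeddings",
        json={"model": "nomic-embed-text", "prompt": "What does privateGPT do?"},
        timeout=60,
    )
    resp.raise_for_status()
    embedding = resp.json()["embedding"]
    print(f"embedding dimension: {len(embedding)}")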