gpu: Add Pod spec for NIM llama

Pod spec for the NIM inference service

Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
Zvonko Kaiser 2025-04-16 15:09:58 +00:00
parent 66cd18ae67
commit 06466a53c5

@@ -0,0 +1,84 @@
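# NOTE: the ${VAR} placeholders below are not Kubernetes syntax; render them
# before applying, e.g. with envsubst (the file name is illustrative):
#   envsubst < nim-llama-instruct.yaml | kubectl apply -f -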
---
apiVersion: v1
kind: Secret
metadata:
  name: ngc-secret-instruct
type: kubernetes.io/dockerconfigjson
data:
  .dockerconfigjson: ${DOCKER_CONFIG_JSON}
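# How ${DOCKER_CONFIG_JSON} is produced is not shown here; one sketch, assuming
# an NGC API key and jq ($oauthtoken is the NGC registry username convention):
#   kubectl create secret docker-registry ngc-secret-instruct \
#     --docker-server=nvcr.io --docker-username='$oauthtoken' \
#     --docker-password="$NGC_API_KEY" --dry-run=client -o json \
#     | jq -r '.data[".dockerconfigjson"]'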
---
apiVersion: v1
kind: Pod
metadata:
  name: ${POD_NAME_INSTRUCT}
  labels:
    app: ${POD_NAME_INSTRUCT}
spec:
  restartPolicy: Never
  runtimeClassName: kata-qemu-nvidia-gpu
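  # kata-qemu-nvidia-gpu runs the Pod inside a Kata Containers VM with the GPU
  # passed through; that is why the resources below request nvidia.com/pgpu
  # (passthrough GPU) rather than the usual nvidia.com/gpu.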
  imagePullSecrets:
    - name: ngc-secret-instruct
  securityContext:
    runAsUser: 0
    runAsGroup: 0
    fsGroup: 0
  containers:
    - name: ${POD_NAME_INSTRUCT}
      image: nvcr.io/nim/meta/llama3-8b-instruct:1.0.0
      # Ports exposed by the container:
      ports:
        - containerPort: 8000
          name: http-openai
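      # NIM serves an OpenAI-compatible HTTP API on port 8000; the named port
      # http-openai is what the probes below reference.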
      livenessProbe:
        httpGet:
          path: /v1/health/live
          port: http-openai
        initialDelaySeconds: 15
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 3
      readinessProbe:
        httpGet:
          path: /v1/health/ready
          port: http-openai
        initialDelaySeconds: 15
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 3
      startupProbe:
        httpGet:
          path: /v1/health/ready
          port: http-openai
        initialDelaySeconds: 40
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 180
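      # Startup budget: 40s initial delay + up to 180 failures x 10s period
      # gives roughly 30 minutes for the first model download; liveness and
      # readiness checks only take effect once the startup probe succeeds.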
      # Environment variable for NGC_API_KEY. In production, use a Secret.
      env:
        - name: NGC_API_KEY
          value: "${NGC_API_KEY}"
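      # A sketch of the Secret-based alternative mentioned above (the Secret
      # name ngc-api-secret is hypothetical):
      #   - name: NGC_API_KEY
      #     valueFrom:
      #       secretKeyRef:
      #         name: ngc-api-secret
      #         key: NGC_API_KEY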
      # GPU resource request/limit (for NVIDIA GPU)
      resources:
        requests:
          cpu: "16"
          memory: "32Gi"
        limits:
          nvidia.com/pgpu: "1"
          cpu: "16"
          memory: "32Gi"
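      # For an extended resource such as nvidia.com/pgpu, specifying only the
      # limit is sufficient; Kubernetes sets the request to the same value.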
      # Mount the local .cache directory into the container
      volumeMounts:
        - name: nim-cache
          mountPath: /opt/nim/.cache
  # Host path volume for the local .cache directory.
  # Adjust 'path' to match your $LOCAL_NIM_CACHE location.
  volumes:
    - name: nim-cache
      hostPath:
        path: "/opt/nim/.cache"
        type: DirectoryOrCreate