gpu: Add Pod spec for NIM llama

Pod spec for the NIM inference service

Signed-off-by: Zvonko Kaiser <zkaiser@nvidia.com>
Zvonko Kaiser 2025-04-16 15:09:58 +00:00
parent 66cd18ae67
commit 06466a53c5

@@ -0,0 +1,84 @@
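# NOTE: the ${VAR} placeholders below are not Kubernetes syntax; render them
# before applying, e.g. with envsubst (the file name is illustrative):
#   envsubst < nim-llama-instruct.yaml | kubectl apply -f -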
---
apiVersion: v1
kind: Secret
metadata:
  name: ngc-secret-instruct
type: kubernetes.io/dockerconfigjson
data:
  .dockerconfigjson: ${DOCKER_CONFIG_JSON}
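# How ${DOCKER_CONFIG_JSON} is produced is not shown here; one sketch, assuming
# an NGC API key and jq ($oauthtoken is the NGC registry username convention):
#   kubectl create secret docker-registry ngc-secret-instruct \
#     --docker-server=nvcr.io --docker-username='$oauthtoken' \
#     --docker-password="$NGC_API_KEY" --dry-run=client -o json \
#     | jq -r '.data[".dockerconfigjson"]'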
---
apiVersion: v1
kind: Pod
metadata:
  name: ${POD_NAME_INSTRUCT}
  labels:
    app: ${POD_NAME_INSTRUCT}
spec:
  restartPolicy: Never
  runtimeClassName: kata-qemu-nvidia-gpu
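  # kata-qemu-nvidia-gpu runs the Pod inside a Kata Containers VM with the GPU
  # passed through; that is why the resources below request nvidia.com/pgpu
  # (passthrough GPU) rather than the usual nvidia.com/gpu.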
  imagePullSecrets:
    - name: ngc-secret-instruct
  securityContext:
    runAsUser: 0
    runAsGroup: 0
    fsGroup: 0
  containers:
    - name: ${POD_NAME_INSTRUCT}
      image: nvcr.io/nim/meta/llama3-8b-instruct:1.0.0
      # Ports exposed by the container:
      ports:
        - containerPort: 8000
          name: http-openai
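      # NIM serves an OpenAI-compatible HTTP API on port 8000; the named port
      # http-openai is what the probes below reference.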
      livenessProbe:
        httpGet:
          path: /v1/health/live
          port: http-openai
        initialDelaySeconds: 15
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 3
      readinessProbe:
        httpGet:
          path: /v1/health/ready
          port: http-openai
        initialDelaySeconds: 15
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 3
      startupProbe:
        httpGet:
          path: /v1/health/ready
          port: http-openai
        initialDelaySeconds: 40
        periodSeconds: 10
        timeoutSeconds: 1
        successThreshold: 1
        failureThreshold: 180
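      # Startup budget: 40s initial delay + up to 180 failures x 10s period
      # gives roughly 30 minutes for the first model download; liveness and
      # readiness checks only take effect once the startup probe succeeds.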
      # Environment variable for NGC_API_KEY. In production, use a Secret.
      env:
        - name: NGC_API_KEY
          value: "${NGC_API_KEY}"
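      # A sketch of the Secret-based alternative mentioned above (the Secret
      # name ngc-api-secret is hypothetical):
      #   - name: NGC_API_KEY
      #     valueFrom:
      #       secretKeyRef:
      #         name: ngc-api-secret
      #         key: NGC_API_KEY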
      # GPU resource request/limit (for NVIDIA GPU)
      resources:
        requests:
          cpu: "16"
          memory: "32Gi"
        limits:
          nvidia.com/pgpu: "1"
          cpu: "16"
          memory: "32Gi"
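      # For an extended resource such as nvidia.com/pgpu, specifying only the
      # limit is sufficient; Kubernetes sets the request to the same value.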
      # Mount the local .cache directory into the container
      volumeMounts:
        - name: nim-cache
          mountPath: /opt/nim/.cache
  # Host path volume for the local .cache directory.
  # Adjust 'path' to match your $LOCAL_NIM_CACHE location.
  volumes:
    - name: nim-cache
      hostPath:
        path: "/opt/nim/.cache"
        type: DirectoryOrCreate