kata-deploy: add node selector to nvidia runtime classes

The CC (confidential computing) runtime classes kata-qemu-nvidia-gpu-snp
and kata-qemu-nvidia-gpu-tdx are mutually exclusive with
kata-qemu-nvidia-gpu, as dictated by the GPU CC mode setting. To properly
support a cluster that has both CC and non-CC nodes, we add a node selector
to these runtime classes so that scheduling is consistent with the GPU mode.
The GPU operator sets the node label nvidia.com/cc.ready.state to "true" or
"false" to indicate the GPU mode setting.

Fixes #12431

Signed-off-by: Joji Mekkattuparamban <jojim@nvidia.com>
This commit is contained in:
Joji Mekkattuparamban
2026-02-02 08:48:05 -08:00
committed by Fabiano Fidêncio
parent f4dcb66a3c
commit f3bba08851
2 changed files with 28 additions and 1 deletions

View File

@@ -22,7 +22,7 @@
{{- end -}}
{{- end -}}
{{- /* Define runtime class configurations with their overhead settings */ -}}
{{- /* Define runtime class configurations with their overhead settings and node selectors */ -}}
{{- $runtimeClassConfigs := dict
"clh" (dict "memory" "130Mi" "cpu" "250m")
"cloud-hypervisor" (dict "memory" "130Mi" "cpu" "250m")
@@ -49,6 +49,7 @@
{{- /* Create RuntimeClass for each enabled shim */ -}}
{{- range $shim := $enabledShims }}
{{- $config := index $runtimeClassConfigs $shim }}
{{- $shimConfig := index $.Values.shims $shim }}
{{- if $config }}
---
kind: RuntimeClass
@@ -78,6 +79,11 @@ overhead:
scheduling:
nodeSelector:
katacontainers.io/kata-runtime: "true"
{{- if and $shimConfig.runtimeClass $shimConfig.runtimeClass.nodeSelector }}
{{- range $key, $value := $shimConfig.runtimeClass.nodeSelector }}
{{ $key }}: {{ $value | quote }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}

View File

@@ -114,6 +114,11 @@ shims:
allowedHypervisorAnnotations: []
containerd:
snapshotter: ""
runtimeClass:
# The gpu-operator adds this label to nodes automatically. Override it
# if you want to select nodes by a different label.
nodeSelector:
nvidia.com/cc.ready.state: "false"
qemu-nvidia-gpu-snp:
enabled: ~
@@ -128,6 +133,14 @@ shims:
agent:
httpsProxy: ""
noProxy: ""
runtimeClass:
# The gpu-operator adds the nvidia.com/cc.ready.state label and NFD
# (Node Feature Discovery) adds the snp label automatically. Override
# them if you want to select nodes by different labels. If you don't
# run NFD, you must add the snp label to your SNP nodes by other means.
nodeSelector:
nvidia.com/cc.ready.state: "true"
amd.feature.node.kubernetes.io/snp: "true"
qemu-nvidia-gpu-tdx:
enabled: ~
@@ -142,6 +155,14 @@ shims:
agent:
httpsProxy: ""
noProxy: ""
runtimeClass:
# The gpu-operator adds the nvidia.com/cc.ready.state label and NFD
# (Node Feature Discovery) adds the tdx label automatically. Override
# them if you want to select nodes by different labels. If you don't
# run NFD, you must add the tdx label to your TDX nodes by other means.
nodeSelector:
nvidia.com/cc.ready.state: "true"
intel.feature.node.kubernetes.io/tdx: "true"
qemu-snp:
enabled: ~