Mirror of https://github.com/k3s-io/kubernetes.git
Merge pull request #56117 from jiayingz/deviceplugin-addon-config
Automatic merge from submit-queue (batch tested with PRs 56021, 55843, 55088, 56117, 55859). If you want to cherry-pick this change to another branch, please follow the instructions here: https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md

Changes nvidia-gpu device plugin addon config settings:
- Runs the device plugin as a system-critical pod.
- Makes its resource limits match its resource requests.
- Modifies test/e2e/scheduling/nvidia-gpus.go to cope with the recent change of running the device plugin as a system addon.
- Bases the addon's resource settings on test results from 8 nvidia-tesla-k80 GPUs.

**What this PR does / why we need it**:

**Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*:
Fixes #

**Special notes for your reviewer**:

**Release note**:
```release-note
```
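For reference, a minimal sketch of the addon pod settings after these changes, assembled from the diff hunks below (not the complete DaemonSet manifest; image, command, volume, and affinity settings are omitted):

```yaml
# Sketch of the nvidia-gpu-device-plugin addon pod settings after this PR
# (assembled from the diff hunks below; not the full DaemonSet manifest).
metadata:
  labels:
    k8s-app: nvidia-gpu-device-plugin
  annotations:
    # Marks the pod as critical via the (alpha) critical-pod annotation.
    scheduler.alpha.kubernetes.io/critical-pod: ''
spec:
  # Gives the addon system-node-critical scheduling priority.
  priorityClassName: system-node-critical
  containers:
  - name: nvidia-gpu-device-plugin
    resources:
      requests:
        cpu: 50m
        memory: 10Mi
      # Limits now equal requests, so the pod gets the Guaranteed QoS class.
      limits:
        cpu: 50m
        memory: 10Mi
```

With limits equal to requests the pod falls into the Guaranteed QoS class, which, together with the critical-pod annotation and the system-node-critical priority class, helps keep the addon scheduled and running under node resource pressure.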
Commit da96ce00e5
```diff
@@ -11,7 +11,10 @@ spec:
     metadata:
       labels:
         k8s-app: nvidia-gpu-device-plugin
+      annotations:
+        scheduler.alpha.kubernetes.io/critical-pod: ''
     spec:
+      priorityClassName: system-node-critical
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
```
```diff
@@ -34,7 +37,10 @@ spec:
         name: nvidia-gpu-device-plugin
         resources:
           requests:
-            cpu: 10m
+            cpu: 50m
             memory: 10Mi
+          limits:
+            cpu: 50m
+            memory: 10Mi
         securityContext:
           privileged: true
```
```diff
@@ -183,6 +183,11 @@ func testNvidiaGPUsOnCOS(f *framework.Framework) {
 
 	pods, err := framework.WaitForControlledPods(f.ClientSet, ds.Namespace, ds.Name, extensionsinternal.Kind("DaemonSet"))
 	framework.ExpectNoError(err, "getting pods controlled by the daemonset")
+	devicepluginPods, err := framework.WaitForControlledPods(f.ClientSet, "kube-system", "nvidia-gpu-device-plugin", extensionsinternal.Kind("DaemonSet"))
+	if err == nil {
+		framework.Logf("Adding deviceplugin addon pod.")
+		pods.Items = append(pods.Items, devicepluginPods.Items...)
+	}
 	framework.Logf("Starting ResourceUsageGather for the created DaemonSet pods.")
 	rsgather, err := framework.NewResourceUsageGatherer(f.ClientSet, framework.ResourceGathererOptions{false, false, 2 * time.Second, 2 * time.Second, true}, pods)
 	framework.ExpectNoError(err, "creating ResourceUsageGather for the daemonset pods")
```