Extends gpu_device_plugin e2e_node test to verify that scheduled pods
can continue to run even after device plugin deletion and kubelet
restarts.
Jiaying Zhang
2017-11-03 16:38:00 -07:00
parent 1eb4e79453
commit 990113ce60
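For context before the diff: getDeviceId (reworked below) discovers which GPU a pod was assigned by scanning the pod's container logs for a line of the form "gpu devices: nvidiaN". The standalone sketch below shows just that parsing step, reusing the same regular expression as the test; the sample log line is invented for illustration and is not taken from the CUDA pause image.

package main

import (
	"fmt"
	"regexp"
)

// parseGPUDeviceID mirrors the regexp used by getDeviceId in the diff below:
// it pulls the assigned device name (e.g. "nvidia0") out of container logs.
func parseGPUDeviceID(logs string) string {
	re := regexp.MustCompile("gpu devices: (nvidia[0-9]+)")
	m := re.FindStringSubmatch(logs)
	if len(m) < 2 {
		return ""
	}
	return m[1]
}

func main() {
	// Hypothetical log output; the real test reads logs via framework.GetPodLogs.
	sample := "gpu devices: nvidia1"
	fmt.Println(parseGPUDeviceID(sample)) // prints "nvidia1"
}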

@@ -18,7 +18,6 @@ package e2e_node
import (
"os/exec"
"path/filepath"
"regexp"
"time"
@@ -49,6 +48,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
initialConfig.FeatureGates[string(features.DevicePlugins)] = true
})
+ var devicePluginPod *v1.Pod
BeforeEach(func() {
By("Ensuring that Nvidia GPUs exists on the node")
if !checkIfNvidiaGPUsExistOnNode() {
@@ -56,7 +56,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
}
By("Creating the Google Device Plugin pod for NVIDIA GPU in GKE")
- f.PodClient().CreateSync(framework.NVIDIADevicePlugin(f.Namespace.Name))
+ devicePluginPod = f.PodClient().CreateSync(framework.NVIDIADevicePlugin(f.Namespace.Name))
By("Waiting for GPUs to become available on the local node")
Eventually(func() bool {
@@ -84,7 +84,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
It("checks that when Kubelet restarts exclusive GPU assignation to pods is kept.", func() {
By("Creating one GPU pod on a node with at least two GPUs")
p1 := f.PodClient().CreateSync(makeCudaPauseImage())
- devId1 := getDeviceId(f, p1.Name, p1.Name, 1)
+ count1, devId1 := getDeviceId(f, p1.Name, p1.Name, 1)
p1, err := f.PodClient().Get(p1.Name, metav1.GetOptions{})
framework.ExpectNoError(err)
@@ -92,17 +92,36 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
restartKubelet(f)
By("Confirming that after a kubelet and pod restart, GPU assignement is kept")
- devIdRestart := getDeviceId(f, p1.Name, p1.Name, 2)
- Expect(devIdRestart).To(Equal(devId1))
+ count1, devIdRestart1 := getDeviceId(f, p1.Name, p1.Name, count1+1)
+ Expect(devIdRestart1).To(Equal(devId1))
By("Restarting Kubelet and creating another pod")
restartKubelet(f)
p2 := f.PodClient().CreateSync(makeCudaPauseImage())
By("Checking that pods got a different GPU")
- devId2 := getDeviceId(f, p2.Name, p2.Name, 1)
+ count2, devId2 := getDeviceId(f, p2.Name, p2.Name, 1)
Expect(devId1).To(Not(Equal(devId2)))
By("Deleting device plugin.")
f.PodClient().Delete(devicePluginPod.Name, &metav1.DeleteOptions{})
By("Waiting for GPUs to become unavailable on the local node")
Eventually(func() bool {
return framework.NumberOfNVIDIAGPUs(getLocalNode(f)) <= 0
}, 10*time.Minute, framework.Poll).Should(BeTrue())
By("Checking that scheduled pods can continue to run even after we delete device plugin.")
count1, devIdRestart1 = getDeviceId(f, p1.Name, p1.Name, count1+1)
Expect(devIdRestart1).To(Equal(devId1))
count2, devIdRestart2 := getDeviceId(f, p2.Name, p2.Name, count2+1)
Expect(devIdRestart2).To(Equal(devId2))
By("Restarting Kubelet.")
restartKubelet(f)
By("Checking that scheduled pods can continue to run even after we delete device plugin and restart Kubelet.")
count1, devIdRestart1 = getDeviceId(f, p1.Name, p1.Name, count1+2)
Expect(devIdRestart1).To(Equal(devId1))
count2, devIdRestart2 = getDeviceId(f, p2.Name, p2.Name, count2+2)
Expect(devIdRestart2).To(Equal(devId2))
// Cleanup
f.PodClient().DeleteSync(p1.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
f.PodClient().DeleteSync(p2.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
@@ -140,9 +159,6 @@ func newDecimalResourceList(name v1.ResourceName, quantity int64) v1.ResourceLis
// TODO: Find a uniform way to deal with systemctl/initctl/service operations. #34494
func restartKubelet(f *framework.Framework) {
- beforeSocks, err := filepath.Glob("/var/lib/kubelet/device-plugins/nvidiaGPU*.sock")
- framework.ExpectNoError(err)
- Expect(len(beforeSocks)).NotTo(BeZero())
stdout, err := exec.Command("sudo", "systemctl", "list-units", "kubelet*", "--state=running").CombinedOutput()
framework.ExpectNoError(err)
regex := regexp.MustCompile("(kubelet-[0-9]+)")
@@ -152,19 +168,18 @@ func restartKubelet(f *framework.Framework) {
framework.Logf("Get running kubelet with systemctl: %v, %v", string(stdout), kube)
stdout, err = exec.Command("sudo", "systemctl", "restart", kube).CombinedOutput()
framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %v", err, stdout)
- Eventually(func() ([]string, error) {
- return filepath.Glob("/var/lib/kubelet/device-plugins/nvidiaGPU*.sock")
- }, 5*time.Minute, framework.Poll).ShouldNot(ConsistOf(beforeSocks))
}
- func getDeviceId(f *framework.Framework, podName string, contName string, restartCount int32) string {
+ func getDeviceId(f *framework.Framework, podName string, contName string, restartCount int32) (int32, string) {
+ var count int32
// Wait till pod has been restarted at least restartCount times.
Eventually(func() bool {
p, err := f.PodClient().Get(podName, metav1.GetOptions{})
if err != nil || len(p.Status.ContainerStatuses) < 1 {
return false
}
- return p.Status.ContainerStatuses[0].RestartCount >= restartCount
+ count = p.Status.ContainerStatuses[0].RestartCount
+ return count >= restartCount
}, 5*time.Minute, framework.Poll).Should(BeTrue())
logs, err := framework.GetPodLogs(f.ClientSet, f.Namespace.Name, podName, contName)
if err != nil {
@@ -174,7 +189,7 @@ func getDeviceId(f *framework.Framework, podName string, contName string, restar
regex := regexp.MustCompile("gpu devices: (nvidia[0-9]+)")
matches := regex.FindStringSubmatch(logs)
if len(matches) < 2 {
return ""
return count, ""
}
- return matches[1]
+ return count, matches[1]
}
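A usage note on the new signature: getDeviceId now also returns the container's observed restart count, so the test above can pass count+1 (or count+2 after a second restart) instead of hard-coding restart numbers. The sketch below is a plain-Go stand-in for the Eventually polling the helper relies on; waitFor and the fake restart counter are illustrative only and not part of the e2e framework.

package main

import (
	"errors"
	"fmt"
	"time"
)

// waitFor polls cond every interval until it returns true or the timeout
// elapses, roughly what Gomega's Eventually(...).Should(BeTrue()) does above.
func waitFor(cond func() bool, timeout, interval time.Duration) error {
	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		if cond() {
			return nil
		}
		time.Sleep(interval)
	}
	return errors.New("timed out waiting for condition")
}

func main() {
	// Fake restart counter standing in for ContainerStatuses[0].RestartCount.
	var observed int32
	err := waitFor(func() bool {
		observed++ // the real helper re-reads the pod status on every poll
		return observed >= 2
	}, time.Second, 50*time.Millisecond)
	fmt.Println(observed, err) // 2 <nil>
}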