diff --git a/test/e2e_node/gpu_device_plugin.go b/test/e2e_node/gpu_device_plugin.go
index 5fe124f4fb0..4a3927218bf 100644
--- a/test/e2e_node/gpu_device_plugin.go
+++ b/test/e2e_node/gpu_device_plugin.go
@@ -18,7 +18,6 @@ package e2e_node
 
 import (
 	"os/exec"
-	"path/filepath"
 	"regexp"
 	"time"
 
@@ -49,6 +48,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
 			initialConfig.FeatureGates[string(features.DevicePlugins)] = true
 		})
 
+		var devicePluginPod *v1.Pod
 		BeforeEach(func() {
 			By("Ensuring that Nvidia GPUs exists on the node")
 			if !checkIfNvidiaGPUsExistOnNode() {
@@ -56,7 +56,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
 			}
 
 			By("Creating the Google Device Plugin pod for NVIDIA GPU in GKE")
-			f.PodClient().CreateSync(framework.NVIDIADevicePlugin(f.Namespace.Name))
+			devicePluginPod = f.PodClient().CreateSync(framework.NVIDIADevicePlugin(f.Namespace.Name))
 
 			By("Waiting for GPUs to become available on the local node")
 			Eventually(func() bool {
@@ -84,7 +84,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
 		It("checks that when Kubelet restarts exclusive GPU assignation to pods is kept.", func() {
 			By("Creating one GPU pod on a node with at least two GPUs")
 			p1 := f.PodClient().CreateSync(makeCudaPauseImage())
-			devId1 := getDeviceId(f, p1.Name, p1.Name, 1)
+			count1, devId1 := getDeviceId(f, p1.Name, p1.Name, 1)
 			p1, err := f.PodClient().Get(p1.Name, metav1.GetOptions{})
 			framework.ExpectNoError(err)
 
@@ -92,17 +92,36 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
 			restartKubelet(f)
 
 			By("Confirming that after a kubelet and pod restart, GPU assignement is kept")
-			devIdRestart := getDeviceId(f, p1.Name, p1.Name, 2)
-			Expect(devIdRestart).To(Equal(devId1))
+			count1, devIdRestart1 := getDeviceId(f, p1.Name, p1.Name, count1+1)
+			Expect(devIdRestart1).To(Equal(devId1))
 
 			By("Restarting Kubelet and creating another pod")
 			restartKubelet(f)
 			p2 := f.PodClient().CreateSync(makeCudaPauseImage())
 
 			By("Checking that pods got a different GPU")
-			devId2 := getDeviceId(f, p2.Name, p2.Name, 1)
+			count2, devId2 := getDeviceId(f, p2.Name, p2.Name, 1)
 			Expect(devId1).To(Not(Equal(devId2)))
 
+			By("Deleting device plugin.")
+			f.PodClient().Delete(devicePluginPod.Name, &metav1.DeleteOptions{})
+			By("Waiting for GPUs to become unavailable on the local node")
+			Eventually(func() bool {
+				return framework.NumberOfNVIDIAGPUs(getLocalNode(f)) <= 0
+			}, 10*time.Minute, framework.Poll).Should(BeTrue())
+			By("Checking that scheduled pods can continue to run even after we delete device plugin.")
+			count1, devIdRestart1 = getDeviceId(f, p1.Name, p1.Name, count1+1)
+			Expect(devIdRestart1).To(Equal(devId1))
+			count2, devIdRestart2 := getDeviceId(f, p2.Name, p2.Name, count2+1)
+			Expect(devIdRestart2).To(Equal(devId2))
+			By("Restarting Kubelet.")
+			restartKubelet(f)
+			By("Checking that scheduled pods can continue to run even after we delete device plugin and restart Kubelet.")
+			count1, devIdRestart1 = getDeviceId(f, p1.Name, p1.Name, count1+2)
+			Expect(devIdRestart1).To(Equal(devId1))
+			count2, devIdRestart2 = getDeviceId(f, p2.Name, p2.Name, count2+2)
+			Expect(devIdRestart2).To(Equal(devId2))
+
 			// Cleanup
 			f.PodClient().DeleteSync(p1.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
 			f.PodClient().DeleteSync(p2.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
@@ -140,9 +159,6 @@ func newDecimalResourceList(name v1.ResourceName, quantity int64) v1.ResourceLis
 
 // TODO: Find a uniform way to deal with systemctl/initctl/service operations. #34494
 func restartKubelet(f *framework.Framework) {
-	beforeSocks, err := filepath.Glob("/var/lib/kubelet/device-plugins/nvidiaGPU*.sock")
-	framework.ExpectNoError(err)
-	Expect(len(beforeSocks)).NotTo(BeZero())
 	stdout, err := exec.Command("sudo", "systemctl", "list-units", "kubelet*", "--state=running").CombinedOutput()
 	framework.ExpectNoError(err)
 	regex := regexp.MustCompile("(kubelet-[0-9]+)")
@@ -152,19 +168,18 @@ func restartKubelet(f *framework.Framework) {
 	framework.Logf("Get running kubelet with systemctl: %v, %v", string(stdout), kube)
 	stdout, err = exec.Command("sudo", "systemctl", "restart", kube).CombinedOutput()
 	framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %v", err, stdout)
-	Eventually(func() ([]string, error) {
-		return filepath.Glob("/var/lib/kubelet/device-plugins/nvidiaGPU*.sock")
-	}, 5*time.Minute, framework.Poll).ShouldNot(ConsistOf(beforeSocks))
 }
 
-func getDeviceId(f *framework.Framework, podName string, contName string, restartCount int32) string {
+func getDeviceId(f *framework.Framework, podName string, contName string, restartCount int32) (int32, string) {
+	var count int32
 	// Wait till pod has been restarted at least restartCount times.
 	Eventually(func() bool {
 		p, err := f.PodClient().Get(podName, metav1.GetOptions{})
 		if err != nil || len(p.Status.ContainerStatuses) < 1 {
 			return false
 		}
-		return p.Status.ContainerStatuses[0].RestartCount >= restartCount
+		count = p.Status.ContainerStatuses[0].RestartCount
+		return count >= restartCount
 	}, 5*time.Minute, framework.Poll).Should(BeTrue())
 	logs, err := framework.GetPodLogs(f.ClientSet, f.Namespace.Name, podName, contName)
 	if err != nil {
@@ -174,7 +189,7 @@ func getDeviceId(f *framework.Framework, podName string, contName string, restar
 	regex := regexp.MustCompile("gpu devices: (nvidia[0-9]+)")
 	matches := regex.FindStringSubmatch(logs)
 	if len(matches) < 2 {
-		return ""
+		return count, ""
 	}
-	return matches[1]
+	return count, matches[1]
 }