From d727df1741e68f7159c95d3b5a7f55535d125e5e Mon Sep 17 00:00:00 2001
From: Swati Sehgal
Date: Thu, 27 Apr 2023 11:26:50 +0100
Subject: [PATCH] node: device-plugin: e2e: Additional test cases

Additional test cases added:
Keeps device plugin assignments across pod and kubelet restarts (no device plugin re-registration)
Keeps device plugin assignments after the device plugin has re-registered (no kubelet or pod restart)

Signed-off-by: Swati Sehgal
---
 test/e2e_node/device_plugin_test.go | 119 ++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)

diff --git a/test/e2e_node/device_plugin_test.go b/test/e2e_node/device_plugin_test.go
index 899c45775dc..e4d598348f9 100644
--- a/test/e2e_node/device_plugin_test.go
+++ b/test/e2e_node/device_plugin_test.go
@@ -340,6 +340,125 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
 			framework.ExpectNoError(err, "inconsistent device assignment after pod restart")
 		})
 
+		// simulate kubelet and container restart, *but not* device plugin re-registration.
+		// The device assignment should be kept and be stable across the kubelet and container restart,
+		// because it's the kubelet which performs the device allocation, and the device plugin is stable.
+		ginkgo.It("Keeps device plugin assignments across pod and kubelet restarts (no device plugin re-registration)", func(ctx context.Context) {
+			podRECMD := fmt.Sprintf("devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep %s", sleepIntervalWithRestart)
+			pod1 := e2epod.NewPodClient(f).CreateSync(ctx, makeBusyboxPod(SampleDeviceResourceName, podRECMD))
+			deviceIDRE := "stub devices: (Dev-[0-9]+)"
+			devID1, err := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE)
+			framework.ExpectNoError(err, "getting logs for pod %q", pod1.Name)
+
+			gomega.Expect(devID1).To(gomega.Not(gomega.Equal("")), "pod1 requested a device but started successfully without")
+
+			pod1, err = e2epod.NewPodClient(f).Get(ctx, pod1.Name, metav1.GetOptions{})
+			framework.ExpectNoError(err)
+
+			ginkgo.By("Wait for node to be ready again")
+			e2enode.WaitForAllNodesSchedulable(ctx, f.ClientSet, 5*time.Minute)
+
+			ginkgo.By("Waiting for container to restart")
+			ensurePodContainerRestart(ctx, f, pod1.Name, pod1.Name)
+
+			ginkgo.By("Confirming that after a container restart, fake-device assignment is kept")
+			devIDRestart1, err := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE)
+			framework.ExpectNoError(err, "getting logs for pod %q", pod1.Name)
+			framework.ExpectEqual(devIDRestart1, devID1)
+
+			ginkgo.By("Restarting Kubelet")
+			restartKubelet(true)
+
+			ginkgo.By("Wait for node to be ready again")
+			e2enode.WaitForAllNodesSchedulable(ctx, f.ClientSet, 5*time.Minute)
+
+			ginkgo.By("Waiting for the pod to fail with admission error as the device plugin hasn't re-registered yet")
+			gomega.Eventually(ctx, getPod).
+				WithArguments(f, pod1.Name).
+				WithTimeout(time.Minute).
+				Should(HaveFailedWithAdmissionError(),
+					"the pod started successfully, but it should have failed with an admission error")
+
+			// crosscheck that the device assignment is preserved and stable from the kubelet's perspective.
+			// note we don't check the container logs again: the check is done at startup, and the container
+			// never restarted (it runs "forever" on this test's timescale), hence re-doing this check
+			// is useless.
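+			// Concretely, the crosscheck below goes through the kubelet podresources API: getV1NodeDevices
+			// fetches the per-container device assignments as the kubelet reports them, and
+			// checkPodResourcesAssignment asserts that pod1's container is still listed with devID1.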
+ ginkgo.By("Verifying the device assignment after kubelet restart using podresources API") + gomega.Eventually(ctx, func() error { + v1PodResources, err = getV1NodeDevices(ctx) + return err + }, 30*time.Second, framework.Poll).ShouldNot(gomega.HaveOccurred(), "cannot fetch the compute resource assignment after kubelet restart") + + err = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1}) + framework.ExpectNoError(err, "inconsistent device assignment after pod restart") + }) + + // simulate device plugin re-registration, *but not* container and kubelet restart. + // After the device plugin has re-registered, the list healthy devices is repopulated based on the devices discovered. + // Once Pod2 is running we determine the device that was allocated it. As long as the device allocation succeeds the + // test should pass. + + ginkgo.It("Keeps device plugin assignments after the device plugin has been re-registered (no kubelet, pod restart)", func(ctx context.Context) { + podRECMD := fmt.Sprintf("devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep %s", sleepIntervalForever) + pod1 := e2epod.NewPodClient(f).CreateSync(ctx, makeBusyboxPod(SampleDeviceResourceName, podRECMD)) + deviceIDRE := "stub devices: (Dev-[0-9]+)" + devID1, err := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) + framework.ExpectNoError(err, "getting logs for pod %q", pod1.Name) + gomega.Expect(devID1).To(gomega.Not(gomega.Equal("")), "pod1 requested a device but started successfully without") + + pod1, err = e2epod.NewPodClient(f).Get(ctx, pod1.Name, metav1.GetOptions{}) + framework.ExpectNoError(err) + + ginkgo.By("Wait for node to be ready again") + e2enode.WaitForAllNodesSchedulable(ctx, f.ClientSet, 5*time.Minute) + + ginkgo.By("Re-Register resources and delete the plugin pod") + gp := int64(0) + deleteOptions := metav1.DeleteOptions{ + GracePeriodSeconds: &gp, + } + e2epod.NewPodClient(f).DeleteSync(ctx, devicePluginPod.Name, deleteOptions, time.Minute) + waitForContainerRemoval(ctx, devicePluginPod.Spec.Containers[0].Name, devicePluginPod.Name, devicePluginPod.Namespace) + + ginkgo.By("Recreating the plugin pod") + devicePluginPod = e2epod.NewPodClient(f).CreateSync(ctx, dptemplate) + err = e2epod.WaitTimeoutForPodRunningInNamespace(ctx, f.ClientSet, devicePluginPod.Name, devicePluginPod.Namespace, 1*time.Minute) + framework.ExpectNoError(err) + + ginkgo.By("Waiting for resource to become available on the local node after re-registration") + gomega.Eventually(ctx, func() bool { + node, ready := getLocalTestNode(ctx, f) + return ready && + CountSampleDeviceCapacity(node) == expectedSampleDevsAmount && + CountSampleDeviceAllocatable(node) == expectedSampleDevsAmount + }, 30*time.Second, framework.Poll).Should(gomega.BeTrue()) + + // crosscheck that after device plugin re-registration the device assignment is preserved and + // stable from the kubelet's perspective. + // note we don't check again the logs of the container: the check is done at startup, the container + // never restarted (runs "forever" from this test timescale perspective) hence re-doing this check + // is useless. 
+ ginkgo.By("Verifying the device assignment after device plugin re-registration using podresources API") + gomega.Eventually(ctx, func() error { + v1PodResources, err = getV1NodeDevices(ctx) + return err + }, 30*time.Second, framework.Poll).ShouldNot(gomega.HaveOccurred(), "cannot fetch the compute resource assignment after kubelet restart") + + err = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1}) + framework.ExpectNoError(err, "inconsistent device assignment after pod restart") + + ginkgo.By("Creating another pod") + pod2 := e2epod.NewPodClient(f).CreateSync(ctx, makeBusyboxPod(SampleDeviceResourceName, podRECMD)) + err = e2epod.WaitTimeoutForPodRunningInNamespace(ctx, f.ClientSet, pod2.Name, f.Namespace.Name, 1*time.Minute) + framework.ExpectNoError(err) + + ginkgo.By("Checking that pod got a fake device") + devID2, err := parseLog(ctx, f, pod2.Name, pod2.Name, deviceIDRE) + framework.ExpectNoError(err, "getting logs for pod %q", pod2.Name) + + gomega.Expect(devID2).To(gomega.Not(gomega.Equal("")), "pod2 requested a device but started successfully without") + }) + // simulate kubelet restart *and* device plugin re-registration, while the pod and the container stays running. // The device assignment should be kept and be stable across the kubelet/device plugin restart, as both the aforementioned components // orchestrate the device allocation: the actual consumer (container) is stable.