From 0aa6a5726a67edd4ffc0efd1af7179cf47e532ce Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Mon, 13 Mar 2023 10:24:11 +0530 Subject: [PATCH 1/7] node: device-plugins: e2e: Refactor parse log to return string and error Rather than only returning a string forcing us to log failure with `framework.Fail`, we return a string and error to handle error cases more conventionally. This enables us to use the `parseLog` function inside `Eventually` and `Consistently` blocks, or in general to delegate the error processing and enable better composability. Signed-off-by: Swati Sehgal Co-authored-by: Francesco Romani --- test/e2e_node/device_plugin_test.go | 34 ++++++++++++++++++----------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/test/e2e_node/device_plugin_test.go b/test/e2e_node/device_plugin_test.go index f8c4420b581..bb62241af09 100644 --- a/test/e2e_node/device_plugin_test.go +++ b/test/e2e_node/device_plugin_test.go @@ -161,7 +161,8 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { podRECMD := "devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep 60" pod1 := e2epod.NewPodClient(f).CreateSync(ctx, makeBusyboxPod(SampleDeviceResourceName, podRECMD)) deviceIDRE := "stub devices: (Dev-[0-9]+)" - devID1 := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) + devID1, err := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) + framework.ExpectNoError(err, "getting logs for pod %q", pod1.Name) gomega.Expect(devID1).To(gomega.Not(gomega.Equal(""))) v1alphaPodResources, err = getV1alpha1NodeDevices(ctx) @@ -221,16 +222,18 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { podRECMD := "devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep 60" pod1 := e2epod.NewPodClient(f).CreateSync(ctx, makeBusyboxPod(SampleDeviceResourceName, podRECMD)) deviceIDRE := "stub devices: (Dev-[0-9]+)" - devID1 := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) + devID1, err := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) + framework.ExpectNoError(err, "getting logs for pod %q", pod1.Name) gomega.Expect(devID1).To(gomega.Not(gomega.Equal(""))) - pod1, err := e2epod.NewPodClient(f).Get(ctx, pod1.Name, metav1.GetOptions{}) + pod1, err = e2epod.NewPodClient(f).Get(ctx, pod1.Name, metav1.GetOptions{}) framework.ExpectNoError(err) ensurePodContainerRestart(ctx, f, pod1.Name, pod1.Name) ginkgo.By("Confirming that device assignment persists even after container restart") - devIDAfterRestart := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) + devIDAfterRestart, err := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) + framework.ExpectNoError(err, "getting logs for pod %q", pod1.Name) framework.ExpectEqual(devIDAfterRestart, devID1) ginkgo.By("Restarting Kubelet") @@ -242,7 +245,8 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { ginkgo.By("Validating that assignment is kept") ensurePodContainerRestart(ctx, f, pod1.Name, pod1.Name) ginkgo.By("Confirming that after a kubelet restart, fake-device assignment is kept") - devIDRestart1 := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) + devIDRestart1, err := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) + framework.ExpectNoError(err, "getting logs for pod %q", pod1.Name) framework.ExpectEqual(devIDRestart1, devID1) }) @@ -250,10 +254,12 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { podRECMD := "devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep 60" pod1 := 
e2epod.NewPodClient(f).CreateSync(ctx, makeBusyboxPod(SampleDeviceResourceName, podRECMD)) deviceIDRE := "stub devices: (Dev-[0-9]+)" - devID1 := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) + devID1, err := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) + framework.ExpectNoError(err, "getting logs for pod %q", pod1.Name) + gomega.Expect(devID1).To(gomega.Not(gomega.Equal(""))) - pod1, err := e2epod.NewPodClient(f).Get(ctx, pod1.Name, metav1.GetOptions{}) + pod1, err = e2epod.NewPodClient(f).Get(ctx, pod1.Name, metav1.GetOptions{}) framework.ExpectNoError(err) ginkgo.By("Restarting Kubelet") @@ -275,7 +281,8 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { ginkgo.By("Confirming that after a kubelet and pod restart, fake-device assignment is kept") ensurePodContainerRestart(ctx, f, pod1.Name, pod1.Name) - devIDRestart1 := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) + devIDRestart1, err := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) + framework.ExpectNoError(err, "getting logs for pod %q", pod1.Name) framework.ExpectEqual(devIDRestart1, devID1) ginkgo.By("Waiting for resource to become available on the local node after re-registration") @@ -290,7 +297,8 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { pod2 := e2epod.NewPodClient(f).CreateSync(ctx, makeBusyboxPod(SampleDeviceResourceName, podRECMD)) ginkgo.By("Checking that pod got a different fake device") - devID2 := parseLog(ctx, f, pod2.Name, pod2.Name, deviceIDRE) + devID2, err := parseLog(ctx, f, pod2.Name, pod2.Name, deviceIDRE) + framework.ExpectNoError(err, "getting logs for pod %q", pod2.Name) gomega.Expect(devID1).To(gomega.Not(gomega.Equal(devID2))) }) @@ -342,18 +350,18 @@ func ensurePodContainerRestart(ctx context.Context, f *framework.Framework, podN } // parseLog returns the matching string for the specified regular expression parsed from the container logs. -func parseLog(ctx context.Context, f *framework.Framework, podName string, contName string, re string) string { +func parseLog(ctx context.Context, f *framework.Framework, podName string, contName string, re string) (string, error) { logs, err := e2epod.GetPodLogs(ctx, f.ClientSet, f.Namespace.Name, podName, contName) if err != nil { - framework.Failf("GetPodLogs for pod %q failed: %v", podName, err) + return "", err } framework.Logf("got pod logs: %v", logs) regex := regexp.MustCompile(re) matches := regex.FindStringSubmatch(logs) if len(matches) < 2 { - return "" + return "", fmt.Errorf("unexpected match in logs: %q", logs) } - return matches[1] + return matches[1], nil } From 5c4f397361fbf2d4639648242f968f474272c939 Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Mon, 13 Mar 2023 10:48:05 +0530 Subject: [PATCH 2/7] node: device-plugins: e2e: s/devLen/expectedSampleDevsAmount We rename to make the intent more explicit; We make it global to be able to reuse the value all across the module (e.g. to check the node readiness) later on. 
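Stepping back to the `parseLog` change from the first patch: because the helper now returns `(string, error)` instead of calling `framework.Failf` itself, callers can fold the log lookup into a Gomega polling block and let transient `GetPodLogs` failures be retried. A minimal sketch of that composition, assuming the surrounding test's ctx, f, pod1 and deviceIDRE are in scope and a Gomega version that accepts context-aware, multi-return poll functions (this snippet is illustrative and not part of the series):

    // Sketch only: keep polling the container log until the stub device ID shows up;
    // a non-nil error from parseLog fails just this attempt, and Eventually retries.
    gomega.Eventually(ctx, func(ctx context.Context) (string, error) {
        return parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE)
    }, 30*time.Second, framework.Poll).Should(gomega.MatchRegexp("Dev-[0-9]+"),
        "expected pod %q to report an assigned sample device in its logs", pod1.Name)

Returning the error also keeps the decision about whether a missing match is fatal with the caller, which is what lets the same helper serve both hard assertions and polling checks.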
Signed-off-by: Swati Sehgal Co-authored-by: Francesco Romani --- test/e2e_node/device_plugin_test.go | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/test/e2e_node/device_plugin_test.go b/test/e2e_node/device_plugin_test.go index bb62241af09..16a2c8a3eda 100644 --- a/test/e2e_node/device_plugin_test.go +++ b/test/e2e_node/device_plugin_test.go @@ -65,12 +65,15 @@ func readDaemonSetV1OrDie(objBytes []byte) *appsv1.DaemonSet { return requiredObj.(*appsv1.DaemonSet) } +const ( + // TODO(vikasc): Instead of hard-coding number of devices, provide number of devices in the sample-device-plugin using configmap + // and then use the same here + expectedSampleDevsAmount int64 = 2 +) + func testDevicePlugin(f *framework.Framework, pluginSockDir string) { pluginSockDir = filepath.Join(pluginSockDir) + "/" ginkgo.Context("DevicePlugin [Serial] [Disruptive]", func() { - // TODO(vikasc): Instead of hard-coding number of devices, provide number of devices in the sample-device-plugin using configmap - // and then use the same here - devsLen := int64(2) var devicePluginPod, dptemplate *v1.Pod var v1alphaPodResources *kubeletpodresourcesv1alpha1.ListPodResourcesResponse var v1PodResources *kubeletpodresourcesv1.ListPodResourcesResponse @@ -121,12 +124,12 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { }, 5*time.Minute, framework.Poll).Should(gomega.BeTrue()) framework.Logf("Successfully created device plugin pod") - ginkgo.By("Waiting for the resource exported by the sample device plugin to become available on the local node") + ginkgo.By(fmt.Sprintf("Waiting for the resource exported by the sample device plugin to become available on the local node (instances: %d)", expectedSampleDevsAmount)) gomega.Eventually(ctx, func(ctx context.Context) bool { node, ready := getLocalTestNode(ctx, f) return ready && - CountSampleDeviceCapacity(node) == devsLen && - CountSampleDeviceAllocatable(node) == devsLen + CountSampleDeviceCapacity(node) == expectedSampleDevsAmount && + CountSampleDeviceAllocatable(node) == expectedSampleDevsAmount }, 30*time.Second, framework.Poll).Should(gomega.BeTrue()) }) @@ -289,8 +292,8 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { gomega.Eventually(ctx, func() bool { node, ready := getLocalTestNode(ctx, f) return ready && - CountSampleDeviceCapacity(node) == devsLen && - CountSampleDeviceAllocatable(node) == devsLen + CountSampleDeviceCapacity(node) == expectedSampleDevsAmount && + CountSampleDeviceAllocatable(node) == expectedSampleDevsAmount }, 30*time.Second, framework.Poll).Should(gomega.BeTrue()) ginkgo.By("Creating another pod") From 5ab4ba62052e3e83d34a365581583bd7282424e6 Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Mon, 13 Mar 2023 14:02:32 +0530 Subject: [PATCH 3/7] node: device-plugin: e2e: Annotate device check with error message With this change the error messages are more helpful and easier to troubleshoot in case of test failures.
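For context on the pattern this patch applies: Gomega matchers take optional trailing description arguments (a plain string, or a format string plus values) that are printed only when the assertion fails, which is what makes the failures self-explanatory. A hedged sketch of the same idea, not the exact assertion added here (BeEmpty and the format arguments are illustrative):

    // Sketch: everything after the matcher becomes the failure annotation, so a
    // failing run immediately says which pod misbehaved and what was expected.
    gomega.Expect(devID1).ToNot(gomega.BeEmpty(),
        "pod %q requested a sample device but started successfully without one", pod1.Name)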
Signed-off-by: Swati Sehgal --- test/e2e_node/device_plugin_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/e2e_node/device_plugin_test.go b/test/e2e_node/device_plugin_test.go index 16a2c8a3eda..b8e10c2f3fe 100644 --- a/test/e2e_node/device_plugin_test.go +++ b/test/e2e_node/device_plugin_test.go @@ -166,7 +166,7 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { deviceIDRE := "stub devices: (Dev-[0-9]+)" devID1, err := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) framework.ExpectNoError(err, "getting logs for pod %q", pod1.Name) - gomega.Expect(devID1).To(gomega.Not(gomega.Equal(""))) + gomega.Expect(devID1).To(gomega.Not(gomega.Equal("")), "pod1 requested a device but started successfully without") v1alphaPodResources, err = getV1alpha1NodeDevices(ctx) framework.ExpectNoError(err) @@ -227,7 +227,7 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { deviceIDRE := "stub devices: (Dev-[0-9]+)" devID1, err := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) framework.ExpectNoError(err, "getting logs for pod %q", pod1.Name) - gomega.Expect(devID1).To(gomega.Not(gomega.Equal(""))) + gomega.Expect(devID1).To(gomega.Not(gomega.Equal("")), "pod1 requested a device but started successfully without") pod1, err = e2epod.NewPodClient(f).Get(ctx, pod1.Name, metav1.GetOptions{}) framework.ExpectNoError(err) @@ -260,7 +260,7 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { devID1, err := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) framework.ExpectNoError(err, "getting logs for pod %q", pod1.Name) - gomega.Expect(devID1).To(gomega.Not(gomega.Equal(""))) + gomega.Expect(devID1).To(gomega.Not(gomega.Equal("")), "pod1 requested a device but started successfully without") pod1, err = e2epod.NewPodClient(f).Get(ctx, pod1.Name, metav1.GetOptions{}) framework.ExpectNoError(err) @@ -303,7 +303,7 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { devID2, err := parseLog(ctx, f, pod2.Name, pod2.Name, deviceIDRE) framework.ExpectNoError(err, "getting logs for pod %q", pod2.Name) - gomega.Expect(devID1).To(gomega.Not(gomega.Equal(devID2))) + gomega.Expect(devID1).To(gomega.Not(gomega.Equal(devID2)), "pod2 requested a device but started successfully without") }) }) } From fd459beeff66dd6933646cfefe0b03a6256b4b0b Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Thu, 9 Mar 2023 18:20:46 +0000 Subject: [PATCH 4/7] node: device-plugin: e2e: Isolate test to pod restart scenario Rather than testing both pod restart and kubelet restart, we change the test to handle just the pod restart scenario. We clarify the test purpose and add an extra check to tighten the test. We will add additional tests to cover kubelet restart scenarios in subsequent commits.
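The pod-restart scenario relies on the container entry point exiting after a short sleep, so the kubelet restarts the container in place while the pod object itself is never recreated. The series uses the existing ensurePodContainerRestart helper for this; the snippet below is only an illustration of how such a restart can be observed, assuming the pod's restartPolicy permits restarts and the test's ctx, f and pod1 are in scope:

    // Sketch: wait until the first container's RestartCount increases, i.e. the short
    // sleep entry point has exited at least once and the kubelet restarted the container.
    gomega.Eventually(ctx, func(ctx context.Context) (int32, error) {
        pod, err := e2epod.NewPodClient(f).Get(ctx, pod1.Name, metav1.GetOptions{})
        if err != nil {
            return 0, err
        }
        if len(pod.Status.ContainerStatuses) == 0 {
            return 0, nil
        }
        return pod.Status.ContainerStatuses[0].RestartCount, nil
    }, 3*time.Minute, framework.Poll).Should(gomega.BeNumerically(">", 0),
        "container in pod %q never restarted", pod1.Name)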
Signed-off-by: Swati Sehgal Signed-off-by: Francesco Romani --- test/e2e_node/device_plugin_test.go | 89 ++++++++++++++++++++++++----- 1 file changed, 74 insertions(+), 15 deletions(-) diff --git a/test/e2e_node/device_plugin_test.go b/test/e2e_node/device_plugin_test.go index b8e10c2f3fe..d582d014354 100644 --- a/test/e2e_node/device_plugin_test.go +++ b/test/e2e_node/device_plugin_test.go @@ -21,6 +21,7 @@ import ( "fmt" "path/filepath" "regexp" + "strings" "time" "github.com/onsi/ginkgo/v2" @@ -30,6 +31,7 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/serializer" + "k8s.io/apimachinery/pkg/util/sets" kubeletdevicepluginv1beta1 "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" admissionapi "k8s.io/pod-security-admission/api" @@ -78,6 +80,7 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { var v1alphaPodResources *kubeletpodresourcesv1alpha1.ListPodResourcesResponse var v1PodResources *kubeletpodresourcesv1.ListPodResourcesResponse var err error + ginkgo.BeforeEach(func(ctx context.Context) { ginkgo.By("Wait for node to be ready") gomega.Eventually(ctx, func(ctx context.Context) bool { @@ -221,7 +224,10 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { framework.ExpectEqual(len(v1ResourcesForOurPod.Containers[0].Devices[0].DeviceIds), 1) }) - ginkgo.It("Keeps device plugin assignments across pod and kubelet restarts", func(ctx context.Context) { + // simulate container restart, while all other involved components (kubelet, device plugin) stay stable. To do so, in the container + // entry point we sleep for a limited and short period of time. The device assignment should be kept and be stable across the container + // restarts. For the sake of brevity we however check just the fist restart. 
+ ginkgo.It("Keeps device plugin assignments across pod restarts (no kubelet restart, device plugin re-registration)", func(ctx context.Context) { podRECMD := "devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep 60" pod1 := e2epod.NewPodClient(f).CreateSync(ctx, makeBusyboxPod(SampleDeviceResourceName, podRECMD)) deviceIDRE := "stub devices: (Dev-[0-9]+)" @@ -232,25 +238,45 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { pod1, err = e2epod.NewPodClient(f).Get(ctx, pod1.Name, metav1.GetOptions{}) framework.ExpectNoError(err) + ginkgo.By("Waiting for container to restart") ensurePodContainerRestart(ctx, f, pod1.Name, pod1.Name) - ginkgo.By("Confirming that device assignment persists even after container restart") - devIDAfterRestart, err := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) - framework.ExpectNoError(err, "getting logs for pod %q", pod1.Name) - framework.ExpectEqual(devIDAfterRestart, devID1) - - ginkgo.By("Restarting Kubelet") - restartKubelet(true) - - ginkgo.By("Wait for node to be ready again") - e2enode.WaitForAllNodesSchedulable(ctx, f.ClientSet, 5*time.Minute) - - ginkgo.By("Validating that assignment is kept") - ensurePodContainerRestart(ctx, f, pod1.Name, pod1.Name) - ginkgo.By("Confirming that after a kubelet restart, fake-device assignment is kept") + // check from the device assignment is preserved and stable from perspective of the container + ginkgo.By("Confirming that after a container restart, fake-device assignment is kept") devIDRestart1, err := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) framework.ExpectNoError(err, "getting logs for pod %q", pod1.Name) framework.ExpectEqual(devIDRestart1, devID1) + + // crosscheck from the device assignment is preserved and stable from perspective of the kubelet. + // needs to match the container perspective. 
+ ginkgo.By("Verifying the device assignment after container restart using podresources API") + v1PodResources, err = getV1NodeDevices(ctx) + if err != nil { + framework.ExpectNoError(err, "getting pod resources assignment after pod restart") + } + err = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1}) + framework.ExpectNoError(err, "inconsistent device assignment after pod restart") + + ginkgo.By("Creating another pod") + pod2 := e2epod.NewPodClient(f).CreateSync(ctx, makeBusyboxPod(SampleDeviceResourceName, podRECMD)) + err = e2epod.WaitTimeoutForPodRunningInNamespace(ctx, f.ClientSet, pod2.Name, f.Namespace.Name, 1*time.Minute) + framework.ExpectNoError(err) + + ginkgo.By("Checking that pod got a fake device") + devID2, err := parseLog(ctx, f, pod2.Name, pod2.Name, deviceIDRE) + framework.ExpectNoError(err, "getting logs for pod %q", pod2.Name) + + gomega.Expect(devID2).To(gomega.Not(gomega.Equal("")), "pod2 requested a device but started successfully without") + + ginkgo.By("Verifying the device assignment after extra container start using podresources API") + v1PodResources, err = getV1NodeDevices(ctx) + if err != nil { + framework.ExpectNoError(err, "getting pod resources assignment after pod restart") + } + err = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1}) + framework.ExpectNoError(err, "inconsistent device assignment after extra container restart - pod1") + err = checkPodResourcesAssignment(v1PodResources, pod2.Namespace, pod2.Name, pod2.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID2}) + framework.ExpectNoError(err, "inconsistent device assignment after extra container restart - pod2") }) ginkgo.It("Keeps device plugin assignments after the device plugin has been re-registered", func(ctx context.Context) { @@ -368,3 +394,36 @@ func parseLog(ctx context.Context, f *framework.Framework, podName string, contN return matches[1], nil } + +func checkPodResourcesAssignment(v1PodRes *kubeletpodresourcesv1.ListPodResourcesResponse, podNamespace, podName, containerName, resourceName string, devs []string) error { + for _, podRes := range v1PodRes.PodResources { + if podRes.Namespace != podNamespace || podRes.Name != podName { + continue + } + for _, contRes := range podRes.Containers { + if contRes.Name != containerName { + continue + } + return matchContainerDevices(podNamespace+"/"+podName+"/"+containerName, contRes.Devices, resourceName, devs) + } + } + return fmt.Errorf("no resources found for %s/%s/%s", podNamespace, podName, containerName) +} + +func matchContainerDevices(ident string, contDevs []*kubeletpodresourcesv1.ContainerDevices, resourceName string, devs []string) error { + expected := sets.New[string](devs...) + assigned := sets.New[string]() + for _, contDev := range contDevs { + if contDev.ResourceName != resourceName { + continue + } + assigned = assigned.Insert(contDev.DeviceIds...) 
+ } + expectedStr := strings.Join(expected.UnsortedList(), ",") + assignedStr := strings.Join(assigned.UnsortedList(), ",") + framework.Logf("%s: devices expected %q assigned %q", ident, expectedStr, assignedStr) + if !assigned.Equal(expected) { + return fmt.Errorf("device allocation mismatch for %s expected %s assigned %s", ident, expectedStr, assignedStr) + } + return nil +} From 4a0f7c791fea00d08232a5d1304cbbf38d693704 Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Mon, 13 Mar 2023 12:00:35 +0530 Subject: [PATCH 5/7] node: device-plugin: e2e: Update test description to make it explicit Explicitly state that the test involves kubelet restart and device plugin re-registration (no pod restart) We remove the part of the code where we wait for the pod to restart as this test case should no longer involve pod restart. In addition to that, we use `waitForNodeReady` instead of `WaitForAllNodesSchedulable` for ensuring that the node is ready for pods to be scheduled on it. Signed-off-by: Swati Sehgal Co-authored-by: Francesco Romani --- test/e2e_node/device_plugin_test.go | 39 ++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/test/e2e_node/device_plugin_test.go b/test/e2e_node/device_plugin_test.go index d582d014354..37fcf03eb03 100644 --- a/test/e2e_node/device_plugin_test.go +++ b/test/e2e_node/device_plugin_test.go @@ -279,8 +279,11 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { framework.ExpectNoError(err, "inconsistent device assignment after extra container restart - pod2") }) - ginkgo.It("Keeps device plugin assignments after the device plugin has been re-registered", func(ctx context.Context) { - podRECMD := "devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep 60" + // simulate kubelet restart *and* device plugin re-registration, while the pod and the container stays running. + // The device assignment should be kept and be stable across the kubelet/device plugin restart, as both the aforementioned components + // archestrate the device allocation: the actual consumer (container) is stable. 
+ ginkgo.It("Keeps device plugin assignments after kubelet restart and device plugin has been re-registered (no pod restart)", func(ctx context.Context) { + podRECMD := "devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep 24h" // the pod has to run "forever" in the timescale of this test pod1 := e2epod.NewPodClient(f).CreateSync(ctx, makeBusyboxPod(SampleDeviceResourceName, podRECMD)) deviceIDRE := "stub devices: (Dev-[0-9]+)" devID1, err := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) @@ -297,7 +300,7 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { ginkgo.By("Wait for node to be ready again") e2enode.WaitForAllNodesSchedulable(ctx, f.ClientSet, 5*time.Minute) - ginkgo.By("Re-Register resources and delete the plugin pod") + ginkgo.By("Re-Register resources by deleting the plugin pod") gp := int64(0) deleteOptions := metav1.DeleteOptions{ GracePeriodSeconds: &gp, @@ -308,12 +311,6 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { ginkgo.By("Recreating the plugin pod") devicePluginPod = e2epod.NewPodClient(f).CreateSync(ctx, dptemplate) - ginkgo.By("Confirming that after a kubelet and pod restart, fake-device assignment is kept") - ensurePodContainerRestart(ctx, f, pod1.Name, pod1.Name) - devIDRestart1, err := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) - framework.ExpectNoError(err, "getting logs for pod %q", pod1.Name) - framework.ExpectEqual(devIDRestart1, devID1) - ginkgo.By("Waiting for resource to become available on the local node after re-registration") gomega.Eventually(ctx, func() bool { node, ready := getLocalTestNode(ctx, f) @@ -322,6 +319,19 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { CountSampleDeviceAllocatable(node) == expectedSampleDevsAmount }, 30*time.Second, framework.Poll).Should(gomega.BeTrue()) + // crosscheck from the device assignment is preserved and stable from perspective of the kubelet. + // note we don't check again the logs of the container: the check is done at startup, the container + // never restarted (runs "forever" from this test timescale perspective) hence re-doing this check + // is useless. 
+ ginkgo.By("Verifying the device assignment after kubelet and device plugin restart using podresources API") + gomega.Eventually(ctx, func() error { + v1PodResources, err = getV1NodeDevices(ctx) + return err + }, 30*time.Second, framework.Poll).ShouldNot(gomega.HaveOccurred(), "cannot fetch the compute resource assignment after kubelet and device plugin restart") + + err = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1}) + framework.ExpectNoError(err, "inconsistent device assignment after pod restart") + ginkgo.By("Creating another pod") pod2 := e2epod.NewPodClient(f).CreateSync(ctx, makeBusyboxPod(SampleDeviceResourceName, podRECMD)) @@ -330,6 +340,17 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { framework.ExpectNoError(err, "getting logs for pod %q", pod2.Name) gomega.Expect(devID1).To(gomega.Not(gomega.Equal(devID2)), "pod2 requested a device but started successfully without") + + ginkgo.By("Verifying the device assignment after kubelet restart and device plugin re-registration using podresources API") + // note we don't use eventually: the kubelet is supposed to be running and stable by now, so the call should just succeed + v1PodResources, err = getV1NodeDevices(ctx) + if err != nil { + framework.ExpectNoError(err, "getting pod resources assignment after pod restart") + } + err = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1}) + framework.ExpectNoError(err, "inconsistent device assignment after extra container restart - pod1") + err = checkPodResourcesAssignment(v1PodResources, pod2.Namespace, pod2.Name, pod2.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID2}) + framework.ExpectNoError(err, "inconsistent device assignment after extra container restart - pod2") }) }) } From 09100804722b630efae06e813ddc5e25c3c3275c Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Mon, 13 Mar 2023 15:12:30 +0530 Subject: [PATCH 6/7] node: device-plugin: e2e: Provide sleep intervals via constants Based on whether the test case requires pod restart or not, the sleep interval needs to be updated and we define constants to represent the two sleep intervals that can be used in the corresponding test cases. 
Signed-off-by: Swati Sehgal Co-authored-by: Francesco Romani --- test/e2e_node/device_plugin_test.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/test/e2e_node/device_plugin_test.go b/test/e2e_node/device_plugin_test.go index 37fcf03eb03..1ce40664186 100644 --- a/test/e2e_node/device_plugin_test.go +++ b/test/e2e_node/device_plugin_test.go @@ -71,6 +71,12 @@ const ( // TODO(vikasc): Instead of hard-coding number of devices, provide number of devices in the sample-device-plugin using configmap // and then use the same here expectedSampleDevsAmount int64 = 2 + + // This is the sleep interval specified in the command executed in the pod to ensure container is running "forever" in the test timescale + sleepIntervalForever string = "24h" + + // This is the sleep interval specified in the command executed in the pod so that container is restarted within the expected test run time + sleepIntervalWithRestart string = "60s" ) func testDevicePlugin(f *framework.Framework, pluginSockDir string) { @@ -164,7 +170,7 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { }) ginkgo.It("Can schedule a pod that requires a device", func(ctx context.Context) { - podRECMD := "devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep 60" + podRECMD := fmt.Sprintf("devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep %s", sleepIntervalWithRestart) pod1 := e2epod.NewPodClient(f).CreateSync(ctx, makeBusyboxPod(SampleDeviceResourceName, podRECMD)) deviceIDRE := "stub devices: (Dev-[0-9]+)" devID1, err := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) @@ -228,7 +234,7 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { // entry point we sleep for a limited and short period of time. The device assignment should be kept and be stable across the container // restarts. For the sake of brevity we however check just the fist restart. ginkgo.It("Keeps device plugin assignments across pod restarts (no kubelet restart, device plugin re-registration)", func(ctx context.Context) { - podRECMD := "devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep 60" + podRECMD := fmt.Sprintf("devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep %s", sleepIntervalWithRestart) pod1 := e2epod.NewPodClient(f).CreateSync(ctx, makeBusyboxPod(SampleDeviceResourceName, podRECMD)) deviceIDRE := "stub devices: (Dev-[0-9]+)" devID1, err := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) @@ -283,7 +289,7 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { // The device assignment should be kept and be stable across the kubelet/device plugin restart, as both the aforementioned components // archestrate the device allocation: the actual consumer (container) is stable. 
ginkgo.It("Keeps device plugin assignments after kubelet restart and device plugin has been re-registered (no pod restart)", func(ctx context.Context) { - podRECMD := "devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep 24h" // the pod has to run "forever" in the timescale of this test + podRECMD := fmt.Sprintf("devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep %s", sleepIntervalForever) // the pod has to run "forever" in the timescale of this test pod1 := e2epod.NewPodClient(f).CreateSync(ctx, makeBusyboxPod(SampleDeviceResourceName, podRECMD)) deviceIDRE := "stub devices: (Dev-[0-9]+)" devID1, err := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) From 0a582431593a38ab4778c0df3ac015a93d6a0e6e Mon Sep 17 00:00:00 2001 From: Swati Sehgal Date: Mon, 13 Mar 2023 12:20:28 +0530 Subject: [PATCH 7/7] node: device-plugin: e2e: Add test case for kubelet restart Capture explicitly a test case pertaining to kubelet restart but with no pod restart and device plugin re-registration. Signed-off-by: Swati Sehgal --- test/e2e_node/device_plugin_test.go | 47 ++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/test/e2e_node/device_plugin_test.go b/test/e2e_node/device_plugin_test.go index 1ce40664186..5ac7b2c1192 100644 --- a/test/e2e_node/device_plugin_test.go +++ b/test/e2e_node/device_plugin_test.go @@ -285,9 +285,54 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) { framework.ExpectNoError(err, "inconsistent device assignment after extra container restart - pod2") }) + // simulate kubelet restart, *but not* device plugin re-registration, while the pod and the container stays running. + // The device assignment should be kept and be stable across the kubelet restart, because it's the kubelet which performs the device allocation, + // and both the device plugin and the actual consumer (container) are stable. + ginkgo.It("Keeps device plugin assignments across kubelet restarts (no pod restart, no device plugin re-registration)", func(ctx context.Context) { + podRECMD := fmt.Sprintf("devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep %s", sleepIntervalForever) + pod1 := e2epod.NewPodClient(f).CreateSync(ctx, makeBusyboxPod(SampleDeviceResourceName, podRECMD)) + deviceIDRE := "stub devices: (Dev-[0-9]+)" + devID1, err := parseLog(ctx, f, pod1.Name, pod1.Name, deviceIDRE) + framework.ExpectNoError(err, "getting logs for pod %q", pod1.Name) + gomega.Expect(devID1).To(gomega.Not(gomega.Equal("")), "pod1 requested a device but started successfully without") + + pod1, err = e2epod.NewPodClient(f).Get(ctx, pod1.Name, metav1.GetOptions{}) + framework.ExpectNoError(err) + + ginkgo.By("Restarting Kubelet") + restartKubelet(true) + + ginkgo.By("Wait for node to be ready again") + e2enode.WaitForAllNodesSchedulable(ctx, f.ClientSet, 5*time.Minute) + + ginkgo.By("Waiting for resource to become available on the local node after restart") + gomega.Eventually(ctx, func() bool { + node, ready := getLocalTestNode(ctx, f) + return ready && + CountSampleDeviceCapacity(node) == expectedSampleDevsAmount && + CountSampleDeviceAllocatable(node) == expectedSampleDevsAmount + }, 30*time.Second, framework.Poll).Should(gomega.BeTrue()) + + err = e2epod.WaitTimeoutForPodRunningInNamespace(ctx, f.ClientSet, pod1.Name, f.Namespace.Name, 1*time.Minute) + framework.ExpectNoError(err) + + // crosscheck from the device assignment is preserved and stable from perspective of the kubelet. 
+ // note we don't check again the logs of the container: the check is done at startup, the container + // never restarted (runs "forever" from this test timescale perspective) hence re-doing this check + // is useless. + ginkgo.By("Verifying the device assignment after kubelet restart using podresources API") + gomega.Eventually(ctx, func() error { + v1PodResources, err = getV1NodeDevices(ctx) + return err + }, 30*time.Second, framework.Poll).ShouldNot(gomega.HaveOccurred(), "cannot fetch the compute resource assignment after kubelet restart") + + err = checkPodResourcesAssignment(v1PodResources, pod1.Namespace, pod1.Name, pod1.Spec.Containers[0].Name, SampleDeviceResourceName, []string{devID1}) + framework.ExpectNoError(err, "inconsistent device assignment after pod restart") + }) + // simulate kubelet restart *and* device plugin re-registration, while the pod and the container stays running. // The device assignment should be kept and be stable across the kubelet/device plugin restart, as both the aforementioned components - // archestrate the device allocation: the actual consumer (container) is stable. + // orchestrate the device allocation: the actual consumer (container) is stable. ginkgo.It("Keeps device plugin assignments after kubelet restart and device plugin has been re-registered (no pod restart)", func(ctx context.Context) { podRECMD := fmt.Sprintf("devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep %s", sleepIntervalForever) // the pod has to run "forever" in the timescale of this test pod1 := e2epod.NewPodClient(f).CreateSync(ctx, makeBusyboxPod(SampleDeviceResourceName, podRECMD))