diff --git a/test/e2e_node/device_plugin_test.go b/test/e2e_node/device_plugin_test.go
index c96180034fc..56ba767162b 100644
--- a/test/e2e_node/device_plugin_test.go
+++ b/test/e2e_node/device_plugin_test.go
@@ -45,8 +45,6 @@ import (
 const (
 	// sampleResourceName is the name of the example resource which is used in the e2e test
 	sampleResourceName = "example.com/resource"
-	// sampleDevicePluginDSYAML is the path of the daemonset template of the sample device plugin. // TODO: Parametrize it by making it a feature in TestFramework.
-	sampleDevicePluginDSYAML = "test/e2e/testing-manifests/sample-device-plugin.yaml"
 	// sampleDevicePluginName is the name of the device plugin pod
 	sampleDevicePluginName = "sample-device-plugin"
 
@@ -79,7 +77,7 @@ func numberOfSampleResources(node *v1.Node) int64 {
 
 // getSampleDevicePluginPod returns the Device Plugin pod for sample resources in e2e tests.
 func getSampleDevicePluginPod() *v1.Pod {
-	data, err := e2etestfiles.Read(sampleDevicePluginDSYAML)
+	data, err := e2etestfiles.Read(SampleDevicePluginDSYAML)
 	if err != nil {
 		framework.Fail(err.Error())
 	}
@@ -109,21 +107,30 @@ func readDaemonSetV1OrDie(objBytes []byte) *appsv1.DaemonSet {
 
 func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
 	pluginSockDir = filepath.Join(pluginSockDir) + "/"
-	ginkgo.Context("DevicePlugin", func() {
-		ginkgo.It("[Flaky] Verifies the Kubelet device plugin functionality.", func() {
-			ginkgo.By("Wait for node is ready to start with")
-			e2enode.WaitForNodeToBeReady(f.ClientSet, framework.TestContext.NodeName, 5*time.Minute)
+	ginkgo.Context("DevicePlugin [Serial] [Disruptive]", func() {
+		// TODO(vikasc): Instead of hard-coding number of devices, provide number of devices in the sample-device-plugin using configmap
+		// and then use the same here
+		devsLen := int64(2)
+		var devicePluginPod, dptemplate *v1.Pod
+
+		ginkgo.BeforeEach(func() {
+			ginkgo.By("Wait for node to be ready")
+			gomega.Eventually(func() bool {
+				nodes, err := e2enode.TotalReady(f.ClientSet)
+				framework.ExpectNoError(err)
+				return nodes == 1
+			}, time.Minute, time.Second).Should(gomega.BeTrue())
+
+			ginkgo.By("Scheduling a sample device plugin pod")
 			dp := getSampleDevicePluginPod()
+			dp.Namespace = ""
 			for i := range dp.Spec.Containers[0].Env {
 				if dp.Spec.Containers[0].Env[i].Name == envVarNamePluginSockDir {
 					dp.Spec.Containers[0].Env[i].Value = pluginSockDir
 				}
 			}
-			framework.Logf("env %v", dp.Spec.Containers[0].Env)
-			dp.Spec.NodeName = framework.TestContext.NodeName
-			ginkgo.By("Create sample device plugin pod")
-			devicePluginPod, err := f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Create(context.TODO(), dp, metav1.CreateOptions{})
-			framework.ExpectNoError(err)
+			dptemplate = dp
+			devicePluginPod = f.PodClient().CreateSync(dp)
 
 			ginkgo.By("Waiting for devices to become available on the local node")
 			gomega.Eventually(func() bool {
@@ -132,18 +139,39 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
 			framework.Logf("Successfully created device plugin pod")
 
 			ginkgo.By("Waiting for the resource exported by the sample device plugin to become available on the local node")
-			// TODO(vikasc): Instead of hard-coding number of devices, provide number of devices in the sample-device-plugin using configmap
-			// and then use the same here
-			devsLen := int64(2)
 			gomega.Eventually(func() bool {
-				node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
-				framework.ExpectNoError(err)
+				node := getLocalNode(f)
 				return numberOfDevicesCapacity(node, resourceName) == devsLen &&
 					numberOfDevicesAllocatable(node, resourceName) == devsLen
 			}, 30*time.Second, framework.Poll).Should(gomega.BeTrue())
+		})
 
-			ginkgo.By("Creating one pod on node with at least one fake-device")
-			podRECMD := "devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs"
+		ginkgo.AfterEach(func() {
+			ginkgo.By("Deleting the device plugin pod")
+			f.PodClient().DeleteSync(devicePluginPod.Name, metav1.DeleteOptions{}, time.Minute)
+
+			ginkgo.By("Deleting any Pods created by the test")
+			l, err := f.PodClient().List(context.TODO(), metav1.ListOptions{})
+			framework.ExpectNoError(err)
+			for _, p := range l.Items {
+				if p.Namespace != f.Namespace.Name {
+					continue
+				}
+
+				framework.Logf("Deleting pod: %s", p.Name)
+				f.PodClient().DeleteSync(p.Name, metav1.DeleteOptions{}, 2*time.Minute)
+			}
+
+			restartKubelet(true)
+
+			ginkgo.By("Waiting for devices to become unavailable on the local node")
+			gomega.Eventually(func() bool {
+				return numberOfSampleResources(getLocalNode(f)) <= 0
+			}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())
+		})
+
+		ginkgo.It("Can schedule a pod that requires a device", func() {
+			podRECMD := "devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep 60"
 			pod1 := f.PodClient().CreateSync(makeBusyboxPod(resourceName, podRECMD))
 			deviceIDRE := "stub devices: (Dev-[0-9]+)"
 			devID1 := parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
@@ -151,11 +179,9 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
 
 			v1alphaPodResources, err := getV1alpha1NodeDevices()
 			framework.ExpectNoError(err)
-			framework.Logf("v1alpha pod resources %v", v1alphaPodResources)
 
 			v1PodResources, err := getV1NodeDevices()
 			framework.ExpectNoError(err)
-			framework.Logf("v1 pod resources %v", v1PodResources)
 
 			framework.ExpectEqual(len(v1alphaPodResources.PodResources), 2)
 			framework.ExpectEqual(len(v1PodResources.PodResources), 2)
@@ -166,7 +192,6 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
 					v1alphaResourcesForOurPod = res
 				}
 			}
-			framework.Logf("v1alphaResourcesForOurPod %v", v1alphaResourcesForOurPod)
 
 			var v1ResourcesForOurPod *kubeletpodresourcesv1.PodResources
 			for _, res := range v1PodResources.GetPodResources() {
@@ -174,7 +199,6 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
 					v1ResourcesForOurPod = res
 				}
 			}
-			framework.Logf("v1ResourcesForOurPod %v", v1ResourcesForOurPod)
 
 			gomega.Expect(v1alphaResourcesForOurPod).NotTo(gomega.BeNil())
 			gomega.Expect(v1ResourcesForOurPod).NotTo(gomega.BeNil())
@@ -199,8 +223,16 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
 
 			framework.ExpectEqual(len(v1alphaResourcesForOurPod.Containers[0].Devices[0].DeviceIds), 1)
 			framework.ExpectEqual(len(v1ResourcesForOurPod.Containers[0].Devices[0].DeviceIds), 1)
+		})
 
-			pod1, err = f.PodClient().Get(context.TODO(), pod1.Name, metav1.GetOptions{})
+		ginkgo.It("Keeps device plugin assignments across pod and kubelet restarts", func() {
+			podRECMD := "devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep 60"
+			pod1 := f.PodClient().CreateSync(makeBusyboxPod(resourceName, podRECMD))
+			deviceIDRE := "stub devices: (Dev-[0-9]+)"
+			devID1 := parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
+			gomega.Expect(devID1).To(gomega.Not(gomega.Equal("")))
+
+			pod1, err := f.PodClient().Get(context.TODO(), pod1.Name, metav1.GetOptions{})
 			framework.ExpectNoError(err)
 
 			ensurePodContainerRestart(f, pod1.Name, pod1.Name)
@@ -209,49 +241,54 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
 			devIDAfterRestart := parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
 			framework.ExpectEqual(devIDAfterRestart, devID1)
 
-			restartTime := time.Now()
 			ginkgo.By("Restarting Kubelet")
 			restartKubelet(true)
 
-			// We need to wait for node to be ready before re-registering stub device plugin.
-			// Otherwise, Kubelet DeviceManager may remove the re-registered sockets after it starts.
-			ginkgo.By("Wait for node is ready")
-			gomega.Eventually(func() bool {
-				node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
-				framework.ExpectNoError(err)
-				for _, cond := range node.Status.Conditions {
-					if cond.Type == v1.NodeReady && cond.Status == v1.ConditionTrue && cond.LastHeartbeatTime.After(restartTime) {
-						return true
-					}
-				}
-				return false
-			}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())
+			ginkgo.By("Wait for node to be ready again")
+			framework.WaitForAllNodesSchedulable(f.ClientSet, 5*time.Minute)
 
-			ginkgo.By("Re-Register resources and deleting the pods and waiting for container removal")
-			getOptions := metav1.GetOptions{}
+			ginkgo.By("Validating that assignment is kept")
+			ensurePodContainerRestart(f, pod1.Name, pod1.Name)
+			ginkgo.By("Confirming that after a kubelet restart, fake-device assignment is kept")
+			devIDRestart1 := parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
+			framework.ExpectEqual(devIDRestart1, devID1)
+		})
+
+		ginkgo.It("Keeps device plugin assignments after the device plugin has been re-registered", func() {
+			podRECMD := "devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep 60"
+			pod1 := f.PodClient().CreateSync(makeBusyboxPod(resourceName, podRECMD))
+			deviceIDRE := "stub devices: (Dev-[0-9]+)"
+			devID1 := parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
+			gomega.Expect(devID1).To(gomega.Not(gomega.Equal("")))
+
+			pod1, err := f.PodClient().Get(context.TODO(), pod1.Name, metav1.GetOptions{})
+			framework.ExpectNoError(err)
+
+			ginkgo.By("Restarting Kubelet")
+			restartKubelet(true)
+
+			ginkgo.By("Wait for node to be ready again")
+			framework.WaitForAllNodesSchedulable(f.ClientSet, 5*time.Minute)
+
+			ginkgo.By("Re-Register resources and delete the plugin pod")
 			gp := int64(0)
 			deleteOptions := metav1.DeleteOptions{
 				GracePeriodSeconds: &gp,
 			}
-			err = f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Delete(context.TODO(), dp.Name, deleteOptions)
-			framework.ExpectNoError(err)
+			f.PodClient().DeleteSync(devicePluginPod.Name, deleteOptions, time.Minute)
 			waitForContainerRemoval(devicePluginPod.Spec.Containers[0].Name, devicePluginPod.Name, devicePluginPod.Namespace)
-			_, err = f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Get(context.TODO(), dp.Name, getOptions)
-			framework.Logf("Trying to get dp pod after deletion. err must be non-nil. err: %v", err)
-			framework.ExpectError(err)
 
-			devicePluginPod, err = f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Create(context.TODO(), dp, metav1.CreateOptions{})
-			framework.ExpectNoError(err)
+			ginkgo.By("Recreating the plugin pod")
+			devicePluginPod = f.PodClient().CreateSync(dptemplate)
 
+			ginkgo.By("Confirming that after a kubelet and pod restart, fake-device assignment is kept")
 			ensurePodContainerRestart(f, pod1.Name, pod1.Name)
-			ginkgo.By("Confirming that after a kubelet restart, fake-device assignment is kept")
 			devIDRestart1 := parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
 			framework.ExpectEqual(devIDRestart1, devID1)
 
 			ginkgo.By("Waiting for resource to become available on the local node after re-registration")
 			gomega.Eventually(func() bool {
-				node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
-				framework.ExpectNoError(err)
+				node := getLocalNode(f)
 				return numberOfDevicesCapacity(node, resourceName) == devsLen &&
 					numberOfDevicesAllocatable(node, resourceName) == devsLen
 			}, 30*time.Second, framework.Poll).Should(gomega.BeTrue())
@@ -263,54 +300,6 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
 			devID2 := parseLog(f, pod2.Name, pod2.Name, deviceIDRE)
 
 			gomega.Expect(devID1).To(gomega.Not(gomega.Equal(devID2)))
-
-			ginkgo.By("By deleting the pods and waiting for container removal")
-			err = f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Delete(context.TODO(), dp.Name, deleteOptions)
-			framework.ExpectNoError(err)
-			waitForContainerRemoval(devicePluginPod.Spec.Containers[0].Name, devicePluginPod.Name, devicePluginPod.Namespace)
-
-			ginkgo.By("Waiting for stub device plugin to become unhealthy on the local node")
-			gomega.Eventually(func() int64 {
-				node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
-				framework.ExpectNoError(err)
-				return numberOfDevicesAllocatable(node, resourceName)
-			}, 30*time.Second, framework.Poll).Should(gomega.Equal(int64(0)))
-
-			ginkgo.By("Checking that scheduled pods can continue to run even after we delete device plugin.")
-			ensurePodContainerRestart(f, pod1.Name, pod1.Name)
-			devIDRestart1 = parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
-			framework.ExpectEqual(devIDRestart1, devID1)
-
-			ensurePodContainerRestart(f, pod2.Name, pod2.Name)
-			devIDRestart2 := parseLog(f, pod2.Name, pod2.Name, deviceIDRE)
-			framework.ExpectEqual(devIDRestart2, devID2)
-
-			ginkgo.By("Re-register resources")
-			devicePluginPod, err = f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Create(context.TODO(), dp, metav1.CreateOptions{})
-			framework.ExpectNoError(err)
-
-			ginkgo.By("Waiting for the resource exported by the stub device plugin to become healthy on the local node")
-			gomega.Eventually(func() int64 {
-				node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
-				framework.ExpectNoError(err)
-				return numberOfDevicesAllocatable(node, resourceName)
-			}, 30*time.Second, framework.Poll).Should(gomega.Equal(devsLen))
-
-			ginkgo.By("by deleting the pods and waiting for container removal")
-			err = f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Delete(context.TODO(), dp.Name, deleteOptions)
-			framework.ExpectNoError(err)
-			waitForContainerRemoval(devicePluginPod.Spec.Containers[0].Name, devicePluginPod.Name, devicePluginPod.Namespace)
-
-			ginkgo.By("Waiting for stub device plugin to become unavailable on the local node")
-			gomega.Eventually(func() bool {
-				node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
-				framework.ExpectNoError(err)
-				return numberOfDevicesCapacity(node, resourceName) <= 0
-			}, 10*time.Minute, framework.Poll).Should(gomega.BeTrue())
-
-			// Cleanup
-			f.PodClient().DeleteSync(pod1.Name, metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
-			f.PodClient().DeleteSync(pod2.Name, metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
 		})
 	})
 }
diff --git a/test/e2e_node/gpu_device_plugin_test.go b/test/e2e_node/gpu_device_plugin_test.go
deleted file mode 100644
index 54f787846da..00000000000
--- a/test/e2e_node/gpu_device_plugin_test.go
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package e2enode
-
-import (
-	"context"
-	"os/exec"
-	"time"
-
-	v1 "k8s.io/api/core/v1"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/util/uuid"
-	"k8s.io/kubernetes/test/e2e/framework"
-	e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
-	e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest"
-
-	"github.com/onsi/ginkgo"
-	"github.com/onsi/gomega"
-)
-
-// numberOfNVIDIAGPUs returns the number of GPUs advertised by a node
-// This is based on the Device Plugin system and expected to run on a COS based node
-// After the NVIDIA drivers were installed
-// TODO make this generic and not linked to COS only
-func numberOfNVIDIAGPUs(node *v1.Node) int64 {
-	val, ok := node.Status.Capacity[e2egpu.NVIDIAGPUResourceName]
-	if !ok {
-		return 0
-	}
-	return val.Value()
-}
-
-// NVIDIADevicePlugin returns the official Google Device Plugin pod for NVIDIA GPU in GKE
-func NVIDIADevicePlugin() *v1.Pod {
-	ds, err := e2emanifest.DaemonSetFromURL(e2egpu.GPUDevicePluginDSYAML)
-	framework.ExpectNoError(err)
-	p := &v1.Pod{
-		ObjectMeta: metav1.ObjectMeta{
-			Name: "device-plugin-nvidia-gpu-" + string(uuid.NewUUID()),
-		},
-		Spec: ds.Spec.Template.Spec,
-	}
-	// Remove node affinity
-	p.Spec.Affinity = nil
-	return p
-}
-
-// Serial because the test restarts Kubelet
-var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeature:GPUDevicePlugin][Serial] [Disruptive]", func() {
-	f := framework.NewDefaultFramework("device-plugin-gpus-errors")
-
-	ginkgo.Context("DevicePlugin", func() {
-		var devicePluginPod *v1.Pod
-		ginkgo.BeforeEach(func() {
-			ginkgo.By("Ensuring that Nvidia GPUs exists on the node")
-			if !checkIfNvidiaGPUsExistOnNode() {
-				ginkgo.Skip("Nvidia GPUs do not exist on the node. Skipping test.")
-			}
-
-			if framework.TestContext.ContainerRuntime != "docker" {
-				ginkgo.Skip("Test works only with in-tree dockershim. Skipping test.")
-			}
-
-			ginkgo.By("Creating the Google Device Plugin pod for NVIDIA GPU")
-			devicePluginPod = f.PodClient().Create(NVIDIADevicePlugin())
-
-			ginkgo.By("Waiting for GPUs to become available on the local node")
-			gomega.Eventually(func() bool {
-				return numberOfNVIDIAGPUs(getLocalNode(f)) > 0
-			}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue(), "GPUs never became available on the local node")
-
-			if numberOfNVIDIAGPUs(getLocalNode(f)) < 2 {
-				ginkgo.Skip("Not enough GPUs to execute this test (at least two needed)")
-			}
-		})
-
-		ginkgo.AfterEach(func() {
-			l, err := f.PodClient().List(context.TODO(), metav1.ListOptions{})
-			framework.ExpectNoError(err)
-
-			f.PodClient().DeleteSync(devicePluginPod.Name, metav1.DeleteOptions{}, 2*time.Minute)
-
-			for _, p := range l.Items {
-				if p.Namespace != f.Namespace.Name {
-					continue
-				}
-
-				framework.Logf("Deleting pod: %s", p.Name)
-				f.PodClient().DeleteSync(p.Name, metav1.DeleteOptions{}, 2*time.Minute)
-			}
-
-			restartKubelet(true)
-
-			ginkgo.By("Waiting for GPUs to become unavailable on the local node")
-			gomega.Eventually(func() bool {
-				node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
-				framework.ExpectNoError(err)
-				return numberOfNVIDIAGPUs(node) <= 0
-			}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())
-		})
-
-		// This test is disabled as this behaviour has not existed since at least
-		// kubernetes 0.19. If this is a bug, then this test should pass when the
-		// issue is resolved. If the behaviour is intentional then it can be removed.
-		ginkgo.XIt("keeps GPU assignation to pods after the device plugin has been removed.", func() {
-			ginkgo.By("Creating one GPU pod")
-			podRECMD := "devs=$(ls /dev/ | egrep '^nvidia[0-9]+$') && echo gpu devices: $devs && sleep 180"
-			p1 := f.PodClient().CreateSync(makeBusyboxPod(e2egpu.NVIDIAGPUResourceName, podRECMD))
-
-			deviceIDRE := "gpu devices: (nvidia[0-9]+)"
-			devID1 := parseLog(f, p1.Name, p1.Name, deviceIDRE)
-			p1, err := f.PodClient().Get(context.TODO(), p1.Name, metav1.GetOptions{})
-			framework.ExpectNoError(err)
-
-			ginkgo.By("Deleting the device plugin")
-			f.PodClient().DeleteSync(devicePluginPod.Name, metav1.DeleteOptions{}, 2*time.Minute)
-
-			ginkgo.By("Waiting for GPUs to become unavailable on the local node")
-			gomega.Eventually(func() int64 {
-				node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
-				framework.ExpectNoError(err)
-				return numberOfNVIDIAGPUs(node)
-			}, 10*time.Minute, framework.Poll).Should(gomega.BeZero(), "Expected GPUs to eventually be unavailable")
-
-			ginkgo.By("Checking that scheduled pods can continue to run even after we delete the device plugin")
-			ensurePodContainerRestart(f, p1.Name, p1.Name)
-			devIDRestart1 := parseLog(f, p1.Name, p1.Name, deviceIDRE)
-			framework.ExpectEqual(devIDRestart1, devID1)
-
-			ginkgo.By("Restarting Kubelet")
-			restartKubelet(true)
-			framework.WaitForAllNodesSchedulable(f.ClientSet, 30*time.Minute)
-
-			ginkgo.By("Checking that scheduled pods can continue to run even after we delete device plugin and restart Kubelet.")
-			ensurePodContainerRestart(f, p1.Name, p1.Name)
-			devIDRestart1 = parseLog(f, p1.Name, p1.Name, deviceIDRE)
-			framework.ExpectEqual(devIDRestart1, devID1)
-
-			// Cleanup
-			f.PodClient().DeleteSync(p1.Name, metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
-		})
-
-		ginkgo.It("keeps GPU assignment to pods across pod and kubelet restarts.", func() {
-			ginkgo.By("Creating one GPU pod on a node with at least two GPUs")
-			podRECMD := "devs=$(ls /dev/ | egrep '^nvidia[0-9]+$') && echo gpu devices: $devs && sleep 40"
-			p1 := f.PodClient().CreateSync(makeBusyboxPod(e2egpu.NVIDIAGPUResourceName, podRECMD))
-
-			deviceIDRE := "gpu devices: (nvidia[0-9]+)"
-			devID1 := parseLog(f, p1.Name, p1.Name, deviceIDRE)
-			p1, err := f.PodClient().Get(context.TODO(), p1.Name, metav1.GetOptions{})
-			framework.ExpectNoError(err)
-
-			ginkgo.By("Confirming that after many pod restarts, GPU assignment is kept")
-			for i := 0; i < 3; i++ {
-				ensurePodContainerRestart(f, p1.Name, p1.Name)
-				devIDRestart1 := parseLog(f, p1.Name, p1.Name, deviceIDRE)
-				framework.ExpectEqual(devIDRestart1, devID1)
-			}
-
-			ginkgo.By("Restarting Kubelet")
-			restartKubelet(true)
-
-			ginkgo.By("Confirming that after a kubelet and pod restart, GPU assignment is kept")
-			ensurePodContainerRestart(f, p1.Name, p1.Name)
-			devIDRestart1 := parseLog(f, p1.Name, p1.Name, deviceIDRE)
-			framework.ExpectEqual(devIDRestart1, devID1)
-
-			ginkgo.By("Restarting Kubelet and creating another pod")
-
-			restartKubelet(true)
-			framework.WaitForAllNodesSchedulable(f.ClientSet, 30*time.Minute)
-
-			ensurePodContainerRestart(f, p1.Name, p1.Name)
-
-			gomega.Eventually(func() bool {
-				return numberOfNVIDIAGPUs(getLocalNode(f)) >= 2
-			}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())
-
-			p2 := f.PodClient().CreateSync(makeBusyboxPod(e2egpu.NVIDIAGPUResourceName, podRECMD))
-
-			ginkgo.By("Checking that pods got a different GPU")
-			devID2 := parseLog(f, p2.Name, p2.Name, deviceIDRE)
-
-			framework.ExpectNotEqual(devID1, devID2)
-		})
-	})
-})
-
-func checkIfNvidiaGPUsExistOnNode() bool {
-	// Cannot use `lspci` because it is not installed on all distros by default.
-	err := exec.Command("/bin/sh", "-c", "find /sys/devices/pci* -type f | grep vendor | xargs cat | grep 0x10de").Run()
-	if err != nil {
-		framework.Logf("check for nvidia GPUs failed. Got Error: %v", err)
-		return false
-	}
-	return true
-}
diff --git a/test/e2e_node/image_list.go b/test/e2e_node/image_list.go
index 7c6aaa6002d..d27f806b13b 100644
--- a/test/e2e_node/image_list.go
+++ b/test/e2e_node/image_list.go
@@ -46,6 +46,9 @@ const (
 	imagePullRetryDelay = time.Second
 	// Number of parallel count to pull images.
 	maxParallelImagePullCount = 5
+
+	// SampleDevicePluginDSYAML is the path of the daemonset template of the sample device plugin. // TODO: Parametrize it by making it a feature in TestFramework.
+	SampleDevicePluginDSYAML = "test/e2e/testing-manifests/sample-device-plugin.yaml"
 )
 
 // NodePrePullImageList is a list of images used in node e2e test. These images will be prepulled
@@ -89,6 +92,11 @@ func updateImageAllowList() {
 	} else {
 		framework.ImagePrePullList.Insert(kubeVirtPluginImage)
 	}
+	if samplePluginImage, err := getSampleDevicePluginImage(); err != nil {
+		klog.Errorln(err)
+	} else {
+		framework.ImagePrePullList.Insert(samplePluginImage)
+	}
 }
 
 func getNodeProblemDetectorImage() string {
@@ -241,6 +249,23 @@ func getGPUDevicePluginImage() (string, error) {
 	return ds.Spec.Template.Spec.Containers[0].Image, nil
 }
 
+func getSampleDevicePluginImage() (string, error) {
+	data, err := e2etestfiles.Read(SampleDevicePluginDSYAML)
+	if err != nil {
+		return "", fmt.Errorf("failed to read the sample plugin yaml: %v", err)
+	}
+
+	ds, err := e2emanifest.DaemonSetFromData(data)
+	if err != nil {
+		return "", fmt.Errorf("failed to parse daemon set for sample plugin: %v", err)
+	}
+
+	if len(ds.Spec.Template.Spec.Containers) < 1 {
+		return "", fmt.Errorf("failed to parse the sample plugin image: cannot extract the container from YAML")
+	}
+	return ds.Spec.Template.Spec.Containers[0].Image, nil
+}
+
 // getSRIOVDevicePluginImage returns the image of SRIOV device plugin.
 func getSRIOVDevicePluginImage() (string, error) {
 	data, err := e2etestfiles.Read(SRIOVDevicePluginDSYAML)