mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-22 11:21:47 +00:00
e2e_node: unify device tests
The device_plugin_tests have not run successfully in a very long time, initially being marked flaky and then eventually becoming stale. The gpu_device_plugin_tests have been used to test the same behaviour, but are incredibly high maintenance due to external changes in behaviour from GCP/Nvidia that we have no control over. This commit takes the existing device plugin tests, makes them look more like the GPU tests, and removes the cases that have been unsupported for a long time (namely restarting containers while the plugin is unavailable). It also removes the GPU plugin tests, as we do not get more signal by using real devices here.
This commit is contained in:
parent
9351ea291a
commit
03de802434
@ -45,8 +45,6 @@ import (
|
||||
const (
|
||||
// sampleResourceName is the name of the example resource which is used in the e2e test
|
||||
sampleResourceName = "example.com/resource"
|
||||
// sampleDevicePluginDSYAML is the path of the daemonset template of the sample device plugin. // TODO: Parametrize it by making it a feature in TestFramework.
|
||||
sampleDevicePluginDSYAML = "test/e2e/testing-manifests/sample-device-plugin.yaml"
|
||||
// sampleDevicePluginName is the name of the device plugin pod
|
||||
sampleDevicePluginName = "sample-device-plugin"
|
||||
|
||||
@ -79,7 +77,7 @@ func numberOfSampleResources(node *v1.Node) int64 {
|
||||
|
||||
// getSampleDevicePluginPod returns the Device Plugin pod for sample resources in e2e tests.
|
||||
func getSampleDevicePluginPod() *v1.Pod {
|
||||
data, err := e2etestfiles.Read(sampleDevicePluginDSYAML)
|
||||
data, err := e2etestfiles.Read(SampleDevicePluginDSYAML)
|
||||
if err != nil {
|
||||
framework.Fail(err.Error())
|
||||
}
|
||||
@ -109,21 +107,30 @@ func readDaemonSetV1OrDie(objBytes []byte) *appsv1.DaemonSet {
|
||||
|
||||
func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
|
||||
pluginSockDir = filepath.Join(pluginSockDir) + "/"
|
||||
ginkgo.Context("DevicePlugin", func() {
|
||||
ginkgo.It("[Flaky] Verifies the Kubelet device plugin functionality.", func() {
|
||||
ginkgo.By("Wait for node is ready to start with")
|
||||
e2enode.WaitForNodeToBeReady(f.ClientSet, framework.TestContext.NodeName, 5*time.Minute)
|
||||
ginkgo.Context("DevicePlugin [Serial] [Disruptive]", func() {
|
||||
// TODO(vikasc): Instead of hard-coding number of devices, provide number of devices in the sample-device-plugin using configmap
|
||||
// and then use the same here
|
||||
devsLen := int64(2)
|
||||
var devicePluginPod, dptemplate *v1.Pod
|
||||
|
||||
ginkgo.BeforeEach(func() {
|
||||
ginkgo.By("Wait for node to be ready")
|
||||
gomega.Eventually(func() bool {
|
||||
nodes, err := e2enode.TotalReady(f.ClientSet)
|
||||
framework.ExpectNoError(err)
|
||||
return nodes == 1
|
||||
}, time.Minute, time.Second).Should(gomega.BeTrue())
|
||||
|
||||
ginkgo.By("Scheduling a sample device plugin pod")
|
||||
dp := getSampleDevicePluginPod()
|
||||
dp.Namespace = ""
|
||||
for i := range dp.Spec.Containers[0].Env {
|
||||
if dp.Spec.Containers[0].Env[i].Name == envVarNamePluginSockDir {
|
||||
dp.Spec.Containers[0].Env[i].Value = pluginSockDir
|
||||
}
|
||||
}
|
||||
framework.Logf("env %v", dp.Spec.Containers[0].Env)
|
||||
dp.Spec.NodeName = framework.TestContext.NodeName
|
||||
ginkgo.By("Create sample device plugin pod")
|
||||
devicePluginPod, err := f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Create(context.TODO(), dp, metav1.CreateOptions{})
|
||||
framework.ExpectNoError(err)
|
||||
dptemplate = dp
|
||||
devicePluginPod = f.PodClient().CreateSync(dp)
|
||||
|
||||
ginkgo.By("Waiting for devices to become available on the local node")
|
||||
gomega.Eventually(func() bool {
|
||||
@ -132,18 +139,39 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
|
||||
framework.Logf("Successfully created device plugin pod")
|
||||
|
||||
ginkgo.By("Waiting for the resource exported by the sample device plugin to become available on the local node")
|
||||
// TODO(vikasc): Instead of hard-coding number of devices, provide number of devices in the sample-device-plugin using configmap
|
||||
// and then use the same here
|
||||
devsLen := int64(2)
|
||||
gomega.Eventually(func() bool {
|
||||
node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
|
||||
framework.ExpectNoError(err)
|
||||
node := getLocalNode(f)
|
||||
return numberOfDevicesCapacity(node, resourceName) == devsLen &&
|
||||
numberOfDevicesAllocatable(node, resourceName) == devsLen
|
||||
}, 30*time.Second, framework.Poll).Should(gomega.BeTrue())
|
||||
})
|
||||
|
||||
ginkgo.By("Creating one pod on node with at least one fake-device")
|
||||
podRECMD := "devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs"
|
||||
ginkgo.AfterEach(func() {
|
||||
ginkgo.By("Deleting the device plugin pod")
|
||||
f.PodClient().DeleteSync(devicePluginPod.Name, metav1.DeleteOptions{}, time.Minute)
|
||||
|
||||
ginkgo.By("Deleting any Pods created by the test")
|
||||
l, err := f.PodClient().List(context.TODO(), metav1.ListOptions{})
|
||||
framework.ExpectNoError(err)
|
||||
for _, p := range l.Items {
|
||||
if p.Namespace != f.Namespace.Name {
|
||||
continue
|
||||
}
|
||||
|
||||
framework.Logf("Deleting pod: %s", p.Name)
|
||||
f.PodClient().DeleteSync(p.Name, metav1.DeleteOptions{}, 2*time.Minute)
|
||||
}
|
||||
|
||||
restartKubelet(true)
|
||||
|
||||
ginkgo.By("Waiting for devices to become unavailable on the local node")
|
||||
gomega.Eventually(func() bool {
|
||||
return numberOfSampleResources(getLocalNode(f)) <= 0
|
||||
}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())
|
||||
})
|
||||
|
||||
ginkgo.It("Can schedule a pod that requires a device", func() {
|
||||
podRECMD := "devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep 60"
|
||||
pod1 := f.PodClient().CreateSync(makeBusyboxPod(resourceName, podRECMD))
|
||||
deviceIDRE := "stub devices: (Dev-[0-9]+)"
|
||||
devID1 := parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
|
||||
@ -151,11 +179,9 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
|
||||
|
||||
v1alphaPodResources, err := getV1alpha1NodeDevices()
|
||||
framework.ExpectNoError(err)
|
||||
framework.Logf("v1alpha pod resources %v", v1alphaPodResources)
|
||||
|
||||
v1PodResources, err := getV1NodeDevices()
|
||||
framework.ExpectNoError(err)
|
||||
framework.Logf("v1 pod resources %v", v1PodResources)
|
||||
|
||||
framework.ExpectEqual(len(v1alphaPodResources.PodResources), 2)
|
||||
framework.ExpectEqual(len(v1PodResources.PodResources), 2)
|
||||
@ -166,7 +192,6 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
|
||||
v1alphaResourcesForOurPod = res
|
||||
}
|
||||
}
|
||||
framework.Logf("v1alphaResourcesForOurPod %v", v1alphaResourcesForOurPod)
|
||||
|
||||
var v1ResourcesForOurPod *kubeletpodresourcesv1.PodResources
|
||||
for _, res := range v1PodResources.GetPodResources() {
|
||||
@ -174,7 +199,6 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
|
||||
v1ResourcesForOurPod = res
|
||||
}
|
||||
}
|
||||
framework.Logf("v1ResourcesForOurPod %v", v1ResourcesForOurPod)
|
||||
|
||||
gomega.Expect(v1alphaResourcesForOurPod).NotTo(gomega.BeNil())
|
||||
gomega.Expect(v1ResourcesForOurPod).NotTo(gomega.BeNil())
|
||||
@ -199,8 +223,16 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
|
||||
|
||||
framework.ExpectEqual(len(v1alphaResourcesForOurPod.Containers[0].Devices[0].DeviceIds), 1)
|
||||
framework.ExpectEqual(len(v1ResourcesForOurPod.Containers[0].Devices[0].DeviceIds), 1)
|
||||
})
|
||||
|
||||
pod1, err = f.PodClient().Get(context.TODO(), pod1.Name, metav1.GetOptions{})
|
||||
ginkgo.It("Keeps device plugin assignments across pod and kubelet restarts", func() {
|
||||
podRECMD := "devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep 60"
|
||||
pod1 := f.PodClient().CreateSync(makeBusyboxPod(resourceName, podRECMD))
|
||||
deviceIDRE := "stub devices: (Dev-[0-9]+)"
|
||||
devID1 := parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
|
||||
gomega.Expect(devID1).To(gomega.Not(gomega.Equal("")))
|
||||
|
||||
pod1, err := f.PodClient().Get(context.TODO(), pod1.Name, metav1.GetOptions{})
|
||||
framework.ExpectNoError(err)
|
||||
|
||||
ensurePodContainerRestart(f, pod1.Name, pod1.Name)
|
||||
@ -209,49 +241,54 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
|
||||
devIDAfterRestart := parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
|
||||
framework.ExpectEqual(devIDAfterRestart, devID1)
|
||||
|
||||
restartTime := time.Now()
|
||||
ginkgo.By("Restarting Kubelet")
|
||||
restartKubelet(true)
|
||||
|
||||
// We need to wait for node to be ready before re-registering stub device plugin.
|
||||
// Otherwise, Kubelet DeviceManager may remove the re-registered sockets after it starts.
|
||||
ginkgo.By("Wait for node is ready")
|
||||
gomega.Eventually(func() bool {
|
||||
node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
|
||||
framework.ExpectNoError(err)
|
||||
for _, cond := range node.Status.Conditions {
|
||||
if cond.Type == v1.NodeReady && cond.Status == v1.ConditionTrue && cond.LastHeartbeatTime.After(restartTime) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())
|
||||
ginkgo.By("Wait for node to be ready again")
|
||||
framework.WaitForAllNodesSchedulable(f.ClientSet, 5*time.Minute)
|
||||
|
||||
ginkgo.By("Re-Register resources and deleting the pods and waiting for container removal")
|
||||
getOptions := metav1.GetOptions{}
|
||||
ginkgo.By("Validating that assignment is kept")
|
||||
ensurePodContainerRestart(f, pod1.Name, pod1.Name)
|
||||
ginkgo.By("Confirming that after a kubelet restart, fake-device assignment is kept")
|
||||
devIDRestart1 := parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
|
||||
framework.ExpectEqual(devIDRestart1, devID1)
|
||||
})
|
||||
|
||||
ginkgo.It("Keeps device plugin assignments after the device plugin has been re-registered", func() {
|
||||
podRECMD := "devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs && sleep 60"
|
||||
pod1 := f.PodClient().CreateSync(makeBusyboxPod(resourceName, podRECMD))
|
||||
deviceIDRE := "stub devices: (Dev-[0-9]+)"
|
||||
devID1 := parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
|
||||
gomega.Expect(devID1).To(gomega.Not(gomega.Equal("")))
|
||||
|
||||
pod1, err := f.PodClient().Get(context.TODO(), pod1.Name, metav1.GetOptions{})
|
||||
framework.ExpectNoError(err)
|
||||
|
||||
ginkgo.By("Restarting Kubelet")
|
||||
restartKubelet(true)
|
||||
|
||||
ginkgo.By("Wait for node to be ready again")
|
||||
framework.WaitForAllNodesSchedulable(f.ClientSet, 5*time.Minute)
|
||||
|
||||
ginkgo.By("Re-Register resources and delete the plugin pod")
|
||||
gp := int64(0)
|
||||
deleteOptions := metav1.DeleteOptions{
|
||||
GracePeriodSeconds: &gp,
|
||||
}
|
||||
err = f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Delete(context.TODO(), dp.Name, deleteOptions)
|
||||
framework.ExpectNoError(err)
|
||||
f.PodClient().DeleteSync(devicePluginPod.Name, deleteOptions, time.Minute)
|
||||
waitForContainerRemoval(devicePluginPod.Spec.Containers[0].Name, devicePluginPod.Name, devicePluginPod.Namespace)
|
||||
_, err = f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Get(context.TODO(), dp.Name, getOptions)
|
||||
framework.Logf("Trying to get dp pod after deletion. err must be non-nil. err: %v", err)
|
||||
framework.ExpectError(err)
|
||||
|
||||
devicePluginPod, err = f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Create(context.TODO(), dp, metav1.CreateOptions{})
|
||||
framework.ExpectNoError(err)
|
||||
ginkgo.By("Recreating the plugin pod")
|
||||
devicePluginPod = f.PodClient().CreateSync(dptemplate)
|
||||
|
||||
ginkgo.By("Confirming that after a kubelet and pod restart, fake-device assignment is kept")
|
||||
ensurePodContainerRestart(f, pod1.Name, pod1.Name)
|
||||
ginkgo.By("Confirming that after a kubelet restart, fake-device assignment is kept")
|
||||
devIDRestart1 := parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
|
||||
framework.ExpectEqual(devIDRestart1, devID1)
|
||||
|
||||
ginkgo.By("Waiting for resource to become available on the local node after re-registration")
|
||||
gomega.Eventually(func() bool {
|
||||
node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
|
||||
framework.ExpectNoError(err)
|
||||
node := getLocalNode(f)
|
||||
return numberOfDevicesCapacity(node, resourceName) == devsLen &&
|
||||
numberOfDevicesAllocatable(node, resourceName) == devsLen
|
||||
}, 30*time.Second, framework.Poll).Should(gomega.BeTrue())
|
||||
@ -263,54 +300,6 @@ func testDevicePlugin(f *framework.Framework, pluginSockDir string) {
|
||||
devID2 := parseLog(f, pod2.Name, pod2.Name, deviceIDRE)
|
||||
|
||||
gomega.Expect(devID1).To(gomega.Not(gomega.Equal(devID2)))
|
||||
|
||||
ginkgo.By("By deleting the pods and waiting for container removal")
|
||||
err = f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Delete(context.TODO(), dp.Name, deleteOptions)
|
||||
framework.ExpectNoError(err)
|
||||
waitForContainerRemoval(devicePluginPod.Spec.Containers[0].Name, devicePluginPod.Name, devicePluginPod.Namespace)
|
||||
|
||||
ginkgo.By("Waiting for stub device plugin to become unhealthy on the local node")
|
||||
gomega.Eventually(func() int64 {
|
||||
node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
|
||||
framework.ExpectNoError(err)
|
||||
return numberOfDevicesAllocatable(node, resourceName)
|
||||
}, 30*time.Second, framework.Poll).Should(gomega.Equal(int64(0)))
|
||||
|
||||
ginkgo.By("Checking that scheduled pods can continue to run even after we delete device plugin.")
|
||||
ensurePodContainerRestart(f, pod1.Name, pod1.Name)
|
||||
devIDRestart1 = parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
|
||||
framework.ExpectEqual(devIDRestart1, devID1)
|
||||
|
||||
ensurePodContainerRestart(f, pod2.Name, pod2.Name)
|
||||
devIDRestart2 := parseLog(f, pod2.Name, pod2.Name, deviceIDRE)
|
||||
framework.ExpectEqual(devIDRestart2, devID2)
|
||||
|
||||
ginkgo.By("Re-register resources")
|
||||
devicePluginPod, err = f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Create(context.TODO(), dp, metav1.CreateOptions{})
|
||||
framework.ExpectNoError(err)
|
||||
|
||||
ginkgo.By("Waiting for the resource exported by the stub device plugin to become healthy on the local node")
|
||||
gomega.Eventually(func() int64 {
|
||||
node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
|
||||
framework.ExpectNoError(err)
|
||||
return numberOfDevicesAllocatable(node, resourceName)
|
||||
}, 30*time.Second, framework.Poll).Should(gomega.Equal(devsLen))
|
||||
|
||||
ginkgo.By("by deleting the pods and waiting for container removal")
|
||||
err = f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Delete(context.TODO(), dp.Name, deleteOptions)
|
||||
framework.ExpectNoError(err)
|
||||
waitForContainerRemoval(devicePluginPod.Spec.Containers[0].Name, devicePluginPod.Name, devicePluginPod.Namespace)
|
||||
|
||||
ginkgo.By("Waiting for stub device plugin to become unavailable on the local node")
|
||||
gomega.Eventually(func() bool {
|
||||
node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
|
||||
framework.ExpectNoError(err)
|
||||
return numberOfDevicesCapacity(node, resourceName) <= 0
|
||||
}, 10*time.Minute, framework.Poll).Should(gomega.BeTrue())
|
||||
|
||||
// Cleanup
|
||||
f.PodClient().DeleteSync(pod1.Name, metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
|
||||
f.PodClient().DeleteSync(pod2.Name, metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
|
||||
})
|
||||
})
|
||||
}
|
||||
|
@ -1,211 +0,0 @@
|
||||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package e2enode
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os/exec"
|
||||
"time"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/util/uuid"
|
||||
"k8s.io/kubernetes/test/e2e/framework"
|
||||
e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
|
||||
e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest"
|
||||
|
||||
"github.com/onsi/ginkgo"
|
||||
"github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
// numberOfNVIDIAGPUs returns the number of GPUs advertised by a node
|
||||
// This is based on the Device Plugin system and expected to run on a COS based node
|
||||
// After the NVIDIA drivers were installed
|
||||
// TODO make this generic and not linked to COS only
|
||||
func numberOfNVIDIAGPUs(node *v1.Node) int64 {
|
||||
val, ok := node.Status.Capacity[e2egpu.NVIDIAGPUResourceName]
|
||||
if !ok {
|
||||
return 0
|
||||
}
|
||||
return val.Value()
|
||||
}
|
||||
|
||||
// NVIDIADevicePlugin returns the official Google Device Plugin pod for NVIDIA GPU in GKE
|
||||
func NVIDIADevicePlugin() *v1.Pod {
|
||||
ds, err := e2emanifest.DaemonSetFromURL(e2egpu.GPUDevicePluginDSYAML)
|
||||
framework.ExpectNoError(err)
|
||||
p := &v1.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "device-plugin-nvidia-gpu-" + string(uuid.NewUUID()),
|
||||
},
|
||||
Spec: ds.Spec.Template.Spec,
|
||||
}
|
||||
// Remove node affinity
|
||||
p.Spec.Affinity = nil
|
||||
return p
|
||||
}
|
||||
|
||||
// Serial because the test restarts Kubelet
|
||||
var _ = SIGDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugin][NodeFeature:GPUDevicePlugin][Serial] [Disruptive]", func() {
|
||||
f := framework.NewDefaultFramework("device-plugin-gpus-errors")
|
||||
|
||||
ginkgo.Context("DevicePlugin", func() {
|
||||
var devicePluginPod *v1.Pod
|
||||
ginkgo.BeforeEach(func() {
|
||||
ginkgo.By("Ensuring that Nvidia GPUs exists on the node")
|
||||
if !checkIfNvidiaGPUsExistOnNode() {
|
||||
ginkgo.Skip("Nvidia GPUs do not exist on the node. Skipping test.")
|
||||
}
|
||||
|
||||
if framework.TestContext.ContainerRuntime != "docker" {
|
||||
ginkgo.Skip("Test works only with in-tree dockershim. Skipping test.")
|
||||
}
|
||||
|
||||
ginkgo.By("Creating the Google Device Plugin pod for NVIDIA GPU")
|
||||
devicePluginPod = f.PodClient().Create(NVIDIADevicePlugin())
|
||||
|
||||
ginkgo.By("Waiting for GPUs to become available on the local node")
|
||||
gomega.Eventually(func() bool {
|
||||
return numberOfNVIDIAGPUs(getLocalNode(f)) > 0
|
||||
}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue(), "GPUs never became available on the local node")
|
||||
|
||||
if numberOfNVIDIAGPUs(getLocalNode(f)) < 2 {
|
||||
ginkgo.Skip("Not enough GPUs to execute this test (at least two needed)")
|
||||
}
|
||||
})
|
||||
|
||||
ginkgo.AfterEach(func() {
|
||||
l, err := f.PodClient().List(context.TODO(), metav1.ListOptions{})
|
||||
framework.ExpectNoError(err)
|
||||
|
||||
f.PodClient().DeleteSync(devicePluginPod.Name, metav1.DeleteOptions{}, 2*time.Minute)
|
||||
|
||||
for _, p := range l.Items {
|
||||
if p.Namespace != f.Namespace.Name {
|
||||
continue
|
||||
}
|
||||
|
||||
framework.Logf("Deleting pod: %s", p.Name)
|
||||
f.PodClient().DeleteSync(p.Name, metav1.DeleteOptions{}, 2*time.Minute)
|
||||
}
|
||||
|
||||
restartKubelet(true)
|
||||
|
||||
ginkgo.By("Waiting for GPUs to become unavailable on the local node")
|
||||
gomega.Eventually(func() bool {
|
||||
node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
|
||||
framework.ExpectNoError(err)
|
||||
return numberOfNVIDIAGPUs(node) <= 0
|
||||
}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())
|
||||
})
|
||||
|
||||
// This test is disabled as this behaviour has not existed since at least
|
||||
// kubernetes 0.19. If this is a bug, then this test should pass when the
|
||||
// issue is resolved. If the behaviour is intentional then it can be removed.
|
||||
ginkgo.XIt("keeps GPU assignation to pods after the device plugin has been removed.", func() {
|
||||
ginkgo.By("Creating one GPU pod")
|
||||
podRECMD := "devs=$(ls /dev/ | egrep '^nvidia[0-9]+$') && echo gpu devices: $devs && sleep 180"
|
||||
p1 := f.PodClient().CreateSync(makeBusyboxPod(e2egpu.NVIDIAGPUResourceName, podRECMD))
|
||||
|
||||
deviceIDRE := "gpu devices: (nvidia[0-9]+)"
|
||||
devID1 := parseLog(f, p1.Name, p1.Name, deviceIDRE)
|
||||
p1, err := f.PodClient().Get(context.TODO(), p1.Name, metav1.GetOptions{})
|
||||
framework.ExpectNoError(err)
|
||||
|
||||
ginkgo.By("Deleting the device plugin")
|
||||
f.PodClient().DeleteSync(devicePluginPod.Name, metav1.DeleteOptions{}, 2*time.Minute)
|
||||
|
||||
ginkgo.By("Waiting for GPUs to become unavailable on the local node")
|
||||
gomega.Eventually(func() int64 {
|
||||
node, err := f.ClientSet.CoreV1().Nodes().Get(context.TODO(), framework.TestContext.NodeName, metav1.GetOptions{})
|
||||
framework.ExpectNoError(err)
|
||||
return numberOfNVIDIAGPUs(node)
|
||||
}, 10*time.Minute, framework.Poll).Should(gomega.BeZero(), "Expected GPUs to eventually be unavailable")
|
||||
|
||||
ginkgo.By("Checking that scheduled pods can continue to run even after we delete the device plugin")
|
||||
ensurePodContainerRestart(f, p1.Name, p1.Name)
|
||||
devIDRestart1 := parseLog(f, p1.Name, p1.Name, deviceIDRE)
|
||||
framework.ExpectEqual(devIDRestart1, devID1)
|
||||
|
||||
ginkgo.By("Restarting Kubelet")
|
||||
restartKubelet(true)
|
||||
framework.WaitForAllNodesSchedulable(f.ClientSet, 30*time.Minute)
|
||||
|
||||
ginkgo.By("Checking that scheduled pods can continue to run even after we delete device plugin and restart Kubelet.")
|
||||
ensurePodContainerRestart(f, p1.Name, p1.Name)
|
||||
devIDRestart1 = parseLog(f, p1.Name, p1.Name, deviceIDRE)
|
||||
framework.ExpectEqual(devIDRestart1, devID1)
|
||||
|
||||
// Cleanup
|
||||
f.PodClient().DeleteSync(p1.Name, metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
|
||||
})
|
||||
|
||||
ginkgo.It("keeps GPU assignment to pods across pod and kubelet restarts.", func() {
|
||||
ginkgo.By("Creating one GPU pod on a node with at least two GPUs")
|
||||
podRECMD := "devs=$(ls /dev/ | egrep '^nvidia[0-9]+$') && echo gpu devices: $devs && sleep 40"
|
||||
p1 := f.PodClient().CreateSync(makeBusyboxPod(e2egpu.NVIDIAGPUResourceName, podRECMD))
|
||||
|
||||
deviceIDRE := "gpu devices: (nvidia[0-9]+)"
|
||||
devID1 := parseLog(f, p1.Name, p1.Name, deviceIDRE)
|
||||
p1, err := f.PodClient().Get(context.TODO(), p1.Name, metav1.GetOptions{})
|
||||
framework.ExpectNoError(err)
|
||||
|
||||
ginkgo.By("Confirming that after many pod restarts, GPU assignment is kept")
|
||||
for i := 0; i < 3; i++ {
|
||||
ensurePodContainerRestart(f, p1.Name, p1.Name)
|
||||
devIDRestart1 := parseLog(f, p1.Name, p1.Name, deviceIDRE)
|
||||
framework.ExpectEqual(devIDRestart1, devID1)
|
||||
}
|
||||
|
||||
ginkgo.By("Restarting Kubelet")
|
||||
restartKubelet(true)
|
||||
|
||||
ginkgo.By("Confirming that after a kubelet and pod restart, GPU assignment is kept")
|
||||
ensurePodContainerRestart(f, p1.Name, p1.Name)
|
||||
devIDRestart1 := parseLog(f, p1.Name, p1.Name, deviceIDRE)
|
||||
framework.ExpectEqual(devIDRestart1, devID1)
|
||||
|
||||
ginkgo.By("Restarting Kubelet and creating another pod")
|
||||
|
||||
restartKubelet(true)
|
||||
framework.WaitForAllNodesSchedulable(f.ClientSet, 30*time.Minute)
|
||||
|
||||
ensurePodContainerRestart(f, p1.Name, p1.Name)
|
||||
|
||||
gomega.Eventually(func() bool {
|
||||
return numberOfNVIDIAGPUs(getLocalNode(f)) >= 2
|
||||
}, 5*time.Minute, framework.Poll).Should(gomega.BeTrue())
|
||||
|
||||
p2 := f.PodClient().CreateSync(makeBusyboxPod(e2egpu.NVIDIAGPUResourceName, podRECMD))
|
||||
|
||||
ginkgo.By("Checking that pods got a different GPU")
|
||||
devID2 := parseLog(f, p2.Name, p2.Name, deviceIDRE)
|
||||
|
||||
framework.ExpectNotEqual(devID1, devID2)
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
func checkIfNvidiaGPUsExistOnNode() bool {
|
||||
// Cannot use `lspci` because it is not installed on all distros by default.
|
||||
err := exec.Command("/bin/sh", "-c", "find /sys/devices/pci* -type f | grep vendor | xargs cat | grep 0x10de").Run()
|
||||
if err != nil {
|
||||
framework.Logf("check for nvidia GPUs failed. Got Error: %v", err)
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
@ -46,6 +46,9 @@ const (
|
||||
imagePullRetryDelay = time.Second
|
||||
// Number of parallel count to pull images.
|
||||
maxParallelImagePullCount = 5
|
||||
|
||||
// SampleDevicePluginDSYAML is the path of the daemonset template of the sample device plugin. // TODO: Parametrize it by making it a feature in TestFramework.
|
||||
SampleDevicePluginDSYAML = "test/e2e/testing-manifests/sample-device-plugin.yaml"
|
||||
)
|
||||
|
||||
// NodePrePullImageList is a list of images used in node e2e test. These images will be prepulled
|
||||
@ -89,6 +92,11 @@ func updateImageAllowList() {
|
||||
} else {
|
||||
framework.ImagePrePullList.Insert(kubeVirtPluginImage)
|
||||
}
|
||||
if samplePluginImage, err := getSampleDevicePluginImage(); err != nil {
|
||||
klog.Errorln(err)
|
||||
} else {
|
||||
framework.ImagePrePullList.Insert(samplePluginImage)
|
||||
}
|
||||
}
|
||||
|
||||
func getNodeProblemDetectorImage() string {
|
||||
@ -241,6 +249,23 @@ func getGPUDevicePluginImage() (string, error) {
|
||||
return ds.Spec.Template.Spec.Containers[0].Image, nil
|
||||
}
|
||||
|
||||
func getSampleDevicePluginImage() (string, error) {
|
||||
data, err := e2etestfiles.Read(SampleDevicePluginDSYAML)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to read the sample plugin yaml: %v", err)
|
||||
}
|
||||
|
||||
ds, err := e2emanifest.DaemonSetFromData(data)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to parse daemon set for sample plugin: %v", err)
|
||||
}
|
||||
|
||||
if len(ds.Spec.Template.Spec.Containers) < 1 {
|
||||
return "", fmt.Errorf("failed to parse the sample plugin image: cannot extract the container from YAML")
|
||||
}
|
||||
return ds.Spec.Template.Spec.Containers[0].Image, nil
|
||||
}
|
||||
|
||||
// getSRIOVDevicePluginImage returns the image of SRIOV device plugin.
|
||||
func getSRIOVDevicePluginImage() (string, error) {
|
||||
data, err := e2etestfiles.Read(SRIOVDevicePluginDSYAML)
|
||||
|
Loading…
Reference in New Issue
Block a user