mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-09-14 13:45:06 +00:00
Extends the gpu_device_plugin e2e_node test to verify that already-scheduled pods
keep running after the device plugin is deleted and after the kubelet is restarted.
This commit is contained in:
@@ -18,7 +18,6 @@ package e2e_node
|
||||
|
||||
import (
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"time"
|
||||
|
||||
@@ -49,6 +48,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
|
||||
initialConfig.FeatureGates[string(features.DevicePlugins)] = true
|
||||
})
|
||||
|
||||
var devicePluginPod *v1.Pod
|
||||
BeforeEach(func() {
|
||||
By("Ensuring that Nvidia GPUs exists on the node")
|
||||
if !checkIfNvidiaGPUsExistOnNode() {
|
||||
@@ -56,7 +56,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
|
||||
}
|
||||
|
||||
By("Creating the Google Device Plugin pod for NVIDIA GPU in GKE")
|
||||
f.PodClient().CreateSync(framework.NVIDIADevicePlugin(f.Namespace.Name))
|
||||
devicePluginPod = f.PodClient().CreateSync(framework.NVIDIADevicePlugin(f.Namespace.Name))
|
||||
|
||||
By("Waiting for GPUs to become available on the local node")
|
||||
Eventually(func() bool {
|
||||
@@ -84,7 +84,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
|
||||
It("checks that when Kubelet restarts exclusive GPU assignation to pods is kept.", func() {
|
||||
By("Creating one GPU pod on a node with at least two GPUs")
|
||||
p1 := f.PodClient().CreateSync(makeCudaPauseImage())
|
||||
devId1 := getDeviceId(f, p1.Name, p1.Name, 1)
|
||||
count1, devId1 := getDeviceId(f, p1.Name, p1.Name, 1)
|
||||
p1, err := f.PodClient().Get(p1.Name, metav1.GetOptions{})
|
||||
framework.ExpectNoError(err)
|
||||
|
||||
@@ -92,17 +92,36 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
|
||||
restartKubelet(f)
|
||||
|
||||
By("Confirming that after a kubelet and pod restart, GPU assignement is kept")
|
||||
devIdRestart := getDeviceId(f, p1.Name, p1.Name, 2)
|
||||
Expect(devIdRestart).To(Equal(devId1))
|
||||
count1, devIdRestart1 := getDeviceId(f, p1.Name, p1.Name, count1+1)
|
||||
Expect(devIdRestart1).To(Equal(devId1))
|
||||
|
||||
By("Restarting Kubelet and creating another pod")
|
||||
restartKubelet(f)
|
||||
p2 := f.PodClient().CreateSync(makeCudaPauseImage())
|
||||
|
||||
By("Checking that pods got a different GPU")
|
||||
devId2 := getDeviceId(f, p2.Name, p2.Name, 1)
|
||||
count2, devId2 := getDeviceId(f, p2.Name, p2.Name, 1)
|
||||
Expect(devId1).To(Not(Equal(devId2)))
|
||||
|
||||
By("Deleting device plugin.")
|
||||
f.PodClient().Delete(devicePluginPod.Name, &metav1.DeleteOptions{})
|
||||
By("Waiting for GPUs to become unavailable on the local node")
|
||||
Eventually(func() bool {
|
||||
return framework.NumberOfNVIDIAGPUs(getLocalNode(f)) <= 0
|
||||
}, 10*time.Minute, framework.Poll).Should(BeTrue())
|
||||
By("Checking that scheduled pods can continue to run even after we delete device plugin.")
|
||||
count1, devIdRestart1 = getDeviceId(f, p1.Name, p1.Name, count1+1)
|
||||
Expect(devIdRestart1).To(Equal(devId1))
|
||||
count2, devIdRestart2 := getDeviceId(f, p2.Name, p2.Name, count2+1)
|
||||
Expect(devIdRestart2).To(Equal(devId2))
|
||||
By("Restarting Kubelet.")
|
||||
restartKubelet(f)
|
||||
By("Checking that scheduled pods can continue to run even after we delete device plugin and restart Kubelet.")
|
||||
count1, devIdRestart1 = getDeviceId(f, p1.Name, p1.Name, count1+2)
|
||||
Expect(devIdRestart1).To(Equal(devId1))
|
||||
count2, devIdRestart2 = getDeviceId(f, p2.Name, p2.Name, count2+2)
|
||||
Expect(devIdRestart2).To(Equal(devId2))
|
||||
|
||||
// Cleanup
|
||||
f.PodClient().DeleteSync(p1.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
|
||||
f.PodClient().DeleteSync(p2.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
|
||||
@@ -140,9 +159,6 @@ func newDecimalResourceList(name v1.ResourceName, quantity int64) v1.ResourceLis
|
||||
|
||||
// TODO: Find a uniform way to deal with systemctl/initctl/service operations. #34494
|
||||
func restartKubelet(f *framework.Framework) {
|
||||
beforeSocks, err := filepath.Glob("/var/lib/kubelet/device-plugins/nvidiaGPU*.sock")
|
||||
framework.ExpectNoError(err)
|
||||
Expect(len(beforeSocks)).NotTo(BeZero())
|
||||
stdout, err := exec.Command("sudo", "systemctl", "list-units", "kubelet*", "--state=running").CombinedOutput()
|
||||
framework.ExpectNoError(err)
|
||||
regex := regexp.MustCompile("(kubelet-[0-9]+)")
|
||||
@@ -152,19 +168,18 @@ func restartKubelet(f *framework.Framework) {
|
||||
framework.Logf("Get running kubelet with systemctl: %v, %v", string(stdout), kube)
|
||||
stdout, err = exec.Command("sudo", "systemctl", "restart", kube).CombinedOutput()
|
||||
framework.ExpectNoError(err, "Failed to restart kubelet with systemctl: %v, %v", err, stdout)
|
||||
Eventually(func() ([]string, error) {
|
||||
return filepath.Glob("/var/lib/kubelet/device-plugins/nvidiaGPU*.sock")
|
||||
}, 5*time.Minute, framework.Poll).ShouldNot(ConsistOf(beforeSocks))
|
||||
}
|
||||
|
||||
func getDeviceId(f *framework.Framework, podName string, contName string, restartCount int32) string {
|
||||
func getDeviceId(f *framework.Framework, podName string, contName string, restartCount int32) (int32, string) {
|
||||
var count int32
|
||||
// Wait till pod has been restarted at least restartCount times.
|
||||
Eventually(func() bool {
|
||||
p, err := f.PodClient().Get(podName, metav1.GetOptions{})
|
||||
if err != nil || len(p.Status.ContainerStatuses) < 1 {
|
||||
return false
|
||||
}
|
||||
return p.Status.ContainerStatuses[0].RestartCount >= restartCount
|
||||
count = p.Status.ContainerStatuses[0].RestartCount
|
||||
return count >= restartCount
|
||||
}, 5*time.Minute, framework.Poll).Should(BeTrue())
|
||||
logs, err := framework.GetPodLogs(f.ClientSet, f.Namespace.Name, podName, contName)
|
||||
if err != nil {
|
||||
@@ -174,7 +189,7 @@ func getDeviceId(f *framework.Framework, podName string, contName string, restar
|
||||
regex := regexp.MustCompile("gpu devices: (nvidia[0-9]+)")
|
||||
matches := regex.FindStringSubmatch(logs)
|
||||
if len(matches) < 2 {
|
||||
return ""
|
||||
return count, ""
|
||||
}
|
||||
return matches[1]
|
||||
return count, matches[1]
|
||||
}
|
||||
|
Reference in New Issue
Block a user