Merge pull request #63130 from vikaschoudhary16/dp_e2e_alloc
Automatic merge from submit-queue (batch tested with PRs 61455, 63346, 63130, 63404). If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

[Device-Plugin]: Extend e2e test to cover node allocatables

**What this PR does / why we need it**: Extends the device plugin e2e test to cover node allocatable.

**Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*: Fixes #

**Special notes for your reviewer**:

**Release note**:
```release-note
None
```

/sig node
/area hw-accelerators
/cc @jiayingz @vishh @RenaudWasTaken
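For context, the substance of the change is that the test now waits for the stub plugin's resource to appear in both the node's capacity and its allocatable before proceeding. Below is a minimal, self-contained sketch of that check: the two helper functions mirror the ones added in this diff, while the `example.com/fake-device` resource name, the hand-built node object, and the standalone `main` are purely illustrative and not part of the PR.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// Hypothetical extended resource name; the e2e test uses the name exported by its stub plugin.
const resourceName = "example.com/fake-device"

// numberOfDevicesCapacity mirrors the helper added in this PR: the device count
// advertised under the node's capacity for the given resource.
func numberOfDevicesCapacity(node *v1.Node, res string) int64 {
	val, ok := node.Status.Capacity[v1.ResourceName(res)]
	if !ok {
		return 0
	}
	return val.Value()
}

// numberOfDevicesAllocatable mirrors the second helper: the device count under
// the node's allocatable, which is what pods can actually request.
func numberOfDevicesAllocatable(node *v1.Node, res string) int64 {
	val, ok := node.Status.Allocatable[v1.ResourceName(res)]
	if !ok {
		return 0
	}
	return val.Value()
}

func main() {
	// A node advertising two fake devices in both capacity and allocatable,
	// standing in for the node object the test fetches from the API server.
	qty := *resource.NewQuantity(2, resource.DecimalSI)
	node := &v1.Node{
		Status: v1.NodeStatus{
			Capacity:    v1.ResourceList{v1.ResourceName(resourceName): qty},
			Allocatable: v1.ResourceList{v1.ResourceName(resourceName): qty},
		},
	}

	devsLen := int64(2)
	// This is the condition the extended test polls with Eventually(...).Should(BeTrue()).
	ready := numberOfDevicesCapacity(node, resourceName) == devsLen &&
		numberOfDevicesAllocatable(node, resourceName) == devsLen
	fmt.Println("devices ready:", ready) // prints: devices ready: true
}
```

Comparing both fields matters because capacity only says the plugin advertised devices, while allocatable is what the scheduler can actually hand to pods, which is the part the previous test did not cover.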
commit a244d8a48f
```diff
@@ -69,33 +69,41 @@ var _ = framework.KubeDescribe("Device Plugin [Feature:DevicePlugin] [Serial]",
 		By("Waiting for the resource exported by the stub device plugin to become available on the local node")
 		devsLen := int64(len(devs))
-		Eventually(func() int64 {
+		Eventually(func() bool {
 			node, err := f.ClientSet.CoreV1().Nodes().Get(framework.TestContext.NodeName, metav1.GetOptions{})
 			framework.ExpectNoError(err)
-			return numberOfDevices(node, resourceName)
-		}, 30*time.Second, framework.Poll).Should(Equal(devsLen))
+			return numberOfDevicesCapacity(node, resourceName) == devsLen &&
+				numberOfDevicesAllocatable(node, resourceName) == devsLen
+		}, 30*time.Second, framework.Poll).Should(BeTrue())

 		By("Creating one pod on node with at least one fake-device")
 		podRECMD := "devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs"
 		pod1 := f.PodClient().CreateSync(makeBusyboxPod(resourceName, podRECMD))
 		deviceIDRE := "stub devices: (Dev-[0-9]+)"
-		count1, devId1 := parseLogFromNRuns(f, pod1.Name, pod1.Name, 0, deviceIDRE)
+		devId1 := parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
 		Expect(devId1).To(Not(Equal("")))

 		pod1, err = f.PodClient().Get(pod1.Name, metav1.GetOptions{})
 		framework.ExpectNoError(err)

-		By("Restarting Kubelet and waiting for the current running pod to restart")
+		ensurePodContainerRestart(f, pod1.Name, pod1.Name)
+
+		By("Confirming that device assignment persists even after container restart")
+		devIdAfterRestart := parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
+		Expect(devIdAfterRestart).To(Equal(devId1))
+
+		By("Restarting Kubelet")
 		restartKubelet()

-		By("Confirming that after a kubelet and pod restart, fake-device assignement is kept")
-		count1, devIdRestart1 := parseLogFromNRuns(f, pod1.Name, pod1.Name, count1+1, deviceIDRE)
+		ensurePodContainerRestart(f, pod1.Name, pod1.Name)
+		By("Confirming that after a kubelet restart, fake-device assignement is kept")
+		devIdRestart1 := parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
 		Expect(devIdRestart1).To(Equal(devId1))

 		By("Wait for node is ready")
 		framework.WaitForAllNodesSchedulable(f.ClientSet, framework.TestContext.NodeSchedulableTimeout)

-		By("Re-Register resources")
+		By("Re-Register resources after kubelet restart")
 		dp1 = dm.NewDevicePluginStub(devs, socketPath)
 		dp1.SetAllocFunc(stubAllocFunc)
 		err = dp1.Start()
@@ -105,17 +113,18 @@ var _ = framework.KubeDescribe("Device Plugin [Feature:DevicePlugin] [Serial]",
 		framework.ExpectNoError(err)

 		By("Waiting for resource to become available on the local node after re-registration")
-		Eventually(func() int64 {
+		Eventually(func() bool {
 			node, err := f.ClientSet.CoreV1().Nodes().Get(framework.TestContext.NodeName, metav1.GetOptions{})
 			framework.ExpectNoError(err)
-			return numberOfDevices(node, resourceName)
-		}, 30*time.Second, framework.Poll).Should(Equal(devsLen))
+			return numberOfDevicesCapacity(node, resourceName) == devsLen &&
+				numberOfDevicesAllocatable(node, resourceName) == devsLen
+		}, 30*time.Second, framework.Poll).Should(BeTrue())

 		By("Creating another pod")
 		pod2 := f.PodClient().CreateSync(makeBusyboxPod(resourceName, podRECMD))

-		By("Checking that pods got a different GPU")
-		count2, devId2 := parseLogFromNRuns(f, pod2.Name, pod2.Name, 1, deviceIDRE)
+		By("Checking that pod got a different fake device")
+		devId2 := parseLog(f, pod2.Name, pod2.Name, deviceIDRE)

 		Expect(devId1).To(Not(Equal(devId2)))

@@ -123,26 +132,59 @@ var _ = framework.KubeDescribe("Device Plugin [Feature:DevicePlugin] [Serial]",
 		err = dp1.Stop()
 		framework.ExpectNoError(err)

+		By("Waiting for stub device plugin to become unhealthy on the local node")
+		Eventually(func() int64 {
+			node, err := f.ClientSet.CoreV1().Nodes().Get(framework.TestContext.NodeName, metav1.GetOptions{})
+			framework.ExpectNoError(err)
+			return numberOfDevicesAllocatable(node, resourceName)
+		}, 30*time.Second, framework.Poll).Should(Equal(int64(0)))
+
+		By("Checking that scheduled pods can continue to run even after we delete device plugin.")
+		ensurePodContainerRestart(f, pod1.Name, pod1.Name)
+		devIdRestart1 = parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
+		Expect(devIdRestart1).To(Equal(devId1))
+
+		ensurePodContainerRestart(f, pod2.Name, pod2.Name)
+		devIdRestart2 := parseLog(f, pod2.Name, pod2.Name, deviceIDRE)
+		Expect(devIdRestart2).To(Equal(devId2))
+
+		By("Re-register resources")
+		dp1 = dm.NewDevicePluginStub(devs, socketPath)
+		dp1.SetAllocFunc(stubAllocFunc)
+		err = dp1.Start()
+		framework.ExpectNoError(err)
+
+		err = dp1.Register(pluginapi.KubeletSocket, resourceName, false)
+		framework.ExpectNoError(err)
+
+		By("Waiting for the resource exported by the stub device plugin to become healthy on the local node")
+		Eventually(func() int64 {
+			node, err := f.ClientSet.CoreV1().Nodes().Get(framework.TestContext.NodeName, metav1.GetOptions{})
+			framework.ExpectNoError(err)
+			return numberOfDevicesAllocatable(node, resourceName)
+		}, 30*time.Second, framework.Poll).Should(Equal(devsLen))
+
+		By("Deleting device plugin again.")
+		err = dp1.Stop()
+		framework.ExpectNoError(err)
+
 		By("Waiting for stub device plugin to become unavailable on the local node")
 		Eventually(func() bool {
 			node, err := f.ClientSet.CoreV1().Nodes().Get(framework.TestContext.NodeName, metav1.GetOptions{})
 			framework.ExpectNoError(err)
-			return numberOfDevices(node, resourceName) <= 0
+			return numberOfDevicesCapacity(node, resourceName) <= 0
 		}, 10*time.Minute, framework.Poll).Should(BeTrue())

-		By("Checking that scheduled pods can continue to run even after we delete device plugin.")
-		count1, devIdRestart1 = parseLogFromNRuns(f, pod1.Name, pod1.Name, count1+1, deviceIDRE)
-		Expect(devIdRestart1).To(Equal(devId1))
-		count2, devIdRestart2 := parseLogFromNRuns(f, pod2.Name, pod2.Name, count2+1, deviceIDRE)
-		Expect(devIdRestart2).To(Equal(devId2))
-
-		By("Restarting Kubelet.")
+		By("Restarting Kubelet second time.")
 		restartKubelet()

-		By("Checking that scheduled pods can continue to run even after we delete device plugin and restart Kubelet.")
-		count1, devIdRestart1 = parseLogFromNRuns(f, pod1.Name, pod1.Name, count1+2, deviceIDRE)
+		By("Checking that scheduled pods can continue to run even after we delete device plugin and restart Kubelet Eventually.")
+		ensurePodContainerRestart(f, pod1.Name, pod1.Name)
+		devIdRestart1 = parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
 		Expect(devIdRestart1).To(Equal(devId1))
-		count2, devIdRestart2 = parseLogFromNRuns(f, pod2.Name, pod2.Name, count2+2, deviceIDRE)
+
+		ensurePodContainerRestart(f, pod2.Name, pod2.Name)
+		devIdRestart2 = parseLog(f, pod2.Name, pod2.Name, deviceIDRE)
 		Expect(devIdRestart2).To(Equal(devId2))

 		// Cleanup
@@ -176,21 +218,28 @@ func makeBusyboxPod(resourceName, cmd string) *v1.Pod {
 	}
 }

-// parseLogFromNRuns returns restart count of the specified container
-// after it has been restarted at least restartCount times,
-// and the matching string for the specified regular expression parsed from the container logs.
-func parseLogFromNRuns(f *framework.Framework, podName string, contName string, restartCount int32, re string) (int32, string) {
-	var count int32
-	// Wait till pod has been restarted at least restartCount times.
+// ensurePodContainerRestart confirms that pod container has restarted at least once
+func ensurePodContainerRestart(f *framework.Framework, podName string, contName string) {
+	var initialCount int32
+	var currentCount int32
+	p, err := f.PodClient().Get(podName, metav1.GetOptions{})
+	if err != nil || len(p.Status.ContainerStatuses) < 1 {
+		framework.Failf("ensurePodContainerRestart failed for pod %q: %v", podName, err)
+	}
+	initialCount = p.Status.ContainerStatuses[0].RestartCount
 	Eventually(func() bool {
-		p, err := f.PodClient().Get(podName, metav1.GetOptions{})
+		p, err = f.PodClient().Get(podName, metav1.GetOptions{})
 		if err != nil || len(p.Status.ContainerStatuses) < 1 {
 			return false
 		}
-		count = p.Status.ContainerStatuses[0].RestartCount
-		return count >= restartCount
-	}, 5*time.Minute, framework.Poll).Should(BeTrue())
+		currentCount = p.Status.ContainerStatuses[0].RestartCount
+		framework.Logf("initial %v, current %v", initialCount, currentCount)
+		return currentCount > initialCount
+	}, 2*time.Minute, framework.Poll).Should(BeTrue())
+}

+// parseLog returns the matching string for the specified regular expression parsed from the container logs.
+func parseLog(f *framework.Framework, podName string, contName string, re string) string {
 	logs, err := framework.GetPodLogs(f.ClientSet, f.Namespace.Name, podName, contName)
 	if err != nil {
 		framework.Failf("GetPodLogs for pod %q failed: %v", podName, err)
@@ -200,14 +249,14 @@ func parseLogFromNRuns(f *framework.Framework, podName string, contName string,
 	regex := regexp.MustCompile(re)
 	matches := regex.FindStringSubmatch(logs)
 	if len(matches) < 2 {
-		return count, ""
+		return ""
 	}

-	return count, matches[1]
+	return matches[1]
 }

-// numberOfDevices returns the number of devices of resourceName advertised by a node
-func numberOfDevices(node *v1.Node, resourceName string) int64 {
+// numberOfDevicesCapacity returns the number of devices of resourceName advertised by a node capacity
+func numberOfDevicesCapacity(node *v1.Node, resourceName string) int64 {
 	val, ok := node.Status.Capacity[v1.ResourceName(resourceName)]
 	if !ok {
 		return 0
@@ -216,6 +265,16 @@ func numberOfDevices(node *v1.Node, resourceName string) int64 {
 	return val.Value()
 }

+// numberOfDevicesAllocatable returns the number of devices of resourceName advertised by a node allocatable
+func numberOfDevicesAllocatable(node *v1.Node, resourceName string) int64 {
+	val, ok := node.Status.Allocatable[v1.ResourceName(resourceName)]
+	if !ok {
+		return 0
+	}
+
+	return val.Value()
+}
+
 // stubAllocFunc will pass to stub device plugin
 func stubAllocFunc(r *pluginapi.AllocateRequest, devs map[string]pluginapi.Device) (*pluginapi.AllocateResponse, error) {
 	var responses pluginapi.AllocateResponse
```
```diff
@@ -80,7 +80,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
 		p1 := f.PodClient().CreateSync(makeBusyboxPod(framework.NVIDIAGPUResourceName, podRECMD))

 		deviceIDRE := "gpu devices: (nvidia[0-9]+)"
-		count1, devId1 := parseLogFromNRuns(f, p1.Name, p1.Name, 1, deviceIDRE)
+		devId1 := parseLog(f, p1.Name, p1.Name, deviceIDRE)
 		p1, err := f.PodClient().Get(p1.Name, metav1.GetOptions{})
 		framework.ExpectNoError(err)

@@ -88,7 +88,8 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
 		restartKubelet()

 		By("Confirming that after a kubelet and pod restart, GPU assignement is kept")
-		count1, devIdRestart1 := parseLogFromNRuns(f, p1.Name, p1.Name, count1+1, deviceIDRE)
+		ensurePodContainerRestart(f, p1.Name, p1.Name)
+		devIdRestart1 := parseLog(f, p1.Name, p1.Name, deviceIDRE)
 		Expect(devIdRestart1).To(Equal(devId1))

 		By("Restarting Kubelet and creating another pod")
@@ -100,7 +101,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
 		p2 := f.PodClient().CreateSync(makeBusyboxPod(framework.NVIDIAGPUResourceName, podRECMD))

 		By("Checking that pods got a different GPU")
-		count2, devId2 := parseLogFromNRuns(f, p2.Name, p2.Name, 1, deviceIDRE)
+		devId2 := parseLog(f, p2.Name, p2.Name, deviceIDRE)

 		Expect(devId1).To(Not(Equal(devId2)))

@@ -113,16 +114,21 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
 			return framework.NumberOfNVIDIAGPUs(node) <= 0
 		}, 10*time.Minute, framework.Poll).Should(BeTrue())
 		By("Checking that scheduled pods can continue to run even after we delete device plugin.")
-		count1, devIdRestart1 = parseLogFromNRuns(f, p1.Name, p1.Name, count1+1, deviceIDRE)
+		ensurePodContainerRestart(f, p1.Name, p1.Name)
+		devIdRestart1 = parseLog(f, p1.Name, p1.Name, deviceIDRE)
 		Expect(devIdRestart1).To(Equal(devId1))
-		count2, devIdRestart2 := parseLogFromNRuns(f, p2.Name, p2.Name, count2+1, deviceIDRE)
+
+		ensurePodContainerRestart(f, p2.Name, p2.Name)
+		devIdRestart2 := parseLog(f, p2.Name, p2.Name, deviceIDRE)
 		Expect(devIdRestart2).To(Equal(devId2))
 		By("Restarting Kubelet.")
 		restartKubelet()
 		By("Checking that scheduled pods can continue to run even after we delete device plugin and restart Kubelet.")
-		count1, devIdRestart1 = parseLogFromNRuns(f, p1.Name, p1.Name, count1+2, deviceIDRE)
+		ensurePodContainerRestart(f, p1.Name, p1.Name)
+		devIdRestart1 = parseLog(f, p1.Name, p1.Name, deviceIDRE)
 		Expect(devIdRestart1).To(Equal(devId1))
-		count2, devIdRestart2 = parseLogFromNRuns(f, p2.Name, p2.Name, count2+2, deviceIDRE)
+		ensurePodContainerRestart(f, p2.Name, p2.Name)
+		devIdRestart2 = parseLog(f, p2.Name, p2.Name, deviceIDRE)
 		Expect(devIdRestart2).To(Equal(devId2))
 		logDevicePluginMetrics()
```