Merge pull request #63130 from vikaschoudhary16/dp_e2e_alloc

Automatic merge from submit-queue (batch tested with PRs 61455, 63346, 63130, 63404). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>.

[Device-Plugin]: Extend e2e test to cover node allocatables

**What this PR does / why we need it**:
 Extends device plugin e2e to cover node allocatable
**Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*:
Fixes #

**Special notes for your reviewer**:

**Release note**:

```release-note
None
```
/sig node
/area hw-accelerators
/cc @jiayingz @vishh @RenaudWasTaken
This commit is contained in:
Kubernetes Submit Queue 2018-05-03 14:24:10 -07:00 committed by GitHub
commit a244d8a48f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 110 additions and 45 deletions

View File

@ -69,33 +69,41 @@ var _ = framework.KubeDescribe("Device Plugin [Feature:DevicePlugin] [Serial]",
By("Waiting for the resource exported by the stub device plugin to become available on the local node") By("Waiting for the resource exported by the stub device plugin to become available on the local node")
devsLen := int64(len(devs)) devsLen := int64(len(devs))
Eventually(func() int64 { Eventually(func() bool {
node, err := f.ClientSet.CoreV1().Nodes().Get(framework.TestContext.NodeName, metav1.GetOptions{}) node, err := f.ClientSet.CoreV1().Nodes().Get(framework.TestContext.NodeName, metav1.GetOptions{})
framework.ExpectNoError(err) framework.ExpectNoError(err)
return numberOfDevices(node, resourceName) return numberOfDevicesCapacity(node, resourceName) == devsLen &&
}, 30*time.Second, framework.Poll).Should(Equal(devsLen)) numberOfDevicesAllocatable(node, resourceName) == devsLen
}, 30*time.Second, framework.Poll).Should(BeTrue())
By("Creating one pod on node with at least one fake-device") By("Creating one pod on node with at least one fake-device")
podRECMD := "devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs" podRECMD := "devs=$(ls /tmp/ | egrep '^Dev-[0-9]+$') && echo stub devices: $devs"
pod1 := f.PodClient().CreateSync(makeBusyboxPod(resourceName, podRECMD)) pod1 := f.PodClient().CreateSync(makeBusyboxPod(resourceName, podRECMD))
deviceIDRE := "stub devices: (Dev-[0-9]+)" deviceIDRE := "stub devices: (Dev-[0-9]+)"
count1, devId1 := parseLogFromNRuns(f, pod1.Name, pod1.Name, 0, deviceIDRE) devId1 := parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
Expect(devId1).To(Not(Equal(""))) Expect(devId1).To(Not(Equal("")))
pod1, err = f.PodClient().Get(pod1.Name, metav1.GetOptions{}) pod1, err = f.PodClient().Get(pod1.Name, metav1.GetOptions{})
framework.ExpectNoError(err) framework.ExpectNoError(err)
By("Restarting Kubelet and waiting for the current running pod to restart") ensurePodContainerRestart(f, pod1.Name, pod1.Name)
By("Confirming that device assignment persists even after container restart")
devIdAfterRestart := parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
Expect(devIdAfterRestart).To(Equal(devId1))
By("Restarting Kubelet")
restartKubelet() restartKubelet()
By("Confirming that after a kubelet and pod restart, fake-device assignement is kept") ensurePodContainerRestart(f, pod1.Name, pod1.Name)
count1, devIdRestart1 := parseLogFromNRuns(f, pod1.Name, pod1.Name, count1+1, deviceIDRE) By("Confirming that after a kubelet restart, fake-device assignement is kept")
devIdRestart1 := parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
Expect(devIdRestart1).To(Equal(devId1)) Expect(devIdRestart1).To(Equal(devId1))
By("Wait for node is ready") By("Wait for node is ready")
framework.WaitForAllNodesSchedulable(f.ClientSet, framework.TestContext.NodeSchedulableTimeout) framework.WaitForAllNodesSchedulable(f.ClientSet, framework.TestContext.NodeSchedulableTimeout)
By("Re-Register resources") By("Re-Register resources after kubelet restart")
dp1 = dm.NewDevicePluginStub(devs, socketPath) dp1 = dm.NewDevicePluginStub(devs, socketPath)
dp1.SetAllocFunc(stubAllocFunc) dp1.SetAllocFunc(stubAllocFunc)
err = dp1.Start() err = dp1.Start()
@ -105,17 +113,18 @@ var _ = framework.KubeDescribe("Device Plugin [Feature:DevicePlugin] [Serial]",
framework.ExpectNoError(err) framework.ExpectNoError(err)
By("Waiting for resource to become available on the local node after re-registration") By("Waiting for resource to become available on the local node after re-registration")
Eventually(func() int64 { Eventually(func() bool {
node, err := f.ClientSet.CoreV1().Nodes().Get(framework.TestContext.NodeName, metav1.GetOptions{}) node, err := f.ClientSet.CoreV1().Nodes().Get(framework.TestContext.NodeName, metav1.GetOptions{})
framework.ExpectNoError(err) framework.ExpectNoError(err)
return numberOfDevices(node, resourceName) return numberOfDevicesCapacity(node, resourceName) == devsLen &&
}, 30*time.Second, framework.Poll).Should(Equal(devsLen)) numberOfDevicesAllocatable(node, resourceName) == devsLen
}, 30*time.Second, framework.Poll).Should(BeTrue())
By("Creating another pod") By("Creating another pod")
pod2 := f.PodClient().CreateSync(makeBusyboxPod(resourceName, podRECMD)) pod2 := f.PodClient().CreateSync(makeBusyboxPod(resourceName, podRECMD))
By("Checking that pods got a different GPU") By("Checking that pod got a different fake device")
count2, devId2 := parseLogFromNRuns(f, pod2.Name, pod2.Name, 1, deviceIDRE) devId2 := parseLog(f, pod2.Name, pod2.Name, deviceIDRE)
Expect(devId1).To(Not(Equal(devId2))) Expect(devId1).To(Not(Equal(devId2)))
@ -123,26 +132,59 @@ var _ = framework.KubeDescribe("Device Plugin [Feature:DevicePlugin] [Serial]",
err = dp1.Stop() err = dp1.Stop()
framework.ExpectNoError(err) framework.ExpectNoError(err)
By("Waiting for stub device plugin to become unhealthy on the local node")
Eventually(func() int64 {
node, err := f.ClientSet.CoreV1().Nodes().Get(framework.TestContext.NodeName, metav1.GetOptions{})
framework.ExpectNoError(err)
return numberOfDevicesAllocatable(node, resourceName)
}, 30*time.Second, framework.Poll).Should(Equal(int64(0)))
By("Checking that scheduled pods can continue to run even after we delete device plugin.")
ensurePodContainerRestart(f, pod1.Name, pod1.Name)
devIdRestart1 = parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
Expect(devIdRestart1).To(Equal(devId1))
ensurePodContainerRestart(f, pod2.Name, pod2.Name)
devIdRestart2 := parseLog(f, pod2.Name, pod2.Name, deviceIDRE)
Expect(devIdRestart2).To(Equal(devId2))
By("Re-register resources")
dp1 = dm.NewDevicePluginStub(devs, socketPath)
dp1.SetAllocFunc(stubAllocFunc)
err = dp1.Start()
framework.ExpectNoError(err)
err = dp1.Register(pluginapi.KubeletSocket, resourceName, false)
framework.ExpectNoError(err)
By("Waiting for the resource exported by the stub device plugin to become healthy on the local node")
Eventually(func() int64 {
node, err := f.ClientSet.CoreV1().Nodes().Get(framework.TestContext.NodeName, metav1.GetOptions{})
framework.ExpectNoError(err)
return numberOfDevicesAllocatable(node, resourceName)
}, 30*time.Second, framework.Poll).Should(Equal(devsLen))
By("Deleting device plugin again.")
err = dp1.Stop()
framework.ExpectNoError(err)
By("Waiting for stub device plugin to become unavailable on the local node") By("Waiting for stub device plugin to become unavailable on the local node")
Eventually(func() bool { Eventually(func() bool {
node, err := f.ClientSet.CoreV1().Nodes().Get(framework.TestContext.NodeName, metav1.GetOptions{}) node, err := f.ClientSet.CoreV1().Nodes().Get(framework.TestContext.NodeName, metav1.GetOptions{})
framework.ExpectNoError(err) framework.ExpectNoError(err)
return numberOfDevices(node, resourceName) <= 0 return numberOfDevicesCapacity(node, resourceName) <= 0
}, 10*time.Minute, framework.Poll).Should(BeTrue()) }, 10*time.Minute, framework.Poll).Should(BeTrue())
By("Checking that scheduled pods can continue to run even after we delete device plugin.") By("Restarting Kubelet second time.")
count1, devIdRestart1 = parseLogFromNRuns(f, pod1.Name, pod1.Name, count1+1, deviceIDRE)
Expect(devIdRestart1).To(Equal(devId1))
count2, devIdRestart2 := parseLogFromNRuns(f, pod2.Name, pod2.Name, count2+1, deviceIDRE)
Expect(devIdRestart2).To(Equal(devId2))
By("Restarting Kubelet.")
restartKubelet() restartKubelet()
By("Checking that scheduled pods can continue to run even after we delete device plugin and restart Kubelet.") By("Checking that scheduled pods can continue to run even after we delete device plugin and restart Kubelet Eventually.")
count1, devIdRestart1 = parseLogFromNRuns(f, pod1.Name, pod1.Name, count1+2, deviceIDRE) ensurePodContainerRestart(f, pod1.Name, pod1.Name)
devIdRestart1 = parseLog(f, pod1.Name, pod1.Name, deviceIDRE)
Expect(devIdRestart1).To(Equal(devId1)) Expect(devIdRestart1).To(Equal(devId1))
count2, devIdRestart2 = parseLogFromNRuns(f, pod2.Name, pod2.Name, count2+2, deviceIDRE)
ensurePodContainerRestart(f, pod2.Name, pod2.Name)
devIdRestart2 = parseLog(f, pod2.Name, pod2.Name, deviceIDRE)
Expect(devIdRestart2).To(Equal(devId2)) Expect(devIdRestart2).To(Equal(devId2))
// Cleanup // Cleanup
@ -176,21 +218,28 @@ func makeBusyboxPod(resourceName, cmd string) *v1.Pod {
} }
} }
// parseLogFromNRuns returns restart count of the specified container // ensurePodContainerRestart confirms that pod container has restarted at least once
// after it has been restarted at least restartCount times, func ensurePodContainerRestart(f *framework.Framework, podName string, contName string) {
// and the matching string for the specified regular expression parsed from the container logs. var initialCount int32
func parseLogFromNRuns(f *framework.Framework, podName string, contName string, restartCount int32, re string) (int32, string) { var currentCount int32
var count int32 p, err := f.PodClient().Get(podName, metav1.GetOptions{})
// Wait till pod has been restarted at least restartCount times. if err != nil || len(p.Status.ContainerStatuses) < 1 {
framework.Failf("ensurePodContainerRestart failed for pod %q: %v", podName, err)
}
initialCount = p.Status.ContainerStatuses[0].RestartCount
Eventually(func() bool { Eventually(func() bool {
p, err := f.PodClient().Get(podName, metav1.GetOptions{}) p, err = f.PodClient().Get(podName, metav1.GetOptions{})
if err != nil || len(p.Status.ContainerStatuses) < 1 { if err != nil || len(p.Status.ContainerStatuses) < 1 {
return false return false
} }
count = p.Status.ContainerStatuses[0].RestartCount currentCount = p.Status.ContainerStatuses[0].RestartCount
return count >= restartCount framework.Logf("initial %v, current %v", initialCount, currentCount)
}, 5*time.Minute, framework.Poll).Should(BeTrue()) return currentCount > initialCount
}, 2*time.Minute, framework.Poll).Should(BeTrue())
}
// parseLog returns the matching string for the specified regular expression parsed from the container logs.
func parseLog(f *framework.Framework, podName string, contName string, re string) string {
logs, err := framework.GetPodLogs(f.ClientSet, f.Namespace.Name, podName, contName) logs, err := framework.GetPodLogs(f.ClientSet, f.Namespace.Name, podName, contName)
if err != nil { if err != nil {
framework.Failf("GetPodLogs for pod %q failed: %v", podName, err) framework.Failf("GetPodLogs for pod %q failed: %v", podName, err)
@ -200,14 +249,14 @@ func parseLogFromNRuns(f *framework.Framework, podName string, contName string,
regex := regexp.MustCompile(re) regex := regexp.MustCompile(re)
matches := regex.FindStringSubmatch(logs) matches := regex.FindStringSubmatch(logs)
if len(matches) < 2 { if len(matches) < 2 {
return count, "" return ""
} }
return count, matches[1] return matches[1]
} }
// numberOfDevices returns the number of devices of resourceName advertised by a node // numberOfDevicesCapacity returns the number of devices of resourceName advertised by a node capacity
func numberOfDevices(node *v1.Node, resourceName string) int64 { func numberOfDevicesCapacity(node *v1.Node, resourceName string) int64 {
val, ok := node.Status.Capacity[v1.ResourceName(resourceName)] val, ok := node.Status.Capacity[v1.ResourceName(resourceName)]
if !ok { if !ok {
return 0 return 0
@ -216,6 +265,16 @@ func numberOfDevices(node *v1.Node, resourceName string) int64 {
return val.Value() return val.Value()
} }
// numberOfDevicesAllocatable returns the number of devices of resourceName advertised by a node allocatable
func numberOfDevicesAllocatable(node *v1.Node, resourceName string) int64 {
val, ok := node.Status.Allocatable[v1.ResourceName(resourceName)]
if !ok {
return 0
}
return val.Value()
}
// stubAllocFunc will pass to stub device plugin // stubAllocFunc will pass to stub device plugin
func stubAllocFunc(r *pluginapi.AllocateRequest, devs map[string]pluginapi.Device) (*pluginapi.AllocateResponse, error) { func stubAllocFunc(r *pluginapi.AllocateRequest, devs map[string]pluginapi.Device) (*pluginapi.AllocateResponse, error) {
var responses pluginapi.AllocateResponse var responses pluginapi.AllocateResponse

View File

@ -80,7 +80,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
p1 := f.PodClient().CreateSync(makeBusyboxPod(framework.NVIDIAGPUResourceName, podRECMD)) p1 := f.PodClient().CreateSync(makeBusyboxPod(framework.NVIDIAGPUResourceName, podRECMD))
deviceIDRE := "gpu devices: (nvidia[0-9]+)" deviceIDRE := "gpu devices: (nvidia[0-9]+)"
count1, devId1 := parseLogFromNRuns(f, p1.Name, p1.Name, 1, deviceIDRE) devId1 := parseLog(f, p1.Name, p1.Name, deviceIDRE)
p1, err := f.PodClient().Get(p1.Name, metav1.GetOptions{}) p1, err := f.PodClient().Get(p1.Name, metav1.GetOptions{})
framework.ExpectNoError(err) framework.ExpectNoError(err)
@ -88,7 +88,8 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
restartKubelet() restartKubelet()
By("Confirming that after a kubelet and pod restart, GPU assignement is kept") By("Confirming that after a kubelet and pod restart, GPU assignement is kept")
count1, devIdRestart1 := parseLogFromNRuns(f, p1.Name, p1.Name, count1+1, deviceIDRE) ensurePodContainerRestart(f, p1.Name, p1.Name)
devIdRestart1 := parseLog(f, p1.Name, p1.Name, deviceIDRE)
Expect(devIdRestart1).To(Equal(devId1)) Expect(devIdRestart1).To(Equal(devId1))
By("Restarting Kubelet and creating another pod") By("Restarting Kubelet and creating another pod")
@ -100,7 +101,7 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
p2 := f.PodClient().CreateSync(makeBusyboxPod(framework.NVIDIAGPUResourceName, podRECMD)) p2 := f.PodClient().CreateSync(makeBusyboxPod(framework.NVIDIAGPUResourceName, podRECMD))
By("Checking that pods got a different GPU") By("Checking that pods got a different GPU")
count2, devId2 := parseLogFromNRuns(f, p2.Name, p2.Name, 1, deviceIDRE) devId2 := parseLog(f, p2.Name, p2.Name, deviceIDRE)
Expect(devId1).To(Not(Equal(devId2))) Expect(devId1).To(Not(Equal(devId2)))
@ -113,16 +114,21 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
return framework.NumberOfNVIDIAGPUs(node) <= 0 return framework.NumberOfNVIDIAGPUs(node) <= 0
}, 10*time.Minute, framework.Poll).Should(BeTrue()) }, 10*time.Minute, framework.Poll).Should(BeTrue())
By("Checking that scheduled pods can continue to run even after we delete device plugin.") By("Checking that scheduled pods can continue to run even after we delete device plugin.")
count1, devIdRestart1 = parseLogFromNRuns(f, p1.Name, p1.Name, count1+1, deviceIDRE) ensurePodContainerRestart(f, p1.Name, p1.Name)
devIdRestart1 = parseLog(f, p1.Name, p1.Name, deviceIDRE)
Expect(devIdRestart1).To(Equal(devId1)) Expect(devIdRestart1).To(Equal(devId1))
count2, devIdRestart2 := parseLogFromNRuns(f, p2.Name, p2.Name, count2+1, deviceIDRE)
ensurePodContainerRestart(f, p2.Name, p2.Name)
devIdRestart2 := parseLog(f, p2.Name, p2.Name, deviceIDRE)
Expect(devIdRestart2).To(Equal(devId2)) Expect(devIdRestart2).To(Equal(devId2))
By("Restarting Kubelet.") By("Restarting Kubelet.")
restartKubelet() restartKubelet()
By("Checking that scheduled pods can continue to run even after we delete device plugin and restart Kubelet.") By("Checking that scheduled pods can continue to run even after we delete device plugin and restart Kubelet.")
count1, devIdRestart1 = parseLogFromNRuns(f, p1.Name, p1.Name, count1+2, deviceIDRE) ensurePodContainerRestart(f, p1.Name, p1.Name)
devIdRestart1 = parseLog(f, p1.Name, p1.Name, deviceIDRE)
Expect(devIdRestart1).To(Equal(devId1)) Expect(devIdRestart1).To(Equal(devId1))
count2, devIdRestart2 = parseLogFromNRuns(f, p2.Name, p2.Name, count2+2, deviceIDRE) ensurePodContainerRestart(f, p2.Name, p2.Name)
devIdRestart2 = parseLog(f, p2.Name, p2.Name, deviceIDRE)
Expect(devIdRestart2).To(Equal(devId2)) Expect(devIdRestart2).To(Equal(devId2))
logDevicePluginMetrics() logDevicePluginMetrics()