Merge pull request #64356 from losipiuk/lo/e2e-tests-gpu-cas-test-all-types

Automatic merge from submit-queue (batch tested with PRs 64383, 64356, 64390). If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

E2E tests of cluster autoscaler with different GPU types

* Iterate over all supported GPU types (k80, p100, and v100 for now) when running the cluster autoscaler tests that verify GPU and CA interoperability; the registration pattern is sketched below.
* Verify that the GPU pods are actually scaled up (not just that a node is added) in the "scale from 1" e2e test.
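
A minimal, self-contained sketch of the per-GPU-type registration pattern, for readers unfamiliar with it. The `scaleUpGpuPool` helper, the package name, and the test bootstrap are hypothetical stand-ins rather than the real e2e helpers, and Ginkgo/Gomega v1 imports are assumed:

```go
package autoscaling_test

import (
	"fmt"
	"testing"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
)

// scaleUpGpuPool is a hypothetical stand-in for the real e2e helpers
// (addGpuNodePool, scheduleGpuPod, ...); it reports how many nodes the
// pretend GPU pool ends up with after a scale-up.
func scaleUpGpuPool(gpuType string) int {
	_ = gpuType // a real test would create a node pool with this accelerator type
	return 1
}

var _ = Describe("Cluster size autoscaling (sketch)", func() {
	supportedGpuTypes := []string{"nvidia-tesla-k80", "nvidia-tesla-v100", "nvidia-tesla-p100"}
	for _, gpuType := range supportedGpuTypes {
		gpuType := gpuType // pin the range variable: the It bodies run after the loop has finished

		It(fmt.Sprintf("Should scale up GPU pool from 0 [GpuType:%s]", gpuType), func() {
			Expect(scaleUpGpuPool(gpuType)).To(Equal(1))
		})
	}
})

// TestSketch wires Gomega failures into Ginkgo and runs the registered specs.
func TestSketch(t *testing.T) {
	RegisterFailHandler(Fail)
	RunSpecs(t, "Cluster autoscaler GPU sketch")
}
```

The spec names are formatted eagerly, so each entry gets its own `[GpuType:...]` label, while the spec bodies only run later; that is why the sketch pins the range variable inside the loop.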

```release-note
NONE
```
commit 9f5b76c9c8
Kubernetes Submit Queue (2018-05-28 06:01:05 -07:00), committed by GitHub


The changed hunk (`@@ -207,101 +207,106 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {`), shown in its post-merge form:

```go
	It("should increase cluster size if pending pods are small [Feature:ClusterSizeAutoscalingScaleUp]",
		func() { simpleScaleUpTest(0) })

	supportedGpuTypes := []string{"nvidia-tesla-k80", "nvidia-tesla-v100", "nvidia-tesla-p100"}
	for _, gpuType := range supportedGpuTypes {
		It(fmt.Sprintf("Should scale up GPU pool from 0 [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
			framework.SkipUnlessProviderIs("gke")

			const gpuPoolName = "gpu-pool"
			addGpuNodePool(gpuPoolName, gpuType, 1, 0)
			defer deleteNodePool(gpuPoolName)
			installNvidiaDriversDaemonSet()

			By("Enable autoscaler")
			framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
			defer disableAutoscaler(gpuPoolName, 0, 1)
			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))

			By("Schedule a pod which requires GPU")
			framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))

			framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
				func(size int) bool { return size == nodeCount+1 }, scaleUpTimeout))
			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
		})

		It(fmt.Sprintf("Should scale up GPU pool from 1 [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
			framework.SkipUnlessProviderIs("gke")

			const gpuPoolName = "gpu-pool"
			addGpuNodePool(gpuPoolName, gpuType, 1, 1)
			defer deleteNodePool(gpuPoolName)
			installNvidiaDriversDaemonSet()

			By("Schedule a single pod which requires GPU")
			framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))

			By("Enable autoscaler")
			framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 2))
			defer disableAutoscaler(gpuPoolName, 0, 2)
			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))

			By("Scale GPU deployment")
			framework.ScaleRC(f.ClientSet, f.ScalesGetter, f.Namespace.Name, "gpu-pod-rc", 2, true)

			framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
				func(size int) bool { return size == nodeCount+2 }, scaleUpTimeout))
			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(2))
		})

		It(fmt.Sprintf("Should not scale GPU pool up if pod does not require GPUs [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
			framework.SkipUnlessProviderIs("gke")

			const gpuPoolName = "gpu-pool"
			addGpuNodePool(gpuPoolName, gpuType, 1, 0)
			defer deleteNodePool(gpuPoolName)
			installNvidiaDriversDaemonSet()

			By("Enable autoscaler")
			framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
			defer disableAutoscaler(gpuPoolName, 0, 1)
			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))

			By("Schedule bunch of pods beyond point of filling default pool but do not request any GPUs")
			ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second)
			defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "memory-reservation")

			// Verify that cluster size is increased
			framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
				func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout))

			// Expect gpu pool to stay intact
			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
		})

		It(fmt.Sprintf("Should scale down GPU pool from 1 [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
			framework.SkipUnlessProviderIs("gke")

			const gpuPoolName = "gpu-pool"
			addGpuNodePool(gpuPoolName, gpuType, 1, 1)
			defer deleteNodePool(gpuPoolName)
			installNvidiaDriversDaemonSet()

			By("Schedule a single pod which requires GPU")
			framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))

			By("Enable autoscaler")
			framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
			defer disableAutoscaler(gpuPoolName, 0, 1)
			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))

			By("Remove the only POD requiring GPU")
			framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "gpu-pod-rc")

			framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
				func(size int) bool { return size == nodeCount }, scaleDownTimeout))
			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
		})
	}

	It("should increase cluster size if pending pods are small and one node is broken [Feature:ClusterSizeAutoscalingScaleUp]",
		func() {
```