Merge pull request #64356 from losipiuk/lo/e2e-tests-gpu-cas-test-all-types

Automatic merge from submit-queue (batch tested with PRs 64383, 64356, 64390). If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

E2E tests of cluster autoscaler with different GPU types

* Iterate over all supported GPU types (k80, p100, and v100 for now) when running the cluster autoscaler tests that verify GPU and CA interoperability.
* Verify that GPU pods are actually scaled up in the "scale from 1" e2e test; a sketch of the per-type test-generation pattern follows the release note below.

```release-note
NONE
```
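
Because Ginkgo's `It` registers a closure during spec construction and runs it only after the whole `for` loop has finished, each iteration must rebind `gpuType` before the closure captures it. Here is a minimal, self-contained sketch of that register-then-run pattern; the `testCase` type and `main` harness are illustrative stand-ins, not code from this PR:

```go
package main

import "fmt"

// testCase mimics Ginkgo's two-phase model: It() registers a named closure
// during the spec-building phase, and the runner executes it later, after
// the registration loop has already finished.
type testCase struct {
	name string
	run  func()
}

func main() {
	supportedGpuTypes := []string{"nvidia-tesla-k80", "nvidia-tesla-v100", "nvidia-tesla-p100"}

	var registered []testCase
	for _, gpuType := range supportedGpuTypes {
		gpuType := gpuType // rebind so each closure captures its own value
		registered = append(registered, testCase{
			name: fmt.Sprintf("Should scale up GPU pool from 0 [GpuType:%s]", gpuType),
			run: func() {
				// The real test would call addGpuNodePool(gpuPoolName, gpuType, 1, 0)
				// and assert that the autoscaler brings up a node of this type.
				fmt.Println("running against", gpuType)
			},
		})
	}

	// Without the rebinding above (before Go 1.22 changed loop-variable
	// scoping), every closure would observe the last element, "nvidia-tesla-p100".
	for _, tc := range registered {
		fmt.Println(tc.name)
		tc.run()
	}
}
```

Embedding the type in the spec name as `[GpuType:...]` also means a single type can be selected at run time with Ginkgo's focus regex (e.g. `--ginkgo.focus='GpuType:nvidia-tesla-v100'`).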
Commit 9f5b76c9c8 by Kubernetes Submit Queue, 2018-05-28 06:01:05 -07:00, committed by GitHub.

test/e2e/autoscaling/cluster_size_autoscaling.go

```diff
@@ -207,11 +207,14 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
 	It("should increase cluster size if pending pods are small [Feature:ClusterSizeAutoscalingScaleUp]",
 		func() { simpleScaleUpTest(0) })
 
-	It("Should scale up GPU pool from 0 [Feature:ClusterSizeAutoscalingGpu]", func() {
+	supportedGpuTypes := []string{"nvidia-tesla-k80", "nvidia-tesla-v100", "nvidia-tesla-p100"}
+	for _, gpuType := range supportedGpuTypes {
+		gpuType := gpuType // capture range variable for the closures registered below
+		It(fmt.Sprintf("Should scale up GPU pool from 0 [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
 			framework.SkipUnlessProviderIs("gke")
 			const gpuPoolName = "gpu-pool"
-			addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 0)
+			addGpuNodePool(gpuPoolName, gpuType, 1, 0)
 			defer deleteNodePool(gpuPoolName)
 			installNvidiaDriversDaemonSet()
@@ -229,11 +232,11 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
 			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
 		})
 
-	It("Should scale up GPU pool from 1 [Feature:ClusterSizeAutoscalingGpu]", func() {
+		It(fmt.Sprintf("Should scale up GPU pool from 1 [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
 			framework.SkipUnlessProviderIs("gke")
 			const gpuPoolName = "gpu-pool"
-			addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 1)
+			addGpuNodePool(gpuPoolName, gpuType, 1, 1)
 			defer deleteNodePool(gpuPoolName)
 			installNvidiaDriversDaemonSet()
@@ -246,18 +249,19 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
 			defer disableAutoscaler(gpuPoolName, 0, 2)
 			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
 
-			framework.ScaleRC(f.ClientSet, f.ScalesGetter, f.Namespace.Name, "gpu-pod-rc", 2, false)
+			By("Scale GPU deployment")
+			framework.ScaleRC(f.ClientSet, f.ScalesGetter, f.Namespace.Name, "gpu-pod-rc", 2, true)
 			framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
 				func(size int) bool { return size == nodeCount+2 }, scaleUpTimeout))
 			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(2))
 		})
 
-	It("Should not scale GPU pool up if pod does not require GPUs [Feature:ClusterSizeAutoscalingGpu]", func() {
+		It(fmt.Sprintf("Should not scale GPU pool up if pod does not require GPUs [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
 			framework.SkipUnlessProviderIs("gke")
 			const gpuPoolName = "gpu-pool"
-			addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 0)
+			addGpuNodePool(gpuPoolName, gpuType, 1, 0)
 			defer deleteNodePool(gpuPoolName)
 			installNvidiaDriversDaemonSet()
@@ -270,7 +274,6 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
 			By("Schedule bunch of pods beyond point of filling default pool but do not request any GPUs")
 			ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second)
 			defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "memory-reservation")
-
 			// Verify that cluster size is increased
 			framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
 				func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout))
@@ -279,11 +282,11 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
 			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
 		})
 
-	It("Should scale down GPU pool from 1 [Feature:ClusterSizeAutoscalingGpu]", func() {
+		It(fmt.Sprintf("Should scale down GPU pool from 1 [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
 			framework.SkipUnlessProviderIs("gke")
 			const gpuPoolName = "gpu-pool"
-			addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 1)
+			addGpuNodePool(gpuPoolName, gpuType, 1, 1)
 			defer deleteNodePool(gpuPoolName)
 			installNvidiaDriversDaemonSet()
@@ -296,12 +299,14 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
 			defer disableAutoscaler(gpuPoolName, 0, 1)
 			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
 
+			By("Remove the only POD requiring GPU")
 			framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "gpu-pod-rc")
 			framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
 				func(size int) bool { return size == nodeCount }, scaleDownTimeout))
 			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
 		})
+	}
 
 	It("should increase cluster size if pending pods are small and one node is broken [Feature:ClusterSizeAutoscalingScaleUp]",
 		func() {
```
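
All of the size assertions in this diff funnel through `WaitForClusterSizeFunc`, a poll-until-predicate helper from the e2e framework. The sketch below is a simplified approximation of that pattern, not the framework's actual implementation; `waitForClusterSize` and the toy `getSize` source are illustrative:

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// waitForClusterSize polls a size-reporting function until the predicate
// holds or the timeout expires, roughly mirroring WaitForClusterSizeFunc.
func waitForClusterSize(getSize func() (int, error), pred func(size int) bool, timeout, interval time.Duration) error {
	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		if size, err := getSize(); err == nil && pred(size) {
			return nil
		}
		time.Sleep(interval)
	}
	return errors.New("cluster did not reach the expected size in time")
}

func main() {
	// Toy size source that "scales up" shortly after start.
	start := time.Now()
	getSize := func() (int, error) {
		if time.Since(start) > 50*time.Millisecond {
			return 3, nil
		}
		return 2, nil
	}
	err := waitForClusterSize(getSize, func(size int) bool { return size == 3 },
		time.Second, 10*time.Millisecond)
	fmt.Println("scale-up observed:", err == nil)
}
```

Taking the predicate as a function is what lets the tests express both exact targets (`size == nodeCount+2`) and lower bounds (`size >= nodeCount+1`) with the same helper.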