Run cluster-autoscaler + GPU e2e tests for all GPU types

This commit is contained in:
Łukasz Osipiuk 2018-05-24 13:24:58 +02:00
parent 9872a0502b
commit 350d2c2402

View File

@@ -207,11 +207,14 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
It("should increase cluster size if pending pods are small [Feature:ClusterSizeAutoscalingScaleUp]", It("should increase cluster size if pending pods are small [Feature:ClusterSizeAutoscalingScaleUp]",
func() { simpleScaleUpTest(0) }) func() { simpleScaleUpTest(0) })
It("Should scale up GPU pool from 0 [Feature:ClusterSizeAutoscalingGpu]", func() { supportedGpuTypes := []string{"nvidia-tesla-k80", "nvidia-tesla-v100", "nvidia-tesla-p100"}
for _, gpuType := range supportedGpuTypes {
It(fmt.Sprintf("Should scale up GPU pool from 0 [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
framework.SkipUnlessProviderIs("gke") framework.SkipUnlessProviderIs("gke")
const gpuPoolName = "gpu-pool" const gpuPoolName = "gpu-pool"
addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 0) addGpuNodePool(gpuPoolName, gpuType, 1, 0)
defer deleteNodePool(gpuPoolName) defer deleteNodePool(gpuPoolName)
installNvidiaDriversDaemonSet() installNvidiaDriversDaemonSet()
@@ -229,11 +232,11 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1)) Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
}) })
It("Should scale up GPU pool from 1 [Feature:ClusterSizeAutoscalingGpu]", func() { It(fmt.Sprintf("Should scale up GPU pool from 1 [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
framework.SkipUnlessProviderIs("gke") framework.SkipUnlessProviderIs("gke")
const gpuPoolName = "gpu-pool" const gpuPoolName = "gpu-pool"
addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 1) addGpuNodePool(gpuPoolName, gpuType, 1, 1)
defer deleteNodePool(gpuPoolName) defer deleteNodePool(gpuPoolName)
installNvidiaDriversDaemonSet() installNvidiaDriversDaemonSet()
@@ -246,6 +249,7 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
defer disableAutoscaler(gpuPoolName, 0, 2) defer disableAutoscaler(gpuPoolName, 0, 2)
Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1)) Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
By("Scale GPU deployment")
framework.ScaleRC(f.ClientSet, f.ScalesGetter, f.Namespace.Name, "gpu-pod-rc", 2, false) framework.ScaleRC(f.ClientSet, f.ScalesGetter, f.Namespace.Name, "gpu-pod-rc", 2, false)
framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet, framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
@@ -253,11 +257,11 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(2)) Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(2))
}) })
It("Should not scale GPU pool up if pod does not require GPUs [Feature:ClusterSizeAutoscalingGpu]", func() { It(fmt.Sprintf("Should not scale GPU pool up if pod does not require GPUs [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
framework.SkipUnlessProviderIs("gke") framework.SkipUnlessProviderIs("gke")
const gpuPoolName = "gpu-pool" const gpuPoolName = "gpu-pool"
addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 0) addGpuNodePool(gpuPoolName, gpuType, 1, 0)
defer deleteNodePool(gpuPoolName) defer deleteNodePool(gpuPoolName)
installNvidiaDriversDaemonSet() installNvidiaDriversDaemonSet()
@@ -270,7 +274,6 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
By("Schedule bunch of pods beyond point of filling default pool but do not request any GPUs") By("Schedule bunch of pods beyond point of filling default pool but do not request any GPUs")
ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second) ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second)
defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "memory-reservation") defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "memory-reservation")
// Verify that cluster size is increased // Verify that cluster size is increased
framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet, framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout)) func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout))
@@ -279,11 +282,11 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0)) Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
}) })
It("Should scale down GPU pool from 1 [Feature:ClusterSizeAutoscalingGpu]", func() { It(fmt.Sprintf("Should scale down GPU pool from 1 [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
framework.SkipUnlessProviderIs("gke") framework.SkipUnlessProviderIs("gke")
const gpuPoolName = "gpu-pool" const gpuPoolName = "gpu-pool"
addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 1) addGpuNodePool(gpuPoolName, gpuType, 1, 1)
defer deleteNodePool(gpuPoolName) defer deleteNodePool(gpuPoolName)
installNvidiaDriversDaemonSet() installNvidiaDriversDaemonSet()
@@ -296,12 +299,14 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
defer disableAutoscaler(gpuPoolName, 0, 1) defer disableAutoscaler(gpuPoolName, 0, 1)
Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1)) Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
By("Remove the only POD requiring GPU")
framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "gpu-pod-rc") framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "gpu-pod-rc")
framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet, framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
func(size int) bool { return size == nodeCount }, scaleDownTimeout)) func(size int) bool { return size == nodeCount }, scaleDownTimeout))
Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0)) Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
}) })
}
It("should increase cluster size if pending pods are small and one node is broken [Feature:ClusterSizeAutoscalingScaleUp]", It("should increase cluster size if pending pods are small and one node is broken [Feature:ClusterSizeAutoscalingScaleUp]",
func() { func() {