Merge pull request #64356 from losipiuk/lo/e2e-tests-gpu-cas-test-all-types

Automatic merge from submit-queue (batch tested with PRs 64383, 64356, 64390). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>. E2E tests of cluster autoscaler with different GPU types * Iterate over all supported GPU types (k80, p100, v100 for now) when running cluster autoscaler tests which verify GPU and CA interoperability. * Verify that GPU pods are actually scaled up in "scale from 1" E2E test ```release-note NONE ```
2025-07-29 14:37:00 +00:00 · 2018-05-28 06:01:05 -07:00 · 2018-05-28 06:01:05 -07:00 · 9f5b76c9c8
commit 9f5b76c9c8
parent cf27c256ea 3c8bd9ae24
1 changed files with 74 additions and 69 deletions
--- a/test/e2e/autoscaling/cluster_size_autoscaling.go
+++ b/test/e2e/autoscaling/cluster_size_autoscaling.go
@ -207,101 +207,106 @@ var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
 	It("should increase cluster size if pending pods are small [Feature:ClusterSizeAutoscalingScaleUp]",
 		func() { simpleScaleUpTest(0) })
-	It("Should scale up GPU pool from 0 [Feature:ClusterSizeAutoscalingGpu]", func() {
+	supportedGpuTypes := []string{"nvidia-tesla-k80", "nvidia-tesla-v100", "nvidia-tesla-p100"}
-		framework.SkipUnlessProviderIs("gke")
+	for _, gpuType := range supportedGpuTypes {
-		const gpuPoolName = "gpu-pool"
+		It(fmt.Sprintf("Should scale up GPU pool from 0 [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
-		addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 0)
+			framework.SkipUnlessProviderIs("gke")
 		defer deleteNodePool(gpuPoolName)
-		installNvidiaDriversDaemonSet()
+			const gpuPoolName = "gpu-pool"
 			addGpuNodePool(gpuPoolName, gpuType, 1, 0)
 			defer deleteNodePool(gpuPoolName)
-		By("Enable autoscaler")
+			installNvidiaDriversDaemonSet()
 		framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
 		defer disableAutoscaler(gpuPoolName, 0, 1)
 		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
-		By("Schedule a pod which requires GPU")
+			By("Enable autoscaler")
-		framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
+			framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
 			defer disableAutoscaler(gpuPoolName, 0, 1)
 			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
-		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
+			By("Schedule a pod which requires GPU")
-			func(size int) bool { return size == nodeCount+1 }, scaleUpTimeout))
+			framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
 		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
 	})
-	It("Should scale up GPU pool from 1 [Feature:ClusterSizeAutoscalingGpu]", func() {
+			framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
-		framework.SkipUnlessProviderIs("gke")
+				func(size int) bool { return size == nodeCount+1 }, scaleUpTimeout))
 			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
 		})
-		const gpuPoolName = "gpu-pool"
+		It(fmt.Sprintf("Should scale up GPU pool from 1 [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
-		addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 1)
+			framework.SkipUnlessProviderIs("gke")
 		defer deleteNodePool(gpuPoolName)
-		installNvidiaDriversDaemonSet()
+			const gpuPoolName = "gpu-pool"
 			addGpuNodePool(gpuPoolName, gpuType, 1, 1)
 			defer deleteNodePool(gpuPoolName)
-		By("Schedule a single pod which requires GPU")
+			installNvidiaDriversDaemonSet()
 		framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
-		By("Enable autoscaler")
+			By("Schedule a single pod which requires GPU")
-		framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 2))
+			framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
 		defer disableAutoscaler(gpuPoolName, 0, 2)
 		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
-		framework.ScaleRC(f.ClientSet, f.ScalesGetter, f.Namespace.Name, "gpu-pod-rc", 2, false)
+			By("Enable autoscaler")
 			framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 2))
 			defer disableAutoscaler(gpuPoolName, 0, 2)
 			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
-		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
+			By("Scale GPU deployment")
-			func(size int) bool { return size == nodeCount+2 }, scaleUpTimeout))
+			framework.ScaleRC(f.ClientSet, f.ScalesGetter, f.Namespace.Name, "gpu-pod-rc", 2, true)
 		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(2))
 	})
-	It("Should not scale GPU pool up if pod does not require GPUs [Feature:ClusterSizeAutoscalingGpu]", func() {
+			framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
-		framework.SkipUnlessProviderIs("gke")
+				func(size int) bool { return size == nodeCount+2 }, scaleUpTimeout))
 			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(2))
 		})
-		const gpuPoolName = "gpu-pool"
+		It(fmt.Sprintf("Should not scale GPU pool up if pod does not require GPUs [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
-		addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 0)
+			framework.SkipUnlessProviderIs("gke")
 		defer deleteNodePool(gpuPoolName)
-		installNvidiaDriversDaemonSet()
+			const gpuPoolName = "gpu-pool"
 			addGpuNodePool(gpuPoolName, gpuType, 1, 0)
 			defer deleteNodePool(gpuPoolName)
-		By("Enable autoscaler")
+			installNvidiaDriversDaemonSet()
 		framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
 		defer disableAutoscaler(gpuPoolName, 0, 1)
 		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
-		By("Schedule bunch of pods beyond point of filling default pool but do not request any GPUs")
+			By("Enable autoscaler")
-		ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second)
+			framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
-		defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "memory-reservation")
+			defer disableAutoscaler(gpuPoolName, 0, 1)
 			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
-		// Verify that cluster size is increased
+			By("Schedule bunch of pods beyond point of filling default pool but do not request any GPUs")
-		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
+			ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second)
-			func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout))
+			defer framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "memory-reservation")
 			// Verify that cluster size is increased
 			framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
 				func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout))
-		// Expect gpu pool to stay intact
+			// Expect gpu pool to stay intact
-		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
+			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
-	})
+		})
-	It("Should scale down GPU pool from 1 [Feature:ClusterSizeAutoscalingGpu]", func() {
+		It(fmt.Sprintf("Should scale down GPU pool from 1 [GpuType:%s] [Feature:ClusterSizeAutoscalingGpu]", gpuType), func() {
-		framework.SkipUnlessProviderIs("gke")
+			framework.SkipUnlessProviderIs("gke")
-		const gpuPoolName = "gpu-pool"
+			const gpuPoolName = "gpu-pool"
-		addGpuNodePool(gpuPoolName, "nvidia-tesla-k80", 1, 1)
+			addGpuNodePool(gpuPoolName, gpuType, 1, 1)
-		defer deleteNodePool(gpuPoolName)
+			defer deleteNodePool(gpuPoolName)
-		installNvidiaDriversDaemonSet()
+			installNvidiaDriversDaemonSet()
-		By("Schedule a single pod which requires GPU")
+			By("Schedule a single pod which requires GPU")
-		framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
+			framework.ExpectNoError(scheduleGpuPod(f, "gpu-pod-rc"))
-		By("Enable autoscaler")
+			By("Enable autoscaler")
-		framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
+			framework.ExpectNoError(enableAutoscaler(gpuPoolName, 0, 1))
-		defer disableAutoscaler(gpuPoolName, 0, 1)
+			defer disableAutoscaler(gpuPoolName, 0, 1)
-		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
+			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(1))
-		framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "gpu-pod-rc")
+			By("Remove the only POD requiring GPU")
 			framework.DeleteRCAndWaitForGC(f.ClientSet, f.Namespace.Name, "gpu-pod-rc")
-		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
+			framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
-			func(size int) bool { return size == nodeCount }, scaleDownTimeout))
+				func(size int) bool { return size == nodeCount }, scaleDownTimeout))
-		Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
+			Expect(len(getPoolNodes(f, gpuPoolName))).Should(Equal(0))
-	})
+		})
 	}
 	It("should increase cluster size if pending pods are small and one node is broken [Feature:ClusterSizeAutoscalingScaleUp]",
 		func() {