Merge pull request #98181 from wojtek-t/deflake_network_tests

Attempt to deflake networking tests in large clusters
2025-07-29 14:37:00 +00:00 · 2021-01-20 01:19:59 -08:00 · 2021-01-20 01:19:59 -08:00 · 1e21a2eb35
commit 1e21a2eb35
parent 48b2edf0cb fa0b7dee9e
2 changed files with 22 additions and 18 deletions
--- a/test/e2e/framework/service/const.go
+++ b/test/e2e/framework/service/const.go
@ -51,10 +51,10 @@ const (
 	// LoadBalancerCreateTimeoutDefault is the default time to wait for a load balancer to be created/modified.
 	// TODO: once support ticket 21807001 is resolved, reduce this timeout back to something reasonable
 	// Hideen - use GetServiceLoadBalancerCreateTimeout function instead.
-	loadBalancerCreateTimeoutDefault = 20 * time.Minute
+	loadBalancerCreateTimeoutDefault = 10 * time.Minute
 	// LoadBalancerCreateTimeoutLarge is the maximum time to wait for a load balancer to be created/modified.
 	// Hideen - use GetServiceLoadBalancerCreateTimeout function instead.
-	loadBalancerCreateTimeoutLarge = 2 * time.Hour
+	loadBalancerCreateTimeoutLarge = 45 * time.Minute

 	// LoadBalancerPropagationTimeoutDefault is the default time to wait for pods to
 	// be targeted by load balancers.
--- a/test/e2e/network/service.go
+++ b/test/e2e/network/service.go
@ -2151,10 +2151,6 @@ var _ = SIGDescribe("Services", func() {
 		// this feature currently supported only on GCE/GKE/AWS
 		e2eskipper.SkipUnlessProviderIs("gce", "gke", "aws")

-		loadBalancerLagTimeout := e2eservice.LoadBalancerLagTimeoutDefault
-		if framework.ProviderIs("aws") {
-			loadBalancerLagTimeout = e2eservice.LoadBalancerLagTimeoutAWS
-		}
 		loadBalancerCreateTimeout := e2eservice.GetServiceLoadBalancerCreationTimeout(cs)

 		namespace := f.Namespace.Name
@ -2193,17 +2189,16 @@ var _ = SIGDescribe("Services", func() {
 		svc, err = jig.WaitForLoadBalancer(loadBalancerCreateTimeout)
 		framework.ExpectNoError(err)

-		// timeout when we haven't just created the load balancer
-		normalReachabilityTimeout := 2 * time.Minute
-
 		ginkgo.By("check reachability from different sources")
 		svcIP := e2eservice.GetIngressPoint(&svc.Status.LoadBalancer.Ingress[0])
-		// Wait longer as this is our first request after creation.  We can't check using a separate method,
-		// because the LB should only be reachable from the "accept" pod
-		checkReachabilityFromPod(true, loadBalancerLagTimeout, namespace, acceptPod.Name, svcIP)
-		checkReachabilityFromPod(false, normalReachabilityTimeout, namespace, dropPod.Name, svcIP)
+		// We should wait until service changes are actually propagated in the cloud-provider,
+		// as this may take significant amount of time, especially in large clusters.
+		// However, the information whether it was already programmed isn't achievable.
+		// So we're resolving it by using loadBalancerCreateTimeout that takes cluster size into account.
+		checkReachabilityFromPod(true, loadBalancerCreateTimeout, namespace, acceptPod.Name, svcIP)
+		checkReachabilityFromPod(false, loadBalancerCreateTimeout, namespace, dropPod.Name, svcIP)

-		// Make sure dropPod is running. There are certain chances that the pod might be teminated due to unexpected reasons.		dropPod, err = cs.CoreV1().Pods(namespace).Get(dropPod.Name, metav1.GetOptions{})
+		// Make sure dropPod is running. There are certain chances that the pod might be teminated due to unexpected reasons.
 		dropPod, err = cs.CoreV1().Pods(namespace).Get(context.TODO(), dropPod.Name, metav1.GetOptions{})
 		framework.ExpectNoError(err, "Unable to get pod %s", dropPod.Name)
 		framework.ExpectEqual(acceptPod.Status.Phase, v1.PodRunning)
@ -2215,16 +2210,25 @@ var _ = SIGDescribe("Services", func() {
 			svc.Spec.LoadBalancerSourceRanges = []string{dropPod.Status.PodIP + "/32"}
 		})
 		framework.ExpectNoError(err)
-		checkReachabilityFromPod(false, normalReachabilityTimeout, namespace, acceptPod.Name, svcIP)
-		checkReachabilityFromPod(true, normalReachabilityTimeout, namespace, dropPod.Name, svcIP)
+
+		// We should wait until service changes are actually propagates, as this may take
+		// significant amount of time, especially in large clusters.
+		// However, the information whether it was already programmed isn't achievable.
+		// So we're resolving it by using loadBalancerCreateTimeout that takes cluster size into account.
+		checkReachabilityFromPod(false, loadBalancerCreateTimeout, namespace, acceptPod.Name, svcIP)
+		checkReachabilityFromPod(true, loadBalancerCreateTimeout, namespace, dropPod.Name, svcIP)

 		ginkgo.By("Delete LoadBalancerSourceRange field and check reachability")
 		_, err = jig.UpdateService(func(svc *v1.Service) {
 			svc.Spec.LoadBalancerSourceRanges = nil
 		})
 		framework.ExpectNoError(err)
-		checkReachabilityFromPod(true, normalReachabilityTimeout, namespace, acceptPod.Name, svcIP)
-		checkReachabilityFromPod(true, normalReachabilityTimeout, namespace, dropPod.Name, svcIP)
+		// We should wait until service changes are actually propagates, as this may take
+		// significant amount of time, especially in large clusters.
+		// However, the information whether it was already programmed isn't achievable.
+		// So we're resolving it by using loadBalancerCreateTimeout that takes cluster size into account.
+		checkReachabilityFromPod(true, loadBalancerCreateTimeout, namespace, acceptPod.Name, svcIP)
+		checkReachabilityFromPod(true, loadBalancerCreateTimeout, namespace, dropPod.Name, svcIP)
 	})

 	ginkgo.It("should be able to create an internal type load balancer [Slow]", func() {