Attempt to deflake networking tests in large clusters

wojtekt 2021-01-19 16:20:24 +01:00
parent a410c14020
commit fa0b7dee9e
2 changed files with 22 additions and 18 deletions


@@ -51,10 +51,10 @@ const (
 	// LoadBalancerCreateTimeoutDefault is the default time to wait for a load balancer to be created/modified.
 	// TODO: once support ticket 21807001 is resolved, reduce this timeout back to something reasonable
 	// Hidden - use GetServiceLoadBalancerCreateTimeout function instead.
-	loadBalancerCreateTimeoutDefault = 20 * time.Minute
+	loadBalancerCreateTimeoutDefault = 10 * time.Minute
 	// LoadBalancerCreateTimeoutLarge is the maximum time to wait for a load balancer to be created/modified.
 	// Hidden - use GetServiceLoadBalancerCreateTimeout function instead.
-	loadBalancerCreateTimeoutLarge = 2 * time.Hour
+	loadBalancerCreateTimeoutLarge = 45 * time.Minute
 	// LoadBalancerPropagationTimeoutDefault is the default time to wait for pods to
 	// be targeted by load balancers.
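
These constants are read through e2eservice.GetServiceLoadBalancerCreationTimeout rather than used directly, so the large-cluster bound can change in one place. A minimal sketch of that kind of cluster-size-aware selection, assuming a hypothetical node-count threshold (the helper name and cutoff below are illustrative, not the framework's actual implementation):

    package main

    import (
        "fmt"
        "time"
    )

    const (
        loadBalancerCreateTimeoutDefault = 10 * time.Minute
        loadBalancerCreateTimeoutLarge   = 45 * time.Minute
        // largeClusterNodeThreshold is an assumed cutoff, not the framework's real value.
        largeClusterNodeThreshold = 50
    )

    // createTimeoutForClusterSize returns the larger bound once the cluster crosses
    // the assumed node-count threshold.
    func createTimeoutForClusterSize(readyNodes int) time.Duration {
        if readyNodes > largeClusterNodeThreshold {
            return loadBalancerCreateTimeoutLarge
        }
        return loadBalancerCreateTimeoutDefault
    }

    func main() {
        fmt.Println(createTimeoutForClusterSize(3))    // 10m0s
        fmt.Println(createTimeoutForClusterSize(5000)) // 45m0s
    }

Keeping the selection in one helper is what lets this commit lower the bounds (20m to 10m, 2h to 45m) without touching every caller.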


@@ -2151,10 +2151,6 @@ var _ = SIGDescribe("Services", func() {
 		// this feature currently supported only on GCE/GKE/AWS
 		e2eskipper.SkipUnlessProviderIs("gce", "gke", "aws")
-		loadBalancerLagTimeout := e2eservice.LoadBalancerLagTimeoutDefault
-		if framework.ProviderIs("aws") {
-			loadBalancerLagTimeout = e2eservice.LoadBalancerLagTimeoutAWS
-		}
 		loadBalancerCreateTimeout := e2eservice.GetServiceLoadBalancerCreationTimeout(cs)
 		namespace := f.Namespace.Name
@@ -2193,17 +2189,16 @@ var _ = SIGDescribe("Services", func() {
 		svc, err = jig.WaitForLoadBalancer(loadBalancerCreateTimeout)
 		framework.ExpectNoError(err)
-		// timeout when we haven't just created the load balancer
-		normalReachabilityTimeout := 2 * time.Minute
 		ginkgo.By("check reachability from different sources")
 		svcIP := e2eservice.GetIngressPoint(&svc.Status.LoadBalancer.Ingress[0])
-		// Wait longer as this is our first request after creation. We can't check using a separate method,
-		// because the LB should only be reachable from the "accept" pod
-		checkReachabilityFromPod(true, loadBalancerLagTimeout, namespace, acceptPod.Name, svcIP)
-		checkReachabilityFromPod(false, normalReachabilityTimeout, namespace, dropPod.Name, svcIP)
+		// We should wait until service changes are actually propagated in the cloud provider,
+		// as this may take a significant amount of time, especially in large clusters.
+		// However, the information whether it was already programmed isn't available.
+		// So we're resolving it by using loadBalancerCreateTimeout, which takes cluster size into account.
+		checkReachabilityFromPod(true, loadBalancerCreateTimeout, namespace, acceptPod.Name, svcIP)
+		checkReachabilityFromPod(false, loadBalancerCreateTimeout, namespace, dropPod.Name, svcIP)
 		// Make sure dropPod is running. There are certain chances that the pod might be terminated due to unexpected reasons.
 		dropPod, err = cs.CoreV1().Pods(namespace).Get(context.TODO(), dropPod.Name, metav1.GetOptions{})
 		framework.ExpectNoError(err, "Unable to get pod %s", dropPod.Name)
 		framework.ExpectEqual(acceptPod.Status.Phase, v1.PodRunning)
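
The comments added above capture the approach: there is no API that tells the test when the cloud provider has finished programming the load balancer, so reachability is simply polled for up to loadBalancerCreateTimeout. A rough sketch of that polling pattern using a plain TCP probe (illustrative only; the real checkReachabilityFromPod runs its probe from inside the accept/drop pods):

    package main

    import (
        "errors"
        "fmt"
        "net"
        "time"
    )

    // waitForReachability polls a TCP connect until the observed result matches the
    // expected one or the timeout expires, mirroring the poll-until-converged idea
    // behind checkReachabilityFromPod.
    func waitForReachability(addr string, wantReachable bool, timeout time.Duration) error {
        deadline := time.Now().Add(timeout)
        for time.Now().Before(deadline) {
            conn, err := net.DialTimeout("tcp", addr, 5*time.Second)
            if err == nil {
                conn.Close()
            }
            if (err == nil) == wantReachable {
                return nil
            }
            // Give the cloud provider time to converge before probing again.
            time.Sleep(10 * time.Second)
        }
        return errors.New("reachability did not converge before the timeout")
    }

    func main() {
        // Illustrative address and timeout; in the test the timeout is cluster-size aware.
        fmt.Println(waitForReachability("203.0.113.10:80", true, 10*time.Minute))
    }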
@@ -2215,16 +2210,25 @@ var _ = SIGDescribe("Services", func() {
 			svc.Spec.LoadBalancerSourceRanges = []string{dropPod.Status.PodIP + "/32"}
 		})
 		framework.ExpectNoError(err)
-		checkReachabilityFromPod(false, normalReachabilityTimeout, namespace, acceptPod.Name, svcIP)
-		checkReachabilityFromPod(true, normalReachabilityTimeout, namespace, dropPod.Name, svcIP)
+		// We should wait until service changes are actually propagated, as this may take
+		// a significant amount of time, especially in large clusters.
+		// However, the information whether it was already programmed isn't available.
+		// So we're resolving it by using loadBalancerCreateTimeout, which takes cluster size into account.
+		checkReachabilityFromPod(false, loadBalancerCreateTimeout, namespace, acceptPod.Name, svcIP)
+		checkReachabilityFromPod(true, loadBalancerCreateTimeout, namespace, dropPod.Name, svcIP)
 		ginkgo.By("Delete LoadBalancerSourceRange field and check reachability")
 		_, err = jig.UpdateService(func(svc *v1.Service) {
 			svc.Spec.LoadBalancerSourceRanges = nil
 		})
 		framework.ExpectNoError(err)
-		checkReachabilityFromPod(true, normalReachabilityTimeout, namespace, acceptPod.Name, svcIP)
-		checkReachabilityFromPod(true, normalReachabilityTimeout, namespace, dropPod.Name, svcIP)
+		// We should wait until service changes are actually propagated, as this may take
+		// a significant amount of time, especially in large clusters.
+		// However, the information whether it was already programmed isn't available.
+		// So we're resolving it by using loadBalancerCreateTimeout, which takes cluster size into account.
+		checkReachabilityFromPod(true, loadBalancerCreateTimeout, namespace, acceptPod.Name, svcIP)
+		checkReachabilityFromPod(true, loadBalancerCreateTimeout, namespace, dropPod.Name, svcIP)
 	})
 	ginkgo.It("should be able to create an internal type load balancer [Slow]", func() {
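
For reference, the jig.UpdateService calls above wrap a read-modify-write of the Service. A hedged sketch of the same LoadBalancerSourceRanges mutation done with plain client-go and retry-on-conflict (the kubeconfig wiring, namespace, service name, and pod IP are placeholders, and this is not the e2e framework's implementation):

    package main

    import (
        "context"

        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
        "k8s.io/client-go/kubernetes"
        "k8s.io/client-go/tools/clientcmd"
        "k8s.io/client-go/util/retry"
    )

    // restrictLBToSource limits the Service's load balancer to a single /32 source,
    // the same mutation the test applies through jig.UpdateService.
    func restrictLBToSource(cs kubernetes.Interface, ns, svcName, podIP string) error {
        return retry.RetryOnConflict(retry.DefaultRetry, func() error {
            svc, err := cs.CoreV1().Services(ns).Get(context.TODO(), svcName, metav1.GetOptions{})
            if err != nil {
                return err
            }
            svc.Spec.LoadBalancerSourceRanges = []string{podIP + "/32"}
            _, err = cs.CoreV1().Services(ns).Update(context.TODO(), svc, metav1.UpdateOptions{})
            return err
        })
    }

    func main() {
        // Placeholder client setup; the values below are made up for illustration.
        config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
        if err != nil {
            panic(err)
        }
        cs := kubernetes.NewForConfigOrDie(config)
        if err := restrictLBToSource(cs, "default", "example-lb", "10.64.0.5"); err != nil {
            panic(err)
        }
    }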