diff --git a/test/e2e/network/netpol/kubemanager.go b/test/e2e/network/netpol/kubemanager.go index 766a269be48..e9270ad23f7 100644 --- a/test/e2e/network/netpol/kubemanager.go +++ b/test/e2e/network/netpol/kubemanager.go @@ -34,15 +34,10 @@ import ( ) // defaultPollIntervalSeconds [seconds] is the default value for which the Prober will wait before attempting next attempt. -const defaultPollIntervalSeconds = 2 +const defaultPollIntervalSeconds = 1 // defaultPollTimeoutSeconds [seconds] is the default timeout when polling on probes. -// using this value leads to a minimum of 2 attempts for every probe -const defaultPollTimeoutSeconds = 1 - -// maxPollTimeoutSeconds [seconds] is the max timeout when polling on probes, this should only be used when expect a -// successful probe; use defaultPollTimeout otherwise -const maxPollTimeoutSeconds = 10 +const defaultPollTimeoutSeconds = 10 // TestPod represents an actual running pod. For each Pod defined by the model, // there will be a corresponding TestPod. TestPod includes some runtime info @@ -185,18 +180,57 @@ func (k *kubeManager) probeConnectivity(args *probeConnectivityArgs) (bool, stri attempt := 0 - err := wait.PollImmediate(time.Duration(args.pollIntervalSeconds)*time.Second, time.Duration(args.pollTimeoutSeconds)*time.Second, - func() (bool, error) { - stdout, stderr, err := k.executeRemoteCommand(args.nsFrom, args.podFrom, args.containerFrom, cmd) - if err != nil { - framework.Logf("retrying probe #%d :: %s/%s -> %s: error when running command: err - %v /// stdout - %s /// stderr - %s", attempt+1, args.nsFrom, args.podFrom, args.addrTo, err, stdout, stderr) + // NOTE: The return value of this function[probeConnectivity] should be true if the probe is successful and false otherwise. + + // probeError will be the return value of this function[probeConnectivity] call. + var probeError error + var stderr string + + // Instead of re-running the job on connectivity failure, the following conditionFunc when passed to PollImmediate, reruns + // the job when the observed value don't match the expected value, so we don't rely on return value of PollImmediate, we + // simply discard it and use probeError, defined outside scope of conditionFunc, for returning the result of probeConnectivity. + conditionFunc := func() (bool, error) { + _, stderr, probeError = k.executeRemoteCommand(args.nsFrom, args.podFrom, args.containerFrom, cmd) + // retry should only occur if expected and observed value don't match. + if args.expectConnectivity { + if probeError != nil { + // since we expect connectivity here, we fail the condition for PollImmediate to reattempt the probe. + // this happens in the cases where network is congested, we don't have any policy rejecting traffic + // from "podFrom" to "podTo" and probes from "podFrom" to "podTo" are failing. + framework.Logf("probe #%d :: connectivity expected :: %s/%s -> %s :: stderr - %s", + attempt+1, args.nsFrom, args.podFrom, args.addrTo, stderr, + ) + attempt++ + return false, nil + } else { + // we got the expected results, exit immediately. + return true, nil + } + } else { + if probeError != nil { + // we got the expected results, exit immediately. + return true, nil + } else { + // since we don't expect connectivity here, we fail the condition for PollImmediate to reattempt the probe. + // this happens in the cases where we have policy rejecting traffic from "podFrom" to "podTo", but CNI takes + // time to implement the policy and probe from "podFrom" to "podTo" was successful in that window. + framework.Logf(" probe #%d :: connectivity not expected :: %s/%s -> %s", + attempt+1, args.nsFrom, args.podFrom, args.addrTo, + ) attempt++ return false, nil } - return true, nil - }) + } + } - if err != nil { + // ignore the result of PollImmediate, we are only concerned with probeError. + _ = wait.PollImmediate( + time.Duration(args.pollIntervalSeconds)*time.Second, + time.Duration(args.pollTimeoutSeconds)*time.Second, + conditionFunc, + ) + + if probeError != nil { return false, commandDebugString, nil } return true, commandDebugString, nil @@ -309,10 +343,10 @@ func getPollIntervalSeconds() int { return defaultPollIntervalSeconds } -// getPollTimeout returns the timeout for polling on probes. -func getPollTimeoutSeconds(useMaxPollTimout bool) int { - if useMaxPollTimout { - return maxPollTimeoutSeconds +// getPollTimeout returns the timeout for polling on probes, and takes windows heuristics into account, where requests can take longer sometimes. +func getPollTimeoutSeconds() int { + if framework.NodeOSDistroIs("windows") { + return defaultPollTimeoutSeconds * 2 } return defaultPollTimeoutSeconds } diff --git a/test/e2e/network/netpol/probe.go b/test/e2e/network/netpol/probe.go index 216203ea432..13bb963231c 100644 --- a/test/e2e/network/netpol/probe.go +++ b/test/e2e/network/netpol/probe.go @@ -33,6 +33,7 @@ type probeConnectivityArgs struct { addrTo string protocol v1.Protocol toPort int + expectConnectivity bool timeoutSeconds int pollIntervalSeconds int pollTimeoutSeconds int @@ -45,13 +46,13 @@ type Prober interface { // ProbeJob packages the data for the input of a pod->pod connectivity probe type ProbeJob struct { - PodFrom TestPod - PodTo TestPod - PodToServiceIP string - ToPort int - ToPodDNSDomain string - Protocol v1.Protocol - UseMaxPollTimeout bool + PodFrom TestPod + PodTo TestPod + PodToServiceIP string + ToPort int + ToPodDNSDomain string + Protocol v1.Protocol + ExpectConnectivity bool } // ProbeJobResults packages the data for the results of a pod->pod connectivity probe @@ -72,18 +73,17 @@ func ProbePodToPodConnectivity(prober Prober, allPods []TestPod, dnsDomain strin } for _, podFrom := range allPods { for _, podTo := range allPods { - useMaxPollTimeout := false - // we only want to use max poll timeout for the probes where we expect connectivity from "podFrom" to "podTo". - if testCase.Reachability.Expected.Get(podFrom.PodString().String(), podTo.PodString().String()) { - useMaxPollTimeout = true - } + // set connectivity expectation for the probe job, this allows to retry probe when observed value + // don't match expected value. + expectConnectivity := testCase.Reachability.Expected.Get(podFrom.PodString().String(), podTo.PodString().String()) + jobs <- &ProbeJob{ - PodFrom: podFrom, - PodTo: podTo, - ToPort: testCase.ToPort, - ToPodDNSDomain: dnsDomain, - Protocol: testCase.Protocol, - UseMaxPollTimeout: useMaxPollTimeout, + PodFrom: podFrom, + PodTo: podTo, + ToPort: testCase.ToPort, + ToPodDNSDomain: dnsDomain, + Protocol: testCase.Protocol, + ExpectConnectivity: expectConnectivity, } } } @@ -136,9 +136,10 @@ func probeWorker(prober Prober, jobs <-chan *ProbeJob, results chan<- *ProbeJobR addrTo: job.PodTo.ServiceIP, protocol: job.Protocol, toPort: job.ToPort, + expectConnectivity: job.ExpectConnectivity, timeoutSeconds: getProbeTimeoutSeconds(), pollIntervalSeconds: getPollIntervalSeconds(), - pollTimeoutSeconds: getPollTimeoutSeconds(job.UseMaxPollTimeout), + pollTimeoutSeconds: getPollTimeoutSeconds(), }) result := &ProbeJobResults{ Job: job,