Merge pull request #115283 from daman1807/master

e2e/network_policy: using expected==observed as condition for polling probeConnectivity
This commit is contained in:
Kubernetes Prow Robot 2023-01-24 09:04:17 -08:00 committed by GitHub
commit 703361f381
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 73 additions and 38 deletions

View File

@ -34,15 +34,10 @@ import (
)
// defaultPollIntervalSeconds [seconds] is the default value for which the Prober will wait before attempting next attempt.
const defaultPollIntervalSeconds = 2
const defaultPollIntervalSeconds = 1
// defaultPollTimeoutSeconds [seconds] is the default timeout when polling on probes.
// using this value leads to a minimum of 2 attempts for every probe
const defaultPollTimeoutSeconds = 1
// maxPollTimeoutSeconds [seconds] is the max timeout when polling on probes, this should only be used when expect a
// successful probe; use defaultPollTimeout otherwise
const maxPollTimeoutSeconds = 10
const defaultPollTimeoutSeconds = 10
// TestPod represents an actual running pod. For each Pod defined by the model,
// there will be a corresponding TestPod. TestPod includes some runtime info
@ -185,18 +180,57 @@ func (k *kubeManager) probeConnectivity(args *probeConnectivityArgs) (bool, stri
attempt := 0
err := wait.PollImmediate(time.Duration(args.pollIntervalSeconds)*time.Second, time.Duration(args.pollTimeoutSeconds)*time.Second,
func() (bool, error) {
stdout, stderr, err := k.executeRemoteCommand(args.nsFrom, args.podFrom, args.containerFrom, cmd)
if err != nil {
framework.Logf("retrying probe #%d :: %s/%s -> %s: error when running command: err - %v /// stdout - %s /// stderr - %s", attempt+1, args.nsFrom, args.podFrom, args.addrTo, err, stdout, stderr)
// NOTE: The return value of this function[probeConnectivity] should be true if the probe is successful and false otherwise.
// probeError will be the return value of this function[probeConnectivity] call.
var probeError error
var stderr string
// Instead of re-running the job on connectivity failure, the following conditionFunc when passed to PollImmediate, reruns
// the job when the observed value don't match the expected value, so we don't rely on return value of PollImmediate, we
// simply discard it and use probeError, defined outside scope of conditionFunc, for returning the result of probeConnectivity.
conditionFunc := func() (bool, error) {
_, stderr, probeError = k.executeRemoteCommand(args.nsFrom, args.podFrom, args.containerFrom, cmd)
// retry should only occur if expected and observed value don't match.
if args.expectConnectivity {
if probeError != nil {
// since we expect connectivity here, we fail the condition for PollImmediate to reattempt the probe.
// this happens in the cases where network is congested, we don't have any policy rejecting traffic
// from "podFrom" to "podTo" and probes from "podFrom" to "podTo" are failing.
framework.Logf("probe #%d :: connectivity expected :: %s/%s -> %s :: stderr - %s",
attempt+1, args.nsFrom, args.podFrom, args.addrTo, stderr,
)
attempt++
return false, nil
} else {
// we got the expected results, exit immediately.
return true, nil
}
} else {
if probeError != nil {
// we got the expected results, exit immediately.
return true, nil
} else {
// since we don't expect connectivity here, we fail the condition for PollImmediate to reattempt the probe.
// this happens in the cases where we have policy rejecting traffic from "podFrom" to "podTo", but CNI takes
// time to implement the policy and probe from "podFrom" to "podTo" was successful in that window.
framework.Logf(" probe #%d :: connectivity not expected :: %s/%s -> %s",
attempt+1, args.nsFrom, args.podFrom, args.addrTo,
)
attempt++
return false, nil
}
return true, nil
})
}
}
if err != nil {
// ignore the result of PollImmediate, we are only concerned with probeError.
_ = wait.PollImmediate(
time.Duration(args.pollIntervalSeconds)*time.Second,
time.Duration(args.pollTimeoutSeconds)*time.Second,
conditionFunc,
)
if probeError != nil {
return false, commandDebugString, nil
}
return true, commandDebugString, nil
@ -309,10 +343,10 @@ func getPollIntervalSeconds() int {
return defaultPollIntervalSeconds
}
// getPollTimeout returns the timeout for polling on probes.
func getPollTimeoutSeconds(useMaxPollTimout bool) int {
if useMaxPollTimout {
return maxPollTimeoutSeconds
// getPollTimeout returns the timeout for polling on probes, and takes windows heuristics into account, where requests can take longer sometimes.
func getPollTimeoutSeconds() int {
if framework.NodeOSDistroIs("windows") {
return defaultPollTimeoutSeconds * 2
}
return defaultPollTimeoutSeconds
}

View File

@ -33,6 +33,7 @@ type probeConnectivityArgs struct {
addrTo string
protocol v1.Protocol
toPort int
expectConnectivity bool
timeoutSeconds int
pollIntervalSeconds int
pollTimeoutSeconds int
@ -45,13 +46,13 @@ type Prober interface {
// ProbeJob packages the data for the input of a pod->pod connectivity probe
type ProbeJob struct {
PodFrom TestPod
PodTo TestPod
PodToServiceIP string
ToPort int
ToPodDNSDomain string
Protocol v1.Protocol
UseMaxPollTimeout bool
PodFrom TestPod
PodTo TestPod
PodToServiceIP string
ToPort int
ToPodDNSDomain string
Protocol v1.Protocol
ExpectConnectivity bool
}
// ProbeJobResults packages the data for the results of a pod->pod connectivity probe
@ -72,18 +73,17 @@ func ProbePodToPodConnectivity(prober Prober, allPods []TestPod, dnsDomain strin
}
for _, podFrom := range allPods {
for _, podTo := range allPods {
useMaxPollTimeout := false
// we only want to use max poll timeout for the probes where we expect connectivity from "podFrom" to "podTo".
if testCase.Reachability.Expected.Get(podFrom.PodString().String(), podTo.PodString().String()) {
useMaxPollTimeout = true
}
// set connectivity expectation for the probe job, this allows to retry probe when observed value
// don't match expected value.
expectConnectivity := testCase.Reachability.Expected.Get(podFrom.PodString().String(), podTo.PodString().String())
jobs <- &ProbeJob{
PodFrom: podFrom,
PodTo: podTo,
ToPort: testCase.ToPort,
ToPodDNSDomain: dnsDomain,
Protocol: testCase.Protocol,
UseMaxPollTimeout: useMaxPollTimeout,
PodFrom: podFrom,
PodTo: podTo,
ToPort: testCase.ToPort,
ToPodDNSDomain: dnsDomain,
Protocol: testCase.Protocol,
ExpectConnectivity: expectConnectivity,
}
}
}
@ -136,9 +136,10 @@ func probeWorker(prober Prober, jobs <-chan *ProbeJob, results chan<- *ProbeJobR
addrTo: job.PodTo.ServiceIP,
protocol: job.Protocol,
toPort: job.ToPort,
expectConnectivity: job.ExpectConnectivity,
timeoutSeconds: getProbeTimeoutSeconds(),
pollIntervalSeconds: getPollIntervalSeconds(),
pollTimeoutSeconds: getPollTimeoutSeconds(job.UseMaxPollTimeout),
pollTimeoutSeconds: getPollTimeoutSeconds(),
})
result := &ProbeJobResults{
Job: job,