diff --git a/hack/verify-flags/known-flags.txt b/hack/verify-flags/known-flags.txt index 20bfa5069a6..67efd377cab 100644 --- a/hack/verify-flags/known-flags.txt +++ b/hack/verify-flags/known-flags.txt @@ -8,6 +8,7 @@ algorithm-provider all-namespaces allocate-node-cidrs allow-privileged +allowed-not-ready-nodes api-burst api-prefix api-rate diff --git a/test/e2e/framework/test_context.go b/test/e2e/framework/test_context.go index 448e6b858b9..8b2293c5a16 100644 --- a/test/e2e/framework/test_context.go +++ b/test/e2e/framework/test_context.go @@ -52,6 +52,7 @@ type TestContextType struct { NodeOSDistro string VerifyServiceAccount bool DeleteNamespace bool + AllowedNotReadyNodes int CleanStart bool // If set to 'true' or 'all' framework will start a goroutine monitoring resource usage of system add-ons. // It will read the data every 30 seconds from all Nodes and print summary during afterEach. If set to 'master' @@ -120,6 +121,7 @@ func RegisterCommonFlags() { flag.StringVar(&TestContext.OutputPrintType, "output-print-type", "hr", "Comma separated list: 'hr' for human readable summaries 'json' for JSON ones.") flag.BoolVar(&TestContext.DumpLogsOnFailure, "dump-logs-on-failure", true, "If set to true test will dump data about the namespace in which test was running.") flag.BoolVar(&TestContext.DeleteNamespace, "delete-namespace", true, "If true tests will delete namespace after completion. It is only designed to make debugging easier, DO NOT turn it off by default.") + flag.IntVar(&TestContext.AllowedNotReadyNodes, "allowed-not-ready-nodes", 0, "If non-zero, framework will allow for that many non-ready nodes when checking for all ready nodes.") flag.StringVar(&TestContext.Host, "host", "http://127.0.0.1:8080", "The host, or apiserver, to connect to") flag.StringVar(&TestContext.ReportPrefix, "report-prefix", "", "Optional prefix for JUnit XML reports. 
Default is empty, which doesn't prepend anything to the default name.") flag.StringVar(&TestContext.ReportDir, "report-dir", "", "Path to the directory where the JUnit XML reports should be saved. Default is empty, which doesn't generate these reports.") diff --git a/test/e2e/framework/util.go b/test/e2e/framework/util.go index 0178dc507c2..2b653b86871 100644 --- a/test/e2e/framework/util.go +++ b/test/e2e/framework/util.go @@ -4189,6 +4189,23 @@ func WaitForNodeToBe(c *client.Client, name string, conditionType api.NodeCondit return false } +// allowedNotReadyReasons reports whether every node in nodes has a Ready condition +// whose Reason is tolerated; a node with no Ready condition is never tolerated. +// Some nodes may be provisioned incorrectly and never become healthy, so cluster +// validation lets a few of them stay not-ready. The only reason accepted so far is +// that the CNI plugins required by kubenet could not be located on the node. +// TODO: extend this to other reasons. +func allowedNotReadyReasons(nodes []*api.Node) bool { + for _, node := range nodes { + index, condition := api.GetNodeCondition(&node.Status, api.NodeReady) + if index == -1 || + !strings.Contains(condition.Reason, "could not locate kubenet required CNI plugins") { + return false + } + } + return true +} + // Checks whether all registered nodes are ready. // TODO: we should change the AllNodesReady call in AfterEach to WaitForAllNodesHealthy, // and figure out how to do it in a configurable way, as we can't expect all setups to run @@ -4196,7 +4213,7 @@ func WaitForNodeToBe(c *client.Client, name string, conditionType api.NodeCondit func AllNodesReady(c *client.Client, timeout time.Duration) error { Logf("Waiting up to %v for all nodes to be ready", timeout) - var notReady []api.Node + var notReady []*api.Node err := wait.PollImmediate(Poll, timeout, func() (bool, error) { notReady = nil // It should be OK to list unschedulable Nodes here. 
@@ -4204,12 +4221,23 @@ func AllNodesReady(c *client.Client, timeout time.Duration) error { if err != nil { return false, err } - for _, node := range nodes.Items { - if !IsNodeConditionSetAsExpected(&node, api.NodeReady, true) { + for i := range nodes.Items { + node := &nodes.Items[i] + if !IsNodeConditionSetAsExpected(node, api.NodeReady, true) { notReady = append(notReady, node) } } - return len(notReady) == 0, nil + // Framework allows for nodes to be non-ready, + // to make it possible e.g. for incorrect deployment of some small percentage + // of nodes (which we allow in cluster validation). Some nodes that are not + // provisioned correctly at startup will never become ready (e.g. when something + // won't install correctly), so we can't expect them to be ready at any point. + // + // However, we only allow non-ready nodes with some specific reasons. + if len(notReady) > TestContext.AllowedNotReadyNodes { + return false, nil + } + return allowedNotReadyReasons(notReady), nil }) if err != nil && err != wait.ErrWaitTimeout {