Merge pull request #56130 from anguslees/kubeadm-nodehealth

Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

Make healthchecks skippable, and check masters only

**What this PR does / why we need it**:

Previously, kubeadm would abort if _any_ node was not Ready. This is obviously infeasible in a non-trivial (especially bare-metal) cluster.

This PR makes two changes:
- Allow kubeadm health checks to be selectively skipped (made non-fatal) with `--ignore-checks-errors`.
- Check only that the *master* nodes are Ready.

**Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*:
Fixes kubernetes/kubeadm#539

**Special notes for your reviewer**:

Builds on #56072

**Release note**:

```release-note
kubeadm health checks can also be skipped with `--ignore-checks-errors`
```
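
The flag value is ultimately carried around as a `sets.String` (see `ignorePreflightErrorsSet` / `ignoreChecksErrors` in the diffs below). A minimal sketch of how a comma-separated flag value could become that set; the helper name and the lowercasing convention are illustrative assumptions, not code from this PR:

```go
package main

import (
	"fmt"
	"strings"

	"k8s.io/apimachinery/pkg/util/sets"
)

// ignoreSetFromFlag is a hypothetical helper: it lowercases each
// comma-separated check name, assuming (as other kubeadm ignore flags do)
// that names are compared case-insensitively against the lowercased set.
func ignoreSetFromFlag(flagValue string) sets.String {
	ignore := sets.NewString()
	for _, name := range strings.Split(flagValue, ",") {
		if name = strings.TrimSpace(name); name != "" {
			ignore.Insert(strings.ToLower(name))
		}
	}
	return ignore
}

func main() {
	// Prints the sorted set: [apiserverhealth masternodesready]
	fmt.Println(ignoreSetFromFlag("MasterNodesReady,APIServerHealth").List())
}
```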
Merged by Kubernetes Submit Queue on 2017-11-24 04:20:26 -08:00 (committed by GitHub), commit 58fca39de3.
5 changed files with 62 additions and 51 deletions

View File

```diff
@@ -119,7 +119,7 @@ func NewCmdApply(parentFlags *cmdUpgradeFlags) *cobra.Command {
 func RunApply(flags *applyFlags) error {
 	// Start with the basics, verify that the cluster is healthy and get the configuration from the cluster (using the ConfigMap)
-	upgradeVars, err := enforceRequirements(flags.parent.featureGatesString, flags.parent.kubeConfigPath, flags.parent.cfgPath, flags.parent.printConfig, flags.dryRun)
+	upgradeVars, err := enforceRequirements(flags.parent.featureGatesString, flags.parent.kubeConfigPath, flags.parent.cfgPath, flags.parent.printConfig, flags.dryRun, flags.parent.ignorePreflightErrorsSet)
 	if err != nil {
 		return err
 	}
```

View File

```diff
@@ -48,14 +48,14 @@ type upgradeVariables struct {
 }
 
 // enforceRequirements verifies that it's okay to upgrade and then returns the variables needed for the rest of the procedure
-func enforceRequirements(featureGatesString, kubeConfigPath, cfgPath string, printConfig, dryRun bool) (*upgradeVariables, error) {
+func enforceRequirements(featureGatesString, kubeConfigPath, cfgPath string, printConfig, dryRun bool, ignoreChecksErrors sets.String) (*upgradeVariables, error) {
 	client, err := getClient(kubeConfigPath, dryRun)
 	if err != nil {
 		return nil, fmt.Errorf("couldn't create a Kubernetes client from file %q: %v", kubeConfigPath, err)
 	}
 
 	// Run healthchecks against the cluster
-	if err := upgrade.CheckClusterHealth(client); err != nil {
+	if err := upgrade.CheckClusterHealth(client, ignoreChecksErrors); err != nil {
 		return nil, fmt.Errorf("[upgrade/health] FATAL: %v", err)
 	}
```

View File

```diff
@@ -55,7 +55,7 @@ func NewCmdPlan(parentFlags *cmdUpgradeFlags) *cobra.Command {
 // RunPlan takes care of outputting available versions to upgrade to for the user
 func RunPlan(parentFlags *cmdUpgradeFlags) error {
 	// Start with the basics, verify that the cluster is healthy, build a client and a versionGetter. Never set dry-run for plan.
-	upgradeVars, err := enforceRequirements(parentFlags.featureGatesString, parentFlags.kubeConfigPath, parentFlags.cfgPath, parentFlags.printConfig, false)
+	upgradeVars, err := enforceRequirements(parentFlags.featureGatesString, parentFlags.kubeConfigPath, parentFlags.cfgPath, parentFlags.printConfig, false, parentFlags.ignorePreflightErrorsSet)
 	if err != nil {
 		return err
 	}
```

View File

```diff
@@ -32,6 +32,7 @@ go_library(
         "//cmd/kubeadm/app/phases/etcd:go_default_library",
         "//cmd/kubeadm/app/phases/selfhosting:go_default_library",
         "//cmd/kubeadm/app/phases/uploadconfig:go_default_library",
+        "//cmd/kubeadm/app/preflight:go_default_library",
         "//cmd/kubeadm/app/util:go_default_library",
         "//cmd/kubeadm/app/util/apiclient:go_default_library",
         "//cmd/kubeadm/app/util/config:go_default_library",
@@ -42,8 +43,10 @@ go_library(
         "//vendor/k8s.io/api/core/v1:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
+        "//vendor/k8s.io/apimachinery/pkg/labels:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/runtime:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/util/errors:go_default_library",
+        "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
         "//vendor/k8s.io/client-go/kubernetes:go_default_library",
     ],
 )
```

View File

```diff
@@ -24,73 +24,72 @@ import (
 	apps "k8s.io/api/apps/v1beta2"
 	"k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/util/sets"
 	clientset "k8s.io/client-go/kubernetes"
 	"k8s.io/kubernetes/cmd/kubeadm/app/constants"
+	"k8s.io/kubernetes/cmd/kubeadm/app/preflight"
 )
 
 // healthCheck is a helper struct for easily performing healthchecks against the cluster and printing the output
 type healthCheck struct {
-	description, okMessage, failMessage string
-	// f is invoked with a k8s client passed to it. Should return an optional warning and/or an error
+	name   string
+	client clientset.Interface
+	// f is invoked with a k8s client passed to it. Should return an optional error
 	f func(clientset.Interface) error
 }
 
+// Check is part of the preflight.Checker interface
+func (c *healthCheck) Check() (warnings, errors []error) {
+	if err := c.f(c.client); err != nil {
+		return nil, []error{err}
+	}
+	return nil, nil
+}
+
+// Name is part of the preflight.Checker interface
+func (c *healthCheck) Name() string {
+	return c.name
+}
+
 // CheckClusterHealth makes sure:
 // - the API /healthz endpoint is healthy
-// - all Nodes are Ready
+// - all master Nodes are Ready
 // - (if self-hosted) that there are DaemonSets with at least one Pod for all control plane components
 // - (if static pod-hosted) that all required Static Pod manifests exist on disk
-func CheckClusterHealth(client clientset.Interface) error {
+func CheckClusterHealth(client clientset.Interface, ignoreChecksErrors sets.String) error {
 	fmt.Println("[upgrade] Making sure the cluster is healthy:")
 
-	healthChecks := []healthCheck{
-		{
-			description: "API Server health",
-			okMessage:   "Healthy",
-			failMessage: "Unhealthy",
-			f:           apiServerHealthy,
+	healthChecks := []preflight.Checker{
+		&healthCheck{
+			name:   "APIServerHealth",
+			client: client,
+			f:      apiServerHealthy,
 		},
-		{
-			description: "Node health",
-			okMessage:   "All Nodes are healthy",
-			failMessage: "More than one Node unhealthy",
-			f:           nodesHealthy,
+		&healthCheck{
+			name:   "MasterNodesReady",
+			client: client,
+			f:      masterNodesReady,
 		},
 		// TODO: Add a check for ComponentStatuses here?
 	}
 
 	// Run slightly different health checks depending on control plane hosting type
 	if IsControlPlaneSelfHosted(client) {
-		healthChecks = append(healthChecks, healthCheck{
-			description: "Control plane DaemonSet health",
-			okMessage:   "All control plane DaemonSets are healthy",
-			failMessage: "More than one control plane DaemonSet unhealthy",
-			f:           controlPlaneHealth,
+		healthChecks = append(healthChecks, &healthCheck{
+			name:   "ControlPlaneHealth",
+			client: client,
+			f:      controlPlaneHealth,
 		})
 	} else {
-		healthChecks = append(healthChecks, healthCheck{
-			description: "Static Pod manifests exists on disk",
-			okMessage:   "All manifests exist on disk",
-			failMessage: "Some manifests don't exist on disk",
-			f:           staticPodManifestHealth,
+		healthChecks = append(healthChecks, &healthCheck{
+			name:   "StaticPodManifest",
+			client: client,
+			f:      staticPodManifestHealth,
 		})
 	}
 
-	return runHealthChecks(client, healthChecks)
-}
-
-// runHealthChecks runs a set of health checks against the cluster
-func runHealthChecks(client clientset.Interface, healthChecks []healthCheck) error {
-	for _, check := range healthChecks {
-		err := check.f(client)
-		if err != nil {
-			fmt.Printf("[upgrade/health] Checking %s: %s\n", check.description, check.failMessage)
-			return fmt.Errorf("The cluster is not in an upgradeable state due to: %v", err)
-		}
-		fmt.Printf("[upgrade/health] Checking %s: %s\n", check.description, check.okMessage)
-	}
-
-	return nil
+	return preflight.RunChecks(healthChecks, os.Stderr, ignoreChecksErrors)
 }
 
 // apiServerHealthy checks whether the API server's /healthz endpoint is healthy
```
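
The refactoring in this hunk makes `healthCheck` satisfy kubeadm's `preflight.Checker` interface, so the upgrade health checks flow through the same runner (and the same ignore set) as the init/join preflight checks. A rough sketch of the mechanics; the interface shape is inferred from the `Check`/`Name` methods above, and `runChecks` below is a simplified stand-in for `preflight.RunChecks`, not the real implementation:

```go
package main

import (
	"fmt"
	"io"
	"os"
	"strings"

	"k8s.io/apimachinery/pkg/util/sets"
)

// Checker mirrors the interface implied by the Check/Name methods in the diff.
type Checker interface {
	Check() (warnings, errors []error)
	Name() string
}

// runChecks is a simplified stand-in for preflight.RunChecks: a check whose
// lowercased name appears in ignore (or the wildcard "all") has its errors
// downgraded to warnings instead of failing the whole run.
func runChecks(checks []Checker, w io.Writer, ignore sets.String) error {
	var fatal []error
	for _, c := range checks {
		warnings, errs := c.Check()
		if ignore.Has("all") || ignore.Has(strings.ToLower(c.Name())) {
			warnings = append(warnings, errs...)
			errs = nil
		}
		for _, warn := range warnings {
			fmt.Fprintf(w, "\t[WARNING %s]: %v\n", c.Name(), warn)
		}
		fatal = append(fatal, errs...)
	}
	if len(fatal) > 0 {
		return fmt.Errorf("%d health check(s) failed", len(fatal))
	}
	return nil
}

// dummyCheck is a stand-in check that always fails.
type dummyCheck struct {
	name string
	err  error
}

func (d dummyCheck) Check() (warnings, errors []error) { return nil, []error{d.err} }
func (d dummyCheck) Name() string                      { return d.name }

func main() {
	checks := []Checker{dummyCheck{name: "MasterNodesReady", err: fmt.Errorf("node down")}}
	// With the check's name in the ignore set, its failure becomes a warning.
	err := runChecks(checks, os.Stderr, sets.NewString("masternodesready"))
	fmt.Println("fatal error:", err) // prints: fatal error: <nil>
}
```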
```diff
@@ -108,16 +107,25 @@ func apiServerHealthy(client clientset.Interface) error {
 	return nil
 }
 
-// nodesHealthy checks whether all Nodes in the cluster are in the Running state
-func nodesHealthy(client clientset.Interface) error {
-	nodes, err := client.CoreV1().Nodes().List(metav1.ListOptions{})
+// masterNodesReady checks whether all master Nodes in the cluster are in the Running state
+func masterNodesReady(client clientset.Interface) error {
+	selector := labels.SelectorFromSet(labels.Set(map[string]string{
+		constants.LabelNodeRoleMaster: "",
+	}))
+	masters, err := client.CoreV1().Nodes().List(metav1.ListOptions{
+		LabelSelector: selector.String(),
+	})
 	if err != nil {
-		return fmt.Errorf("couldn't list all nodes in cluster: %v", err)
+		return fmt.Errorf("couldn't list masters in cluster: %v", err)
 	}
-	notReadyNodes := getNotReadyNodes(nodes.Items)
-	if len(notReadyNodes) != 0 {
-		return fmt.Errorf("there are NotReady Nodes in the cluster: %v", notReadyNodes)
+
+	if len(masters.Items) == 0 {
+		return fmt.Errorf("failed to find any nodes with master role")
+	}
+
+	notReadyMasters := getNotReadyNodes(masters.Items)
+	if len(notReadyMasters) != 0 {
+		return fmt.Errorf("there are NotReady masters in the cluster: %v", notReadyMasters)
 	}
 	return nil
 }
```
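
`getNotReadyNodes` is called above but sits outside this diff. A plausible implementation consistent with these call sites (an illustrative sketch, not the actual kubeadm source):

```go
package upgrade

import "k8s.io/api/core/v1"

// getNotReadyNodes returns the names of all nodes whose NodeReady condition
// is anything other than True. Illustrative only: the real helper lives
// elsewhere in health.go and is not part of this diff.
func getNotReadyNodes(nodes []v1.Node) []string {
	notReadyNodes := []string{}
	for _, node := range nodes {
		for _, condition := range node.Status.Conditions {
			if condition.Type == v1.NodeReady && condition.Status != v1.ConditionTrue {
				notReadyNodes = append(notReadyNodes, node.Name)
			}
		}
	}
	return notReadyNodes
}
```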