Mirror of https://github.com/k3s-io/kubernetes.git, synced 2025-07-24 04:06:03 +00:00
Merge pull request #56130 from anguslees/kubeadm-nodehealth
Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

Make healthchecks skippable, and check masters only

**What this PR does / why we need it**: Previously kubeadm would abort if _any_ node was not Ready. This is obviously infeasible in a non-trivial (esp. baremetal) cluster. This PR makes two changes:

- Allows kubeadm healthchecks to be selectively skipped (made non-fatal) with `--ignore-checks-errors`.
- Checks only that the *master* nodes are Ready.

**Which issue(s) this PR fixes**: Fixes kubernetes/kubeadm#539

**Special notes for your reviewer**: Builds on #56072

**Release note**:

```release-note
kubeadm health checks can also be skipped with `--ignore-checks-errors`
```
This commit is contained in: commit 58fca39de3
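The core of the change is that the upgrade health checks now run through the generic `preflight.RunChecks` runner (see the health.go hunk below), which downgrades errors from any check named in the ignore set to warnings. A minimal sketch of those semantics in Go, with invented helper names and message formats (the real implementation lives in `cmd/kubeadm/app/preflight`):

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/sets"
)

// checker mirrors the two-method shape of kubeadm's preflight.Checker:
// Name identifies the check for matching against the ignore set, and
// Check reports warnings and errors.
type checker interface {
	Name() string
	Check() (warnings, errors []error)
}

// runChecks is an illustrative stand-in for preflight.RunChecks: errors
// from checks named in ignoreSet are reported as warnings instead of
// failing the run.
func runChecks(checks []checker, ignoreSet sets.String) error {
	var fatal []error
	for _, c := range checks {
		warnings, errs := c.Check()
		for _, w := range warnings {
			fmt.Printf("[upgrade/health] warning %s: %v\n", c.Name(), w)
		}
		for _, e := range errs {
			if ignoreSet.Has(c.Name()) {
				// Non-fatal: the user asked to ignore this check's errors.
				fmt.Printf("[upgrade/health] ignored %s: %v\n", c.Name(), e)
				continue
			}
			fatal = append(fatal, e)
		}
	}
	if len(fatal) > 0 {
		return fmt.Errorf("%d health check(s) failed: %v", len(fatal), fatal)
	}
	return nil
}

func main() {
	// With an empty ignore set every check error is fatal; passing
	// sets.NewString("MasterNodesReady") would downgrade that check.
	_ = runChecks(nil, sets.NewString())
}
```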
```diff
@@ -119,7 +119,7 @@ func NewCmdApply(parentFlags *cmdUpgradeFlags) *cobra.Command {
 func RunApply(flags *applyFlags) error {
 
 	// Start with the basics, verify that the cluster is healthy and get the configuration from the cluster (using the ConfigMap)
-	upgradeVars, err := enforceRequirements(flags.parent.featureGatesString, flags.parent.kubeConfigPath, flags.parent.cfgPath, flags.parent.printConfig, flags.dryRun)
+	upgradeVars, err := enforceRequirements(flags.parent.featureGatesString, flags.parent.kubeConfigPath, flags.parent.cfgPath, flags.parent.printConfig, flags.dryRun, flags.parent.ignorePreflightErrorsSet)
 	if err != nil {
 		return err
 	}
```
```diff
@@ -48,14 +48,14 @@ type upgradeVariables struct {
 }
 
 // enforceRequirements verifies that it's okay to upgrade and then returns the variables needed for the rest of the procedure
-func enforceRequirements(featureGatesString, kubeConfigPath, cfgPath string, printConfig, dryRun bool) (*upgradeVariables, error) {
+func enforceRequirements(featureGatesString, kubeConfigPath, cfgPath string, printConfig, dryRun bool, ignoreChecksErrors sets.String) (*upgradeVariables, error) {
 	client, err := getClient(kubeConfigPath, dryRun)
 	if err != nil {
 		return nil, fmt.Errorf("couldn't create a Kubernetes client from file %q: %v", kubeConfigPath, err)
 	}
 
 	// Run healthchecks against the cluster
-	if err := upgrade.CheckClusterHealth(client); err != nil {
+	if err := upgrade.CheckClusterHealth(client, ignoreChecksErrors); err != nil {
 		return nil, fmt.Errorf("[upgrade/health] FATAL: %v", err)
 	}
 
```
```diff
@@ -55,7 +55,7 @@ func NewCmdPlan(parentFlags *cmdUpgradeFlags) *cobra.Command {
 // RunPlan takes care of outputting available versions to upgrade to for the user
 func RunPlan(parentFlags *cmdUpgradeFlags) error {
 	// Start with the basics, verify that the cluster is healthy, build a client and a versionGetter. Never set dry-run for plan.
-	upgradeVars, err := enforceRequirements(parentFlags.featureGatesString, parentFlags.kubeConfigPath, parentFlags.cfgPath, parentFlags.printConfig, false)
+	upgradeVars, err := enforceRequirements(parentFlags.featureGatesString, parentFlags.kubeConfigPath, parentFlags.cfgPath, parentFlags.printConfig, false, parentFlags.ignorePreflightErrorsSet)
 	if err != nil {
 		return err
 	}
```
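Both call sites pass a `sets.String` (`ignorePreflightErrorsSet`) down into `enforceRequirements`. The flag registration that populates this set is not part of this diff; a hypothetical sketch of such wiring, assuming a comma-separated `--ignore-checks-errors` flag (all names besides the flag and the two struct fields are invented for illustration):

```go
package main

import (
	"github.com/spf13/cobra"
	"k8s.io/apimachinery/pkg/util/sets"
)

// cmdUpgradeFlags carries the upgrade flags; only the two fields used
// in this sketch are shown.
type cmdUpgradeFlags struct {
	ignoreChecksErrors       []string    // raw values from the CLI flag
	ignorePreflightErrorsSet sets.String // normalized set handed to enforceRequirements
}

// addIgnoreFlag registers the flag and normalizes its values into a set
// before the command runs.
func addIgnoreFlag(cmd *cobra.Command, flags *cmdUpgradeFlags) {
	cmd.PersistentFlags().StringSliceVar(&flags.ignoreChecksErrors, "ignore-checks-errors", nil,
		"Names of health checks whose errors are reported as warnings instead of aborting the upgrade.")
	cmd.PreRun = func(_ *cobra.Command, _ []string) {
		flags.ignorePreflightErrorsSet = sets.NewString(flags.ignoreChecksErrors...)
	}
}

func main() {
	flags := &cmdUpgradeFlags{}
	cmd := &cobra.Command{Use: "upgrade"}
	addIgnoreFlag(cmd, flags)
}
```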
```diff
@@ -32,6 +32,7 @@ go_library(
         "//cmd/kubeadm/app/phases/etcd:go_default_library",
         "//cmd/kubeadm/app/phases/selfhosting:go_default_library",
         "//cmd/kubeadm/app/phases/uploadconfig:go_default_library",
+        "//cmd/kubeadm/app/preflight:go_default_library",
         "//cmd/kubeadm/app/util:go_default_library",
         "//cmd/kubeadm/app/util/apiclient:go_default_library",
         "//cmd/kubeadm/app/util/config:go_default_library",
@@ -42,8 +43,10 @@ go_library(
         "//vendor/k8s.io/api/core/v1:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
+        "//vendor/k8s.io/apimachinery/pkg/labels:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/runtime:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/util/errors:go_default_library",
+        "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
         "//vendor/k8s.io/client-go/kubernetes:go_default_library",
     ],
 )
```
```diff
@@ -24,73 +24,72 @@ import (
 	apps "k8s.io/api/apps/v1beta2"
 	"k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/util/sets"
 	clientset "k8s.io/client-go/kubernetes"
 	"k8s.io/kubernetes/cmd/kubeadm/app/constants"
+	"k8s.io/kubernetes/cmd/kubeadm/app/preflight"
 )
 
 // healthCheck is a helper struct for easily performing healthchecks against the cluster and printing the output
 type healthCheck struct {
-	description, okMessage, failMessage string
-	// f is invoked with a k8s client passed to it. Should return an optional warning and/or an error
+	name   string
+	client clientset.Interface
+	// f is invoked with a k8s client passed to it. Should return an optional error
 	f func(clientset.Interface) error
 }
 
+// Check is part of the preflight.Checker interface
+func (c *healthCheck) Check() (warnings, errors []error) {
+	if err := c.f(c.client); err != nil {
+		return nil, []error{err}
+	}
+	return nil, nil
+}
+
+// Name is part of the preflight.Checker interface
+func (c *healthCheck) Name() string {
+	return c.name
+}
+
 // CheckClusterHealth makes sure:
 // - the API /healthz endpoint is healthy
-// - all Nodes are Ready
+// - all master Nodes are Ready
 // - (if self-hosted) that there are DaemonSets with at least one Pod for all control plane components
 // - (if static pod-hosted) that all required Static Pod manifests exist on disk
-func CheckClusterHealth(client clientset.Interface) error {
+func CheckClusterHealth(client clientset.Interface, ignoreChecksErrors sets.String) error {
 	fmt.Println("[upgrade] Making sure the cluster is healthy:")
 
-	healthChecks := []healthCheck{
-		{
-			description: "API Server health",
-			okMessage:   "Healthy",
-			failMessage: "Unhealthy",
-			f:           apiServerHealthy,
+	healthChecks := []preflight.Checker{
+		&healthCheck{
+			name:   "APIServerHealth",
+			client: client,
+			f:      apiServerHealthy,
 		},
-		{
-			description: "Node health",
-			okMessage:   "All Nodes are healthy",
-			failMessage: "More than one Node unhealthy",
-			f:           nodesHealthy,
+		&healthCheck{
+			name:   "MasterNodesReady",
+			client: client,
+			f:      masterNodesReady,
 		},
 		// TODO: Add a check for ComponentStatuses here?
 	}
 
 	// Run slightly different health checks depending on control plane hosting type
 	if IsControlPlaneSelfHosted(client) {
-		healthChecks = append(healthChecks, healthCheck{
-			description: "Control plane DaemonSet health",
-			okMessage:   "All control plane DaemonSets are healthy",
-			failMessage: "More than one control plane DaemonSet unhealthy",
-			f:           controlPlaneHealth,
+		healthChecks = append(healthChecks, &healthCheck{
+			name:   "ControlPlaneHealth",
+			client: client,
+			f:      controlPlaneHealth,
 		})
 	} else {
-		healthChecks = append(healthChecks, healthCheck{
-			description: "Static Pod manifests exists on disk",
-			okMessage:   "All manifests exist on disk",
-			failMessage: "Some manifests don't exist on disk",
-			f:           staticPodManifestHealth,
+		healthChecks = append(healthChecks, &healthCheck{
+			name:   "StaticPodManifest",
+			client: client,
+			f:      staticPodManifestHealth,
 		})
 	}
 
-	return runHealthChecks(client, healthChecks)
-}
-
-// runHealthChecks runs a set of health checks against the cluster
-func runHealthChecks(client clientset.Interface, healthChecks []healthCheck) error {
-	for _, check := range healthChecks {
-
-		err := check.f(client)
-		if err != nil {
-			fmt.Printf("[upgrade/health] Checking %s: %s\n", check.description, check.failMessage)
-			return fmt.Errorf("The cluster is not in an upgradeable state due to: %v", err)
-		}
-		fmt.Printf("[upgrade/health] Checking %s: %s\n", check.description, check.okMessage)
-	}
-	return nil
+	return preflight.RunChecks(healthChecks, os.Stderr, ignoreChecksErrors)
 }
 
 // apiServerHealthy checks whether the API server's /healthz endpoint is healthy
```
```diff
@@ -108,16 +107,25 @@ func apiServerHealthy(client clientset.Interface) error {
 	return nil
 }
 
-// nodesHealthy checks whether all Nodes in the cluster are in the Running state
-func nodesHealthy(client clientset.Interface) error {
-	nodes, err := client.CoreV1().Nodes().List(metav1.ListOptions{})
+// masterNodesReady checks whether all master Nodes in the cluster are in the Running state
+func masterNodesReady(client clientset.Interface) error {
+	selector := labels.SelectorFromSet(labels.Set(map[string]string{
+		constants.LabelNodeRoleMaster: "",
+	}))
+	masters, err := client.CoreV1().Nodes().List(metav1.ListOptions{
+		LabelSelector: selector.String(),
+	})
 	if err != nil {
-		return fmt.Errorf("couldn't list all nodes in cluster: %v", err)
+		return fmt.Errorf("couldn't list masters in cluster: %v", err)
 	}
 
-	notReadyNodes := getNotReadyNodes(nodes.Items)
-	if len(notReadyNodes) != 0 {
-		return fmt.Errorf("there are NotReady Nodes in the cluster: %v", notReadyNodes)
+	if len(masters.Items) == 0 {
+		return fmt.Errorf("failed to find any nodes with master role")
+	}
+
+	notReadyMasters := getNotReadyNodes(masters.Items)
+	if len(notReadyMasters) != 0 {
+		return fmt.Errorf("there are NotReady masters in the cluster: %v", notReadyMasters)
 	}
 	return nil
 }
```