Merge pull request #81319 from neolit123/1.16-kubeadm-upgrade-health-check

kubeadm: add an upgrade health check that deploys a Job

Commit 2bc3804a0e
@@ -126,7 +126,7 @@ func enforceRequirements(flags *applyPlanFlags, dryRun bool, newK8sVersion strin
 	}
 
 	// Run healthchecks against the cluster
-	if err := upgrade.CheckClusterHealth(client, ignorePreflightErrorsSet); err != nil {
+	if err := upgrade.CheckClusterHealth(client, &cfg.ClusterConfiguration, ignorePreflightErrorsSet); err != nil {
 		return nil, nil, nil, errors.Wrap(err, "[upgrade/health] FATAL")
 	}
 
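Note: after this change, a caller has to thread the kubeadm ClusterConfiguration through to the health checks. A minimal sketch of the new call shape, assuming the usual kubeadm package paths (the wrapper function itself is illustrative, not part of the commit):

package example

import (
	"k8s.io/apimachinery/pkg/util/sets"
	clientset "k8s.io/client-go/kubernetes"
	kubeadmapi "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm"
	"k8s.io/kubernetes/cmd/kubeadm/app/phases/upgrade"
)

// runHealthChecks is an illustrative wrapper around the updated signature.
func runHealthChecks(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) error {
	// Passing an empty set means no preflight errors are ignored.
	return upgrade.CheckClusterHealth(client, cfg, sets.NewString())
}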
@@ -36,6 +36,7 @@ go_library(
         "//cmd/kubeadm/app/util/etcd:go_default_library",
         "//cmd/kubeadm/app/util/staticpod:go_default_library",
         "//staging/src/k8s.io/api/apps/v1:go_default_library",
+        "//staging/src/k8s.io/api/batch/v1:go_default_library",
         "//staging/src/k8s.io/api/core/v1:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
@@ -43,6 +44,7 @@ go_library(
         "//staging/src/k8s.io/apimachinery/pkg/util/errors:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library",
         "//staging/src/k8s.io/apimachinery/pkg/util/version:go_default_library",
+        "//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
         "//staging/src/k8s.io/client-go/kubernetes:go_default_library",
         "//staging/src/k8s.io/component-base/version:go_default_library",
         "//vendor/github.com/coredns/corefile-migration/migration:go_default_library",
@@ -18,32 +18,39 @@ package upgrade
 
 import (
 	"fmt"
-	"net/http"
 	"os"
+	"time"
 
 	"github.com/pkg/errors"
 
 	apps "k8s.io/api/apps/v1"
-	"k8s.io/api/core/v1"
+	batchv1 "k8s.io/api/batch/v1"
+	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/util/sets"
+	"k8s.io/apimachinery/pkg/util/wait"
 	clientset "k8s.io/client-go/kubernetes"
+	"k8s.io/klog"
+	kubeadmapi "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm"
 	"k8s.io/kubernetes/cmd/kubeadm/app/constants"
+	"k8s.io/kubernetes/cmd/kubeadm/app/images"
 	"k8s.io/kubernetes/cmd/kubeadm/app/preflight"
+	utilpointer "k8s.io/utils/pointer"
 )
 
 // healthCheck is a helper struct for easily performing healthchecks against the cluster and printing the output
 type healthCheck struct {
 	name   string
 	client clientset.Interface
-	// f is invoked with a k8s client passed to it. Should return an optional error
-	f func(clientset.Interface) error
+	cfg    *kubeadmapi.ClusterConfiguration
+	// f is invoked with a k8s client and a kubeadm ClusterConfiguration passed to it. Should return an optional error
+	f func(clientset.Interface, *kubeadmapi.ClusterConfiguration) error
 }
 
 // Check is part of the preflight.Checker interface
 func (c *healthCheck) Check() (warnings, errors []error) {
-	if err := c.f(c.client); err != nil {
+	if err := c.f(c.client, c.cfg); err != nil {
 		return nil, []error{err}
 	}
 	return nil, nil
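The healthCheck type still satisfies the preflight.Checker interface; only the callback gains the ClusterConfiguration parameter. A rough same-package sketch of how a check defined this way executes (the example check itself is hypothetical, not part of the commit):

// exampleCheck is illustrative only: it wires an ad-hoc callback through the
// updated healthCheck fields and returns whatever errors Check() reports.
func exampleCheck(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) []error {
	check := &healthCheck{
		name:   "Example",
		client: client,
		cfg:    cfg,
		f: func(c clientset.Interface, cc *kubeadmapi.ClusterConfiguration) error {
			// A non-nil return marks the check as failed; nil means healthy.
			return nil
		},
	}
	_, errs := check.Check()
	return errs
}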
@@ -59,49 +66,150 @@ func (c *healthCheck) Name() string {
 // - all control-plane Nodes are Ready
 // - (if self-hosted) that there are DaemonSets with at least one Pod for all control plane components
 // - (if static pod-hosted) that all required Static Pod manifests exist on disk
-func CheckClusterHealth(client clientset.Interface, ignoreChecksErrors sets.String) error {
-	fmt.Println("[upgrade] Making sure the cluster is healthy:")
+func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration, ignoreChecksErrors sets.String) error {
+	fmt.Println("[upgrade] Running cluster health checks")
 
 	healthChecks := []preflight.Checker{
 		&healthCheck{
-			name:   "APIServerHealth",
+			name:   "CreateJob",
 			client: client,
-			f:      apiServerHealthy,
+			cfg:    cfg,
+			f:      createJob,
 		},
 		&healthCheck{
 			name:   "ControlPlaneNodesReady",
 			client: client,
 			f:      controlPlaneNodesReady,
 		},
-		// TODO: Add a check for ComponentStatuses here?
-	}
-
-	healthChecks = append(healthChecks, &healthCheck{
-		name:   "StaticPodManifest",
-		client: client,
-		f:      staticPodManifestHealth,
-	})
+		&healthCheck{
+			name:   "StaticPodManifest",
+			client: client,
+			cfg:    cfg,
+			f:      staticPodManifestHealth,
+		},
+	}
 
 	return preflight.RunChecks(healthChecks, os.Stderr, ignoreChecksErrors)
 }
 
-// apiServerHealthy checks whether the API server's /healthz endpoint is healthy
-func apiServerHealthy(client clientset.Interface) error {
-	healthStatus := 0
-	// If client.Discovery().RESTClient() is nil, the fake client is used, and that means we are dry-running. Just proceed
+// CreateJob is a check that verifies that a Job can be created in the cluster
+func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) (lastError error) {
+	const (
+		jobName = "upgrade-health-check"
+		ns      = metav1.NamespaceSystem
+		timeout = 15 * time.Second
+	)
+
+	// If client.Discovery().RESTClient() is nil, the fake client is used.
+	// Return early because the kubeadm dryrun dynamic client only handles the core/v1 GroupVersion.
 	if client.Discovery().RESTClient() == nil {
+		fmt.Printf("[dryrun] Would create the Job %q in namespace %q and wait until it completes\n", jobName, ns)
 		return nil
 	}
-	client.Discovery().RESTClient().Get().AbsPath("/healthz").Do().StatusCode(&healthStatus)
-	if healthStatus != http.StatusOK {
-		return errors.Errorf("the API Server is unhealthy; /healthz didn't return %q", "ok")
+
+	// Prepare Job
+	job := &batchv1.Job{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      jobName,
+			Namespace: ns,
+		},
+		Spec: batchv1.JobSpec{
+			BackoffLimit: utilpointer.Int32Ptr(0),
+			Template: v1.PodTemplateSpec{
+				Spec: v1.PodSpec{
+					RestartPolicy: v1.RestartPolicyNever,
+					SecurityContext: &v1.PodSecurityContext{
+						RunAsUser:    utilpointer.Int64Ptr(999),
+						RunAsGroup:   utilpointer.Int64Ptr(999),
+						RunAsNonRoot: utilpointer.BoolPtr(true),
+					},
+					Tolerations: []v1.Toleration{
+						{
+							Key:    "node-role.kubernetes.io/master",
+							Effect: v1.TaintEffectNoSchedule,
+						},
+					},
+					Containers: []v1.Container{
+						{
+							Name:  jobName,
+							Image: images.GetPauseImage(cfg),
+							Args:  []string{"-v"},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	// Check if the Job already exists and delete it
+	if _, err := client.BatchV1().Jobs(ns).Get(jobName, metav1.GetOptions{}); err == nil {
+		if err = deleteHealthCheckJob(client, ns, jobName); err != nil {
+			return err
+		}
+	}
+
+	// Cleanup the Job on exit
+	defer func() {
+		lastError = deleteHealthCheckJob(client, ns, jobName)
+	}()
+
+	// Create the Job, but retry in case it is being currently deleted
+	klog.V(2).Infof("Creating Job %q in the namespace %q", jobName, ns)
+	err := wait.PollImmediate(time.Second*1, timeout, func() (bool, error) {
+		if _, err := client.BatchV1().Jobs(ns).Create(job); err != nil {
+			klog.V(2).Infof("Could not create Job %q in the namespace %q, retrying: %v", jobName, ns, err)
+			lastError = err
+			return false, nil
+		}
+		return true, nil
+	})
+	if err != nil {
+		return errors.Wrapf(lastError, "could not create Job %q in the namespace %q", jobName, ns)
+	}
+
+	// Waiting and manually deleting the Job is a workaround for not enabling the TTL controller.
+	// TODO: refactor this if the TTL controller is enabled in kubeadm once it goes Beta.
+
+	// Wait for the Job to complete
+	err = wait.PollImmediate(time.Second*1, timeout, func() (bool, error) {
+		job, err := client.BatchV1().Jobs(ns).Get(jobName, metav1.GetOptions{})
+		if err != nil {
+			lastError = err
+			klog.V(2).Infof("could not get Job %q in the namespace %q, retrying: %v", jobName, ns, err)
+			return false, nil
+		}
+		for _, cond := range job.Status.Conditions {
+			if cond.Type == batchv1.JobComplete {
+				return true, nil
+			}
+		}
+		lastError = errors.Errorf("no condition of type %v", batchv1.JobComplete)
+		klog.V(2).Infof("Job %q in the namespace %q is not yet complete, retrying", jobName, ns)
+		return false, nil
+	})
+	if err != nil {
+		return errors.Wrapf(lastError, "Job %q in the namespace %q did not complete in %v", jobName, ns, timeout)
+	}
+
+	klog.V(2).Infof("Job %q in the namespace %q completed", jobName, ns)
+
+	return nil
+}
+
+func deleteHealthCheckJob(client clientset.Interface, ns, jobName string) error {
+	klog.V(2).Infof("Deleting Job %q in the namespace %q", jobName, ns)
+	propagation := metav1.DeletePropagationForeground
+	deleteOptions := &metav1.DeleteOptions{
+		PropagationPolicy: &propagation,
+	}
+	if err := client.BatchV1().Jobs(ns).Delete(jobName, deleteOptions); err != nil {
+		return errors.Wrapf(err, "could not delete Job %q in the namespace %q", jobName, ns)
 	}
 	return nil
 }
 
 // controlPlaneNodesReady checks whether all control-plane Nodes in the cluster are in the Running state
-func controlPlaneNodesReady(client clientset.Interface) error {
+func controlPlaneNodesReady(client clientset.Interface, _ *kubeadmapi.ClusterConfiguration) error {
 	selector := labels.SelectorFromSet(labels.Set(map[string]string{
 		constants.LabelNodeRoleMaster: "",
 	}))
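Because the fake clientset returns a nil Discovery().RESTClient(), createJob short-circuits into its dry-run branch, which keeps the new helpers easy to exercise from a same-package unit test. A rough sketch under that assumption (the test file and test names are illustrative, not part of this commit):

package upgrade

import (
	"testing"

	batchv1 "k8s.io/api/batch/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	clientsetfake "k8s.io/client-go/kubernetes/fake"
	kubeadmapi "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm"
)

func TestCreateJobDryRun(t *testing.T) {
	// The fake clientset's Discovery().RESTClient() is nil, so createJob takes
	// the early-return (dry-run) path and should not report an error.
	client := clientsetfake.NewSimpleClientset()
	if err := createJob(client, &kubeadmapi.ClusterConfiguration{}); err != nil {
		t.Fatalf("createJob failed on the dry-run path: %v", err)
	}
}

func TestDeleteHealthCheckJob(t *testing.T) {
	// Pre-populate the fake object tracker with the Job so the foreground delete succeeds.
	client := clientsetfake.NewSimpleClientset(&batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "upgrade-health-check",
			Namespace: metav1.NamespaceSystem,
		},
	})
	if err := deleteHealthCheckJob(client, metav1.NamespaceSystem, "upgrade-health-check"); err != nil {
		t.Fatalf("deleteHealthCheckJob failed: %v", err)
	}
}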
@@ -124,7 +232,7 @@ func controlPlaneNodesReady(client clientset.Interface) error {
 }
 
 // staticPodManifestHealth makes sure the required static pods are present
-func staticPodManifestHealth(_ clientset.Interface) error {
+func staticPodManifestHealth(_ clientset.Interface, _ *kubeadmapi.ClusterConfiguration) error {
 	nonExistentManifests := []string{}
 	for _, component := range constants.ControlPlaneComponents {
 		manifestFile := constants.GetStaticPodFilepath(component, constants.GetStaticPodDirectory())
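controlPlaneNodesReady and staticPodManifestHealth only gain the (unused) ClusterConfiguration parameter so that they match the shared callback signature. For orientation, a small illustrative helper, not part of the commit, that prints the manifest paths the StaticPodManifest check looks for, assuming kubeadm's default static Pod directory /etc/kubernetes/manifests:

package main

import (
	"fmt"

	"k8s.io/kubernetes/cmd/kubeadm/app/constants"
)

func main() {
	for _, component := range constants.ControlPlaneComponents {
		// e.g. /etc/kubernetes/manifests/kube-apiserver.yaml
		fmt.Println(constants.GetStaticPodFilepath(component, constants.GetStaticPodDirectory()))
	}
}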