mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-22 19:31:44 +00:00
kubeadm: Detect kubelet readiness and error out if the kubelet is unhealthy
This commit is contained in:
parent
b3efdebeb6
commit
92c5997b8e
@ -28,6 +28,7 @@ go_library(
|
||||
"//cmd/kubeadm/app/constants:go_default_library",
|
||||
"//cmd/kubeadm/app/discovery:go_default_library",
|
||||
"//cmd/kubeadm/app/features:go_default_library",
|
||||
"//cmd/kubeadm/app/images:go_default_library",
|
||||
"//cmd/kubeadm/app/phases/addons/dns:go_default_library",
|
||||
"//cmd/kubeadm/app/phases/addons/proxy:go_default_library",
|
||||
"//cmd/kubeadm/app/phases/apiconfig:go_default_library",
|
||||
|
@ -37,6 +37,7 @@ import (
|
||||
"k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm/validation"
|
||||
kubeadmconstants "k8s.io/kubernetes/cmd/kubeadm/app/constants"
|
||||
"k8s.io/kubernetes/cmd/kubeadm/app/features"
|
||||
"k8s.io/kubernetes/cmd/kubeadm/app/images"
|
||||
dnsaddonphase "k8s.io/kubernetes/cmd/kubeadm/app/phases/addons/dns"
|
||||
proxyaddonphase "k8s.io/kubernetes/cmd/kubeadm/app/phases/addons/proxy"
|
||||
apiconfigphase "k8s.io/kubernetes/cmd/kubeadm/app/phases/apiconfig"
|
||||
@ -81,6 +82,23 @@ var (
|
||||
kubeadm join --token {{.Token}} {{.MasterHostPort}} --discovery-token-ca-cert-hash {{.CAPubKeyPin}}
|
||||
|
||||
`)))
|
||||
|
||||
kubeletFailTempl = template.Must(template.New("init").Parse(dedent.Dedent(`
|
||||
Unfortunately, an error has occurred:
|
||||
{{ .Error }}
|
||||
|
||||
This error is likely caused by that:
|
||||
- The kubelet is not running
|
||||
- The kubelet is unhealthy due to a misconfiguration of the node in some way (required cgroups disabled)
|
||||
- There is no internet connection; so the kubelet can't pull the following control plane images:
|
||||
- {{ .APIServerImage }}
|
||||
- {{ .ControllerManagerImage }}
|
||||
- {{ .SchedulerImage }}
|
||||
|
||||
You can troubleshoot this for example with the following commands if you're on a systemd-powered system:
|
||||
- 'systemctl status kubelet'
|
||||
- 'journalctl -xeu kubelet'
|
||||
`)))
|
||||
)
|
||||
|
||||
// NewCmdInit returns "kubeadm init" command.
|
||||
@ -325,12 +343,17 @@ func (i *Init) Run(out io.Writer) error {
|
||||
// waiter holds the apiclient.Waiter implementation of choice, responsible for querying the API server in various ways and waiting for conditions to be fulfilled
|
||||
waiter := getWaiter(i.dryRun, client)
|
||||
|
||||
fmt.Printf("[init] Waiting for the kubelet to boot up the control plane as Static Pods from directory %q\n", kubeadmconstants.GetStaticPodDirectory())
|
||||
fmt.Println("[init] This process often takes about a minute to perform or longer if the control plane images have to be pulled...")
|
||||
// TODO: Adjust this timeout or start polling the kubelet API
|
||||
// TODO: Make this timeout more realistic when we do create some more complex logic about the interaction with the kubelet
|
||||
if err := waiter.WaitForAPI(); err != nil {
|
||||
return err
|
||||
if err := waitForAPIAndKubelet(waiter); err != nil {
|
||||
ctx := map[string]string{
|
||||
"Error": fmt.Sprintf("%v", err),
|
||||
"APIServerImage": images.GetCoreImage(kubeadmconstants.KubeAPIServer, i.cfg.GetControlPlaneImageRepository(), i.cfg.KubernetesVersion, i.cfg.UnifiedControlPlaneImage),
|
||||
"ControllerManagerImage": images.GetCoreImage(kubeadmconstants.KubeControllerManager, i.cfg.GetControlPlaneImageRepository(), i.cfg.KubernetesVersion, i.cfg.UnifiedControlPlaneImage),
|
||||
"SchedulerImage": images.GetCoreImage(kubeadmconstants.KubeScheduler, i.cfg.GetControlPlaneImageRepository(), i.cfg.KubernetesVersion, i.cfg.UnifiedControlPlaneImage),
|
||||
}
|
||||
|
||||
kubeletFailTempl.Execute(out, ctx)
|
||||
|
||||
return fmt.Errorf("couldn't initialize a Kubernetes cluster")
|
||||
}
|
||||
|
||||
// Upload currently used configuration to the cluster
|
||||
@ -472,11 +495,43 @@ func printFilesIfDryRunning(dryRun bool, manifestDir string) error {
|
||||
return dryrunutil.PrintDryRunFiles(files, os.Stdout)
|
||||
}
|
||||
|
||||
// getWaiter gets the right waiter implementation
|
||||
// getWaiter gets the right waiter implementation for the right occasion
|
||||
func getWaiter(dryRun bool, client clientset.Interface) apiclient.Waiter {
|
||||
if dryRun {
|
||||
return dryrunutil.NewWaiter()
|
||||
}
|
||||
// TODO: Adjust this timeout slightly?
|
||||
return apiclient.NewKubeWaiter(client, 30*time.Minute, os.Stdout)
|
||||
}
|
||||
|
||||
// waitForAPIAndKubelet waits primarily for the API server to come up. If that takes a long time, and the kubelet
|
||||
// /healthz and /healthz/syncloop endpoints continuously are unhealthy, kubeadm will error out after a period of
|
||||
// backoffing exponentially
|
||||
func waitForAPIAndKubelet(waiter apiclient.Waiter) error {
|
||||
errorChan := make(chan error)
|
||||
|
||||
fmt.Printf("[init] Waiting for the kubelet to boot up the control plane as Static Pods from directory %q\n", kubeadmconstants.GetStaticPodDirectory())
|
||||
fmt.Println("[init] This often takes around a minute; or longer if the control plane images have to be pulled.")
|
||||
|
||||
go func(errC chan error, waiter apiclient.Waiter) {
|
||||
// This goroutine can only make kubeadm init fail. If this check succeeds, it won't do anything special
|
||||
if err := waiter.WaitForHealthyKubelet(40*time.Second, "http://localhost:10255/healthz"); err != nil {
|
||||
errC <- err
|
||||
}
|
||||
}(errorChan, waiter)
|
||||
|
||||
go func(errC chan error, waiter apiclient.Waiter) {
|
||||
// This goroutine can only make kubeadm init fail. If this check succeeds, it won't do anything special
|
||||
if err := waiter.WaitForHealthyKubelet(60*time.Second, "http://localhost:10255/healthz/syncloop"); err != nil {
|
||||
errC <- err
|
||||
}
|
||||
}(errorChan, waiter)
|
||||
|
||||
go func(errC chan error, waiter apiclient.Waiter) {
|
||||
// This main goroutine sends whatever WaitForAPI returns (error or not) to the channel
|
||||
// This in order to continue on success (nil error), or just fail if
|
||||
errC <- waiter.WaitForAPI()
|
||||
}(errorChan, waiter)
|
||||
|
||||
// This call is blocking until one of the goroutines sends to errorChan
|
||||
return <-errorChan
|
||||
}
|
||||
|
@ -39,7 +39,7 @@ const (
|
||||
selfHostingWaitTimeout = 2 * time.Minute
|
||||
|
||||
// selfHostingFailureThreshold describes how many times kubeadm will retry creating the DaemonSets
|
||||
selfHostingFailureThreshold uint8 = 5
|
||||
selfHostingFailureThreshold int = 5
|
||||
)
|
||||
|
||||
// CreateSelfHostedControlPlane is responsible for turning a Static Pod-hosted control plane to a self-hosted one
|
||||
|
@ -43,7 +43,7 @@ const (
|
||||
selfHostingWaitTimeout = 2 * time.Minute
|
||||
|
||||
// selfHostingFailureThreshold describes how many times kubeadm will retry creating the DaemonSets
|
||||
selfHostingFailureThreshold uint8 = 10
|
||||
selfHostingFailureThreshold int = 10
|
||||
)
|
||||
|
||||
// controlPlaneComponentResources holds the relevant Pod and DaemonSet associated with a control plane component
|
||||
|
@ -113,6 +113,11 @@ func (w *fakeWaiter) WaitForStaticPodControlPlaneHashChange(_, _, _ string) erro
|
||||
return w.errsToReturn[waitForHashChange]
|
||||
}
|
||||
|
||||
// WaitForHealthyKubelet returns a dummy nil just to implement the interface
|
||||
func (w *fakeWaiter) WaitForHealthyKubelet(_ time.Duration, _ string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
type fakeStaticPodPathManager struct {
|
||||
realManifestDir string
|
||||
tempManifestDir string
|
||||
|
@ -40,10 +40,13 @@ type Waiter interface {
|
||||
WaitForPodsWithLabel(kvLabel string) error
|
||||
// WaitForPodToDisappear waits for the given Pod in the kube-system namespace to be deleted
|
||||
WaitForPodToDisappear(staticPodName string) error
|
||||
// WaitForStaticPodControlPlaneHashes
|
||||
// WaitForStaticPodControlPlaneHashes fetches sha256 hashes for the control plane static pods
|
||||
WaitForStaticPodControlPlaneHashes(nodeName string) (map[string]string, error)
|
||||
// WaitForStaticPodControlPlaneHashChange
|
||||
// WaitForStaticPodControlPlaneHashChange waits for the given static pod component's static pod hash to get updated.
|
||||
// By doing that we can be sure that the kubelet has restarted the given Static Pod
|
||||
WaitForStaticPodControlPlaneHashChange(nodeName, component, previousHash string) error
|
||||
// WaitForHealthyKubelet blocks until the kubelet /healthz endpoint returns 'ok'
|
||||
WaitForHealthyKubelet(initalTimeout time.Duration, healthzEndpoint string) error
|
||||
// SetTimeout adjusts the timeout to the specified duration
|
||||
SetTimeout(timeout time.Duration)
|
||||
}
|
||||
@ -123,6 +126,26 @@ func (w *KubeWaiter) WaitForPodToDisappear(podName string) error {
|
||||
})
|
||||
}
|
||||
|
||||
// WaitForHealthyKubelet blocks until the kubelet /healthz endpoint returns 'ok'
|
||||
func (w *KubeWaiter) WaitForHealthyKubelet(initalTimeout time.Duration, healthzEndpoint string) error {
|
||||
time.Sleep(initalTimeout)
|
||||
return TryRunCommand(func() error {
|
||||
resp, err := http.Get(healthzEndpoint)
|
||||
if err != nil {
|
||||
fmt.Printf("[kubelet-check] It seems like the kubelet isn't running or healthy.\n")
|
||||
fmt.Printf("[kubelet-check] The HTTP call equal to 'curl -sSL %s' failed with error: %v.\n", healthzEndpoint, err)
|
||||
return err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
fmt.Printf("[kubelet-check] It seems like the kubelet isn't running or healthy.")
|
||||
fmt.Printf("[kubelet-check] The HTTP call equal to 'curl -sSL %s' returned HTTP code %d\n", healthzEndpoint, resp.StatusCode)
|
||||
return fmt.Errorf("the kubelet healthz endpoint is unhealthy")
|
||||
}
|
||||
return nil
|
||||
}, 5) // a failureThreshold of five means waiting for a total of 155 seconds
|
||||
}
|
||||
|
||||
// SetTimeout adjusts the timeout to the specified duration
|
||||
func (w *KubeWaiter) SetTimeout(timeout time.Duration) {
|
||||
w.timeout = timeout
|
||||
@ -184,20 +207,19 @@ func getStaticPodControlPlaneHashes(client clientset.Interface, nodeName string)
|
||||
}
|
||||
|
||||
// TryRunCommand runs a function a maximum of failureThreshold times, and retries on error. If failureThreshold is hit; the last error is returned
|
||||
func TryRunCommand(f func() error, failureThreshold uint8) error {
|
||||
var numFailures uint8
|
||||
return wait.PollImmediate(5*time.Second, 20*time.Minute, func() (bool, error) {
|
||||
func TryRunCommand(f func() error, failureThreshold int) error {
|
||||
backoff := wait.Backoff{
|
||||
Duration: 5 * time.Second,
|
||||
Factor: 2, // double the timeout for every failure
|
||||
Steps: failureThreshold,
|
||||
}
|
||||
return wait.ExponentialBackoff(backoff, func() (bool, error) {
|
||||
err := f()
|
||||
if err != nil {
|
||||
numFailures++
|
||||
// If we've reached the maximum amount of failures, error out
|
||||
if numFailures == failureThreshold {
|
||||
return false, err
|
||||
}
|
||||
// Retry
|
||||
// Retry until the timeout
|
||||
return false, nil
|
||||
}
|
||||
// The last f() call was a success!
|
||||
// The last f() call was a success, return cleanly
|
||||
return true, nil
|
||||
})
|
||||
}
|
||||
|
@ -97,6 +97,12 @@ func (w *Waiter) WaitForPodToDisappear(podName string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// WaitForHealthyKubelet blocks until the kubelet /healthz endpoint returns 'ok'
|
||||
func (w *Waiter) WaitForHealthyKubelet(_ time.Duration, healthzEndpoint string) error {
|
||||
fmt.Printf("[dryrun] Would make sure the kubelet %q endpoint is healthy\n", healthzEndpoint)
|
||||
return nil
|
||||
}
|
||||
|
||||
// SetTimeout is a no-op; we don't wait in this implementation
|
||||
func (w *Waiter) SetTimeout(_ time.Duration) {}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user