mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-24 04:06:03 +00:00
kubeadm: Detect kubelet readiness and error out if the kubelet is unhealthy
This commit is contained in:
parent
b3efdebeb6
commit
92c5997b8e
@ -28,6 +28,7 @@ go_library(
|
|||||||
"//cmd/kubeadm/app/constants:go_default_library",
|
"//cmd/kubeadm/app/constants:go_default_library",
|
||||||
"//cmd/kubeadm/app/discovery:go_default_library",
|
"//cmd/kubeadm/app/discovery:go_default_library",
|
||||||
"//cmd/kubeadm/app/features:go_default_library",
|
"//cmd/kubeadm/app/features:go_default_library",
|
||||||
|
"//cmd/kubeadm/app/images:go_default_library",
|
||||||
"//cmd/kubeadm/app/phases/addons/dns:go_default_library",
|
"//cmd/kubeadm/app/phases/addons/dns:go_default_library",
|
||||||
"//cmd/kubeadm/app/phases/addons/proxy:go_default_library",
|
"//cmd/kubeadm/app/phases/addons/proxy:go_default_library",
|
||||||
"//cmd/kubeadm/app/phases/apiconfig:go_default_library",
|
"//cmd/kubeadm/app/phases/apiconfig:go_default_library",
|
||||||
|
@ -37,6 +37,7 @@ import (
|
|||||||
"k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm/validation"
|
"k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm/validation"
|
||||||
kubeadmconstants "k8s.io/kubernetes/cmd/kubeadm/app/constants"
|
kubeadmconstants "k8s.io/kubernetes/cmd/kubeadm/app/constants"
|
||||||
"k8s.io/kubernetes/cmd/kubeadm/app/features"
|
"k8s.io/kubernetes/cmd/kubeadm/app/features"
|
||||||
|
"k8s.io/kubernetes/cmd/kubeadm/app/images"
|
||||||
dnsaddonphase "k8s.io/kubernetes/cmd/kubeadm/app/phases/addons/dns"
|
dnsaddonphase "k8s.io/kubernetes/cmd/kubeadm/app/phases/addons/dns"
|
||||||
proxyaddonphase "k8s.io/kubernetes/cmd/kubeadm/app/phases/addons/proxy"
|
proxyaddonphase "k8s.io/kubernetes/cmd/kubeadm/app/phases/addons/proxy"
|
||||||
apiconfigphase "k8s.io/kubernetes/cmd/kubeadm/app/phases/apiconfig"
|
apiconfigphase "k8s.io/kubernetes/cmd/kubeadm/app/phases/apiconfig"
|
||||||
@ -81,6 +82,23 @@ var (
|
|||||||
kubeadm join --token {{.Token}} {{.MasterHostPort}} --discovery-token-ca-cert-hash {{.CAPubKeyPin}}
|
kubeadm join --token {{.Token}} {{.MasterHostPort}} --discovery-token-ca-cert-hash {{.CAPubKeyPin}}
|
||||||
|
|
||||||
`)))
|
`)))
|
||||||
|
|
||||||
|
kubeletFailTempl = template.Must(template.New("init").Parse(dedent.Dedent(`
|
||||||
|
Unfortunately, an error has occurred:
|
||||||
|
{{ .Error }}
|
||||||
|
|
||||||
|
This error is likely caused by that:
|
||||||
|
- The kubelet is not running
|
||||||
|
- The kubelet is unhealthy due to a misconfiguration of the node in some way (required cgroups disabled)
|
||||||
|
- There is no internet connection; so the kubelet can't pull the following control plane images:
|
||||||
|
- {{ .APIServerImage }}
|
||||||
|
- {{ .ControllerManagerImage }}
|
||||||
|
- {{ .SchedulerImage }}
|
||||||
|
|
||||||
|
You can troubleshoot this for example with the following commands if you're on a systemd-powered system:
|
||||||
|
- 'systemctl status kubelet'
|
||||||
|
- 'journalctl -xeu kubelet'
|
||||||
|
`)))
|
||||||
)
|
)
|
||||||
|
|
||||||
// NewCmdInit returns "kubeadm init" command.
|
// NewCmdInit returns "kubeadm init" command.
|
||||||
@ -325,12 +343,17 @@ func (i *Init) Run(out io.Writer) error {
|
|||||||
// waiter holds the apiclient.Waiter implementation of choice, responsible for querying the API server in various ways and waiting for conditions to be fulfilled
|
// waiter holds the apiclient.Waiter implementation of choice, responsible for querying the API server in various ways and waiting for conditions to be fulfilled
|
||||||
waiter := getWaiter(i.dryRun, client)
|
waiter := getWaiter(i.dryRun, client)
|
||||||
|
|
||||||
fmt.Printf("[init] Waiting for the kubelet to boot up the control plane as Static Pods from directory %q\n", kubeadmconstants.GetStaticPodDirectory())
|
if err := waitForAPIAndKubelet(waiter); err != nil {
|
||||||
fmt.Println("[init] This process often takes about a minute to perform or longer if the control plane images have to be pulled...")
|
ctx := map[string]string{
|
||||||
// TODO: Adjust this timeout or start polling the kubelet API
|
"Error": fmt.Sprintf("%v", err),
|
||||||
// TODO: Make this timeout more realistic when we do create some more complex logic about the interaction with the kubelet
|
"APIServerImage": images.GetCoreImage(kubeadmconstants.KubeAPIServer, i.cfg.GetControlPlaneImageRepository(), i.cfg.KubernetesVersion, i.cfg.UnifiedControlPlaneImage),
|
||||||
if err := waiter.WaitForAPI(); err != nil {
|
"ControllerManagerImage": images.GetCoreImage(kubeadmconstants.KubeControllerManager, i.cfg.GetControlPlaneImageRepository(), i.cfg.KubernetesVersion, i.cfg.UnifiedControlPlaneImage),
|
||||||
return err
|
"SchedulerImage": images.GetCoreImage(kubeadmconstants.KubeScheduler, i.cfg.GetControlPlaneImageRepository(), i.cfg.KubernetesVersion, i.cfg.UnifiedControlPlaneImage),
|
||||||
|
}
|
||||||
|
|
||||||
|
kubeletFailTempl.Execute(out, ctx)
|
||||||
|
|
||||||
|
return fmt.Errorf("couldn't initialize a Kubernetes cluster")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Upload currently used configuration to the cluster
|
// Upload currently used configuration to the cluster
|
||||||
@ -472,11 +495,43 @@ func printFilesIfDryRunning(dryRun bool, manifestDir string) error {
|
|||||||
return dryrunutil.PrintDryRunFiles(files, os.Stdout)
|
return dryrunutil.PrintDryRunFiles(files, os.Stdout)
|
||||||
}
|
}
|
||||||
|
|
||||||
// getWaiter gets the right waiter implementation
|
// getWaiter gets the right waiter implementation for the right occasion
|
||||||
func getWaiter(dryRun bool, client clientset.Interface) apiclient.Waiter {
|
func getWaiter(dryRun bool, client clientset.Interface) apiclient.Waiter {
|
||||||
if dryRun {
|
if dryRun {
|
||||||
return dryrunutil.NewWaiter()
|
return dryrunutil.NewWaiter()
|
||||||
}
|
}
|
||||||
// TODO: Adjust this timeout slightly?
|
|
||||||
return apiclient.NewKubeWaiter(client, 30*time.Minute, os.Stdout)
|
return apiclient.NewKubeWaiter(client, 30*time.Minute, os.Stdout)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// waitForAPIAndKubelet waits primarily for the API server to come up. If that takes a long time, and the kubelet
|
||||||
|
// /healthz and /healthz/syncloop endpoints continuously are unhealthy, kubeadm will error out after a period of
|
||||||
|
// backoffing exponentially
|
||||||
|
func waitForAPIAndKubelet(waiter apiclient.Waiter) error {
|
||||||
|
errorChan := make(chan error)
|
||||||
|
|
||||||
|
fmt.Printf("[init] Waiting for the kubelet to boot up the control plane as Static Pods from directory %q\n", kubeadmconstants.GetStaticPodDirectory())
|
||||||
|
fmt.Println("[init] This often takes around a minute; or longer if the control plane images have to be pulled.")
|
||||||
|
|
||||||
|
go func(errC chan error, waiter apiclient.Waiter) {
|
||||||
|
// This goroutine can only make kubeadm init fail. If this check succeeds, it won't do anything special
|
||||||
|
if err := waiter.WaitForHealthyKubelet(40*time.Second, "http://localhost:10255/healthz"); err != nil {
|
||||||
|
errC <- err
|
||||||
|
}
|
||||||
|
}(errorChan, waiter)
|
||||||
|
|
||||||
|
go func(errC chan error, waiter apiclient.Waiter) {
|
||||||
|
// This goroutine can only make kubeadm init fail. If this check succeeds, it won't do anything special
|
||||||
|
if err := waiter.WaitForHealthyKubelet(60*time.Second, "http://localhost:10255/healthz/syncloop"); err != nil {
|
||||||
|
errC <- err
|
||||||
|
}
|
||||||
|
}(errorChan, waiter)
|
||||||
|
|
||||||
|
go func(errC chan error, waiter apiclient.Waiter) {
|
||||||
|
// This main goroutine sends whatever WaitForAPI returns (error or not) to the channel
|
||||||
|
// This in order to continue on success (nil error), or just fail if
|
||||||
|
errC <- waiter.WaitForAPI()
|
||||||
|
}(errorChan, waiter)
|
||||||
|
|
||||||
|
// This call is blocking until one of the goroutines sends to errorChan
|
||||||
|
return <-errorChan
|
||||||
|
}
|
||||||
|
@ -39,7 +39,7 @@ const (
|
|||||||
selfHostingWaitTimeout = 2 * time.Minute
|
selfHostingWaitTimeout = 2 * time.Minute
|
||||||
|
|
||||||
// selfHostingFailureThreshold describes how many times kubeadm will retry creating the DaemonSets
|
// selfHostingFailureThreshold describes how many times kubeadm will retry creating the DaemonSets
|
||||||
selfHostingFailureThreshold uint8 = 5
|
selfHostingFailureThreshold int = 5
|
||||||
)
|
)
|
||||||
|
|
||||||
// CreateSelfHostedControlPlane is responsible for turning a Static Pod-hosted control plane to a self-hosted one
|
// CreateSelfHostedControlPlane is responsible for turning a Static Pod-hosted control plane to a self-hosted one
|
||||||
|
@ -43,7 +43,7 @@ const (
|
|||||||
selfHostingWaitTimeout = 2 * time.Minute
|
selfHostingWaitTimeout = 2 * time.Minute
|
||||||
|
|
||||||
// selfHostingFailureThreshold describes how many times kubeadm will retry creating the DaemonSets
|
// selfHostingFailureThreshold describes how many times kubeadm will retry creating the DaemonSets
|
||||||
selfHostingFailureThreshold uint8 = 10
|
selfHostingFailureThreshold int = 10
|
||||||
)
|
)
|
||||||
|
|
||||||
// controlPlaneComponentResources holds the relevant Pod and DaemonSet associated with a control plane component
|
// controlPlaneComponentResources holds the relevant Pod and DaemonSet associated with a control plane component
|
||||||
|
@ -113,6 +113,11 @@ func (w *fakeWaiter) WaitForStaticPodControlPlaneHashChange(_, _, _ string) erro
|
|||||||
return w.errsToReturn[waitForHashChange]
|
return w.errsToReturn[waitForHashChange]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// WaitForHealthyKubelet returns a dummy nil just to implement the interface
|
||||||
|
func (w *fakeWaiter) WaitForHealthyKubelet(_ time.Duration, _ string) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
type fakeStaticPodPathManager struct {
|
type fakeStaticPodPathManager struct {
|
||||||
realManifestDir string
|
realManifestDir string
|
||||||
tempManifestDir string
|
tempManifestDir string
|
||||||
|
@ -40,10 +40,13 @@ type Waiter interface {
|
|||||||
WaitForPodsWithLabel(kvLabel string) error
|
WaitForPodsWithLabel(kvLabel string) error
|
||||||
// WaitForPodToDisappear waits for the given Pod in the kube-system namespace to be deleted
|
// WaitForPodToDisappear waits for the given Pod in the kube-system namespace to be deleted
|
||||||
WaitForPodToDisappear(staticPodName string) error
|
WaitForPodToDisappear(staticPodName string) error
|
||||||
// WaitForStaticPodControlPlaneHashes
|
// WaitForStaticPodControlPlaneHashes fetches sha256 hashes for the control plane static pods
|
||||||
WaitForStaticPodControlPlaneHashes(nodeName string) (map[string]string, error)
|
WaitForStaticPodControlPlaneHashes(nodeName string) (map[string]string, error)
|
||||||
// WaitForStaticPodControlPlaneHashChange
|
// WaitForStaticPodControlPlaneHashChange waits for the given static pod component's static pod hash to get updated.
|
||||||
|
// By doing that we can be sure that the kubelet has restarted the given Static Pod
|
||||||
WaitForStaticPodControlPlaneHashChange(nodeName, component, previousHash string) error
|
WaitForStaticPodControlPlaneHashChange(nodeName, component, previousHash string) error
|
||||||
|
// WaitForHealthyKubelet blocks until the kubelet /healthz endpoint returns 'ok'
|
||||||
|
WaitForHealthyKubelet(initalTimeout time.Duration, healthzEndpoint string) error
|
||||||
// SetTimeout adjusts the timeout to the specified duration
|
// SetTimeout adjusts the timeout to the specified duration
|
||||||
SetTimeout(timeout time.Duration)
|
SetTimeout(timeout time.Duration)
|
||||||
}
|
}
|
||||||
@ -123,6 +126,26 @@ func (w *KubeWaiter) WaitForPodToDisappear(podName string) error {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// WaitForHealthyKubelet blocks until the kubelet /healthz endpoint returns 'ok'
|
||||||
|
func (w *KubeWaiter) WaitForHealthyKubelet(initalTimeout time.Duration, healthzEndpoint string) error {
|
||||||
|
time.Sleep(initalTimeout)
|
||||||
|
return TryRunCommand(func() error {
|
||||||
|
resp, err := http.Get(healthzEndpoint)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Printf("[kubelet-check] It seems like the kubelet isn't running or healthy.\n")
|
||||||
|
fmt.Printf("[kubelet-check] The HTTP call equal to 'curl -sSL %s' failed with error: %v.\n", healthzEndpoint, err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer resp.Body.Close()
|
||||||
|
if resp.StatusCode != http.StatusOK {
|
||||||
|
fmt.Printf("[kubelet-check] It seems like the kubelet isn't running or healthy.")
|
||||||
|
fmt.Printf("[kubelet-check] The HTTP call equal to 'curl -sSL %s' returned HTTP code %d\n", healthzEndpoint, resp.StatusCode)
|
||||||
|
return fmt.Errorf("the kubelet healthz endpoint is unhealthy")
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}, 5) // a failureThreshold of five means waiting for a total of 155 seconds
|
||||||
|
}
|
||||||
|
|
||||||
// SetTimeout adjusts the timeout to the specified duration
|
// SetTimeout adjusts the timeout to the specified duration
|
||||||
func (w *KubeWaiter) SetTimeout(timeout time.Duration) {
|
func (w *KubeWaiter) SetTimeout(timeout time.Duration) {
|
||||||
w.timeout = timeout
|
w.timeout = timeout
|
||||||
@ -184,20 +207,19 @@ func getStaticPodControlPlaneHashes(client clientset.Interface, nodeName string)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// TryRunCommand runs a function a maximum of failureThreshold times, and retries on error. If failureThreshold is hit; the last error is returned
|
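// TryRunCommand runs a function a maximum of failureThreshold times, retrying on error with an exponentially growing delay between attempts. If failureThreshold is hit, the timeout error from wait.ExponentialBackoff is returned (not the last error from f)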
|
||||||
func TryRunCommand(f func() error, failureThreshold uint8) error {
|
func TryRunCommand(f func() error, failureThreshold int) error {
|
||||||
var numFailures uint8
|
backoff := wait.Backoff{
|
||||||
return wait.PollImmediate(5*time.Second, 20*time.Minute, func() (bool, error) {
|
Duration: 5 * time.Second,
|
||||||
|
Factor: 2, // double the timeout for every failure
|
||||||
|
Steps: failureThreshold,
|
||||||
|
}
|
||||||
|
return wait.ExponentialBackoff(backoff, func() (bool, error) {
|
||||||
err := f()
|
err := f()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
numFailures++
|
// Retry until the timeout
|
||||||
// If we've reached the maximum amount of failures, error out
|
|
||||||
if numFailures == failureThreshold {
|
|
||||||
return false, err
|
|
||||||
}
|
|
||||||
// Retry
|
|
||||||
return false, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
// The last f() call was a success!
|
// The last f() call was a success, return cleanly
|
||||||
return true, nil
|
return true, nil
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -97,6 +97,12 @@ func (w *Waiter) WaitForPodToDisappear(podName string) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// WaitForHealthyKubelet blocks until the kubelet /healthz endpoint returns 'ok'
|
||||||
|
func (w *Waiter) WaitForHealthyKubelet(_ time.Duration, healthzEndpoint string) error {
|
||||||
|
fmt.Printf("[dryrun] Would make sure the kubelet %q endpoint is healthy\n", healthzEndpoint)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// SetTimeout is a no-op; we don't wait in this implementation
|
// SetTimeout is a no-op; we don't wait in this implementation
|
||||||
func (w *Waiter) SetTimeout(_ time.Duration) {}
|
func (w *Waiter) SetTimeout(_ time.Duration) {}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user