kubeadm: Detect kubelet readiness and error out if the kubelet is unhealthy

This commit is contained in:
Lucas Käldström 2017-09-03 18:02:46 +03:00
parent b3efdebeb6
commit 92c5997b8e
No known key found for this signature in database
GPG Key ID: 3FA3783D77751514
7 changed files with 111 additions and 22 deletions

View File

@ -28,6 +28,7 @@ go_library(
"//cmd/kubeadm/app/constants:go_default_library",
"//cmd/kubeadm/app/discovery:go_default_library",
"//cmd/kubeadm/app/features:go_default_library",
"//cmd/kubeadm/app/images:go_default_library",
"//cmd/kubeadm/app/phases/addons/dns:go_default_library",
"//cmd/kubeadm/app/phases/addons/proxy:go_default_library",
"//cmd/kubeadm/app/phases/apiconfig:go_default_library",

View File

@ -37,6 +37,7 @@ import (
"k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm/validation"
kubeadmconstants "k8s.io/kubernetes/cmd/kubeadm/app/constants"
"k8s.io/kubernetes/cmd/kubeadm/app/features"
"k8s.io/kubernetes/cmd/kubeadm/app/images"
dnsaddonphase "k8s.io/kubernetes/cmd/kubeadm/app/phases/addons/dns"
proxyaddonphase "k8s.io/kubernetes/cmd/kubeadm/app/phases/addons/proxy"
apiconfigphase "k8s.io/kubernetes/cmd/kubeadm/app/phases/apiconfig"
@ -81,6 +82,23 @@ var (
kubeadm join --token {{.Token}} {{.MasterHostPort}} --discovery-token-ca-cert-hash {{.CAPubKeyPin}}
`)))
kubeletFailTempl = template.Must(template.New("init").Parse(dedent.Dedent(`
Unfortunately, an error has occurred:
{{ .Error }}
This error is likely caused by that:
- The kubelet is not running
- The kubelet is unhealthy due to a misconfiguration of the node in some way (required cgroups disabled)
- There is no internet connection; so the kubelet can't pull the following control plane images:
- {{ .APIServerImage }}
- {{ .ControllerManagerImage }}
- {{ .SchedulerImage }}
You can troubleshoot this for example with the following commands if you're on a systemd-powered system:
- 'systemctl status kubelet'
- 'journalctl -xeu kubelet'
`)))
)
// NewCmdInit returns "kubeadm init" command.
@ -325,12 +343,17 @@ func (i *Init) Run(out io.Writer) error {
// waiter holds the apiclient.Waiter implementation of choice, responsible for querying the API server in various ways and waiting for conditions to be fulfilled
waiter := getWaiter(i.dryRun, client)
fmt.Printf("[init] Waiting for the kubelet to boot up the control plane as Static Pods from directory %q\n", kubeadmconstants.GetStaticPodDirectory())
fmt.Println("[init] This process often takes about a minute to perform or longer if the control plane images have to be pulled...")
// TODO: Adjust this timeout or start polling the kubelet API
// TODO: Make this timeout more realistic when we do create some more complex logic about the interaction with the kubelet
if err := waiter.WaitForAPI(); err != nil {
return err
if err := waitForAPIAndKubelet(waiter); err != nil {
ctx := map[string]string{
"Error": fmt.Sprintf("%v", err),
"APIServerImage": images.GetCoreImage(kubeadmconstants.KubeAPIServer, i.cfg.GetControlPlaneImageRepository(), i.cfg.KubernetesVersion, i.cfg.UnifiedControlPlaneImage),
"ControllerManagerImage": images.GetCoreImage(kubeadmconstants.KubeControllerManager, i.cfg.GetControlPlaneImageRepository(), i.cfg.KubernetesVersion, i.cfg.UnifiedControlPlaneImage),
"SchedulerImage": images.GetCoreImage(kubeadmconstants.KubeScheduler, i.cfg.GetControlPlaneImageRepository(), i.cfg.KubernetesVersion, i.cfg.UnifiedControlPlaneImage),
}
kubeletFailTempl.Execute(out, ctx)
return fmt.Errorf("couldn't initialize a Kubernetes cluster")
}
// Upload currently used configuration to the cluster
@ -472,11 +495,43 @@ func printFilesIfDryRunning(dryRun bool, manifestDir string) error {
return dryrunutil.PrintDryRunFiles(files, os.Stdout)
}
// getWaiter gets the right waiter implementation
// getWaiter gets the right waiter implementation for the right occasion
func getWaiter(dryRun bool, client clientset.Interface) apiclient.Waiter {
if dryRun {
return dryrunutil.NewWaiter()
}
// TODO: Adjust this timeout slightly?
return apiclient.NewKubeWaiter(client, 30*time.Minute, os.Stdout)
}
// waitForAPIAndKubelet waits primarily for the API server to come up. If that takes a long time, and the kubelet
// /healthz and /healthz/syncloop endpoints continuously are unhealthy, kubeadm will error out after a period of
// backoffing exponentially
func waitForAPIAndKubelet(waiter apiclient.Waiter) error {
errorChan := make(chan error)
fmt.Printf("[init] Waiting for the kubelet to boot up the control plane as Static Pods from directory %q\n", kubeadmconstants.GetStaticPodDirectory())
fmt.Println("[init] This often takes around a minute; or longer if the control plane images have to be pulled.")
go func(errC chan error, waiter apiclient.Waiter) {
// This goroutine can only make kubeadm init fail. If this check succeeds, it won't do anything special
if err := waiter.WaitForHealthyKubelet(40*time.Second, "http://localhost:10255/healthz"); err != nil {
errC <- err
}
}(errorChan, waiter)
go func(errC chan error, waiter apiclient.Waiter) {
// This goroutine can only make kubeadm init fail. If this check succeeds, it won't do anything special
if err := waiter.WaitForHealthyKubelet(60*time.Second, "http://localhost:10255/healthz/syncloop"); err != nil {
errC <- err
}
}(errorChan, waiter)
go func(errC chan error, waiter apiclient.Waiter) {
// This main goroutine sends whatever WaitForAPI returns (error or not) to the channel
// This in order to continue on success (nil error), or just fail if
errC <- waiter.WaitForAPI()
}(errorChan, waiter)
// This call is blocking until one of the goroutines sends to errorChan
return <-errorChan
}

View File

@ -39,7 +39,7 @@ const (
selfHostingWaitTimeout = 2 * time.Minute
// selfHostingFailureThreshold describes how many times kubeadm will retry creating the DaemonSets
selfHostingFailureThreshold uint8 = 5
selfHostingFailureThreshold int = 5
)
// CreateSelfHostedControlPlane is responsible for turning a Static Pod-hosted control plane to a self-hosted one

View File

@ -43,7 +43,7 @@ const (
selfHostingWaitTimeout = 2 * time.Minute
// selfHostingFailureThreshold describes how many times kubeadm will retry creating the DaemonSets
selfHostingFailureThreshold uint8 = 10
selfHostingFailureThreshold int = 10
)
// controlPlaneComponentResources holds the relevant Pod and DaemonSet associated with a control plane component

View File

@ -113,6 +113,11 @@ func (w *fakeWaiter) WaitForStaticPodControlPlaneHashChange(_, _, _ string) erro
return w.errsToReturn[waitForHashChange]
}
// WaitForHealthyKubelet returns a dummy nil just to implement the interface
func (w *fakeWaiter) WaitForHealthyKubelet(_ time.Duration, _ string) error {
return nil
}
type fakeStaticPodPathManager struct {
realManifestDir string
tempManifestDir string

View File

@ -40,10 +40,13 @@ type Waiter interface {
WaitForPodsWithLabel(kvLabel string) error
// WaitForPodToDisappear waits for the given Pod in the kube-system namespace to be deleted
WaitForPodToDisappear(staticPodName string) error
// WaitForStaticPodControlPlaneHashes
// WaitForStaticPodControlPlaneHashes fetches sha256 hashes for the control plane static pods
WaitForStaticPodControlPlaneHashes(nodeName string) (map[string]string, error)
// WaitForStaticPodControlPlaneHashChange
// WaitForStaticPodControlPlaneHashChange waits for the given static pod component's static pod hash to get updated.
// By doing that we can be sure that the kubelet has restarted the given Static Pod
WaitForStaticPodControlPlaneHashChange(nodeName, component, previousHash string) error
// WaitForHealthyKubelet blocks until the kubelet /healthz endpoint returns 'ok'
WaitForHealthyKubelet(initalTimeout time.Duration, healthzEndpoint string) error
// SetTimeout adjusts the timeout to the specified duration
SetTimeout(timeout time.Duration)
}
@ -123,6 +126,26 @@ func (w *KubeWaiter) WaitForPodToDisappear(podName string) error {
})
}
// WaitForHealthyKubelet blocks until the kubelet /healthz endpoint returns 'ok'
func (w *KubeWaiter) WaitForHealthyKubelet(initalTimeout time.Duration, healthzEndpoint string) error {
time.Sleep(initalTimeout)
return TryRunCommand(func() error {
resp, err := http.Get(healthzEndpoint)
if err != nil {
fmt.Printf("[kubelet-check] It seems like the kubelet isn't running or healthy.\n")
fmt.Printf("[kubelet-check] The HTTP call equal to 'curl -sSL %s' failed with error: %v.\n", healthzEndpoint, err)
return err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
fmt.Printf("[kubelet-check] It seems like the kubelet isn't running or healthy.")
fmt.Printf("[kubelet-check] The HTTP call equal to 'curl -sSL %s' returned HTTP code %d\n", healthzEndpoint, resp.StatusCode)
return fmt.Errorf("the kubelet healthz endpoint is unhealthy")
}
return nil
}, 5) // a failureThreshold of five means waiting for a total of 155 seconds
}
// SetTimeout adjusts the timeout to the specified duration
func (w *KubeWaiter) SetTimeout(timeout time.Duration) {
w.timeout = timeout
@ -184,20 +207,19 @@ func getStaticPodControlPlaneHashes(client clientset.Interface, nodeName string)
}
// TryRunCommand runs a function a maximum of failureThreshold times, and retries on error. If failureThreshold is hit; the last error is returned
func TryRunCommand(f func() error, failureThreshold uint8) error {
var numFailures uint8
return wait.PollImmediate(5*time.Second, 20*time.Minute, func() (bool, error) {
func TryRunCommand(f func() error, failureThreshold int) error {
backoff := wait.Backoff{
Duration: 5 * time.Second,
Factor: 2, // double the timeout for every failure
Steps: failureThreshold,
}
return wait.ExponentialBackoff(backoff, func() (bool, error) {
err := f()
if err != nil {
numFailures++
// If we've reached the maximum amount of failures, error out
if numFailures == failureThreshold {
return false, err
}
// Retry
// Retry until the timeout
return false, nil
}
// The last f() call was a success!
// The last f() call was a success, return cleanly
return true, nil
})
}

View File

@ -97,6 +97,12 @@ func (w *Waiter) WaitForPodToDisappear(podName string) error {
return nil
}
// WaitForHealthyKubelet blocks until the kubelet /healthz endpoint returns 'ok'
func (w *Waiter) WaitForHealthyKubelet(_ time.Duration, healthzEndpoint string) error {
fmt.Printf("[dryrun] Would make sure the kubelet %q endpoint is healthy\n", healthzEndpoint)
return nil
}
// SetTimeout is a no-op; we don't wait in this implementation
func (w *Waiter) SetTimeout(_ time.Duration) {}