diff --git a/cmd/kubeadm/app/cmd/BUILD b/cmd/kubeadm/app/cmd/BUILD
index 6c261ac2c68..3603c31e1b5 100644
--- a/cmd/kubeadm/app/cmd/BUILD
+++ b/cmd/kubeadm/app/cmd/BUILD
@@ -28,6 +28,7 @@ go_library(
         "//cmd/kubeadm/app/constants:go_default_library",
         "//cmd/kubeadm/app/discovery:go_default_library",
         "//cmd/kubeadm/app/features:go_default_library",
+        "//cmd/kubeadm/app/images:go_default_library",
         "//cmd/kubeadm/app/phases/addons/dns:go_default_library",
         "//cmd/kubeadm/app/phases/addons/proxy:go_default_library",
         "//cmd/kubeadm/app/phases/apiconfig:go_default_library",
diff --git a/cmd/kubeadm/app/cmd/init.go b/cmd/kubeadm/app/cmd/init.go
index 60caf43bed6..7f3c0c5590e 100644
--- a/cmd/kubeadm/app/cmd/init.go
+++ b/cmd/kubeadm/app/cmd/init.go
@@ -37,6 +37,7 @@ import (
 	"k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm/validation"
 	kubeadmconstants "k8s.io/kubernetes/cmd/kubeadm/app/constants"
 	"k8s.io/kubernetes/cmd/kubeadm/app/features"
+	"k8s.io/kubernetes/cmd/kubeadm/app/images"
 	dnsaddonphase "k8s.io/kubernetes/cmd/kubeadm/app/phases/addons/dns"
 	proxyaddonphase "k8s.io/kubernetes/cmd/kubeadm/app/phases/addons/proxy"
 	apiconfigphase "k8s.io/kubernetes/cmd/kubeadm/app/phases/apiconfig"
@@ -81,6 +82,23 @@ var (
 		  kubeadm join --token {{.Token}} {{.MasterHostPort}} --discovery-token-ca-cert-hash {{.CAPubKeyPin}}

 		`)))
+
+	kubeletFailTempl = template.Must(template.New("init").Parse(dedent.Dedent(`
+		Unfortunately, an error has occurred:
+			{{ .Error }}
+
+		This error is likely caused by that:
+			- The kubelet is not running
+			- The kubelet is unhealthy due to a misconfiguration of the node in some way (required cgroups disabled)
+			- There is no internet connection; so the kubelet can't pull the following control plane images:
+				- {{ .APIServerImage }}
+				- {{ .ControllerManagerImage }}
+				- {{ .SchedulerImage }}
+
+		You can troubleshoot this for example with the following commands if you're on a systemd-powered system:
+			- 'systemctl status kubelet'
+			- 'journalctl -xeu kubelet'
+		`)))
 )

 // NewCmdInit returns "kubeadm init" command.
@@ -325,12 +343,17 @@ func (i *Init) Run(out io.Writer) error {
 	// waiter holds the apiclient.Waiter implementation of choice, responsible for querying the API server in various ways and waiting for conditions to be fulfilled
 	waiter := getWaiter(i.dryRun, client)

-	fmt.Printf("[init] Waiting for the kubelet to boot up the control plane as Static Pods from directory %q\n", kubeadmconstants.GetStaticPodDirectory())
-	fmt.Println("[init] This process often takes about a minute to perform or longer if the control plane images have to be pulled...")
-	// TODO: Adjust this timeout or start polling the kubelet API
-	// TODO: Make this timeout more realistic when we do create some more complex logic about the interaction with the kubelet
-	if err := waiter.WaitForAPI(); err != nil {
-		return err
+	if err := waitForAPIAndKubelet(waiter); err != nil {
+		ctx := map[string]string{
+			"Error":                  fmt.Sprintf("%v", err),
+			"APIServerImage":         images.GetCoreImage(kubeadmconstants.KubeAPIServer, i.cfg.GetControlPlaneImageRepository(), i.cfg.KubernetesVersion, i.cfg.UnifiedControlPlaneImage),
+			"ControllerManagerImage": images.GetCoreImage(kubeadmconstants.KubeControllerManager, i.cfg.GetControlPlaneImageRepository(), i.cfg.KubernetesVersion, i.cfg.UnifiedControlPlaneImage),
+			"SchedulerImage":         images.GetCoreImage(kubeadmconstants.KubeScheduler, i.cfg.GetControlPlaneImageRepository(), i.cfg.KubernetesVersion, i.cfg.UnifiedControlPlaneImage),
+		}
+		if tmplErr := kubeletFailTempl.Execute(out, ctx); tmplErr != nil {
+			return fmt.Errorf("couldn't initialize a Kubernetes cluster: %v (printing troubleshooting help failed: %v)", err, tmplErr)
+		}
+		return fmt.Errorf("couldn't initialize a Kubernetes cluster")
 	}

 	// Upload currently used configuration to the cluster
@@ -472,11 +495,43 @@ func printFilesIfDryRunning(dryRun bool, manifestDir string) error {
 	return dryrunutil.PrintDryRunFiles(files, os.Stdout)
 }

-// getWaiter gets the right waiter implementation
+// getWaiter gets the right waiter implementation for the right occasion
 func getWaiter(dryRun bool, client clientset.Interface) apiclient.Waiter {
 	if dryRun {
 		return dryrunutil.NewWaiter()
 	}
-	// TODO: Adjust this timeout slightly?
 	return apiclient.NewKubeWaiter(client, 30*time.Minute, os.Stdout)
 }
+
+// waitForAPIAndKubelet waits primarily for the API server to come up. Two helper goroutines probe the kubelet
+// /healthz and /healthz/syncloop endpoints in the meantime and can only report failure. The first result sent
+// to the channel (a kubelet check error, or whatever WaitForAPI returns) is what this function returns
+func waitForAPIAndKubelet(waiter apiclient.Waiter) error {
+	errorChan := make(chan error, 3) // buffered: one slot per goroutine, so a late sender never blocks forever
+
+	fmt.Printf("[init] Waiting for the kubelet to boot up the control plane as Static Pods from directory %q\n", kubeadmconstants.GetStaticPodDirectory())
+	fmt.Println("[init] This often takes around a minute; or longer if the control plane images have to be pulled.")
+
+	go func(errC chan error, waiter apiclient.Waiter) {
+		// This goroutine can only make kubeadm init fail. If this check succeeds, it won't do anything special
+		if err := waiter.WaitForHealthyKubelet(40*time.Second, "http://localhost:10255/healthz"); err != nil {
+			errC <- err
+		}
+	}(errorChan, waiter)
+
+	go func(errC chan error, waiter apiclient.Waiter) {
+		// This goroutine can only make kubeadm init fail. If this check succeeds, it won't do anything special
+		if err := waiter.WaitForHealthyKubelet(60*time.Second, "http://localhost:10255/healthz/syncloop"); err != nil {
+			errC <- err
+		}
+	}(errorChan, waiter)
+
+	go func(errC chan error, waiter apiclient.Waiter) {
+		// This main goroutine sends whatever WaitForAPI returns (error or not) to the channel,
+		// so that init continues on success (nil error) or fails with the error WaitForAPI returned
+		errC <- waiter.WaitForAPI()
+	}(errorChan, waiter)
+
+	// This call is blocking until one of the goroutines sends to errorChan
+	return <-errorChan
+}
diff --git a/cmd/kubeadm/app/phases/selfhosting/selfhosting.go b/cmd/kubeadm/app/phases/selfhosting/selfhosting.go
index fc19d07edb8..010dcfd9eef 100644
--- a/cmd/kubeadm/app/phases/selfhosting/selfhosting.go
+++ b/cmd/kubeadm/app/phases/selfhosting/selfhosting.go
@@ -39,7 +39,7 @@ const (
 	selfHostingWaitTimeout = 2 * time.Minute

 	// selfHostingFailureThreshold describes how many times kubeadm will retry creating the DaemonSets
-	selfHostingFailureThreshold uint8 = 5
+	selfHostingFailureThreshold int = 5
 )

 // CreateSelfHostedControlPlane is responsible for turning a Static Pod-hosted control plane to a self-hosted one
diff --git a/cmd/kubeadm/app/phases/upgrade/selfhosted.go b/cmd/kubeadm/app/phases/upgrade/selfhosted.go
index 8f849dff178..b6a02320e20 100644
--- a/cmd/kubeadm/app/phases/upgrade/selfhosted.go
+++ b/cmd/kubeadm/app/phases/upgrade/selfhosted.go
@@ -43,7 +43,7 @@ const (
 	selfHostingWaitTimeout = 2 * time.Minute

 	// selfHostingFailureThreshold describes how many times kubeadm will retry creating the DaemonSets
-	selfHostingFailureThreshold uint8 = 10
+	selfHostingFailureThreshold int = 10
 )

 // controlPlaneComponentResources holds the relevant Pod and DaemonSet associated with a control plane component
diff --git a/cmd/kubeadm/app/phases/upgrade/staticpods_test.go b/cmd/kubeadm/app/phases/upgrade/staticpods_test.go
index 301fc20a54b..1b0270823b3 100644
--- a/cmd/kubeadm/app/phases/upgrade/staticpods_test.go
+++ b/cmd/kubeadm/app/phases/upgrade/staticpods_test.go
@@ -113,6 +113,11 @@ func (w *fakeWaiter) WaitForStaticPodControlPlaneHashChange(_, _, _ string) erro
 	return w.errsToReturn[waitForHashChange]
 }

+// WaitForHealthyKubelet returns a dummy nil just to implement the interface
+func (w *fakeWaiter) WaitForHealthyKubelet(_ time.Duration, _ string) error {
+	return nil
+}
+
 type fakeStaticPodPathManager struct {
 	realManifestDir string
 	tempManifestDir string
diff --git a/cmd/kubeadm/app/util/apiclient/wait.go b/cmd/kubeadm/app/util/apiclient/wait.go
index 5690572b18d..0ac79dd7ef4 100644
--- a/cmd/kubeadm/app/util/apiclient/wait.go
+++ b/cmd/kubeadm/app/util/apiclient/wait.go
@@ -40,10 +40,13 @@ type Waiter interface {
 	WaitForPodsWithLabel(kvLabel string) error
 	// WaitForPodToDisappear waits for the given Pod in the kube-system namespace to be deleted
 	WaitForPodToDisappear(staticPodName string) error
-	// WaitForStaticPodControlPlaneHashes
+	// WaitForStaticPodControlPlaneHashes fetches sha256 hashes for the control plane static pods
 	WaitForStaticPodControlPlaneHashes(nodeName string) (map[string]string, error)
-	// WaitForStaticPodControlPlaneHashChange
+	// WaitForStaticPodControlPlaneHashChange waits for the given static pod component's static pod hash to get updated.
+	// By doing that we can be sure that the kubelet has restarted the given Static Pod
 	WaitForStaticPodControlPlaneHashChange(nodeName, component, previousHash string) error
+	// WaitForHealthyKubelet waits initialTimeout, then blocks until healthzEndpoint reports healthy or the retries run out
+	WaitForHealthyKubelet(initialTimeout time.Duration, healthzEndpoint string) error
 	// SetTimeout adjusts the timeout to the specified duration
 	SetTimeout(timeout time.Duration)
 }
@@ -123,6 +126,26 @@ func (w *KubeWaiter) WaitForPodToDisappear(podName string) error {
 	})
 }

+// WaitForHealthyKubelet sleeps for initialTimeout, then polls healthzEndpoint via TryRunCommand until it answers HTTP 200 or the retries are exhausted
+func (w *KubeWaiter) WaitForHealthyKubelet(initialTimeout time.Duration, healthzEndpoint string) error {
+	time.Sleep(initialTimeout)
+	return TryRunCommand(func() error {
+		resp, err := http.Get(healthzEndpoint)
+		if err != nil {
+			fmt.Printf("[kubelet-check] It seems like the kubelet isn't running or healthy.\n")
+			fmt.Printf("[kubelet-check] The HTTP call equal to 'curl -sSL %s' failed with error: %v.\n", healthzEndpoint, err)
+			return err
+		}
+		defer resp.Body.Close()
+		if resp.StatusCode != http.StatusOK {
+			fmt.Printf("[kubelet-check] It seems like the kubelet isn't running or healthy.\n")
+			fmt.Printf("[kubelet-check] The HTTP call equal to 'curl -sSL %s' returned HTTP code %d\n", healthzEndpoint, resp.StatusCode)
+			return fmt.Errorf("the kubelet healthz endpoint is unhealthy")
+		}
+		return nil
+	}, 5) // five attempts; presumably 5s, 10s, 20s and 40s of backoff in between (~75s) - TODO confirm against the vendored wait.ExponentialBackoff
+}
+
 // SetTimeout adjusts the timeout to the specified duration
 func (w *KubeWaiter) SetTimeout(timeout time.Duration) {
 	w.timeout = timeout
@@ -184,20 +207,27 @@ func getStaticPodControlPlaneHashes(client clientset.Interface, nodeName string)
 }

 // TryRunCommand runs a function a maximum of failureThreshold times, and retries on error. If failureThreshold is hit; the last error is returned
-func TryRunCommand(f func() error, failureThreshold uint8) error {
-	var numFailures uint8
-	return wait.PollImmediate(5*time.Second, 20*time.Minute, func() (bool, error) {
+func TryRunCommand(f func() error, failureThreshold int) error {
+	backoff := wait.Backoff{
+		Duration: 5 * time.Second,
+		Factor:   2, // double the timeout for every failure
+		Steps:    failureThreshold,
+	}
+	// lastErr remembers the most recent failure of f so it can be returned if every attempt fails
+	var lastErr error
+	waitErr := wait.ExponentialBackoff(backoff, func() (bool, error) {
 		err := f()
 		if err != nil {
-			numFailures++
-			// If we've reached the maximum amount of failures, error out
-			if numFailures == failureThreshold {
-				return false, err
-			}
-			// Retry
+			// Remember the failure and retry until the backoff steps are exhausted
+			lastErr = err
 			return false, nil
 		}
-		// The last f() call was a success!
+		// The last f() call was a success, return cleanly
 		return true, nil
 	})
+	if waitErr == wait.ErrWaitTimeout && lastErr != nil {
+		// Every attempt failed; surface the last error from f instead of the generic timeout error
+		return lastErr
+	}
+	return waitErr
 }
diff --git a/cmd/kubeadm/app/util/dryrun/dryrun.go b/cmd/kubeadm/app/util/dryrun/dryrun.go
index 41ed0fc406f..cad1eba2fba 100644
--- a/cmd/kubeadm/app/util/dryrun/dryrun.go
+++ b/cmd/kubeadm/app/util/dryrun/dryrun.go
@@ -97,5 +97,11 @@ func (w *Waiter) WaitForPodToDisappear(podName string) error {
 	return nil
 }

+// WaitForHealthyKubelet only prints which kubelet endpoint would be checked and returns nil; nothing is waited for in dry-run mode
+func (w *Waiter) WaitForHealthyKubelet(_ time.Duration, healthzEndpoint string) error {
+	fmt.Printf("[dryrun] Would make sure the kubelet %q endpoint is healthy\n", healthzEndpoint)
+	return nil
+}
+
 // SetTimeout is a no-op; we don't wait in this implementation
 func (w *Waiter) SetTimeout(_ time.Duration) {}