diff --git a/cmd/kubeadm/app/phases/upgrade/health.go b/cmd/kubeadm/app/phases/upgrade/health.go index 39235f5f9da..da37c6df848 100644 --- a/cmd/kubeadm/app/phases/upgrade/health.go +++ b/cmd/kubeadm/app/phases/upgrade/health.go @@ -64,7 +64,7 @@ func (c *healthCheck) Name() string { } // CheckClusterHealth makes sure: -// - the API /healthz endpoint is healthy +// - the cluster can accept a workload // - all control-plane Nodes are Ready // - (if static pod-hosted) that all required Static Pod manifests exist on disk func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration, ignoreChecksErrors sets.Set[string], printer output.Printer) error { @@ -92,11 +92,18 @@ func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfi } // createJob is a check that verifies that a Job can be created in the cluster -func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) (lastError error) { +func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) error { const ( - prefix = "upgrade-health-check" - ns = metav1.NamespaceSystem - timeout = 15 * time.Second + prefix = "upgrade-health-check" + fieldSelector = "spec.unschedulable=false" + ns = metav1.NamespaceSystem + timeout = 15 * time.Second + ) + var ( + err, lastError error + ctx = context.Background() + nodes *v1.NodeList + listOptions = metav1.ListOptions{Limit: 1, FieldSelector: fieldSelector} ) // If client.Discovery().RESTClient() is nil, the fake client is used. @@ -106,6 +113,25 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) return nil } + // Check if there is at least one Node where a Job's Pod can schedule. If not, skip this preflight check. + err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) { + nodes, err = client.CoreV1().Nodes().List(context.Background(), listOptions) + if err != nil { + klog.V(2).Infof("Could not list Nodes with field selector %q: %v", fieldSelector, err) + lastError = err + return false, nil + } + return true, nil + }) + if err != nil { + return errors.Wrap(lastError, "could not check if there is at least one Node that can schedule a test Pod") + } + + if len(nodes.Items) == 0 { + klog.Warning("The preflight check \"CreateJob\" was skipped because there are no schedulable Nodes in the cluster.") + return nil + } + // Prepare Job job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ @@ -114,7 +140,7 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) }, Spec: batchv1.JobSpec{ BackoffLimit: ptr.To[int32](0), - TTLSecondsAfterFinished: ptr.To[int32](2), + TTLSecondsAfterFinished: ptr.To[int32](int32(timeout.Seconds()) + 5), // Make sure it's more than 'timeout'. Template: v1.PodTemplateSpec{ Spec: v1.PodSpec{ RestartPolicy: v1.RestartPolicyNever, @@ -141,13 +167,11 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) }, } - ctx := context.Background() - // Create the Job, but retry if it fails klog.V(2).Infof("Creating a Job with the prefix %q in the namespace %q", prefix, ns) var jobName string - err := wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(ctx context.Context) (bool, error) { - createdJob, err := client.BatchV1().Jobs(ns).Create(ctx, job, metav1.CreateOptions{}) + err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) { + createdJob, err := client.BatchV1().Jobs(ns).Create(context.Background(), job, metav1.CreateOptions{}) if err != nil { klog.V(2).Infof("Could not create a Job with the prefix %q in the namespace %q, retrying: %v", prefix, ns, err) lastError = err @@ -162,8 +186,8 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) } // Wait for the Job to complete - err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(ctx context.Context) (bool, error) { - job, err := client.BatchV1().Jobs(ns).Get(ctx, jobName, metav1.GetOptions{}) + err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) { + job, err := client.BatchV1().Jobs(ns).Get(context.Background(), jobName, metav1.GetOptions{}) if err != nil { lastError = err klog.V(2).Infof("could not get Job %q in the namespace %q, retrying: %v", jobName, ns, err) @@ -192,7 +216,7 @@ func controlPlaneNodesReady(client clientset.Interface, _ *kubeadmapi.ClusterCon selectorControlPlane := labels.SelectorFromSet(map[string]string{ constants.LabelNodeRoleControlPlane: "", }) - nodes, err := client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{ + nodes, err := client.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{ LabelSelector: selectorControlPlane.String(), }) if err != nil {