kubeadm: check for available nodes during 'CreateJob' preflight

Instead of erroring during the 'CreateJob' preflight check
of the "upgrade" commands when there are no schedulable nodes,
show a warning and skip the check.

This can happen in single node clusters.

Also increase the Job TTL after completion to 20 seconds
to make sure it's longer than the timeout that is used
to wait for the Job to complete.
This commit is contained in:
Lubomir I. Ivanov 2024-04-24 13:03:27 +03:00
parent 43a0480e94
commit 1410806d2a

View File

@ -64,7 +64,7 @@ func (c *healthCheck) Name() string {
} }
// CheckClusterHealth makes sure: // CheckClusterHealth makes sure:
// - the API /healthz endpoint is healthy // - the cluster can accept a workload
// - all control-plane Nodes are Ready // - all control-plane Nodes are Ready
// - (if static pod-hosted) that all required Static Pod manifests exist on disk // - (if static pod-hosted) that all required Static Pod manifests exist on disk
func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration, ignoreChecksErrors sets.Set[string], printer output.Printer) error { func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration, ignoreChecksErrors sets.Set[string], printer output.Printer) error {
@ -92,11 +92,18 @@ func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfi
} }
// createJob is a check that verifies that a Job can be created in the cluster // createJob is a check that verifies that a Job can be created in the cluster
func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) (lastError error) { func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) error {
const ( const (
prefix = "upgrade-health-check" prefix = "upgrade-health-check"
ns = metav1.NamespaceSystem fieldSelector = "spec.unschedulable=false"
timeout = 15 * time.Second ns = metav1.NamespaceSystem
timeout = 15 * time.Second
)
var (
err, lastError error
ctx = context.Background()
nodes *v1.NodeList
listOptions = metav1.ListOptions{Limit: 1, FieldSelector: fieldSelector}
) )
// If client.Discovery().RESTClient() is nil, the fake client is used. // If client.Discovery().RESTClient() is nil, the fake client is used.
@ -106,6 +113,25 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
return nil return nil
} }
// Check if there is at least one Node where a Job's Pod can schedule. If not, skip this preflight check.
err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
nodes, err = client.CoreV1().Nodes().List(context.Background(), listOptions)
if err != nil {
klog.V(2).Infof("Could not list Nodes with field selector %q: %v", fieldSelector, err)
lastError = err
return false, nil
}
return true, nil
})
if err != nil {
return errors.Wrap(lastError, "could not check if there is at least one Node that can schedule a test Pod")
}
if len(nodes.Items) == 0 {
klog.Warning("The preflight check \"CreateJob\" was skipped because there are no schedulable Nodes in the cluster.")
return nil
}
// Prepare Job // Prepare Job
job := &batchv1.Job{ job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
@ -114,7 +140,7 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
}, },
Spec: batchv1.JobSpec{ Spec: batchv1.JobSpec{
BackoffLimit: ptr.To[int32](0), BackoffLimit: ptr.To[int32](0),
TTLSecondsAfterFinished: ptr.To[int32](2), TTLSecondsAfterFinished: ptr.To[int32](int32(timeout.Seconds()) + 5), // Make sure it's more than 'timeout'.
Template: v1.PodTemplateSpec{ Template: v1.PodTemplateSpec{
Spec: v1.PodSpec{ Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyNever, RestartPolicy: v1.RestartPolicyNever,
@ -141,13 +167,11 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
}, },
} }
ctx := context.Background()
// Create the Job, but retry if it fails // Create the Job, but retry if it fails
klog.V(2).Infof("Creating a Job with the prefix %q in the namespace %q", prefix, ns) klog.V(2).Infof("Creating a Job with the prefix %q in the namespace %q", prefix, ns)
var jobName string var jobName string
err := wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(ctx context.Context) (bool, error) { err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
createdJob, err := client.BatchV1().Jobs(ns).Create(ctx, job, metav1.CreateOptions{}) createdJob, err := client.BatchV1().Jobs(ns).Create(context.Background(), job, metav1.CreateOptions{})
if err != nil { if err != nil {
klog.V(2).Infof("Could not create a Job with the prefix %q in the namespace %q, retrying: %v", prefix, ns, err) klog.V(2).Infof("Could not create a Job with the prefix %q in the namespace %q, retrying: %v", prefix, ns, err)
lastError = err lastError = err
@ -162,8 +186,8 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
} }
// Wait for the Job to complete // Wait for the Job to complete
err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(ctx context.Context) (bool, error) { err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
job, err := client.BatchV1().Jobs(ns).Get(ctx, jobName, metav1.GetOptions{}) job, err := client.BatchV1().Jobs(ns).Get(context.Background(), jobName, metav1.GetOptions{})
if err != nil { if err != nil {
lastError = err lastError = err
klog.V(2).Infof("could not get Job %q in the namespace %q, retrying: %v", jobName, ns, err) klog.V(2).Infof("could not get Job %q in the namespace %q, retrying: %v", jobName, ns, err)
@ -192,7 +216,7 @@ func controlPlaneNodesReady(client clientset.Interface, _ *kubeadmapi.ClusterCon
selectorControlPlane := labels.SelectorFromSet(map[string]string{ selectorControlPlane := labels.SelectorFromSet(map[string]string{
constants.LabelNodeRoleControlPlane: "", constants.LabelNodeRoleControlPlane: "",
}) })
nodes, err := client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{ nodes, err := client.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
LabelSelector: selectorControlPlane.String(), LabelSelector: selectorControlPlane.String(),
}) })
if err != nil { if err != nil {