kubeadm: check for available nodes during 'CreateJob' preflight

Instead of erroring during the preflight check 'CreateJob'
from "upgrade" commands when there are no schedulable nodes,
show a warning.

This can happen in single node clusters.

Also increase the Job TTL after completion to 20 seconds
to make sure it's longer than the timeout that waits
for the Job to complete.
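
For illustration only (not part of the patch; the package and helper names below are invented for this example), the new node-availability probe and the TTL arithmetic come down to roughly this client-go sketch:

// Sketch only, assuming client-go; mirrors what the patched preflight check does.
package preflight

import (
	"context"
	"time"

	"github.com/pkg/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/klog/v2"
	"k8s.io/utils/ptr"
)

// timeout is how long the check waits for the health-check Job to complete.
const timeout = 15 * time.Second

// hasSchedulableNode reports whether at least one Node can run the test Pod.
// Listing with Limit: 1 keeps the probe cheap even on large clusters.
func hasSchedulableNode(client clientset.Interface) (bool, error) {
	nodes, err := client.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
		Limit:         1,
		FieldSelector: "spec.unschedulable=false",
	})
	if err != nil {
		return false, errors.Wrap(err, "could not list Nodes")
	}
	if len(nodes.Items) == 0 {
		// Single node clusters with an unschedulable control-plane Node end up here;
		// the check is skipped with a warning instead of failing the upgrade.
		klog.Warning("no schedulable Nodes in the cluster; skipping the \"CreateJob\" preflight check")
		return false, nil
	}
	return true, nil
}

// jobTTL is the TTLSecondsAfterFinished applied to the Job: 15s timeout + 5s = 20s,
// so the Job object outlives the poll that waits for it to complete.
func jobTTL() *int32 {
	return ptr.To[int32](int32(timeout.Seconds()) + 5)
}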
Lubomir I. Ivanov 2024-04-24 13:03:27 +03:00
parent 43a0480e94
commit 1410806d2a


@@ -64,7 +64,7 @@ func (c *healthCheck) Name() string {
 }
 
 // CheckClusterHealth makes sure:
-// - the API /healthz endpoint is healthy
+// - the cluster can accept a workload
 // - all control-plane Nodes are Ready
 // - (if static pod-hosted) that all required Static Pod manifests exist on disk
 func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration, ignoreChecksErrors sets.Set[string], printer output.Printer) error {
@@ -92,11 +92,18 @@ func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfi
 }
 
 // createJob is a check that verifies that a Job can be created in the cluster
-func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) (lastError error) {
+func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) error {
 	const (
-		prefix  = "upgrade-health-check"
-		ns      = metav1.NamespaceSystem
-		timeout = 15 * time.Second
+		prefix        = "upgrade-health-check"
+		fieldSelector = "spec.unschedulable=false"
+		ns            = metav1.NamespaceSystem
+		timeout       = 15 * time.Second
 	)
+	var (
+		err, lastError error
+		ctx            = context.Background()
+		nodes          *v1.NodeList
+		listOptions    = metav1.ListOptions{Limit: 1, FieldSelector: fieldSelector}
+	)
 
 	// If client.Discovery().RESTClient() is nil, the fake client is used.
@@ -106,6 +113,25 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
 		return nil
 	}
 
+	// Check if there is at least one Node where a Job's Pod can schedule. If not, skip this preflight check.
+	err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
+		nodes, err = client.CoreV1().Nodes().List(context.Background(), listOptions)
+		if err != nil {
+			klog.V(2).Infof("Could not list Nodes with field selector %q: %v", fieldSelector, err)
+			lastError = err
+			return false, nil
+		}
+		return true, nil
+	})
+	if err != nil {
+		return errors.Wrap(lastError, "could not check if there is at least one Node that can schedule a test Pod")
+	}
+
+	if len(nodes.Items) == 0 {
+		klog.Warning("The preflight check \"CreateJob\" was skipped because there are no schedulable Nodes in the cluster.")
+		return nil
+	}
+
 	// Prepare Job
 	job := &batchv1.Job{
 		ObjectMeta: metav1.ObjectMeta{
@@ -114,7 +140,7 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
 		},
 		Spec: batchv1.JobSpec{
 			BackoffLimit:            ptr.To[int32](0),
-			TTLSecondsAfterFinished: ptr.To[int32](2),
+			TTLSecondsAfterFinished: ptr.To[int32](int32(timeout.Seconds()) + 5), // Make sure it's more than 'timeout'.
 			Template: v1.PodTemplateSpec{
 				Spec: v1.PodSpec{
 					RestartPolicy: v1.RestartPolicyNever,
@@ -141,13 +167,11 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
 		},
 	}
 
-	ctx := context.Background()
-
 	// Create the Job, but retry if it fails
 	klog.V(2).Infof("Creating a Job with the prefix %q in the namespace %q", prefix, ns)
 	var jobName string
-	err := wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(ctx context.Context) (bool, error) {
-		createdJob, err := client.BatchV1().Jobs(ns).Create(ctx, job, metav1.CreateOptions{})
+	err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
+		createdJob, err := client.BatchV1().Jobs(ns).Create(context.Background(), job, metav1.CreateOptions{})
 		if err != nil {
 			klog.V(2).Infof("Could not create a Job with the prefix %q in the namespace %q, retrying: %v", prefix, ns, err)
 			lastError = err
@@ -162,8 +186,8 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
 	}
 
 	// Wait for the Job to complete
-	err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(ctx context.Context) (bool, error) {
-		job, err := client.BatchV1().Jobs(ns).Get(ctx, jobName, metav1.GetOptions{})
+	err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
+		job, err := client.BatchV1().Jobs(ns).Get(context.Background(), jobName, metav1.GetOptions{})
 		if err != nil {
 			lastError = err
 			klog.V(2).Infof("could not get Job %q in the namespace %q, retrying: %v", jobName, ns, err)
@@ -192,7 +216,7 @@ func controlPlaneNodesReady(client clientset.Interface, _ *kubeadmapi.ClusterCon
 	selectorControlPlane := labels.SelectorFromSet(map[string]string{
 		constants.LabelNodeRoleControlPlane: "",
 	})
-	nodes, err := client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{
+	nodes, err := client.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
 		LabelSelector: selectorControlPlane.String(),
 	})
 	if err != nil {