Mirror of https://github.com/k3s-io/kubernetes.git, synced 2025-08-13 13:55:41 +00:00
kubeadm: check for available nodes during 'CreateJob' preflight
Instead of erroring during the 'CreateJob' preflight check of the "upgrade" commands when there are no schedulable nodes, show a warning. This can happen in single-node clusters. Also increase the Job's TTL after completion to 20 seconds, to make sure it is longer than the timeout that waits for the Job to complete.
parent 43a0480e94
commit 1410806d2a
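For reference, the node probe added in the diff below boils down to a single client-go List call with a field selector that excludes cordoned Nodes. A minimal standalone sketch of that query (the helper name hasSchedulableNode and the kubeconfig loading are illustrative, not part of the commit):

package main

import (
	"context"
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

// hasSchedulableNode reports whether at least one Node is not marked
// unschedulable, using the same field selector as the preflight check.
func hasSchedulableNode(ctx context.Context, client kubernetes.Interface) (bool, error) {
	nodes, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{
		Limit:         1,
		FieldSelector: "spec.unschedulable=false",
	})
	if err != nil {
		return false, err
	}
	return len(nodes.Items) > 0, nil
}

func main() {
	// Illustrative setup: load the default kubeconfig from $HOME/.kube/config.
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(config)

	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	ok, err := hasSchedulableNode(ctx, client)
	if err != nil {
		panic(err)
	}
	fmt.Println("schedulable Node available:", ok)
}

When this query finds no Nodes, the patched preflight logs a warning and skips the Job-based check instead of failing, which covers the single-node-cluster case described above.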
@@ -64,7 +64,7 @@ func (c *healthCheck) Name() string {
 }
 
 // CheckClusterHealth makes sure:
-// - the API /healthz endpoint is healthy
+// - the cluster can accept a workload
 // - all control-plane Nodes are Ready
 // - (if static pod-hosted) that all required Static Pod manifests exist on disk
 func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration, ignoreChecksErrors sets.Set[string], printer output.Printer) error {
@@ -92,11 +92,18 @@ func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfi
 }
 
 // createJob is a check that verifies that a Job can be created in the cluster
-func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) (lastError error) {
+func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) error {
     const (
-        prefix  = "upgrade-health-check"
-        ns      = metav1.NamespaceSystem
-        timeout = 15 * time.Second
+        prefix        = "upgrade-health-check"
+        fieldSelector = "spec.unschedulable=false"
+        ns            = metav1.NamespaceSystem
+        timeout       = 15 * time.Second
+    )
+    var (
+        err, lastError error
+        ctx            = context.Background()
+        nodes          *v1.NodeList
+        listOptions    = metav1.ListOptions{Limit: 1, FieldSelector: fieldSelector}
     )
 
     // If client.Discovery().RESTClient() is nil, the fake client is used.
@@ -106,6 +113,25 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
         return nil
     }
 
+    // Check if there is at least one Node where a Job's Pod can schedule. If not, skip this preflight check.
+    err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
+        nodes, err = client.CoreV1().Nodes().List(context.Background(), listOptions)
+        if err != nil {
+            klog.V(2).Infof("Could not list Nodes with field selector %q: %v", fieldSelector, err)
+            lastError = err
+            return false, nil
+        }
+        return true, nil
+    })
+    if err != nil {
+        return errors.Wrap(lastError, "could not check if there is at least one Node that can schedule a test Pod")
+    }
+
+    if len(nodes.Items) == 0 {
+        klog.Warning("The preflight check \"CreateJob\" was skipped because there are no schedulable Nodes in the cluster.")
+        return nil
+    }
+
     // Prepare Job
     job := &batchv1.Job{
         ObjectMeta: metav1.ObjectMeta{
@@ -114,7 +140,7 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
         },
         Spec: batchv1.JobSpec{
             BackoffLimit:            ptr.To[int32](0),
-            TTLSecondsAfterFinished: ptr.To[int32](2),
+            TTLSecondsAfterFinished: ptr.To[int32](int32(timeout.Seconds()) + 5), // Make sure it's more than 'timeout'.
             Template: v1.PodTemplateSpec{
                 Spec: v1.PodSpec{
                     RestartPolicy: v1.RestartPolicyNever,
@@ -141,13 +167,11 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
         },
     }
 
-    ctx := context.Background()
-
     // Create the Job, but retry if it fails
     klog.V(2).Infof("Creating a Job with the prefix %q in the namespace %q", prefix, ns)
     var jobName string
-    err := wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(ctx context.Context) (bool, error) {
-        createdJob, err := client.BatchV1().Jobs(ns).Create(ctx, job, metav1.CreateOptions{})
+    err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
+        createdJob, err := client.BatchV1().Jobs(ns).Create(context.Background(), job, metav1.CreateOptions{})
         if err != nil {
             klog.V(2).Infof("Could not create a Job with the prefix %q in the namespace %q, retrying: %v", prefix, ns, err)
             lastError = err
@@ -162,8 +186,8 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
     }
 
     // Wait for the Job to complete
-    err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(ctx context.Context) (bool, error) {
-        job, err := client.BatchV1().Jobs(ns).Get(ctx, jobName, metav1.GetOptions{})
+    err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
+        job, err := client.BatchV1().Jobs(ns).Get(context.Background(), jobName, metav1.GetOptions{})
         if err != nil {
             lastError = err
             klog.V(2).Infof("could not get Job %q in the namespace %q, retrying: %v", jobName, ns, err)
@@ -192,7 +216,7 @@ func controlPlaneNodesReady(client clientset.Interface, _ *kubeadmapi.ClusterCon
     selectorControlPlane := labels.SelectorFromSet(map[string]string{
         constants.LabelNodeRoleControlPlane: "",
     })
-    nodes, err := client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{
+    nodes, err := client.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
         LabelSelector: selectorControlPlane.String(),
     })
     if err != nil {
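The TTL change above is plain arithmetic: with timeout = 15 * time.Second, int32(timeout.Seconds()) + 5 yields a TTLSecondsAfterFinished of 20 seconds. A tiny self-contained sketch of that computation:

package main

import (
	"fmt"
	"time"
)

func main() {
	const timeout = 15 * time.Second
	// The TTL must exceed the wait timeout so the Job is not
	// garbage-collected while the check is still polling it.
	ttl := int32(timeout.Seconds()) + 5
	fmt.Println(ttl) // prints 20
}

With the previous value of 2 seconds, the TTL controller could remove the completed Job while the wait loop was still polling it; 20 seconds guarantees the Job survives the full 15-second wait.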