From ebc460e8cc17fe04ce8282fd4d67072f94795689 Mon Sep 17 00:00:00 2001 From: Yuya Yabe Date: Thu, 12 Sep 2024 17:37:12 +0000 Subject: [PATCH] Add activeDeadlineSeconds to kubeadm upgrade-health-check job With https://github.com/kubernetes/kubernetes/pull/122079, kubeadm now relies on `ttlSecondsAfterFinished` to clean up `upgrade-health-check` once its pod reaches a terminal state. However, there is a case where the pod won't reach a terminal state and the job will not register a terminal state, hence no garbage collection. For example, if the pause image is not present, `ErrImagePull` will make the pod keep retrying to pull the image and the pod will never reach a terminal state on its own. And the job will continue to wait for the pod to reach a terminal state which will not happen. So we need to set `activeDeadlineSeconds` to prevent the job from waiting forever for the pod to reach a terminal state. Without this, users invoking `kubeadm upgrade plan` need to clean up the job outside of kubeadm even if they ignore the preflight result because the job still runs when the result is configured to be ignored via the `--ignore-preflight-errors=CreateJob` flag. Since the timeout for the polling in the `CreateJob` step in kubeadm is 15 seconds, we should derive `activeDeadlineSeconds` from the same timeout, plus the same 5-second margin that is already applied to `ttlSecondsAfterFinished`.
--- cmd/kubeadm/app/phases/upgrade/health.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cmd/kubeadm/app/phases/upgrade/health.go b/cmd/kubeadm/app/phases/upgrade/health.go index da37c6df848..caee81a2d4b 100644 --- a/cmd/kubeadm/app/phases/upgrade/health.go +++ b/cmd/kubeadm/app/phases/upgrade/health.go @@ -98,6 +98,7 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) fieldSelector = "spec.unschedulable=false" ns = metav1.NamespaceSystem timeout = 15 * time.Second + timeoutMargin = 5 * time.Second ) var ( err, lastError error @@ -132,6 +133,9 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) return nil } + // Adding a margin of error to the polling timeout. + timeoutWithMargin := timeout.Seconds() + timeoutMargin.Seconds() + // Prepare Job job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ @@ -140,7 +144,8 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) }, Spec: batchv1.JobSpec{ BackoffLimit: ptr.To[int32](0), - TTLSecondsAfterFinished: ptr.To[int32](int32(timeout.Seconds()) + 5), // Make sure it's more than 'timeout'. + TTLSecondsAfterFinished: ptr.To[int32](int32(timeoutWithMargin)), + ActiveDeadlineSeconds: ptr.To[int64](int64(timeoutWithMargin)), Template: v1.PodTemplateSpec{ Spec: v1.PodSpec{ RestartPolicy: v1.RestartPolicyNever,