Fix: Clean job tracking finalizer from orphan pods

Change-Id: I04cd70725fd1830be8daf2dca53f67bc10a379b7
2025-09-05 11:12:03 +00:00 · 2022-03-16 17:02:13 -04:00
parent 1d9e3766d2
commit 211e33d93f
1 changed files with 32 additions and 3 deletions
--- a/pkg/controller/job/job_controller.go
+++ b/pkg/controller/job/job_controller.go
@@ -151,9 +151,7 @@ func NewController(podInformer coreinformers.PodInformer, jobInformer batchinfor
 			jm.enqueueController(obj, true)
 		},
 		UpdateFunc: jm.updateJob,
-		DeleteFunc: func(obj interface{}) {
-			jm.enqueueController(obj, true)
-		},
+		DeleteFunc: jm.deleteJob,
 	})
 	jm.jobLister = jobInformer.Lister()
 	jm.jobStoreSynced = jobInformer.Informer().HasSynced
@@ -368,6 +366,10 @@ func (jm *Controller) deletePod(obj interface{}, final bool) {
 	controllerRef := metav1.GetControllerOf(pod)
 	if controllerRef == nil {
 		// No controller should care about orphans being deleted.
+		// But this pod might have belonged to a Job and the GC removed the reference.
+		if hasJobTrackingFinalizer(pod) {
+			jm.enqueueOrphanPod(pod)
+		}
 		return
 	}
 	job := jm.resolveControllerRef(pod.Namespace, controllerRef)
@@ -421,6 +423,33 @@ func (jm *Controller) updateJob(old, cur interface{}) {
 	}
 }

+// deleteJob enqueues the deleted job and then, via enqueueOrphanPod, every pod in the job's namespace that
+// still has the job tracking finalizer and either has no controller reference or is controlled by this job's UID.
+func (jm *Controller) deleteJob(obj interface{}) {
+	jm.enqueueController(obj, true)
+	jobObj, ok := obj.(*batch.Job)
+	if !ok { // not a *batch.Job: expect a cache.DeletedFinalStateUnknown tombstone wrapping one
+		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
+		if !ok {
+			utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %+v", obj))
+			return
+		}
+		jobObj, ok = tombstone.Obj.(*batch.Job)
+		if !ok {
+			utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a job %+v", obj))
+			return
+		}
+	}
+	// The List error is deliberately ignored: this only queries the informer cache, so it shouldn't really fail.
+	pods, _ := jm.podStore.Pods(jobObj.Namespace).List(labels.Everything())
+	for _, pod := range pods { // labels.Everything(): all pods in the namespace are scanned, not only label matches
+		controllerRef := metav1.GetControllerOf(pod)
+		if (controllerRef == nil || controllerRef.UID == jobObj.UID) && hasJobTrackingFinalizer(pod) {
+			jm.enqueueOrphanPod(pod)
+		}
+	}
+}
+
 // obj could be an *batch.Job, or a DeletionFinalStateUnknown marker item,
 // immediate tells the controller to update the status right away, and should
 // happen ONLY when there was a successful pod run.