From 56adcd06f37a75696556b7a5209a4ca30cda6700 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Thu, 20 Mar 2025 18:15:58 +0100 Subject: [PATCH] DRA device eviction: fix eviction triggered by pod scheduling Normally the scheduler shouldn't schedule when there is a taint, but perhaps it didn't know yet. The TestEviction/update test covered this, but only failed under the right timing conditions. The new event handler test case covers it reliably. --- .../device_taint_eviction.go | 3 ++- .../device_taint_eviction_test.go | 21 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/pkg/controller/devicetainteviction/device_taint_eviction.go b/pkg/controller/devicetainteviction/device_taint_eviction.go index 31da9712e8e..3e2abb194d7 100644 --- a/pkg/controller/devicetainteviction/device_taint_eviction.go +++ b/pkg/controller/devicetainteviction/device_taint_eviction.go @@ -794,12 +794,13 @@ func (tc *Controller) handlePodChange(oldPod, newPod *v1.Pod) { // Pods get updated quite frequently. There's no need // to check them again unless something changed regarding - // their claims. + // their claims or they got scheduled. // // In particular this prevents adding the pod again // directly after the eviction condition got added // to it. if oldPod != nil && + oldPod.Spec.NodeName == newPod.Spec.NodeName && apiequality.Semantic.DeepEqual(oldPod.Status.ResourceClaimStatuses, newPod.Status.ResourceClaimStatuses) { return } diff --git a/pkg/controller/devicetainteviction/device_taint_eviction_test.go b/pkg/controller/devicetainteviction/device_taint_eviction_test.go index 60f7f2b154d..5ef0d8c25e6 100644 --- a/pkg/controller/devicetainteviction/device_taint_eviction_test.go +++ b/pkg/controller/devicetainteviction/device_taint_eviction_test.go @@ -318,6 +318,10 @@ var ( OwnerReference(podName, podUID+"-other", podKind). UID("other"). Obj() + unscheduledPodWithClaimName = st.MakePod().Name(podName).Namespace(namespace). + UID(podUID). + PodResourceClaims(v1.PodResourceClaim{Name: resourceName, ResourceClaimName: &claimName}). + Obj() podWithClaimName = st.MakePod().Name(podName).Namespace(namespace). UID(podUID). PodResourceClaims(v1.PodResourceClaim{Name: resourceName, ResourceClaimName: &claimName}). @@ -494,6 +498,23 @@ func TestHandlers(t *testing.T) { // At the moment, the code reliably cancels right away. wantEvents: []*v1.Event{cancelPodEviction}, }, + "evict-pod-after-scheduling": { + initialState: state{ + pods: []*v1.Pod{unscheduledPodWithClaimName}, + slices: []*resourceapi.ResourceSlice{sliceTainted, slice2}, + allocatedClaims: []allocatedClaim{{ResourceClaim: inUseClaim, evictionTime: &taintTime}}, + }, + events: []any{ + // Normally the scheduler shouldn't schedule when there is a taint, + // but perhaps it didn't know yet. + update(unscheduledPodWithClaimName, podWithClaimName), + }, + finalState: state{ + slices: []*resourceapi.ResourceSlice{sliceTainted, slice2}, + allocatedClaims: []allocatedClaim{{ResourceClaim: inUseClaim, evictionTime: &taintTime}}, + evicting: []evictAt{{newObject(podWithClaimName), taintTime.Time}}, + }, + }, "evict-pod-resourceclaim-unrelated-changes": { initialState: state{ pods: []*v1.Pod{podWithClaimName},