From ecbafb8de5fc9a45e9c55ace9def1f33996faead Mon Sep 17 00:00:00 2001
From: Patrick Ohly
Date: Fri, 17 May 2024 16:31:47 +0200
Subject: [PATCH] DRA: fix scheduler/resource claim controller race

There was a race caused by having to update claim finalizer and status
in two different operations:

- Resource claim controller removes allocation, does not yet get to
  remove the finalizer.
- Scheduler prepares an allocation, without adding the finalizer
  because it's already there.
- Controller removes finalizer.
- Scheduler adds allocation.

This is an invalid state. Automatic checking found this during the
execution of the "with translated parameters on single node.*supports
sharing a claim sequentially" E2E test, but only when run stand-alone.
When running in parallel (as in the CI), the bad outcome of the race
did not occur.

The fix is to check that the finalizer is still set when adding the
allocation. The apiserver doesn't check that because it doesn't know
which finalizer goes with the allocation result. It could check for
"some finalizer", but that is not guaranteed to be correct (it could be
some unrelated one).

Checking the finalizer can only be done with a JSON patch. Despite the
complications, having the ability to add multiple pods concurrently to
ReservedFor seems worth it (it avoids expensive rescheduling or a local
retry loop).

The resource claim controller doesn't need this; it can do a normal
update, which implicitly checks the ResourceVersion.
---
 .../dynamicresources/dynamicresources.go      | 52 +++++++++++++++----
 1 file changed, 43 insertions(+), 9 deletions(-)

diff --git a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go
index 106ce0bdf58..5bc8476a9be 100644
--- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go
+++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go
@@ -23,6 +23,7 @@ import (
 	"fmt"
 	"slices"
 	"sort"
+	"strings"
 	"sync"
 
 	"github.com/google/go-cmp/cmp"
@@ -1620,7 +1621,8 @@ func (pl *dynamicResources) PreBind(ctx context.Context, cs *framework.CycleStat
 func (pl *dynamicResources) bindClaim(ctx context.Context, state *stateData, index int, pod *v1.Pod, nodeName string) (patchedClaim *resourcev1alpha2.ResourceClaim, finalErr error) {
 	logger := klog.FromContext(ctx)
 	claim := state.claims[index]
-	allocationPatch := ""
+	driverName := ""
+	allocationObject := ""
 
 	allocation := state.informationsForClaim[index].allocation
 	logger.V(5).Info("preparing claim status patch", "claim", klog.KObj(state.claims[index]), "allocation", klog.Format(allocation))
@@ -1631,7 +1633,8 @@ func (pl *dynamicResources) bindClaim(ctx context.Context, state *stateData, ind
 		if err != nil {
 			return nil, fmt.Errorf("marshaling AllocationResult failed: %v", err)
 		}
-		allocationPatch = fmt.Sprintf(`"driverName": %q, "allocation": %s, `, state.informationsForClaim[index].allocationDriverName, string(buffer))
+		driverName = state.informationsForClaim[index].allocationDriverName
+		allocationObject = string(buffer)
 
 		// The finalizer needs to be added in a normal update. Using a simple update is fine
 		// because we don't expect concurrent modifications while the claim is not allocated
@@ -1659,20 +1662,51 @@ func (pl *dynamicResources) bindClaim(ctx context.Context, state *stateData, ind
 	// Note that this also works when the allocation result gets added twice because
 	// two pods both started using a shared claim: the first pod to get here adds the
 	// allocation result. The second pod then only adds itself to reservedFor.
-	patch := fmt.Sprintf(`{"metadata": {"uid": %q}, "status": {%s "reservedFor": [ {"resource": "pods", "name": %q, "uid": %q} ] }}`,
-		claim.UID,
-		allocationPatch,
-		pod.Name,
-		pod.UID,
+	//
+	// Because we allow concurrent updates, we don't check ResourceVersion
+	// in a precondition. What we *do* need to check is that the finalizer
+	// is still present. This implies using a JSON patch instead of a
+	// simpler strategic-merge-patch.
+	parts := make([]string, 0, 7)
+	parts = append(parts,
+		fmt.Sprintf(`{ "op": "test", "path": "/metadata/uid", "value": %q }`, claim.UID),
 	)
+	// Strictly speaking, we only need to check for existence of our finalizer.
+	// JSON patch does not support that, so we check for the stronger "array is equal".
+	if len(claim.Finalizers) > 0 {
+		parts = append(parts,
+			fmt.Sprintf(`{ "op": "test", "path": "/metadata/finalizers", "value": %q }`, claim.Finalizers),
+		)
+	}
+	// JSON patch can only append to a non-empty array. An empty reservedFor gets
+	// omitted and even if it didn't, it would be null and not an empty array.
+	// Therefore we have to test and add if it's currently empty.
+	reservedForEntry := fmt.Sprintf(`{"resource": "pods", "name": %q, "uid": %q}`, pod.Name, pod.UID)
+	if len(claim.Status.ReservedFor) == 0 {
+		parts = append(parts,
+			fmt.Sprintf(`{ "op": "test", "path": "/status/reservedFor", "value": null }`),
+			fmt.Sprintf(`{ "op": "add", "path": "/status/reservedFor", "value": [ %s ] }`, reservedForEntry),
+		)
+	} else {
+		parts = append(parts,
+			fmt.Sprintf(`{ "op": "add", "path": "/status/reservedFor/-", "value": %s }`, reservedForEntry),
+		)
+	}
+	if allocationObject != "" {
+		parts = append(parts,
+			fmt.Sprintf(`{ "op": "add", "path": "/status/driverName", "value": %q }`, driverName),
+			fmt.Sprintf(`{ "op": "add", "path": "/status/allocation", "value": %s }`, allocationObject),
+		)
+	}
+	patch := "[\n" + strings.Join(parts, ",\n") + "\n]"
 	if loggerV := logger.V(6); loggerV.Enabled() {
 		logger.V(5).Info("reserve", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}, "resourceclaim", klog.KObj(claim), "patch", patch)
 	} else {
 		logger.V(5).Info("reserve", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}, "resourceclaim", klog.KObj(claim))
 	}
-	claim, err := pl.clientset.ResourceV1alpha2().ResourceClaims(claim.Namespace).Patch(ctx, claim.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{}, "status")
+	claim, err := pl.clientset.ResourceV1alpha2().ResourceClaims(claim.Namespace).Patch(ctx, claim.Name, types.JSONPatchType, []byte(patch), metav1.PatchOptions{}, "status")
 	logger.V(5).Info("reserved", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}, "resourceclaim", klog.Format(claim), "err", err)
-	if allocationPatch != "" {
+	if allocationObject != "" {
 		// The scheduler was handling allocation. Now that has
 		// completed, either successfully or with a failure.
 		if err == nil {
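
For illustration, here is roughly what the generated JSON patch looks
like for the first pod that reserves a shared claim whose allocation
the scheduler computed, assuming the claim still carries exactly one
finalizer and status.reservedFor is still unset. All concrete values
(UIDs, pod name, finalizer, driver name, allocation result) are made
up for this sketch; the real ones come from the claim, the pod, and
the DRA driver:

  [
  { "op": "test", "path": "/metadata/uid", "value": "81fdb6b6-e2e2-4ea4-a67c-1e40ffa153ad" },
  { "op": "test", "path": "/metadata/finalizers", "value": ["resource.kubernetes.io/delete-protection"] },
  { "op": "test", "path": "/status/reservedFor", "value": null },
  { "op": "add", "path": "/status/reservedFor", "value": [ {"resource": "pods", "name": "my-pod", "uid": "c3f8c7be-6de1-4a29-aa3e-e08f4b3a2b6b"} ] },
  { "op": "add", "path": "/status/driverName", "value": "gpu.example.com" },
  { "op": "add", "path": "/status/allocation", "value": {"shareable": true} }
  ]

If the resource claim controller removed the finalizer between the
scheduler's snapshot of the claim and this call, the "test" on
/metadata/finalizers fails and the apiserver rejects the whole patch,
so the scheduler gets an error instead of producing the invalid
allocated-without-finalizer state. A second pod reserving the same,
already-allocated claim would emit only the uid and finalizers tests
plus a single { "op": "add", "path": "/status/reservedFor/-", ... }
operation, without touching driverName or allocation.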