diff --git a/pkg/api/pod/util.go b/pkg/api/pod/util.go index c4c9c1a1ee5..9a28154be27 100644 --- a/pkg/api/pod/util.go +++ b/pkg/api/pod/util.go @@ -540,6 +540,10 @@ func dropDisabledPodStatusFields(podStatus, oldPodStatus *api.PodStatus, podSpec dropResourcesFields(podStatus.EphemeralContainerStatuses) podStatus.Resize = "" } + + if !utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) && !dynamicResourceAllocationInUse(oldPodSpec) { + podStatus.ResourceClaimStatuses = nil + } } // dropDisabledDynamicResourceAllocationFields removes pod claim references from diff --git a/pkg/api/pod/util_test.go b/pkg/api/pod/util_test.go index 1f355bd71fa..e4a9a81dace 100644 --- a/pkg/api/pod/util_test.go +++ b/pkg/api/pod/util_test.go @@ -822,6 +822,11 @@ func TestDropDynamicResourceAllocation(t *testing.T) { }, }, }, + Status: api.PodStatus{ + ResourceClaimStatuses: []api.PodResourceClaimStatus{ + {Name: "my-claim", ResourceClaimName: pointer.String("pod-my-claim")}, + }, + }, } podWithoutClaims := &api.Pod{ Spec: api.PodSpec{ diff --git a/pkg/apis/core/types.go b/pkg/apis/core/types.go index 5c384212660..1784b329b07 100644 --- a/pkg/apis/core/types.go +++ b/pkg/apis/core/types.go @@ -3210,15 +3210,9 @@ type ClaimSource struct { // // The template will be used to create a new ResourceClaim, which will // be bound to this pod. When this pod is deleted, the ResourceClaim - // will also be deleted. The name of the ResourceClaim will be <pod name>-<resource name>, where <resource name> is the - // PodResourceClaim.Name. Pod validation will reject the pod if the - // concatenated name is not valid for a ResourceClaim (e.g. too long). - // - // An existing ResourceClaim with that name that is not owned by the - // pod will not be used for the pod to avoid using an unrelated - // resource by mistake. Scheduling and pod startup are then blocked - // until the unrelated ResourceClaim is removed. + // will also be deleted. The pod name and resource name, along with a + // generated component, will be used to form a unique name for the + // ResourceClaim, which will be recorded in pod.status.resourceClaimStatuses. // // This field is immutable and no changes will be made to the // corresponding ResourceClaim by the control plane after creating the @@ -3226,6 +3220,22 @@ type ClaimSource struct { ResourceClaimTemplateName *string } +// PodResourceClaimStatus is stored in the PodStatus for each PodResourceClaim +// which references a ResourceClaimTemplate. It stores the generated name for +// the corresponding ResourceClaim. +type PodResourceClaimStatus struct { + // Name uniquely identifies this resource claim inside the pod. + // This must match the name of an entry in pod.spec.resourceClaims, + // which implies that the string must be a DNS_LABEL. + Name string + + // ResourceClaimName is the name of the ResourceClaim that was + // generated for the Pod in the namespace of the Pod. If this is + // unset, then generating a ResourceClaim was not necessary. The + // pod.spec.resourceClaims entry can be ignored in this case. + ResourceClaimName *string +} + // OSName is the set of OS'es that can be used in OS. type OSName string @@ -3661,6 +3671,11 @@ type PodStatus struct { // +featureGate=InPlacePodVerticalScaling // +optional Resize PodResizeStatus + + // Status of resource claims. 
+ // +featureGate=DynamicResourceAllocation + // +optional + ResourceClaimStatuses []PodResourceClaimStatus } // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object diff --git a/pkg/apis/core/validation/validation.go b/pkg/apis/core/validation/validation.go index 9589da30631..5caee3e6880 100644 --- a/pkg/apis/core/validation/validation.go +++ b/pkg/apis/core/validation/validation.go @@ -4845,6 +4845,7 @@ func ValidatePodStatusUpdate(newPod, oldPod *core.Pod, opts PodValidationOptions allErrs = append(allErrs, ValidateContainerStateTransition(newPod.Status.InitContainerStatuses, oldPod.Status.InitContainerStatuses, fldPath.Child("initContainerStatuses"), oldPod.Spec.RestartPolicy)...) // The kubelet will never restart ephemeral containers, so treat them like they have an implicit RestartPolicyNever. allErrs = append(allErrs, ValidateContainerStateTransition(newPod.Status.EphemeralContainerStatuses, oldPod.Status.EphemeralContainerStatuses, fldPath.Child("ephemeralContainerStatuses"), core.RestartPolicyNever)...) + allErrs = append(allErrs, validatePodResourceClaimStatuses(newPod.Status.ResourceClaimStatuses, newPod.Spec.ResourceClaims, fldPath.Child("resourceClaimStatuses"))...) if newIPErrs := validatePodIPs(newPod); len(newIPErrs) > 0 { allErrs = append(allErrs, newIPErrs...) @@ -4866,6 +4867,36 @@ func validatePodConditions(conditions []core.PodCondition, fldPath *field.Path) return allErrs } +// validatePodResourceClaimStatuses validates the ResourceClaimStatuses slice in a pod status. +func validatePodResourceClaimStatuses(statuses []core.PodResourceClaimStatus, podClaims []core.PodResourceClaim, fldPath *field.Path) field.ErrorList { + var allErrs field.ErrorList + + for i, status := range statuses { + idxPath := fldPath.Index(i) + // There's no need to check the content of the name. If it matches an entry, + // then it is valid, otherwise we reject it here. + if !havePodClaim(podClaims, status.Name) { + allErrs = append(allErrs, field.Invalid(idxPath.Child("name"), status.Name, "must match the name of an entry in `spec.resourceClaims`")) + } + if status.ResourceClaimName != nil { + for _, detail := range ValidateResourceClaimName(*status.ResourceClaimName, false) { + allErrs = append(allErrs, field.Invalid(idxPath.Child("name"), status.ResourceClaimName, detail)) + } + } + } + + return allErrs +} + +func havePodClaim(podClaims []core.PodResourceClaim, name string) bool { + for _, podClaim := range podClaims { + if podClaim.Name == name { + return true + } + } + return false +} + // ValidatePodEphemeralContainersUpdate tests that a user update to EphemeralContainers is valid. // newPod and oldPod must only differ in their EphemeralContainers. 
func ValidatePodEphemeralContainersUpdate(newPod, oldPod *core.Pod, opts PodValidationOptions) field.ErrorList { diff --git a/pkg/apis/core/validation/validation_test.go b/pkg/apis/core/validation/validation_test.go index 8d5df082286..951384547cd 100644 --- a/pkg/apis/core/validation/validation_test.go +++ b/pkg/apis/core/validation/validation_test.go @@ -13629,6 +13629,82 @@ func TestValidatePodStatusUpdate(t *testing.T) { }, "", "Container statuses all containers terminated", + }, { + core.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo", + }, + Status: core.PodStatus{ + ResourceClaimStatuses: []core.PodResourceClaimStatus{ + {Name: "no-such-claim", ResourceClaimName: utilpointer.String("my-claim")}, + }, + }, + }, + core.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo", + }, + }, + "status.resourceClaimStatuses[0].name: Invalid value: \"no-such-claim\": must match the name of an entry in `spec.resourceClaims`", + "Non-existent PodResourceClaim", + }, { + core.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo", + }, + Spec: core.PodSpec{ + ResourceClaims: []core.PodResourceClaim{ + {Name: "my-claim"}, + }, + }, + Status: core.PodStatus{ + ResourceClaimStatuses: []core.PodResourceClaimStatus{ + {Name: "my-claim", ResourceClaimName: utilpointer.String("%$!#")}, + }, + }, + }, + core.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo", + }, + Spec: core.PodSpec{ + ResourceClaims: []core.PodResourceClaim{ + {Name: "my-claim"}, + }, + }, + }, + `status.resourceClaimStatuses[0].name: Invalid value: "%$!#": a lowercase RFC 1123 subdomain must consist of`, + "Invalid ResourceClaim name", + }, { + core.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo", + }, + Spec: core.PodSpec{ + ResourceClaims: []core.PodResourceClaim{ + {Name: "my-claim"}, + {Name: "my-other-claim"}, + }, + }, + Status: core.PodStatus{ + ResourceClaimStatuses: []core.PodResourceClaimStatus{ + {Name: "my-claim", ResourceClaimName: utilpointer.String("foo-my-claim-12345")}, + {Name: "my-other-claim", ResourceClaimName: nil}, + }, + }, + }, + core.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo", + }, + Spec: core.PodSpec{ + ResourceClaims: []core.PodResourceClaim{ + {Name: "my-claim"}, + }, + }, + }, + "", + "ResourceClaimStatuses okay", }, } diff --git a/pkg/controller/resourceclaim/controller.go b/pkg/controller/resourceclaim/controller.go index 481f2d86d88..74527ca8cdc 100644 --- a/pkg/controller/resourceclaim/controller.go +++ b/pkg/controller/resourceclaim/controller.go @@ -18,17 +18,20 @@ package resourceclaim import ( "context" + "errors" "fmt" "strings" "time" v1 "k8s.io/api/core/v1" resourcev1alpha2 "k8s.io/api/resource/v1alpha2" - "k8s.io/apimachinery/pkg/api/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/wait" + corev1apply "k8s.io/client-go/applyconfigurations/core/v1" v1informers "k8s.io/client-go/informers/core/v1" resourcev1alpha2informers "k8s.io/client-go/informers/resource/v1alpha2" clientset "k8s.io/client-go/kubernetes" @@ -50,6 +53,15 @@ const ( // podResourceClaimIndex is the lookup name for the index function which indexes by pod ResourceClaim templates. podResourceClaimIndex = "pod-resource-claim-index" + // podResourceClaimAnnotation is the special annotation that generated + // ResourceClaims get. Its value is the pod.spec.resourceClaims[].name + // for which it was generated. 
This is used only inside the controller + // and not documented as part of the Kubernetes API. + podResourceClaimAnnotation = "resource.kubernetes.io/pod-claim-name" + + // Field manager used to update the pod status. + fieldManager = "ResourceClaimController" + maxUIDCacheEntries = 500 ) @@ -178,10 +190,21 @@ func (ec *Controller) enqueuePod(logger klog.Logger, obj interface{}, deleted bo // Release reservations of a deleted or completed pod? if deleted || isPodDone(pod) { for _, podClaim := range pod.Spec.ResourceClaims { - claimName := resourceclaim.Name(pod, &podClaim) - key := claimKeyPrefix + pod.Namespace + "/" + claimName - logger.V(6).Info("pod is deleted or done, process claim", "pod", klog.KObj(pod), "key", key) - ec.queue.Add(key) + claimName, _, err := resourceclaim.Name(pod, &podClaim) + switch { + case err != nil: + // Either the claim was not created (nothing to do here) or + // the API changed. The later will also get reported elsewhere, + // so here it's just a debug message. + klog.TODO().V(6).Info("Nothing to do for claim during pod change", "err", err) + case claimName != nil: + key := claimKeyPrefix + pod.Namespace + "/" + *claimName + logger.V(6).Info("pod is deleted or done, process claim", "pod", klog.KObj(pod), "key", key) + ec.queue.Add(claimKeyPrefix + pod.Namespace + "/" + *claimName) + default: + // Nothing to do, claim wasn't generated. + klog.TODO().V(6).Info("Nothing to do for skipped claim during pod change") + } } } @@ -318,7 +341,7 @@ func (ec *Controller) syncPod(ctx context.Context, namespace, name string) error ctx = klog.NewContext(ctx, logger) pod, err := ec.podLister.Pods(namespace).Get(name) if err != nil { - if errors.IsNotFound(err) { + if apierrors.IsNotFound(err) { logger.V(5).Info("nothing to do for pod, it is gone") return nil } @@ -331,8 +354,9 @@ func (ec *Controller) syncPod(ctx context.Context, namespace, name string) error return nil } + var newPodClaims map[string]string for _, podClaim := range pod.Spec.ResourceClaims { - if err := ec.handleClaim(ctx, pod, podClaim); err != nil { + if err := ec.handleClaim(ctx, pod, podClaim, &newPodClaims); err != nil { if ec.recorder != nil { ec.recorder.Event(pod, v1.EventTypeWarning, "FailedResourceClaimCreation", fmt.Sprintf("PodResourceClaim %s: %v", podClaim.Name, err)) } @@ -340,73 +364,181 @@ func (ec *Controller) syncPod(ctx context.Context, namespace, name string) error } } + if newPodClaims != nil { + // Patch the pod status with the new information about + // generated ResourceClaims. + statuses := make([]*corev1apply.PodResourceClaimStatusApplyConfiguration, 0, len(newPodClaims)) + for podClaimName, resourceClaimName := range newPodClaims { + statuses = append(statuses, corev1apply.PodResourceClaimStatus().WithName(podClaimName).WithResourceClaimName(resourceClaimName)) + } + podApply := corev1apply.Pod(name, namespace).WithStatus(corev1apply.PodStatus().WithResourceClaimStatuses(statuses...)) + if _, err := ec.kubeClient.CoreV1().Pods(namespace).ApplyStatus(ctx, podApply, metav1.ApplyOptions{FieldManager: fieldManager, Force: true}); err != nil { + return fmt.Errorf("update pod %s/%s ResourceClaimStatuses: %v", namespace, name, err) + } + } + return nil } // handleResourceClaim is invoked for each volume of a pod. 
-func (ec *Controller) handleClaim(ctx context.Context, pod *v1.Pod, podClaim v1.PodResourceClaim) error { +func (ec *Controller) handleClaim(ctx context.Context, pod *v1.Pod, podClaim v1.PodResourceClaim, newPodClaims *map[string]string) error { logger := klog.LoggerWithValues(klog.FromContext(ctx), "podClaim", podClaim.Name) ctx = klog.NewContext(ctx, logger) logger.V(5).Info("checking", "podClaim", podClaim.Name) - templateName := podClaim.Source.ResourceClaimTemplateName - if templateName == nil { - return nil - } - claimName := resourceclaim.Name(pod, &podClaim) - claim, err := ec.claimLister.ResourceClaims(pod.Namespace).Get(claimName) - if err != nil && !errors.IsNotFound(err) { - return err - } - if claim != nil { - if err := resourceclaim.IsForPod(pod, claim); err != nil { + // resourceclaim.Name checks for the situation that the client doesn't + // know some future addition to the API. Therefore it gets called here + // even if there is no template to work on, because if some new field + // gets added, the expectation might be that the controller does + // something for it. + claimName, mustCheckOwner, err := resourceclaim.Name(pod, &podClaim) + switch { + case errors.Is(err, resourceclaim.ErrClaimNotFound): + // Continue below. + case err != nil: + return fmt.Errorf("checking for claim before creating it: %v", err) + case claimName == nil: + // Nothing to do, no claim needed. + return nil + case *claimName != "": + claimName := *claimName + // The ResourceClaim should exist because it is recorded in the pod.status.resourceClaimStatuses, + // but perhaps it was deleted accidentally. In that case we re-create it. + claim, err := ec.claimLister.ResourceClaims(pod.Namespace).Get(claimName) + if err != nil && !apierrors.IsNotFound(err) { return err } - // Already created, nothing more to do. - logger.V(5).Info("claim already created", "podClaim", podClaim.Name, "resourceClaim", claimName) + if claim != nil { + var err error + if mustCheckOwner { + err = resourceclaim.IsForPod(pod, claim) + } + if err == nil { + // Already created, nothing more to do. + logger.V(5).Info("claim already created", "podClaim", podClaim.Name, "resourceClaim", claimName) + return nil + } + logger.Error(err, "claim that was created for the pod is no longer owned by the pod, creating a new one", "podClaim", podClaim.Name, "resourceClaim", claimName) + } + } + + templateName := podClaim.Source.ResourceClaimTemplateName + if templateName == nil { + // Nothing to do. return nil } - template, err := ec.templateLister.ResourceClaimTemplates(pod.Namespace).Get(*templateName) + // Before we create a new ResourceClaim, check if there is an orphaned one. + // This covers the case that the controller has created it, but then fails + // before it can update the pod status. + claim, err := ec.findPodResourceClaim(pod, podClaim) if err != nil { - return fmt.Errorf("resource claim template %q: %v", *templateName, err) + return fmt.Errorf("finding ResourceClaim for claim %s in pod %s/%s failed: %v", podClaim.Name, pod.Namespace, pod.Name, err) } - // Create the ResourceClaim with pod as owner. 
- isTrue := true - claim = &resourcev1alpha2.ResourceClaim{ - ObjectMeta: metav1.ObjectMeta{ - Name: claimName, - OwnerReferences: []metav1.OwnerReference{ - { - APIVersion: "v1", - Kind: "Pod", - Name: pod.Name, - UID: pod.UID, - Controller: &isTrue, - BlockOwnerDeletion: &isTrue, + if claim == nil { + template, err := ec.templateLister.ResourceClaimTemplates(pod.Namespace).Get(*templateName) + if err != nil { + return fmt.Errorf("resource claim template %q: %v", *templateName, err) + } + + // Create the ResourceClaim with pod as owner, with a generated name that uses + // <pod name>-<claim name> as base. + isTrue := true + annotations := template.Spec.ObjectMeta.Annotations + if annotations == nil { + annotations = make(map[string]string) + } + annotations[podResourceClaimAnnotation] = podClaim.Name + generateName := pod.Name + "-" + podClaim.Name + maxBaseLen := 57 // Leave space for hyphen and 5 random characters in a name with 63 characters. + if len(generateName) > maxBaseLen { + // We could leave truncation to the apiserver, but as + // it removes at the end, we would lose everything + // from the pod claim name when the pod name is long. + // We can do better and truncate both strings, + // proportional to their length. + generateName = pod.Name[0:len(pod.Name)*maxBaseLen/len(generateName)] + + "-" + + podClaim.Name[0:len(podClaim.Name)*maxBaseLen/len(generateName)] + } + claim = &resourcev1alpha2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: generateName, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "v1", + Kind: "Pod", + Name: pod.Name, + UID: pod.UID, + Controller: &isTrue, + BlockOwnerDeletion: &isTrue, + }, }, + Annotations: annotations, + Labels: template.Spec.ObjectMeta.Labels, }, - Annotations: template.Spec.ObjectMeta.Annotations, - Labels: template.Spec.ObjectMeta.Labels, - }, - Spec: template.Spec.Spec, + Spec: template.Spec.Spec, + } + metrics.ResourceClaimCreateAttempts.Inc() + claimName := claim.Name + claim, err = ec.kubeClient.ResourceV1alpha2().ResourceClaims(pod.Namespace).Create(ctx, claim, metav1.CreateOptions{}) + if err != nil { + metrics.ResourceClaimCreateFailures.Inc() + return fmt.Errorf("create ResourceClaim %s: %v", claimName, err) + } } - metrics.ResourceClaimCreateAttempts.Inc() - _, err = ec.kubeClient.ResourceV1alpha2().ResourceClaims(pod.Namespace).Create(ctx, claim, metav1.CreateOptions{}) - if err != nil { - metrics.ResourceClaimCreateFailures.Inc() - return fmt.Errorf("create ResourceClaim %s: %v", claimName, err) + + // Remember the new ResourceClaim for a batch PodStatus update in our caller. + if *newPodClaims == nil { + *newPodClaims = make(map[string]string) } + (*newPodClaims)[podClaim.Name] = claim.Name + return nil } +// findPodResourceClaim looks for an existing ResourceClaim with the right +// annotation (ties it to the pod claim) and the right ownership (ties it to +// the pod). +func (ec *Controller) findPodResourceClaim(pod *v1.Pod, podClaim v1.PodResourceClaim) (*resourcev1alpha2.ResourceClaim, error) { + claims, err := ec.claimLister.List(labels.Everything()) + if err != nil { + return nil, err + } + deterministicName := pod.Name + "-" + podClaim.Name // Kubernetes <= 1.27 behavior. + for _, claim := range claims { + if err := resourceclaim.IsForPod(pod, claim); err != nil { + continue + } + podClaimName, ok := claim.Annotations[podResourceClaimAnnotation] + if ok && podClaimName != podClaim.Name { + continue + } + + // No annotation? 
It might be a ResourceClaim created for + // the pod with a previous Kubernetes release where the + // ResourceClaim name was deterministic, in which case + // we have to use it and update the new pod status + // field accordingly. + if !ok && claim.Name != deterministicName { + continue + } + + // Pick the first one that matches. There shouldn't be more than one. If there is, + // then all others will be ignored until the pod gets deleted. Then they also get + // cleaned up. + return claim, nil + } + return nil, nil +} + func (ec *Controller) syncClaim(ctx context.Context, namespace, name string) error { logger := klog.LoggerWithValues(klog.FromContext(ctx), "claim", klog.KRef(namespace, name)) ctx = klog.NewContext(ctx, logger) claim, err := ec.claimLister.ResourceClaims(namespace).Get(name) if err != nil { - if errors.IsNotFound(err) { + if apierrors.IsNotFound(err) { logger.V(5).Info("nothing to do for claim, it is gone") return nil } @@ -438,7 +570,7 @@ func (ec *Controller) syncClaim(ctx context.Context, namespace, name string) err } else { pod, err := ec.podLister.Pods(claim.Namespace).Get(reservedFor.Name) switch { - case err != nil && !errors.IsNotFound(err): + case err != nil && !apierrors.IsNotFound(err): return err case err != nil: // We might not have it in our informer cache @@ -447,7 +579,7 @@ func (ec *Controller) syncClaim(ctx context.Context, namespace, name string) err // absolutely sure and thus have to check with // the API server. pod, err := ec.kubeClient.CoreV1().Pods(claim.Namespace).Get(ctx, reservedFor.Name, metav1.GetOptions{}) - if err != nil && !errors.IsNotFound(err) { + if err != nil && !apierrors.IsNotFound(err) { return err } if pod == nil || pod.UID != reservedFor.UID { @@ -528,7 +660,7 @@ func (ec *Controller) syncClaim(ctx context.Context, namespace, name string) err } else { logger.V(6).Info("wrong pod content, not deleting claim", "claim", klog.KObj(claim), "podUID", podUID, "podContent", pod) } - case errors.IsNotFound(err): + case apierrors.IsNotFound(err): // We might not know the pod *yet*. Instead of doing an expensive API call, // let the garbage collector handle the case that the pod is truly gone. logger.V(5).Info("pod for claim not found", "claim", klog.KObj(claim), "pod", klog.KRef(claim.Namespace, podName)) @@ -564,8 +696,14 @@ func podResourceClaimIndexFunc(obj interface{}) ([]string, error) { keys := []string{} for _, podClaim := range pod.Spec.ResourceClaims { if podClaim.Source.ResourceClaimTemplateName != nil { - claimName := resourceclaim.Name(pod, &podClaim) - keys = append(keys, fmt.Sprintf("%s/%s", pod.Namespace, claimName)) + claimName, _, err := resourceclaim.Name(pod, &podClaim) + if err != nil || claimName == nil { + // Index functions are not supposed to fail, the caller will panic. + // For both error reasons (claim not created yet, unknown API) + // we simply don't index. 
+ continue + } + keys = append(keys, fmt.Sprintf("%s/%s", pod.Namespace, *claimName)) } } return keys, nil diff --git a/pkg/controller/resourceclaim/controller_test.go b/pkg/controller/resourceclaim/controller_test.go index aef37dbf50b..8b8dcf7e044 100644 --- a/pkg/controller/resourceclaim/controller_test.go +++ b/pkg/controller/resourceclaim/controller_test.go @@ -19,7 +19,9 @@ package resourceclaim import ( "context" "errors" + "fmt" "sort" + "sync" "testing" "github.com/stretchr/testify/assert" @@ -53,6 +55,7 @@ var ( testPodWithResource = makePod(testPodName, testNamespace, testPodUID, *makePodResourceClaim(podResourceClaimName, templateName)) otherTestPod = makePod(testPodName+"-II", testNamespace, testPodUID+"-II") testClaim = makeClaim(testPodName+"-"+podResourceClaimName, testNamespace, className, makeOwnerReference(testPodWithResource, true)) + generatedTestClaim = makeGeneratedClaim(podResourceClaimName, testPodName+"-"+podResourceClaimName, testNamespace, className, 1, makeOwnerReference(testPodWithResource, true)) testClaimReserved = func() *resourcev1alpha2.ResourceClaim { claim := testClaim.DeepCopy() claim.Status.ReservedFor = append(claim.Status.ReservedFor, @@ -86,22 +89,67 @@ func init() { func TestSyncHandler(t *testing.T) { tests := []struct { - name string - key string - claims []*resourcev1alpha2.ResourceClaim - pods []*v1.Pod - podsLater []*v1.Pod - templates []*resourcev1alpha2.ResourceClaimTemplate - expectedClaims []resourcev1alpha2.ResourceClaim - expectedError bool - expectedMetrics expectedMetrics + name string + key string + claims []*resourcev1alpha2.ResourceClaim + pods []*v1.Pod + podsLater []*v1.Pod + templates []*resourcev1alpha2.ResourceClaimTemplate + expectedClaims []resourcev1alpha2.ResourceClaim + expectedStatuses map[string][]v1.PodResourceClaimStatus + expectedError bool + expectedMetrics expectedMetrics }{ { - name: "create", - pods: []*v1.Pod{testPodWithResource}, - templates: []*resourcev1alpha2.ResourceClaimTemplate{template}, - key: podKey(testPodWithResource), - expectedClaims: []resourcev1alpha2.ResourceClaim{*testClaim}, + name: "create", + pods: []*v1.Pod{testPodWithResource}, + templates: []*resourcev1alpha2.ResourceClaimTemplate{template}, + key: podKey(testPodWithResource), + expectedClaims: []resourcev1alpha2.ResourceClaim{*generatedTestClaim}, + expectedStatuses: map[string][]v1.PodResourceClaimStatus{ + testPodWithResource.Name: { + {Name: testPodWithResource.Spec.ResourceClaims[0].Name, ResourceClaimName: &generatedTestClaim.Name}, + }, + }, + expectedMetrics: expectedMetrics{1, 0}, + }, + { + name: "nop", + pods: []*v1.Pod{func() *v1.Pod { + pod := testPodWithResource.DeepCopy() + pod.Status.ResourceClaimStatuses = []v1.PodResourceClaimStatus{ + {Name: testPodWithResource.Spec.ResourceClaims[0].Name, ResourceClaimName: &generatedTestClaim.Name}, + } + return pod + }()}, + templates: []*resourcev1alpha2.ResourceClaimTemplate{template}, + key: podKey(testPodWithResource), + claims: []*resourcev1alpha2.ResourceClaim{generatedTestClaim}, + expectedClaims: []resourcev1alpha2.ResourceClaim{*generatedTestClaim}, + expectedStatuses: map[string][]v1.PodResourceClaimStatus{ + testPodWithResource.Name: { + {Name: testPodWithResource.Spec.ResourceClaims[0].Name, ResourceClaimName: &generatedTestClaim.Name}, + }, + }, + expectedMetrics: expectedMetrics{0, 0}, + }, + { + name: "recreate", + pods: []*v1.Pod{func() *v1.Pod { + pod := testPodWithResource.DeepCopy() + pod.Status.ResourceClaimStatuses = []v1.PodResourceClaimStatus{ + {Name: 
testPodWithResource.Spec.ResourceClaims[0].Name, ResourceClaimName: &generatedTestClaim.Name}, + } + return pod + }()}, + templates: []*resourcev1alpha2.ResourceClaimTemplate{template}, + key: podKey(testPodWithResource), + expectedClaims: []resourcev1alpha2.ResourceClaim{*generatedTestClaim}, + expectedStatuses: map[string][]v1.PodResourceClaimStatus{ + testPodWithResource.Name: { + {Name: testPodWithResource.Spec.ResourceClaims[0].Name, ResourceClaimName: &generatedTestClaim.Name}, + }, + }, expectedMetrics: expectedMetrics{1, 0}, }, { @@ -112,11 +160,29 @@ func TestSyncHandler(t *testing.T) { expectedError: true, }, { - name: "nop", - pods: []*v1.Pod{testPodWithResource}, - key: podKey(testPodWithResource), - claims: []*resourcev1alpha2.ResourceClaim{testClaim}, - expectedClaims: []resourcev1alpha2.ResourceClaim{*testClaim}, + name: "find-existing-claim-by-label", + pods: []*v1.Pod{testPodWithResource}, + key: podKey(testPodWithResource), + claims: []*resourcev1alpha2.ResourceClaim{generatedTestClaim}, + expectedClaims: []resourcev1alpha2.ResourceClaim{*generatedTestClaim}, + expectedStatuses: map[string][]v1.PodResourceClaimStatus{ + testPodWithResource.Name: { + {Name: testPodWithResource.Spec.ResourceClaims[0].Name, ResourceClaimName: &generatedTestClaim.Name}, + }, + }, + expectedMetrics: expectedMetrics{0, 0}, + }, + { + name: "find-existing-claim-by-name", + pods: []*v1.Pod{testPodWithResource}, + key: podKey(testPodWithResource), + claims: []*resourcev1alpha2.ResourceClaim{testClaim}, + expectedClaims: []resourcev1alpha2.ResourceClaim{*testClaim}, + expectedStatuses: map[string][]v1.PodResourceClaimStatus{ + testPodWithResource.Name: { + {Name: testPodWithResource.Spec.ResourceClaims[0].Name, ResourceClaimName: &testClaim.Name}, + }, + }, expectedMetrics: expectedMetrics{0, 0}, }, { @@ -139,12 +205,17 @@ func TestSyncHandler(t *testing.T) { key: podKey(testPod), }, { - name: "create-with-other-claim", - pods: []*v1.Pod{testPodWithResource}, - templates: []*resourcev1alpha2.ResourceClaimTemplate{template}, - key: podKey(testPodWithResource), - claims: []*resourcev1alpha2.ResourceClaim{otherNamespaceClaim}, - expectedClaims: []resourcev1alpha2.ResourceClaim{*otherNamespaceClaim, *testClaim}, + name: "create-with-other-claim", + pods: []*v1.Pod{testPodWithResource}, + templates: []*resourcev1alpha2.ResourceClaimTemplate{template}, + key: podKey(testPodWithResource), + claims: []*resourcev1alpha2.ResourceClaim{otherNamespaceClaim}, + expectedClaims: []resourcev1alpha2.ResourceClaim{*otherNamespaceClaim, *generatedTestClaim}, + expectedStatuses: map[string][]v1.PodResourceClaimStatus{ + testPodWithResource.Name: { + {Name: testPodWithResource.Spec.ResourceClaims[0].Name, ResourceClaimName: &generatedTestClaim.Name}, + }, + }, expectedMetrics: expectedMetrics{1, 0}, }, { @@ -296,6 +367,23 @@ func TestSyncHandler(t *testing.T) { t.Fatalf("unexpected error while listing claims: %v", err) } assert.Equal(t, normalizeClaims(tc.expectedClaims), normalizeClaims(claims.Items)) + + pods, err := fakeKubeClient.CoreV1().Pods("").List(ctx, metav1.ListOptions{}) + if err != nil { + t.Fatalf("unexpected error while listing pods: %v", err) + } + var actualStatuses map[string][]v1.PodResourceClaimStatus + for _, pod := range pods.Items { + if len(pod.Status.ResourceClaimStatuses) == 0 { + continue + } + if actualStatuses == nil { + actualStatuses = make(map[string][]v1.PodResourceClaimStatus) + } + actualStatuses[pod.Name] = pod.Status.ResourceClaimStatuses + } + assert.Equal(t, tc.expectedStatuses, 
actualStatuses, "pod resource claim statuses") + expectMetrics(t, tc.expectedMetrics) }) } @@ -315,6 +403,25 @@ func makeClaim(name, namespace, classname string, owner *metav1.OwnerReference) return claim } +func makeGeneratedClaim(podClaimName, generateName, namespace, classname string, createCounter int, owner *metav1.OwnerReference) *resourcev1alpha2.ResourceClaim { + claim := &resourcev1alpha2.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("%s-%d", generateName, createCounter), + GenerateName: generateName, + Namespace: namespace, + Annotations: map[string]string{"resource.kubernetes.io/pod-claim-name": podClaimName}, + }, + Spec: resourcev1alpha2.ResourceClaimSpec{ + ResourceClassName: classname, + }, + } + if owner != nil { + claim.OwnerReferences = []metav1.OwnerReference{*owner} + } + + return claim +} + func makePodResourceClaim(name, templateName string) *v1.PodResourceClaim { return &v1.PodResourceClaim{ Name: name, @@ -369,8 +476,13 @@ func makeOwnerReference(pod *v1.Pod, isController bool) *metav1.OwnerReference { func normalizeClaims(claims []resourcev1alpha2.ResourceClaim) []resourcev1alpha2.ResourceClaim { sort.Slice(claims, func(i, j int) bool { - return claims[i].Namespace < claims[j].Namespace || - claims[i].Name < claims[j].Name + if claims[i].Namespace < claims[j].Namespace { + return true + } + if claims[i].Namespace > claims[j].Namespace { + return false + } + return claims[i].Name < claims[j].Name }) for i := range claims { if len(claims[i].Status.ReservedFor) == 0 { @@ -382,9 +494,27 @@ func normalizeClaims(claims []resourcev1alpha2.ResourceClaim) []resourcev1alpha2 func createTestClient(objects ...runtime.Object) *fake.Clientset { fakeClient := fake.NewSimpleClientset(objects...) + fakeClient.PrependReactor("create", "resourceclaims", createResourceClaimReactor()) return fakeClient } +// createResourceClaimReactor implements the logic required for the GenerateName field to work when using +// the fake client. Add it with client.PrependReactor to your fake client. +func createResourceClaimReactor() func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + nameCounter := 1 + var mutex sync.Mutex + return func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + mutex.Lock() + defer mutex.Unlock() + claim := action.(k8stesting.CreateAction).GetObject().(*resourcev1alpha2.ResourceClaim) + if claim.Name == "" && claim.GenerateName != "" { + claim.Name = fmt.Sprintf("%s-%d", claim.GenerateName, nameCounter) + } + nameCounter++ + return false, nil, nil + } +} + // Metrics helpers type expectedMetrics struct { diff --git a/pkg/kubelet/cm/dra/manager.go b/pkg/kubelet/cm/dra/manager.go index d1f0a877012..54e5a550539 100644 --- a/pkg/kubelet/cm/dra/manager.go +++ b/pkg/kubelet/cm/dra/manager.go @@ -68,22 +68,35 @@ func NewManagerImpl(kubeClient clientset.Interface, stateFileDirectory string) ( func (m *ManagerImpl) PrepareResources(pod *v1.Pod) error { for i := range pod.Spec.ResourceClaims { podClaim := &pod.Spec.ResourceClaims[i] - claimName := resourceclaim.Name(pod, podClaim) - klog.V(3).InfoS("Processing resource", "claim", claimName, "pod", pod.Name) + klog.V(3).InfoS("Processing resource", "podClaim", podClaim.Name, "pod", pod.Name) + claimName, mustCheckOwner, err := resourceclaim.Name(pod, podClaim) + if err != nil { + return fmt.Errorf("prepare resource claim: %v", err) + } + if claimName == nil { + // Nothing to do. 
+ continue + } // Query claim object from the API server resourceClaim, err := m.kubeClient.ResourceV1alpha2().ResourceClaims(pod.Namespace).Get( context.TODO(), - claimName, + *claimName, metav1.GetOptions{}) if err != nil { - return fmt.Errorf("failed to fetch ResourceClaim %s referenced by pod %s: %+v", claimName, pod.Name, err) + return fmt.Errorf("failed to fetch ResourceClaim %s referenced by pod %s: %+v", *claimName, pod.Name, err) + } + + if mustCheckOwner { + if err = resourceclaim.IsForPod(pod, resourceClaim); err != nil { + return err + } } // Check if pod is in the ReservedFor for the claim if !resourceclaim.IsReservedForPod(pod, resourceClaim) { return fmt.Errorf("pod %s(%s) is not allowed to use resource claim %s(%s)", - pod.Name, pod.UID, claimName, resourceClaim.UID) + pod.Name, pod.UID, *claimName, resourceClaim.UID) } // If no container actually uses the claim, then we don't need @@ -94,7 +107,7 @@ func (m *ManagerImpl) PrepareResources(pod *v1.Pod) error { } // Is the resource already prepared? Then add the pod UID to it. - if claimInfo := m.cache.get(claimName, pod.Namespace); claimInfo != nil { + if claimInfo := m.cache.get(*claimName, pod.Namespace); claimInfo != nil { // We delay checkpointing of this change until this call // returns successfully. It is OK to do this because we // will only return successfully from this call if the @@ -221,20 +234,28 @@ func (m *ManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*Conta cdiDevices := []kubecontainer.CDIDevice{} for i, podResourceClaim := range pod.Spec.ResourceClaims { - claimName := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i]) - + claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i]) + if err != nil { + return nil, fmt.Errorf("list resource claims: %v", err) + } + // The claim name might be nil if no underlying resource claim + // was generated for the referenced claim. There are valid use + // cases when this might happen, so we simply skip it. + if claimName == nil { + continue + } for _, claim := range container.Resources.Claims { if podResourceClaim.Name != claim.Name { continue } - claimInfo := m.cache.get(claimName, pod.Namespace) + claimInfo := m.cache.get(*claimName, pod.Namespace) if claimInfo == nil { - return nil, fmt.Errorf("unable to get resource for namespace: %s, claim: %s", pod.Namespace, claimName) + return nil, fmt.Errorf("unable to get resource for namespace: %s, claim: %s", pod.Namespace, *claimName) } claimInfo.RLock() - klog.V(3).InfoS("Add resource annotations", "claim", claimName, "annotations", claimInfo.annotations) + klog.V(3).InfoS("Add resource annotations", "claim", *claimName, "annotations", claimInfo.annotations) annotations = append(annotations, claimInfo.annotations...) for _, devices := range claimInfo.CDIDevices { for _, device := range devices { @@ -255,8 +276,19 @@ func (m *ManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*Conta func (m *ManagerImpl) UnprepareResources(pod *v1.Pod) error { // Call NodeUnprepareResource RPC for every resource claim referenced by the pod for i := range pod.Spec.ResourceClaims { - claimName := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i]) - claimInfo := m.cache.get(claimName, pod.Namespace) + claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i]) + if err != nil { + return fmt.Errorf("unprepare resource claim: %v", err) + } + + // The claim name might be nil if no underlying resource claim + // was generated for the referenced claim. 
There are valid use + // cases when this might happen, so we simply skip it. + if claimName == nil { + continue + } + + claimInfo := m.cache.get(*claimName, pod.Namespace) // Skip calling NodeUnprepareResource if claim info is not cached if claimInfo == nil { @@ -277,10 +309,10 @@ func (m *ManagerImpl) UnprepareResources(pod *v1.Pod) error { // Query claim object from the API server resourceClaim, err := m.kubeClient.ResourceV1alpha2().ResourceClaims(pod.Namespace).Get( context.TODO(), - claimName, + *claimName, metav1.GetOptions{}) if err != nil { - return fmt.Errorf("failed to fetch ResourceClaim %s referenced by pod %s: %+v", claimName, pod.Name, err) + return fmt.Errorf("failed to fetch ResourceClaim %s referenced by pod %s: %+v", *claimName, pod.Name, err) } // Grab the allocation.resourceHandles. If there are no @@ -353,15 +385,18 @@ func (m *ManagerImpl) GetContainerClaimInfos(pod *v1.Pod, container *v1.Containe claimInfos := make([]*ClaimInfo, 0, len(pod.Spec.ResourceClaims)) for i, podResourceClaim := range pod.Spec.ResourceClaims { - claimName := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i]) + claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i]) + if err != nil { + return nil, fmt.Errorf("determine resource claim information: %v", err) + } for _, claim := range container.Resources.Claims { if podResourceClaim.Name != claim.Name { continue } - claimInfo := m.cache.get(claimName, pod.Namespace) + claimInfo := m.cache.get(*claimName, pod.Namespace) if claimInfo == nil { - return nil, fmt.Errorf("unable to get resource for namespace: %s, claim: %s", pod.Namespace, claimName) + return nil, fmt.Errorf("unable to get resource for namespace: %s, claim: %s", pod.Namespace, *claimName) } claimInfos = append(claimInfos, claimInfo) } diff --git a/pkg/kubelet/status/status_manager.go b/pkg/kubelet/status/status_manager.go index a37178a5577..ed553215c1e 100644 --- a/pkg/kubelet/status/status_manager.go +++ b/pkg/kubelet/status/status_manager.go @@ -1081,6 +1081,9 @@ func mergePodStatus(oldPodStatus, newPodStatus v1.PodStatus, couldHaveRunningCon } newPodStatus.Conditions = podConditions + // ResourceClaimStatuses is not owned and not modified by kubelet. + newPodStatus.ResourceClaimStatuses = oldPodStatus.ResourceClaimStatuses + // Delay transitioning a pod to a terminal status unless the pod is actually terminal. // The Kubelet should never transition a pod to terminal status that could have running // containers and thus actively be leveraging exclusive resources. 
Note that resources diff --git a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go index 3aae5afca75..8ec90bae497 100644 --- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go +++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go @@ -268,16 +268,18 @@ func (pl *dynamicResources) EventsToRegister() []framework.ClusterEventWithHint func (pl *dynamicResources) podResourceClaims(pod *v1.Pod) ([]*resourcev1alpha2.ResourceClaim, error) { claims := make([]*resourcev1alpha2.ResourceClaim, 0, len(pod.Spec.ResourceClaims)) for _, resource := range pod.Spec.ResourceClaims { - claimName := resourceclaim.Name(pod, &resource) - isEphemeral := resource.Source.ResourceClaimTemplateName != nil - claim, err := pl.claimLister.ResourceClaims(pod.Namespace).Get(claimName) + claimName, mustCheckOwner, err := resourceclaim.Name(pod, &resource) + if err != nil { + return nil, err + } + // The claim name might be nil if no underlying resource claim + // was generated for the referenced claim. There are valid use + // cases when this might happen, so we simply skip it. + if claimName == nil { + continue + } + claim, err := pl.claimLister.ResourceClaims(pod.Namespace).Get(*claimName) if err != nil { - // The error usually has already enough context ("resourcevolumeclaim "myclaim" not found"), - // but we can do better for generic ephemeral inline volumes where that situation - // is normal directly after creating a pod. - if isEphemeral && apierrors.IsNotFound(err) { - err = fmt.Errorf("waiting for dynamic resource controller to create the resourceclaim %q", claimName) - } return nil, err } @@ -285,7 +287,7 @@ func (pl *dynamicResources) podResourceClaims(pod *v1.Pod) ([]*resourcev1alpha2. return nil, fmt.Errorf("resourceclaim %q is being deleted", claim.Name) } - if isEphemeral { + if mustCheckOwner { if err := resourceclaim.IsForPod(pod, claim); err != nil { return nil, err } diff --git a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go index 189750db32b..2c863463199 100644 --- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go +++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go @@ -76,6 +76,16 @@ var ( UID(podUID). PodResourceClaims(v1.PodResourceClaim{Name: resourceName, Source: v1.ClaimSource{ResourceClaimTemplateName: &claimName}}). Obj() + podWithClaimTemplateInStatus = func() *v1.Pod { + pod := podWithClaimTemplate.DeepCopy() + pod.Status.ResourceClaimStatuses = []v1.PodResourceClaimStatus{ + { + Name: pod.Spec.ResourceClaims[0].Name, + ResourceClaimName: &claimName, + }, + } + return pod + }() podWithTwoClaimNames = st.MakePod().Name(podName).Namespace(namespace). UID(podUID). PodResourceClaims(v1.PodResourceClaim{Name: resourceName, Source: v1.ClaimSource{ResourceClaimName: &claimName}}). 
@@ -238,7 +248,7 @@ func TestPlugin(t *testing.T) { }, }, "claim-template": { - pod: podWithClaimTemplate, + pod: podWithClaimTemplateInStatus, claims: []*resourcev1alpha2.ResourceClaim{allocatedClaim, otherClaim}, want: want{ reserve: result{ @@ -255,10 +265,11 @@ func TestPlugin(t *testing.T) { }, }, "missing-claim": { - pod: podWithClaimTemplate, + pod: podWithClaimTemplate, // status not set + claims: []*resourcev1alpha2.ResourceClaim{allocatedClaim, otherClaim}, want: want{ prefilter: result{ - status: framework.NewStatus(framework.UnschedulableAndUnresolvable, `waiting for dynamic resource controller to create the resourceclaim "my-pod-my-resource"`), + status: framework.NewStatus(framework.UnschedulableAndUnresolvable, `pod "default/my-pod": ResourceClaim not created yet`), }, postfilter: result{ status: framework.NewStatus(framework.Unschedulable, `no new claims to deallocate`), diff --git a/pkg/scheduler/internal/queue/scheduling_queue.go b/pkg/scheduler/internal/queue/scheduling_queue.go index b15cc1cd54a..9f427712480 100644 --- a/pkg/scheduler/internal/queue/scheduling_queue.go +++ b/pkg/scheduler/internal/queue/scheduling_queue.go @@ -689,13 +689,17 @@ func (p *PriorityQueue) Pop() (*framework.QueuedPodInfo, error) { } // isPodUpdated checks if the pod is updated in a way that it may have become -// schedulable. It drops status of the pod and compares it with old version. +// schedulable. It drops status of the pod and compares it with old version, +// except for pod.status.resourceClaimStatuses: changing that may have an +// effect on scheduling. func isPodUpdated(oldPod, newPod *v1.Pod) bool { strip := func(pod *v1.Pod) *v1.Pod { p := pod.DeepCopy() p.ResourceVersion = "" p.Generation = 0 - p.Status = v1.PodStatus{} + p.Status = v1.PodStatus{ + ResourceClaimStatuses: pod.Status.ResourceClaimStatuses, + } p.ManagedFields = nil p.Finalizers = nil return p diff --git a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/controller_policy.go b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/controller_policy.go index 1c90e45c7a6..b1eb24a9c3e 100644 --- a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/controller_policy.go +++ b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/controller_policy.go @@ -214,6 +214,7 @@ func buildControllerRoles() ([]rbacv1.ClusterRole, []rbacv1.ClusterRoleBinding) rbacv1helpers.NewRule("update").Groups(legacyGroup).Resources("pods/finalizers").RuleOrDie(), rbacv1helpers.NewRule("get", "list", "watch", "create", "delete").Groups(resourceGroup).Resources("resourceclaims").RuleOrDie(), rbacv1helpers.NewRule("update", "patch").Groups(resourceGroup).Resources("resourceclaims/status").RuleOrDie(), + rbacv1helpers.NewRule("update", "patch").Groups(legacyGroup).Resources("pods/status").RuleOrDie(), eventsRule(), }, }) diff --git a/staging/src/k8s.io/api/core/v1/types.go b/staging/src/k8s.io/api/core/v1/types.go index 258d37ff75e..8da304c9256 100644 --- a/staging/src/k8s.io/api/core/v1/types.go +++ b/staging/src/k8s.io/api/core/v1/types.go @@ -3550,15 +3550,9 @@ type ClaimSource struct { // // The template will be used to create a new ResourceClaim, which will // be bound to this pod. When this pod is deleted, the ResourceClaim - // will also be deleted. The name of the ResourceClaim will be -, where is the - // PodResourceClaim.Name. Pod validation will reject the pod if the - // concatenated name is not valid for a ResourceClaim (e.g. too long). 
- // - // An existing ResourceClaim with that name that is not owned by the - // pod will not be used for the pod to avoid using an unrelated - // resource by mistake. Scheduling and pod startup are then blocked - // until the unrelated ResourceClaim is removed. + // will also be deleted. The pod name and resource name, along with a + // generated component, will be used to form a unique name for the + // ResourceClaim, which will be recorded in pod.status.resourceClaimStatuses. // // This field is immutable and no changes will be made to the // corresponding ResourceClaim by the control plane after creating the @@ -3566,6 +3560,24 @@ type ClaimSource struct { ResourceClaimTemplateName *string `json:"resourceClaimTemplateName,omitempty" protobuf:"bytes,2,opt,name=resourceClaimTemplateName"` } +// PodResourceClaimStatus is stored in the PodStatus for each PodResourceClaim +// which references a ResourceClaimTemplate. It stores the generated name for +// the corresponding ResourceClaim. +type PodResourceClaimStatus struct { + // Name uniquely identifies this resource claim inside the pod. + // This must match the name of an entry in pod.spec.resourceClaims, + // which implies that the string must be a DNS_LABEL. + Name string `json:"name" protobuf:"bytes,1,name=name"` + + // ResourceClaimName is the name of the ResourceClaim that was + // generated for the Pod in the namespace of the Pod. It this is + // unset, then generating a ResourceClaim was not necessary. The + // pod.spec.resourceClaims entry can be ignored in this case. + // + // +optional + ResourceClaimName *string `json:"resourceClaimName,omitempty" protobuf:"bytes,2,opt,name=resourceClaimName"` +} + // OSName is the set of OS'es that can be used in OS. type OSName string @@ -4207,6 +4219,15 @@ type PodStatus struct { // +featureGate=InPlacePodVerticalScaling // +optional Resize PodResizeStatus `json:"resize,omitempty" protobuf:"bytes,14,opt,name=resize,casttype=PodResizeStatus"` + + // Status of resource claims. + // +patchMergeKey=name + // +patchStrategy=merge,retainKeys + // +listType=map + // +listMapKey=name + // +featureGate=DynamicResourceAllocation + // +optional + ResourceClaimStatuses []PodResourceClaimStatus `json:"resourceClaimStatuses,omitempty" patchStrategy:"merge,retainKeys" patchMergeKey:"name" protobuf:"bytes,15,rep,name=resourceClaimStatuses"` } // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object diff --git a/staging/src/k8s.io/dynamic-resource-allocation/controller/controller.go b/staging/src/k8s.io/dynamic-resource-allocation/controller/controller.go index 20bc825815f..22f7e75a043 100644 --- a/staging/src/k8s.io/dynamic-resource-allocation/controller/controller.go +++ b/staging/src/k8s.io/dynamic-resource-allocation/controller/controller.go @@ -625,13 +625,20 @@ func (ctrl *controller) allocateClaims(ctx context.Context, claims []*ClaimAlloc } func (ctrl *controller) checkPodClaim(ctx context.Context, pod *v1.Pod, podClaim v1.PodResourceClaim) (*ClaimAllocation, error) { - claimName := resourceclaim.Name(pod, &podClaim) - key := pod.Namespace + "/" + claimName + claimName, mustCheckOwner, err := resourceclaim.Name(pod, &podClaim) + if err != nil { + return nil, err + } + if claimName == nil { + // Nothing to do. 
+ return nil, nil + } + key := pod.Namespace + "/" + *claimName claim, err := ctrl.getCachedClaim(ctx, key) if claim == nil || err != nil { return nil, err } - if podClaim.Source.ResourceClaimTemplateName != nil { + if mustCheckOwner { if err := resourceclaim.IsForPod(pod, claim); err != nil { return nil, err } diff --git a/staging/src/k8s.io/dynamic-resource-allocation/resourceclaim/resourceclaim.go b/staging/src/k8s.io/dynamic-resource-allocation/resourceclaim/resourceclaim.go index cfe35988962..3fb1bced3d6 100644 --- a/staging/src/k8s.io/dynamic-resource-allocation/resourceclaim/resourceclaim.go +++ b/staging/src/k8s.io/dynamic-resource-allocation/resourceclaim/resourceclaim.go @@ -24,6 +24,7 @@ limitations under the License. package resourceclaim import ( + "errors" "fmt" v1 "k8s.io/api/core/v1" @@ -31,25 +32,53 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +var ( + // ErrAPIUnsupported is wrapped by the actual errors returned by Name and + // indicates that none of the required fields are set. + ErrAPIUnsupported = errors.New("none of the supported fields are set") + + // ErrClaimNotFound is wrapped by the actual errors returned by Name and + // indicates that the claim has not been created yet. + ErrClaimNotFound = errors.New("ResourceClaim not created yet") +) + // Name returns the name of the ResourceClaim object that gets referenced by or -// created for the PodResourceClaim. The name is deterministic and therefore -// this function does not need any additional information and it will never -// fail. +// created for the PodResourceClaim. Three different results are possible: // -// Either podClaim.ResourceClaimName or podClaim.Template must be non-nil, but not -// both. This is enforced by API validation. +// - An error is returned when some field is not set as expected (either the +// input is invalid or the API got extended and the library and the client +// using it need to be updated) or the claim hasn't been created yet. +// +// The error includes pod and pod claim name and the unexpected field and +// is derived from one of the pre-defined errors in this package. +// +// - A nil string pointer and no error when the ResourceClaim intentionally +// didn't get created and the PodResourceClaim can be ignored. +// +// - A pointer to the name and no error when the ResourceClaim got created. +// In this case the boolean determines whether IsForPod must be called +// after retrieving the ResourceClaim and before using it. // // If podClaim.Template is not nil, the caller must check that the // ResourceClaim is indeed the one that was created for the Pod by calling // IsUsable. 
-func Name(pod *v1.Pod, podClaim *v1.PodResourceClaim) string { - if podClaim.Source.ResourceClaimName != nil { - return *podClaim.Source.ResourceClaimName +func Name(pod *v1.Pod, podClaim *v1.PodResourceClaim) (name *string, mustCheckOwner bool, err error) { + switch { + case podClaim.Source.ResourceClaimName != nil: + return podClaim.Source.ResourceClaimName, false, nil + case podClaim.Source.ResourceClaimTemplateName != nil: + for _, status := range pod.Status.ResourceClaimStatuses { + if status.Name == podClaim.Name { + return status.ResourceClaimName, true, nil + } + } + return nil, false, fmt.Errorf(`pod "%s/%s": %w`, pod.Namespace, pod.Name, ErrClaimNotFound) + default: + return nil, false, fmt.Errorf(`pod "%s/%s", spec.resourceClaim %q: %w`, pod.Namespace, pod.Name, podClaim.Name, ErrAPIUnsupported) } - return pod.Name + "-" + podClaim.Name } -// IsForPod checks that the ResourceClaim is the ephemeral volume that +// IsForPod checks that the ResourceClaim is the one that // was created for the Pod. It returns an error that is informative // enough to be returned by the caller without adding further details // about the Pod or ResourceClaim. diff --git a/test/e2e/dra/dra.go b/test/e2e/dra/dra.go index 03fb397bfba..2d31f8265c6 100644 --- a/test/e2e/dra/dra.go +++ b/test/e2e/dra/dra.go @@ -20,6 +20,7 @@ import ( "context" "errors" "fmt" + "strings" "time" "github.com/onsi/ginkgo/v2" @@ -36,6 +37,7 @@ import ( "k8s.io/kubernetes/test/e2e/framework" e2epod "k8s.io/kubernetes/test/e2e/framework/pod" admissionapi "k8s.io/pod-security-admission/api" + utilpointer "k8s.io/utils/pointer" ) const ( @@ -180,6 +182,17 @@ var _ = ginkgo.Describe("[sig-node] DRA [Feature:DynamicResourceAllocation]", fu driver := NewDriver(f, nodes, networkResources) b := newBuilder(f, driver) + ginkgo.It("truncates the name of a generated resource claim", func(ctx context.Context) { + parameters := b.parameters() + pod, template := b.podInline(resourcev1alpha2.AllocationModeWaitForFirstConsumer) + pod.Name = strings.Repeat("p", 63) + pod.Spec.ResourceClaims[0].Name = strings.Repeat("c", 63) + pod.Spec.Containers[0].Resources.Claims[0].Name = pod.Spec.ResourceClaims[0].Name + b.create(ctx, parameters, template, pod) + + b.testPod(ctx, f.ClientSet, pod) + }) + // claimTests tries out several different combinations of pods with // claims, both inline and external. claimTests := func(allocationMode resourcev1alpha2.AllocationMode) { @@ -759,7 +772,7 @@ func (b *builder) podInline(allocationMode resourcev1alpha2.AllocationMode) (*v1 { Name: podClaimName, Source: v1.ClaimSource{ - ResourceClaimTemplateName: &pod.Name, + ResourceClaimTemplateName: utilpointer.String(pod.Name), }, }, }
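The hunks above change the contract for resolving a ResourceClaim from a pod claim entry: instead of concatenating a deterministic <pod name>-<claim name> string, the resourceclaim controller creates the claim with GenerateName and records the chosen name in pod.status.resourceClaimStatuses, which the updated resourceclaim.Name() consults. As a rough, standalone illustration (not part of this patch), the sketch below mirrors that lookup; the function name resolveClaimName and the sample values are hypothetical, while the real helper is k8s.io/dynamic-resource-allocation/resourceclaim.Name as modified above.

package main

import (
	"errors"
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// errClaimNotFound mirrors resourceclaim.ErrClaimNotFound from the patch.
var errClaimNotFound = errors.New("ResourceClaim not created yet")

// resolveClaimName is a hypothetical stand-in for the patched
// resourceclaim.Name(): it returns either the claim name to use (plus whether
// an ownership check is required), nil when no claim was generated on
// purpose, or an error when the generated claim does not exist yet.
func resolveClaimName(pod *v1.Pod, podClaim *v1.PodResourceClaim) (*string, bool, error) {
	switch {
	case podClaim.Source.ResourceClaimName != nil:
		// Claim referenced directly by name, no ownership check needed.
		return podClaim.Source.ResourceClaimName, false, nil
	case podClaim.Source.ResourceClaimTemplateName != nil:
		// Generated claim: the resourceclaim controller records the name
		// it picked in the pod status.
		for _, status := range pod.Status.ResourceClaimStatuses {
			if status.Name == podClaim.Name {
				return status.ResourceClaimName, true, nil
			}
		}
		return nil, false, fmt.Errorf("pod %q: %w", pod.Name, errClaimNotFound)
	default:
		return nil, false, fmt.Errorf("pod %q, claim %q: no supported ClaimSource field set", pod.Name, podClaim.Name)
	}
}

func main() {
	templateName := "my-template"
	generated := "my-pod-my-claim-7b5xk" // example of a server-generated name
	pod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "my-pod", Namespace: "default"},
		Spec: v1.PodSpec{
			ResourceClaims: []v1.PodResourceClaim{
				{Name: "my-claim", Source: v1.ClaimSource{ResourceClaimTemplateName: &templateName}},
			},
		},
		Status: v1.PodStatus{
			ResourceClaimStatuses: []v1.PodResourceClaimStatus{
				{Name: "my-claim", ResourceClaimName: &generated},
			},
		},
	}

	name, mustCheckOwner, err := resolveClaimName(pod, &pod.Spec.ResourceClaims[0])
	switch {
	case err != nil:
		fmt.Println("claim not usable yet:", err)
	case name == nil:
		fmt.Println("no ResourceClaim needed for this entry")
	default:
		fmt.Printf("use ResourceClaim %q (verify pod ownership first: %v)\n", *name, mustCheckOwner)
	}
}

Because GenerateName guarantees uniqueness on the server side, the old failure mode of rejecting pods whose concatenated claim name was too long goes away; anything that previously recomputed the deterministic name must now read the pod status, or find the claim via the resource.kubernetes.io/pod-claim-name annotation as the controller's findPodResourceClaim does.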