DRA admin access: add feature gate

The new DRAAdminAccess feature gate has the following effects:
- If disabled in the apiserver, the spec.devices.requests[*].adminAccess
  field gets cleared, both in the spec and in the status. The exception in
  both cases is an update of a claim or claim template where the field was
  already set: then the field is preserved (see the sketch after this list).

  Allocating a claim with admin access also remains allowed regardless of
  the feature gate, and the field is not cleared. In practice, the scheduler
  will not allocate such a claim while the gate is disabled there.
- If disabled in the resource claim controller, creating ResourceClaims
  with the field set gets rejected. This prevents running workloads
  which depend on admin access.
- If disabled in the scheduler, claims with admin access don't get
  allocated. The effect is the same: workloads which depend on admin access
  do not run.
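
For illustration, here is a minimal sketch of what the apiserver-side
clearing could look like. The helper names, the internal resource API
import, and the features.DRAAdminAccess Go constant are assumptions (the
actual apiserver change is not part of the diffs shown below); the status
side follows the same pattern.

import (
    utilfeature "k8s.io/apiserver/pkg/util/feature"
    resource "k8s.io/kubernetes/pkg/apis/resource"
    "k8s.io/kubernetes/pkg/features"
)

// dropDisabledAdminAccessFields clears spec.devices.requests[*].adminAccess
// unless the DRAAdminAccess feature gate is enabled or the old object (for
// updates) already uses the field.
func dropDisabledAdminAccessFields(newClaim, oldClaim *resource.ResourceClaim) {
    if utilfeature.DefaultFeatureGate.Enabled(features.DRAAdminAccess) {
        // Nothing to drop.
        return
    }
    if oldClaim != nil && adminAccessInUse(oldClaim) {
        // Update of an object which already uses the field: keep it.
        return
    }
    for i := range newClaim.Spec.Devices.Requests {
        newClaim.Spec.Devices.Requests[i].AdminAccess = false
    }
}

func adminAccessInUse(claim *resource.ResourceClaim) bool {
    for _, request := range claim.Spec.Devices.Requests {
        if request.AdminAccess {
            return true
        }
    }
    return false
}

On create there is no old object, so the field always gets cleared while the
gate is off; on update it survives if it was already in use, which matches
the description above.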

The alternative would have been to ignore the field in the claim controller
and the scheduler. That would be worse: a monitoring workload would then run
anyway, blocking resources that probably were meant for production workloads.
Author: Patrick Ohly
Date:   2024-10-09 20:12:49 +02:00
Parent: f3fef01e79
Commit: 9a7e4ccab2

29 changed files with 564 additions and 56 deletions


@@ -71,6 +71,9 @@ const (
 // Controller creates ResourceClaims for ResourceClaimTemplates in a pod spec.
 type Controller struct {
+    // adminAccessEnabled matches the DRAAdminAccess feature gate state.
+    adminAccessEnabled bool
+
     // kubeClient is the kube API client used to communicate with the API
     // server.
     kubeClient clientset.Interface
@@ -118,20 +121,22 @@ const (
 // NewController creates a ResourceClaim controller.
 func NewController(
     logger klog.Logger,
+    adminAccessEnabled bool,
     kubeClient clientset.Interface,
     podInformer v1informers.PodInformer,
     claimInformer resourceinformers.ResourceClaimInformer,
     templateInformer resourceinformers.ResourceClaimTemplateInformer) (*Controller, error) {
     ec := &Controller{
-        kubeClient:      kubeClient,
-        podLister:       podInformer.Lister(),
-        podIndexer:      podInformer.Informer().GetIndexer(),
-        podSynced:       podInformer.Informer().HasSynced,
-        claimLister:     claimInformer.Lister(),
-        claimsSynced:    claimInformer.Informer().HasSynced,
-        templateLister:  templateInformer.Lister(),
-        templatesSynced: templateInformer.Informer().HasSynced,
+        adminAccessEnabled: adminAccessEnabled,
+        kubeClient:         kubeClient,
+        podLister:          podInformer.Lister(),
+        podIndexer:         podInformer.Informer().GetIndexer(),
+        podSynced:          podInformer.Informer().HasSynced,
+        claimLister:        claimInformer.Lister(),
+        claimsSynced:       claimInformer.Informer().HasSynced,
+        templateLister:     templateInformer.Lister(),
+        templatesSynced:    templateInformer.Informer().HasSynced,
         queue: workqueue.NewTypedRateLimitingQueueWithConfig(
             workqueue.DefaultTypedControllerRateLimiter[string](),
             workqueue.TypedRateLimitingQueueConfig[string]{Name: "resource_claim"},
@@ -612,6 +617,10 @@ func (ec *Controller) handleClaim(ctx context.Context, pod *v1.Pod, podClaim v1.
         return fmt.Errorf("resource claim template %q: %v", *templateName, err)
     }
+    if !ec.adminAccessEnabled && needsAdminAccess(template) {
+        return errors.New("admin access is requested, but the feature is disabled")
+    }
+
     // Create the ResourceClaim with pod as owner, with a generated name that uses
     // <pod>-<claim name> as base.
     isTrue := true
@@ -670,6 +679,15 @@ func (ec *Controller) handleClaim(ctx context.Context, pod *v1.Pod, podClaim v1.
     return nil
 }
 
+func needsAdminAccess(claimTemplate *resourceapi.ResourceClaimTemplate) bool {
+    for _, request := range claimTemplate.Spec.Spec.Devices.Requests {
+        if request.AdminAccess {
+            return true
+        }
+    }
+    return false
+}
+
 // findPodResourceClaim looks for an existing ResourceClaim with the right
 // annotation (ties it to the pod claim) and the right ownership (ties it to
 // the pod).
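
The gate state gets passed in by whoever constructs the controller. Below is
a minimal sketch of that call, assuming the usual feature-gate helpers, a
features.DRAAdminAccess Go constant, and the controller's Run(ctx, workers)
entry point; the parameter names are placeholders and the actual
kube-controller-manager wiring is not part of the diffs shown here.

import (
    "context"
    "fmt"

    utilfeature "k8s.io/apiserver/pkg/util/feature"
    "k8s.io/client-go/informers"
    clientset "k8s.io/client-go/kubernetes"
    "k8s.io/klog/v2"
    "k8s.io/kubernetes/pkg/controller/resourceclaim"
    "k8s.io/kubernetes/pkg/features"
)

// startResourceClaimController shows the intended call pattern only; it is
// not the actual kube-controller-manager code.
func startResourceClaimController(ctx context.Context, client clientset.Interface, factory informers.SharedInformerFactory) error {
    ec, err := resourceclaim.NewController(
        klog.FromContext(ctx),
        utilfeature.DefaultFeatureGate.Enabled(features.DRAAdminAccess),
        client,
        factory.Core().V1().Pods(),
        factory.Resource().V1alpha3().ResourceClaims(),
        factory.Resource().V1alpha3().ResourceClaimTemplates(),
    )
    if err != nil {
        return fmt.Errorf("creating resourceclaim controller: %w", err)
    }
    go ec.Run(ctx, 1 /* workers; pick the configured concurrency */)
    return nil
}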


@@ -37,7 +37,6 @@ import (
     "k8s.io/client-go/kubernetes/fake"
     k8stesting "k8s.io/client-go/testing"
     "k8s.io/component-base/metrics/testutil"
-    "k8s.io/klog/v2"
     "k8s.io/kubernetes/pkg/controller"
     "k8s.io/kubernetes/pkg/controller/resourceclaim/metrics"
     "k8s.io/kubernetes/test/utils/ktesting"
@@ -83,17 +82,18 @@ var (
 func TestSyncHandler(t *testing.T) {
     tests := []struct {
-        name             string
-        key              string
-        claims           []*resourceapi.ResourceClaim
-        claimsInCache    []*resourceapi.ResourceClaim
-        pods             []*v1.Pod
-        podsLater        []*v1.Pod
-        templates        []*resourceapi.ResourceClaimTemplate
-        expectedClaims   []resourceapi.ResourceClaim
-        expectedStatuses map[string][]v1.PodResourceClaimStatus
-        expectedError    bool
-        expectedMetrics  expectedMetrics
+        name               string
+        key                string
+        adminAccessEnabled bool
+        claims             []*resourceapi.ResourceClaim
+        claimsInCache      []*resourceapi.ResourceClaim
+        pods               []*v1.Pod
+        podsLater          []*v1.Pod
+        templates          []*resourceapi.ResourceClaimTemplate
+        expectedClaims     []resourceapi.ResourceClaim
+        expectedStatuses   map[string][]v1.PodResourceClaimStatus
+        expectedError      bool
+        expectedMetrics    expectedMetrics
     }{
         {
             name: "create",
@@ -390,7 +390,7 @@ func TestSyncHandler(t *testing.T) {
         claimInformer := informerFactory.Resource().V1alpha3().ResourceClaims()
         templateInformer := informerFactory.Resource().V1alpha3().ResourceClaimTemplates()
-        ec, err := NewController(klog.FromContext(tCtx), fakeKubeClient, podInformer, claimInformer, templateInformer)
+        ec, err := NewController(tCtx.Logger(), tc.adminAccessEnabled, fakeKubeClient, podInformer, claimInformer, templateInformer)
         if err != nil {
             t.Fatalf("error creating ephemeral controller : %v", err)
         }
@@ -465,7 +465,7 @@ func TestResourceClaimEventHandler(t *testing.T) {
     templateInformer := informerFactory.Resource().V1alpha3().ResourceClaimTemplates()
     claimClient := fakeKubeClient.ResourceV1alpha3().ResourceClaims(testNamespace)
-    _, err := NewController(tCtx.Logger(), fakeKubeClient, podInformer, claimInformer, templateInformer)
+    _, err := NewController(tCtx.Logger(), false /* admin access */, fakeKubeClient, podInformer, claimInformer, templateInformer)
     tCtx.ExpectNoError(err, "creating ephemeral controller")
     informerFactory.Start(tCtx.Done())
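
The test hooks above only add the adminAccessEnabled knob; a table entry
exercising the rejection path could look roughly like the sketch below. The
fixtures podKey, testPodWithResource, and templateWithAdminAccess are
hypothetical stand-ins for whatever the test file actually defines; only the
struct fields themselves come from the diff.

    {
        name:               "create-admin-access-disabled",
        adminAccessEnabled: false,
        key:                podKey(testPodWithResource),
        pods:               []*v1.Pod{testPodWithResource},
        templates:          []*resourceapi.ResourceClaimTemplate{templateWithAdminAccess},
        expectedError:      true,
    },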