From ee3205804bbfb2bfc7258d0ff9e80e41c3c3f6eb Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Thu, 6 Jun 2024 09:40:06 +0200 Subject: [PATCH] dra e2e: demonstrate how to use RBAC + VAP for a kubelet plugin In reality, the kubelet plugin of a DRA driver is meant to be deployed as a daemonset with a service account that limits its permissions. https://kubernetes.io/docs/reference/access-authn-authz/service-accounts-admin/#additional-metadata-in-pod-bound-tokens ensures that the node name is bound to the pod, which then can be used in a validating admission policy (VAP) to ensure that the operations are limited to the node. In E2E testing, we emulate that via impersonation. This ensures that the plugin does not accidentally depend on additional permissions. --- test/e2e/dra/deploy.go | 137 ++++++++++++++++-- test/e2e/dra/dra.go | 92 ++++++++++++ .../deploy/example/plugin-permissions.yaml | 84 +++++++++++ 3 files changed, 304 insertions(+), 9 deletions(-) create mode 100644 test/e2e/dra/test-driver/deploy/example/plugin-permissions.yaml diff --git a/test/e2e/dra/deploy.go b/test/e2e/dra/deploy.go index 8c5d86c09d4..61682914365 100644 --- a/test/e2e/dra/deploy.go +++ b/test/e2e/dra/deploy.go @@ -19,6 +19,7 @@ package dra import ( "bytes" "context" + _ "embed" "errors" "fmt" "net" @@ -38,10 +39,19 @@ import ( v1 "k8s.io/api/core/v1" resourcev1alpha2 "k8s.io/api/resource/v1alpha2" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/selection" + "k8s.io/apiserver/pkg/authentication/serviceaccount" + "k8s.io/client-go/discovery/cached/memory" resourceapiinformer "k8s.io/client-go/informers/resource/v1alpha2" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/restmapper" "k8s.io/client-go/tools/cache" "k8s.io/dynamic-resource-allocation/kubeletplugin" "k8s.io/klog/v2" @@ -52,6 +62,7 @@ import ( e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper" "k8s.io/kubernetes/test/e2e/storage/drivers/proxy" "k8s.io/kubernetes/test/e2e/storage/utils" + "sigs.k8s.io/yaml" ) const ( @@ -63,10 +74,14 @@ type Nodes struct { NodeNames []string } +//go:embed test-driver/deploy/example/plugin-permissions.yaml +var pluginPermissions string + // NewNodes selects nodes to run the test on. func NewNodes(f *framework.Framework, minNodes, maxNodes int) *Nodes { nodes := &Nodes{} ginkgo.BeforeEach(func(ctx context.Context) { + ginkgo.By("selecting nodes") // The kubelet plugin is harder. We deploy the builtin manifest // after patching in the driver name and all nodes on which we @@ -166,15 +181,19 @@ type MethodInstance struct { } type Driver struct { - f *framework.Framework - ctx context.Context - cleanup []func() // executed first-in-first-out - wg sync.WaitGroup + f *framework.Framework + ctx context.Context + cleanup []func() // executed first-in-first-out + wg sync.WaitGroup + serviceAccountName string NameSuffix string Controller *app.ExampleController Name string - Nodes map[string]*app.ExamplePlugin + + // Nodes contains entries for each node selected for a test when the test runs. + // In addition, there is one entry for a fictional node. + Nodes map[string]KubeletPlugin parameterMode parameterMode parameterAPIGroup string @@ -189,6 +208,11 @@ type Driver struct { callCounts map[MethodInstance]int64 } +type KubeletPlugin struct { + *app.ExamplePlugin + ClientSet kubernetes.Interface +} + type parameterMode string const ( @@ -199,7 +223,7 @@ const ( func (d *Driver) SetUp(nodes *Nodes, resources app.Resources) { ginkgo.By(fmt.Sprintf("deploying driver on nodes %v", nodes.NodeNames)) - d.Nodes = map[string]*app.ExamplePlugin{} + d.Nodes = make(map[string]KubeletPlugin) d.Name = d.f.UniqueName + d.NameSuffix + ".k8s.io" resources.DriverName = d.Name @@ -250,6 +274,13 @@ func (d *Driver) SetUp(nodes *Nodes, resources app.Resources) { framework.Failf("unknown test driver parameter mode: %s", d.parameterMode) } + // Create service account and corresponding RBAC rules. + d.serviceAccountName = "dra-kubelet-plugin-" + d.Name + "-service-account" + content := pluginPermissions + content = strings.ReplaceAll(content, "dra-kubelet-plugin-namespace", d.f.Namespace.Name) + content = strings.ReplaceAll(content, "dra-kubelet-plugin", "dra-kubelet-plugin-"+d.Name) + d.createFromYAML(ctx, []byte(content), d.f.Namespace.Name) + instanceKey := "app.kubernetes.io/instance" rsName := "" draAddr := path.Join(framework.TestContext.KubeletRootDir, "plugins", d.Name+".sock") @@ -262,6 +293,7 @@ func (d *Driver) SetUp(nodes *Nodes, resources app.Resources) { item.Spec.Replicas = &numNodes item.Spec.Selector.MatchLabels[instanceKey] = d.Name item.Spec.Template.Labels[instanceKey] = d.Name + item.Spec.Template.Spec.ServiceAccountName = d.serviceAccountName item.Spec.Template.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution[0].LabelSelector.MatchLabels[instanceKey] = d.Name item.Spec.Template.Spec.Affinity.NodeAffinity = &v1.NodeAffinity{ RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ @@ -305,15 +337,31 @@ func (d *Driver) SetUp(nodes *Nodes, resources app.Resources) { framework.ExpectNoError(err, "list proxy pods") gomega.Expect(numNodes).To(gomega.Equal(int32(len(pods.Items))), "number of proxy pods") - // Run registar and plugin for each of the pods. + // Run registrar and plugin for each of the pods. for _, pod := range pods.Items { // Need a local variable, not the loop variable, for the anonymous // callback functions below. pod := pod nodename := pod.Spec.NodeName + + // Authenticate the plugin so that it has the exact same + // permissions as the daemonset pod. This includes RBAC and a + // validating admission policy which limits writes to per-node + // ResourceSlices. + // + // We could retrieve + // /var/run/secrets/kubernetes.io/serviceaccount/token from + // each pod and use it. That would check that + // ServiceAccountTokenNodeBindingValidation works. But that's + // better covered by a test owned by SIG Auth (like the one in + // https://github.com/kubernetes/kubernetes/pull/124711). + // + // Here we merely use impersonation, which is faster. + driverClient := d.impersonateKubeletPlugin(&pod) + logger := klog.LoggerWithValues(klog.LoggerWithName(klog.Background(), "kubelet plugin"), "node", pod.Spec.NodeName, "pod", klog.KObj(&pod)) loggerCtx := klog.NewContext(ctx, logger) - plugin, err := app.StartPlugin(loggerCtx, "/cdi", d.Name, d.f.ClientSet, nodename, + plugin, err := app.StartPlugin(loggerCtx, "/cdi", d.Name, driverClient, nodename, app.FileOperations{ Create: func(name string, content []byte) error { klog.Background().Info("creating CDI file", "node", nodename, "filename", name, "content", string(content)) @@ -342,7 +390,7 @@ func (d *Driver) SetUp(nodes *Nodes, resources app.Resources) { // Depends on cancel being called first. plugin.Stop() }) - d.Nodes[nodename] = plugin + d.Nodes[nodename] = KubeletPlugin{ExamplePlugin: plugin, ClientSet: driverClient} } // Wait for registration. @@ -359,6 +407,26 @@ func (d *Driver) SetUp(nodes *Nodes, resources app.Resources) { }).WithTimeout(time.Minute).Should(gomega.BeEmpty(), "hosts where the plugin has not been registered yet") } +func (d *Driver) impersonateKubeletPlugin(pod *v1.Pod) kubernetes.Interface { + ginkgo.GinkgoHelper() + driverUserInfo := (&serviceaccount.ServiceAccountInfo{ + Name: d.serviceAccountName, + Namespace: pod.Namespace, + NodeName: pod.Spec.NodeName, + PodName: pod.Name, + PodUID: string(pod.UID), + }).UserInfo() + driverClientConfig := d.f.ClientConfig() + driverClientConfig.Impersonate = rest.ImpersonationConfig{ + UserName: driverUserInfo.GetName(), + Groups: driverUserInfo.GetGroups(), + Extra: driverUserInfo.GetExtra(), + } + driverClient, err := kubernetes.NewForConfig(driverClientConfig) + framework.ExpectNoError(err, "create client for driver") + return driverClient +} + func (d *Driver) createFile(pod *v1.Pod, name string, content []byte) error { buffer := bytes.NewBuffer(content) // Writing the content can be slow. Better create a temporary file and @@ -375,6 +443,57 @@ func (d *Driver) removeFile(pod *v1.Pod, name string) error { return d.podIO(pod).RemoveAll(name) } +func (d *Driver) createFromYAML(ctx context.Context, content []byte, namespace string) { + // Not caching the discovery result isn't very efficient, but good enough. + discoveryCache := memory.NewMemCacheClient(d.f.ClientSet.Discovery()) + restMapper := restmapper.NewDeferredDiscoveryRESTMapper(discoveryCache) + + for _, content := range bytes.Split(content, []byte("---\n")) { + if len(content) == 0 { + continue + } + + var obj *unstructured.Unstructured + framework.ExpectNoError(yaml.UnmarshalStrict(content, &obj), fmt.Sprintf("Full YAML:\n%s\n", string(content))) + + gv, err := schema.ParseGroupVersion(obj.GetAPIVersion()) + framework.ExpectNoError(err, fmt.Sprintf("extract group+version from object %q", klog.KObj(obj))) + gk := schema.GroupKind{Group: gv.Group, Kind: obj.GetKind()} + + mapping, err := restMapper.RESTMapping(gk, gv.Version) + framework.ExpectNoError(err, fmt.Sprintf("map %q to resource", gk)) + + resourceClient := d.f.DynamicClient.Resource(mapping.Resource) + options := metav1.CreateOptions{ + // If the YAML input is invalid, then we want the + // apiserver to tell us via an error. This can + // happen because decoding into an unstructured object + // doesn't validate. + FieldValidation: "Strict", + } + switch mapping.Scope.Name() { + case meta.RESTScopeNameRoot: + _, err = resourceClient.Create(ctx, obj, options) + case meta.RESTScopeNameNamespace: + if namespace == "" { + framework.Failf("need namespace for object type %s", gk) + } + _, err = resourceClient.Namespace(namespace).Create(ctx, obj, options) + } + framework.ExpectNoError(err, "create object") + ginkgo.DeferCleanup(func(ctx context.Context) { + del := resourceClient.Delete + if mapping.Scope.Name() == meta.RESTScopeNameNamespace { + del = resourceClient.Namespace(namespace).Delete + } + err := del(ctx, obj.GetName(), metav1.DeleteOptions{}) + if !apierrors.IsNotFound(err) { + framework.ExpectNoError(err, fmt.Sprintf("deleting %s.%s %s", obj.GetKind(), obj.GetAPIVersion(), klog.KObj(obj))) + } + }) + } +} + func (d *Driver) podIO(pod *v1.Pod) proxy.PodDirIO { logger := klog.Background() return proxy.PodDirIO{ diff --git a/test/e2e/dra/dra.go b/test/e2e/dra/dra.go index 1c507699636..1ac7e5b6e75 100644 --- a/test/e2e/dra/dra.go +++ b/test/e2e/dra/dra.go @@ -29,6 +29,7 @@ import ( "github.com/onsi/gomega" "github.com/onsi/gomega/gcustom" "github.com/onsi/gomega/gstruct" + "github.com/onsi/gomega/types" v1 "k8s.io/api/core/v1" resourcev1alpha2 "k8s.io/api/resource/v1alpha2" @@ -891,6 +892,96 @@ var _ = framework.SIGDescribe("node")("DRA", feature.DynamicResourceAllocation, driver := NewDriver(f, nodes, perNode(1, nodes)) driver.parameterMode = parameterModeStructured + f.It("must apply per-node permission checks", func(ctx context.Context) { + // All of the operations use the client set of a kubelet plugin for + // a fictional node which both don't exist, so nothing interferes + // when we actually manage to create a slice. + fictionalNodeName := "dra-fictional-node" + gomega.Expect(nodes.NodeNames).NotTo(gomega.ContainElement(fictionalNodeName)) + fictionalNodeClient := driver.impersonateKubeletPlugin(&v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: fictionalNodeName + "-dra-plugin", + Namespace: f.Namespace.Name, + UID: "12345", + }, + Spec: v1.PodSpec{ + NodeName: fictionalNodeName, + }, + }) + + // This is for some actual node in the cluster. + realNodeName := nodes.NodeNames[0] + realNodeClient := driver.Nodes[realNodeName].ClientSet + + // This is the slice that we try to create. It needs to be deleted + // after testing, if it still exists at that time. + fictionalNodeSlice := &resourcev1alpha2.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: fictionalNodeName + "-slice", + }, + NodeName: fictionalNodeName, + DriverName: "dra.example.com", + ResourceModel: resourcev1alpha2.ResourceModel{ + NamedResources: &resourcev1alpha2.NamedResourcesResources{}, + }, + } + ginkgo.DeferCleanup(func(ctx context.Context) { + err := f.ClientSet.ResourceV1alpha2().ResourceSlices().Delete(ctx, fictionalNodeSlice.Name, metav1.DeleteOptions{}) + if !apierrors.IsNotFound(err) { + framework.ExpectNoError(err) + } + }) + + // Message from test-driver/deploy/example/plugin-permissions.yaml + matchVAPDeniedError := gomega.MatchError(gomega.ContainSubstring("may only modify resourceslices that belong to the node the pod is running on")) + + mustCreate := func(clientSet kubernetes.Interface, clientName string, slice *resourcev1alpha2.ResourceSlice) *resourcev1alpha2.ResourceSlice { + ginkgo.GinkgoHelper() + slice, err := clientSet.ResourceV1alpha2().ResourceSlices().Create(ctx, slice, metav1.CreateOptions{}) + framework.ExpectNoError(err, fmt.Sprintf("CREATE: %s + %s", clientName, slice.Name)) + return slice + } + mustUpdate := func(clientSet kubernetes.Interface, clientName string, slice *resourcev1alpha2.ResourceSlice) *resourcev1alpha2.ResourceSlice { + ginkgo.GinkgoHelper() + slice, err := clientSet.ResourceV1alpha2().ResourceSlices().Update(ctx, slice, metav1.UpdateOptions{}) + framework.ExpectNoError(err, fmt.Sprintf("UPDATE: %s + %s", clientName, slice.Name)) + return slice + } + mustDelete := func(clientSet kubernetes.Interface, clientName string, slice *resourcev1alpha2.ResourceSlice) { + ginkgo.GinkgoHelper() + err := clientSet.ResourceV1alpha2().ResourceSlices().Delete(ctx, slice.Name, metav1.DeleteOptions{}) + framework.ExpectNoError(err, fmt.Sprintf("DELETE: %s + %s", clientName, slice.Name)) + } + mustFailToCreate := func(clientSet kubernetes.Interface, clientName string, slice *resourcev1alpha2.ResourceSlice, matchError types.GomegaMatcher) { + ginkgo.GinkgoHelper() + _, err := clientSet.ResourceV1alpha2().ResourceSlices().Create(ctx, slice, metav1.CreateOptions{}) + gomega.Expect(err).To(matchError, fmt.Sprintf("CREATE: %s + %s", clientName, slice.Name)) + } + mustFailToUpdate := func(clientSet kubernetes.Interface, clientName string, slice *resourcev1alpha2.ResourceSlice, matchError types.GomegaMatcher) { + ginkgo.GinkgoHelper() + _, err := clientSet.ResourceV1alpha2().ResourceSlices().Update(ctx, slice, metav1.UpdateOptions{}) + gomega.Expect(err).To(matchError, fmt.Sprintf("UPDATE: %s + %s", clientName, slice.Name)) + } + mustFailToDelete := func(clientSet kubernetes.Interface, clientName string, slice *resourcev1alpha2.ResourceSlice, matchError types.GomegaMatcher) { + ginkgo.GinkgoHelper() + err := clientSet.ResourceV1alpha2().ResourceSlices().Delete(ctx, slice.Name, metav1.DeleteOptions{}) + gomega.Expect(err).To(matchError, fmt.Sprintf("DELETE: %s + %s", clientName, slice.Name)) + } + + // Create with different clients, keep it in the end. + mustFailToCreate(realNodeClient, "real plugin", fictionalNodeSlice, matchVAPDeniedError) + createdFictionalNodeSlice := mustCreate(fictionalNodeClient, "fictional plugin", fictionalNodeSlice) + + // Update with different clients. + mustFailToUpdate(realNodeClient, "real plugin", createdFictionalNodeSlice, matchVAPDeniedError) + createdFictionalNodeSlice = mustUpdate(fictionalNodeClient, "fictional plugin", createdFictionalNodeSlice) + createdFictionalNodeSlice = mustUpdate(f.ClientSet, "admin", createdFictionalNodeSlice) + + // Delete with different clients. + mustFailToDelete(realNodeClient, "real plugin", createdFictionalNodeSlice, matchVAPDeniedError) + mustDelete(fictionalNodeClient, "fictional plugin", createdFictionalNodeSlice) + }) + f.It("must manage ResourceSlices", f.WithSlow(), func(ctx context.Context) { driverName := driver.Name @@ -1100,6 +1191,7 @@ var _ = framework.SIGDescribe("node")("DRA", feature.DynamicResourceAllocation, b := newBuilder(f, driver) preScheduledTests(b, driver, resourcev1alpha2.AllocationModeImmediate) claimTests(b, driver, resourcev1alpha2.AllocationModeImmediate) + }) }) diff --git a/test/e2e/dra/test-driver/deploy/example/plugin-permissions.yaml b/test/e2e/dra/test-driver/deploy/example/plugin-permissions.yaml new file mode 100644 index 00000000000..80e12ee5e4c --- /dev/null +++ b/test/e2e/dra/test-driver/deploy/example/plugin-permissions.yaml @@ -0,0 +1,84 @@ +# Real driver deployments must replace all occurrences of "dra-kubelet-plugin" +# with something specific to their driver. + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: dra-kubelet-plugin-service-account + namespace: dra-kubelet-plugin-namespace +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: dra-kubelet-plugin-role +rules: +- apiGroups: ["resource.k8s.io"] + resources: ["resourceclaims"] + verbs: ["get"] +- apiGroups: [""] + resources: ["nodes"] + verbs: ["get"] +- apiGroups: ["resource.k8s.io"] + resources: ["resourceslices"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: dra-kubelet-plugin-role-binding +subjects: +- kind: ServiceAccount + name: dra-kubelet-plugin-service-account + namespace: dra-kubelet-plugin-namespace +roleRef: + kind: ClusterRole + name: dra-kubelet-plugin-role + apiGroup: rbac.authorization.k8s.io +--- +# This ValidatingAdmissionPolicy is specific to the DRA driver's kubelet plugin +# because it checks the ServiceAccount defined for it above. An admin could +# also define a single policy for all drivers. +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingAdmissionPolicy +metadata: + name: resourceslices-policy-dra-kubelet-plugin +spec: + failurePolicy: Fail + matchConstraints: + resourceRules: + - apiGroups: ["resource.k8s.io"] + apiVersions: ["v1alpha2"] + operations: ["CREATE", "UPDATE", "DELETE"] + resources: ["resourceslices"] + variables: + - name: hasNodeName + expression: >- + "authentication.kubernetes.io/node-name" in request.userInfo.extra + - name: isKubeletPlugin + expression: >- + request.userInfo.username == "system:serviceaccount:dra-kubelet-plugin-namespace:dra-kubelet-plugin-service-account" + - name: objectNodeName + expression: >- + (request.operation == "DELETE" ? oldObject : object).?nodeName.orValue("") + validations: + - expression: >- + !variables.isKubeletPlugin || variables.hasNodeName + message: This user must have a "authentication.kubernetes.io/node-name" claim. ServiceAccountTokenNodeBindingValidation must be enabled in the cluster. + - expression: >- + !variables.isKubeletPlugin || !variables.hasNodeName || + variables.objectNodeName == request.userInfo.extra["authentication.kubernetes.io/node-name"][0] + message: This DRA kubelet plugin may only modify resourceslices that belong to the node the pod is running on. + # This is useful for debugging. Can be dropped in a production deployment. + messageExpression: >- + "The DRA kubelet plugin on node " + request.userInfo.extra["authentication.kubernetes.io/node-name"][0] + + " may only modify resourceslices that belong to the node the pod is running on, not " + + (variables.objectNodeName == "" ? variables.objectNodeName : "a cluster-scoped slice") + "." +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingAdmissionPolicyBinding +metadata: + name: resourceslices-policy-dra-kubelet-plugin +spec: + policyName: resourceslices-policy-dra-kubelet-plugin + validationActions: [Deny] + # All ResourceSlices are matched.