DRA E2E: tests for device taints

Patrick Ohly 2025-03-16 09:48:36 +01:00
parent 5760a4f282
commit 2499663b84
4 changed files with 113 additions and 3 deletions

View File

@@ -87,6 +87,9 @@ type Resources struct {
// Number of devices called "device-000", "device-001", ... on each node or in the cluster.
MaxAllocations int
// Tainted causes all devices to be published with a NoSchedule taint.
Tainted bool
}
//go:embed test-driver/deploy/example/plugin-permissions.yaml
@@ -299,10 +302,18 @@ func (d *Driver) SetUp(nodes *Nodes, resources Resources, devicesPerNode ...map[
maxAllocations = 10
}
for i := 0; i < maxAllocations; i++ {
device := resourceapi.Device{
Name: fmt.Sprintf("device-%d", i),
Basic: &resourceapi.BasicDevice{},
}
if resources.Tainted {
device.Basic.Taints = []resourceapi.DeviceTaint{{
Key: "example.com/taint",
Value: "tainted",
Effect: resourceapi.DeviceTaintEffectNoSchedule,
}}
}
slice.Spec.Devices = append(slice.Spec.Devices, device)
}
_, err := d.f.ClientSet.ResourceV1beta1().ResourceSlices().Create(ctx, slice, metav1.CreateOptions{})
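For reference, when resources.Tainted is set, each device published by the loop above ends up in the ResourceSlice looking roughly like this (a sketch using the lowercase v1beta1 serialization of the fields set above; only one device is shown):

devices:
- name: device-0
  basic:
    taints:
    - key: example.com/taint
      value: tainted
      effect: NoSchedule

Because the effect is NoSchedule, such a device is only allocated to claims that tolerate the taint, which is what the tests in the next file exercise.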

View File

@@ -35,6 +35,7 @@ import (
admissionregistrationv1 "k8s.io/api/admissionregistration/v1"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
resourcealphaapi "k8s.io/api/resource/v1alpha3"
resourceapi "k8s.io/api/resource/v1beta1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
@@ -1280,6 +1281,88 @@ var _ = framework.SIGDescribe("node")("DRA", feature.DynamicResourceAllocation,
prioritizedListTests()
})
framework.Context("with device taints", feature.DRADeviceTaints, framework.WithFeatureGate(features.DRADeviceTaints), func() {
nodes := NewNodes(f, 1, 1)
driver := NewDriver(f, nodes, func() Resources {
return Resources{
Tainted: true,
}
})
b := newBuilder(f, driver)
f.It("DeviceTaint keeps pod pending", func(ctx context.Context) {
pod, template := b.podInline()
b.create(ctx, pod, template)
framework.ExpectNoError(e2epod.WaitForPodNameUnschedulableInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name))
})
f.It("DeviceToleration enables pod scheduling", func(ctx context.Context) {
pod, template := b.podInline()
template.Spec.Spec.Devices.Requests[0].Tolerations = []resourceapi.DeviceToleration{{
Effect: resourceapi.DeviceTaintEffectNoSchedule,
Operator: resourceapi.DeviceTolerationOpExists,
// No key: tolerate *all* taints with this effect.
}}
b.create(ctx, pod, template)
b.testPod(ctx, f, pod)
})
f.It("DeviceTaintRule evicts pod", func(ctx context.Context) {
pod, template := b.podInline()
template.Spec.Spec.Devices.Requests[0].Tolerations = []resourceapi.DeviceToleration{{
Effect: resourceapi.DeviceTaintEffectNoSchedule,
Operator: resourceapi.DeviceTolerationOpExists,
// No key: tolerate *all* taints with this effect.
}}
// Add a finalizer to ensure that we get a chance to test the pod status after eviction (= deletion).
pod.Finalizers = []string{"e2e-test/dont-delete-me"}
b.create(ctx, pod, template)
b.testPod(ctx, f, pod)
ginkgo.DeferCleanup(func(ctx context.Context) {
// Unblock shutdown by removing the finalizer.
pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(ctx, pod.Name, metav1.GetOptions{})
framework.ExpectNoError(err, "get pod")
pod.Finalizers = nil
_, err = f.ClientSet.CoreV1().Pods(f.Namespace.Name).Update(ctx, pod, metav1.UpdateOptions{})
framework.ExpectNoError(err, "remove finalizers from pod")
})
// Now evict it.
ginkgo.By("Evicting pod...")
taint := &resourcealphaapi.DeviceTaintRule{
ObjectMeta: metav1.ObjectMeta{
GenerateName: "device-taint-rule-" + f.UniqueName + "-",
},
Spec: resourcealphaapi.DeviceTaintRuleSpec{
// All devices of the current driver instance.
DeviceSelector: &resourcealphaapi.DeviceTaintSelector{
Driver: &driver.Name,
},
Taint: resourcealphaapi.DeviceTaint{
Effect: resourcealphaapi.DeviceTaintEffectNoExecute,
Key: "test.example.com/evict",
Value: "now",
// No TimeAdded, gets defaulted.
},
},
}
createdTaint := b.create(ctx, taint)
taint = createdTaint[0].(*resourcealphaapi.DeviceTaintRule)
gomega.Expect(*taint).Should(gomega.HaveField("Spec.Taint.TimeAdded.Time", gomega.BeTemporally("~", time.Now(), time.Minute /* allow for some clock drift and delays */)))
framework.ExpectNoError(e2epod.WaitForPodTerminatingInNamespaceTimeout(ctx, f.ClientSet, pod.Name, f.Namespace.Name, f.Timeouts.PodStart))
pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(ctx, pod.Name, metav1.GetOptions{})
framework.ExpectNoError(err, "get pod")
gomega.Expect(pod).Should(gomega.HaveField("Status.Conditions", gomega.ContainElement(gstruct.MatchFields(gstruct.IgnoreExtras, gstruct.Fields{
// LastTransitionTime is unknown.
"Type": gomega.Equal(v1.DisruptionTarget),
"Status": gomega.Equal(v1.ConditionTrue),
"Reason": gomega.Equal("DeletionByDeviceTaintManager"),
"Message": gomega.Equal("Device Taint manager: deleting due to NoExecute taint"),
}))))
})
})
// TODO (https://github.com/kubernetes/kubernetes/issues/123699): move most of the tests below into `testDriver` so that they get
// executed with different parameters.
@@ -2006,6 +2089,12 @@ func (b *builder) create(ctx context.Context, objs ...klog.KMetadata) []klog.KMetadata
err := b.f.ClientSet.ResourceV1beta1().ResourceSlices().Delete(ctx, createdObj.GetName(), metav1.DeleteOptions{})
framework.ExpectNoError(err, "delete node resource slice")
})
case *resourcealphaapi.DeviceTaintRule:
createdObj, err = b.f.ClientSet.ResourceV1alpha3().DeviceTaintRules().Create(ctx, obj, metav1.CreateOptions{})
ginkgo.DeferCleanup(func(ctx context.Context) {
err := b.f.ClientSet.ResourceV1alpha3().DeviceTaintRules().Delete(ctx, createdObj.GetName(), metav1.DeleteOptions{})
framework.ExpectNoError(err, "delete DeviceTaintRule")
})
case *appsv1.DaemonSet:
createdObj, err = b.f.ClientSet.AppsV1().DaemonSets(b.f.Namespace.Name).Create(ctx, obj, metav1.CreateOptions{})
// Cleanup not really needed, but speeds up namespace shutdown.
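The DeviceTaintRule that the eviction test above creates through the builder corresponds, roughly, to the following manifest (a sketch assuming the resource.k8s.io/v1alpha3 API; the driver name test-driver.example.com is a placeholder for whatever name the test driver instance registers):

apiVersion: resource.k8s.io/v1alpha3
kind: DeviceTaintRule
metadata:
  generateName: device-taint-rule-
spec:
  deviceSelector:
    driver: test-driver.example.com   # placeholder; matches all devices of this driver
  taint:
    key: test.example.com/evict
    value: "now"
    effect: NoExecute
    # timeAdded is left unset and gets defaulted by the API server

With the NoExecute effect, pods whose claims use a matching device and do not tolerate that effect are deleted, which the test then verifies via the DisruptionTarget pod condition.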

View File

@@ -20,7 +20,7 @@ nodes:
v: "5"
apiServer:
extraArgs:
runtime-config: "resource.k8s.io/v1beta1=true"
runtime-config: "resource.k8s.io/v1alpha3=true,resource.k8s.io/v1beta1=true"
- |
kind: InitConfiguration
nodeRegistration:

View File

@@ -112,6 +112,16 @@ var (
// is enabled such that passing CDI device IDs through CRI fields is supported
DRAAdminAccess = framework.WithFeature(framework.ValidFeatures.Add("DRAAdminAccess"))
// owning-sig: sig-scheduling
// kep: https://kep.k8s.io/5055
// test-infra jobs:
// - "ci-kind-dra-all" in https://testgrid.k8s.io/sig-node-dynamic-resource-allocation
//
// This label is used for tests which need:
// - the DynamicResourceAllocation *and* DRADeviceTaints feature gates
// - the resource.k8s.io API group, including version v1alpha3
DRADeviceTaints = framework.WithFeature(framework.ValidFeatures.Add("DRADeviceTaints"))
// TODO: document the feature (owning SIG, when to use this feature for a test)
// OWNER: sig-node
// Testing downward API huge pages