DRA cel: enforce runtime limit by default again

As pointed out during code review, the CEL cost estimates are not considered perfectly reliable. Therefore it is better to also do runtime checks. Some downstream users might decide to allow CEL expressions to run longer. Therefore the cost limit is now part of an Options struct. kube-scheduler uses the default cost limit defined in the resource.k8s.io API, which is the same cost limit that also the apiserver uses during validation.
2025-07-27 05:27:21 +00:00 · 2024-10-23 21:15:05 +02:00 · 2024-10-23 21:15:05 +02:00 · d53cb79cec
commit d53cb79cec
parent 021c9fb92f
6 changed files with 40 additions and 35 deletions
--- a/pkg/apis/resource/types.go
+++ b/pkg/apis/resource/types.go
@ -545,8 +545,13 @@ type CELDeviceSelector struct {
 // Validation against this limit and [CELSelectorExpressionMaxLength] happens
 // only when setting an expression for the first time or when changing it. If
 // the limits are changed in a future Kubernetes release, existing users are
-// guaranteed that existing expressions will continue to be valid and won't be
+// guaranteed that existing expressions will continue to be valid.
-// interrupted at runtime after an up- or downgrade.
+//
 // However, the kube-scheduler also applies this cost limit at runtime, so it
 // could happen that a valid expression fails at runtime after an up- or
 // downgrade. This can also happen without version skew when the cost estimate
 // underestimated the actual cost. That this might happen is the reason why
 // kube-scheduler enforces the runtime limit instead of relying on validation.
 //
 // According to
 // https://github.com/kubernetes/kubernetes/blob/4aeaf1e99e82da8334c0d6dddd848a194cd44b4f/staging/src/k8s.io/apiserver/pkg/apis/cel/config.go#L20-L22,
--- a/pkg/apis/resource/validation/validation.go
+++ b/pkg/apis/resource/validation/validation.go
@ -176,7 +176,7 @@ func validateCELSelector(celSelector resource.CELDeviceSelector, fldPath *field.
 		return allErrs
 	}
-	result := dracel.GetCompiler().CompileCELExpression(celSelector.Expression, envType)
+	result := dracel.GetCompiler().CompileCELExpression(celSelector.Expression, dracel.Options{EnvType: &envType})
 	if result.Error != nil {
 		allErrs = append(allErrs, convertCELErrorToValidationError(fldPath.Child("expression"), celSelector.Expression, result.Error))
 	} else if result.MaxCost > resource.CELSelectorExpressionMaxCost {
--- a/staging/src/k8s.io/api/resource/v1alpha3/types.go
+++ b/staging/src/k8s.io/api/resource/v1alpha3/types.go
@ -551,8 +551,13 @@ type CELDeviceSelector struct {
 // Validation against this limit and [CELSelectorExpressionMaxLength] happens
 // only when setting an expression for the first time or when changing it. If
 // the limits are changed in a future Kubernetes release, existing users are
-// guaranteed that existing expressions will continue to be valid and won't be
+// guaranteed that existing expressions will continue to be valid.
-// interrupted at runtime after an up- or downgrade.
+//
 // However, the kube-scheduler also applies this cost limit at runtime, so it
 // could happen that a valid expression fails at runtime after an up- or
 // downgrade. This can also happen without version skew when the cost estimate
 // underestimated the actual cost. That this might happen is the reason why
 // kube-scheduler enforces the runtime limit instead of relying on validation.
 //
 // According to
 // https://github.com/kubernetes/kubernetes/blob/4aeaf1e99e82da8334c0d6dddd848a194cd44b4f/staging/src/k8s.io/apiserver/pkg/apis/cel/config.go#L20-L22,
--- a/staging/src/k8s.io/dynamic-resource-allocation/cel/compile.go
+++ b/staging/src/k8s.io/dynamic-resource-allocation/cel/compile.go
@ -20,7 +20,6 @@ import (
 	"context"
 	"errors"
 	"fmt"
 	"math"
 	"reflect"
 	"strings"
 	"sync"
@ -39,6 +38,7 @@ import (
 	apiservercel "k8s.io/apiserver/pkg/cel"
 	"k8s.io/apiserver/pkg/cel/environment"
 	"k8s.io/apiserver/pkg/cel/library"
 	"k8s.io/utils/ptr"
 )
 const (
@ -93,10 +93,21 @@ func newCompiler() *compiler {
 	return &compiler{envset: mustBuildEnv()}
 }
 // Options contains several additional parameters
 // for [CompileCELExpression]. All of them have reasonable
 // defaults.
 type Options struct {
 	// EnvType allows to override the default environment type [environment.StoredExpressions].
 	EnvType *environment.Type
 	// CostLimit allows overriding the default runtime cost limit [resourceapi.CELSelectorExpressionMaxCost].
 	CostLimit *uint64
 }
 // CompileCELExpression returns a compiled CEL expression. It evaluates to bool.
 //
 // TODO (https://github.com/kubernetes/kubernetes/issues/125826): validate AST to detect invalid attribute names.
-func (c compiler) CompileCELExpression(expression string, envType environment.Type) CompilationResult {
+func (c compiler) CompileCELExpression(expression string, options Options) CompilationResult {
 	resultError := func(errorString string, errType apiservercel.ErrorType) CompilationResult {
 		return CompilationResult{
 			Error: &apiservercel.Error{
@ -107,7 +118,7 @@ func (c compiler) CompileCELExpression(expression string, envType environment.Ty
 		}
 	}
-	env, err := c.envset.Env(envType)
+	env, err := c.envset.Env(ptr.Deref(options.EnvType, environment.StoredExpressions))
 	if err != nil {
 		return resultError(fmt.Sprintf("unexpected error loading CEL environment: %v", err), apiservercel.ErrorTypeInternal)
 	}
@ -131,25 +142,10 @@ func (c compiler) CompileCELExpression(expression string, envType environment.Ty
 		return resultError("unexpected compilation error: "+err.Error(), apiservercel.ErrorTypeInternal)
 	}
 	prog, err := env.Program(ast,
-		// The Kubernetes CEL base environment sets the VAP cost limit.
+		// The Kubernetes CEL base environment sets the VAP limit as runtime cost limit.
-		//
+		// DRA has its own default cost limit and also allows the caller to change that
-		// In DRA, the cost check is done only at validation time.  At
+		// limit.
-		// runtime, any expression that passed validation gets executed
+		cel.CostLimit(ptr.Deref(options.CostLimit, resourceapi.CELSelectorExpressionMaxCost)),
 		// without interrupting it. The advantage is that it becomes
 		// easier to change the limit because stored expression do not
 		// suddenly fail after an up- or downgrade.  The limit could
 		// even become a configuration parameter of the apiserver
 		// because that is the only place where the limit gets checked
 		//
 		// We cannot unset the cost limit
 		// (https://github.com/kubernetes/kubernetes/blob/9568a2ac145cf9be930e3da835f86c1e61f7f7c1/vendor/github.com/google/cel-go/cel/options.go#L512-L518). But
 		// we can set a high value that then never triggers
 		// (https://github.com/kubernetes/kubernetes/blob/9568a2ac145cf9be930e3da835f86c1e61f7f7c1/vendor/github.com/google/cel-go/interpreter/runtimecost.go#L104-L106).
 		//
 		// Even better would be to enable cost tracking only during
 		// validation, but for that k8s.io/apiserver/pkg/cel/environment would
 		// have to be changed to not set the limit.
 		cel.CostLimit(math.MaxInt64),
 		cel.InterruptCheckFrequency(celconfig.CheckFrequency),
 	)
 	if err != nil {
--- a/staging/src/k8s.io/dynamic-resource-allocation/cel/compile_test.go
+++ b/staging/src/k8s.io/dynamic-resource-allocation/cel/compile_test.go
@ -23,7 +23,6 @@ import (
 	resourceapi "k8s.io/api/resource/v1alpha3"
 	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/apiserver/pkg/cel/environment"
 	"k8s.io/klog/v2/ktesting"
 	"k8s.io/utils/ptr"
 )
@ -228,6 +227,7 @@ device.attributes["dra.example.com"]["version"].isGreaterThan(semver("0.0.1"))
 		}(),
 		expression:       `device.attributes["dra.example.com"].map(s, s.lowerAscii()).map(s, s.size()).sum() == 0`,
 		driver:           "dra.example.com",
 		expectMatchError: "actual cost limit exceeded",
 		expectCost:       18446744073709551615, // Exceeds limit!
 	},
 }
@ -236,7 +236,7 @@ func TestCEL(t *testing.T) {
 	for name, scenario := range testcases {
 		t.Run(name, func(t *testing.T) {
 			_, ctx := ktesting.NewTestContext(t)
-			result := GetCompiler().CompileCELExpression(scenario.expression, environment.StoredExpressions)
+			result := GetCompiler().CompileCELExpression(scenario.expression, Options{})
 			if scenario.expectCompileError != "" && result.Error == nil {
 				t.Fatalf("expected compile error %q, got none", scenario.expectCompileError)
 			}
@ -294,7 +294,7 @@ func BenchmarkDeviceMatches(b *testing.B) {
 		}
 		b.Run(name, func(b *testing.B) {
 			_, ctx := ktesting.NewTestContext(b)
-			result := GetCompiler().CompileCELExpression(scenario.expression, environment.StoredExpressions)
+			result := GetCompiler().CompileCELExpression(scenario.expression, Options{})
 			if result.Error != nil {
 				b.Fatalf("unexpected compile error: %s", result.Error.Error())
 			}
--- a/staging/src/k8s.io/dynamic-resource-allocation/structured/allocator.go
+++ b/staging/src/k8s.io/dynamic-resource-allocation/structured/allocator.go
@ -26,7 +26,6 @@ import (
 	v1 "k8s.io/api/core/v1"
 	resourceapi "k8s.io/api/resource/v1alpha3"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/apiserver/pkg/cel/environment"
 	resourcelisters "k8s.io/client-go/listers/resource/v1alpha3"
 	"k8s.io/dynamic-resource-allocation/cel"
 	"k8s.io/klog/v2"
@ -653,7 +652,7 @@ func (alloc *allocator) isSelectable(r requestIndices, slice *resourceapi.Resour
 func (alloc *allocator) selectorsMatch(r requestIndices, device *resourceapi.BasicDevice, deviceID DeviceID, class *resourceapi.DeviceClass, selectors []resourceapi.DeviceSelector) (bool, error) {
 	for i, selector := range selectors {
-		expr := cel.GetCompiler().CompileCELExpression(selector.CEL.Expression, environment.StoredExpressions)
+		expr := cel.GetCompiler().CompileCELExpression(selector.CEL.Expression, cel.Options{})
 		if expr.Error != nil {
 			// Could happen if some future apiserver accepted some
 			// future expression and then got downgraded. Normally