DRA cel: enforce runtime limit by default again

As pointed out during code review, the CEL cost estimates are not considered
perfectly reliable. Therefore it is better to also do runtime checks.

Some downstream users might decide to allow CEL expressions to run
longer. Therefore the cost limit is now part of an Options struct.
kube-scheduler uses the default cost limit defined in the resource.k8s.io API,
which is the same cost limit that also the apiserver uses during validation.
This commit is contained in:
Patrick Ohly 2024-10-23 21:15:05 +02:00
parent 021c9fb92f
commit d53cb79cec
6 changed files with 40 additions and 35 deletions

View File

@ -545,8 +545,13 @@ type CELDeviceSelector struct {
// Validation against this limit and [CELSelectorExpressionMaxLength] happens // Validation against this limit and [CELSelectorExpressionMaxLength] happens
// only when setting an expression for the first time or when changing it. If // only when setting an expression for the first time or when changing it. If
// the limits are changed in a future Kubernetes release, existing users are // the limits are changed in a future Kubernetes release, existing users are
// guaranteed that existing expressions will continue to be valid and won't be // guaranteed that existing expressions will continue to be valid.
// interrupted at runtime after an up- or downgrade. //
// However, the kube-scheduler also applies this cost limit at runtime, so it
// could happen that a valid expression fails at runtime after an up- or
// downgrade. This can also happen without version skew when the cost estimate
// underestimated the actual cost. That this might happen is the reason why
// kube-scheduler enforces the runtime limit instead of relying on validation.
// //
// According to // According to
// https://github.com/kubernetes/kubernetes/blob/4aeaf1e99e82da8334c0d6dddd848a194cd44b4f/staging/src/k8s.io/apiserver/pkg/apis/cel/config.go#L20-L22, // https://github.com/kubernetes/kubernetes/blob/4aeaf1e99e82da8334c0d6dddd848a194cd44b4f/staging/src/k8s.io/apiserver/pkg/apis/cel/config.go#L20-L22,

View File

@ -176,7 +176,7 @@ func validateCELSelector(celSelector resource.CELDeviceSelector, fldPath *field.
return allErrs return allErrs
} }
result := dracel.GetCompiler().CompileCELExpression(celSelector.Expression, envType) result := dracel.GetCompiler().CompileCELExpression(celSelector.Expression, dracel.Options{EnvType: &envType})
if result.Error != nil { if result.Error != nil {
allErrs = append(allErrs, convertCELErrorToValidationError(fldPath.Child("expression"), celSelector.Expression, result.Error)) allErrs = append(allErrs, convertCELErrorToValidationError(fldPath.Child("expression"), celSelector.Expression, result.Error))
} else if result.MaxCost > resource.CELSelectorExpressionMaxCost { } else if result.MaxCost > resource.CELSelectorExpressionMaxCost {

View File

@ -551,8 +551,13 @@ type CELDeviceSelector struct {
// Validation against this limit and [CELSelectorExpressionMaxLength] happens // Validation against this limit and [CELSelectorExpressionMaxLength] happens
// only when setting an expression for the first time or when changing it. If // only when setting an expression for the first time or when changing it. If
// the limits are changed in a future Kubernetes release, existing users are // the limits are changed in a future Kubernetes release, existing users are
// guaranteed that existing expressions will continue to be valid and won't be // guaranteed that existing expressions will continue to be valid.
// interrupted at runtime after an up- or downgrade. //
// However, the kube-scheduler also applies this cost limit at runtime, so it
// could happen that a valid expression fails at runtime after an up- or
// downgrade. This can also happen without version skew when the cost estimate
// underestimated the actual cost. That this might happen is the reason why
// kube-scheduler enforces the runtime limit instead of relying on validation.
// //
// According to // According to
// https://github.com/kubernetes/kubernetes/blob/4aeaf1e99e82da8334c0d6dddd848a194cd44b4f/staging/src/k8s.io/apiserver/pkg/apis/cel/config.go#L20-L22, // https://github.com/kubernetes/kubernetes/blob/4aeaf1e99e82da8334c0d6dddd848a194cd44b4f/staging/src/k8s.io/apiserver/pkg/apis/cel/config.go#L20-L22,

View File

@ -20,7 +20,6 @@ import (
"context" "context"
"errors" "errors"
"fmt" "fmt"
"math"
"reflect" "reflect"
"strings" "strings"
"sync" "sync"
@ -39,6 +38,7 @@ import (
apiservercel "k8s.io/apiserver/pkg/cel" apiservercel "k8s.io/apiserver/pkg/cel"
"k8s.io/apiserver/pkg/cel/environment" "k8s.io/apiserver/pkg/cel/environment"
"k8s.io/apiserver/pkg/cel/library" "k8s.io/apiserver/pkg/cel/library"
"k8s.io/utils/ptr"
) )
const ( const (
@ -93,10 +93,21 @@ func newCompiler() *compiler {
return &compiler{envset: mustBuildEnv()} return &compiler{envset: mustBuildEnv()}
} }
// Options contains several additional parameters
// for [CompileCELExpression]. All of them have reasonable
// defaults.
type Options struct {
// EnvType allows to override the default environment type [environment.StoredExpressions].
EnvType *environment.Type
// CostLimit allows overriding the default runtime cost limit [resourceapi.CELSelectorExpressionMaxCost].
CostLimit *uint64
}
// CompileCELExpression returns a compiled CEL expression. It evaluates to bool. // CompileCELExpression returns a compiled CEL expression. It evaluates to bool.
// //
// TODO (https://github.com/kubernetes/kubernetes/issues/125826): validate AST to detect invalid attribute names. // TODO (https://github.com/kubernetes/kubernetes/issues/125826): validate AST to detect invalid attribute names.
func (c compiler) CompileCELExpression(expression string, envType environment.Type) CompilationResult { func (c compiler) CompileCELExpression(expression string, options Options) CompilationResult {
resultError := func(errorString string, errType apiservercel.ErrorType) CompilationResult { resultError := func(errorString string, errType apiservercel.ErrorType) CompilationResult {
return CompilationResult{ return CompilationResult{
Error: &apiservercel.Error{ Error: &apiservercel.Error{
@ -107,7 +118,7 @@ func (c compiler) CompileCELExpression(expression string, envType environment.Ty
} }
} }
env, err := c.envset.Env(envType) env, err := c.envset.Env(ptr.Deref(options.EnvType, environment.StoredExpressions))
if err != nil { if err != nil {
return resultError(fmt.Sprintf("unexpected error loading CEL environment: %v", err), apiservercel.ErrorTypeInternal) return resultError(fmt.Sprintf("unexpected error loading CEL environment: %v", err), apiservercel.ErrorTypeInternal)
} }
@ -131,25 +142,10 @@ func (c compiler) CompileCELExpression(expression string, envType environment.Ty
return resultError("unexpected compilation error: "+err.Error(), apiservercel.ErrorTypeInternal) return resultError("unexpected compilation error: "+err.Error(), apiservercel.ErrorTypeInternal)
} }
prog, err := env.Program(ast, prog, err := env.Program(ast,
// The Kubernetes CEL base environment sets the VAP cost limit. // The Kubernetes CEL base environment sets the VAP limit as runtime cost limit.
// // DRA has its own default cost limit and also allows the caller to change that
// In DRA, the cost check is done only at validation time. At // limit.
// runtime, any expression that passed validation gets executed cel.CostLimit(ptr.Deref(options.CostLimit, resourceapi.CELSelectorExpressionMaxCost)),
// without interrupting it. The advantage is that it becomes
// easier to change the limit because stored expression do not
// suddenly fail after an up- or downgrade. The limit could
// even become a configuration parameter of the apiserver
// because that is the only place where the limit gets checked
//
// We cannot unset the cost limit
// (https://github.com/kubernetes/kubernetes/blob/9568a2ac145cf9be930e3da835f86c1e61f7f7c1/vendor/github.com/google/cel-go/cel/options.go#L512-L518). But
// we can set a high value that then never triggers
// (https://github.com/kubernetes/kubernetes/blob/9568a2ac145cf9be930e3da835f86c1e61f7f7c1/vendor/github.com/google/cel-go/interpreter/runtimecost.go#L104-L106).
//
// Even better would be to enable cost tracking only during
// validation, but for that k8s.io/apiserver/pkg/cel/environment would
// have to be changed to not set the limit.
cel.CostLimit(math.MaxInt64),
cel.InterruptCheckFrequency(celconfig.CheckFrequency), cel.InterruptCheckFrequency(celconfig.CheckFrequency),
) )
if err != nil { if err != nil {

View File

@ -23,7 +23,6 @@ import (
resourceapi "k8s.io/api/resource/v1alpha3" resourceapi "k8s.io/api/resource/v1alpha3"
"k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apiserver/pkg/cel/environment"
"k8s.io/klog/v2/ktesting" "k8s.io/klog/v2/ktesting"
"k8s.io/utils/ptr" "k8s.io/utils/ptr"
) )
@ -228,6 +227,7 @@ device.attributes["dra.example.com"]["version"].isGreaterThan(semver("0.0.1"))
}(), }(),
expression: `device.attributes["dra.example.com"].map(s, s.lowerAscii()).map(s, s.size()).sum() == 0`, expression: `device.attributes["dra.example.com"].map(s, s.lowerAscii()).map(s, s.size()).sum() == 0`,
driver: "dra.example.com", driver: "dra.example.com",
expectMatchError: "actual cost limit exceeded",
expectCost: 18446744073709551615, // Exceeds limit! expectCost: 18446744073709551615, // Exceeds limit!
}, },
} }
@ -236,7 +236,7 @@ func TestCEL(t *testing.T) {
for name, scenario := range testcases { for name, scenario := range testcases {
t.Run(name, func(t *testing.T) { t.Run(name, func(t *testing.T) {
_, ctx := ktesting.NewTestContext(t) _, ctx := ktesting.NewTestContext(t)
result := GetCompiler().CompileCELExpression(scenario.expression, environment.StoredExpressions) result := GetCompiler().CompileCELExpression(scenario.expression, Options{})
if scenario.expectCompileError != "" && result.Error == nil { if scenario.expectCompileError != "" && result.Error == nil {
t.Fatalf("expected compile error %q, got none", scenario.expectCompileError) t.Fatalf("expected compile error %q, got none", scenario.expectCompileError)
} }
@ -294,7 +294,7 @@ func BenchmarkDeviceMatches(b *testing.B) {
} }
b.Run(name, func(b *testing.B) { b.Run(name, func(b *testing.B) {
_, ctx := ktesting.NewTestContext(b) _, ctx := ktesting.NewTestContext(b)
result := GetCompiler().CompileCELExpression(scenario.expression, environment.StoredExpressions) result := GetCompiler().CompileCELExpression(scenario.expression, Options{})
if result.Error != nil { if result.Error != nil {
b.Fatalf("unexpected compile error: %s", result.Error.Error()) b.Fatalf("unexpected compile error: %s", result.Error.Error())
} }

View File

@ -26,7 +26,6 @@ import (
v1 "k8s.io/api/core/v1" v1 "k8s.io/api/core/v1"
resourceapi "k8s.io/api/resource/v1alpha3" resourceapi "k8s.io/api/resource/v1alpha3"
"k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apiserver/pkg/cel/environment"
resourcelisters "k8s.io/client-go/listers/resource/v1alpha3" resourcelisters "k8s.io/client-go/listers/resource/v1alpha3"
"k8s.io/dynamic-resource-allocation/cel" "k8s.io/dynamic-resource-allocation/cel"
"k8s.io/klog/v2" "k8s.io/klog/v2"
@ -653,7 +652,7 @@ func (alloc *allocator) isSelectable(r requestIndices, slice *resourceapi.Resour
func (alloc *allocator) selectorsMatch(r requestIndices, device *resourceapi.BasicDevice, deviceID DeviceID, class *resourceapi.DeviceClass, selectors []resourceapi.DeviceSelector) (bool, error) { func (alloc *allocator) selectorsMatch(r requestIndices, device *resourceapi.BasicDevice, deviceID DeviceID, class *resourceapi.DeviceClass, selectors []resourceapi.DeviceSelector) (bool, error) {
for i, selector := range selectors { for i, selector := range selectors {
expr := cel.GetCompiler().CompileCELExpression(selector.CEL.Expression, environment.StoredExpressions) expr := cel.GetCompiler().CompileCELExpression(selector.CEL.Expression, cel.Options{})
if expr.Error != nil { if expr.Error != nil {
// Could happen if some future apiserver accepted some // Could happen if some future apiserver accepted some
// future expression and then got downgraded. Normally // future expression and then got downgraded. Normally