Implement scheduler_resourceclaim_creates_total metric for DRAExtendedResource

This commit is contained in:
Ayato Tokubi
2025-10-08 15:09:11 +00:00
parent 5102591a6b
commit ea7561b243
3 changed files with 91 additions and 3 deletions

View File

@@ -53,6 +53,7 @@ import (
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/metrics"
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
"k8s.io/kubernetes/pkg/scheduler/util/assumecache"
"k8s.io/kubernetes/pkg/util/slice"
@@ -1477,8 +1478,10 @@ func (pl *DynamicResources) bindClaim(ctx context.Context, state *stateData, ind
var err error
claim, err = pl.clientset.ResourceV1().ResourceClaims(claim.Namespace).Create(ctx, claim, metav1.CreateOptions{})
if err != nil {
metrics.ResourceClaimCreatesTotal.WithLabelValues("failure").Inc()
return nil, fmt.Errorf("create claim for extended resources %v: %w", klog.KObj(claim), err)
}
metrics.ResourceClaimCreatesTotal.WithLabelValues("success").Inc()
resourceClaimModified = true
logger.V(5).Info("created claim for extended resources", "pod", klog.KObj(pod), "node", nodeName, "resourceclaim", klog.Format(claim))

View File

@@ -32,7 +32,6 @@ import (
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
v1 "k8s.io/api/core/v1"
resourceapi "k8s.io/api/resource/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -45,6 +44,8 @@ import (
cgotesting "k8s.io/client-go/testing"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/events"
compbasemetrics "k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/testutil"
featuregatetesting "k8s.io/component-base/featuregate/testing"
"k8s.io/dynamic-resource-allocation/deviceclass/extendedresourcecache"
resourceslicetracker "k8s.io/dynamic-resource-allocation/resourceslice/tracker"
@@ -57,12 +58,17 @@ import (
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/runtime"
"k8s.io/kubernetes/pkg/scheduler/metrics"
st "k8s.io/kubernetes/pkg/scheduler/testing"
"k8s.io/kubernetes/pkg/scheduler/util/assumecache"
"k8s.io/kubernetes/test/utils/ktesting"
"k8s.io/utils/ptr"
)
// init constructs the scheduler metric collectors (without touching the
// global registry) so that tests can reference them — e.g.
// metrics.ResourceClaimCreatesTotal in setupMetrics — before any
// explicit registration happens.
func init() {
	metrics.InitMetrics()
}
var (
podKind = v1.SchemeGroupVersion.WithKind("Pod")
@@ -777,6 +783,7 @@ func TestPlugin(t *testing.T) {
disableDRASchedulerFilterTimeout bool
skipOnWindows string
failPatch bool
metrics func(*testing.T, compbasemetrics.Gatherer)
}{
"empty": {
pod: st.MakePod().Name("foo").Namespace("default").Obj(),
@@ -1411,7 +1418,7 @@ func TestPlugin(t *testing.T) {
},
},
},
"extended-resource-name-wth-node-resource": {
"extended-resource-name-with-node-resource": {
enableDRAExtendedResource: true,
enableDRADeviceBindingConditions: true,
enableDRAResourceClaimDeviceStatus: true,
@@ -1419,6 +1426,10 @@ func TestPlugin(t *testing.T) {
pod: podWithExtendedResourceName,
classes: []*resourceapi.DeviceClass{deviceClassWithExtendResourceName},
want: want{},
metrics: func(t *testing.T, g compbasemetrics.Gatherer) {
_, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status")
assert.ErrorContains(t, err, "not found")
},
},
"extended-resource-name-with-zero-allocatable": {
enableDRAExtendedResource: true,
@@ -1466,6 +1477,10 @@ func TestPlugin(t *testing.T) {
status: fwk.NewStatus(fwk.Unschedulable, `still not schedulable`),
},
},
metrics: func(t *testing.T, g compbasemetrics.Gatherer) {
_, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status")
assert.ErrorContains(t, err, "not found")
},
},
"extended-resource-name-with-resources": {
enableDRAExtendedResource: true,
@@ -1484,6 +1499,11 @@ func TestPlugin(t *testing.T) {
assumedClaim: reserve(extendedResourceClaim, podWithExtendedResourceName),
},
},
metrics: func(t *testing.T, g compbasemetrics.Gatherer) {
metric, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status")
assert.NoError(t, err)
assert.Equal(t, float64(1), metric["success"])
},
},
"implicit-extended-resource-name-with-resources": {
enableDRAExtendedResource: true,
@@ -1502,6 +1522,11 @@ func TestPlugin(t *testing.T) {
assumedClaim: reserve(implicitExtendedResourceClaim, podWithImplicitExtendedResourceName),
},
},
metrics: func(t *testing.T, g compbasemetrics.Gatherer) {
metric, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status")
assert.NoError(t, err)
assert.Equal(t, float64(1), metric["success"])
},
},
"implicit-extended-resource-name-two-containers-with-resources": {
enableDRAExtendedResource: true,
@@ -1520,6 +1545,11 @@ func TestPlugin(t *testing.T) {
assumedClaim: reserve(implicitExtendedResourceClaimTwoContainers, podWithImplicitExtendedResourceNameTwoContainers),
},
},
metrics: func(t *testing.T, g compbasemetrics.Gatherer) {
metric, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status")
assert.NoError(t, err)
assert.Equal(t, float64(1), metric["success"])
},
},
"extended-resource-name-with-resources-fail-patch": {
enableDRAExtendedResource: true,
@@ -1540,6 +1570,11 @@ func TestPlugin(t *testing.T) {
assumedClaim: reserve(extendedResourceClaim, podWithExtendedResourceName),
},
},
metrics: func(t *testing.T, g compbasemetrics.Gatherer) {
metric, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status")
assert.NoError(t, err)
assert.Equal(t, float64(1), metric["success"])
},
},
"extended-resource-name-with-resources-has-claim": {
enableDRAExtendedResource: true,
@@ -1558,6 +1593,10 @@ func TestPlugin(t *testing.T) {
removed: []metav1.Object{extendedResourceClaim},
},
},
metrics: func(t *testing.T, g compbasemetrics.Gatherer) {
_, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status")
assert.ErrorContains(t, err, "not found")
},
},
"extended-resource-name-with-resources-delete-claim": {
enableDRAExtendedResource: true,
@@ -1576,6 +1615,10 @@ func TestPlugin(t *testing.T) {
removed: []metav1.Object{extendedResourceClaimNode2},
},
},
metrics: func(t *testing.T, g compbasemetrics.Gatherer) {
_, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status")
assert.ErrorContains(t, err, "not found")
},
},
"extended-resource-name-bind-failure": {
enableDRAExtendedResource: true,
@@ -1594,6 +1637,11 @@ func TestPlugin(t *testing.T) {
removed: []metav1.Object{reserve(extendedResourceClaim, podWithExtendedResourceName)},
},
},
metrics: func(t *testing.T, g compbasemetrics.Gatherer) {
metric, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status")
assert.NoError(t, err)
assert.Equal(t, float64(1), metric["success"])
},
},
"extended-resource-name-skip-bind": {
enableDRAExtendedResource: true,
@@ -1606,6 +1654,11 @@ func TestPlugin(t *testing.T) {
},
unreserveBeforePreBind: &result{},
},
metrics: func(t *testing.T, g compbasemetrics.Gatherer) {
metric, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status")
assert.NoError(t, err)
assert.Equal(t, float64(1), metric["success"])
},
},
"canceled": {
cancelFilter: true,
@@ -1966,7 +2019,6 @@ func TestPlugin(t *testing.T) {
if len(tc.skipOnWindows) > 0 && goruntime.GOOS == "windows" {
t.Skipf("Skipping '%s' test case on Windows, reason: %s", name, tc.skipOnWindows)
}
// We can run in parallel because logging is per-test.
tc := tc
t.Run(name, func(t *testing.T) {
nodes := tc.nodes
@@ -1987,6 +2039,10 @@ func TestPlugin(t *testing.T) {
featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.DRAExtendedResource, tc.enableDRAExtendedResource)
testCtx := setup(t, tc.args, nodes, tc.claims, tc.classes, tc.objs, feats, tc.failPatch)
initialObjects := testCtx.listAll(t)
var registry compbasemetrics.KubeRegistry
if tc.metrics != nil {
registry = setupMetrics(features)
}
status := testCtx.p.PreEnqueue(testCtx.ctx, tc.pod)
t.Run("PreEnqueue", func(t *testing.T) {
@@ -2103,10 +2159,24 @@ func TestPlugin(t *testing.T) {
testCtx.verify(t, tc.want.postfilter, initialObjects, nil, status)
})
}
if tc.metrics != nil {
tc.metrics(t, registry)
}
})
}
}
// setupMetrics returns a fresh, test-local registry for metric assertions.
//
// metrics.Register() consults the globally configured feature gates, which
// this test deliberately sets per-case rather than globally, so the relevant
// collector is registered by hand on an isolated registry instead of the
// global one. The collector is also reset so counts from earlier test cases
// do not leak into the current one. When the DRAExtendedResource feature is
// disabled the registry is returned empty, which makes gatherer lookups for
// scheduler_resourceclaim_creates_total report "not found".
func setupMetrics(features feature.Features) compbasemetrics.KubeRegistry {
	registry := compbasemetrics.NewKubeRegistry()
	if !features.EnableDRAExtendedResource {
		return registry
	}
	registry.MustRegister(metrics.ResourceClaimCreatesTotal)
	metrics.ResourceClaimCreatesTotal.Reset()
	return registry
}
type testContext struct {
ctx context.Context
client *fake.Clientset

View File

@@ -127,6 +127,9 @@ var (
AsyncAPICallDuration *metrics.HistogramVec
AsyncAPIPendingCalls *metrics.GaugeVec
// The below is only available when the DRAExtendedResource feature gate is enabled.
ResourceClaimCreatesTotal *metrics.CounterVec
// metricsList is a list of all metrics that should be registered always, regardless of any feature gate's value.
metricsList []metrics.Registerable
)
@@ -154,6 +157,9 @@ func Register() {
AsyncAPIPendingCalls,
)
}
if utilfeature.DefaultFeatureGate.Enabled(features.DRAExtendedResource) {
RegisterMetrics(ResourceClaimCreatesTotal)
}
})
}
@@ -377,6 +383,15 @@ func InitMetrics() {
},
[]string{"call_type"})
ResourceClaimCreatesTotal = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: SchedulerSubsystem,
Name: "resourceclaim_creates_total",
Help: "Number of ResourceClaims creation requests within scheduler",
StabilityLevel: metrics.ALPHA,
},
[]string{"status"})
metricsList = []metrics.Registerable{
scheduleAttempts,
schedulingLatency,