diff --git a/pkg/kubelet/cm/dra/manager.go b/pkg/kubelet/cm/dra/manager.go
index e26e2f7b259..a5534536b6b 100644
--- a/pkg/kubelet/cm/dra/manager.go
+++ b/pkg/kubelet/cm/dra/manager.go
@@ -19,6 +19,7 @@ package dra
 import (
 	"context"
 	"fmt"
+	"strconv"
 	"time"
 
 	v1 "k8s.io/api/core/v1"
@@ -35,6 +36,7 @@ import (
 	"k8s.io/kubernetes/pkg/kubelet/cm/dra/state"
 	"k8s.io/kubernetes/pkg/kubelet/config"
 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
+	"k8s.io/kubernetes/pkg/kubelet/metrics"
 )
 
 // draManagerStateFileName is the file name where dra manager stores its state
@@ -150,6 +152,13 @@ func (m *ManagerImpl) reconcileLoop(ctx context.Context) {
 // for each new resource requirement, process their responses and update the cached
 // containerResources on success.
 func (m *ManagerImpl) PrepareResources(ctx context.Context, pod *v1.Pod) error {
+	startTime := time.Now()
+	err := m.prepareResources(ctx, pod)
+	metrics.DRAOperationsDuration.WithLabelValues("PrepareResources", strconv.FormatBool(err != nil)).Observe(time.Since(startTime).Seconds())
+	return err
+}
+
+func (m *ManagerImpl) prepareResources(ctx context.Context, pod *v1.Pod) error {
 	logger := klog.FromContext(ctx)
 	batches := make(map[string][]*drapb.Claim)
 	resourceClaims := make(map[types.UID]*resourceapi.ResourceClaim)
@@ -369,6 +378,10 @@ func (m *ManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*Conta
 // As such, calls to the underlying NodeUnprepareResource API are skipped for claims that have
 // already been successfully unprepared.
 func (m *ManagerImpl) UnprepareResources(ctx context.Context, pod *v1.Pod) error {
+	var err error
+	defer func(startTime time.Time) {
+		metrics.DRAOperationsDuration.WithLabelValues("UnprepareResources", strconv.FormatBool(err != nil)).Observe(time.Since(startTime).Seconds())
+	}(time.Now())
 	var claimNames []string
 	for i := range pod.Spec.ResourceClaims {
 		claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
@@ -383,7 +396,8 @@ func (m *ManagerImpl) UnprepareResources(ctx context.Context, pod *v1.Pod) error
 		}
 		claimNames = append(claimNames, *claimName)
 	}
-	return m.unprepareResources(ctx, pod.UID, pod.Namespace, claimNames)
+	err = m.unprepareResources(ctx, pod.UID, pod.Namespace, claimNames)
+	return err
 }
 
 func (m *ManagerImpl) unprepareResources(ctx context.Context, podUID types.UID, namespace string, claimNames []string) error {
diff --git a/pkg/kubelet/metrics/metrics.go b/pkg/kubelet/metrics/metrics.go
index 30a194a3a66..97f79dd052c 100644
--- a/pkg/kubelet/metrics/metrics.go
+++ b/pkg/kubelet/metrics/metrics.go
@@ -32,6 +32,7 @@ import (
 const (
 	FirstNetworkPodStartSLIDurationKey = "first_network_pod_start_sli_duration_seconds"
 	KubeletSubsystem                   = "kubelet"
+	DRASubsystem                       = "dra"
 	NodeNameKey                        = "node_name"
 	NodeLabelKey                       = "node"
 	NodeStartupPreKubeletKey           = "node_startup_pre_kubelet_duration_seconds"
@@ -132,6 +133,9 @@ const (
 	ContainerAlignedComputeResourcesScopeLabelKey    = "scope"
 	ContainerAlignedComputeResourcesBoundaryLabelKey = "boundary"
 
+	// Metric keys for DRA operations
+	DRAOperationsDurationKey = "operations_duration_seconds"
+
 	// Values used in metric labels
 	Container     = "container"
 	InitContainer = "init_container"
@@ -938,6 +942,18 @@ var (
 			StabilityLevel: metrics.ALPHA,
 		},
 	)
+
+	// DRAOperationsDuration tracks the duration of the DRA PrepareResources and UnprepareResources requests.
+	DRAOperationsDuration = metrics.NewHistogramVec(
+		&metrics.HistogramOpts{
+			Subsystem:      DRASubsystem,
+			Name:           DRAOperationsDurationKey,
+			Help:           "Latency histogram in seconds for the duration of handling all ResourceClaims referenced by a pod when the pod starts or stops. Identified by the name of the operation (PrepareResources or UnprepareResources) and separated by the success of the operation. The number of failed operations is provided through the histogram's overall count.",
+			Buckets:        metrics.DefBuckets,
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{"operation_name", "is_error"},
+	)
 )
 
 var registerMetrics sync.Once
@@ -1030,6 +1046,10 @@ func Register(collectors ...metrics.StableCollector) {
 		legacyregistry.MustRegister(LifecycleHandlerHTTPFallbacks)
 		legacyregistry.MustRegister(LifecycleHandlerSleepTerminated)
 		legacyregistry.MustRegister(CgroupVersion)
+
+		if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
+			legacyregistry.MustRegister(DRAOperationsDuration)
+		}
 	})
 }
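
Not part of the patch: below is a minimal, self-contained sketch of the wrapper pattern the patch applies in PrepareResources and UnprepareResources, i.e. timing an operation and recording one observation labelled with the operation name and whether it failed. It only uses helpers the patch itself relies on (k8s.io/component-base/metrics and legacyregistry); the timeOperation helper, the stand-in histogram, and the fake operation are hypothetical illustrations, not code from this change.

package main

import (
	"fmt"
	"strconv"
	"time"

	"k8s.io/component-base/metrics"
	"k8s.io/component-base/metrics/legacyregistry"
)

// operationsDuration is a hypothetical stand-in for the DRAOperationsDuration
// histogram added by this patch; the real definition lives in
// pkg/kubelet/metrics/metrics.go.
var operationsDuration = metrics.NewHistogramVec(
	&metrics.HistogramOpts{
		Subsystem:      "dra",
		Name:           "operations_duration_seconds",
		Help:           "Example histogram mirroring DRAOperationsDuration.",
		Buckets:        metrics.DefBuckets,
		StabilityLevel: metrics.ALPHA,
	},
	[]string{"operation_name", "is_error"},
)

// timeOperation mirrors the wrapper pattern used in PrepareResources and
// UnprepareResources: run the operation, then record its duration together
// with the operation name and whether it returned an error.
func timeOperation(name string, op func() error) error {
	start := time.Now()
	err := op()
	operationsDuration.WithLabelValues(name, strconv.FormatBool(err != nil)).
		Observe(time.Since(start).Seconds())
	return err
}

func main() {
	// Metrics built with component-base are inert until registered.
	legacyregistry.MustRegister(operationsDuration)

	// A hypothetical operation standing in for the gRPC round trip to a DRA driver.
	_ = timeOperation("PrepareResources", func() error {
		time.Sleep(10 * time.Millisecond)
		return nil
	})
	fmt.Println("recorded one observation for PrepareResources")
}

With the labels defined this way, failure rates can be derived by comparing the is_error="true" series of dra_operations_duration_seconds_count against the overall count. Registration of the real histogram is gated on the DynamicResourceAllocation feature gate, so the metric only appears on nodes where DRA is enabled.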