kubelet: add DRAOperationsDuration metric

This commit is contained in:
Ed Bartosh 2024-09-05 15:18:32 +03:00
parent c6669ea7d6
commit a21f3f0a04
2 changed files with 35 additions and 1 deletions

View File

@ -19,6 +19,7 @@ package dra
import (
"context"
"fmt"
"strconv"
"time"
v1 "k8s.io/api/core/v1"
@ -35,6 +36,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/cm/dra/state"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/metrics"
)
// draManagerStateFileName is the file name where dra manager stores its state
@ -150,6 +152,13 @@ func (m *ManagerImpl) reconcileLoop(ctx context.Context) {
// for each new resource requirement, process their responses and update the cached
// containerResources on success.
func (m *ManagerImpl) PrepareResources(ctx context.Context, pod *v1.Pod) error {
	// Time the whole call so that both successful and failed attempts are
	// observed by the histogram.
	startTime := time.Now()
	err := m.prepareResources(ctx, pod)

	// The second label of DRAOperationsDuration is "is_error", so it must be
	// "true" only when the operation failed (err != nil). This matches how
	// UnprepareResources reports the same label.
	isError := strconv.FormatBool(err != nil)
	metrics.DRAOperationsDuration.WithLabelValues("PrepareResources", isError).Observe(time.Since(startTime).Seconds())
	return err
}
func (m *ManagerImpl) prepareResources(ctx context.Context, pod *v1.Pod) error {
logger := klog.FromContext(ctx)
batches := make(map[string][]*drapb.Claim)
resourceClaims := make(map[types.UID]*resourceapi.ResourceClaim)
@ -369,6 +378,10 @@ func (m *ManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*Conta
// As such, calls to the underlying NodeUnprepareResource API are skipped for claims that have
// already been successfully unprepared.
func (m *ManagerImpl) UnprepareResources(ctx context.Context, pod *v1.Pod) error {
// err is the function-level result; the deferred closure below reads it to
// derive the "is_error" label value.
var err error = nil
// Observe the elapsed time on every return path. startTime is fixed when the
// defer statement is evaluated, i.e. at function entry.
defer func(startTime time.Time) {
metrics.DRAOperationsDuration.WithLabelValues("UnprepareResources", strconv.FormatBool(err != nil)).Observe(time.Since(startTime).Seconds())
}(time.Now())
var claimNames []string
for i := range pod.Spec.ResourceClaims {
// NOTE(review): `:=` declares a new err scoped to the loop body, shadowing the
// function-level err above. If the code that follows returns on this error,
// the deferred closure would still see the outer err as nil and record
// is_error="false" for a failed call — confirm and assign to the outer err.
claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
@ -383,7 +396,8 @@ func (m *ManagerImpl) UnprepareResources(ctx context.Context, pod *v1.Pod) error
}
claimNames = append(claimNames, *claimName)
}
return m.unprepareResources(ctx, pod.UID, pod.Namespace, claimNames)
// Assign to the function-level err (plain `=`) so the deferred metric sees
// the outcome of the unprepare call.
err = m.unprepareResources(ctx, pod.UID, pod.Namespace, claimNames)
return err
}
func (m *ManagerImpl) unprepareResources(ctx context.Context, podUID types.UID, namespace string, claimNames []string) error {

View File

@ -32,6 +32,7 @@ import (
const (
FirstNetworkPodStartSLIDurationKey = "first_network_pod_start_sli_duration_seconds"
KubeletSubsystem = "kubelet"
DRASubsystem = "dra"
NodeNameKey = "node_name"
NodeLabelKey = "node"
NodeStartupPreKubeletKey = "node_startup_pre_kubelet_duration_seconds"
@ -132,6 +133,9 @@ const (
ContainerAlignedComputeResourcesScopeLabelKey = "scope"
ContainerAlignedComputeResourcesBoundaryLabelKey = "boundary"
// Metric keys for DRA operations
DRAOperationsDurationKey = "operations_duration_seconds"
// Values used in metric labels
Container = "container"
InitContainer = "init_container"
@ -938,6 +942,18 @@ var (
StabilityLevel: metrics.ALPHA,
},
)
// DRAOperationsDuration tracks the duration of the DRA PrepareResources and UnprepareResources requests.
// It is a histogram vector in seconds whose name is built from DRASubsystem and
// DRAOperationsDurationKey, using the default bucket layout (metrics.DefBuckets).
// Observations carry two labels, in this order:
//   - "operation_name": the operation being timed.
//   - "is_error": a boolean string; NOTE(review): callers are expected to pass
//     "true" when the operation failed — verify every call site follows this.
//
// The metric is ALPHA and is only registered when the DynamicResourceAllocation
// feature gate is enabled.
DRAOperationsDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: DRASubsystem,
Name: DRAOperationsDurationKey,
Help: "Latency histogram in seconds for the duration of handling all ResourceClaims referenced by a pod when the pod starts or stops. Identified by the name of the operation (PrepareResources or UnprepareResources) and separated by the success of the operation. The number of failed operations is provided through the histogram's overall count.",
Buckets: metrics.DefBuckets,
StabilityLevel: metrics.ALPHA,
},
[]string{"operation_name", "is_error"},
)
)
var registerMetrics sync.Once
@ -1030,6 +1046,10 @@ func Register(collectors ...metrics.StableCollector) {
legacyregistry.MustRegister(LifecycleHandlerHTTPFallbacks)
legacyregistry.MustRegister(LifecycleHandlerSleepTerminated)
legacyregistry.MustRegister(CgroupVersion)
if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
legacyregistry.MustRegister(DRAOperationsDuration)
}
})
}