mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-31 23:37:01 +00:00
kubelet: add DRAOperationsDuration metric
This commit is contained in:
parent
c6669ea7d6
commit
a21f3f0a04
@ -19,6 +19,7 @@ package dra
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
@ -35,6 +36,7 @@ import (
|
||||
"k8s.io/kubernetes/pkg/kubelet/cm/dra/state"
|
||||
"k8s.io/kubernetes/pkg/kubelet/config"
|
||||
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
||||
"k8s.io/kubernetes/pkg/kubelet/metrics"
|
||||
)
|
||||
|
||||
// draManagerStateFileName is the file name where dra manager stores its state
|
||||
@ -150,6 +152,13 @@ func (m *ManagerImpl) reconcileLoop(ctx context.Context) {
|
||||
// for each new resource requirement, process their responses and update the cached
|
||||
// containerResources on success.
|
||||
func (m *ManagerImpl) PrepareResources(ctx context.Context, pod *v1.Pod) error {
|
||||
startTime := time.Now()
|
||||
err := m.prepareResources(ctx, pod)
|
||||
metrics.DRAOperationsDuration.WithLabelValues("PrepareResources", strconv.FormatBool(err == nil)).Observe(time.Since(startTime).Seconds())
|
||||
return err
|
||||
}
|
||||
|
||||
func (m *ManagerImpl) prepareResources(ctx context.Context, pod *v1.Pod) error {
|
||||
logger := klog.FromContext(ctx)
|
||||
batches := make(map[string][]*drapb.Claim)
|
||||
resourceClaims := make(map[types.UID]*resourceapi.ResourceClaim)
|
||||
@ -369,6 +378,10 @@ func (m *ManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*Conta
|
||||
// As such, calls to the underlying NodeUnprepareResource API are skipped for claims that have
|
||||
// already been successfully unprepared.
|
||||
func (m *ManagerImpl) UnprepareResources(ctx context.Context, pod *v1.Pod) error {
|
||||
var err error = nil
|
||||
defer func(startTime time.Time) {
|
||||
metrics.DRAOperationsDuration.WithLabelValues("UnprepareResources", strconv.FormatBool(err != nil)).Observe(time.Since(startTime).Seconds())
|
||||
}(time.Now())
|
||||
var claimNames []string
|
||||
for i := range pod.Spec.ResourceClaims {
|
||||
claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
|
||||
@ -383,7 +396,8 @@ func (m *ManagerImpl) UnprepareResources(ctx context.Context, pod *v1.Pod) error
|
||||
}
|
||||
claimNames = append(claimNames, *claimName)
|
||||
}
|
||||
return m.unprepareResources(ctx, pod.UID, pod.Namespace, claimNames)
|
||||
err = m.unprepareResources(ctx, pod.UID, pod.Namespace, claimNames)
|
||||
return err
|
||||
}
|
||||
|
||||
func (m *ManagerImpl) unprepareResources(ctx context.Context, podUID types.UID, namespace string, claimNames []string) error {
|
||||
|
@ -32,6 +32,7 @@ import (
|
||||
const (
|
||||
FirstNetworkPodStartSLIDurationKey = "first_network_pod_start_sli_duration_seconds"
|
||||
KubeletSubsystem = "kubelet"
|
||||
DRASubsystem = "dra"
|
||||
NodeNameKey = "node_name"
|
||||
NodeLabelKey = "node"
|
||||
NodeStartupPreKubeletKey = "node_startup_pre_kubelet_duration_seconds"
|
||||
@ -132,6 +133,9 @@ const (
|
||||
ContainerAlignedComputeResourcesScopeLabelKey = "scope"
|
||||
ContainerAlignedComputeResourcesBoundaryLabelKey = "boundary"
|
||||
|
||||
// Metric keys for DRA operations
|
||||
DRAOperationsDurationKey = "operations_duration_seconds"
|
||||
|
||||
// Values used in metric labels
|
||||
Container = "container"
|
||||
InitContainer = "init_container"
|
||||
@ -938,6 +942,18 @@ var (
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
)
|
||||
|
||||
// DRAOperationsDuration tracks the duration of the DRA PrepareResources and UnprepareResources requests.
|
||||
DRAOperationsDuration = metrics.NewHistogramVec(
|
||||
&metrics.HistogramOpts{
|
||||
Subsystem: DRASubsystem,
|
||||
Name: DRAOperationsDurationKey,
|
||||
Help: "Latency histogram in seconds for the duration of handling all ResourceClaims referenced by a pod when the pod starts or stops. Identified by the name of the operation (PrepareResources or UnprepareResources) and separated by the success of the operation. The number of failed operations is provided through the histogram's overall count.",
|
||||
Buckets: metrics.DefBuckets,
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
[]string{"operation_name", "is_error"},
|
||||
)
|
||||
)
|
||||
|
||||
var registerMetrics sync.Once
|
||||
@ -1030,6 +1046,10 @@ func Register(collectors ...metrics.StableCollector) {
|
||||
legacyregistry.MustRegister(LifecycleHandlerHTTPFallbacks)
|
||||
legacyregistry.MustRegister(LifecycleHandlerSleepTerminated)
|
||||
legacyregistry.MustRegister(CgroupVersion)
|
||||
|
||||
if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
|
||||
legacyregistry.MustRegister(DRAOperationsDuration)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user