mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-08-04 18:00:08 +00:00
kubelet: add DRAOperationsDuration metric
This commit is contained in:
parent
c6669ea7d6
commit
a21f3f0a04
@ -19,6 +19,7 @@ package dra
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"strconv"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
v1 "k8s.io/api/core/v1"
|
v1 "k8s.io/api/core/v1"
|
||||||
@ -35,6 +36,7 @@ import (
|
|||||||
"k8s.io/kubernetes/pkg/kubelet/cm/dra/state"
|
"k8s.io/kubernetes/pkg/kubelet/cm/dra/state"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/config"
|
"k8s.io/kubernetes/pkg/kubelet/config"
|
||||||
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
||||||
|
"k8s.io/kubernetes/pkg/kubelet/metrics"
|
||||||
)
|
)
|
||||||
|
|
||||||
// draManagerStateFileName is the file name where dra manager stores its state
|
// draManagerStateFileName is the file name where dra manager stores its state
|
||||||
@ -150,6 +152,13 @@ func (m *ManagerImpl) reconcileLoop(ctx context.Context) {
|
|||||||
// for each new resource requirement, process their responses and update the cached
|
// for each new resource requirement, process their responses and update the cached
|
||||||
// containerResources on success.
|
// containerResources on success.
|
||||||
func (m *ManagerImpl) PrepareResources(ctx context.Context, pod *v1.Pod) error {
|
func (m *ManagerImpl) PrepareResources(ctx context.Context, pod *v1.Pod) error {
|
||||||
|
startTime := time.Now()
|
||||||
|
err := m.prepareResources(ctx, pod)
|
||||||
|
metrics.DRAOperationsDuration.WithLabelValues("PrepareResources", strconv.FormatBool(err == nil)).Observe(time.Since(startTime).Seconds())
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *ManagerImpl) prepareResources(ctx context.Context, pod *v1.Pod) error {
|
||||||
logger := klog.FromContext(ctx)
|
logger := klog.FromContext(ctx)
|
||||||
batches := make(map[string][]*drapb.Claim)
|
batches := make(map[string][]*drapb.Claim)
|
||||||
resourceClaims := make(map[types.UID]*resourceapi.ResourceClaim)
|
resourceClaims := make(map[types.UID]*resourceapi.ResourceClaim)
|
||||||
@ -369,6 +378,10 @@ func (m *ManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*Conta
|
|||||||
// As such, calls to the underlying NodeUnprepareResource API are skipped for claims that have
|
// As such, calls to the underlying NodeUnprepareResource API are skipped for claims that have
|
||||||
// already been successfully unprepared.
|
// already been successfully unprepared.
|
||||||
func (m *ManagerImpl) UnprepareResources(ctx context.Context, pod *v1.Pod) error {
|
func (m *ManagerImpl) UnprepareResources(ctx context.Context, pod *v1.Pod) error {
|
||||||
|
var err error = nil
|
||||||
|
defer func(startTime time.Time) {
|
||||||
|
metrics.DRAOperationsDuration.WithLabelValues("UnprepareResources", strconv.FormatBool(err != nil)).Observe(time.Since(startTime).Seconds())
|
||||||
|
}(time.Now())
|
||||||
var claimNames []string
|
var claimNames []string
|
||||||
for i := range pod.Spec.ResourceClaims {
|
for i := range pod.Spec.ResourceClaims {
|
||||||
claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
|
claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
|
||||||
@ -383,7 +396,8 @@ func (m *ManagerImpl) UnprepareResources(ctx context.Context, pod *v1.Pod) error
|
|||||||
}
|
}
|
||||||
claimNames = append(claimNames, *claimName)
|
claimNames = append(claimNames, *claimName)
|
||||||
}
|
}
|
||||||
return m.unprepareResources(ctx, pod.UID, pod.Namespace, claimNames)
|
err = m.unprepareResources(ctx, pod.UID, pod.Namespace, claimNames)
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *ManagerImpl) unprepareResources(ctx context.Context, podUID types.UID, namespace string, claimNames []string) error {
|
func (m *ManagerImpl) unprepareResources(ctx context.Context, podUID types.UID, namespace string, claimNames []string) error {
|
||||||
|
@ -32,6 +32,7 @@ import (
|
|||||||
const (
|
const (
|
||||||
FirstNetworkPodStartSLIDurationKey = "first_network_pod_start_sli_duration_seconds"
|
FirstNetworkPodStartSLIDurationKey = "first_network_pod_start_sli_duration_seconds"
|
||||||
KubeletSubsystem = "kubelet"
|
KubeletSubsystem = "kubelet"
|
||||||
|
DRASubsystem = "dra"
|
||||||
NodeNameKey = "node_name"
|
NodeNameKey = "node_name"
|
||||||
NodeLabelKey = "node"
|
NodeLabelKey = "node"
|
||||||
NodeStartupPreKubeletKey = "node_startup_pre_kubelet_duration_seconds"
|
NodeStartupPreKubeletKey = "node_startup_pre_kubelet_duration_seconds"
|
||||||
@ -132,6 +133,9 @@ const (
|
|||||||
ContainerAlignedComputeResourcesScopeLabelKey = "scope"
|
ContainerAlignedComputeResourcesScopeLabelKey = "scope"
|
||||||
ContainerAlignedComputeResourcesBoundaryLabelKey = "boundary"
|
ContainerAlignedComputeResourcesBoundaryLabelKey = "boundary"
|
||||||
|
|
||||||
|
// Metric keys for DRA operations
|
||||||
|
DRAOperationsDurationKey = "operations_duration_seconds"
|
||||||
|
|
||||||
// Values used in metric labels
|
// Values used in metric labels
|
||||||
Container = "container"
|
Container = "container"
|
||||||
InitContainer = "init_container"
|
InitContainer = "init_container"
|
||||||
@ -938,6 +942,18 @@ var (
|
|||||||
StabilityLevel: metrics.ALPHA,
|
StabilityLevel: metrics.ALPHA,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// DRAOperationsDuration tracks the duration of the DRA PrepareResources and UnprepareResources requests.
|
||||||
|
DRAOperationsDuration = metrics.NewHistogramVec(
|
||||||
|
&metrics.HistogramOpts{
|
||||||
|
Subsystem: DRASubsystem,
|
||||||
|
Name: DRAOperationsDurationKey,
|
||||||
|
Help: "Latency histogram in seconds for the duration of handling all ResourceClaims referenced by a pod when the pod starts or stops. Identified by the name of the operation (PrepareResources or UnprepareResources) and separated by the success of the operation. The number of failed operations is provided through the histogram's overall count.",
|
||||||
|
Buckets: metrics.DefBuckets,
|
||||||
|
StabilityLevel: metrics.ALPHA,
|
||||||
|
},
|
||||||
|
[]string{"operation_name", "is_error"},
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
var registerMetrics sync.Once
|
var registerMetrics sync.Once
|
||||||
@ -1030,6 +1046,10 @@ func Register(collectors ...metrics.StableCollector) {
|
|||||||
legacyregistry.MustRegister(LifecycleHandlerHTTPFallbacks)
|
legacyregistry.MustRegister(LifecycleHandlerHTTPFallbacks)
|
||||||
legacyregistry.MustRegister(LifecycleHandlerSleepTerminated)
|
legacyregistry.MustRegister(LifecycleHandlerSleepTerminated)
|
||||||
legacyregistry.MustRegister(CgroupVersion)
|
legacyregistry.MustRegister(CgroupVersion)
|
||||||
|
|
||||||
|
if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
|
||||||
|
legacyregistry.MustRegister(DRAOperationsDuration)
|
||||||
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user