mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-29 22:46:12 +00:00
Add metrics for azure service operations (route and loadbalancer).
This commit is contained in:
parent
16ea9dc6bc
commit
88c72cc3ad
@ -91,6 +91,7 @@ go_library(
|
||||
"//staging/src/k8s.io/legacy-cloud-providers/azure/clients/vmssclient/mockvmssclient:go_default_library",
|
||||
"//staging/src/k8s.io/legacy-cloud-providers/azure/clients/vmssvmclient:go_default_library",
|
||||
"//staging/src/k8s.io/legacy-cloud-providers/azure/clients/vmssvmclient/mockvmssvmclient:go_default_library",
|
||||
"//staging/src/k8s.io/legacy-cloud-providers/azure/metrics:go_default_library",
|
||||
"//staging/src/k8s.io/legacy-cloud-providers/azure/retry:go_default_library",
|
||||
"//vendor/github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-12-01/compute:go_default_library",
|
||||
"//vendor/github.com/Azure/azure-sdk-for-go/services/network/mgmt/2019-06-01/network:go_default_library",
|
||||
|
@ -36,6 +36,7 @@ import (
|
||||
servicehelpers "k8s.io/cloud-provider/service/helpers"
|
||||
"k8s.io/klog/v2"
|
||||
azcache "k8s.io/legacy-cloud-providers/azure/cache"
|
||||
"k8s.io/legacy-cloud-providers/azure/metrics"
|
||||
"k8s.io/legacy-cloud-providers/azure/retry"
|
||||
utilnet "k8s.io/utils/net"
|
||||
)
|
||||
@ -157,6 +158,12 @@ func (az *Cloud) EnsureLoadBalancer(ctx context.Context, clusterName string, ser
|
||||
serviceName := getServiceName(service)
|
||||
klog.V(5).Infof("ensureloadbalancer(%s): START clusterName=%q", serviceName, clusterName)
|
||||
|
||||
mc := metrics.NewMetricContext("services", "ensure_loadbalancer", az.ResourceGroup, az.SubscriptionID, serviceName)
|
||||
isOperationSucceeded := false
|
||||
defer func() {
|
||||
mc.ObserveOperationWithResult(isOperationSucceeded)
|
||||
}()
|
||||
|
||||
lb, err := az.reconcileLoadBalancer(clusterName, service, nodes, true /* wantLb */)
|
||||
if err != nil {
|
||||
klog.Errorf("reconcileLoadBalancer(%s) failed: %v", serviceName, err)
|
||||
@ -192,6 +199,8 @@ func (az *Cloud) EnsureLoadBalancer(ctx context.Context, clusterName string, ser
|
||||
return nil, err
|
||||
}
|
||||
|
||||
isOperationSucceeded = true
|
||||
|
||||
return lbStatus, nil
|
||||
}
|
||||
|
||||
@ -216,6 +225,12 @@ func (az *Cloud) EnsureLoadBalancerDeleted(ctx context.Context, clusterName stri
|
||||
serviceName := getServiceName(service)
|
||||
klog.V(5).Infof("Delete service (%s): START clusterName=%q", serviceName, clusterName)
|
||||
|
||||
mc := metrics.NewMetricContext("services", "ensure_loadbalancer_deleted", az.ResourceGroup, az.SubscriptionID, serviceName)
|
||||
isOperationSucceeded := false
|
||||
defer func() {
|
||||
mc.ObserveOperationWithResult(isOperationSucceeded)
|
||||
}()
|
||||
|
||||
serviceIPToCleanup, err := az.findServiceIPAddress(ctx, clusterName, service, isInternal)
|
||||
if err != nil && !retry.HasStatusForbiddenOrIgnoredError(err) {
|
||||
return err
|
||||
@ -235,6 +250,8 @@ func (az *Cloud) EnsureLoadBalancerDeleted(ctx context.Context, clusterName stri
|
||||
}
|
||||
|
||||
klog.V(2).Infof("Delete service (%s): FINISH", serviceName)
|
||||
isOperationSucceeded = true
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -33,6 +33,7 @@ import (
|
||||
cloudprovider "k8s.io/cloud-provider"
|
||||
"k8s.io/klog/v2"
|
||||
azcache "k8s.io/legacy-cloud-providers/azure/cache"
|
||||
"k8s.io/legacy-cloud-providers/azure/metrics"
|
||||
utilnet "k8s.io/utils/net"
|
||||
)
|
||||
|
||||
@ -282,6 +283,12 @@ func (az *Cloud) createRouteTable() error {
|
||||
// route.Name will be ignored, although the cloud-provider may use nameHint
|
||||
// to create a more user-meaningful name.
|
||||
func (az *Cloud) CreateRoute(ctx context.Context, clusterName string, nameHint string, kubeRoute *cloudprovider.Route) error {
|
||||
mc := metrics.NewMetricContext("routes", "create_route", az.ResourceGroup, az.SubscriptionID, "")
|
||||
isOperationSucceeded := false
|
||||
defer func() {
|
||||
mc.ObserveOperationWithResult(isOperationSucceeded)
|
||||
}()
|
||||
|
||||
// Returns for unmanaged nodes because azure cloud provider couldn't fetch information for them.
|
||||
var targetIP string
|
||||
nodeName := string(kubeRoute.TargetNode)
|
||||
@ -351,12 +358,20 @@ func (az *Cloud) CreateRoute(ctx context.Context, clusterName string, nameHint s
|
||||
}
|
||||
|
||||
klog.V(2).Infof("CreateRoute: route created. clusterName=%q instance=%q cidr=%q", clusterName, kubeRoute.TargetNode, kubeRoute.DestinationCIDR)
|
||||
isOperationSucceeded = true
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// DeleteRoute deletes the specified managed route
|
||||
// Route should be as returned by ListRoutes
|
||||
func (az *Cloud) DeleteRoute(ctx context.Context, clusterName string, kubeRoute *cloudprovider.Route) error {
|
||||
mc := metrics.NewMetricContext("routes", "delete_route", az.ResourceGroup, az.SubscriptionID, "")
|
||||
isOperationSucceeded := false
|
||||
defer func() {
|
||||
mc.ObserveOperationWithResult(isOperationSucceeded)
|
||||
}()
|
||||
|
||||
// Returns for unmanaged nodes because azure cloud provider couldn't fetch information for them.
|
||||
nodeName := string(kubeRoute.TargetNode)
|
||||
unmanaged, err := az.IsNodeUnmanaged(nodeName)
|
||||
@ -392,6 +407,8 @@ func (az *Cloud) DeleteRoute(ctx context.Context, clusterName string, kubeRoute
|
||||
}
|
||||
|
||||
klog.V(2).Infof("DeleteRoute: route deleted. clusterName=%q instance=%q cidr=%q", clusterName, kubeRoute.TargetNode, kubeRoute.DestinationCIDR)
|
||||
isOperationSucceeded = true
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -38,10 +38,10 @@ import (
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/apimachinery/pkg/util/uuid"
|
||||
cloudprovider "k8s.io/cloud-provider"
|
||||
"k8s.io/component-base/featuregate"
|
||||
"k8s.io/klog/v2"
|
||||
azcache "k8s.io/legacy-cloud-providers/azure/cache"
|
||||
|
||||
"k8s.io/component-base/featuregate"
|
||||
"k8s.io/legacy-cloud-providers/azure/metrics"
|
||||
utilnet "k8s.io/utils/net"
|
||||
)
|
||||
|
||||
@ -808,6 +808,12 @@ func (as *availabilitySet) EnsureHostInPool(service *v1.Service, nodeName types.
|
||||
// EnsureHostsInPool ensures the given Node's primary IP configurations are
|
||||
// participating in the specified LoadBalancer Backend Pool.
|
||||
func (as *availabilitySet) EnsureHostsInPool(service *v1.Service, nodes []*v1.Node, backendPoolID string, vmSetName string, isInternal bool) error {
|
||||
mc := metrics.NewMetricContext("services", "vmas_ensure_hosts_in_pool", as.ResourceGroup, as.SubscriptionID, service.Name)
|
||||
isOperationSucceeded := false
|
||||
defer func() {
|
||||
mc.ObserveOperationWithResult(isOperationSucceeded)
|
||||
}()
|
||||
|
||||
hostUpdates := make([]func() error, 0, len(nodes))
|
||||
for _, node := range nodes {
|
||||
localNodeName := node.Name
|
||||
@ -836,6 +842,7 @@ func (as *availabilitySet) EnsureHostsInPool(service *v1.Service, nodes []*v1.No
|
||||
return utilerrors.Flatten(errs)
|
||||
}
|
||||
|
||||
isOperationSucceeded = true
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -37,6 +37,7 @@ import (
|
||||
cloudprovider "k8s.io/cloud-provider"
|
||||
"k8s.io/klog/v2"
|
||||
azcache "k8s.io/legacy-cloud-providers/azure/cache"
|
||||
"k8s.io/legacy-cloud-providers/azure/metrics"
|
||||
utilnet "k8s.io/utils/net"
|
||||
)
|
||||
|
||||
@ -1183,6 +1184,12 @@ func (ss *scaleSet) ensureVMSSInPool(service *v1.Service, nodes []*v1.Node, back
|
||||
// EnsureHostsInPool ensures the given Node's primary IP configurations are
|
||||
// participating in the specified LoadBalancer Backend Pool.
|
||||
func (ss *scaleSet) EnsureHostsInPool(service *v1.Service, nodes []*v1.Node, backendPoolID string, vmSetName string, isInternal bool) error {
|
||||
mc := metrics.NewMetricContext("services", "vmss_ensure_hosts_in_pool", ss.ResourceGroup, ss.SubscriptionID, service.Name)
|
||||
isOperationSucceeded := false
|
||||
defer func() {
|
||||
mc.ObserveOperationWithResult(isOperationSucceeded)
|
||||
}()
|
||||
|
||||
hostUpdates := make([]func() error, 0, len(nodes))
|
||||
nodeUpdates := make(map[vmssMetaInfo]map[string]compute.VirtualMachineScaleSetVM)
|
||||
errors := make([]error, 0)
|
||||
@ -1281,6 +1288,7 @@ func (ss *scaleSet) EnsureHostsInPool(service *v1.Service, nodes []*v1.Node, bac
|
||||
return err
|
||||
}
|
||||
|
||||
isOperationSucceeded = true
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -1484,6 +1492,12 @@ func (ss *scaleSet) EnsureBackendPoolDeleted(service *v1.Service, backendPoolID,
|
||||
return nil
|
||||
}
|
||||
|
||||
mc := metrics.NewMetricContext("services", "vmss_ensure_backend_pool_deleted", ss.ResourceGroup, ss.SubscriptionID, service.Name)
|
||||
isOperationSucceeded := false
|
||||
defer func() {
|
||||
mc.ObserveOperationWithResult(isOperationSucceeded)
|
||||
}()
|
||||
|
||||
ipConfigurationIDs := []string{}
|
||||
for _, backendPool := range *backendAddressPools {
|
||||
if strings.EqualFold(*backendPool.ID, backendPoolID) && backendPool.BackendIPConfigurations != nil {
|
||||
@ -1582,5 +1596,6 @@ func (ss *scaleSet) EnsureBackendPoolDeleted(service *v1.Service, backendPoolID,
|
||||
return err
|
||||
}
|
||||
|
||||
isOperationSucceeded = true
|
||||
return nil
|
||||
}
|
||||
|
@ -26,6 +26,24 @@ import (
|
||||
"k8s.io/component-base/metrics/legacyregistry"
|
||||
)
|
||||
|
||||
const (
|
||||
azureMetricsNamespace = "cloudprovider_azure"
|
||||
)
|
||||
|
||||
var (
|
||||
metricLabels = []string{
|
||||
"request", // API function that is being invoked
|
||||
"resource_group", // Resource group of the resource being monitored
|
||||
"subscription_id", // Subscription ID of the resource being monitored
|
||||
"source", // Operation source(optional)
|
||||
}
|
||||
|
||||
apiMetrics = registerAPIMetrics(metricLabels...)
|
||||
operationMetrics = registerOperationMetrics(metricLabels...)
|
||||
)
|
||||
|
||||
// apiCallMetrics is the metrics measuring the performance of a single API call
|
||||
// e.g., GET, POST ...
|
||||
type apiCallMetrics struct {
|
||||
latency *metrics.HistogramVec
|
||||
errors *metrics.CounterVec
|
||||
@ -33,16 +51,12 @@ type apiCallMetrics struct {
|
||||
throttledCount *metrics.CounterVec
|
||||
}
|
||||
|
||||
var (
|
||||
metricLabels = []string{
|
||||
"request", // API function that is being invoked
|
||||
"resource_group", // Resource group of the resource being monitored
|
||||
"subscription_id", // Subscription ID of the resource being monitored
|
||||
"source", // Oeration source(optional)
|
||||
}
|
||||
|
||||
apiMetrics = registerAPIMetrics(metricLabels...)
|
||||
)
|
||||
// operationCallMetrics is the metrics measuring the performance of a whole operation
|
||||
// e.g., the create / update / delete process of a loadbalancer or route.
|
||||
type operationCallMetrics struct {
|
||||
operationLatency *metrics.HistogramVec
|
||||
operationFailureCount *metrics.CounterVec
|
||||
}
|
||||
|
||||
// MetricContext indicates the context for Azure client metrics.
|
||||
type MetricContext struct {
|
||||
@ -79,12 +93,27 @@ func (mc *MetricContext) Observe(err error) error {
|
||||
return err
|
||||
}
|
||||
|
||||
// ObserveOperationWithResult observes the request latency and failed requests of an operation.
|
||||
func (mc *MetricContext) ObserveOperationWithResult(isOperationSucceeded bool) {
|
||||
operationMetrics.operationLatency.WithLabelValues(mc.attributes...).Observe(
|
||||
time.Since(mc.start).Seconds())
|
||||
if !isOperationSucceeded {
|
||||
mc.CountFailedOperation()
|
||||
}
|
||||
}
|
||||
|
||||
// CountFailedOperation increase the number of failed operations
|
||||
func (mc *MetricContext) CountFailedOperation() {
|
||||
operationMetrics.operationFailureCount.WithLabelValues(mc.attributes...).Inc()
|
||||
}
|
||||
|
||||
// registerAPIMetrics registers the API metrics.
|
||||
func registerAPIMetrics(attributes ...string) *apiCallMetrics {
|
||||
metrics := &apiCallMetrics{
|
||||
latency: metrics.NewHistogramVec(
|
||||
&metrics.HistogramOpts{
|
||||
Name: "cloudprovider_azure_api_request_duration_seconds",
|
||||
Namespace: azureMetricsNamespace,
|
||||
Name: "api_request_duration_seconds",
|
||||
Help: "Latency of an Azure API call",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
@ -92,7 +121,8 @@ func registerAPIMetrics(attributes ...string) *apiCallMetrics {
|
||||
),
|
||||
errors: metrics.NewCounterVec(
|
||||
&metrics.CounterOpts{
|
||||
Name: "cloudprovider_azure_api_request_errors",
|
||||
Namespace: azureMetricsNamespace,
|
||||
Name: "api_request_errors",
|
||||
Help: "Number of errors for an Azure API call",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
@ -100,7 +130,8 @@ func registerAPIMetrics(attributes ...string) *apiCallMetrics {
|
||||
),
|
||||
rateLimitedCount: metrics.NewCounterVec(
|
||||
&metrics.CounterOpts{
|
||||
Name: "cloudprovider_azure_api_request_ratelimited_count",
|
||||
Namespace: azureMetricsNamespace,
|
||||
Name: "api_request_ratelimited_count",
|
||||
Help: "Number of rate limited Azure API calls",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
@ -108,7 +139,8 @@ func registerAPIMetrics(attributes ...string) *apiCallMetrics {
|
||||
),
|
||||
throttledCount: metrics.NewCounterVec(
|
||||
&metrics.CounterOpts{
|
||||
Name: "cloudprovider_azure_api_request_throttled_count",
|
||||
Namespace: azureMetricsNamespace,
|
||||
Name: "api_request_throttled_count",
|
||||
Help: "Number of throttled Azure API calls",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
@ -123,3 +155,33 @@ func registerAPIMetrics(attributes ...string) *apiCallMetrics {
|
||||
|
||||
return metrics
|
||||
}
|
||||
|
||||
// registerOperationMetrics registers the operation metrics.
|
||||
func registerOperationMetrics(attributes ...string) *operationCallMetrics {
|
||||
metrics := &operationCallMetrics{
|
||||
operationLatency: metrics.NewHistogramVec(
|
||||
&metrics.HistogramOpts{
|
||||
Namespace: azureMetricsNamespace,
|
||||
Name: "op_duration_seconds",
|
||||
Help: "Latency of an Azure service operation",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
Buckets: []float64{0.1, 0.2, 0.5, 1, 10, 20, 30, 40, 50, 60, 100, 200, 300},
|
||||
},
|
||||
attributes,
|
||||
),
|
||||
operationFailureCount: metrics.NewCounterVec(
|
||||
&metrics.CounterOpts{
|
||||
Namespace: azureMetricsNamespace,
|
||||
Name: "op_failure_count",
|
||||
Help: "Number of failed Azure service operations",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
attributes,
|
||||
),
|
||||
}
|
||||
|
||||
legacyregistry.MustRegister(metrics.operationLatency)
|
||||
legacyregistry.MustRegister(metrics.operationFailureCount)
|
||||
|
||||
return metrics
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user