From 88c72cc3ad4e33b2722fc2d6ea61e157e02b0197 Mon Sep 17 00:00:00 2001 From: niqi Date: Thu, 20 Aug 2020 14:38:12 +0800 Subject: [PATCH] Add metrics for azure service operations (route and loadbalancer). --- .../k8s.io/legacy-cloud-providers/azure/BUILD | 1 + .../azure/azure_loadbalancer.go | 17 ++++ .../azure/azure_routes.go | 17 ++++ .../azure/azure_standard.go | 11 ++- .../azure/azure_vmss.go | 15 ++++ .../azure/metrics/azure_metrics.go | 90 ++++++++++++++++--- 6 files changed, 135 insertions(+), 16 deletions(-) diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/BUILD b/staging/src/k8s.io/legacy-cloud-providers/azure/BUILD index 5f1287be000..8e4bda171b8 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/azure/BUILD +++ b/staging/src/k8s.io/legacy-cloud-providers/azure/BUILD @@ -91,6 +91,7 @@ go_library( "//staging/src/k8s.io/legacy-cloud-providers/azure/clients/vmssclient/mockvmssclient:go_default_library", "//staging/src/k8s.io/legacy-cloud-providers/azure/clients/vmssvmclient:go_default_library", "//staging/src/k8s.io/legacy-cloud-providers/azure/clients/vmssvmclient/mockvmssvmclient:go_default_library", + "//staging/src/k8s.io/legacy-cloud-providers/azure/metrics:go_default_library", "//staging/src/k8s.io/legacy-cloud-providers/azure/retry:go_default_library", "//vendor/github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-12-01/compute:go_default_library", "//vendor/github.com/Azure/azure-sdk-for-go/services/network/mgmt/2019-06-01/network:go_default_library", diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_loadbalancer.go b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_loadbalancer.go index 9744507c8cb..0ec8bf998fb 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_loadbalancer.go +++ b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_loadbalancer.go @@ -36,6 +36,7 @@ import ( servicehelpers "k8s.io/cloud-provider/service/helpers" "k8s.io/klog/v2" azcache 
"k8s.io/legacy-cloud-providers/azure/cache" + "k8s.io/legacy-cloud-providers/azure/metrics" "k8s.io/legacy-cloud-providers/azure/retry" utilnet "k8s.io/utils/net" ) @@ -157,6 +158,12 @@ func (az *Cloud) EnsureLoadBalancer(ctx context.Context, clusterName string, ser serviceName := getServiceName(service) klog.V(5).Infof("ensureloadbalancer(%s): START clusterName=%q", serviceName, clusterName) + mc := metrics.NewMetricContext("services", "ensure_loadbalancer", az.ResourceGroup, az.SubscriptionID, serviceName) + isOperationSucceeded := false + defer func() { + mc.ObserveOperationWithResult(isOperationSucceeded) + }() + lb, err := az.reconcileLoadBalancer(clusterName, service, nodes, true /* wantLb */) if err != nil { klog.Errorf("reconcileLoadBalancer(%s) failed: %v", serviceName, err) @@ -192,6 +199,8 @@ func (az *Cloud) EnsureLoadBalancer(ctx context.Context, clusterName string, ser return nil, err } + isOperationSucceeded = true + return lbStatus, nil } @@ -216,6 +225,12 @@ func (az *Cloud) EnsureLoadBalancerDeleted(ctx context.Context, clusterName stri serviceName := getServiceName(service) klog.V(5).Infof("Delete service (%s): START clusterName=%q", serviceName, clusterName) + mc := metrics.NewMetricContext("services", "ensure_loadbalancer_deleted", az.ResourceGroup, az.SubscriptionID, serviceName) + isOperationSucceeded := false + defer func() { + mc.ObserveOperationWithResult(isOperationSucceeded) + }() + serviceIPToCleanup, err := az.findServiceIPAddress(ctx, clusterName, service, isInternal) if err != nil && !retry.HasStatusForbiddenOrIgnoredError(err) { return err @@ -235,6 +250,8 @@ func (az *Cloud) EnsureLoadBalancerDeleted(ctx context.Context, clusterName stri } klog.V(2).Infof("Delete service (%s): FINISH", serviceName) + isOperationSucceeded = true + return nil } diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_routes.go b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_routes.go index d6797b28088..b3797da2310 100644 --- 
a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_routes.go +++ b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_routes.go @@ -33,6 +33,7 @@ import ( cloudprovider "k8s.io/cloud-provider" "k8s.io/klog/v2" azcache "k8s.io/legacy-cloud-providers/azure/cache" + "k8s.io/legacy-cloud-providers/azure/metrics" utilnet "k8s.io/utils/net" ) @@ -282,6 +283,12 @@ func (az *Cloud) createRouteTable() error { // route.Name will be ignored, although the cloud-provider may use nameHint // to create a more user-meaningful name. func (az *Cloud) CreateRoute(ctx context.Context, clusterName string, nameHint string, kubeRoute *cloudprovider.Route) error { + mc := metrics.NewMetricContext("routes", "create_route", az.ResourceGroup, az.SubscriptionID, "") + isOperationSucceeded := false + defer func() { + mc.ObserveOperationWithResult(isOperationSucceeded) + }() + // Returns for unmanaged nodes because azure cloud provider couldn't fetch information for them. var targetIP string nodeName := string(kubeRoute.TargetNode) @@ -351,12 +358,20 @@ func (az *Cloud) CreateRoute(ctx context.Context, clusterName string, nameHint s } klog.V(2).Infof("CreateRoute: route created. clusterName=%q instance=%q cidr=%q", clusterName, kubeRoute.TargetNode, kubeRoute.DestinationCIDR) + isOperationSucceeded = true + return nil } // DeleteRoute deletes the specified managed route // Route should be as returned by ListRoutes func (az *Cloud) DeleteRoute(ctx context.Context, clusterName string, kubeRoute *cloudprovider.Route) error { + mc := metrics.NewMetricContext("routes", "delete_route", az.ResourceGroup, az.SubscriptionID, "") + isOperationSucceeded := false + defer func() { + mc.ObserveOperationWithResult(isOperationSucceeded) + }() + // Returns for unmanaged nodes because azure cloud provider couldn't fetch information for them. 
nodeName := string(kubeRoute.TargetNode) unmanaged, err := az.IsNodeUnmanaged(nodeName) @@ -392,6 +407,8 @@ func (az *Cloud) DeleteRoute(ctx context.Context, clusterName string, kubeRoute } klog.V(2).Infof("DeleteRoute: route deleted. clusterName=%q instance=%q cidr=%q", clusterName, kubeRoute.TargetNode, kubeRoute.DestinationCIDR) + isOperationSucceeded = true + return nil } diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_standard.go b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_standard.go index 174ba45e69d..07febdd4bbe 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_standard.go +++ b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_standard.go @@ -38,10 +38,10 @@ import ( "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/uuid" cloudprovider "k8s.io/cloud-provider" + "k8s.io/component-base/featuregate" "k8s.io/klog/v2" azcache "k8s.io/legacy-cloud-providers/azure/cache" - - "k8s.io/component-base/featuregate" + "k8s.io/legacy-cloud-providers/azure/metrics" utilnet "k8s.io/utils/net" ) @@ -808,6 +808,12 @@ func (as *availabilitySet) EnsureHostInPool(service *v1.Service, nodeName types. // EnsureHostsInPool ensures the given Node's primary IP configurations are // participating in the specified LoadBalancer Backend Pool. 
func (as *availabilitySet) EnsureHostsInPool(service *v1.Service, nodes []*v1.Node, backendPoolID string, vmSetName string, isInternal bool) error { + mc := metrics.NewMetricContext("services", "vmas_ensure_hosts_in_pool", as.ResourceGroup, as.SubscriptionID, service.Name) + isOperationSucceeded := false + defer func() { + mc.ObserveOperationWithResult(isOperationSucceeded) + }() + hostUpdates := make([]func() error, 0, len(nodes)) for _, node := range nodes { localNodeName := node.Name @@ -836,6 +842,7 @@ func (as *availabilitySet) EnsureHostsInPool(service *v1.Service, nodes []*v1.No return utilerrors.Flatten(errs) } + isOperationSucceeded = true return nil } diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_vmss.go b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_vmss.go index fabd58c6804..14c350d8440 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_vmss.go +++ b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_vmss.go @@ -37,6 +37,7 @@ import ( cloudprovider "k8s.io/cloud-provider" "k8s.io/klog/v2" azcache "k8s.io/legacy-cloud-providers/azure/cache" + "k8s.io/legacy-cloud-providers/azure/metrics" utilnet "k8s.io/utils/net" ) @@ -1183,6 +1184,12 @@ func (ss *scaleSet) ensureVMSSInPool(service *v1.Service, nodes []*v1.Node, back // EnsureHostsInPool ensures the given Node's primary IP configurations are // participating in the specified LoadBalancer Backend Pool. 
func (ss *scaleSet) EnsureHostsInPool(service *v1.Service, nodes []*v1.Node, backendPoolID string, vmSetName string, isInternal bool) error { + mc := metrics.NewMetricContext("services", "vmss_ensure_hosts_in_pool", ss.ResourceGroup, ss.SubscriptionID, service.Name) + isOperationSucceeded := false + defer func() { + mc.ObserveOperationWithResult(isOperationSucceeded) + }() + hostUpdates := make([]func() error, 0, len(nodes)) nodeUpdates := make(map[vmssMetaInfo]map[string]compute.VirtualMachineScaleSetVM) errors := make([]error, 0) @@ -1281,6 +1288,7 @@ func (ss *scaleSet) EnsureHostsInPool(service *v1.Service, nodes []*v1.Node, bac return err } + isOperationSucceeded = true return nil } @@ -1484,6 +1492,12 @@ func (ss *scaleSet) EnsureBackendPoolDeleted(service *v1.Service, backendPoolID, return nil } + mc := metrics.NewMetricContext("services", "vmss_ensure_backend_pool_deleted", ss.ResourceGroup, ss.SubscriptionID, service.Name) + isOperationSucceeded := false + defer func() { + mc.ObserveOperationWithResult(isOperationSucceeded) + }() + ipConfigurationIDs := []string{} for _, backendPool := range *backendAddressPools { if strings.EqualFold(*backendPool.ID, backendPoolID) && backendPool.BackendIPConfigurations != nil { @@ -1582,5 +1596,6 @@ func (ss *scaleSet) EnsureBackendPoolDeleted(service *v1.Service, backendPoolID, return err } + isOperationSucceeded = true return nil } diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/metrics/azure_metrics.go b/staging/src/k8s.io/legacy-cloud-providers/azure/metrics/azure_metrics.go index 93c51daa87c..34a37669e37 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/azure/metrics/azure_metrics.go +++ b/staging/src/k8s.io/legacy-cloud-providers/azure/metrics/azure_metrics.go @@ -26,6 +26,24 @@ import ( "k8s.io/component-base/metrics/legacyregistry" ) +const ( + azureMetricsNamespace = "cloudprovider_azure" +) + +var ( + metricLabels = []string{ + "request", // API function that is being invoked + 
"resource_group", // Resource group of the resource being monitored + "subscription_id", // Subscription ID of the resource being monitored + "source", // Operation source (optional) + } + + apiMetrics = registerAPIMetrics(metricLabels...) + operationMetrics = registerOperationMetrics(metricLabels...) +) + +// apiCallMetrics is the metrics measuring the performance of a single API call +// e.g., GET, POST ... type apiCallMetrics struct { latency *metrics.HistogramVec errors *metrics.CounterVec @@ -33,16 +51,12 @@ type apiCallMetrics struct { throttledCount *metrics.CounterVec } -var ( - metricLabels = []string{ - "request", // API function that is being invoked - "resource_group", // Resource group of the resource being monitored - "subscription_id", // Subscription ID of the resource being monitored - "source", // Oeration source(optional) - } - - apiMetrics = registerAPIMetrics(metricLabels...) -) +// operationCallMetrics is the metrics measuring the performance of a whole operation +// e.g., the create / update / delete process of a loadbalancer or route. +type operationCallMetrics struct { + operationLatency *metrics.HistogramVec + operationFailureCount *metrics.CounterVec +} // MetricContext indicates the context for Azure client metrics. type MetricContext struct { @@ -79,12 +93,27 @@ func (mc *MetricContext) Observe(err error) error { return err } +// ObserveOperationWithResult observes the request latency and failed requests of an operation. +func (mc *MetricContext) ObserveOperationWithResult(isOperationSucceeded bool) { + operationMetrics.operationLatency.WithLabelValues(mc.attributes...).Observe( + time.Since(mc.start).Seconds()) + if !isOperationSucceeded { + mc.CountFailedOperation() + } +} + +// CountFailedOperation increases the number of failed operations. +func (mc *MetricContext) CountFailedOperation() { + operationMetrics.operationFailureCount.WithLabelValues(mc.attributes...).Inc() +} + // registerAPIMetrics registers the API metrics. 
func registerAPIMetrics(attributes ...string) *apiCallMetrics { metrics := &apiCallMetrics{ latency: metrics.NewHistogramVec( &metrics.HistogramOpts{ - Name: "cloudprovider_azure_api_request_duration_seconds", + Namespace: azureMetricsNamespace, + Name: "api_request_duration_seconds", Help: "Latency of an Azure API call", StabilityLevel: metrics.ALPHA, }, @@ -92,7 +121,8 @@ func registerAPIMetrics(attributes ...string) *apiCallMetrics { ), errors: metrics.NewCounterVec( &metrics.CounterOpts{ - Name: "cloudprovider_azure_api_request_errors", + Namespace: azureMetricsNamespace, + Name: "api_request_errors", Help: "Number of errors for an Azure API call", StabilityLevel: metrics.ALPHA, }, @@ -100,7 +130,8 @@ func registerAPIMetrics(attributes ...string) *apiCallMetrics { ), rateLimitedCount: metrics.NewCounterVec( &metrics.CounterOpts{ - Name: "cloudprovider_azure_api_request_ratelimited_count", + Namespace: azureMetricsNamespace, + Name: "api_request_ratelimited_count", Help: "Number of rate limited Azure API calls", StabilityLevel: metrics.ALPHA, }, @@ -108,7 +139,8 @@ func registerAPIMetrics(attributes ...string) *apiCallMetrics { ), throttledCount: metrics.NewCounterVec( &metrics.CounterOpts{ - Name: "cloudprovider_azure_api_request_throttled_count", + Namespace: azureMetricsNamespace, + Name: "api_request_throttled_count", Help: "Number of throttled Azure API calls", StabilityLevel: metrics.ALPHA, }, @@ -123,3 +155,33 @@ func registerAPIMetrics(attributes ...string) *apiCallMetrics { return metrics } + +// registerOperationMetrics registers the operation metrics. 
+func registerOperationMetrics(attributes ...string) *operationCallMetrics {
+	// Histogram of whole-operation latency, labelled by the given attributes.
+	latency := metrics.NewHistogramVec(
+		&metrics.HistogramOpts{
+			Namespace:      azureMetricsNamespace,
+			Name:           "op_duration_seconds",
+			Help:           "Latency of an Azure service operation",
+			StabilityLevel: metrics.ALPHA,
+			Buckets:        []float64{0.1, 0.2, 0.5, 1, 10, 20, 30, 40, 50, 60, 100, 200, 300},
+		},
+		attributes,
+	)
+	// Counter of operations that did not succeed, labelled the same way.
+	failures := metrics.NewCounterVec(
+		&metrics.CounterOpts{
+			Namespace:      azureMetricsNamespace,
+			Name:           "op_failure_count",
+			Help:           "Number of failed Azure service operations",
+			StabilityLevel: metrics.ALPHA,
+		},
+		attributes,
+	)
+
+	legacyregistry.MustRegister(latency)
+	legacyregistry.MustRegister(failures)
+
+	return &operationCallMetrics{operationLatency: latency, operationFailureCount: failures}
+}