Add metrics for azure service operations (route and loadbalancer).

This commit is contained in:
niqi 2020-08-20 14:38:12 +08:00 committed by qini
parent 16ea9dc6bc
commit 88c72cc3ad
6 changed files with 135 additions and 16 deletions

View File

@ -91,6 +91,7 @@ go_library(
"//staging/src/k8s.io/legacy-cloud-providers/azure/clients/vmssclient/mockvmssclient:go_default_library",
"//staging/src/k8s.io/legacy-cloud-providers/azure/clients/vmssvmclient:go_default_library",
"//staging/src/k8s.io/legacy-cloud-providers/azure/clients/vmssvmclient/mockvmssvmclient:go_default_library",
"//staging/src/k8s.io/legacy-cloud-providers/azure/metrics:go_default_library",
"//staging/src/k8s.io/legacy-cloud-providers/azure/retry:go_default_library",
"//vendor/github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-12-01/compute:go_default_library",
"//vendor/github.com/Azure/azure-sdk-for-go/services/network/mgmt/2019-06-01/network:go_default_library",

View File

@ -36,6 +36,7 @@ import (
servicehelpers "k8s.io/cloud-provider/service/helpers"
"k8s.io/klog/v2"
azcache "k8s.io/legacy-cloud-providers/azure/cache"
"k8s.io/legacy-cloud-providers/azure/metrics"
"k8s.io/legacy-cloud-providers/azure/retry"
utilnet "k8s.io/utils/net"
)
@ -157,6 +158,12 @@ func (az *Cloud) EnsureLoadBalancer(ctx context.Context, clusterName string, ser
serviceName := getServiceName(service)
klog.V(5).Infof("ensureloadbalancer(%s): START clusterName=%q", serviceName, clusterName)
mc := metrics.NewMetricContext("services", "ensure_loadbalancer", az.ResourceGroup, az.SubscriptionID, serviceName)
isOperationSucceeded := false
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded)
}()
lb, err := az.reconcileLoadBalancer(clusterName, service, nodes, true /* wantLb */)
if err != nil {
klog.Errorf("reconcileLoadBalancer(%s) failed: %v", serviceName, err)
@ -192,6 +199,8 @@ func (az *Cloud) EnsureLoadBalancer(ctx context.Context, clusterName string, ser
return nil, err
}
isOperationSucceeded = true
return lbStatus, nil
}
@ -216,6 +225,12 @@ func (az *Cloud) EnsureLoadBalancerDeleted(ctx context.Context, clusterName stri
serviceName := getServiceName(service)
klog.V(5).Infof("Delete service (%s): START clusterName=%q", serviceName, clusterName)
mc := metrics.NewMetricContext("services", "ensure_loadbalancer_deleted", az.ResourceGroup, az.SubscriptionID, serviceName)
isOperationSucceeded := false
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded)
}()
serviceIPToCleanup, err := az.findServiceIPAddress(ctx, clusterName, service, isInternal)
if err != nil && !retry.HasStatusForbiddenOrIgnoredError(err) {
return err
@ -235,6 +250,8 @@ func (az *Cloud) EnsureLoadBalancerDeleted(ctx context.Context, clusterName stri
}
klog.V(2).Infof("Delete service (%s): FINISH", serviceName)
isOperationSucceeded = true
return nil
}

View File

@ -33,6 +33,7 @@ import (
cloudprovider "k8s.io/cloud-provider"
"k8s.io/klog/v2"
azcache "k8s.io/legacy-cloud-providers/azure/cache"
"k8s.io/legacy-cloud-providers/azure/metrics"
utilnet "k8s.io/utils/net"
)
@ -282,6 +283,12 @@ func (az *Cloud) createRouteTable() error {
// route.Name will be ignored, although the cloud-provider may use nameHint
// to create a more user-meaningful name.
func (az *Cloud) CreateRoute(ctx context.Context, clusterName string, nameHint string, kubeRoute *cloudprovider.Route) error {
mc := metrics.NewMetricContext("routes", "create_route", az.ResourceGroup, az.SubscriptionID, "")
isOperationSucceeded := false
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded)
}()
// Returns for unmanaged nodes because azure cloud provider couldn't fetch information for them.
var targetIP string
nodeName := string(kubeRoute.TargetNode)
@ -351,12 +358,20 @@ func (az *Cloud) CreateRoute(ctx context.Context, clusterName string, nameHint s
}
klog.V(2).Infof("CreateRoute: route created. clusterName=%q instance=%q cidr=%q", clusterName, kubeRoute.TargetNode, kubeRoute.DestinationCIDR)
isOperationSucceeded = true
return nil
}
// DeleteRoute deletes the specified managed route
// Route should be as returned by ListRoutes
func (az *Cloud) DeleteRoute(ctx context.Context, clusterName string, kubeRoute *cloudprovider.Route) error {
mc := metrics.NewMetricContext("routes", "delete_route", az.ResourceGroup, az.SubscriptionID, "")
isOperationSucceeded := false
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded)
}()
// Returns for unmanaged nodes because azure cloud provider couldn't fetch information for them.
nodeName := string(kubeRoute.TargetNode)
unmanaged, err := az.IsNodeUnmanaged(nodeName)
@ -392,6 +407,8 @@ func (az *Cloud) DeleteRoute(ctx context.Context, clusterName string, kubeRoute
}
klog.V(2).Infof("DeleteRoute: route deleted. clusterName=%q instance=%q cidr=%q", clusterName, kubeRoute.TargetNode, kubeRoute.DestinationCIDR)
isOperationSucceeded = true
return nil
}

View File

@ -38,10 +38,10 @@ import (
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/uuid"
cloudprovider "k8s.io/cloud-provider"
"k8s.io/component-base/featuregate"
"k8s.io/klog/v2"
azcache "k8s.io/legacy-cloud-providers/azure/cache"
"k8s.io/component-base/featuregate"
"k8s.io/legacy-cloud-providers/azure/metrics"
utilnet "k8s.io/utils/net"
)
@ -808,6 +808,12 @@ func (as *availabilitySet) EnsureHostInPool(service *v1.Service, nodeName types.
// EnsureHostsInPool ensures the given Node's primary IP configurations are
// participating in the specified LoadBalancer Backend Pool.
func (as *availabilitySet) EnsureHostsInPool(service *v1.Service, nodes []*v1.Node, backendPoolID string, vmSetName string, isInternal bool) error {
mc := metrics.NewMetricContext("services", "vmas_ensure_hosts_in_pool", as.ResourceGroup, as.SubscriptionID, service.Name)
isOperationSucceeded := false
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded)
}()
hostUpdates := make([]func() error, 0, len(nodes))
for _, node := range nodes {
localNodeName := node.Name
@ -836,6 +842,7 @@ func (as *availabilitySet) EnsureHostsInPool(service *v1.Service, nodes []*v1.No
return utilerrors.Flatten(errs)
}
isOperationSucceeded = true
return nil
}

View File

@ -37,6 +37,7 @@ import (
cloudprovider "k8s.io/cloud-provider"
"k8s.io/klog/v2"
azcache "k8s.io/legacy-cloud-providers/azure/cache"
"k8s.io/legacy-cloud-providers/azure/metrics"
utilnet "k8s.io/utils/net"
)
@ -1183,6 +1184,12 @@ func (ss *scaleSet) ensureVMSSInPool(service *v1.Service, nodes []*v1.Node, back
// EnsureHostsInPool ensures the given Node's primary IP configurations are
// participating in the specified LoadBalancer Backend Pool.
func (ss *scaleSet) EnsureHostsInPool(service *v1.Service, nodes []*v1.Node, backendPoolID string, vmSetName string, isInternal bool) error {
mc := metrics.NewMetricContext("services", "vmss_ensure_hosts_in_pool", ss.ResourceGroup, ss.SubscriptionID, service.Name)
isOperationSucceeded := false
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded)
}()
hostUpdates := make([]func() error, 0, len(nodes))
nodeUpdates := make(map[vmssMetaInfo]map[string]compute.VirtualMachineScaleSetVM)
errors := make([]error, 0)
@ -1281,6 +1288,7 @@ func (ss *scaleSet) EnsureHostsInPool(service *v1.Service, nodes []*v1.Node, bac
return err
}
isOperationSucceeded = true
return nil
}
@ -1484,6 +1492,12 @@ func (ss *scaleSet) EnsureBackendPoolDeleted(service *v1.Service, backendPoolID,
return nil
}
mc := metrics.NewMetricContext("services", "vmss_ensure_backend_pool_deleted", ss.ResourceGroup, ss.SubscriptionID, service.Name)
isOperationSucceeded := false
defer func() {
mc.ObserveOperationWithResult(isOperationSucceeded)
}()
ipConfigurationIDs := []string{}
for _, backendPool := range *backendAddressPools {
if strings.EqualFold(*backendPool.ID, backendPoolID) && backendPool.BackendIPConfigurations != nil {
@ -1582,5 +1596,6 @@ func (ss *scaleSet) EnsureBackendPoolDeleted(service *v1.Service, backendPoolID,
return err
}
isOperationSucceeded = true
return nil
}

View File

@ -26,6 +26,24 @@ import (
"k8s.io/component-base/metrics/legacyregistry"
)
const (
// azureMetricsNamespace is the value supplied as the Namespace option of the
// Azure cloud provider metrics declared in this file.
azureMetricsNamespace = "cloudprovider_azure"
)
var (
// metricLabels lists the label names handed to both registerAPIMetrics and
// registerOperationMetrics.
metricLabels = []string{
"request", // API function that is being invoked
"resource_group", // Resource group of the resource being monitored
"subscription_id", // Subscription ID of the resource being monitored
"source", // Operation source(optional)
}
// apiMetrics holds the metric vectors returned by registerAPIMetrics.
apiMetrics = registerAPIMetrics(metricLabels...)
// operationMetrics holds the metric vectors returned by registerOperationMetrics.
operationMetrics = registerOperationMetrics(metricLabels...)
)
// apiCallMetrics is the metrics measuring the performance of a single API call
// e.g., GET, POST ...
type apiCallMetrics struct {
latency *metrics.HistogramVec
errors *metrics.CounterVec
@ -33,16 +51,12 @@ type apiCallMetrics struct {
throttledCount *metrics.CounterVec
}
var (
// metricLabels lists the label names handed to registerAPIMetrics.
metricLabels = []string{
"request", // API function that is being invoked
"resource_group", // Resource group of the resource being monitored
"subscription_id", // Subscription ID of the resource being monitored
"source", // Operation source(optional)
}
// apiMetrics holds the metric vectors returned by registerAPIMetrics.
apiMetrics = registerAPIMetrics(metricLabels...)
)
// operationCallMetrics is the metrics measuring the performance of a whole operation
// e.g., the create / update / delete process of a loadbalancer or route.
type operationCallMetrics struct {
// operationLatency is the histogram that ObserveOperationWithResult feeds
// with the elapsed time of an operation, in seconds.
operationLatency *metrics.HistogramVec
// operationFailureCount is the counter that CountFailedOperation increments.
operationFailureCount *metrics.CounterVec
}
// MetricContext indicates the context for Azure client metrics.
type MetricContext struct {
@ -79,12 +93,27 @@ func (mc *MetricContext) Observe(err error) error {
return err
}
// ObserveOperationWithResult records the time elapsed since mc.start, in
// seconds, in the operation latency histogram under this context's label
// values. When isOperationSucceeded is false it also counts the operation as
// failed via CountFailedOperation.
func (mc *MetricContext) ObserveOperationWithResult(isOperationSucceeded bool) {
	// Resolve the labelled observer first, then take the timestamp, so the
	// evaluation order matches a single chained call.
	latencyObserver := operationMetrics.operationLatency.WithLabelValues(mc.attributes...)
	latencyObserver.Observe(time.Since(mc.start).Seconds())

	if isOperationSucceeded {
		return
	}
	mc.CountFailedOperation()
}
// CountFailedOperation increments the failed-operation counter for the label
// values carried by this MetricContext.
func (mc *MetricContext) CountFailedOperation() {
	failureCounter := operationMetrics.operationFailureCount.WithLabelValues(mc.attributes...)
	failureCounter.Inc()
}
// registerAPIMetrics registers the API metrics.
func registerAPIMetrics(attributes ...string) *apiCallMetrics {
metrics := &apiCallMetrics{
latency: metrics.NewHistogramVec(
&metrics.HistogramOpts{
Name: "cloudprovider_azure_api_request_duration_seconds",
Namespace: azureMetricsNamespace,
Name: "api_request_duration_seconds",
Help: "Latency of an Azure API call",
StabilityLevel: metrics.ALPHA,
},
@ -92,7 +121,8 @@ func registerAPIMetrics(attributes ...string) *apiCallMetrics {
),
errors: metrics.NewCounterVec(
&metrics.CounterOpts{
Name: "cloudprovider_azure_api_request_errors",
Namespace: azureMetricsNamespace,
Name: "api_request_errors",
Help: "Number of errors for an Azure API call",
StabilityLevel: metrics.ALPHA,
},
@ -100,7 +130,8 @@ func registerAPIMetrics(attributes ...string) *apiCallMetrics {
),
rateLimitedCount: metrics.NewCounterVec(
&metrics.CounterOpts{
Name: "cloudprovider_azure_api_request_ratelimited_count",
Namespace: azureMetricsNamespace,
Name: "api_request_ratelimited_count",
Help: "Number of rate limited Azure API calls",
StabilityLevel: metrics.ALPHA,
},
@ -108,7 +139,8 @@ func registerAPIMetrics(attributes ...string) *apiCallMetrics {
),
throttledCount: metrics.NewCounterVec(
&metrics.CounterOpts{
Name: "cloudprovider_azure_api_request_throttled_count",
Namespace: azureMetricsNamespace,
Name: "api_request_throttled_count",
Help: "Number of throttled Azure API calls",
StabilityLevel: metrics.ALPHA,
},
@ -123,3 +155,33 @@ func registerAPIMetrics(attributes ...string) *apiCallMetrics {
return metrics
}
// registerOperationMetrics builds the operation latency histogram and the
// operation failure counter, both labelled with the given attributes,
// registers them with the legacy registry and returns them.
func registerOperationMetrics(attributes ...string) *operationCallMetrics {
	latencyOpts := &metrics.HistogramOpts{
		Namespace:      azureMetricsNamespace,
		Name:           "op_duration_seconds",
		Help:           "Latency of an Azure service operation",
		StabilityLevel: metrics.ALPHA,
		Buckets:        []float64{0.1, 0.2, 0.5, 1, 10, 20, 30, 40, 50, 60, 100, 200, 300},
	}
	failureOpts := &metrics.CounterOpts{
		Namespace:      azureMetricsNamespace,
		Name:           "op_failure_count",
		Help:           "Number of failed Azure service operations",
		StabilityLevel: metrics.ALPHA,
	}

	// The local is deliberately not called "metrics" so the imported package
	// of that name stays visible for the whole function body.
	opMetrics := &operationCallMetrics{
		operationLatency:      metrics.NewHistogramVec(latencyOpts, attributes),
		operationFailureCount: metrics.NewCounterVec(failureOpts, attributes),
	}

	// Register in the same order as the fields are declared: latency first,
	// then the failure counter.
	legacyregistry.MustRegister(opMetrics.operationLatency)
	legacyregistry.MustRegister(opMetrics.operationFailureCount)

	return opMetrics
}