From f200f9a1e8b54c11420183b5bd403cc430324c1e Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Fri, 26 May 2017 14:13:41 -0700 Subject: [PATCH 01/13] Azure cloudprovider retry using flowcontrol An initial attempt at engaging exponential backoff for API error responses. Uses k8s.io/client-go/util/flowcontrol; implementation inspired by GCE cloudprovider backoff. --- pkg/cloudprovider/providers/azure/azure.go | 25 +-- .../providers/azure/azure_backoff.go | 149 ++++++++++++++++++ .../providers/azure/azure_loadbalancer.go | 61 ++++++- .../providers/azure/azure_routes.go | 24 ++- .../providers/azure/azure_storage.go | 16 +- 5 files changed, 252 insertions(+), 23 deletions(-) create mode 100644 pkg/cloudprovider/providers/azure/azure_backoff.go diff --git a/pkg/cloudprovider/providers/azure/azure.go b/pkg/cloudprovider/providers/azure/azure.go index 7694693e261..5ba1fba24ef 100644 --- a/pkg/cloudprovider/providers/azure/azure.go +++ b/pkg/cloudprovider/providers/azure/azure.go @@ -22,6 +22,7 @@ import ( "io/ioutil" "time" + "k8s.io/client-go/util/flowcontrol" "k8s.io/kubernetes/pkg/cloudprovider" "k8s.io/kubernetes/pkg/controller" "k8s.io/kubernetes/pkg/version" @@ -74,16 +75,17 @@ type Config struct { // Cloud holds the config and clients type Cloud struct { Config - Environment azure.Environment - RoutesClient network.RoutesClient - SubnetsClient network.SubnetsClient - InterfacesClient network.InterfacesClient - RouteTablesClient network.RouteTablesClient - LoadBalancerClient network.LoadBalancersClient - PublicIPAddressesClient network.PublicIPAddressesClient - SecurityGroupsClient network.SecurityGroupsClient - VirtualMachinesClient compute.VirtualMachinesClient - StorageAccountClient storage.AccountsClient + Environment azure.Environment + RoutesClient network.RoutesClient + SubnetsClient network.SubnetsClient + InterfacesClient network.InterfacesClient + RouteTablesClient network.RouteTablesClient + LoadBalancerClient network.LoadBalancersClient + PublicIPAddressesClient network.PublicIPAddressesClient + SecurityGroupsClient network.SecurityGroupsClient + VirtualMachinesClient compute.VirtualMachinesClient + StorageAccountClient storage.AccountsClient + operationPollRateLimiter flowcontrol.RateLimiter } func init() { @@ -177,6 +179,9 @@ func NewCloud(configReader io.Reader) (cloudprovider.Interface, error) { az.StorageAccountClient = storage.NewAccountsClientWithBaseURI(az.Environment.ResourceManagerEndpoint, az.SubscriptionID) az.StorageAccountClient.Authorizer = servicePrincipalToken + // 1 qps, up to 5 burst when in flowcontrol; i.e., aggressive backoff enforcement + az.operationPollRateLimiter = flowcontrol.NewTokenBucketRateLimiter(1, 5) + return &az, nil } diff --git a/pkg/cloudprovider/providers/azure/azure_backoff.go b/pkg/cloudprovider/providers/azure/azure_backoff.go new file mode 100644 index 00000000000..7847e63c19a --- /dev/null +++ b/pkg/cloudprovider/providers/azure/azure_backoff.go @@ -0,0 +1,149 @@ +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package azure + +import ( + "time" + + "k8s.io/apimachinery/pkg/util/wait" + + "github.com/Azure/azure-sdk-for-go/arm/compute" + "github.com/Azure/azure-sdk-for-go/arm/network" + "github.com/Azure/go-autorest/autorest" +) + +const ( + operationPollInterval = 3 * time.Second + operationPollTimeoutDuration = time.Hour +) + +// CreateOrUpdateSGWithRetry invokes az.SecurityGroupsClient.CreateOrUpdate with exponential backoff retry +func (az *Cloud) CreateOrUpdateSGWithRetry(sg network.SecurityGroup) error { + return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) { + resp, err := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *sg.Name, sg, nil) + return processRetryResponse(resp, err) + }) +} + +// CreateOrUpdateLBWithRetry invokes az.LoadBalancerClient.CreateOrUpdate with exponential backoff retry +func (az *Cloud) CreateOrUpdateLBWithRetry(lb network.LoadBalancer) error { + return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) { + resp, err := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) + return processRetryResponse(resp, err) + }) +} + +// CreateOrUpdatePIPWithRetry invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry +func (az *Cloud) CreateOrUpdatePIPWithRetry(pip network.PublicIPAddress) error { + return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) { + resp, err := az.PublicIPAddressesClient.CreateOrUpdate(az.ResourceGroup, *pip.Name, pip, nil) + return processRetryResponse(resp, err) + }) +} + +// CreateOrUpdateInterfaceWithRetry invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry +func (az *Cloud) CreateOrUpdateInterfaceWithRetry(nic network.Interface) error { + return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) { + resp, err := az.InterfacesClient.CreateOrUpdate(az.ResourceGroup, *nic.Name, nic, nil) + return processRetryResponse(resp, err) + }) +} + +// DeletePublicIPWithRetry invokes az.PublicIPAddressesClient.Delete with exponential backoff retry +func (az *Cloud) DeletePublicIPWithRetry(pipName string) error { + return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) { + resp, err := az.PublicIPAddressesClient.Delete(az.ResourceGroup, pipName, nil) + return processRetryResponse(resp, err) + }) +} + +// DeleteLBWithRetry invokes az.LoadBalancerClient.Delete with exponential backoff retry +func (az *Cloud) DeleteLBWithRetry(lbName string) error { + return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) { + resp, err := az.LoadBalancerClient.Delete(az.ResourceGroup, lbName, nil) + return processRetryResponse(resp, err) + }) +} + +// CreateOrUpdateRouteTableWithRetry invokes az.RouteTablesClient.CreateOrUpdate with exponential backoff retry +func (az *Cloud) CreateOrUpdateRouteTableWithRetry(routeTable network.RouteTable) error { + return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) { + resp, err := az.RouteTablesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, routeTable, nil) + return processRetryResponse(resp, err) + }) +} + +// CreateOrUpdateRouteWithRetry invokes az.RoutesClient.CreateOrUpdate with exponential backoff retry +func (az *Cloud) CreateOrUpdateRouteWithRetry(route network.Route) error { + return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) { + resp, err := az.RoutesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, *route.Name, route, nil) + return processRetryResponse(resp, err) + }) +} + +// DeleteRouteWithRetry invokes az.RoutesClient.Delete with exponential backoff retry +func (az *Cloud) DeleteRouteWithRetry(routeName string) error { + return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) { + resp, err := az.RoutesClient.Delete(az.ResourceGroup, az.RouteTableName, routeName, nil) + return processRetryResponse(resp, err) + }) +} + +// CreateOrUpdateVMWithRetry invokes az.VirtualMachinesClient.CreateOrUpdate with exponential backoff retry +func (az *Cloud) CreateOrUpdateVMWithRetry(vmName string, newVM compute.VirtualMachine) error { + return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) { + resp, err := az.VirtualMachinesClient.CreateOrUpdate(az.ResourceGroup, vmName, newVM, nil) + return processRetryResponse(resp, err) + }) +} + +// An in-progress convenience function to deal with common HTTP backoff response conditions +func processRetryResponse(resp autorest.Response, err error) (bool, error) { + if isSuccessHTTPResponse(resp) { + return true, nil + } + if shouldRetryAPIRequest(resp, err) { + return false, err + } + // TODO determine the complete set of short-circuit conditions + if err != nil { + return false, err + } + // Fall-through: stop periodic backoff, return error object from most recent request + return true, err +} + +// shouldRetryAPIRequest determines if the response from an HTTP request suggests periodic retry behavior +func shouldRetryAPIRequest(resp autorest.Response, err error) bool { + // TODO determine the complete set of retry conditions + if err != nil { + return true + } + if resp.StatusCode == 429 || resp.StatusCode == 500 { + return true + } + return false +} + +// isSuccessHTTPResponse determines if the response from an HTTP request suggests success +func isSuccessHTTPResponse(resp autorest.Response) bool { + // TODO determine the complete set of success conditions + if resp.StatusCode == 200 || resp.StatusCode == 201 || resp.StatusCode == 202 { + return true + } + return false +} diff --git a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go index 940723d2a46..a7cbeb742a5 100644 --- a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go +++ b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go @@ -149,7 +149,13 @@ func (az *Cloud) EnsureLoadBalancer(clusterName string, service *v1.Service, nod // to nil. This is a workaround until https://github.com/Azure/go-autorest/issues/112 is fixed sg.SecurityGroupPropertiesFormat.NetworkInterfaces = nil sg.SecurityGroupPropertiesFormat.Subnets = nil - _, err := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *sg.Name, sg, nil) + resp, err := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *sg.Name, sg, nil) + if shouldRetryAPIRequest(resp, err) { + retryErr := az.CreateOrUpdateSGWithRetry(sg) + if retryErr != nil { + return nil, retryErr + } + } if err != nil { return nil, err } @@ -219,7 +225,13 @@ func (az *Cloud) EnsureLoadBalancer(clusterName string, service *v1.Service, nod } if !existsLb || lbNeedsUpdate { glog.V(3).Infof("ensure(%s): lb(%s) - updating", serviceName, lbName) - _, err = az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) + resp, err := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) + if shouldRetryAPIRequest(resp, err) { + retryErr := az.CreateOrUpdateLBWithRetry(lb) + if retryErr != nil { + return nil, retryErr + } + } if err != nil { return nil, err } @@ -310,7 +322,13 @@ func (az *Cloud) EnsureLoadBalancerDeleted(clusterName string, service *v1.Servi // to nil. This is a workaround until https://github.com/Azure/go-autorest/issues/112 is fixed sg.SecurityGroupPropertiesFormat.NetworkInterfaces = nil sg.SecurityGroupPropertiesFormat.Subnets = nil - _, err := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *reconciledSg.Name, reconciledSg, nil) + resp, err := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *reconciledSg.Name, reconciledSg, nil) + if shouldRetryAPIRequest(resp, err) { + retryErr := az.CreateOrUpdateSGWithRetry(reconciledSg) + if retryErr != nil { + return retryErr + } + } if err != nil { return err } @@ -339,14 +357,26 @@ func (az *Cloud) cleanupLoadBalancer(clusterName string, service *v1.Service, is if lbNeedsUpdate { if len(*lb.FrontendIPConfigurations) > 0 { glog.V(3).Infof("delete(%s): lb(%s) - updating", serviceName, lbName) - _, err = az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) + resp, err := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) + if shouldRetryAPIRequest(resp, err) { + retryErr := az.CreateOrUpdateLBWithRetry(lb) + if retryErr != nil { + return retryErr + } + } if err != nil { return err } } else { glog.V(3).Infof("delete(%s): lb(%s) - deleting; no remaining frontendipconfigs", serviceName, lbName) - _, err = az.LoadBalancerClient.Delete(az.ResourceGroup, lbName, nil) + resp, err := az.LoadBalancerClient.Delete(az.ResourceGroup, lbName, nil) + if shouldRetryAPIRequest(resp, err) { + retryErr := az.DeleteLBWithRetry(lbName) + if retryErr != nil { + return retryErr + } + } if err != nil { return err } @@ -392,7 +422,13 @@ func (az *Cloud) ensurePublicIPExists(serviceName, pipName string) (*network.Pub pip.Tags = &map[string]*string{"service": &serviceName} glog.V(3).Infof("ensure(%s): pip(%s) - creating", serviceName, *pip.Name) - _, err = az.PublicIPAddressesClient.CreateOrUpdate(az.ResourceGroup, *pip.Name, pip, nil) + resp, err := az.PublicIPAddressesClient.CreateOrUpdate(az.ResourceGroup, *pip.Name, pip, nil) + if shouldRetryAPIRequest(resp, err) { + retryErr := az.CreateOrUpdatePIPWithRetry(pip) + if retryErr != nil { + return nil, retryErr + } + } if err != nil { return nil, err } @@ -407,7 +443,10 @@ func (az *Cloud) ensurePublicIPExists(serviceName, pipName string) (*network.Pub } func (az *Cloud) ensurePublicIPDeleted(serviceName, pipName string) error { - _, deleteErr := az.PublicIPAddressesClient.Delete(az.ResourceGroup, pipName, nil) + resp, deleteErr := az.PublicIPAddressesClient.Delete(az.ResourceGroup, pipName, nil) + if shouldRetryAPIRequest(resp, deleteErr) { + deleteErr = az.DeletePublicIPWithRetry(pipName) + } _, realErr := checkResourceExistsFromError(deleteErr) if realErr != nil { return nil @@ -848,7 +887,13 @@ func (az *Cloud) ensureHostInPool(serviceName string, nodeName types.NodeName, b primaryIPConfig.LoadBalancerBackendAddressPools = &newBackendPools glog.V(3).Infof("nicupdate(%s): nic(%s) - updating", serviceName, nicName) - _, err := az.InterfacesClient.CreateOrUpdate(az.ResourceGroup, *nic.Name, nic, nil) + resp, err := az.InterfacesClient.CreateOrUpdate(az.ResourceGroup, *nic.Name, nic, nil) + if shouldRetryAPIRequest(resp, err) { + retryErr := az.CreateOrUpdateInterfaceWithRetry(nic) + if retryErr != nil { + return retryErr + } + } if err != nil { return err } diff --git a/pkg/cloudprovider/providers/azure/azure_routes.go b/pkg/cloudprovider/providers/azure/azure_routes.go index aab962f7088..ee8832d606c 100644 --- a/pkg/cloudprovider/providers/azure/azure_routes.go +++ b/pkg/cloudprovider/providers/azure/azure_routes.go @@ -76,7 +76,13 @@ func (az *Cloud) CreateRoute(clusterName string, nameHint string, kubeRoute *clo } glog.V(3).Infof("create: creating routetable. routeTableName=%q", az.RouteTableName) - _, err = az.RouteTablesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, routeTable, nil) + resp, err := az.RouteTablesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, routeTable, nil) + if shouldRetryAPIRequest(resp, err) { + retryErr := az.CreateOrUpdateRouteTableWithRetry(routeTable) + if retryErr != nil { + return retryErr + } + } if err != nil { return err } @@ -103,7 +109,13 @@ func (az *Cloud) CreateRoute(clusterName string, nameHint string, kubeRoute *clo } glog.V(3).Infof("create: creating route: instance=%q cidr=%q", kubeRoute.TargetNode, kubeRoute.DestinationCIDR) - _, err = az.RoutesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, *route.Name, route, nil) + resp, err := az.RoutesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, *route.Name, route, nil) + if shouldRetryAPIRequest(resp, err) { + retryErr := az.CreateOrUpdateRouteWithRetry(route) + if retryErr != nil { + return retryErr + } + } if err != nil { return err } @@ -118,7 +130,13 @@ func (az *Cloud) DeleteRoute(clusterName string, kubeRoute *cloudprovider.Route) glog.V(2).Infof("delete: deleting route. clusterName=%q instance=%q cidr=%q", clusterName, kubeRoute.TargetNode, kubeRoute.DestinationCIDR) routeName := mapNodeNameToRouteName(kubeRoute.TargetNode) - _, err := az.RoutesClient.Delete(az.ResourceGroup, az.RouteTableName, routeName, nil) + resp, err := az.RoutesClient.Delete(az.ResourceGroup, az.RouteTableName, routeName, nil) + if shouldRetryAPIRequest(resp, err) { + retryErr := az.DeleteRouteWithRetry(routeName) + if retryErr != nil { + return retryErr + } + } if err != nil { return err } diff --git a/pkg/cloudprovider/providers/azure/azure_storage.go b/pkg/cloudprovider/providers/azure/azure_storage.go index abaae7c4019..5a7a6cb2130 100644 --- a/pkg/cloudprovider/providers/azure/azure_storage.go +++ b/pkg/cloudprovider/providers/azure/azure_storage.go @@ -64,7 +64,13 @@ func (az *Cloud) AttachDisk(diskName, diskURI string, nodeName types.NodeName, l }, } vmName := mapNodeNameToVMName(nodeName) - _, err = az.VirtualMachinesClient.CreateOrUpdate(az.ResourceGroup, vmName, newVM, nil) + resp, err := az.VirtualMachinesClient.CreateOrUpdate(az.ResourceGroup, vmName, newVM, nil) + if shouldRetryAPIRequest(resp, err) { + retryErr := az.CreateOrUpdateVMWithRetry(vmName, newVM) + if retryErr != nil { + return retryErr + } + } if err != nil { glog.Errorf("azure attach failed, err: %v", err) detail := err.Error() @@ -135,7 +141,13 @@ func (az *Cloud) DetachDiskByName(diskName, diskURI string, nodeName types.NodeN }, } vmName := mapNodeNameToVMName(nodeName) - _, err = az.VirtualMachinesClient.CreateOrUpdate(az.ResourceGroup, vmName, newVM, nil) + resp, err := az.VirtualMachinesClient.CreateOrUpdate(az.ResourceGroup, vmName, newVM, nil) + if shouldRetryAPIRequest(resp, err) { + retryErr := az.CreateOrUpdateVMWithRetry(vmName, newVM) + if retryErr != nil { + return retryErr + } + } if err != nil { glog.Errorf("azure disk detach failed, err: %v", err) } else { From c6c6cc790e07a010c0e68bcd4d135568f54c5fd5 Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Wed, 31 May 2017 11:53:02 -0700 Subject: [PATCH 02/13] errata, wait.ExponentialBackoff, regex HTTP codes - corrected Copyright copy/paste - now actually implementing exponential backoff instead of regular interval retries - using more general HTTP response code success/failure determination (e.g., 5xx for retry) - net/http constants ftw --- .../providers/azure/azure_backoff.go | 68 ++++++++++++++----- 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/pkg/cloudprovider/providers/azure/azure_backoff.go b/pkg/cloudprovider/providers/azure/azure_backoff.go index 7847e63c19a..8b51b619427 100644 --- a/pkg/cloudprovider/providers/azure/azure_backoff.go +++ b/pkg/cloudprovider/providers/azure/azure_backoff.go @@ -1,5 +1,5 @@ /* -Copyright 2016 The Kubernetes Authors. +Copyright 2017 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,6 +17,9 @@ limitations under the License. package azure import ( + "net/http" + "regexp" + "strconv" "time" "k8s.io/apimachinery/pkg/util/wait" @@ -29,11 +32,23 @@ import ( const ( operationPollInterval = 3 * time.Second operationPollTimeoutDuration = time.Hour + backoffRetries = 12 + backoffExponent = 2 + backoffDuration = 1 * time.Second + backoffJitter = 1.0 ) +var azAPIBackoff = wait.Backoff{ + Steps: backoffRetries, + Factor: backoffExponent, + Duration: backoffDuration, + Jitter: backoffJitter, +} + // CreateOrUpdateSGWithRetry invokes az.SecurityGroupsClient.CreateOrUpdate with exponential backoff retry -func (az *Cloud) CreateOrUpdateSGWithRetry(sg network.SecurityGroup) error { - return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) { +func (az *Cloud) CreateOrUpdateSGWithRetry(sg network.SecurityGroup, delay time.Duration) error { + return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + az.operationPollRateLimiter.Accept() resp, err := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *sg.Name, sg, nil) return processRetryResponse(resp, err) }) @@ -41,7 +56,8 @@ func (az *Cloud) CreateOrUpdateSGWithRetry(sg network.SecurityGroup) error { // CreateOrUpdateLBWithRetry invokes az.LoadBalancerClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateLBWithRetry(lb network.LoadBalancer) error { - return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) { + return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + az.operationPollRateLimiter.Accept() resp, err := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) return processRetryResponse(resp, err) }) @@ -49,7 +65,8 @@ func (az *Cloud) CreateOrUpdateLBWithRetry(lb network.LoadBalancer) error { // CreateOrUpdatePIPWithRetry invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdatePIPWithRetry(pip network.PublicIPAddress) error { - return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) { + return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + az.operationPollRateLimiter.Accept() resp, err := az.PublicIPAddressesClient.CreateOrUpdate(az.ResourceGroup, *pip.Name, pip, nil) return processRetryResponse(resp, err) }) @@ -57,7 +74,8 @@ func (az *Cloud) CreateOrUpdatePIPWithRetry(pip network.PublicIPAddress) error { // CreateOrUpdateInterfaceWithRetry invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateInterfaceWithRetry(nic network.Interface) error { - return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) { + return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + az.operationPollRateLimiter.Accept() resp, err := az.InterfacesClient.CreateOrUpdate(az.ResourceGroup, *nic.Name, nic, nil) return processRetryResponse(resp, err) }) @@ -65,7 +83,8 @@ func (az *Cloud) CreateOrUpdateInterfaceWithRetry(nic network.Interface) error { // DeletePublicIPWithRetry invokes az.PublicIPAddressesClient.Delete with exponential backoff retry func (az *Cloud) DeletePublicIPWithRetry(pipName string) error { - return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) { + return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + az.operationPollRateLimiter.Accept() resp, err := az.PublicIPAddressesClient.Delete(az.ResourceGroup, pipName, nil) return processRetryResponse(resp, err) }) @@ -73,7 +92,8 @@ func (az *Cloud) DeletePublicIPWithRetry(pipName string) error { // DeleteLBWithRetry invokes az.LoadBalancerClient.Delete with exponential backoff retry func (az *Cloud) DeleteLBWithRetry(lbName string) error { - return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) { + return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + az.operationPollRateLimiter.Accept() resp, err := az.LoadBalancerClient.Delete(az.ResourceGroup, lbName, nil) return processRetryResponse(resp, err) }) @@ -81,7 +101,8 @@ func (az *Cloud) DeleteLBWithRetry(lbName string) error { // CreateOrUpdateRouteTableWithRetry invokes az.RouteTablesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateRouteTableWithRetry(routeTable network.RouteTable) error { - return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) { + return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + az.operationPollRateLimiter.Accept() resp, err := az.RouteTablesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, routeTable, nil) return processRetryResponse(resp, err) }) @@ -89,7 +110,8 @@ func (az *Cloud) CreateOrUpdateRouteTableWithRetry(routeTable network.RouteTable // CreateOrUpdateRouteWithRetry invokes az.RoutesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateRouteWithRetry(route network.Route) error { - return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) { + return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + az.operationPollRateLimiter.Accept() resp, err := az.RoutesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, *route.Name, route, nil) return processRetryResponse(resp, err) }) @@ -97,7 +119,8 @@ func (az *Cloud) CreateOrUpdateRouteWithRetry(route network.Route) error { // DeleteRouteWithRetry invokes az.RoutesClient.Delete with exponential backoff retry func (az *Cloud) DeleteRouteWithRetry(routeName string) error { - return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) { + return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + az.operationPollRateLimiter.Accept() resp, err := az.RoutesClient.Delete(az.ResourceGroup, az.RouteTableName, routeName, nil) return processRetryResponse(resp, err) }) @@ -105,7 +128,8 @@ func (az *Cloud) DeleteRouteWithRetry(routeName string) error { // CreateOrUpdateVMWithRetry invokes az.VirtualMachinesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateVMWithRetry(vmName string, newVM compute.VirtualMachine) error { - return wait.Poll(operationPollInterval, operationPollTimeoutDuration, func() (bool, error) { + return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + az.operationPollRateLimiter.Accept() resp, err := az.VirtualMachinesClient.CreateOrUpdate(az.ResourceGroup, vmName, newVM, nil) return processRetryResponse(resp, err) }) @@ -129,11 +153,19 @@ func processRetryResponse(resp autorest.Response, err error) (bool, error) { // shouldRetryAPIRequest determines if the response from an HTTP request suggests periodic retry behavior func shouldRetryAPIRequest(resp autorest.Response, err error) bool { - // TODO determine the complete set of retry conditions if err != nil { return true } - if resp.StatusCode == 429 || resp.StatusCode == 500 { + // HTTP 429 suggests we should retry + if resp.StatusCode == http.StatusTooManyRequests { + return true + } + // HTTP 5xx suggests we should retry + r, err := regexp.Compile(`^5\d\d$`) + if err != nil { + return false + } + if r.MatchString(strconv.Itoa(resp.StatusCode)) { return true } return false @@ -141,8 +173,12 @@ func shouldRetryAPIRequest(resp autorest.Response, err error) bool { // isSuccessHTTPResponse determines if the response from an HTTP request suggests success func isSuccessHTTPResponse(resp autorest.Response) bool { - // TODO determine the complete set of success conditions - if resp.StatusCode == 200 || resp.StatusCode == 201 || resp.StatusCode == 202 { + // HTTP 2xx suggests a successful response + r, err := regexp.Compile(`^2\d\d$`) + if err != nil { + return false + } + if r.MatchString(strconv.Itoa(resp.StatusCode)) { return true } return false From c95af0615435c242ea69096399bc943775395985 Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Wed, 31 May 2017 12:03:22 -0700 Subject: [PATCH 03/13] errata arg cruft in CreateOrUpdateSGWithRetry function declaration --- pkg/cloudprovider/providers/azure/azure_backoff.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/cloudprovider/providers/azure/azure_backoff.go b/pkg/cloudprovider/providers/azure/azure_backoff.go index 8b51b619427..97bf84fa868 100644 --- a/pkg/cloudprovider/providers/azure/azure_backoff.go +++ b/pkg/cloudprovider/providers/azure/azure_backoff.go @@ -46,7 +46,7 @@ var azAPIBackoff = wait.Backoff{ } // CreateOrUpdateSGWithRetry invokes az.SecurityGroupsClient.CreateOrUpdate with exponential backoff retry -func (az *Cloud) CreateOrUpdateSGWithRetry(sg network.SecurityGroup, delay time.Duration) error { +func (az *Cloud) CreateOrUpdateSGWithRetry(sg network.SecurityGroup) error { return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *sg.Name, sg, nil) From 17f8dc53af410ab57a33993ad66188eb283bd675 Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Thu, 1 Jun 2017 13:58:11 -0700 Subject: [PATCH 04/13] two optimizations - removed unnecessary return statements - optimized HTTP response code evaluations as numeric comparisons --- .../providers/azure/azure_backoff.go | 22 ++++--------------- .../providers/azure/azure_loadbalancer.go | 8 +++---- .../providers/azure/azure_routes.go | 6 ++--- .../providers/azure/azure_storage.go | 4 ++-- 4 files changed, 13 insertions(+), 27 deletions(-) diff --git a/pkg/cloudprovider/providers/azure/azure_backoff.go b/pkg/cloudprovider/providers/azure/azure_backoff.go index 97bf84fa868..812de4d9fae 100644 --- a/pkg/cloudprovider/providers/azure/azure_backoff.go +++ b/pkg/cloudprovider/providers/azure/azure_backoff.go @@ -17,9 +17,6 @@ limitations under the License. package azure import ( - "net/http" - "regexp" - "strconv" "time" "k8s.io/apimachinery/pkg/util/wait" @@ -153,19 +150,12 @@ func processRetryResponse(resp autorest.Response, err error) (bool, error) { // shouldRetryAPIRequest determines if the response from an HTTP request suggests periodic retry behavior func shouldRetryAPIRequest(resp autorest.Response, err error) bool { + // non-nil error from HTTP request suggests we should retry if err != nil { return true } - // HTTP 429 suggests we should retry - if resp.StatusCode == http.StatusTooManyRequests { - return true - } - // HTTP 5xx suggests we should retry - r, err := regexp.Compile(`^5\d\d$`) - if err != nil { - return false - } - if r.MatchString(strconv.Itoa(resp.StatusCode)) { + // HTTP 4xx or 5xx suggests we should retry + if 399 < resp.StatusCode && resp.StatusCode < 600 { return true } return false @@ -174,11 +164,7 @@ func shouldRetryAPIRequest(resp autorest.Response, err error) bool { // isSuccessHTTPResponse determines if the response from an HTTP request suggests success func isSuccessHTTPResponse(resp autorest.Response) bool { // HTTP 2xx suggests a successful response - r, err := regexp.Compile(`^2\d\d$`) - if err != nil { - return false - } - if r.MatchString(strconv.Itoa(resp.StatusCode)) { + if 199 < resp.StatusCode && resp.StatusCode < 300 { return true } return false diff --git a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go index a7cbeb742a5..2aba67d2f3a 100644 --- a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go +++ b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go @@ -326,7 +326,7 @@ func (az *Cloud) EnsureLoadBalancerDeleted(clusterName string, service *v1.Servi if shouldRetryAPIRequest(resp, err) { retryErr := az.CreateOrUpdateSGWithRetry(reconciledSg) if retryErr != nil { - return retryErr + err = retryErr } } if err != nil { @@ -361,7 +361,7 @@ func (az *Cloud) cleanupLoadBalancer(clusterName string, service *v1.Service, is if shouldRetryAPIRequest(resp, err) { retryErr := az.CreateOrUpdateLBWithRetry(lb) if retryErr != nil { - return retryErr + err = retryErr } } if err != nil { @@ -374,7 +374,7 @@ func (az *Cloud) cleanupLoadBalancer(clusterName string, service *v1.Service, is if shouldRetryAPIRequest(resp, err) { retryErr := az.DeleteLBWithRetry(lbName) if retryErr != nil { - return retryErr + err = retryErr } } if err != nil { @@ -891,7 +891,7 @@ func (az *Cloud) ensureHostInPool(serviceName string, nodeName types.NodeName, b if shouldRetryAPIRequest(resp, err) { retryErr := az.CreateOrUpdateInterfaceWithRetry(nic) if retryErr != nil { - return retryErr + err = retryErr } } if err != nil { diff --git a/pkg/cloudprovider/providers/azure/azure_routes.go b/pkg/cloudprovider/providers/azure/azure_routes.go index ee8832d606c..37c42a9b600 100644 --- a/pkg/cloudprovider/providers/azure/azure_routes.go +++ b/pkg/cloudprovider/providers/azure/azure_routes.go @@ -80,7 +80,7 @@ func (az *Cloud) CreateRoute(clusterName string, nameHint string, kubeRoute *clo if shouldRetryAPIRequest(resp, err) { retryErr := az.CreateOrUpdateRouteTableWithRetry(routeTable) if retryErr != nil { - return retryErr + err = retryErr } } if err != nil { @@ -113,7 +113,7 @@ func (az *Cloud) CreateRoute(clusterName string, nameHint string, kubeRoute *clo if shouldRetryAPIRequest(resp, err) { retryErr := az.CreateOrUpdateRouteWithRetry(route) if retryErr != nil { - return retryErr + err = retryErr } } if err != nil { @@ -134,7 +134,7 @@ func (az *Cloud) DeleteRoute(clusterName string, kubeRoute *cloudprovider.Route) if shouldRetryAPIRequest(resp, err) { retryErr := az.DeleteRouteWithRetry(routeName) if retryErr != nil { - return retryErr + err = retryErr } } if err != nil { diff --git a/pkg/cloudprovider/providers/azure/azure_storage.go b/pkg/cloudprovider/providers/azure/azure_storage.go index 5a7a6cb2130..445ff3db22f 100644 --- a/pkg/cloudprovider/providers/azure/azure_storage.go +++ b/pkg/cloudprovider/providers/azure/azure_storage.go @@ -68,7 +68,7 @@ func (az *Cloud) AttachDisk(diskName, diskURI string, nodeName types.NodeName, l if shouldRetryAPIRequest(resp, err) { retryErr := az.CreateOrUpdateVMWithRetry(vmName, newVM) if retryErr != nil { - return retryErr + err = retryErr } } if err != nil { @@ -145,7 +145,7 @@ func (az *Cloud) DetachDiskByName(diskName, diskURI string, nodeName types.NodeN if shouldRetryAPIRequest(resp, err) { retryErr := az.CreateOrUpdateVMWithRetry(vmName, newVM) if retryErr != nil { - return retryErr + err = retryErr } } if err != nil { From c5dd95fc2232077e6afd79e792503c3aa5ba4651 Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Fri, 2 Jun 2017 09:59:07 -0700 Subject: [PATCH 05/13] update-bazel.sh mods --- pkg/cloudprovider/providers/azure/BUILD | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/cloudprovider/providers/azure/BUILD b/pkg/cloudprovider/providers/azure/BUILD index 3c7707158c1..37e23ace381 100644 --- a/pkg/cloudprovider/providers/azure/BUILD +++ b/pkg/cloudprovider/providers/azure/BUILD @@ -12,6 +12,7 @@ go_library( name = "go_default_library", srcs = [ "azure.go", + "azure_backoff.go", "azure_blob.go", "azure_file.go", "azure_instances.go", @@ -44,6 +45,8 @@ go_library( "//vendor/github.com/rubiojr/go-vhd/vhd:go_default_library", "//vendor/k8s.io/apimachinery/pkg/types:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/errors:go_default_library", + "//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library", + "//vendor/k8s.io/client-go/util/flowcontrol:go_default_library", ], ) From 7e6c689e588aa8888b5a91aede173b1c3b8db089 Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Fri, 2 Jun 2017 15:35:20 -0700 Subject: [PATCH 06/13] backoff logging, error handling, wait.ConditionFunc - added info and error logs for appropriate backoff conditions/states - rationalized log idioms across all resource requests that are backoff-enabled - processRetryResponse as a wait.ConditionFunc needs to supress errors if it wants the caller to continue backing off --- .../providers/azure/azure_backoff.go | 13 +++++----- .../providers/azure/azure_loadbalancer.go | 24 +++++++++++++++++-- .../providers/azure/azure_routes.go | 7 ++++++ .../providers/azure/azure_storage.go | 6 +++++ 4 files changed, 41 insertions(+), 9 deletions(-) diff --git a/pkg/cloudprovider/providers/azure/azure_backoff.go b/pkg/cloudprovider/providers/azure/azure_backoff.go index 812de4d9fae..64f6ce9b3ec 100644 --- a/pkg/cloudprovider/providers/azure/azure_backoff.go +++ b/pkg/cloudprovider/providers/azure/azure_backoff.go @@ -24,6 +24,7 @@ import ( "github.com/Azure/azure-sdk-for-go/arm/compute" "github.com/Azure/azure-sdk-for-go/arm/network" "github.com/Azure/go-autorest/autorest" + "github.com/golang/glog" ) const ( @@ -132,17 +133,16 @@ func (az *Cloud) CreateOrUpdateVMWithRetry(vmName string, newVM compute.VirtualM }) } -// An in-progress convenience function to deal with common HTTP backoff response conditions +// A wait.ConditionFunc function to deal with common HTTP backoff response conditions func processRetryResponse(resp autorest.Response, err error) (bool, error) { if isSuccessHTTPResponse(resp) { + glog.V(2).Infof("backoff: success, HTTP response=%d", resp.StatusCode) return true, nil } if shouldRetryAPIRequest(resp, err) { - return false, err - } - // TODO determine the complete set of short-circuit conditions - if err != nil { - return false, err + glog.Errorf("backoff: failure, will retry, HTTP response=%d, err=%v", resp.StatusCode, err) + // suppress the error object so that backoff process continues + return false, nil } // Fall-through: stop periodic backoff, return error object from most recent request return true, err @@ -150,7 +150,6 @@ func processRetryResponse(resp autorest.Response, err error) (bool, error) { // shouldRetryAPIRequest determines if the response from an HTTP request suggests periodic retry behavior func shouldRetryAPIRequest(resp autorest.Response, err error) bool { - // non-nil error from HTTP request suggests we should retry if err != nil { return true } diff --git a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go index 2aba67d2f3a..302a0d8c6cd 100644 --- a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go +++ b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go @@ -151,8 +151,10 @@ func (az *Cloud) EnsureLoadBalancer(clusterName string, service *v1.Service, nod sg.SecurityGroupPropertiesFormat.Subnets = nil resp, err := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *sg.Name, sg, nil) if shouldRetryAPIRequest(resp, err) { + glog.V(2).Infof("ensure(%s) backing off: sg(%s) - updating", serviceName, *sg.Name) retryErr := az.CreateOrUpdateSGWithRetry(sg) if retryErr != nil { + glog.V(2).Infof("ensure(%s) abort backoff: sg(%s) - updating", serviceName, *sg.Name) return nil, retryErr } } @@ -227,8 +229,10 @@ func (az *Cloud) EnsureLoadBalancer(clusterName string, service *v1.Service, nod glog.V(3).Infof("ensure(%s): lb(%s) - updating", serviceName, lbName) resp, err := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) if shouldRetryAPIRequest(resp, err) { + glog.V(2).Infof("ensure(%s) backing off: lb(%s) - updating", serviceName, lbName) retryErr := az.CreateOrUpdateLBWithRetry(lb) if retryErr != nil { + glog.V(2).Infof("ensure(%s) abort backoff: lb(%s) - updating", serviceName, lbName) return nil, retryErr } } @@ -324,9 +328,11 @@ func (az *Cloud) EnsureLoadBalancerDeleted(clusterName string, service *v1.Servi sg.SecurityGroupPropertiesFormat.Subnets = nil resp, err := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *reconciledSg.Name, reconciledSg, nil) if shouldRetryAPIRequest(resp, err) { + glog.V(2).Infof("delete(%s) backing off: sg(%s) - updating", serviceName, az.SecurityGroupName) retryErr := az.CreateOrUpdateSGWithRetry(reconciledSg) if retryErr != nil { err = retryErr + glog.V(2).Infof("delete(%s) abort backoff: sg(%s) - updating", serviceName, az.SecurityGroupName) } } if err != nil { @@ -359,9 +365,11 @@ func (az *Cloud) cleanupLoadBalancer(clusterName string, service *v1.Service, is glog.V(3).Infof("delete(%s): lb(%s) - updating", serviceName, lbName) resp, err := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) if shouldRetryAPIRequest(resp, err) { + glog.V(2).Infof("delete(%s) backing off: sg(%s) - updating", serviceName, az.SecurityGroupName) retryErr := az.CreateOrUpdateLBWithRetry(lb) if retryErr != nil { err = retryErr + glog.V(2).Infof("delete(%s) abort backoff: sg(%s) - updating", serviceName, az.SecurityGroupName) } } if err != nil { @@ -372,9 +380,11 @@ func (az *Cloud) cleanupLoadBalancer(clusterName string, service *v1.Service, is resp, err := az.LoadBalancerClient.Delete(az.ResourceGroup, lbName, nil) if shouldRetryAPIRequest(resp, err) { + glog.V(2).Infof("delete(%s) backing off: lb(%s) - deleting; no remaining frontendipconfigs", serviceName, lbName) retryErr := az.DeleteLBWithRetry(lbName) if retryErr != nil { err = retryErr + glog.V(2).Infof("delete(%s) abort backoff: lb(%s) - deleting; no remaining frontendipconfigs", serviceName, lbName) } } if err != nil { @@ -424,9 +434,11 @@ func (az *Cloud) ensurePublicIPExists(serviceName, pipName string) (*network.Pub glog.V(3).Infof("ensure(%s): pip(%s) - creating", serviceName, *pip.Name) resp, err := az.PublicIPAddressesClient.CreateOrUpdate(az.ResourceGroup, *pip.Name, pip, nil) if shouldRetryAPIRequest(resp, err) { + glog.V(2).Infof("ensure(%s) backing off: pip(%s) - creating", serviceName, *pip.Name) retryErr := az.CreateOrUpdatePIPWithRetry(pip) if retryErr != nil { - return nil, retryErr + glog.V(2).Infof("ensure(%s) abort backoff: pip(%s) - creating", serviceName, *pip.Name) + err = retryErr } } if err != nil { @@ -443,9 +455,15 @@ func (az *Cloud) ensurePublicIPExists(serviceName, pipName string) (*network.Pub } func (az *Cloud) ensurePublicIPDeleted(serviceName, pipName string) error { + glog.V(2).Infof("ensure(%s): pip(%s) - deleting", serviceName, pipName) resp, deleteErr := az.PublicIPAddressesClient.Delete(az.ResourceGroup, pipName, nil) if shouldRetryAPIRequest(resp, deleteErr) { - deleteErr = az.DeletePublicIPWithRetry(pipName) + glog.V(2).Infof("ensure(%s) backing off: pip(%s) - deleting", serviceName, pipName) + retryErr := az.DeletePublicIPWithRetry(pipName) + if retryErr != nil { + glog.V(2).Infof("ensure(%s) abort backoff: pip(%s) - deleting", serviceName, pipName) + return retryErr + } } _, realErr := checkResourceExistsFromError(deleteErr) if realErr != nil { @@ -889,9 +907,11 @@ func (az *Cloud) ensureHostInPool(serviceName string, nodeName types.NodeName, b glog.V(3).Infof("nicupdate(%s): nic(%s) - updating", serviceName, nicName) resp, err := az.InterfacesClient.CreateOrUpdate(az.ResourceGroup, *nic.Name, nic, nil) if shouldRetryAPIRequest(resp, err) { + glog.V(2).Infof("nicupdate(%s) backing off: nic(%s) - updating", serviceName, nicName) retryErr := az.CreateOrUpdateInterfaceWithRetry(nic) if retryErr != nil { err = retryErr + glog.V(2).Infof("nicupdate(%s) abort backoff: nic(%s) - updating", serviceName, nicName) } } if err != nil { diff --git a/pkg/cloudprovider/providers/azure/azure_routes.go b/pkg/cloudprovider/providers/azure/azure_routes.go index 37c42a9b600..5c5fa2c132e 100644 --- a/pkg/cloudprovider/providers/azure/azure_routes.go +++ b/pkg/cloudprovider/providers/azure/azure_routes.go @@ -66,6 +66,7 @@ func (az *Cloud) CreateRoute(clusterName string, nameHint string, kubeRoute *clo routeTable, existsRouteTable, err := az.getRouteTable() if err != nil { + glog.V(2).Infof("create error: couldn't get routetable. clusterName=%q instance=%q cidr=%q", clusterName, kubeRoute.TargetNode, kubeRoute.DestinationCIDR) return err } if !existsRouteTable { @@ -78,9 +79,11 @@ func (az *Cloud) CreateRoute(clusterName string, nameHint string, kubeRoute *clo glog.V(3).Infof("create: creating routetable. routeTableName=%q", az.RouteTableName) resp, err := az.RouteTablesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, routeTable, nil) if shouldRetryAPIRequest(resp, err) { + glog.V(2).Infof("create backing off: creating routetable. routeTableName=%q", az.RouteTableName) retryErr := az.CreateOrUpdateRouteTableWithRetry(routeTable) if retryErr != nil { err = retryErr + glog.V(2).Infof("create abort backoff: creating routetable. routeTableName=%q", az.RouteTableName) } } if err != nil { @@ -111,9 +114,11 @@ func (az *Cloud) CreateRoute(clusterName string, nameHint string, kubeRoute *clo glog.V(3).Infof("create: creating route: instance=%q cidr=%q", kubeRoute.TargetNode, kubeRoute.DestinationCIDR) resp, err := az.RoutesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, *route.Name, route, nil) if shouldRetryAPIRequest(resp, err) { + glog.V(2).Infof("create backing off: creating route: instance=%q cidr=%q", kubeRoute.TargetNode, kubeRoute.DestinationCIDR) retryErr := az.CreateOrUpdateRouteWithRetry(route) if retryErr != nil { err = retryErr + glog.V(2).Infof("create abort backoff: creating route: instance=%q cidr=%q", kubeRoute.TargetNode, kubeRoute.DestinationCIDR) } } if err != nil { @@ -132,9 +137,11 @@ func (az *Cloud) DeleteRoute(clusterName string, kubeRoute *cloudprovider.Route) routeName := mapNodeNameToRouteName(kubeRoute.TargetNode) resp, err := az.RoutesClient.Delete(az.ResourceGroup, az.RouteTableName, routeName, nil) if shouldRetryAPIRequest(resp, err) { + glog.V(2).Infof("delete backing off: deleting route. clusterName=%q instance=%q cidr=%q", clusterName, kubeRoute.TargetNode, kubeRoute.DestinationCIDR) retryErr := az.DeleteRouteWithRetry(routeName) if retryErr != nil { err = retryErr + glog.V(2).Infof("delete abort backoff: deleting route. clusterName=%q instance=%q cidr=%q", clusterName, kubeRoute.TargetNode, kubeRoute.DestinationCIDR) } } if err != nil { diff --git a/pkg/cloudprovider/providers/azure/azure_storage.go b/pkg/cloudprovider/providers/azure/azure_storage.go index 445ff3db22f..b35e312b353 100644 --- a/pkg/cloudprovider/providers/azure/azure_storage.go +++ b/pkg/cloudprovider/providers/azure/azure_storage.go @@ -64,11 +64,14 @@ func (az *Cloud) AttachDisk(diskName, diskURI string, nodeName types.NodeName, l }, } vmName := mapNodeNameToVMName(nodeName) + glog.V(2).Infof("create(%s): vm(%s)", az.ResourceGroup, vmName) resp, err := az.VirtualMachinesClient.CreateOrUpdate(az.ResourceGroup, vmName, newVM, nil) if shouldRetryAPIRequest(resp, err) { + glog.V(2).Infof("create(%s) backing off: vm(%s)", az.ResourceGroup, vmName) retryErr := az.CreateOrUpdateVMWithRetry(vmName, newVM) if retryErr != nil { err = retryErr + glog.V(2).Infof("create(%s) abort backoff: vm(%s)", az.ResourceGroup, vmName) } } if err != nil { @@ -141,11 +144,14 @@ func (az *Cloud) DetachDiskByName(diskName, diskURI string, nodeName types.NodeN }, } vmName := mapNodeNameToVMName(nodeName) + glog.V(2).Infof("create(%s): vm(%s)", az.ResourceGroup, vmName) resp, err := az.VirtualMachinesClient.CreateOrUpdate(az.ResourceGroup, vmName, newVM, nil) if shouldRetryAPIRequest(resp, err) { + glog.V(2).Infof("create(%s) backing off: vm(%s)", az.ResourceGroup, vmName) retryErr := az.CreateOrUpdateVMWithRetry(vmName, newVM) if retryErr != nil { err = retryErr + glog.V(2).Infof("create(%s) abort backoff: vm(%s)", az.ResourceGroup, vmName) } } if err != nil { From 3f3aa279b9bee2a33123f458debe139d6165411c Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Mon, 5 Jun 2017 16:06:50 -0700 Subject: [PATCH 07/13] configurable backoff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - leveraging Config struct (—cloud-config) to store backoff and rate limit on/off and performance configuration - added add’l error logging - enabled backoff for vm GET requests --- pkg/cloudprovider/providers/azure/azure.go | 80 ++++++++++++++++++- .../providers/azure/azure_backoff.go | 53 ++++++------ .../providers/azure/azure_instances.go | 22 ++++- .../providers/azure/azure_loadbalancer.go | 19 ++--- .../providers/azure/azure_routes.go | 6 +- .../providers/azure/azure_storage.go | 4 +- .../providers/azure/azure_test.go | 42 +++++++++- .../providers/azure/azure_util.go | 6 ++ .../providers/azure/azure_wrap.go | 1 + 9 files changed, 186 insertions(+), 47 deletions(-) diff --git a/pkg/cloudprovider/providers/azure/azure.go b/pkg/cloudprovider/providers/azure/azure.go index 5ba1fba24ef..eb8a92c16ef 100644 --- a/pkg/cloudprovider/providers/azure/azure.go +++ b/pkg/cloudprovider/providers/azure/azure.go @@ -33,10 +33,20 @@ import ( "github.com/Azure/go-autorest/autorest" "github.com/Azure/go-autorest/autorest/azure" "github.com/ghodss/yaml" + "github.com/golang/glog" + "k8s.io/apimachinery/pkg/util/wait" ) -// CloudProviderName is the value used for the --cloud-provider flag -const CloudProviderName = "azure" +const ( + // CloudProviderName is the value used for the --cloud-provider flag + CloudProviderName = "azure" + rateLimitQPSDefault = 1 + rateLimitBucketDefault = 5 + backoffRetriesDefault = 6 + backoffExponentDefault = 1.5 + backoffDurationDefault = 5 // in seconds + backoffJitterDefault = 1.0 +) // Config holds the configuration parsed from the --cloud-config flag // All fields are required unless otherwise specified @@ -70,6 +80,22 @@ type Config struct { AADClientID string `json:"aadClientId" yaml:"aadClientId"` // The ClientSecret for an AAD application with RBAC access to talk to Azure RM APIs AADClientSecret string `json:"aadClientSecret" yaml:"aadClientSecret"` + // Enable exponential backoff to manage resource request retries + CloudProviderBackoff bool `json:"cloudProviderBackoff" yaml:"cloudProviderBackoff"` + // Backoff retry limit + CloudProviderBackoffRetries int `json:"cloudProviderBackoffRetries" yaml:"cloudProviderBackoffRetries"` + // Backoff exponent + CloudProviderBackoffExponent float64 `json:"cloudProviderBackoffExponent" yaml:"cloudProviderBackoffExponent"` + // Backoff duration + CloudProviderBackoffDuration int `json:"cloudProviderBackoffDuration" yaml:"cloudProviderBackoffDuration"` + // Backoff jitter + CloudProviderBackoffJitter float64 `json:"cloudProviderBackoffJitter" yaml:"cloudProviderBackoffJitter"` + // Enable rate limiting + CloudProviderRateLimit bool `json:"cloudProviderRateLimit" yaml:"cloudProviderRateLimit"` + // Rate limit QPS + CloudProviderRateLimitQPS int `json:"cloudProviderRateLimitQPS" yaml:"cloudProviderRateLimitQPS"` + // Rate limit Bucket Size + CloudProviderRateLimitBucket int `json:"cloudProviderRateLimitBucket" yaml:"cloudProviderRateLimitBucket"` } // Cloud holds the config and clients @@ -86,6 +112,7 @@ type Cloud struct { VirtualMachinesClient compute.VirtualMachinesClient StorageAccountClient storage.AccountsClient operationPollRateLimiter flowcontrol.RateLimiter + resourceRequestBackoff wait.Backoff } func init() { @@ -179,8 +206,53 @@ func NewCloud(configReader io.Reader) (cloudprovider.Interface, error) { az.StorageAccountClient = storage.NewAccountsClientWithBaseURI(az.Environment.ResourceManagerEndpoint, az.SubscriptionID) az.StorageAccountClient.Authorizer = servicePrincipalToken - // 1 qps, up to 5 burst when in flowcontrol; i.e., aggressive backoff enforcement - az.operationPollRateLimiter = flowcontrol.NewTokenBucketRateLimiter(1, 5) + // Conditionally configure rate limits + if az.CloudProviderRateLimit { + // Assign rate limit defaults if no configuration was passed in + if az.CloudProviderRateLimitQPS == 0 { + az.CloudProviderRateLimitQPS = rateLimitQPSDefault + } + if az.CloudProviderRateLimitBucket == 0 { + az.CloudProviderRateLimitBucket = rateLimitBucketDefault + } + az.operationPollRateLimiter = flowcontrol.NewTokenBucketRateLimiter( + float32(az.CloudProviderRateLimitQPS), + az.CloudProviderRateLimitBucket) + glog.V(2).Infof("Azure cloudprovider using rate limits: QPS=%d, bucket=%d", + az.CloudProviderRateLimitQPS, + az.CloudProviderRateLimitBucket) + } else { + // if rate limits are configured off, az.operationPollRateLimiter.Accept() is a no-op + az.operationPollRateLimiter = flowcontrol.NewFakeAlwaysRateLimiter() + } + + // Conditionally configure resource request backoff + if az.CloudProviderBackoff { + // Assign backoff defaults if no configuration was passed in + if az.CloudProviderBackoffRetries == 0 { + az.CloudProviderBackoffRetries = backoffRetriesDefault + } + if az.CloudProviderBackoffExponent == 0 { + az.CloudProviderBackoffExponent = backoffExponentDefault + } + if az.CloudProviderBackoffDuration == 0 { + az.CloudProviderBackoffDuration = backoffDurationDefault + } + if az.CloudProviderBackoffJitter == 0 { + az.CloudProviderBackoffJitter = backoffJitterDefault + } + az.resourceRequestBackoff = wait.Backoff{ + Steps: az.CloudProviderBackoffRetries, + Factor: az.CloudProviderBackoffExponent, + Duration: time.Duration(az.CloudProviderBackoffDuration) * time.Second, + Jitter: az.CloudProviderBackoffJitter, + } + glog.V(2).Infof("Azure cloudprovider using retry backoff: retries=%d, exponent=%f, duration=%d, jitter=%f", + az.CloudProviderBackoffRetries, + az.CloudProviderBackoffExponent, + az.CloudProviderBackoffDuration, + az.CloudProviderBackoffJitter) + } return &az, nil } diff --git a/pkg/cloudprovider/providers/azure/azure_backoff.go b/pkg/cloudprovider/providers/azure/azure_backoff.go index 64f6ce9b3ec..4ad3d7fbe10 100644 --- a/pkg/cloudprovider/providers/azure/azure_backoff.go +++ b/pkg/cloudprovider/providers/azure/azure_backoff.go @@ -17,35 +17,36 @@ limitations under the License. package azure import ( - "time" - "k8s.io/apimachinery/pkg/util/wait" "github.com/Azure/azure-sdk-for-go/arm/compute" "github.com/Azure/azure-sdk-for-go/arm/network" "github.com/Azure/go-autorest/autorest" "github.com/golang/glog" + "k8s.io/apimachinery/pkg/types" ) -const ( - operationPollInterval = 3 * time.Second - operationPollTimeoutDuration = time.Hour - backoffRetries = 12 - backoffExponent = 2 - backoffDuration = 1 * time.Second - backoffJitter = 1.0 -) - -var azAPIBackoff = wait.Backoff{ - Steps: backoffRetries, - Factor: backoffExponent, - Duration: backoffDuration, - Jitter: backoffJitter, +// GetVirtualMachineWithRetry invokes az.getVirtualMachine with exponential backoff retry +func (az *Cloud) GetVirtualMachineWithRetry(name types.NodeName) (compute.VirtualMachine, bool, error) { + var machine compute.VirtualMachine + var exists bool + err := wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { + az.operationPollRateLimiter.Accept() + var retryErr error + machine, exists, retryErr = az.getVirtualMachine(name) + if retryErr != nil { + glog.Errorf("backoff: failure, will retry,err=%v", retryErr) + return false, nil + } + glog.V(2).Infof("backoff: success") + return true, nil + }) + return machine, exists, err } // CreateOrUpdateSGWithRetry invokes az.SecurityGroupsClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateSGWithRetry(sg network.SecurityGroup) error { - return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *sg.Name, sg, nil) return processRetryResponse(resp, err) @@ -54,7 +55,7 @@ func (az *Cloud) CreateOrUpdateSGWithRetry(sg network.SecurityGroup) error { // CreateOrUpdateLBWithRetry invokes az.LoadBalancerClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateLBWithRetry(lb network.LoadBalancer) error { - return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) return processRetryResponse(resp, err) @@ -63,7 +64,7 @@ func (az *Cloud) CreateOrUpdateLBWithRetry(lb network.LoadBalancer) error { // CreateOrUpdatePIPWithRetry invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdatePIPWithRetry(pip network.PublicIPAddress) error { - return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.PublicIPAddressesClient.CreateOrUpdate(az.ResourceGroup, *pip.Name, pip, nil) return processRetryResponse(resp, err) @@ -72,7 +73,7 @@ func (az *Cloud) CreateOrUpdatePIPWithRetry(pip network.PublicIPAddress) error { // CreateOrUpdateInterfaceWithRetry invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateInterfaceWithRetry(nic network.Interface) error { - return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.InterfacesClient.CreateOrUpdate(az.ResourceGroup, *nic.Name, nic, nil) return processRetryResponse(resp, err) @@ -81,7 +82,7 @@ func (az *Cloud) CreateOrUpdateInterfaceWithRetry(nic network.Interface) error { // DeletePublicIPWithRetry invokes az.PublicIPAddressesClient.Delete with exponential backoff retry func (az *Cloud) DeletePublicIPWithRetry(pipName string) error { - return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.PublicIPAddressesClient.Delete(az.ResourceGroup, pipName, nil) return processRetryResponse(resp, err) @@ -90,7 +91,7 @@ func (az *Cloud) DeletePublicIPWithRetry(pipName string) error { // DeleteLBWithRetry invokes az.LoadBalancerClient.Delete with exponential backoff retry func (az *Cloud) DeleteLBWithRetry(lbName string) error { - return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.LoadBalancerClient.Delete(az.ResourceGroup, lbName, nil) return processRetryResponse(resp, err) @@ -99,7 +100,7 @@ func (az *Cloud) DeleteLBWithRetry(lbName string) error { // CreateOrUpdateRouteTableWithRetry invokes az.RouteTablesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateRouteTableWithRetry(routeTable network.RouteTable) error { - return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.RouteTablesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, routeTable, nil) return processRetryResponse(resp, err) @@ -108,7 +109,7 @@ func (az *Cloud) CreateOrUpdateRouteTableWithRetry(routeTable network.RouteTable // CreateOrUpdateRouteWithRetry invokes az.RoutesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateRouteWithRetry(route network.Route) error { - return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.RoutesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, *route.Name, route, nil) return processRetryResponse(resp, err) @@ -117,7 +118,7 @@ func (az *Cloud) CreateOrUpdateRouteWithRetry(route network.Route) error { // DeleteRouteWithRetry invokes az.RoutesClient.Delete with exponential backoff retry func (az *Cloud) DeleteRouteWithRetry(routeName string) error { - return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.RoutesClient.Delete(az.ResourceGroup, az.RouteTableName, routeName, nil) return processRetryResponse(resp, err) @@ -126,7 +127,7 @@ func (az *Cloud) DeleteRouteWithRetry(routeName string) error { // CreateOrUpdateVMWithRetry invokes az.VirtualMachinesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateVMWithRetry(vmName string, newVM compute.VirtualMachine) error { - return wait.ExponentialBackoff(azAPIBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { az.operationPollRateLimiter.Accept() resp, err := az.VirtualMachinesClient.CreateOrUpdate(az.ResourceGroup, vmName, newVM, nil) return processRetryResponse(resp, err) diff --git a/pkg/cloudprovider/providers/azure/azure_instances.go b/pkg/cloudprovider/providers/azure/azure_instances.go index 45b109e1478..c4c40884208 100644 --- a/pkg/cloudprovider/providers/azure/azure_instances.go +++ b/pkg/cloudprovider/providers/azure/azure_instances.go @@ -24,6 +24,7 @@ import ( "k8s.io/kubernetes/pkg/cloudprovider" "github.com/Azure/azure-sdk-for-go/arm/compute" + "github.com/golang/glog" "k8s.io/apimachinery/pkg/types" ) @@ -31,6 +32,7 @@ import ( func (az *Cloud) NodeAddresses(name types.NodeName) ([]v1.NodeAddress, error) { ip, err := az.getIPForMachine(name) if err != nil { + glog.Errorf("error: az.NodeAddresses, az.getIPForMachine(%s), err=%v", name, err) return nil, err } @@ -55,9 +57,22 @@ func (az *Cloud) ExternalID(name types.NodeName) (string, error) { // InstanceID returns the cloud provider ID of the specified instance. // Note that if the instance does not exist or is no longer running, we must return ("", cloudprovider.InstanceNotFound) func (az *Cloud) InstanceID(name types.NodeName) (string, error) { - machine, exists, err := az.getVirtualMachine(name) + var machine compute.VirtualMachine + var exists bool + var err error + az.operationPollRateLimiter.Accept() + machine, exists, err = az.getVirtualMachine(name) if err != nil { - return "", err + if az.CloudProviderBackoff { + glog.V(2).Infof("InstanceID(%s) backing off", name) + machine, exists, err = az.GetVirtualMachineWithRetry(name) + if err != nil { + glog.V(2).Infof("InstanceID(%s) abort backoff", name) + return "", err + } + } else { + return "", err + } } else if !exists { return "", cloudprovider.InstanceNotFound } @@ -78,6 +93,7 @@ func (az *Cloud) InstanceTypeByProviderID(providerID string) (string, error) { func (az *Cloud) InstanceType(name types.NodeName) (string, error) { machine, exists, err := az.getVirtualMachine(name) if err != nil { + glog.Errorf("error: az.InstanceType(%s), az.getVirtualMachine(%s) err=%v", name, name, err) return "", err } else if !exists { return "", cloudprovider.InstanceNotFound @@ -102,6 +118,7 @@ func (az *Cloud) listAllNodesInResourceGroup() ([]compute.VirtualMachine, error) result, err := az.VirtualMachinesClient.List(az.ResourceGroup) if err != nil { + glog.Errorf("error: az.listAllNodesInResourceGroup(), az.VirtualMachinesClient.List(%s), err=%v", az.ResourceGroup, err) return nil, err } @@ -112,6 +129,7 @@ func (az *Cloud) listAllNodesInResourceGroup() ([]compute.VirtualMachine, error) result, err = az.VirtualMachinesClient.ListAllNextResults(result) if err != nil { + glog.Errorf("error: az.listAllNodesInResourceGroup(), az.VirtualMachinesClient.ListAllNextResults(%s), err=%v", result, err) return nil, err } diff --git a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go index 302a0d8c6cd..7e9f151e004 100644 --- a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go +++ b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go @@ -150,7 +150,7 @@ func (az *Cloud) EnsureLoadBalancer(clusterName string, service *v1.Service, nod sg.SecurityGroupPropertiesFormat.NetworkInterfaces = nil sg.SecurityGroupPropertiesFormat.Subnets = nil resp, err := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *sg.Name, sg, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("ensure(%s) backing off: sg(%s) - updating", serviceName, *sg.Name) retryErr := az.CreateOrUpdateSGWithRetry(sg) if retryErr != nil { @@ -228,7 +228,7 @@ func (az *Cloud) EnsureLoadBalancer(clusterName string, service *v1.Service, nod if !existsLb || lbNeedsUpdate { glog.V(3).Infof("ensure(%s): lb(%s) - updating", serviceName, lbName) resp, err := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("ensure(%s) backing off: lb(%s) - updating", serviceName, lbName) retryErr := az.CreateOrUpdateLBWithRetry(lb) if retryErr != nil { @@ -327,7 +327,7 @@ func (az *Cloud) EnsureLoadBalancerDeleted(clusterName string, service *v1.Servi sg.SecurityGroupPropertiesFormat.NetworkInterfaces = nil sg.SecurityGroupPropertiesFormat.Subnets = nil resp, err := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *reconciledSg.Name, reconciledSg, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("delete(%s) backing off: sg(%s) - updating", serviceName, az.SecurityGroupName) retryErr := az.CreateOrUpdateSGWithRetry(reconciledSg) if retryErr != nil { @@ -364,7 +364,7 @@ func (az *Cloud) cleanupLoadBalancer(clusterName string, service *v1.Service, is if len(*lb.FrontendIPConfigurations) > 0 { glog.V(3).Infof("delete(%s): lb(%s) - updating", serviceName, lbName) resp, err := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("delete(%s) backing off: sg(%s) - updating", serviceName, az.SecurityGroupName) retryErr := az.CreateOrUpdateLBWithRetry(lb) if retryErr != nil { @@ -379,7 +379,7 @@ func (az *Cloud) cleanupLoadBalancer(clusterName string, service *v1.Service, is glog.V(3).Infof("delete(%s): lb(%s) - deleting; no remaining frontendipconfigs", serviceName, lbName) resp, err := az.LoadBalancerClient.Delete(az.ResourceGroup, lbName, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("delete(%s) backing off: lb(%s) - deleting; no remaining frontendipconfigs", serviceName, lbName) retryErr := az.DeleteLBWithRetry(lbName) if retryErr != nil { @@ -433,7 +433,7 @@ func (az *Cloud) ensurePublicIPExists(serviceName, pipName string) (*network.Pub glog.V(3).Infof("ensure(%s): pip(%s) - creating", serviceName, *pip.Name) resp, err := az.PublicIPAddressesClient.CreateOrUpdate(az.ResourceGroup, *pip.Name, pip, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("ensure(%s) backing off: pip(%s) - creating", serviceName, *pip.Name) retryErr := az.CreateOrUpdatePIPWithRetry(pip) if retryErr != nil { @@ -457,7 +457,7 @@ func (az *Cloud) ensurePublicIPExists(serviceName, pipName string) (*network.Pub func (az *Cloud) ensurePublicIPDeleted(serviceName, pipName string) error { glog.V(2).Infof("ensure(%s): pip(%s) - deleting", serviceName, pipName) resp, deleteErr := az.PublicIPAddressesClient.Delete(az.ResourceGroup, pipName, nil) - if shouldRetryAPIRequest(resp, deleteErr) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, deleteErr) { glog.V(2).Infof("ensure(%s) backing off: pip(%s) - deleting", serviceName, pipName) retryErr := az.DeletePublicIPWithRetry(pipName) if retryErr != nil { @@ -849,6 +849,7 @@ func findSecurityRule(rules []network.SecurityRule, rule network.SecurityRule) b // participating in the specified LoadBalancer Backend Pool. func (az *Cloud) ensureHostInPool(serviceName string, nodeName types.NodeName, backendPoolID string) error { vmName := mapNodeNameToVMName(nodeName) + az.operationPollRateLimiter.Accept() machine, err := az.VirtualMachinesClient.Get(az.ResourceGroup, vmName, "") if err != nil { return err @@ -906,8 +907,8 @@ func (az *Cloud) ensureHostInPool(serviceName string, nodeName types.NodeName, b glog.V(3).Infof("nicupdate(%s): nic(%s) - updating", serviceName, nicName) resp, err := az.InterfacesClient.CreateOrUpdate(az.ResourceGroup, *nic.Name, nic, nil) - if shouldRetryAPIRequest(resp, err) { - glog.V(2).Infof("nicupdate(%s) backing off: nic(%s) - updating", serviceName, nicName) + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { + glog.V(2).Infof("nicupdate(%s) backing off: nic(%s) - updating, err=%v", serviceName, nicName, err) retryErr := az.CreateOrUpdateInterfaceWithRetry(nic) if retryErr != nil { err = retryErr diff --git a/pkg/cloudprovider/providers/azure/azure_routes.go b/pkg/cloudprovider/providers/azure/azure_routes.go index 5c5fa2c132e..8332c9d124f 100644 --- a/pkg/cloudprovider/providers/azure/azure_routes.go +++ b/pkg/cloudprovider/providers/azure/azure_routes.go @@ -78,7 +78,7 @@ func (az *Cloud) CreateRoute(clusterName string, nameHint string, kubeRoute *clo glog.V(3).Infof("create: creating routetable. routeTableName=%q", az.RouteTableName) resp, err := az.RouteTablesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, routeTable, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("create backing off: creating routetable. routeTableName=%q", az.RouteTableName) retryErr := az.CreateOrUpdateRouteTableWithRetry(routeTable) if retryErr != nil { @@ -113,7 +113,7 @@ func (az *Cloud) CreateRoute(clusterName string, nameHint string, kubeRoute *clo glog.V(3).Infof("create: creating route: instance=%q cidr=%q", kubeRoute.TargetNode, kubeRoute.DestinationCIDR) resp, err := az.RoutesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, *route.Name, route, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("create backing off: creating route: instance=%q cidr=%q", kubeRoute.TargetNode, kubeRoute.DestinationCIDR) retryErr := az.CreateOrUpdateRouteWithRetry(route) if retryErr != nil { @@ -136,7 +136,7 @@ func (az *Cloud) DeleteRoute(clusterName string, kubeRoute *cloudprovider.Route) routeName := mapNodeNameToRouteName(kubeRoute.TargetNode) resp, err := az.RoutesClient.Delete(az.ResourceGroup, az.RouteTableName, routeName, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("delete backing off: deleting route. clusterName=%q instance=%q cidr=%q", clusterName, kubeRoute.TargetNode, kubeRoute.DestinationCIDR) retryErr := az.DeleteRouteWithRetry(routeName) if retryErr != nil { diff --git a/pkg/cloudprovider/providers/azure/azure_storage.go b/pkg/cloudprovider/providers/azure/azure_storage.go index b35e312b353..bb09cc1c5e5 100644 --- a/pkg/cloudprovider/providers/azure/azure_storage.go +++ b/pkg/cloudprovider/providers/azure/azure_storage.go @@ -66,7 +66,7 @@ func (az *Cloud) AttachDisk(diskName, diskURI string, nodeName types.NodeName, l vmName := mapNodeNameToVMName(nodeName) glog.V(2).Infof("create(%s): vm(%s)", az.ResourceGroup, vmName) resp, err := az.VirtualMachinesClient.CreateOrUpdate(az.ResourceGroup, vmName, newVM, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("create(%s) backing off: vm(%s)", az.ResourceGroup, vmName) retryErr := az.CreateOrUpdateVMWithRetry(vmName, newVM) if retryErr != nil { @@ -146,7 +146,7 @@ func (az *Cloud) DetachDiskByName(diskName, diskURI string, nodeName types.NodeN vmName := mapNodeNameToVMName(nodeName) glog.V(2).Infof("create(%s): vm(%s)", az.ResourceGroup, vmName) resp, err := az.VirtualMachinesClient.CreateOrUpdate(az.ResourceGroup, vmName, newVM, nil) - if shouldRetryAPIRequest(resp, err) { + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("create(%s) backing off: vm(%s)", az.ResourceGroup, vmName) retryErr := az.CreateOrUpdateVMWithRetry(vmName, newVM) if retryErr != nil { diff --git a/pkg/cloudprovider/providers/azure/azure_test.go b/pkg/cloudprovider/providers/azure/azure_test.go index 98e9eead48c..4176230a720 100644 --- a/pkg/cloudprovider/providers/azure/azure_test.go +++ b/pkg/cloudprovider/providers/azure/azure_test.go @@ -591,7 +591,15 @@ func TestNewCloudFromJSON(t *testing.T) { "securityGroupName": "--security-group-name--", "vnetName": "--vnet-name--", "routeTableName": "--route-table-name--", - "primaryAvailabilitySetName": "--primary-availability-set-name--" + "primaryAvailabilitySetName": "--primary-availability-set-name--", + "cloudProviderBackoff": true, + "cloudProviderBackoffRetries": 6, + "cloudProviderBackoffExponent": 1.5, + "cloudProviderBackoffDuration": 5, + "cloudProviderBackoffJitter": 1.0, + "cloudProviderRatelimit": true, + "cloudProviderRateLimitQPS": 1, + "cloudProviderRateLimitBucket": 5 }` validateConfig(t, config) } @@ -610,6 +618,14 @@ securityGroupName: --security-group-name-- vnetName: --vnet-name-- routeTableName: --route-table-name-- primaryAvailabilitySetName: --primary-availability-set-name-- +cloudProviderBackoff: true +cloudProviderBackoffRetries: 6 +cloudProviderBackoffExponent: 1.5 +cloudProviderBackoffDuration: 5 +cloudProviderBackoffJitter: 1.0 +cloudProviderRatelimit: true +cloudProviderRateLimitQPS: 1 +cloudProviderRateLimitBucket: 5 ` validateConfig(t, config) } @@ -659,6 +675,30 @@ func validateConfig(t *testing.T, config string) { if azureCloud.PrimaryAvailabilitySetName != "--primary-availability-set-name--" { t.Errorf("got incorrect value for PrimaryAvailabilitySetName") } + if azureCloud.CloudProviderBackoff != true { + t.Errorf("got incorrect value for CloudProviderBackoff") + } + if azureCloud.CloudProviderBackoffRetries != 6 { + t.Errorf("got incorrect value for CloudProviderBackoffRetries") + } + if azureCloud.CloudProviderBackoffExponent != 1.5 { + t.Errorf("got incorrect value for CloudProviderBackoffExponent") + } + if azureCloud.CloudProviderBackoffDuration != 5 { + t.Errorf("got incorrect value for CloudProviderBackoffDuration") + } + if azureCloud.CloudProviderBackoffJitter != 1.0 { + t.Errorf("got incorrect value for CloudProviderBackoffJitter") + } + if azureCloud.CloudProviderRateLimit != true { + t.Errorf("got incorrect value for CloudProviderRateLimit") + } + if azureCloud.CloudProviderRateLimitQPS != 1 { + t.Errorf("got incorrect value for CloudProviderRateLimitQPS") + } + if azureCloud.CloudProviderRateLimitBucket != 5 { + t.Errorf("got incorrect value for CloudProviderRateLimitBucket") + } } func TestDecodeInstanceInfo(t *testing.T) { diff --git a/pkg/cloudprovider/providers/azure/azure_util.go b/pkg/cloudprovider/providers/azure/azure_util.go index 094f3aaf903..bebea9a64ab 100644 --- a/pkg/cloudprovider/providers/azure/azure_util.go +++ b/pkg/cloudprovider/providers/azure/azure_util.go @@ -25,6 +25,7 @@ import ( "github.com/Azure/azure-sdk-for-go/arm/compute" "github.com/Azure/azure-sdk-for-go/arm/network" + "github.com/golang/glog" "k8s.io/apimachinery/pkg/types" ) @@ -242,26 +243,31 @@ func (az *Cloud) getIPForMachine(nodeName types.NodeName) (string, error) { return "", cloudprovider.InstanceNotFound } if err != nil { + glog.Errorf("error: az.getIPForMachine(%s), az.getVirtualMachine(%s), err=%v", nodeName, nodeName, err) return "", err } nicID, err := getPrimaryInterfaceID(machine) if err != nil { + glog.Errorf("error: az.getIPForMachine(%s), getPrimaryInterfaceID(%s), err=%v", nodeName, machine, err) return "", err } nicName, err := getLastSegment(nicID) if err != nil { + glog.Errorf("error: az.getIPForMachine(%s), getLastSegment(%s), err=%v", nodeName, nicID, err) return "", err } nic, err := az.InterfacesClient.Get(az.ResourceGroup, nicName, "") if err != nil { + glog.Errorf("error: az.getIPForMachine(%s), az.InterfacesClient.Get(%s, %s, %s), err=%v", nodeName, az.ResourceGroup, nicName, "", err) return "", err } ipConfig, err := getPrimaryIPConfig(nic) if err != nil { + glog.Errorf("error: az.getIPForMachine(%s), getPrimaryIPConfig(%s), err=%v", nodeName, nic, err) return "", err } diff --git a/pkg/cloudprovider/providers/azure/azure_wrap.go b/pkg/cloudprovider/providers/azure/azure_wrap.go index 185b7347211..2eb5f6d9e3c 100644 --- a/pkg/cloudprovider/providers/azure/azure_wrap.go +++ b/pkg/cloudprovider/providers/azure/azure_wrap.go @@ -43,6 +43,7 @@ func (az *Cloud) getVirtualMachine(nodeName types.NodeName) (vm compute.VirtualM var realErr error vmName := string(nodeName) + az.operationPollRateLimiter.Accept() vm, err = az.VirtualMachinesClient.Get(az.ResourceGroup, vmName, "") exists, realErr = checkResourceExistsFromError(err) From af5ce2fcc575db44e7334778d236295d156f7f8c Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Tue, 6 Jun 2017 09:50:28 -0700 Subject: [PATCH 08/13] test coverage We want to ensure that backoff and rate limit configuration is opt-in --- .../providers/azure/azure_test.go | 53 +++++++++++++++---- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/pkg/cloudprovider/providers/azure/azure_test.go b/pkg/cloudprovider/providers/azure/azure_test.go index 4176230a720..f7cf5eca184 100644 --- a/pkg/cloudprovider/providers/azure/azure_test.go +++ b/pkg/cloudprovider/providers/azure/azure_test.go @@ -604,6 +604,20 @@ func TestNewCloudFromJSON(t *testing.T) { validateConfig(t, config) } +// Test Backoff and Rate Limit defaults (json) +func TestCloudDefaultConfigFromJSON(t *testing.T) { + config := `{}` + + validateEmptyConfig(t, config) +} + +// Test Backoff and Rate Limit defaults (yaml) +func TestCloudDefaultConfigFromYAML(t *testing.T) { + config := `` + + validateEmptyConfig(t, config) +} + // Test Configuration deserialization (yaml) func TestNewCloudFromYAML(t *testing.T) { config := ` @@ -631,16 +645,7 @@ cloudProviderRateLimitBucket: 5 } func validateConfig(t *testing.T, config string) { - configReader := strings.NewReader(config) - cloud, err := NewCloud(configReader) - if err != nil { - t.Error(err) - } - - azureCloud, ok := cloud.(*Cloud) - if !ok { - t.Error("NewCloud returned incorrect type") - } + azureCloud := getCloudFromConfig(t, config) if azureCloud.TenantID != "--tenant-id--" { t.Errorf("got incorrect value for TenantID") @@ -701,6 +706,34 @@ func validateConfig(t *testing.T, config string) { } } +func getCloudFromConfig(t *testing.T, config string) *Cloud { + configReader := strings.NewReader(config) + cloud, err := NewCloud(configReader) + if err != nil { + t.Error(err) + } + azureCloud, ok := cloud.(*Cloud) + if !ok { + t.Error("NewCloud returned incorrect type") + } + return azureCloud +} + +// TODO include checks for other appropriate default config parameters +func validateEmptyConfig(t *testing.T, config string) { + azureCloud := getCloudFromConfig(t, config) + + // backoff should be disabled by default if not explicitly enabled in config + if azureCloud.CloudProviderBackoff != false { + t.Errorf("got incorrect value for CloudProviderBackoff") + } + + // rate limits should be disabled by default if not explicitly enabled in config + if azureCloud.CloudProviderRateLimit != false { + t.Errorf("got incorrect value for CloudProviderRateLimit") + } +} + func TestDecodeInstanceInfo(t *testing.T) { response := `{"ID":"_azdev","UD":"0","FD":"99"}` From ac931aa1e03c14ad7b3044812c2fd404528c9c31 Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Tue, 6 Jun 2017 11:19:29 -0700 Subject: [PATCH 09/13] rate limiting on all azure sdk GET requests --- pkg/cloudprovider/providers/azure/azure.go | 2 +- pkg/cloudprovider/providers/azure/azure_instances.go | 2 ++ pkg/cloudprovider/providers/azure/azure_loadbalancer.go | 4 ++++ pkg/cloudprovider/providers/azure/azure_storageaccount.go | 2 ++ pkg/cloudprovider/providers/azure/azure_util.go | 1 + pkg/cloudprovider/providers/azure/azure_wrap.go | 5 +++++ 6 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pkg/cloudprovider/providers/azure/azure.go b/pkg/cloudprovider/providers/azure/azure.go index eb8a92c16ef..8532e0a3f5b 100644 --- a/pkg/cloudprovider/providers/azure/azure.go +++ b/pkg/cloudprovider/providers/azure/azure.go @@ -218,7 +218,7 @@ func NewCloud(configReader io.Reader) (cloudprovider.Interface, error) { az.operationPollRateLimiter = flowcontrol.NewTokenBucketRateLimiter( float32(az.CloudProviderRateLimitQPS), az.CloudProviderRateLimitBucket) - glog.V(2).Infof("Azure cloudprovider using rate limits: QPS=%d, bucket=%d", + glog.V(2).Infof("Azure cloudprovider using rate limit config: QPS=%d, bucket=%d", az.CloudProviderRateLimitQPS, az.CloudProviderRateLimitBucket) } else { diff --git a/pkg/cloudprovider/providers/azure/azure_instances.go b/pkg/cloudprovider/providers/azure/azure_instances.go index c4c40884208..13ec02170f9 100644 --- a/pkg/cloudprovider/providers/azure/azure_instances.go +++ b/pkg/cloudprovider/providers/azure/azure_instances.go @@ -116,6 +116,7 @@ func (az *Cloud) CurrentNodeName(hostname string) (types.NodeName, error) { func (az *Cloud) listAllNodesInResourceGroup() ([]compute.VirtualMachine, error) { allNodes := []compute.VirtualMachine{} + az.operationPollRateLimiter.Accept() result, err := az.VirtualMachinesClient.List(az.ResourceGroup) if err != nil { glog.Errorf("error: az.listAllNodesInResourceGroup(), az.VirtualMachinesClient.List(%s), err=%v", az.ResourceGroup, err) @@ -127,6 +128,7 @@ func (az *Cloud) listAllNodesInResourceGroup() ([]compute.VirtualMachine, error) for morePages { allNodes = append(allNodes, *result.Value...) + az.operationPollRateLimiter.Accept() result, err = az.VirtualMachinesClient.ListAllNextResults(result) if err != nil { glog.Errorf("error: az.listAllNodesInResourceGroup(), az.VirtualMachinesClient.ListAllNextResults(%s), err=%v", result, err) diff --git a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go index 7e9f151e004..d7da541ec9c 100644 --- a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go +++ b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go @@ -92,6 +92,7 @@ func (az *Cloud) getPublicIPName(clusterName string, service *v1.Service) (strin return fmt.Sprintf("%s-%s", clusterName, cloudprovider.GetLoadBalancerName(service)), nil } + az.operationPollRateLimiter.Accept() list, err := az.PublicIPAddressesClient.List(az.ResourceGroup) if err != nil { return "", err @@ -135,6 +136,7 @@ func (az *Cloud) EnsureLoadBalancer(clusterName string, service *v1.Service, nod serviceName := getServiceName(service) glog.V(5).Infof("ensure(%s): START clusterName=%q lbName=%q", serviceName, clusterName, lbName) + az.operationPollRateLimiter.Accept() sg, err := az.SecurityGroupsClient.Get(az.ResourceGroup, az.SecurityGroupName, "") if err != nil { return nil, err @@ -445,6 +447,7 @@ func (az *Cloud) ensurePublicIPExists(serviceName, pipName string) (*network.Pub return nil, err } + az.operationPollRateLimiter.Accept() pip, err = az.PublicIPAddressesClient.Get(az.ResourceGroup, *pip.Name, "") if err != nil { return nil, err @@ -875,6 +878,7 @@ func (az *Cloud) ensureHostInPool(serviceName string, nodeName types.NodeName, b } } + az.operationPollRateLimiter.Accept() nic, err := az.InterfacesClient.Get(az.ResourceGroup, nicName, "") if err != nil { return err diff --git a/pkg/cloudprovider/providers/azure/azure_storageaccount.go b/pkg/cloudprovider/providers/azure/azure_storageaccount.go index 4d33fb21e66..fa436eedf05 100644 --- a/pkg/cloudprovider/providers/azure/azure_storageaccount.go +++ b/pkg/cloudprovider/providers/azure/azure_storageaccount.go @@ -27,6 +27,7 @@ type accountWithLocation struct { // getStorageAccounts gets the storage accounts' name, type, location in a resource group func (az *Cloud) getStorageAccounts() ([]accountWithLocation, error) { + az.operationPollRateLimiter.Accept() result, err := az.StorageAccountClient.ListByResourceGroup(az.ResourceGroup) if err != nil { return nil, err @@ -56,6 +57,7 @@ func (az *Cloud) getStorageAccounts() ([]accountWithLocation, error) { // getStorageAccesskey gets the storage account access key func (az *Cloud) getStorageAccesskey(account string) (string, error) { + az.operationPollRateLimiter.Accept() result, err := az.StorageAccountClient.ListKeys(az.ResourceGroup, account) if err != nil { return "", err diff --git a/pkg/cloudprovider/providers/azure/azure_util.go b/pkg/cloudprovider/providers/azure/azure_util.go index bebea9a64ab..9ddeaca8ffa 100644 --- a/pkg/cloudprovider/providers/azure/azure_util.go +++ b/pkg/cloudprovider/providers/azure/azure_util.go @@ -259,6 +259,7 @@ func (az *Cloud) getIPForMachine(nodeName types.NodeName) (string, error) { return "", err } + az.operationPollRateLimiter.Accept() nic, err := az.InterfacesClient.Get(az.ResourceGroup, nicName, "") if err != nil { glog.Errorf("error: az.getIPForMachine(%s), az.InterfacesClient.Get(%s, %s, %s), err=%v", nodeName, az.ResourceGroup, nicName, "", err) diff --git a/pkg/cloudprovider/providers/azure/azure_wrap.go b/pkg/cloudprovider/providers/azure/azure_wrap.go index 2eb5f6d9e3c..613e59a439d 100644 --- a/pkg/cloudprovider/providers/azure/azure_wrap.go +++ b/pkg/cloudprovider/providers/azure/azure_wrap.go @@ -61,6 +61,7 @@ func (az *Cloud) getVirtualMachine(nodeName types.NodeName) (vm compute.VirtualM func (az *Cloud) getRouteTable() (routeTable network.RouteTable, exists bool, err error) { var realErr error + az.operationPollRateLimiter.Accept() routeTable, err = az.RouteTablesClient.Get(az.ResourceGroup, az.RouteTableName, "") exists, realErr = checkResourceExistsFromError(err) @@ -78,6 +79,7 @@ func (az *Cloud) getRouteTable() (routeTable network.RouteTable, exists bool, er func (az *Cloud) getSecurityGroup() (sg network.SecurityGroup, exists bool, err error) { var realErr error + az.operationPollRateLimiter.Accept() sg, err = az.SecurityGroupsClient.Get(az.ResourceGroup, az.SecurityGroupName, "") exists, realErr = checkResourceExistsFromError(err) @@ -95,6 +97,7 @@ func (az *Cloud) getSecurityGroup() (sg network.SecurityGroup, exists bool, err func (az *Cloud) getAzureLoadBalancer(name string) (lb network.LoadBalancer, exists bool, err error) { var realErr error + az.operationPollRateLimiter.Accept() lb, err = az.LoadBalancerClient.Get(az.ResourceGroup, name, "") exists, realErr = checkResourceExistsFromError(err) @@ -112,6 +115,7 @@ func (az *Cloud) getAzureLoadBalancer(name string) (lb network.LoadBalancer, exi func (az *Cloud) getPublicIPAddress(name string) (pip network.PublicIPAddress, exists bool, err error) { var realErr error + az.operationPollRateLimiter.Accept() pip, err = az.PublicIPAddressesClient.Get(az.ResourceGroup, name, "") exists, realErr = checkResourceExistsFromError(err) @@ -129,6 +133,7 @@ func (az *Cloud) getPublicIPAddress(name string) (pip network.PublicIPAddress, e func (az *Cloud) getSubnet(virtualNetworkName string, subnetName string) (subnet network.Subnet, exists bool, err error) { var realErr error + az.operationPollRateLimiter.Accept() subnet, err = az.SubnetsClient.Get(az.ResourceGroup, virtualNetworkName, subnetName, "") exists, realErr = checkResourceExistsFromError(err) From 148e923f653bdb2ae216d066c6119631bc5bdd5d Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Tue, 6 Jun 2017 14:55:07 -0700 Subject: [PATCH 10/13] az.getVirtualMachine already rate-limited MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit we don’t need to rate limit the calls _to_ it --- pkg/cloudprovider/providers/azure/azure_backoff.go | 1 - pkg/cloudprovider/providers/azure/azure_instances.go | 1 - 2 files changed, 2 deletions(-) diff --git a/pkg/cloudprovider/providers/azure/azure_backoff.go b/pkg/cloudprovider/providers/azure/azure_backoff.go index 4ad3d7fbe10..3fca4c49334 100644 --- a/pkg/cloudprovider/providers/azure/azure_backoff.go +++ b/pkg/cloudprovider/providers/azure/azure_backoff.go @@ -31,7 +31,6 @@ func (az *Cloud) GetVirtualMachineWithRetry(name types.NodeName) (compute.Virtua var machine compute.VirtualMachine var exists bool err := wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { - az.operationPollRateLimiter.Accept() var retryErr error machine, exists, retryErr = az.getVirtualMachine(name) if retryErr != nil { diff --git a/pkg/cloudprovider/providers/azure/azure_instances.go b/pkg/cloudprovider/providers/azure/azure_instances.go index 13ec02170f9..964dd38063a 100644 --- a/pkg/cloudprovider/providers/azure/azure_instances.go +++ b/pkg/cloudprovider/providers/azure/azure_instances.go @@ -60,7 +60,6 @@ func (az *Cloud) InstanceID(name types.NodeName) (string, error) { var machine compute.VirtualMachine var exists bool var err error - az.operationPollRateLimiter.Accept() machine, exists, err = az.getVirtualMachine(name) if err != nil { if az.CloudProviderBackoff { From 6d73a09dcc8b53899cc2cadab25ca208672041dd Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Tue, 6 Jun 2017 22:09:57 -0700 Subject: [PATCH 11/13] rate limiting everywhere not waiting to rate limit until we get an error response from the API, doing so on initial request for all API requests --- pkg/cloudprovider/providers/azure/azure_instances.go | 1 + pkg/cloudprovider/providers/azure/azure_loadbalancer.go | 8 ++++++++ pkg/cloudprovider/providers/azure/azure_routes.go | 3 +++ pkg/cloudprovider/providers/azure/azure_storage.go | 2 ++ 4 files changed, 14 insertions(+) diff --git a/pkg/cloudprovider/providers/azure/azure_instances.go b/pkg/cloudprovider/providers/azure/azure_instances.go index 964dd38063a..13ec02170f9 100644 --- a/pkg/cloudprovider/providers/azure/azure_instances.go +++ b/pkg/cloudprovider/providers/azure/azure_instances.go @@ -60,6 +60,7 @@ func (az *Cloud) InstanceID(name types.NodeName) (string, error) { var machine compute.VirtualMachine var exists bool var err error + az.operationPollRateLimiter.Accept() machine, exists, err = az.getVirtualMachine(name) if err != nil { if az.CloudProviderBackoff { diff --git a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go index d7da541ec9c..6121d26b12b 100644 --- a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go +++ b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go @@ -151,6 +151,7 @@ func (az *Cloud) EnsureLoadBalancer(clusterName string, service *v1.Service, nod // to nil. This is a workaround until https://github.com/Azure/go-autorest/issues/112 is fixed sg.SecurityGroupPropertiesFormat.NetworkInterfaces = nil sg.SecurityGroupPropertiesFormat.Subnets = nil + az.operationPollRateLimiter.Accept() resp, err := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *sg.Name, sg, nil) if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("ensure(%s) backing off: sg(%s) - updating", serviceName, *sg.Name) @@ -229,6 +230,7 @@ func (az *Cloud) EnsureLoadBalancer(clusterName string, service *v1.Service, nod } if !existsLb || lbNeedsUpdate { glog.V(3).Infof("ensure(%s): lb(%s) - updating", serviceName, lbName) + az.operationPollRateLimiter.Accept() resp, err := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("ensure(%s) backing off: lb(%s) - updating", serviceName, lbName) @@ -328,6 +330,7 @@ func (az *Cloud) EnsureLoadBalancerDeleted(clusterName string, service *v1.Servi // to nil. This is a workaround until https://github.com/Azure/go-autorest/issues/112 is fixed sg.SecurityGroupPropertiesFormat.NetworkInterfaces = nil sg.SecurityGroupPropertiesFormat.Subnets = nil + az.operationPollRateLimiter.Accept() resp, err := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *reconciledSg.Name, reconciledSg, nil) if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("delete(%s) backing off: sg(%s) - updating", serviceName, az.SecurityGroupName) @@ -365,6 +368,7 @@ func (az *Cloud) cleanupLoadBalancer(clusterName string, service *v1.Service, is if lbNeedsUpdate { if len(*lb.FrontendIPConfigurations) > 0 { glog.V(3).Infof("delete(%s): lb(%s) - updating", serviceName, lbName) + az.operationPollRateLimiter.Accept() resp, err := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("delete(%s) backing off: sg(%s) - updating", serviceName, az.SecurityGroupName) @@ -380,6 +384,7 @@ func (az *Cloud) cleanupLoadBalancer(clusterName string, service *v1.Service, is } else { glog.V(3).Infof("delete(%s): lb(%s) - deleting; no remaining frontendipconfigs", serviceName, lbName) + az.operationPollRateLimiter.Accept() resp, err := az.LoadBalancerClient.Delete(az.ResourceGroup, lbName, nil) if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("delete(%s) backing off: lb(%s) - deleting; no remaining frontendipconfigs", serviceName, lbName) @@ -434,6 +439,7 @@ func (az *Cloud) ensurePublicIPExists(serviceName, pipName string) (*network.Pub pip.Tags = &map[string]*string{"service": &serviceName} glog.V(3).Infof("ensure(%s): pip(%s) - creating", serviceName, *pip.Name) + az.operationPollRateLimiter.Accept() resp, err := az.PublicIPAddressesClient.CreateOrUpdate(az.ResourceGroup, *pip.Name, pip, nil) if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("ensure(%s) backing off: pip(%s) - creating", serviceName, *pip.Name) @@ -459,6 +465,7 @@ func (az *Cloud) ensurePublicIPExists(serviceName, pipName string) (*network.Pub func (az *Cloud) ensurePublicIPDeleted(serviceName, pipName string) error { glog.V(2).Infof("ensure(%s): pip(%s) - deleting", serviceName, pipName) + az.operationPollRateLimiter.Accept() resp, deleteErr := az.PublicIPAddressesClient.Delete(az.ResourceGroup, pipName, nil) if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, deleteErr) { glog.V(2).Infof("ensure(%s) backing off: pip(%s) - deleting", serviceName, pipName) @@ -910,6 +917,7 @@ func (az *Cloud) ensureHostInPool(serviceName string, nodeName types.NodeName, b primaryIPConfig.LoadBalancerBackendAddressPools = &newBackendPools glog.V(3).Infof("nicupdate(%s): nic(%s) - updating", serviceName, nicName) + az.operationPollRateLimiter.Accept() resp, err := az.InterfacesClient.CreateOrUpdate(az.ResourceGroup, *nic.Name, nic, nil) if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("nicupdate(%s) backing off: nic(%s) - updating, err=%v", serviceName, nicName, err) diff --git a/pkg/cloudprovider/providers/azure/azure_routes.go b/pkg/cloudprovider/providers/azure/azure_routes.go index 8332c9d124f..0d7a23ebfd8 100644 --- a/pkg/cloudprovider/providers/azure/azure_routes.go +++ b/pkg/cloudprovider/providers/azure/azure_routes.go @@ -77,6 +77,7 @@ func (az *Cloud) CreateRoute(clusterName string, nameHint string, kubeRoute *clo } glog.V(3).Infof("create: creating routetable. routeTableName=%q", az.RouteTableName) + az.operationPollRateLimiter.Accept() resp, err := az.RouteTablesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, routeTable, nil) if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("create backing off: creating routetable. routeTableName=%q", az.RouteTableName) @@ -112,6 +113,7 @@ func (az *Cloud) CreateRoute(clusterName string, nameHint string, kubeRoute *clo } glog.V(3).Infof("create: creating route: instance=%q cidr=%q", kubeRoute.TargetNode, kubeRoute.DestinationCIDR) + az.operationPollRateLimiter.Accept() resp, err := az.RoutesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, *route.Name, route, nil) if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("create backing off: creating route: instance=%q cidr=%q", kubeRoute.TargetNode, kubeRoute.DestinationCIDR) @@ -135,6 +137,7 @@ func (az *Cloud) DeleteRoute(clusterName string, kubeRoute *cloudprovider.Route) glog.V(2).Infof("delete: deleting route. clusterName=%q instance=%q cidr=%q", clusterName, kubeRoute.TargetNode, kubeRoute.DestinationCIDR) routeName := mapNodeNameToRouteName(kubeRoute.TargetNode) + az.operationPollRateLimiter.Accept() resp, err := az.RoutesClient.Delete(az.ResourceGroup, az.RouteTableName, routeName, nil) if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("delete backing off: deleting route. clusterName=%q instance=%q cidr=%q", clusterName, kubeRoute.TargetNode, kubeRoute.DestinationCIDR) diff --git a/pkg/cloudprovider/providers/azure/azure_storage.go b/pkg/cloudprovider/providers/azure/azure_storage.go index bb09cc1c5e5..b810480ab47 100644 --- a/pkg/cloudprovider/providers/azure/azure_storage.go +++ b/pkg/cloudprovider/providers/azure/azure_storage.go @@ -65,6 +65,7 @@ func (az *Cloud) AttachDisk(diskName, diskURI string, nodeName types.NodeName, l } vmName := mapNodeNameToVMName(nodeName) glog.V(2).Infof("create(%s): vm(%s)", az.ResourceGroup, vmName) + az.operationPollRateLimiter.Accept() resp, err := az.VirtualMachinesClient.CreateOrUpdate(az.ResourceGroup, vmName, newVM, nil) if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("create(%s) backing off: vm(%s)", az.ResourceGroup, vmName) @@ -145,6 +146,7 @@ func (az *Cloud) DetachDiskByName(diskName, diskURI string, nodeName types.NodeN } vmName := mapNodeNameToVMName(nodeName) glog.V(2).Infof("create(%s): vm(%s)", az.ResourceGroup, vmName) + az.operationPollRateLimiter.Accept() resp, err := az.VirtualMachinesClient.CreateOrUpdate(az.ResourceGroup, vmName, newVM, nil) if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { glog.V(2).Infof("create(%s) backing off: vm(%s)", az.ResourceGroup, vmName) From 2accbbd61819d6779a5dd44007f48a121b165cc0 Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Tue, 6 Jun 2017 22:12:49 -0700 Subject: [PATCH 12/13] go vet errata --- pkg/cloudprovider/providers/azure/azure_instances.go | 2 +- pkg/cloudprovider/providers/azure/azure_util.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/cloudprovider/providers/azure/azure_instances.go b/pkg/cloudprovider/providers/azure/azure_instances.go index 13ec02170f9..fdd0bbdd9ca 100644 --- a/pkg/cloudprovider/providers/azure/azure_instances.go +++ b/pkg/cloudprovider/providers/azure/azure_instances.go @@ -131,7 +131,7 @@ func (az *Cloud) listAllNodesInResourceGroup() ([]compute.VirtualMachine, error) az.operationPollRateLimiter.Accept() result, err = az.VirtualMachinesClient.ListAllNextResults(result) if err != nil { - glog.Errorf("error: az.listAllNodesInResourceGroup(), az.VirtualMachinesClient.ListAllNextResults(%s), err=%v", result, err) + glog.Errorf("error: az.listAllNodesInResourceGroup(), az.VirtualMachinesClient.ListAllNextResults(%v), err=%v", result, err) return nil, err } diff --git a/pkg/cloudprovider/providers/azure/azure_util.go b/pkg/cloudprovider/providers/azure/azure_util.go index 9ddeaca8ffa..853ea29e98b 100644 --- a/pkg/cloudprovider/providers/azure/azure_util.go +++ b/pkg/cloudprovider/providers/azure/azure_util.go @@ -249,7 +249,7 @@ func (az *Cloud) getIPForMachine(nodeName types.NodeName) (string, error) { nicID, err := getPrimaryInterfaceID(machine) if err != nil { - glog.Errorf("error: az.getIPForMachine(%s), getPrimaryInterfaceID(%s), err=%v", nodeName, machine, err) + glog.Errorf("error: az.getIPForMachine(%s), getPrimaryInterfaceID(%v), err=%v", nodeName, machine, err) return "", err } @@ -268,7 +268,7 @@ func (az *Cloud) getIPForMachine(nodeName types.NodeName) (string, error) { ipConfig, err := getPrimaryIPConfig(nic) if err != nil { - glog.Errorf("error: az.getIPForMachine(%s), getPrimaryIPConfig(%s), err=%v", nodeName, nic, err) + glog.Errorf("error: az.getIPForMachine(%s), getPrimaryIPConfig(%v), err=%v", nodeName, nic, err) return "", err } From acb65170f3cf3169090af68532e968fc5f8e30e3 Mon Sep 17 00:00:00 2001 From: Jack Francis Date: Tue, 6 Jun 2017 22:21:14 -0700 Subject: [PATCH 13/13] preferring float32 for rate limit QPS param --- pkg/cloudprovider/providers/azure/azure.go | 6 +++--- pkg/cloudprovider/providers/azure/azure_test.go | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/cloudprovider/providers/azure/azure.go b/pkg/cloudprovider/providers/azure/azure.go index 8532e0a3f5b..ca2e48b7e6c 100644 --- a/pkg/cloudprovider/providers/azure/azure.go +++ b/pkg/cloudprovider/providers/azure/azure.go @@ -40,7 +40,7 @@ import ( const ( // CloudProviderName is the value used for the --cloud-provider flag CloudProviderName = "azure" - rateLimitQPSDefault = 1 + rateLimitQPSDefault = 1.0 rateLimitBucketDefault = 5 backoffRetriesDefault = 6 backoffExponentDefault = 1.5 @@ -93,7 +93,7 @@ type Config struct { // Enable rate limiting CloudProviderRateLimit bool `json:"cloudProviderRateLimit" yaml:"cloudProviderRateLimit"` // Rate limit QPS - CloudProviderRateLimitQPS int `json:"cloudProviderRateLimitQPS" yaml:"cloudProviderRateLimitQPS"` + CloudProviderRateLimitQPS float32 `json:"cloudProviderRateLimitQPS" yaml:"cloudProviderRateLimitQPS"` // Rate limit Bucket Size CloudProviderRateLimitBucket int `json:"cloudProviderRateLimitBucket" yaml:"cloudProviderRateLimitBucket"` } @@ -216,7 +216,7 @@ func NewCloud(configReader io.Reader) (cloudprovider.Interface, error) { az.CloudProviderRateLimitBucket = rateLimitBucketDefault } az.operationPollRateLimiter = flowcontrol.NewTokenBucketRateLimiter( - float32(az.CloudProviderRateLimitQPS), + az.CloudProviderRateLimitQPS, az.CloudProviderRateLimitBucket) glog.V(2).Infof("Azure cloudprovider using rate limit config: QPS=%d, bucket=%d", az.CloudProviderRateLimitQPS, diff --git a/pkg/cloudprovider/providers/azure/azure_test.go b/pkg/cloudprovider/providers/azure/azure_test.go index f7cf5eca184..15f86751abe 100644 --- a/pkg/cloudprovider/providers/azure/azure_test.go +++ b/pkg/cloudprovider/providers/azure/azure_test.go @@ -598,7 +598,7 @@ func TestNewCloudFromJSON(t *testing.T) { "cloudProviderBackoffDuration": 5, "cloudProviderBackoffJitter": 1.0, "cloudProviderRatelimit": true, - "cloudProviderRateLimitQPS": 1, + "cloudProviderRateLimitQPS": 0.5, "cloudProviderRateLimitBucket": 5 }` validateConfig(t, config) @@ -638,7 +638,7 @@ cloudProviderBackoffExponent: 1.5 cloudProviderBackoffDuration: 5 cloudProviderBackoffJitter: 1.0 cloudProviderRatelimit: true -cloudProviderRateLimitQPS: 1 +cloudProviderRateLimitQPS: 0.5 cloudProviderRateLimitBucket: 5 ` validateConfig(t, config) @@ -698,7 +698,7 @@ func validateConfig(t *testing.T, config string) { if azureCloud.CloudProviderRateLimit != true { t.Errorf("got incorrect value for CloudProviderRateLimit") } - if azureCloud.CloudProviderRateLimitQPS != 1 { + if azureCloud.CloudProviderRateLimitQPS != 0.5 { t.Errorf("got incorrect value for CloudProviderRateLimitQPS") } if azureCloud.CloudProviderRateLimitBucket != 5 {