diff --git a/pkg/cloudprovider/providers/azure/azure.go b/pkg/cloudprovider/providers/azure/azure.go index a64e5730c42..d834935d0b0 100644 --- a/pkg/cloudprovider/providers/azure/azure.go +++ b/pkg/cloudprovider/providers/azure/azure.go @@ -59,6 +59,9 @@ const ( vmTypeVMSS = "vmss" vmTypeStandard = "standard" + backoffModeDefault = "default" + backoffModeV2 = "v2" + loadBalancerSkuBasic = "basic" loadBalancerSkuStandard = "standard" @@ -115,6 +118,12 @@ type Config struct { CloudProviderBackoffDuration int `json:"cloudProviderBackoffDuration" yaml:"cloudProviderBackoffDuration"` // Backoff jitter CloudProviderBackoffJitter float64 `json:"cloudProviderBackoffJitter" yaml:"cloudProviderBackoffJitter"` + // Backoff mode, options are v2 and default. + // * default means two-layer backoff retrying, one in the cloud provider and the other in the Azure SDK. + // * v2 means only backoff in the Azure SDK is used. In such mode, CloudProviderBackoffDuration and + // CloudProviderBackoffJitter are omitted. + // "default" will be used if not specified. + CloudProviderBackoffMode string `json:"cloudProviderBackoffMode" yaml:"cloudProviderBackoffMode"` // Enable rate limiting CloudProviderRateLimit bool `json:"cloudProviderRateLimit" yaml:"cloudProviderRateLimit"` // Rate limit QPS (Read) @@ -268,25 +277,71 @@ func NewCloud(configReader io.Reader) (cloudprovider.Interface, error) { config.CloudProviderRateLimitBucketWrite) } + // Conditionally configure resource request backoff + resourceRequestBackoff := wait.Backoff{ + Steps: 1, + } + if config.CloudProviderBackoff { + // Assign backoff defaults if no configuration was passed in + if config.CloudProviderBackoffRetries == 0 { + config.CloudProviderBackoffRetries = backoffRetriesDefault + } + if config.CloudProviderBackoffDuration == 0 { + config.CloudProviderBackoffDuration = backoffDurationDefault + } + if config.CloudProviderBackoffExponent == 0 { + config.CloudProviderBackoffExponent = backoffExponentDefault + } else if config.shouldOmitCloudProviderBackoff() { + klog.Warning("Azure cloud provider config 'cloudProviderBackoffExponent' has been deprecated for 'v2' backoff mode. 2 is always used as the backoff exponent.") + } + if config.CloudProviderBackoffJitter == 0 { + config.CloudProviderBackoffJitter = backoffJitterDefault + } else if config.shouldOmitCloudProviderBackoff() { + klog.Warning("Azure cloud provider config 'cloudProviderBackoffJitter' has been deprecated for 'v2' backoff mode.") + } + + if !config.shouldOmitCloudProviderBackoff() { + resourceRequestBackoff = wait.Backoff{ + Steps: config.CloudProviderBackoffRetries, + Factor: config.CloudProviderBackoffExponent, + Duration: time.Duration(config.CloudProviderBackoffDuration) * time.Second, + Jitter: config.CloudProviderBackoffJitter, + } + } + klog.V(2).Infof("Azure cloudprovider using try backoff: retries=%d, exponent=%f, duration=%d, jitter=%f", + config.CloudProviderBackoffRetries, + config.CloudProviderBackoffExponent, + config.CloudProviderBackoffDuration, + config.CloudProviderBackoffJitter) + } else { + // CloudProviderBackoffRetries will be set to 1 by default as the requirements of Azure SDK. + config.CloudProviderBackoffRetries = 1 + config.CloudProviderBackoffDuration = backoffDurationDefault + } + // Do not add master nodes to standard LB by default. if config.ExcludeMasterFromStandardLB == nil { config.ExcludeMasterFromStandardLB = &defaultExcludeMasterFromStandardLB } azClientConfig := &azClientConfig{ - subscriptionID: config.SubscriptionID, - resourceManagerEndpoint: env.ResourceManagerEndpoint, - servicePrincipalToken: servicePrincipalToken, - rateLimiterReader: operationPollRateLimiter, - rateLimiterWriter: operationPollRateLimiterWrite, + subscriptionID: config.SubscriptionID, + resourceManagerEndpoint: env.ResourceManagerEndpoint, + servicePrincipalToken: servicePrincipalToken, + rateLimiterReader: operationPollRateLimiter, + rateLimiterWriter: operationPollRateLimiterWrite, + CloudProviderBackoffRetries: config.CloudProviderBackoffRetries, + CloudProviderBackoffDuration: config.CloudProviderBackoffDuration, + ShouldOmitCloudProviderBackoff: config.shouldOmitCloudProviderBackoff(), } az := Cloud{ - Config: *config, - Environment: *env, - nodeZones: map[string]sets.String{}, - nodeResourceGroups: map[string]string{}, - unmanagedNodes: sets.NewString(), - routeCIDRs: map[string]string{}, + Config: *config, + Environment: *env, + nodeZones: map[string]sets.String{}, + nodeResourceGroups: map[string]string{}, + unmanagedNodes: sets.NewString(), + routeCIDRs: map[string]string{}, + resourceRequestBackoff: resourceRequestBackoff, DisksClient: newAzDisksClient(azClientConfig), RoutesClient: newAzRoutesClient(azClientConfig), @@ -304,34 +359,6 @@ func NewCloud(configReader io.Reader) (cloudprovider.Interface, error) { FileClient: &azureFileClient{env: *env}, } - // Conditionally configure resource request backoff - if az.CloudProviderBackoff { - // Assign backoff defaults if no configuration was passed in - if az.CloudProviderBackoffRetries == 0 { - az.CloudProviderBackoffRetries = backoffRetriesDefault - } - if az.CloudProviderBackoffExponent == 0 { - az.CloudProviderBackoffExponent = backoffExponentDefault - } - if az.CloudProviderBackoffDuration == 0 { - az.CloudProviderBackoffDuration = backoffDurationDefault - } - if az.CloudProviderBackoffJitter == 0 { - az.CloudProviderBackoffJitter = backoffJitterDefault - } - az.resourceRequestBackoff = wait.Backoff{ - Steps: az.CloudProviderBackoffRetries, - Factor: az.CloudProviderBackoffExponent, - Duration: time.Duration(az.CloudProviderBackoffDuration) * time.Second, - Jitter: az.CloudProviderBackoffJitter, - } - klog.V(2).Infof("Azure cloudprovider using try backoff: retries=%d, exponent=%f, duration=%d, jitter=%f", - az.CloudProviderBackoffRetries, - az.CloudProviderBackoffExponent, - az.CloudProviderBackoffDuration, - az.CloudProviderBackoffJitter) - } - az.metadata, err = NewInstanceMetadataService(metadataURL) if err != nil { return nil, err diff --git a/pkg/cloudprovider/providers/azure/azure_backoff.go b/pkg/cloudprovider/providers/azure/azure_backoff.go index c0a6f4a34ee..2771bef26b1 100644 --- a/pkg/cloudprovider/providers/azure/azure_backoff.go +++ b/pkg/cloudprovider/providers/azure/azure_backoff.go @@ -17,19 +17,18 @@ limitations under the License. package azure import ( - "context" "fmt" "net/http" "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2018-10-01/compute" "github.com/Azure/azure-sdk-for-go/services/network/mgmt/2017-09-01/network" - "k8s.io/klog" "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" cloudprovider "k8s.io/cloud-provider" + "k8s.io/klog" ) // requestBackoff if backoff is disabled in cloud provider it @@ -43,7 +42,6 @@ func (az *Cloud) requestBackoff() (resourceRequestBackoff wait.Backoff) { resourceRequestBackoff = wait.Backoff{ Steps: 1, } - return resourceRequestBackoff } @@ -73,12 +71,11 @@ func (az *Cloud) GetVirtualMachineWithRetry(name types.NodeName) (compute.Virtua if err == wait.ErrWaitTimeout { err = retryErr } - return machine, err } -// VirtualMachineClientListWithRetry invokes az.VirtualMachinesClient.List with exponential backoff retry -func (az *Cloud) VirtualMachineClientListWithRetry(resourceGroup string) ([]compute.VirtualMachine, error) { +// ListVirtualMachinesWithRetry invokes az.VirtualMachinesClient.List with exponential backoff retry +func (az *Cloud) ListVirtualMachinesWithRetry(resourceGroup string) ([]compute.VirtualMachine, error) { allNodes := []compute.VirtualMachine{} err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { var retryErr error @@ -101,12 +98,38 @@ func (az *Cloud) VirtualMachineClientListWithRetry(resourceGroup string) ([]comp return allNodes, err } +// ListVirtualMachines invokes az.VirtualMachinesClient.List with exponential backoff retry +func (az *Cloud) ListVirtualMachines(resourceGroup string) ([]compute.VirtualMachine, error) { + if az.Config.shouldOmitCloudProviderBackoff() { + ctx, cancel := getContextWithCancel() + defer cancel() + + allNodes, err := az.VirtualMachinesClient.List(ctx, resourceGroup) + if err != nil { + klog.Errorf("VirtualMachinesClient.List(%v) failure with err=%v", resourceGroup, err) + return nil, err + } + klog.V(2).Infof("VirtualMachinesClient.List(%v) success", resourceGroup) + return allNodes, nil + } + + return az.ListVirtualMachinesWithRetry(resourceGroup) +} + +func (az *Cloud) getIPForMachine(nodeName types.NodeName) (string, string, error) { + if az.Config.shouldOmitCloudProviderBackoff() { + return az.vmSet.GetIPByNodeName(string(nodeName)) + } + + return az.GetIPForMachineWithRetry(nodeName) +} + // GetIPForMachineWithRetry invokes az.getIPForMachine with exponential backoff retry func (az *Cloud) GetIPForMachineWithRetry(name types.NodeName) (string, string, error) { var ip, publicIP string err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { var retryErr error - ip, publicIP, retryErr = az.getIPForMachine(name) + ip, publicIP, retryErr = az.vmSet.GetIPByNodeName(string(name)) if retryErr != nil { klog.Errorf("GetIPForMachineWithRetry(%s): backoff failure, will retry,err=%v", name, retryErr) return false, nil @@ -117,6 +140,28 @@ func (az *Cloud) GetIPForMachineWithRetry(name types.NodeName) (string, string, return ip, publicIP, err } +// CreateOrUpdateSecurityGroup invokes az.SecurityGroupsClient.CreateOrUpdate with exponential backoff retry +func (az *Cloud) CreateOrUpdateSecurityGroup(service *v1.Service, sg network.SecurityGroup) error { + if az.Config.shouldOmitCloudProviderBackoff() { + ctx, cancel := getContextWithCancel() + defer cancel() + + resp, err := az.SecurityGroupsClient.CreateOrUpdate(ctx, az.ResourceGroup, *sg.Name, sg) + klog.V(10).Infof("SecurityGroupsClient.CreateOrUpdate(%s): end", *sg.Name) + if err == nil { + if isSuccessHTTPResponse(resp) { + // Invalidate the cache right after updating + az.nsgCache.Delete(*sg.Name) + } else if resp != nil { + return fmt.Errorf("HTTP response %q", resp.Status) + } + } + return err + } + + return az.CreateOrUpdateSGWithRetry(service, sg) +} + // CreateOrUpdateSGWithRetry invokes az.SecurityGroupsClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateSGWithRetry(service *v1.Service, sg network.SecurityGroup) error { return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { @@ -134,8 +179,30 @@ func (az *Cloud) CreateOrUpdateSGWithRetry(service *v1.Service, sg network.Secur }) } -// CreateOrUpdateLBWithRetry invokes az.LoadBalancerClient.CreateOrUpdate with exponential backoff retry -func (az *Cloud) CreateOrUpdateLBWithRetry(service *v1.Service, lb network.LoadBalancer) error { +// CreateOrUpdateLB invokes az.LoadBalancerClient.CreateOrUpdate with exponential backoff retry +func (az *Cloud) CreateOrUpdateLB(service *v1.Service, lb network.LoadBalancer) error { + if az.Config.shouldOmitCloudProviderBackoff() { + ctx, cancel := getContextWithCancel() + defer cancel() + + resp, err := az.LoadBalancerClient.CreateOrUpdate(ctx, az.ResourceGroup, *lb.Name, lb) + klog.V(10).Infof("LoadBalancerClient.CreateOrUpdate(%s): end", *lb.Name) + if err == nil { + if isSuccessHTTPResponse(resp) { + // Invalidate the cache right after updating + az.lbCache.Delete(*lb.Name) + } else if resp != nil { + return fmt.Errorf("HTTP response %q", resp.Status) + } + } + return err + } + + return az.createOrUpdateLBWithRetry(service, lb) +} + +// createOrUpdateLBWithRetry invokes az.LoadBalancerClient.CreateOrUpdate with exponential backoff retry +func (az *Cloud) createOrUpdateLBWithRetry(service *v1.Service, lb network.LoadBalancer) error { return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { ctx, cancel := getContextWithCancel() defer cancel() @@ -151,8 +218,27 @@ func (az *Cloud) CreateOrUpdateLBWithRetry(service *v1.Service, lb network.LoadB }) } -// ListLBWithRetry invokes az.LoadBalancerClient.List with exponential backoff retry -func (az *Cloud) ListLBWithRetry(service *v1.Service) ([]network.LoadBalancer, error) { +// ListLB invokes az.LoadBalancerClient.List with exponential backoff retry +func (az *Cloud) ListLB(service *v1.Service) ([]network.LoadBalancer, error) { + if az.Config.shouldOmitCloudProviderBackoff() { + ctx, cancel := getContextWithCancel() + defer cancel() + + allLBs, err := az.LoadBalancerClient.List(ctx, az.ResourceGroup) + if err != nil { + az.Event(service, v1.EventTypeWarning, "ListLoadBalancers", err.Error()) + klog.Errorf("LoadBalancerClient.List(%v) failure with err=%v", az.ResourceGroup, err) + return nil, err + } + klog.V(2).Infof("LoadBalancerClient.List(%v) success", az.ResourceGroup) + return allLBs, nil + } + + return az.listLBWithRetry(service) +} + +// listLBWithRetry invokes az.LoadBalancerClient.List with exponential backoff retry +func (az *Cloud) listLBWithRetry(service *v1.Service) ([]network.LoadBalancer, error) { var allLBs []network.LoadBalancer err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { @@ -178,8 +264,27 @@ func (az *Cloud) ListLBWithRetry(service *v1.Service) ([]network.LoadBalancer, e return allLBs, nil } -// ListPIPWithRetry list the PIP resources in the given resource group -func (az *Cloud) ListPIPWithRetry(service *v1.Service, pipResourceGroup string) ([]network.PublicIPAddress, error) { +// ListPIP list the PIP resources in the given resource group +func (az *Cloud) ListPIP(service *v1.Service, pipResourceGroup string) ([]network.PublicIPAddress, error) { + if az.Config.shouldOmitCloudProviderBackoff() { + ctx, cancel := getContextWithCancel() + defer cancel() + + allPIPs, err := az.PublicIPAddressesClient.List(ctx, pipResourceGroup) + if err != nil { + az.Event(service, v1.EventTypeWarning, "ListPublicIPs", err.Error()) + klog.Errorf("PublicIPAddressesClient.List(%v) failure with err=%v", pipResourceGroup, err) + return nil, err + } + klog.V(2).Infof("PublicIPAddressesClient.List(%v) success", pipResourceGroup) + return allPIPs, nil + } + + return az.listPIPWithRetry(service, pipResourceGroup) +} + +// listPIPWithRetry list the PIP resources in the given resource group +func (az *Cloud) listPIPWithRetry(service *v1.Service, pipResourceGroup string) ([]network.PublicIPAddress, error) { var allPIPs []network.PublicIPAddress err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { @@ -205,8 +310,22 @@ func (az *Cloud) ListPIPWithRetry(service *v1.Service, pipResourceGroup string) return allPIPs, nil } -// CreateOrUpdatePIPWithRetry invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry -func (az *Cloud) CreateOrUpdatePIPWithRetry(service *v1.Service, pipResourceGroup string, pip network.PublicIPAddress) error { +// CreateOrUpdatePIP invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry +func (az *Cloud) CreateOrUpdatePIP(service *v1.Service, pipResourceGroup string, pip network.PublicIPAddress) error { + if az.Config.shouldOmitCloudProviderBackoff() { + ctx, cancel := getContextWithCancel() + defer cancel() + + resp, err := az.PublicIPAddressesClient.CreateOrUpdate(ctx, pipResourceGroup, *pip.Name, pip) + klog.V(10).Infof("PublicIPAddressesClient.CreateOrUpdate(%s, %s): end", pipResourceGroup, *pip.Name) + return az.processHTTPResponse(service, "CreateOrUpdatePublicIPAddress", resp, err) + } + + return az.createOrUpdatePIPWithRetry(service, pipResourceGroup, pip) +} + +// createOrUpdatePIPWithRetry invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry +func (az *Cloud) createOrUpdatePIPWithRetry(service *v1.Service, pipResourceGroup string, pip network.PublicIPAddress) error { return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { ctx, cancel := getContextWithCancel() defer cancel() @@ -217,8 +336,22 @@ func (az *Cloud) CreateOrUpdatePIPWithRetry(service *v1.Service, pipResourceGrou }) } -// CreateOrUpdateInterfaceWithRetry invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry -func (az *Cloud) CreateOrUpdateInterfaceWithRetry(service *v1.Service, nic network.Interface) error { +// CreateOrUpdateInterface invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry +func (az *Cloud) CreateOrUpdateInterface(service *v1.Service, nic network.Interface) error { + if az.Config.shouldOmitCloudProviderBackoff() { + ctx, cancel := getContextWithCancel() + defer cancel() + + resp, err := az.InterfacesClient.CreateOrUpdate(ctx, az.ResourceGroup, *nic.Name, nic) + klog.V(10).Infof("InterfacesClient.CreateOrUpdate(%s): end", *nic.Name) + return az.processHTTPResponse(service, "CreateOrUpdateInterface", resp, err) + } + + return az.createOrUpdateInterfaceWithRetry(service, nic) +} + +// createOrUpdateInterfaceWithRetry invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry +func (az *Cloud) createOrUpdateInterfaceWithRetry(service *v1.Service, nic network.Interface) error { return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { ctx, cancel := getContextWithCancel() defer cancel() @@ -229,8 +362,21 @@ func (az *Cloud) CreateOrUpdateInterfaceWithRetry(service *v1.Service, nic netwo }) } -// DeletePublicIPWithRetry invokes az.PublicIPAddressesClient.Delete with exponential backoff retry -func (az *Cloud) DeletePublicIPWithRetry(service *v1.Service, pipResourceGroup string, pipName string) error { +// DeletePublicIP invokes az.PublicIPAddressesClient.Delete with exponential backoff retry +func (az *Cloud) DeletePublicIP(service *v1.Service, pipResourceGroup string, pipName string) error { + if az.Config.shouldOmitCloudProviderBackoff() { + ctx, cancel := getContextWithCancel() + defer cancel() + + resp, err := az.PublicIPAddressesClient.Delete(ctx, pipResourceGroup, pipName) + return az.processHTTPResponse(service, "DeletePublicIPAddress", resp, err) + } + + return az.deletePublicIPWithRetry(service, pipResourceGroup, pipName) +} + +// deletePublicIPWithRetry invokes az.PublicIPAddressesClient.Delete with exponential backoff retry +func (az *Cloud) deletePublicIPWithRetry(service *v1.Service, pipResourceGroup string, pipName string) error { return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { ctx, cancel := getContextWithCancel() defer cancel() @@ -240,8 +386,29 @@ func (az *Cloud) DeletePublicIPWithRetry(service *v1.Service, pipResourceGroup s }) } -// DeleteLBWithRetry invokes az.LoadBalancerClient.Delete with exponential backoff retry -func (az *Cloud) DeleteLBWithRetry(service *v1.Service, lbName string) error { +// DeleteLB invokes az.LoadBalancerClient.Delete with exponential backoff retry +func (az *Cloud) DeleteLB(service *v1.Service, lbName string) error { + if az.Config.shouldOmitCloudProviderBackoff() { + ctx, cancel := getContextWithCancel() + defer cancel() + + resp, err := az.LoadBalancerClient.Delete(ctx, az.ResourceGroup, lbName) + if err == nil { + if isSuccessHTTPResponse(resp) { + // Invalidate the cache right after updating + az.lbCache.Delete(lbName) + } else if resp != nil { + return fmt.Errorf("HTTP response %q", resp.Status) + } + } + return err + } + + return az.deleteLBWithRetry(service, lbName) +} + +// deleteLBWithRetry invokes az.LoadBalancerClient.Delete with exponential backoff retry +func (az *Cloud) deleteLBWithRetry(service *v1.Service, lbName string) error { return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { ctx, cancel := getContextWithCancel() defer cancel() @@ -256,8 +423,21 @@ func (az *Cloud) DeleteLBWithRetry(service *v1.Service, lbName string) error { }) } -// CreateOrUpdateRouteTableWithRetry invokes az.RouteTablesClient.CreateOrUpdate with exponential backoff retry -func (az *Cloud) CreateOrUpdateRouteTableWithRetry(routeTable network.RouteTable) error { +// CreateOrUpdateRouteTable invokes az.RouteTablesClient.CreateOrUpdate with exponential backoff retry +func (az *Cloud) CreateOrUpdateRouteTable(routeTable network.RouteTable) error { + if az.Config.shouldOmitCloudProviderBackoff() { + ctx, cancel := getContextWithCancel() + defer cancel() + + resp, err := az.RouteTablesClient.CreateOrUpdate(ctx, az.ResourceGroup, az.RouteTableName, routeTable) + return az.processHTTPResponse(nil, "", resp, err) + } + + return az.createOrUpdateRouteTableWithRetry(routeTable) +} + +// createOrUpdateRouteTableWithRetry invokes az.RouteTablesClient.CreateOrUpdate with exponential backoff retry +func (az *Cloud) createOrUpdateRouteTableWithRetry(routeTable network.RouteTable) error { return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { ctx, cancel := getContextWithCancel() defer cancel() @@ -267,8 +447,22 @@ func (az *Cloud) CreateOrUpdateRouteTableWithRetry(routeTable network.RouteTable }) } -// CreateOrUpdateRouteWithRetry invokes az.RoutesClient.CreateOrUpdate with exponential backoff retry -func (az *Cloud) CreateOrUpdateRouteWithRetry(route network.Route) error { +// CreateOrUpdateRoute invokes az.RoutesClient.CreateOrUpdate with exponential backoff retry +func (az *Cloud) CreateOrUpdateRoute(route network.Route) error { + if az.Config.shouldOmitCloudProviderBackoff() { + ctx, cancel := getContextWithCancel() + defer cancel() + + resp, err := az.RoutesClient.CreateOrUpdate(ctx, az.ResourceGroup, az.RouteTableName, *route.Name, route) + klog.V(10).Infof("RoutesClient.CreateOrUpdate(%s): end", *route.Name) + return az.processHTTPResponse(nil, "", resp, err) + } + + return az.createOrUpdateRouteWithRetry(route) +} + +// createOrUpdateRouteWithRetry invokes az.RoutesClient.CreateOrUpdate with exponential backoff retry +func (az *Cloud) createOrUpdateRouteWithRetry(route network.Route) error { return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { ctx, cancel := getContextWithCancel() defer cancel() @@ -279,14 +473,28 @@ func (az *Cloud) CreateOrUpdateRouteWithRetry(route network.Route) error { }) } -// DeleteRouteWithRetry invokes az.RoutesClient.Delete with exponential backoff retry -func (az *Cloud) DeleteRouteWithRetry(routeName string) error { +// DeleteRouteWithName invokes az.RoutesClient.CreateOrUpdate with exponential backoff retry +func (az *Cloud) DeleteRouteWithName(routeName string) error { + if az.Config.shouldOmitCloudProviderBackoff() { + ctx, cancel := getContextWithCancel() + defer cancel() + + resp, err := az.RoutesClient.Delete(ctx, az.ResourceGroup, az.RouteTableName, routeName) + klog.V(10).Infof("RoutesClient.Delete(%s,%s): end", az.RouteTableName, routeName) + return az.processHTTPResponse(nil, "", resp, err) + } + + return az.deleteRouteWithRetry(routeName) +} + +// deleteRouteWithRetry invokes az.RoutesClient.Delete with exponential backoff retry +func (az *Cloud) deleteRouteWithRetry(routeName string) error { return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { ctx, cancel := getContextWithCancel() defer cancel() resp, err := az.RoutesClient.Delete(ctx, az.ResourceGroup, az.RouteTableName, routeName) - klog.V(10).Infof("RoutesClient.Delete(%s): end", az.RouteTableName) + klog.V(10).Infof("RoutesClient.Delete(%s,%s): end", az.RouteTableName, routeName) return az.processHTTPRetryResponse(nil, "", resp, err) }) } @@ -303,21 +511,17 @@ func (az *Cloud) CreateOrUpdateVMWithRetry(resourceGroup, vmName string, newVM c }) } -// UpdateVmssVMWithRetry invokes az.VirtualMachineScaleSetVMsClient.Update with exponential backoff retry -func (az *Cloud) UpdateVmssVMWithRetry(ctx context.Context, resourceGroupName string, VMScaleSetName string, instanceID string, parameters compute.VirtualMachineScaleSetVM) error { - return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { - resp, err := az.VirtualMachineScaleSetVMsClient.Update(ctx, resourceGroupName, VMScaleSetName, instanceID, parameters) - klog.V(10).Infof("VirtualMachinesClient.CreateOrUpdate(%s,%s): end", VMScaleSetName, instanceID) - return az.processHTTPRetryResponse(nil, "", resp, err) - }) -} - // isSuccessHTTPResponse determines if the response from an HTTP request suggests success -func isSuccessHTTPResponse(resp http.Response) bool { +func isSuccessHTTPResponse(resp *http.Response) bool { + if resp == nil { + return false + } + // HTTP 2xx suggests a successful response if 199 < resp.StatusCode && resp.StatusCode < 300 { return true } + return false } @@ -337,7 +541,7 @@ func shouldRetryHTTPRequest(resp *http.Response, err error) bool { } func (az *Cloud) processHTTPRetryResponse(service *v1.Service, reason string, resp *http.Response, err error) (bool, error) { - if resp != nil && isSuccessHTTPResponse(*resp) { + if resp != nil && isSuccessHTTPResponse(resp) { // HTTP 2xx suggests a successful response return true, nil } @@ -358,3 +562,24 @@ func (az *Cloud) processHTTPRetryResponse(service *v1.Service, reason string, re // Fall-through: stop periodic backoff return true, nil } + +func (az *Cloud) processHTTPResponse(service *v1.Service, reason string, resp *http.Response, err error) error { + if isSuccessHTTPResponse(resp) { + // HTTP 2xx suggests a successful response + return nil + } + + if err != nil { + az.Event(service, v1.EventTypeWarning, reason, err.Error()) + klog.Errorf("processHTTPRetryResponse failure with err: %v", err) + } else if resp != nil { + az.Event(service, v1.EventTypeWarning, reason, fmt.Sprintf("Azure HTTP response %d", resp.StatusCode)) + klog.Errorf("processHTTPRetryResponse failure with HTTP response %q", resp.Status) + } + + return err +} + +func (cfg *Config) shouldOmitCloudProviderBackoff() bool { + return cfg.CloudProviderBackoffMode == backoffModeV2 +} diff --git a/pkg/cloudprovider/providers/azure/azure_backoff_test.go b/pkg/cloudprovider/providers/azure/azure_backoff_test.go index 767c7cc30e3..d0a1399e27c 100644 --- a/pkg/cloudprovider/providers/azure/azure_backoff_test.go +++ b/pkg/cloudprovider/providers/azure/azure_backoff_test.go @@ -50,7 +50,6 @@ func TestShouldRetryHTTPRequest(t *testing.T) { expected: false, }, } - for _, test := range tests { resp := &http.Response{ StatusCode: test.code, @@ -85,7 +84,7 @@ func TestIsSuccessResponse(t *testing.T) { resp := http.Response{ StatusCode: test.code, } - res := isSuccessHTTPResponse(resp) + res := isSuccessHTTPResponse(&resp) if res != test.expected { t.Errorf("expected: %v, saw: %v", test.expected, res) } diff --git a/pkg/cloudprovider/providers/azure/azure_client.go b/pkg/cloudprovider/providers/azure/azure_client.go index 94a97034275..e7d2a9d6cec 100644 --- a/pkg/cloudprovider/providers/azure/azure_client.go +++ b/pkg/cloudprovider/providers/azure/azure_client.go @@ -145,6 +145,10 @@ type azClientConfig struct { //Details: https://docs.microsoft.com/en-us/azure/azure-resource-manager/resource-manager-request-limits rateLimiterReader flowcontrol.RateLimiter rateLimiterWriter flowcontrol.RateLimiter + + CloudProviderBackoffRetries int + CloudProviderBackoffDuration int + ShouldOmitCloudProviderBackoff bool } // azVirtualMachinesClient implements VirtualMachinesClient. @@ -163,6 +167,10 @@ func newAzVirtualMachinesClient(config *azClientConfig) *azVirtualMachinesClient virtualMachinesClient.BaseURI = config.resourceManagerEndpoint virtualMachinesClient.Authorizer = autorest.NewBearerAuthorizer(config.servicePrincipalToken) virtualMachinesClient.PollingDelay = 5 * time.Second + if config.ShouldOmitCloudProviderBackoff { + virtualMachinesClient.RetryAttempts = config.CloudProviderBackoffRetries + virtualMachinesClient.RetryDuration = time.Duration(config.CloudProviderBackoffDuration) * time.Second + } configureUserAgent(&virtualMachinesClient.Client) return &azVirtualMachinesClient{ @@ -254,6 +262,10 @@ func newAzInterfacesClient(config *azClientConfig) *azInterfacesClient { interfacesClient.BaseURI = config.resourceManagerEndpoint interfacesClient.Authorizer = autorest.NewBearerAuthorizer(config.servicePrincipalToken) interfacesClient.PollingDelay = 5 * time.Second + if config.ShouldOmitCloudProviderBackoff { + interfacesClient.RetryAttempts = config.CloudProviderBackoffRetries + interfacesClient.RetryDuration = time.Duration(config.CloudProviderBackoffDuration) * time.Second + } configureUserAgent(&interfacesClient.Client) return &azInterfacesClient{ @@ -333,6 +345,10 @@ func newAzLoadBalancersClient(config *azClientConfig) *azLoadBalancersClient { loadBalancerClient.BaseURI = config.resourceManagerEndpoint loadBalancerClient.Authorizer = autorest.NewBearerAuthorizer(config.servicePrincipalToken) loadBalancerClient.PollingDelay = 5 * time.Second + if config.ShouldOmitCloudProviderBackoff { + loadBalancerClient.RetryAttempts = config.CloudProviderBackoffRetries + loadBalancerClient.RetryDuration = time.Duration(config.CloudProviderBackoffDuration) * time.Second + } configureUserAgent(&loadBalancerClient.Client) return &azLoadBalancersClient{ @@ -449,6 +465,10 @@ func newAzPublicIPAddressesClient(config *azClientConfig) *azPublicIPAddressesCl publicIPAddressClient.BaseURI = config.resourceManagerEndpoint publicIPAddressClient.Authorizer = autorest.NewBearerAuthorizer(config.servicePrincipalToken) publicIPAddressClient.PollingDelay = 5 * time.Second + if config.ShouldOmitCloudProviderBackoff { + publicIPAddressClient.RetryAttempts = config.CloudProviderBackoffRetries + publicIPAddressClient.RetryDuration = time.Duration(config.CloudProviderBackoffDuration) * time.Second + } configureUserAgent(&publicIPAddressClient.Client) return &azPublicIPAddressesClient{ @@ -564,6 +584,10 @@ func newAzSubnetsClient(config *azClientConfig) *azSubnetsClient { subnetsClient.BaseURI = config.resourceManagerEndpoint subnetsClient.Authorizer = autorest.NewBearerAuthorizer(config.servicePrincipalToken) subnetsClient.PollingDelay = 5 * time.Second + if config.ShouldOmitCloudProviderBackoff { + subnetsClient.RetryAttempts = config.CloudProviderBackoffRetries + subnetsClient.RetryDuration = time.Duration(config.CloudProviderBackoffDuration) * time.Second + } configureUserAgent(&subnetsClient.Client) return &azSubnetsClient{ @@ -679,6 +703,10 @@ func newAzSecurityGroupsClient(config *azClientConfig) *azSecurityGroupsClient { securityGroupsClient.BaseURI = config.resourceManagerEndpoint securityGroupsClient.Authorizer = autorest.NewBearerAuthorizer(config.servicePrincipalToken) securityGroupsClient.PollingDelay = 5 * time.Second + if config.ShouldOmitCloudProviderBackoff { + securityGroupsClient.RetryAttempts = config.CloudProviderBackoffRetries + securityGroupsClient.RetryDuration = time.Duration(config.CloudProviderBackoffDuration) * time.Second + } configureUserAgent(&securityGroupsClient.Client) return &azSecurityGroupsClient{ @@ -794,6 +822,10 @@ func newAzVirtualMachineScaleSetsClient(config *azClientConfig) *azVirtualMachin virtualMachineScaleSetsClient.BaseURI = config.resourceManagerEndpoint virtualMachineScaleSetsClient.Authorizer = autorest.NewBearerAuthorizer(config.servicePrincipalToken) virtualMachineScaleSetsClient.PollingDelay = 5 * time.Second + if config.ShouldOmitCloudProviderBackoff { + virtualMachineScaleSetsClient.RetryAttempts = config.CloudProviderBackoffRetries + virtualMachineScaleSetsClient.RetryDuration = time.Duration(config.CloudProviderBackoffDuration) * time.Second + } configureUserAgent(&virtualMachineScaleSetsClient.Client) return &azVirtualMachineScaleSetsClient{ @@ -910,6 +942,10 @@ func newAzVirtualMachineScaleSetVMsClient(config *azClientConfig) *azVirtualMach virtualMachineScaleSetVMsClient.BaseURI = config.resourceManagerEndpoint virtualMachineScaleSetVMsClient.Authorizer = autorest.NewBearerAuthorizer(config.servicePrincipalToken) virtualMachineScaleSetVMsClient.PollingDelay = 5 * time.Second + if config.ShouldOmitCloudProviderBackoff { + virtualMachineScaleSetVMsClient.RetryAttempts = config.CloudProviderBackoffRetries + virtualMachineScaleSetVMsClient.RetryDuration = time.Duration(config.CloudProviderBackoffDuration) * time.Second + } configureUserAgent(&virtualMachineScaleSetVMsClient.Client) return &azVirtualMachineScaleSetVMsClient{ @@ -1018,6 +1054,10 @@ func newAzRoutesClient(config *azClientConfig) *azRoutesClient { routesClient.BaseURI = config.resourceManagerEndpoint routesClient.Authorizer = autorest.NewBearerAuthorizer(config.servicePrincipalToken) routesClient.PollingDelay = 5 * time.Second + if config.ShouldOmitCloudProviderBackoff { + routesClient.RetryAttempts = config.CloudProviderBackoffRetries + routesClient.RetryDuration = time.Duration(config.CloudProviderBackoffDuration) * time.Second + } configureUserAgent(&routesClient.Client) return &azRoutesClient{ @@ -1087,6 +1127,10 @@ func newAzRouteTablesClient(config *azClientConfig) *azRouteTablesClient { routeTablesClient.BaseURI = config.resourceManagerEndpoint routeTablesClient.Authorizer = autorest.NewBearerAuthorizer(config.servicePrincipalToken) routeTablesClient.PollingDelay = 5 * time.Second + if config.ShouldOmitCloudProviderBackoff { + routeTablesClient.RetryAttempts = config.CloudProviderBackoffRetries + routeTablesClient.RetryDuration = time.Duration(config.CloudProviderBackoffDuration) * time.Second + } configureUserAgent(&routeTablesClient.Client) return &azRouteTablesClient{ @@ -1148,6 +1192,10 @@ func newAzStorageAccountClient(config *azClientConfig) *azStorageAccountClient { storageAccountClient := storage.NewAccountsClientWithBaseURI(config.resourceManagerEndpoint, config.subscriptionID) storageAccountClient.Authorizer = autorest.NewBearerAuthorizer(config.servicePrincipalToken) storageAccountClient.PollingDelay = 5 * time.Second + if config.ShouldOmitCloudProviderBackoff { + storageAccountClient.RetryAttempts = config.CloudProviderBackoffRetries + storageAccountClient.RetryDuration = time.Duration(config.CloudProviderBackoffDuration) * time.Second + } configureUserAgent(&storageAccountClient.Client) return &azStorageAccountClient{ @@ -1259,6 +1307,10 @@ func newAzDisksClient(config *azClientConfig) *azDisksClient { disksClient := compute.NewDisksClientWithBaseURI(config.resourceManagerEndpoint, config.subscriptionID) disksClient.Authorizer = autorest.NewBearerAuthorizer(config.servicePrincipalToken) disksClient.PollingDelay = 5 * time.Second + if config.ShouldOmitCloudProviderBackoff { + disksClient.RetryAttempts = config.CloudProviderBackoffRetries + disksClient.RetryDuration = time.Duration(config.CloudProviderBackoffDuration) * time.Second + } configureUserAgent(&disksClient.Client) return &azDisksClient{ @@ -1345,6 +1397,10 @@ func newAzVirtualMachineSizesClient(config *azClientConfig) *azVirtualMachineSiz VirtualMachineSizesClient.BaseURI = config.resourceManagerEndpoint VirtualMachineSizesClient.Authorizer = autorest.NewBearerAuthorizer(config.servicePrincipalToken) VirtualMachineSizesClient.PollingDelay = 5 * time.Second + if config.ShouldOmitCloudProviderBackoff { + VirtualMachineSizesClient.RetryAttempts = config.CloudProviderBackoffRetries + VirtualMachineSizesClient.RetryDuration = time.Duration(config.CloudProviderBackoffDuration) * time.Second + } configureUserAgent(&VirtualMachineSizesClient.Client) return &azVirtualMachineSizesClient{ diff --git a/pkg/cloudprovider/providers/azure/azure_instances.go b/pkg/cloudprovider/providers/azure/azure_instances.go index 580983dc336..87acadfd59f 100644 --- a/pkg/cloudprovider/providers/azure/azure_instances.go +++ b/pkg/cloudprovider/providers/azure/azure_instances.go @@ -47,7 +47,7 @@ func (az *Cloud) NodeAddresses(ctx context.Context, name types.NodeName) ([]v1.N } addressGetter := func(nodeName types.NodeName) ([]v1.NodeAddress, error) { - ip, publicIP, err := az.GetIPForMachineWithRetry(nodeName) + ip, publicIP, err := az.getIPForMachine(nodeName) if err != nil { klog.V(2).Infof("NodeAddresses(%s) abort backoff: %v", nodeName, err) return nil, err diff --git a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go index 77bfdd20de2..bab5f1daaff 100644 --- a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go +++ b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go @@ -27,11 +27,11 @@ import ( "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/sets" cloudprovider "k8s.io/cloud-provider" + "k8s.io/klog" serviceapi "k8s.io/kubernetes/pkg/api/v1/service" "github.com/Azure/azure-sdk-for-go/services/network/mgmt/2017-09-01/network" "github.com/Azure/go-autorest/autorest/to" - "k8s.io/klog" ) const ( @@ -210,7 +210,7 @@ func (az *Cloud) getServiceLoadBalancer(service *v1.Service, clusterName string, primaryVMSetName := az.vmSet.GetPrimaryVMSetName() defaultLBName := az.getAzureLoadBalancerName(clusterName, primaryVMSetName, isInternal) - existingLBs, err := az.ListLBWithRetry(service) + existingLBs, err := az.ListLB(service) if err != nil { return nil, nil, false, err } @@ -387,7 +387,7 @@ func (az *Cloud) determinePublicIPName(clusterName string, service *v1.Service) pipResourceGroup := az.getPublicIPAddressResourceGroup(service) - pips, err := az.ListPIPWithRetry(service, pipResourceGroup) + pips, err := az.ListPIP(service, pipResourceGroup) if err != nil { return "", err } @@ -474,13 +474,13 @@ func (az *Cloud) ensurePublicIPExists(service *v1.Service, pipName string, domai } klog.V(2).Infof("ensurePublicIPExists for service(%s): pip(%s) - creating", serviceName, *pip.Name) - klog.V(10).Infof("CreateOrUpdatePIPWithRetry(%s, %q): start", pipResourceGroup, *pip.Name) - err = az.CreateOrUpdatePIPWithRetry(service, pipResourceGroup, pip) + klog.V(10).Infof("CreateOrUpdatePIP(%s, %q): start", pipResourceGroup, *pip.Name) + err = az.CreateOrUpdatePIP(service, pipResourceGroup, pip) if err != nil { klog.V(2).Infof("ensure(%s) abort backoff: pip(%s) - creating", serviceName, *pip.Name) return nil, err } - klog.V(10).Infof("CreateOrUpdatePIPWithRetry(%s, %q): end", pipResourceGroup, *pip.Name) + klog.V(10).Infof("CreateOrUpdatePIP(%s, %q): end", pipResourceGroup, *pip.Name) ctx, cancel := getContextWithCancel() defer cancel() @@ -818,16 +818,16 @@ func (az *Cloud) reconcileLoadBalancer(clusterName string, service *v1.Service, klog.V(10).Infof("EnsureBackendPoolDeleted(%s, %s): end", lbBackendPoolID, vmSetName) // Remove the LB. - klog.V(10).Infof("reconcileLoadBalancer: az.DeleteLBWithRetry(%q): start", lbName) - err = az.DeleteLBWithRetry(service, lbName) + klog.V(10).Infof("reconcileLoadBalancer: az.DeleteLB(%q): start", lbName) + err = az.DeleteLB(service, lbName) if err != nil { klog.V(2).Infof("reconcileLoadBalancer for service(%s) abort backoff: lb(%s) - deleting; no remaining frontendIPConfigurations", serviceName, lbName) return nil, err } - klog.V(10).Infof("az.DeleteLBWithRetry(%q): end", lbName) + klog.V(10).Infof("az.DeleteLB(%q): end", lbName) } else { klog.V(2).Infof("reconcileLoadBalancer: reconcileLoadBalancer for service(%s): lb(%s) - updating", serviceName, lbName) - err := az.CreateOrUpdateLBWithRetry(service, *lb) + err := az.CreateOrUpdateLB(service, *lb) if err != nil { klog.V(2).Infof("reconcileLoadBalancer for service(%s) abort backoff: lb(%s) - updating", serviceName, lbName) return nil, err @@ -1143,8 +1143,8 @@ func (az *Cloud) reconcileSecurityGroup(clusterName string, service *v1.Service, if dirtySg { sg.SecurityRules = &updatedRules klog.V(2).Infof("reconcileSecurityGroup for service(%s): sg(%s) - updating", serviceName, *sg.Name) - klog.V(10).Infof("CreateOrUpdateSGWithRetry(%q): start", *sg.Name) - err := az.CreateOrUpdateSGWithRetry(service, sg) + klog.V(10).Infof("CreateOrUpdateSecurityGroup(%q): start", *sg.Name) + err := az.CreateOrUpdateSecurityGroup(service, sg) if err != nil { klog.V(2).Infof("ensure(%s) abort backoff: sg(%s) - updating", serviceName, *sg.Name) // TODO (Nov 2017): remove when augmented security rules are out of preview @@ -1157,7 +1157,7 @@ func (az *Cloud) reconcileSecurityGroup(clusterName string, service *v1.Service, // END TODO return nil, err } - klog.V(10).Infof("CreateOrUpdateSGWithRetry(%q): end", *sg.Name) + klog.V(10).Infof("CreateOrUpdateSecurityGroup(%q): end", *sg.Name) } return &sg, nil } @@ -1315,7 +1315,7 @@ func (az *Cloud) reconcilePublicIP(clusterName string, service *v1.Service, lb * pipResourceGroup := az.getPublicIPAddressResourceGroup(service) - pips, err := az.ListPIPWithRetry(service, pipResourceGroup) + pips, err := az.ListPIP(service, pipResourceGroup) if err != nil { return nil, err } @@ -1414,7 +1414,7 @@ func (az *Cloud) safeDeletePublicIP(service *v1.Service, pipResourceGroup string // Update load balancer when frontendIPConfigUpdated or loadBalancerRuleUpdated. if frontendIPConfigUpdated || loadBalancerRuleUpdated { - err := az.CreateOrUpdateLBWithRetry(service, *lb) + err := az.CreateOrUpdateLB(service, *lb) if err != nil { klog.Errorf("safeDeletePublicIP for service(%s) failed with error: %v", getServiceName(service), err) return err @@ -1423,14 +1423,14 @@ func (az *Cloud) safeDeletePublicIP(service *v1.Service, pipResourceGroup string } pipName := to.String(pip.Name) - klog.V(10).Infof("DeletePublicIPWithRetry(%s, %q): start", pipResourceGroup, pipName) - err := az.DeletePublicIPWithRetry(service, pipResourceGroup, pipName) + klog.V(10).Infof("DeletePublicIP(%s, %q): start", pipResourceGroup, pipName) + err := az.DeletePublicIP(service, pipResourceGroup, pipName) if err != nil { if err = ignoreStatusNotFoundFromError(err); err != nil { return err } } - klog.V(10).Infof("DeletePublicIPWithRetry(%s, %q): end", pipResourceGroup, pipName) + klog.V(10).Infof("DeletePublicIP(%s, %q): end", pipResourceGroup, pipName) return nil } diff --git a/pkg/cloudprovider/providers/azure/azure_routes.go b/pkg/cloudprovider/providers/azure/azure_routes.go index 36219b2f917..3a586263148 100644 --- a/pkg/cloudprovider/providers/azure/azure_routes.go +++ b/pkg/cloudprovider/providers/azure/azure_routes.go @@ -20,11 +20,11 @@ import ( "context" "fmt" - cloudprovider "k8s.io/cloud-provider" - "github.com/Azure/azure-sdk-for-go/services/network/mgmt/2017-09-01/network" "github.com/Azure/go-autorest/autorest/to" + "k8s.io/apimachinery/pkg/types" + cloudprovider "k8s.io/cloud-provider" "k8s.io/klog" ) @@ -104,18 +104,7 @@ func (az *Cloud) createRouteTable() error { } klog.V(3).Infof("createRouteTableIfNotExists: creating routetable. routeTableName=%q", az.RouteTableName) - ctx, cancel := getContextWithCancel() - defer cancel() - resp, err := az.RouteTablesClient.CreateOrUpdate(ctx, az.ResourceGroup, az.RouteTableName, routeTable) - klog.V(10).Infof("RouteTablesClient.CreateOrUpdate(%q): end", az.RouteTableName) - if az.CloudProviderBackoff && shouldRetryHTTPRequest(resp, err) { - klog.V(2).Infof("createRouteTableIfNotExists backing off: creating routetable. routeTableName=%q", az.RouteTableName) - retryErr := az.CreateOrUpdateRouteTableWithRetry(routeTable) - if retryErr != nil { - err = retryErr - klog.V(2).Infof("createRouteTableIfNotExists abort backoff: creating routetable. routeTableName=%q", az.RouteTableName) - } - } + err := az.CreateOrUpdateRouteTable(routeTable) if err != nil { return err } @@ -163,18 +152,7 @@ func (az *Cloud) CreateRoute(ctx context.Context, clusterName string, nameHint s } klog.V(3).Infof("CreateRoute: creating route: instance=%q cidr=%q", kubeRoute.TargetNode, kubeRoute.DestinationCIDR) - ctx, cancel := getContextWithCancel() - defer cancel() - resp, err := az.RoutesClient.CreateOrUpdate(ctx, az.ResourceGroup, az.RouteTableName, *route.Name, route) - klog.V(10).Infof("RoutesClient.CreateOrUpdate(%q): end", az.RouteTableName) - if az.CloudProviderBackoff && shouldRetryHTTPRequest(resp, err) { - klog.V(2).Infof("CreateRoute backing off: creating route: instance=%q cidr=%q", kubeRoute.TargetNode, kubeRoute.DestinationCIDR) - retryErr := az.CreateOrUpdateRouteWithRetry(route) - if retryErr != nil { - err = retryErr - klog.V(2).Infof("CreateRoute abort backoff: creating route: instance=%q cidr=%q", kubeRoute.TargetNode, kubeRoute.DestinationCIDR) - } - } + err = az.CreateOrUpdateRoute(route) if err != nil { return err } @@ -202,20 +180,8 @@ func (az *Cloud) DeleteRoute(ctx context.Context, clusterName string, kubeRoute klog.V(2).Infof("DeleteRoute: deleting route. clusterName=%q instance=%q cidr=%q", clusterName, kubeRoute.TargetNode, kubeRoute.DestinationCIDR) - ctx, cancel := getContextWithCancel() - defer cancel() routeName := mapNodeNameToRouteName(kubeRoute.TargetNode) - resp, err := az.RoutesClient.Delete(ctx, az.ResourceGroup, az.RouteTableName, routeName) - klog.V(10).Infof("RoutesClient.Delete(%q): end", az.RouteTableName) - - if az.CloudProviderBackoff && shouldRetryHTTPRequest(resp, err) { - klog.V(2).Infof("DeleteRoute backing off: deleting route. clusterName=%q instance=%q cidr=%q", clusterName, kubeRoute.TargetNode, kubeRoute.DestinationCIDR) - retryErr := az.DeleteRouteWithRetry(routeName) - if retryErr != nil { - err = retryErr - klog.V(2).Infof("DeleteRoute abort backoff: deleting route. clusterName=%q instance=%q cidr=%q", clusterName, kubeRoute.TargetNode, kubeRoute.DestinationCIDR) - } - } + err = az.DeleteRouteWithName(routeName) if err != nil { return err } diff --git a/pkg/cloudprovider/providers/azure/azure_standard.go b/pkg/cloudprovider/providers/azure/azure_standard.go index e60cc4902a6..baad60fd639 100644 --- a/pkg/cloudprovider/providers/azure/azure_standard.go +++ b/pkg/cloudprovider/providers/azure/azure_standard.go @@ -26,16 +26,16 @@ import ( "strconv" "strings" - "k8s.io/api/core/v1" - cloudprovider "k8s.io/cloud-provider" - "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2018-10-01/compute" "github.com/Azure/azure-sdk-for-go/services/network/mgmt/2017-09-01/network" "github.com/Azure/go-autorest/autorest/to" + + "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" utilerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/uuid" + cloudprovider "k8s.io/cloud-provider" "k8s.io/klog" ) @@ -294,10 +294,6 @@ outer: return -1, fmt.Errorf("securityGroup priorities are exhausted") } -func (az *Cloud) getIPForMachine(nodeName types.NodeName) (string, string, error) { - return az.vmSet.GetIPByNodeName(string(nodeName)) -} - var polyTable = crc32.MakeTable(crc32.Koopman) //MakeCRC32 : convert string to CRC32 format @@ -460,9 +456,9 @@ func (as *availabilitySet) GetIPByNodeName(name string) (string, string, error) // getAgentPoolAvailabiliySets lists the virtual machines for the resource group and then builds // a list of availability sets that match the nodes available to k8s. func (as *availabilitySet) getAgentPoolAvailabiliySets(nodes []*v1.Node) (agentPoolAvailabilitySets *[]string, err error) { - vms, err := as.VirtualMachineClientListWithRetry(as.ResourceGroup) + vms, err := as.ListVirtualMachines(as.ResourceGroup) if err != nil { - klog.Errorf("as.getNodeAvailabilitySet - VirtualMachineClientListWithRetry failed, err=%v", err) + klog.Errorf("as.getNodeAvailabilitySet - ListVirtualMachines failed, err=%v", err) return nil, err } vmNameToAvailabilitySetID := make(map[string]string, len(vms)) @@ -695,18 +691,7 @@ func (as *availabilitySet) ensureHostInPool(service *v1.Service, nodeName types. nicName := *nic.Name klog.V(3).Infof("nicupdate(%s): nic(%s) - updating", serviceName, nicName) - ctx, cancel := getContextWithCancel() - defer cancel() - resp, err := as.InterfacesClient.CreateOrUpdate(ctx, as.ResourceGroup, *nic.Name, nic) - klog.V(10).Infof("InterfacesClient.CreateOrUpdate(%q): end", *nic.Name) - if as.CloudProviderBackoff && shouldRetryHTTPRequest(resp, err) { - klog.V(2).Infof("nicupdate(%s) backing off: nic(%s) - updating, err=%v", serviceName, nicName, err) - retryErr := as.CreateOrUpdateInterfaceWithRetry(service, nic) - if retryErr != nil { - err = retryErr - klog.V(2).Infof("nicupdate(%s) abort backoff: nic(%s) - updating", serviceName, nicName) - } - } + err := as.CreateOrUpdateInterface(service, nic) if err != nil { return err } diff --git a/pkg/cloudprovider/providers/azure/azure_vmss.go b/pkg/cloudprovider/providers/azure/azure_vmss.go index 4d5df404628..26d1bd9d1af 100644 --- a/pkg/cloudprovider/providers/azure/azure_vmss.go +++ b/pkg/cloudprovider/providers/azure/azure_vmss.go @@ -27,13 +27,13 @@ import ( "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2018-10-01/compute" "github.com/Azure/azure-sdk-for-go/services/network/mgmt/2017-09-01/network" "github.com/Azure/go-autorest/autorest/to" - "k8s.io/klog" "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" cloudprovider "k8s.io/cloud-provider" + "k8s.io/klog" ) var ( @@ -563,6 +563,30 @@ func (ss *scaleSet) GetPrimaryInterface(nodeName string) (network.Interface, err return nic, nil } +// getScaleSet gets scale set with exponential backoff retry +func (ss *scaleSet) getScaleSet(service *v1.Service, name string) (compute.VirtualMachineScaleSet, bool, error) { + if ss.Config.shouldOmitCloudProviderBackoff() { + var result compute.VirtualMachineScaleSet + var exists bool + + cached, err := ss.vmssCache.Get(name) + if err != nil { + ss.Event(service, v1.EventTypeWarning, "GetVirtualMachineScaleSet", err.Error()) + klog.Errorf("backoff: failure for scale set %q, will retry,err=%v", name, err) + return result, false, nil + } + + if cached != nil { + exists = true + result = *(cached.(*compute.VirtualMachineScaleSet)) + } + + return result, exists, err + } + + return ss.getScaleSetWithRetry(service, name) +} + // getScaleSetWithRetry gets scale set with exponential backoff retry func (ss *scaleSet) getScaleSetWithRetry(service *v1.Service, name string) (compute.VirtualMachineScaleSet, bool, error) { var result compute.VirtualMachineScaleSet @@ -621,6 +645,19 @@ func (ss *scaleSet) getPrimaryIPConfigForScaleSet(config *compute.VirtualMachine return nil, fmt.Errorf("failed to find a primary IP configuration for the scale set %q", scaleSetName) } +// createOrUpdateVMSS invokes ss.VirtualMachineScaleSetsClient.CreateOrUpdate with exponential backoff retry. +func (ss *scaleSet) createOrUpdateVMSS(service *v1.Service, virtualMachineScaleSet compute.VirtualMachineScaleSet) error { + if ss.Config.shouldOmitCloudProviderBackoff() { + ctx, cancel := getContextWithCancel() + defer cancel() + resp, err := ss.VirtualMachineScaleSetsClient.CreateOrUpdate(ctx, ss.ResourceGroup, *virtualMachineScaleSet.Name, virtualMachineScaleSet) + klog.V(10).Infof("VirtualMachineScaleSetsClient.CreateOrUpdate(%s): end", *virtualMachineScaleSet.Name) + return ss.processHTTPResponse(service, "CreateOrUpdateVMSS", resp, err) + } + + return ss.createOrUpdateVMSSWithRetry(service, virtualMachineScaleSet) +} + // createOrUpdateVMSSWithRetry invokes ss.VirtualMachineScaleSetsClient.CreateOrUpdate with exponential backoff retry. func (ss *scaleSet) createOrUpdateVMSSWithRetry(service *v1.Service, virtualMachineScaleSet compute.VirtualMachineScaleSet) error { return wait.ExponentialBackoff(ss.requestBackoff(), func() (bool, error) { @@ -632,6 +669,19 @@ func (ss *scaleSet) createOrUpdateVMSSWithRetry(service *v1.Service, virtualMach }) } +// updateVMSSInstances invokes ss.VirtualMachineScaleSetsClient.UpdateInstances with exponential backoff retry. +func (ss *scaleSet) updateVMSSInstances(service *v1.Service, scaleSetName string, vmInstanceIDs compute.VirtualMachineScaleSetVMInstanceRequiredIDs) error { + if ss.Config.shouldOmitCloudProviderBackoff() { + ctx, cancel := getContextWithCancel() + defer cancel() + resp, err := ss.VirtualMachineScaleSetsClient.UpdateInstances(ctx, ss.ResourceGroup, scaleSetName, vmInstanceIDs) + klog.V(10).Infof("VirtualMachineScaleSetsClient.UpdateInstances(%s): end", scaleSetName) + return ss.processHTTPResponse(service, "CreateOrUpdateVMSSInstance", resp, err) + } + + return ss.updateVMSSInstancesWithRetry(service, scaleSetName, vmInstanceIDs) +} + // updateVMSSInstancesWithRetry invokes ss.VirtualMachineScaleSetsClient.UpdateInstances with exponential backoff retry. func (ss *scaleSet) updateVMSSInstancesWithRetry(service *v1.Service, scaleSetName string, vmInstanceIDs compute.VirtualMachineScaleSetVMInstanceRequiredIDs) error { return wait.ExponentialBackoff(ss.requestBackoff(), func() (bool, error) { @@ -687,9 +737,9 @@ func (ss *scaleSet) getNodesScaleSets(nodes []*v1.Node) (map[string]sets.String, func (ss *scaleSet) ensureHostsInVMSetPool(service *v1.Service, backendPoolID string, vmSetName string, instanceIDs []string, isInternal bool) error { klog.V(3).Infof("ensuring hosts %q of scaleset %q in LB backendpool %q", instanceIDs, vmSetName, backendPoolID) serviceName := getServiceName(service) - virtualMachineScaleSet, exists, err := ss.getScaleSetWithRetry(service, vmSetName) + virtualMachineScaleSet, exists, err := ss.getScaleSet(service, vmSetName) if err != nil { - klog.Errorf("ss.getScaleSetWithRetry(%s) for service %q failed: %v", vmSetName, serviceName, err) + klog.Errorf("ss.getScaleSet(%s) for service %q failed: %v", vmSetName, serviceName, err) return err } if !exists { @@ -748,19 +798,7 @@ func (ss *scaleSet) ensureHostsInVMSetPool(service *v1.Service, backendPoolID st }) primaryIPConfiguration.LoadBalancerBackendAddressPools = &newBackendPools - ctx, cancel := getContextWithCancel() - defer cancel() - klog.V(3).Infof("VirtualMachineScaleSetsClient.CreateOrUpdate for service (%s): scale set (%s) - updating", serviceName, vmSetName) - resp, err := ss.VirtualMachineScaleSetsClient.CreateOrUpdate(ctx, ss.ResourceGroup, vmSetName, virtualMachineScaleSet) - klog.V(10).Infof("VirtualMachineScaleSetsClient.CreateOrUpdate(%q): end", vmSetName) - if ss.CloudProviderBackoff && shouldRetryHTTPRequest(resp, err) { - klog.V(2).Infof("VirtualMachineScaleSetsClient.CreateOrUpdate for service (%s): scale set (%s) - updating, err=%v", serviceName, vmSetName, err) - retryErr := ss.createOrUpdateVMSSWithRetry(service, virtualMachineScaleSet) - if retryErr != nil { - err = retryErr - klog.V(2).Infof("VirtualMachineScaleSetsClient.CreateOrUpdate for service (%s) abort backoff: scale set (%s) - updating", serviceName, vmSetName) - } - } + err := ss.createOrUpdateVMSS(service, virtualMachineScaleSet) if err != nil { return err } @@ -770,18 +808,7 @@ func (ss *scaleSet) ensureHostsInVMSetPool(service *v1.Service, backendPoolID st vmInstanceIDs := compute.VirtualMachineScaleSetVMInstanceRequiredIDs{ InstanceIds: &instanceIDs, } - ctx, cancel := getContextWithCancel() - defer cancel() - instanceResp, err := ss.VirtualMachineScaleSetsClient.UpdateInstances(ctx, ss.ResourceGroup, vmSetName, vmInstanceIDs) - klog.V(10).Infof("VirtualMachineScaleSetsClient.CreateOrUpdate(%q): end", vmSetName) - if ss.CloudProviderBackoff && shouldRetryHTTPRequest(instanceResp, err) { - klog.V(2).Infof("VirtualMachineScaleSetsClient.UpdateInstances for service (%s): scale set (%s) - updating, err=%v", serviceName, vmSetName, err) - retryErr := ss.updateVMSSInstancesWithRetry(service, vmSetName, vmInstanceIDs) - if retryErr != nil { - err = retryErr - klog.V(2).Infof("VirtualMachineScaleSetsClient.UpdateInstances for service (%s) abort backoff: scale set (%s) - updating", serviceName, vmSetName) - } - } + err = ss.updateVMSSInstances(service, vmSetName, vmInstanceIDs) if err != nil { return err } @@ -833,9 +860,9 @@ func (ss *scaleSet) EnsureHostsInPool(service *v1.Service, nodes []*v1.Node, bac // ensureScaleSetBackendPoolDeleted ensures the loadBalancer backendAddressPools deleted from the specified scaleset. func (ss *scaleSet) ensureScaleSetBackendPoolDeleted(service *v1.Service, poolID, ssName string) error { klog.V(3).Infof("ensuring backend pool %q deleted from scaleset %q", poolID, ssName) - virtualMachineScaleSet, exists, err := ss.getScaleSetWithRetry(service, ssName) + virtualMachineScaleSet, exists, err := ss.getScaleSet(service, ssName) if err != nil { - klog.Errorf("ss.ensureScaleSetBackendPoolDeleted(%s, %s) getScaleSetWithRetry(%s) failed: %v", poolID, ssName, ssName, err) + klog.Errorf("ss.ensureScaleSetBackendPoolDeleted(%s, %s) getScaleSet(%s) failed: %v", poolID, ssName, ssName, err) return err } if !exists { @@ -879,18 +906,7 @@ func (ss *scaleSet) ensureScaleSetBackendPoolDeleted(service *v1.Service, poolID // Update scale set with backoff. primaryIPConfiguration.LoadBalancerBackendAddressPools = &newBackendPools klog.V(3).Infof("VirtualMachineScaleSetsClient.CreateOrUpdate: scale set (%s) - updating", ssName) - ctx, cancel := getContextWithCancel() - defer cancel() - resp, err := ss.VirtualMachineScaleSetsClient.CreateOrUpdate(ctx, ss.ResourceGroup, ssName, virtualMachineScaleSet) - klog.V(10).Infof("VirtualMachineScaleSetsClient.CreateOrUpdate(%q): end", ssName) - if ss.CloudProviderBackoff && shouldRetryHTTPRequest(resp, err) { - klog.V(2).Infof("VirtualMachineScaleSetsClient.CreateOrUpdate: scale set (%s) - updating, err=%v", ssName, err) - retryErr := ss.createOrUpdateVMSSWithRetry(service, virtualMachineScaleSet) - if retryErr != nil { - err = retryErr - klog.V(2).Infof("VirtualMachineScaleSetsClient.CreateOrUpdate abort backoff: scale set (%s) - updating", ssName) - } - } + err = ss.createOrUpdateVMSS(service, virtualMachineScaleSet) if err != nil { return err } @@ -900,18 +916,7 @@ func (ss *scaleSet) ensureScaleSetBackendPoolDeleted(service *v1.Service, poolID vmInstanceIDs := compute.VirtualMachineScaleSetVMInstanceRequiredIDs{ InstanceIds: &instanceIDs, } - instanceCtx, instanceCancel := getContextWithCancel() - defer instanceCancel() - instanceResp, err := ss.VirtualMachineScaleSetsClient.UpdateInstances(instanceCtx, ss.ResourceGroup, ssName, vmInstanceIDs) - klog.V(10).Infof("VirtualMachineScaleSetsClient.UpdateInstances(%q): end", ssName) - if ss.CloudProviderBackoff && shouldRetryHTTPRequest(instanceResp, err) { - klog.V(2).Infof("VirtualMachineScaleSetsClient.UpdateInstances scale set (%s) - updating, err=%v", ssName, err) - retryErr := ss.updateVMSSInstancesWithRetry(service, ssName, vmInstanceIDs) - if retryErr != nil { - err = retryErr - klog.V(2).Infof("VirtualMachineScaleSetsClient.UpdateInstances abort backoff: scale set (%s) - updating", ssName) - } - } + err = ss.updateVMSSInstances(service, ssName, vmInstanceIDs) if err != nil { return err } @@ -919,17 +924,9 @@ func (ss *scaleSet) ensureScaleSetBackendPoolDeleted(service *v1.Service, poolID // Update virtualMachineScaleSet again. This is a workaround for removing VMSS reference from LB. // TODO: remove this workaround when figuring out the root cause. if len(newBackendPools) == 0 { - updateCtx, updateCancel := getContextWithCancel() - defer updateCancel() - klog.V(3).Infof("VirtualMachineScaleSetsClient.CreateOrUpdate: scale set (%s) - updating second time", ssName) - resp, err = ss.VirtualMachineScaleSetsClient.CreateOrUpdate(updateCtx, ss.ResourceGroup, ssName, virtualMachineScaleSet) - klog.V(10).Infof("VirtualMachineScaleSetsClient.CreateOrUpdate(%q): end", ssName) - if ss.CloudProviderBackoff && shouldRetryHTTPRequest(resp, err) { - klog.V(2).Infof("VirtualMachineScaleSetsClient.CreateOrUpdate: scale set (%s) - updating, err=%v", ssName, err) - retryErr := ss.createOrUpdateVMSSWithRetry(service, virtualMachineScaleSet) - if retryErr != nil { - klog.V(2).Infof("VirtualMachineScaleSetsClient.CreateOrUpdate abort backoff: scale set (%s) - updating", ssName) - } + err = ss.createOrUpdateVMSS(service, virtualMachineScaleSet) + if err != nil { + klog.V(2).Infof("VirtualMachineScaleSetsClient.CreateOrUpdate abort backoff: scale set (%s) - updating", ssName) } } diff --git a/pkg/cloudprovider/providers/azure/azure_vmss_cache.go b/pkg/cloudprovider/providers/azure/azure_vmss_cache.go index a9a46ba703b..534ba9d445b 100644 --- a/pkg/cloudprovider/providers/azure/azure_vmss_cache.go +++ b/pkg/cloudprovider/providers/azure/azure_vmss_cache.go @@ -132,7 +132,7 @@ func (ss *scaleSet) newAvailabilitySetNodesCache() (*timedCache, error) { } for _, resourceGroup := range resourceGroups.List() { - vmList, err := ss.Cloud.VirtualMachineClientListWithRetry(resourceGroup) + vmList, err := ss.Cloud.ListVirtualMachines(resourceGroup) if err != nil { return nil, err }