Merge pull request #48967 from jackfrancis/azure-lb-backoff

Automatic merge from submit-queue (batch tested with PRs 49218, 48253, 48967, 48460, 49230) additional backoff in azure cloudprovider Fixes #48971 **What this PR does / why we need it**: We want to be able to opt in to backoff retry logic for kubelet-originating request behavior: node IP address resolution and node load balancer pool membership enforcement. **Special notes for your reviewer**: The use case for this is azure cloudprovider clusters with large node counts, especially during cluster installation, or other scenarios when lots of nodes come online at once and attempt to register all resources with the backend API. To give clusters at scale more control over the in-cluster API request rate, the backoff config can meaningfully slow down this rate when appropriate. **Release note**: ```additional backoff in azure cloudprovider ```
2025-07-22 19:31:44 +00:00 · 2017-07-19 20:05:34 -07:00 · 2017-07-19 20:05:34 -07:00 · ecadada7ef
commit ecadada7ef
parent 6d534b38e8 f76ef29512
4 changed files with 56 additions and 3 deletions
--- a/pkg/cloudprovider/providers/azure/azure_backoff.go
+++ b/pkg/cloudprovider/providers/azure/azure_backoff.go
@ -43,6 +43,38 @@ func (az *Cloud) GetVirtualMachineWithRetry(name types.NodeName) (compute.Virtua
 	return machine, exists, err
 }

+// VirtualMachineClientGetWithRetry invokes az.VirtualMachinesClient.Get inside wait.ExponentialBackoff (configured by az.resourceRequestBackoff) and returns the last fetched machine together with the backoff's own error.
+func (az *Cloud) VirtualMachineClientGetWithRetry(resourceGroup, vmName string, types compute.InstanceViewTypes) (compute.VirtualMachine, error) { // NOTE(review): the parameter name "types" shadows the types package this file uses elsewhere (e.g. types.NodeName)
+	var machine compute.VirtualMachine // assigned by the closure below on every attempt
+	err := wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) {
+		var retryErr error // kept local to the closure so a failed attempt is never handed to the backoff helper
+		machine, retryErr = az.VirtualMachinesClient.Get(resourceGroup, vmName, types)
+		if retryErr != nil {
+			glog.Errorf("backoff: failure, will retry,err=%v", retryErr)
+			return false, nil // not done, no error: the Get failure is only logged so the attempt is repeated
+		}
+		glog.V(2).Infof("backoff: success")
+		return true, nil // done: stop retrying
+	})
+	return machine, err // NOTE(review): err originates from wait.ExponentialBackoff, not from Get; presumably a timeout error once attempts run out — confirm
+}
+
+// GetIPForMachineWithRetry invokes az.getIPForMachine inside wait.ExponentialBackoff (configured by az.resourceRequestBackoff) and returns the resolved IP together with the backoff's own error.
+func (az *Cloud) GetIPForMachineWithRetry(name types.NodeName) (string, error) {
+	var ip string // assigned by the closure below on every attempt
+	err := wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) {
+		var retryErr error // kept local to the closure so a failed attempt is never handed to the backoff helper
+		ip, retryErr = az.getIPForMachine(name)
+		if retryErr != nil { // NOTE(review): every error is retried, including cloudprovider.InstanceNotFound returned by getIPForMachine for a missing VM — confirm this is intended
+			glog.Errorf("backoff: failure, will retry,err=%v", retryErr)
+			return false, nil // not done, no error: the lookup failure is only logged so the attempt is repeated
+		}
+		glog.V(2).Infof("backoff: success")
+		return true, nil // done: stop retrying
+	})
+	return ip, err // NOTE(review): err originates from wait.ExponentialBackoff, not from getIPForMachine; presumably a timeout error once attempts run out — confirm
+}
+
 // CreateOrUpdateSGWithRetry invokes az.SecurityGroupsClient.CreateOrUpdate with exponential backoff retry
 func (az *Cloud) CreateOrUpdateSGWithRetry(sg network.SecurityGroup) error {
 	return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) {
--- a/pkg/cloudprovider/providers/azure/azure_instances.go
+++ b/pkg/cloudprovider/providers/azure/azure_instances.go
@ -41,9 +41,18 @@ func (az *Cloud) NodeAddresses(name types.NodeName) ([]v1.NodeAddress, error) {
 	}
 	ip, err := az.getIPForMachine(name)
 	if err != nil {
+		if az.CloudProviderBackoff {
+			glog.V(2).Infof("NodeAddresses(%s) backing off", name)
+			ip, err = az.GetIPForMachineWithRetry(name)
+			if err != nil {
+				glog.V(2).Infof("NodeAddresses(%s) abort backoff", name)
+				return nil, err
+			}
+		} else {
 			glog.Errorf("error: az.NodeAddresses, az.getIPForMachine(%s), err=%v", name, err)
 			return nil, err
 		}
+	}

 	return []v1.NodeAddress{
 		{Type: v1.NodeInternalIP, Address: ip},
--- a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go
+++ b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go
@ -25,6 +25,7 @@ import (
 	utilerrors "k8s.io/apimachinery/pkg/util/errors"
 	serviceapi "k8s.io/kubernetes/pkg/api/v1/service"

+	"github.com/Azure/azure-sdk-for-go/arm/compute"
 	"github.com/Azure/azure-sdk-for-go/arm/network"
 	"github.com/Azure/go-autorest/autorest/to"
 	"github.com/golang/glog"
@ -871,12 +872,22 @@ func findSecurityRule(rules []network.SecurityRule, rule network.SecurityRule) b
 // This ensures the given VM's Primary NIC's Primary IP Configuration is
 // participating in the specified LoadBalancer Backend Pool.
 func (az *Cloud) ensureHostInPool(serviceName string, nodeName types.NodeName, backendPoolID string) error {
+	var machine compute.VirtualMachine
 	vmName := mapNodeNameToVMName(nodeName)
 	az.operationPollRateLimiter.Accept()
 	machine, err := az.VirtualMachinesClient.Get(az.ResourceGroup, vmName, "")
 	if err != nil {
+		if az.CloudProviderBackoff {
+			glog.V(2).Infof("ensureHostInPool(%s, %s, %s) backing off", serviceName, nodeName, backendPoolID)
+			machine, err = az.VirtualMachineClientGetWithRetry(az.ResourceGroup, vmName, "")
+			if err != nil {
+				glog.V(2).Infof("ensureHostInPool(%s, %s, %s) abort backoff", serviceName, nodeName, backendPoolID)
 				return err
 			}
+		} else {
+			return err
+		}
+	}

 	primaryNicID, err := getPrimaryInterfaceID(machine)
 	if err != nil {
--- a/pkg/cloudprovider/providers/azure/azure_util.go
+++ b/pkg/cloudprovider/providers/azure/azure_util.go
@ -249,6 +249,7 @@ outer:
 }

 func (az *Cloud) getIPForMachine(nodeName types.NodeName) (string, error) {
+	az.operationPollRateLimiter.Accept()
 	machine, exists, err := az.getVirtualMachine(nodeName)
 	if !exists {
 		return "", cloudprovider.InstanceNotFound