diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_backoff.go b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_backoff.go
index 6b86259a7ac..6a584b723b6 100644
--- a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_backoff.go
+++ b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_backoff.go
@@ -19,6 +19,7 @@ package azure
 import (
 	"fmt"
 	"net/http"
+	"strings"
 
 	"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-03-01/compute"
 	"github.com/Azure/azure-sdk-for-go/services/network/mgmt/2018-08-01/network"
@@ -32,6 +33,11 @@ import (
 	"k8s.io/klog"
 )
 
+const (
+	// "not active" means the instance is being deleted from the Azure VMSS.
+	vmssVMNotActiveErrorMessage = "not an active Virtual Machine Scale Set VM instanceId"
+)
+
 // RequestBackoff if backoff is disabled in cloud provider it
 // returns a new Backoff object steps = 1
 // This is to make sure that the requested command executes
@@ -133,10 +139,14 @@ func (az *Cloud) getPrivateIPsForMachineWithRetry(nodeName types.NodeName) ([]st
 		var retryErr error
 		privateIPs, retryErr = az.vmSet.GetPrivateIPsByNodeName(string(nodeName))
 		if retryErr != nil {
+			// Won't retry since the instance doesn't exist on Azure.
+			if retryErr == cloudprovider.InstanceNotFound {
+				return true, retryErr
+			}
 			klog.Errorf("GetPrivateIPsByNodeName(%s): backoff failure, will retry,err=%v", nodeName, retryErr)
 			return false, nil
 		}
-		klog.V(2).Infof("GetPrivateIPsByNodeName(%s): backoff success", nodeName)
+		klog.V(3).Infof("GetPrivateIPsByNodeName(%s): backoff success", nodeName)
 		return true, nil
 	})
 	return privateIPs, err
@@ -160,7 +170,7 @@ func (az *Cloud) GetIPForMachineWithRetry(name types.NodeName) (string, string,
 			klog.Errorf("GetIPForMachineWithRetry(%s): backoff failure, will retry,err=%v", name, retryErr)
 			return false, nil
 		}
-		klog.V(2).Infof("GetIPForMachineWithRetry(%s): backoff success", name)
+		klog.V(3).Infof("GetIPForMachineWithRetry(%s): backoff success", name)
 		return true, nil
 	})
 	return ip, publicIP, err
@@ -582,7 +592,15 @@ func (az *Cloud) UpdateVmssVMWithRetry(resourceGroupName string, VMScaleSetName
 		defer cancel()
 
 		resp, err := az.VirtualMachineScaleSetVMsClient.Update(ctx, resourceGroupName, VMScaleSetName, instanceID, parameters, source)
-		klog.V(10).Infof("VirtualMachinesClient.CreateOrUpdate(%s,%s): end", VMScaleSetName, instanceID)
+		klog.V(10).Infof("UpdateVmssVMWithRetry: VirtualMachineScaleSetVMsClient.Update(%s,%s): end", VMScaleSetName, instanceID)
+
+		if err != nil && strings.Contains(err.Error(), vmssVMNotActiveErrorMessage) {
+			// When instances are being deleted, the update API reports a "not an active Virtual Machine Scale Set VM instanceId" error.
+			// Since they're being deleted, we shouldn't send more update requests for them.
+			klog.V(3).Infof("UpdateVmssVMWithRetry: VirtualMachineScaleSetVMsClient.Update(%s,%s) gets error message %q, abort backoff because it's probably being deleted", VMScaleSetName, instanceID, vmssVMNotActiveErrorMessage)
+			return true, nil
+		}
+
 		return az.processHTTPRetryResponse(nil, "", resp, err)
 	})
 }
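Note (reviewer sketch, not part of the patch): both early-exit paths above lean on the contract of wait.ExponentialBackoff from k8s.io/apimachinery: the condition func returns (false, nil) to retry after the next interval, (true, nil) to stop with success, and (true, err) to stop and surface err. A minimal standalone illustration of that contract, with errNotFound standing in for cloudprovider.InstanceNotFound:

    package main

    import (
    	"errors"
    	"fmt"
    	"time"

    	"k8s.io/apimachinery/pkg/util/wait"
    )

    // errNotFound stands in for cloudprovider.InstanceNotFound.
    var errNotFound = errors.New("instance not found")

    func main() {
    	backoff := wait.Backoff{Duration: 100 * time.Millisecond, Factor: 2.0, Steps: 4}
    	attempt := 0
    	err := wait.ExponentialBackoff(backoff, func() (done bool, err error) {
    		attempt++
    		if attempt < 3 {
    			return false, nil // (false, nil): not done yet, retry after the next interval
    		}
    		// (true, nil) would stop with success, which is how UpdateVmssVMWithRetry
    		// swallows the "not an active ... instanceId" error; (true, err) stops and
    		// surfaces err, which is how getPrivateIPsForMachineWithRetry propagates
    		// InstanceNotFound without burning the remaining retries.
    		return true, errNotFound
    	})
    	fmt.Println(err) // prints "instance not found" after two retried attempts
    }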
diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_instances.go b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_instances.go
index ae6442ddb88..f3923f65714 100644
--- a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_instances.go
+++ b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_instances.go
@@ -34,6 +34,10 @@ const (
 	vmPowerStateDeallocated = "deallocated"
 )
 
+var (
+	errNodeNotInitialized = fmt.Errorf("providerID is empty, the node is not initialized yet")
+)
+
 // NodeAddresses returns the addresses of the specified instance.
 func (az *Cloud) NodeAddresses(ctx context.Context, name types.NodeName) ([]v1.NodeAddress, error) {
 	// Returns nil for unmanaged nodes because azure cloud provider couldn't fetch information for them.
@@ -143,6 +147,10 @@ func (az *Cloud) NodeAddresses(ctx context.Context, name types.NodeName) ([]v1.N
 // This method will not be called from the node that is requesting this ID. i.e. metadata service
 // and other local methods cannot be used here
 func (az *Cloud) NodeAddressesByProviderID(ctx context.Context, providerID string) ([]v1.NodeAddress, error) {
+	if providerID == "" {
+		return nil, errNodeNotInitialized
+	}
+
 	// Returns nil for unmanaged nodes because azure cloud provider couldn't fetch information for them.
 	if az.IsNodeUnmanagedByProviderID(providerID) {
 		klog.V(4).Infof("NodeAddressesByProviderID: omitting unmanaged node %q", providerID)
@@ -160,6 +168,10 @@ func (az *Cloud) NodeAddressesByProviderID(ctx context.Context, providerID strin
 // InstanceExistsByProviderID returns true if the instance with the given provider id still exists and is running.
 // If false is returned with no error, the instance will be immediately deleted by the cloud controller manager.
 func (az *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID string) (bool, error) {
+	if providerID == "" {
+		return false, errNodeNotInitialized
+	}
+
 	// Returns true for unmanaged nodes because azure cloud provider always assumes them exists.
 	if az.IsNodeUnmanagedByProviderID(providerID) {
 		klog.V(4).Infof("InstanceExistsByProviderID: assuming unmanaged node %q exists", providerID)
@@ -187,13 +199,27 @@ func (az *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID stri
 
 // InstanceShutdownByProviderID returns true if the instance is in safe state to detach volumes
 func (az *Cloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
+	if providerID == "" {
+		return false, nil
+	}
+
 	nodeName, err := az.vmSet.GetNodeNameByProviderID(providerID)
 	if err != nil {
+		// Returns false, so the controller manager will continue to check InstanceExistsByProviderID().
+		if err == cloudprovider.InstanceNotFound {
+			return false, nil
+		}
+
 		return false, err
 	}
 
 	powerStatus, err := az.vmSet.GetPowerStatusByNodeName(string(nodeName))
 	if err != nil {
+		// Returns false, so the controller manager will continue to check InstanceExistsByProviderID().
+		if err == cloudprovider.InstanceNotFound {
+			return false, nil
+		}
+
 		return false, err
 	}
 
 	klog.V(5).Infof("InstanceShutdownByProviderID gets power status %q for node %q", powerStatus, nodeName)
@@ -283,6 +309,10 @@ func (az *Cloud) InstanceID(ctx context.Context, name types.NodeName) (string, e
 // This method will not be called from the node that is requesting this ID. i.e. metadata service
 // and other local methods cannot be used here
 func (az *Cloud) InstanceTypeByProviderID(ctx context.Context, providerID string) (string, error) {
+	if providerID == "" {
+		return "", errNodeNotInitialized
+	}
+
 	// Returns "" for unmanaged nodes because azure cloud provider couldn't fetch information for them.
 	if az.IsNodeUnmanagedByProviderID(providerID) {
 		klog.V(4).Infof("InstanceTypeByProviderID: omitting unmanaged node %q", providerID)
diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_instances_test.go b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_instances_test.go
index 6116e9b9ea3..ea4b576d484 100644
--- a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_instances_test.go
+++ b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_instances_test.go
@@ -204,10 +204,10 @@ func TestInstanceShutdownByProviderID(t *testing.T) {
 			expected: false,
 		},
 		{
-			name:        "InstanceShutdownByProviderID should report error if VM doesn't exist",
-			vmList:      map[string]string{"vm1": "PowerState/running"},
-			nodeName:    "vm8",
-			expectError: true,
+			name:     "InstanceShutdownByProviderID should return false if VM doesn't exist",
+			vmList:   map[string]string{"vm1": "PowerState/running"},
+			nodeName: "vm8",
+			expected: false,
 		},
 	}
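Note (reviewer sketch, not part of the patch): mapping InstanceNotFound to (false, nil) here makes sense from the consumer side, where the shutdown check only matters for instances that still exist. A simplified sketch of how a node lifecycle controller consumes these two methods; shouldDeleteNode is illustrative, not the actual cloud-controller-manager code:

    package sketch

    import (
    	"context"

    	cloudprovider "k8s.io/cloud-provider"
    )

    // shouldDeleteNode sketches the consumer-side contract: the Node object is
    // removed only when the instance is confirmed gone, and a still-existing
    // but powered-off instance is reported as shutdown instead. Returning
    // (false, nil) from InstanceShutdownByProviderID for a missing VM keeps
    // the deletion decision with InstanceExistsByProviderID.
    func shouldDeleteNode(ctx context.Context, instances cloudprovider.Instances, providerID string) (deleteNode, shutdown bool, err error) {
    	exists, err := instances.InstanceExistsByProviderID(ctx, providerID)
    	if err != nil {
    		return false, false, err
    	}
    	if !exists {
    		return true, false, nil // safe to remove the Node object
    	}
    	shutdown, err = instances.InstanceShutdownByProviderID(ctx, providerID)
    	return false, shutdown, err
    }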
diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_standard.go b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_standard.go
index 4d732f5018d..4feeca22077 100644
--- a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_standard.go
+++ b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_standard.go
@@ -366,7 +366,9 @@ func (as *availabilitySet) GetPowerStatusByNodeName(name string) (powerState str
 		}
 	}
 
-	return "", fmt.Errorf("failed to get power status for node %q", name)
+	// vm.InstanceView or vm.InstanceView.Statuses are nil when the VM is being deleted.
+	klog.V(3).Infof("InstanceView for node %q is nil, assuming it's stopped", name)
+	return vmPowerStateStopped, nil
 }
 
 // GetNodeNameByProviderID gets the node name by provider ID.
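Note (reviewer sketch, not part of the patch): both GetPowerStatusByNodeName implementations (here and in azure_vmss.go below) scan InstanceView.Statuses for a PowerState/* code, and Statuses is nil while the VM is being deleted, which is the case the fallback above handles. A hedged sketch of that extraction using the compute SDK types already imported by this package; powerStateFromStatuses is an illustrative helper, not a function in this package:

    package sketch

    import (
    	"strings"

    	"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-03-01/compute"
    )

    // powerStateFromStatuses returns the "xxx" part of a "PowerState/xxx"
    // status code, and false when no power state is present. statuses is nil
    // while the VM is being deleted, which is why the patched code falls back
    // to vmPowerStateStopped instead of returning an error.
    func powerStateFromStatuses(statuses *[]compute.InstanceViewStatus) (string, bool) {
    	if statuses == nil {
    		return "", false
    	}
    	for _, status := range *statuses {
    		if status.Code != nil && strings.HasPrefix(*status.Code, "PowerState/") {
    			return strings.TrimPrefix(*status.Code, "PowerState/"), true
    		}
    	}
    	return "", false
    }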
diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_vmss.go b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_vmss.go
index 4048d0ebc2b..f2cee92ebd8 100644
--- a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_vmss.go
+++ b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_vmss.go
@@ -144,7 +144,9 @@ func (ss *scaleSet) GetPowerStatusByNodeName(name string) (powerState string, er
 		}
 	}
 
-	return "", fmt.Errorf("failed to get power status for node %q", name)
+	// vm.InstanceView or vm.InstanceView.Statuses are nil when the VM is being deleted.
+	klog.V(3).Infof("InstanceView for node %q is nil, assuming it's stopped", name)
+	return vmPowerStateStopped, nil
 }
 
 // getCachedVirtualMachineByInstanceID gets scaleSetVMInfo from cache.
@@ -589,8 +591,15 @@ func (ss *scaleSet) GetPrimaryInterface(nodeName string) (network.Interface, err
 	defer cancel()
 	nic, err := ss.InterfacesClient.GetVirtualMachineScaleSetNetworkInterface(ctx, resourceGroup, ssName, instanceID, nicName, "")
 	if err != nil {
-		klog.Errorf("error: ss.GetPrimaryInterface(%s), ss.GetVirtualMachineScaleSetNetworkInterface.Get(%s, %s, %s), err=%v", nodeName, resourceGroup, ssName, nicName, err)
-		return network.Interface{}, err
+		exists, _, realErr := checkResourceExistsFromError(err)
+		if realErr != nil {
+			klog.Errorf("error: ss.GetPrimaryInterface(%s), ss.GetVirtualMachineScaleSetNetworkInterface.Get(%s, %s, %s), err=%v", nodeName, resourceGroup, ssName, nicName, realErr)
+			return network.Interface{}, err
+		}
+
+		if !exists {
+			return network.Interface{}, cloudprovider.InstanceNotFound
+		}
 	}
 
 	// Fix interface's location, which is required when updating the interface.
@@ -767,20 +776,24 @@ func (ss *scaleSet) EnsureHostsInPool(service *v1.Service, nodes []*v1.Node, bac
 	}
 
 	f := func() error {
-		// VMAS nodes should also be added to the SLB backends.
-		if ss.useStandardLoadBalancer() {
-			// Check whether the node is VMAS virtual machine.
-			managedByAS, err := ss.isNodeManagedByAvailabilitySet(localNodeName)
-			if err != nil {
-				klog.Errorf("Failed to check isNodeManagedByAvailabilitySet(%s): %v", localNodeName, err)
-				return err
-			}
-			if managedByAS {
-				return ss.availabilitySet.EnsureHostInPool(service, types.NodeName(localNodeName), backendPoolID, vmSetName, isInternal)
-			}
+		// Check whether the node is a VMAS virtual machine.
+		managedByAS, err := ss.isNodeManagedByAvailabilitySet(localNodeName)
+		if err != nil {
+			klog.Errorf("Failed to check isNodeManagedByAvailabilitySet(%s): %v", localNodeName, err)
+			return err
 		}
-		err := ss.EnsureHostInPool(service, types.NodeName(localNodeName), backendPoolID, vmSetName, isInternal)
+
+		if managedByAS {
+			// VMAS nodes should also be added to the SLB backends.
+			if ss.useStandardLoadBalancer() {
+				return ss.availabilitySet.EnsureHostInPool(service, types.NodeName(localNodeName), backendPoolID, vmSetName, isInternal)
+			}
+
+			klog.V(3).Infof("EnsureHostsInPool skips node %s because VMAS nodes can't be added to a basic LB with VMSS backends", localNodeName)
+			return nil
+		}
+
+		err = ss.EnsureHostInPool(service, types.NodeName(localNodeName), backendPoolID, vmSetName, isInternal)
 		if err != nil {
 			return fmt.Errorf("EnsureHostInPool(%s): backendPoolID(%s) - failed to ensure host in pool: %q", getServiceName(service), backendPoolID, err)
 		}
diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_zones.go b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_zones.go
index 5b18005453e..8ad5d14b921 100644
--- a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_zones.go
+++ b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_zones.go
@@ -89,6 +89,10 @@ func (az *Cloud) GetZone(ctx context.Context) (cloudprovider.Zone, error) {
 // This is particularly useful in external cloud providers where the kubelet
 // does not initialize node data.
 func (az *Cloud) GetZoneByProviderID(ctx context.Context, providerID string) (cloudprovider.Zone, error) {
+	if providerID == "" {
+		return cloudprovider.Zone{}, errNodeNotInitialized
+	}
+
 	// Returns nil for unmanaged nodes because azure cloud provider couldn't fetch information for them.
 	if az.IsNodeUnmanagedByProviderID(providerID) {
 		klog.V(2).Infof("GetZoneByProviderID: omitting unmanaged node %q", providerID)
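Note (reviewer sketch, not part of the patch): the GetPrimaryInterface change relies on checkResourceExistsFromError, an existing helper in this package that normalizes ARM 404 responses so the caller can translate "not found" into cloudprovider.InstanceNotFound instead of surfacing a retryable error. A hedged reconstruction of its likely shape, inferred from the call site above; the real definition lives elsewhere in the package and may differ in detail:

    package sketch

    import (
    	"net/http"

    	"github.com/Azure/go-autorest/autorest"
    )

    // checkResourceExistsFromError is a reconstruction (not the canonical
    // source): a 404 from ARM means the resource does not exist and is
    // reported as (false, message, nil) rather than as an error, while any
    // other failure is passed through as realErr for the caller to log.
    func checkResourceExistsFromError(err error) (exists bool, message string, realErr error) {
    	if err == nil {
    		return true, "", nil
    	}
    	detailed, ok := err.(autorest.DetailedError)
    	if !ok {
    		return false, "", err
    	}
    	if detailed.StatusCode == http.StatusNotFound {
    		return false, err.Error(), nil
    	}
    	return false, "", detailed
    }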