Merge pull request #80419 from feiskyer/vmss-fix

Fix retry issues when the nodes are being deleted on Azure
This commit is contained in:
Kubernetes Prow Robot 2019-07-23 19:32:18 -07:00 committed by GitHub
commit c08a88a2d1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 90 additions and 23 deletions

View File

@ -19,6 +19,7 @@ package azure
import (
"fmt"
"net/http"
"strings"
"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-03-01/compute"
"github.com/Azure/azure-sdk-for-go/services/network/mgmt/2018-08-01/network"
@ -32,6 +33,11 @@ import (
"k8s.io/klog"
)
const (
// not active means the instance is under deleting from Azure VMSS.
vmssVMNotActiveErrorMessage = "not an active Virtual Machine Scale Set VM instanceId"
)
// RequestBackoff if backoff is disabled in cloud provider it
// returns a new Backoff object steps = 1
// This is to make sure that the requested command executes
@ -133,10 +139,14 @@ func (az *Cloud) getPrivateIPsForMachineWithRetry(nodeName types.NodeName) ([]st
var retryErr error
privateIPs, retryErr = az.vmSet.GetPrivateIPsByNodeName(string(nodeName))
if retryErr != nil {
// won't retry since the instance doesn't exist on Azure.
if retryErr == cloudprovider.InstanceNotFound {
return true, retryErr
}
klog.Errorf("GetPrivateIPsByNodeName(%s): backoff failure, will retry,err=%v", nodeName, retryErr)
return false, nil
}
klog.V(2).Infof("GetPrivateIPsByNodeName(%s): backoff success", nodeName)
klog.V(3).Infof("GetPrivateIPsByNodeName(%s): backoff success", nodeName)
return true, nil
})
return privateIPs, err
@ -160,7 +170,7 @@ func (az *Cloud) GetIPForMachineWithRetry(name types.NodeName) (string, string,
klog.Errorf("GetIPForMachineWithRetry(%s): backoff failure, will retry,err=%v", name, retryErr)
return false, nil
}
klog.V(2).Infof("GetIPForMachineWithRetry(%s): backoff success", name)
klog.V(3).Infof("GetIPForMachineWithRetry(%s): backoff success", name)
return true, nil
})
return ip, publicIP, err
@ -582,7 +592,15 @@ func (az *Cloud) UpdateVmssVMWithRetry(resourceGroupName string, VMScaleSetName
defer cancel()
resp, err := az.VirtualMachineScaleSetVMsClient.Update(ctx, resourceGroupName, VMScaleSetName, instanceID, parameters, source)
klog.V(10).Infof("VirtualMachinesClient.CreateOrUpdate(%s,%s): end", VMScaleSetName, instanceID)
klog.V(10).Infof("UpdateVmssVMWithRetry: VirtualMachineScaleSetVMsClient.Update(%s,%s): end", VMScaleSetName, instanceID)
if strings.Contains(err.Error(), vmssVMNotActiveErrorMessage) {
// When instances are under deleting, updating API would report "not an active Virtual Machine Scale Set VM instanceId" error.
// Since they're under deleting, we shouldn't send more update requests for it.
klog.V(3).Infof("UpdateVmssVMWithRetry: VirtualMachineScaleSetVMsClient.Update(%s,%s) gets error message %q, abort backoff because it's probably under deleting", VMScaleSetName, instanceID, vmssVMNotActiveErrorMessage)
return true, nil
}
return az.processHTTPRetryResponse(nil, "", resp, err)
})
}

View File

@ -34,6 +34,10 @@ const (
vmPowerStateDeallocated = "deallocated"
)
var (
errNodeNotInitialized = fmt.Errorf("providerID is empty, the node is not initialized yet")
)
// NodeAddresses returns the addresses of the specified instance.
func (az *Cloud) NodeAddresses(ctx context.Context, name types.NodeName) ([]v1.NodeAddress, error) {
// Returns nil for unmanaged nodes because azure cloud provider couldn't fetch information for them.
@ -143,6 +147,10 @@ func (az *Cloud) NodeAddresses(ctx context.Context, name types.NodeName) ([]v1.N
// This method will not be called from the node that is requesting this ID. i.e. metadata service
// and other local methods cannot be used here
func (az *Cloud) NodeAddressesByProviderID(ctx context.Context, providerID string) ([]v1.NodeAddress, error) {
if providerID == "" {
return nil, errNodeNotInitialized
}
// Returns nil for unmanaged nodes because azure cloud provider couldn't fetch information for them.
if az.IsNodeUnmanagedByProviderID(providerID) {
klog.V(4).Infof("NodeAddressesByProviderID: omitting unmanaged node %q", providerID)
@ -160,6 +168,10 @@ func (az *Cloud) NodeAddressesByProviderID(ctx context.Context, providerID strin
// InstanceExistsByProviderID returns true if the instance with the given provider id still exists and is running.
// If false is returned with no error, the instance will be immediately deleted by the cloud controller manager.
func (az *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID string) (bool, error) {
if providerID == "" {
return false, errNodeNotInitialized
}
// Returns true for unmanaged nodes because azure cloud provider always assumes them exists.
if az.IsNodeUnmanagedByProviderID(providerID) {
klog.V(4).Infof("InstanceExistsByProviderID: assuming unmanaged node %q exists", providerID)
@ -187,13 +199,27 @@ func (az *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID stri
// InstanceShutdownByProviderID returns true if the instance is in safe state to detach volumes
func (az *Cloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
if providerID == "" {
return false, nil
}
nodeName, err := az.vmSet.GetNodeNameByProviderID(providerID)
if err != nil {
// Returns false, so the controller manager will continue to check InstanceExistsByProviderID().
if err == cloudprovider.InstanceNotFound {
return false, nil
}
return false, err
}
powerStatus, err := az.vmSet.GetPowerStatusByNodeName(string(nodeName))
if err != nil {
// Returns false, so the controller manager will continue to check InstanceExistsByProviderID().
if err == cloudprovider.InstanceNotFound {
return false, nil
}
return false, err
}
klog.V(5).Infof("InstanceShutdownByProviderID gets power status %q for node %q", powerStatus, nodeName)
@ -283,6 +309,10 @@ func (az *Cloud) InstanceID(ctx context.Context, name types.NodeName) (string, e
// This method will not be called from the node that is requesting this ID. i.e. metadata service
// and other local methods cannot be used here
func (az *Cloud) InstanceTypeByProviderID(ctx context.Context, providerID string) (string, error) {
if providerID == "" {
return "", errNodeNotInitialized
}
// Returns "" for unmanaged nodes because azure cloud provider couldn't fetch information for them.
if az.IsNodeUnmanagedByProviderID(providerID) {
klog.V(4).Infof("InstanceTypeByProviderID: omitting unmanaged node %q", providerID)

View File

@ -204,10 +204,10 @@ func TestInstanceShutdownByProviderID(t *testing.T) {
expected: false,
},
{
name: "InstanceShutdownByProviderID should report error if VM doesn't exist",
vmList: map[string]string{"vm1": "PowerState/running"},
nodeName: "vm8",
expectError: true,
name: "InstanceShutdownByProviderID should return false if VM doesn't exist",
vmList: map[string]string{"vm1": "PowerState/running"},
nodeName: "vm8",
expected: false,
},
}

View File

@ -366,7 +366,9 @@ func (as *availabilitySet) GetPowerStatusByNodeName(name string) (powerState str
}
}
return "", fmt.Errorf("failed to get power status for node %q", name)
// vm.InstanceView or vm.InstanceView.Statuses are nil when the VM is under deleting.
klog.V(3).Infof("InstanceView for node %q is nil, assuming it's stopped", name)
return vmPowerStateStopped, nil
}
// GetNodeNameByProviderID gets the node name by provider ID.

View File

@ -144,7 +144,9 @@ func (ss *scaleSet) GetPowerStatusByNodeName(name string) (powerState string, er
}
}
return "", fmt.Errorf("failed to get power status for node %q", name)
// vm.InstanceView or vm.InstanceView.Statuses are nil when the VM is under deleting.
klog.V(3).Infof("InstanceView for node %q is nil, assuming it's stopped", name)
return vmPowerStateStopped, nil
}
// getCachedVirtualMachineByInstanceID gets scaleSetVMInfo from cache.
@ -589,8 +591,15 @@ func (ss *scaleSet) GetPrimaryInterface(nodeName string) (network.Interface, err
defer cancel()
nic, err := ss.InterfacesClient.GetVirtualMachineScaleSetNetworkInterface(ctx, resourceGroup, ssName, instanceID, nicName, "")
if err != nil {
klog.Errorf("error: ss.GetPrimaryInterface(%s), ss.GetVirtualMachineScaleSetNetworkInterface.Get(%s, %s, %s), err=%v", nodeName, resourceGroup, ssName, nicName, err)
return network.Interface{}, err
exists, _, realErr := checkResourceExistsFromError(err)
if realErr != nil {
klog.Errorf("error: ss.GetPrimaryInterface(%s), ss.GetVirtualMachineScaleSetNetworkInterface.Get(%s, %s, %s), err=%v", nodeName, resourceGroup, ssName, nicName, realErr)
return network.Interface{}, err
}
if !exists {
return network.Interface{}, cloudprovider.InstanceNotFound
}
}
// Fix interface's location, which is required when updating the interface.
@ -767,20 +776,24 @@ func (ss *scaleSet) EnsureHostsInPool(service *v1.Service, nodes []*v1.Node, bac
}
f := func() error {
// VMAS nodes should also be added to the SLB backends.
if ss.useStandardLoadBalancer() {
// Check whether the node is VMAS virtual machine.
managedByAS, err := ss.isNodeManagedByAvailabilitySet(localNodeName)
if err != nil {
klog.Errorf("Failed to check isNodeManagedByAvailabilitySet(%s): %v", localNodeName, err)
return err
}
if managedByAS {
return ss.availabilitySet.EnsureHostInPool(service, types.NodeName(localNodeName), backendPoolID, vmSetName, isInternal)
}
// Check whether the node is VMAS virtual machine.
managedByAS, err := ss.isNodeManagedByAvailabilitySet(localNodeName)
if err != nil {
klog.Errorf("Failed to check isNodeManagedByAvailabilitySet(%s): %v", localNodeName, err)
return err
}
err := ss.EnsureHostInPool(service, types.NodeName(localNodeName), backendPoolID, vmSetName, isInternal)
if managedByAS {
// VMAS nodes should also be added to the SLB backends.
if ss.useStandardLoadBalancer() {
return ss.availabilitySet.EnsureHostInPool(service, types.NodeName(localNodeName), backendPoolID, vmSetName, isInternal)
}
klog.V(3).Infof("EnsureHostsInPool skips node %s because VMAS nodes couldn't be added to basic LB with VMSS backends", localNodeName)
return nil
}
err = ss.EnsureHostInPool(service, types.NodeName(localNodeName), backendPoolID, vmSetName, isInternal)
if err != nil {
return fmt.Errorf("EnsureHostInPool(%s): backendPoolID(%s) - failed to ensure host in pool: %q", getServiceName(service), backendPoolID, err)
}

View File

@ -89,6 +89,10 @@ func (az *Cloud) GetZone(ctx context.Context) (cloudprovider.Zone, error) {
// This is particularly useful in external cloud providers where the kubelet
// does not initialize node data.
func (az *Cloud) GetZoneByProviderID(ctx context.Context, providerID string) (cloudprovider.Zone, error) {
if providerID == "" {
return cloudprovider.Zone{}, errNodeNotInitialized
}
// Returns nil for unmanaged nodes because azure cloud provider couldn't fetch information for them.
if az.IsNodeUnmanagedByProviderID(providerID) {
klog.V(2).Infof("GetZoneByProviderID: omitting unmanaged node %q", providerID)