mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-25 20:53:33 +00:00
Merge pull request #80419 from feiskyer/vmss-fix
Fix retry issues when the nodes are under deleting on Azure
This commit is contained in:
commit
c08a88a2d1
@ -19,6 +19,7 @@ package azure
|
|||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-03-01/compute"
|
"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-03-01/compute"
|
||||||
"github.com/Azure/azure-sdk-for-go/services/network/mgmt/2018-08-01/network"
|
"github.com/Azure/azure-sdk-for-go/services/network/mgmt/2018-08-01/network"
|
||||||
@ -32,6 +33,11 @@ import (
|
|||||||
"k8s.io/klog"
|
"k8s.io/klog"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
// vmssVMNotActiveErrorMessage is the error text reported for a VMSS instance that is not active, i.e. one that is being deleted from the Azure VMSS.
|
||||||
|
vmssVMNotActiveErrorMessage = "not an active Virtual Machine Scale Set VM instanceId"
|
||||||
|
)
|
||||||
|
|
||||||
// RequestBackoff if backoff is disabled in cloud provider it
|
// RequestBackoff if backoff is disabled in cloud provider it
|
||||||
// returns a new Backoff object steps = 1
|
// returns a new Backoff object steps = 1
|
||||||
// This is to make sure that the requested command executes
|
// This is to make sure that the requested command executes
|
||||||
@ -133,10 +139,14 @@ func (az *Cloud) getPrivateIPsForMachineWithRetry(nodeName types.NodeName) ([]st
|
|||||||
var retryErr error
|
var retryErr error
|
||||||
privateIPs, retryErr = az.vmSet.GetPrivateIPsByNodeName(string(nodeName))
|
privateIPs, retryErr = az.vmSet.GetPrivateIPsByNodeName(string(nodeName))
|
||||||
if retryErr != nil {
|
if retryErr != nil {
|
||||||
|
// won't retry since the instance doesn't exist on Azure.
|
||||||
|
if retryErr == cloudprovider.InstanceNotFound {
|
||||||
|
return true, retryErr
|
||||||
|
}
|
||||||
klog.Errorf("GetPrivateIPsByNodeName(%s): backoff failure, will retry,err=%v", nodeName, retryErr)
|
klog.Errorf("GetPrivateIPsByNodeName(%s): backoff failure, will retry,err=%v", nodeName, retryErr)
|
||||||
return false, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
klog.V(2).Infof("GetPrivateIPsByNodeName(%s): backoff success", nodeName)
|
klog.V(3).Infof("GetPrivateIPsByNodeName(%s): backoff success", nodeName)
|
||||||
return true, nil
|
return true, nil
|
||||||
})
|
})
|
||||||
return privateIPs, err
|
return privateIPs, err
|
||||||
@ -160,7 +170,7 @@ func (az *Cloud) GetIPForMachineWithRetry(name types.NodeName) (string, string,
|
|||||||
klog.Errorf("GetIPForMachineWithRetry(%s): backoff failure, will retry,err=%v", name, retryErr)
|
klog.Errorf("GetIPForMachineWithRetry(%s): backoff failure, will retry,err=%v", name, retryErr)
|
||||||
return false, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
klog.V(2).Infof("GetIPForMachineWithRetry(%s): backoff success", name)
|
klog.V(3).Infof("GetIPForMachineWithRetry(%s): backoff success", name)
|
||||||
return true, nil
|
return true, nil
|
||||||
})
|
})
|
||||||
return ip, publicIP, err
|
return ip, publicIP, err
|
||||||
@ -582,7 +592,15 @@ func (az *Cloud) UpdateVmssVMWithRetry(resourceGroupName string, VMScaleSetName
|
|||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
resp, err := az.VirtualMachineScaleSetVMsClient.Update(ctx, resourceGroupName, VMScaleSetName, instanceID, parameters, source)
|
resp, err := az.VirtualMachineScaleSetVMsClient.Update(ctx, resourceGroupName, VMScaleSetName, instanceID, parameters, source)
|
||||||
klog.V(10).Infof("VirtualMachinesClient.CreateOrUpdate(%s,%s): end", VMScaleSetName, instanceID)
|
klog.V(10).Infof("UpdateVmssVMWithRetry: VirtualMachineScaleSetVMsClient.Update(%s,%s): end", VMScaleSetName, instanceID)
|
||||||
|
|
||||||
|
if err != nil && strings.Contains(err.Error(), vmssVMNotActiveErrorMessage) {
|
||||||
|
// err is nil when the update succeeds, so it must be checked before calling err.Error(). For an instance that is being deleted, the update API reports a "not an active Virtual Machine Scale Set VM instanceId" error.
|
||||||
|
// Such an instance is going away, so stop the backoff here instead of sending more update requests for it.
|
||||||
|
klog.V(3).Infof("UpdateVmssVMWithRetry: VirtualMachineScaleSetVMsClient.Update(%s,%s) gets error message %q, abort backoff because it's probably under deleting", VMScaleSetName, instanceID, vmssVMNotActiveErrorMessage)
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
|
|
||||||
return az.processHTTPRetryResponse(nil, "", resp, err)
|
return az.processHTTPRetryResponse(nil, "", resp, err)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -34,6 +34,10 @@ const (
|
|||||||
vmPowerStateDeallocated = "deallocated"
|
vmPowerStateDeallocated = "deallocated"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
// errNodeNotInitialized is returned when a ByProviderID lookup is called with an empty providerID.
errNodeNotInitialized = fmt.Errorf("providerID is empty, the node is not initialized yet")
|
||||||
|
)
|
||||||
|
|
||||||
// NodeAddresses returns the addresses of the specified instance.
|
// NodeAddresses returns the addresses of the specified instance.
|
||||||
func (az *Cloud) NodeAddresses(ctx context.Context, name types.NodeName) ([]v1.NodeAddress, error) {
|
func (az *Cloud) NodeAddresses(ctx context.Context, name types.NodeName) ([]v1.NodeAddress, error) {
|
||||||
// Returns nil for unmanaged nodes because azure cloud provider couldn't fetch information for them.
|
// Returns nil for unmanaged nodes because azure cloud provider couldn't fetch information for them.
|
||||||
@ -143,6 +147,10 @@ func (az *Cloud) NodeAddresses(ctx context.Context, name types.NodeName) ([]v1.N
|
|||||||
// This method will not be called from the node that is requesting this ID. i.e. metadata service
|
// This method will not be called from the node that is requesting this ID. i.e. metadata service
|
||||||
// and other local methods cannot be used here
|
// and other local methods cannot be used here
|
||||||
func (az *Cloud) NodeAddressesByProviderID(ctx context.Context, providerID string) ([]v1.NodeAddress, error) {
|
func (az *Cloud) NodeAddressesByProviderID(ctx context.Context, providerID string) ([]v1.NodeAddress, error) {
|
||||||
|
if providerID == "" {
|
||||||
|
return nil, errNodeNotInitialized
|
||||||
|
}
|
||||||
|
|
||||||
// Returns nil for unmanaged nodes because azure cloud provider couldn't fetch information for them.
|
// Returns nil for unmanaged nodes because azure cloud provider couldn't fetch information for them.
|
||||||
if az.IsNodeUnmanagedByProviderID(providerID) {
|
if az.IsNodeUnmanagedByProviderID(providerID) {
|
||||||
klog.V(4).Infof("NodeAddressesByProviderID: omitting unmanaged node %q", providerID)
|
klog.V(4).Infof("NodeAddressesByProviderID: omitting unmanaged node %q", providerID)
|
||||||
@ -160,6 +168,10 @@ func (az *Cloud) NodeAddressesByProviderID(ctx context.Context, providerID strin
|
|||||||
// InstanceExistsByProviderID returns true if the instance with the given provider id still exists and is running.
|
// InstanceExistsByProviderID returns true if the instance with the given provider id still exists and is running.
|
||||||
// If false is returned with no error, the instance will be immediately deleted by the cloud controller manager.
|
// If false is returned with no error, the instance will be immediately deleted by the cloud controller manager.
|
||||||
func (az *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID string) (bool, error) {
|
func (az *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID string) (bool, error) {
|
||||||
|
if providerID == "" {
|
||||||
|
return false, errNodeNotInitialized
|
||||||
|
}
|
||||||
|
|
||||||
// Returns true for unmanaged nodes because azure cloud provider always assumes them exists.
|
// Returns true for unmanaged nodes because azure cloud provider always assumes them exists.
|
||||||
if az.IsNodeUnmanagedByProviderID(providerID) {
|
if az.IsNodeUnmanagedByProviderID(providerID) {
|
||||||
klog.V(4).Infof("InstanceExistsByProviderID: assuming unmanaged node %q exists", providerID)
|
klog.V(4).Infof("InstanceExistsByProviderID: assuming unmanaged node %q exists", providerID)
|
||||||
@ -187,13 +199,27 @@ func (az *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID stri
|
|||||||
|
|
||||||
// InstanceShutdownByProviderID returns true if the instance is in safe state to detach volumes
|
// InstanceShutdownByProviderID returns true if the instance is in safe state to detach volumes
|
||||||
func (az *Cloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
|
func (az *Cloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
|
||||||
|
if providerID == "" {
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
nodeName, err := az.vmSet.GetNodeNameByProviderID(providerID)
|
nodeName, err := az.vmSet.GetNodeNameByProviderID(providerID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
// Returns false, so the controller manager will continue to check InstanceExistsByProviderID().
|
||||||
|
if err == cloudprovider.InstanceNotFound {
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
return false, err
|
return false, err
|
||||||
}
|
}
|
||||||
|
|
||||||
powerStatus, err := az.vmSet.GetPowerStatusByNodeName(string(nodeName))
|
powerStatus, err := az.vmSet.GetPowerStatusByNodeName(string(nodeName))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
// Returns false, so the controller manager will continue to check InstanceExistsByProviderID().
|
||||||
|
if err == cloudprovider.InstanceNotFound {
|
||||||
|
return false, nil
|
||||||
|
}
|
||||||
|
|
||||||
return false, err
|
return false, err
|
||||||
}
|
}
|
||||||
klog.V(5).Infof("InstanceShutdownByProviderID gets power status %q for node %q", powerStatus, nodeName)
|
klog.V(5).Infof("InstanceShutdownByProviderID gets power status %q for node %q", powerStatus, nodeName)
|
||||||
@ -283,6 +309,10 @@ func (az *Cloud) InstanceID(ctx context.Context, name types.NodeName) (string, e
|
|||||||
// This method will not be called from the node that is requesting this ID. i.e. metadata service
|
// This method will not be called from the node that is requesting this ID. i.e. metadata service
|
||||||
// and other local methods cannot be used here
|
// and other local methods cannot be used here
|
||||||
func (az *Cloud) InstanceTypeByProviderID(ctx context.Context, providerID string) (string, error) {
|
func (az *Cloud) InstanceTypeByProviderID(ctx context.Context, providerID string) (string, error) {
|
||||||
|
if providerID == "" {
|
||||||
|
return "", errNodeNotInitialized
|
||||||
|
}
|
||||||
|
|
||||||
// Returns "" for unmanaged nodes because azure cloud provider couldn't fetch information for them.
|
// Returns "" for unmanaged nodes because azure cloud provider couldn't fetch information for them.
|
||||||
if az.IsNodeUnmanagedByProviderID(providerID) {
|
if az.IsNodeUnmanagedByProviderID(providerID) {
|
||||||
klog.V(4).Infof("InstanceTypeByProviderID: omitting unmanaged node %q", providerID)
|
klog.V(4).Infof("InstanceTypeByProviderID: omitting unmanaged node %q", providerID)
|
||||||
|
@ -204,10 +204,10 @@ func TestInstanceShutdownByProviderID(t *testing.T) {
|
|||||||
expected: false,
|
expected: false,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "InstanceShutdownByProviderID should report error if VM doesn't exist",
|
name: "InstanceShutdownByProviderID should return false if VM doesn't exist",
|
||||||
vmList: map[string]string{"vm1": "PowerState/running"},
|
vmList: map[string]string{"vm1": "PowerState/running"},
|
||||||
nodeName: "vm8",
|
nodeName: "vm8",
|
||||||
expectError: true,
|
expected: false,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -366,7 +366,9 @@ func (as *availabilitySet) GetPowerStatusByNodeName(name string) (powerState str
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return "", fmt.Errorf("failed to get power status for node %q", name)
|
// vm.InstanceView or vm.InstanceView.Statuses are nil when the VM is under deleting.
|
||||||
|
klog.V(3).Infof("InstanceView for node %q is nil, assuming it's stopped", name)
|
||||||
|
return vmPowerStateStopped, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetNodeNameByProviderID gets the node name by provider ID.
|
// GetNodeNameByProviderID gets the node name by provider ID.
|
||||||
|
@ -144,7 +144,9 @@ func (ss *scaleSet) GetPowerStatusByNodeName(name string) (powerState string, er
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return "", fmt.Errorf("failed to get power status for node %q", name)
|
// vm.InstanceView or vm.InstanceView.Statuses are nil when the VM is under deleting.
|
||||||
|
klog.V(3).Infof("InstanceView for node %q is nil, assuming it's stopped", name)
|
||||||
|
return vmPowerStateStopped, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// getCachedVirtualMachineByInstanceID gets scaleSetVMInfo from cache.
|
// getCachedVirtualMachineByInstanceID gets scaleSetVMInfo from cache.
|
||||||
@ -589,10 +591,17 @@ func (ss *scaleSet) GetPrimaryInterface(nodeName string) (network.Interface, err
|
|||||||
defer cancel()
|
defer cancel()
|
||||||
nic, err := ss.InterfacesClient.GetVirtualMachineScaleSetNetworkInterface(ctx, resourceGroup, ssName, instanceID, nicName, "")
|
nic, err := ss.InterfacesClient.GetVirtualMachineScaleSetNetworkInterface(ctx, resourceGroup, ssName, instanceID, nicName, "")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
klog.Errorf("error: ss.GetPrimaryInterface(%s), ss.GetVirtualMachineScaleSetNetworkInterface.Get(%s, %s, %s), err=%v", nodeName, resourceGroup, ssName, nicName, err)
|
exists, _, realErr := checkResourceExistsFromError(err)
|
||||||
|
if realErr != nil {
|
||||||
|
klog.Errorf("error: ss.GetPrimaryInterface(%s), ss.GetVirtualMachineScaleSetNetworkInterface.Get(%s, %s, %s), err=%v", nodeName, resourceGroup, ssName, nicName, realErr)
|
||||||
return network.Interface{}, err
|
return network.Interface{}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if !exists {
|
||||||
|
return network.Interface{}, cloudprovider.InstanceNotFound
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Fix interface's location, which is required when updating the interface.
|
// Fix interface's location, which is required when updating the interface.
|
||||||
// TODO: is this a bug of azure SDK?
|
// TODO: is this a bug of azure SDK?
|
||||||
if nic.Location == nil || *nic.Location == "" {
|
if nic.Location == nil || *nic.Location == "" {
|
||||||
@ -767,20 +776,24 @@ func (ss *scaleSet) EnsureHostsInPool(service *v1.Service, nodes []*v1.Node, bac
|
|||||||
}
|
}
|
||||||
|
|
||||||
f := func() error {
|
f := func() error {
|
||||||
// VMAS nodes should also be added to the SLB backends.
|
|
||||||
if ss.useStandardLoadBalancer() {
|
|
||||||
// Check whether the node is VMAS virtual machine.
|
// Check whether the node is VMAS virtual machine.
|
||||||
managedByAS, err := ss.isNodeManagedByAvailabilitySet(localNodeName)
|
managedByAS, err := ss.isNodeManagedByAvailabilitySet(localNodeName)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
klog.Errorf("Failed to check isNodeManagedByAvailabilitySet(%s): %v", localNodeName, err)
|
klog.Errorf("Failed to check isNodeManagedByAvailabilitySet(%s): %v", localNodeName, err)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if managedByAS {
|
if managedByAS {
|
||||||
|
// VMAS nodes should also be added to the SLB backends.
|
||||||
|
if ss.useStandardLoadBalancer() {
|
||||||
return ss.availabilitySet.EnsureHostInPool(service, types.NodeName(localNodeName), backendPoolID, vmSetName, isInternal)
|
return ss.availabilitySet.EnsureHostInPool(service, types.NodeName(localNodeName), backendPoolID, vmSetName, isInternal)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
klog.V(3).Infof("EnsureHostsInPool skips node %s because VMAS nodes couldn't be added to basic LB with VMSS backends", localNodeName)
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
err := ss.EnsureHostInPool(service, types.NodeName(localNodeName), backendPoolID, vmSetName, isInternal)
|
err = ss.EnsureHostInPool(service, types.NodeName(localNodeName), backendPoolID, vmSetName, isInternal)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("EnsureHostInPool(%s): backendPoolID(%s) - failed to ensure host in pool: %q", getServiceName(service), backendPoolID, err)
|
return fmt.Errorf("EnsureHostInPool(%s): backendPoolID(%s) - failed to ensure host in pool: %q", getServiceName(service), backendPoolID, err)
|
||||||
}
|
}
|
||||||
|
@ -89,6 +89,10 @@ func (az *Cloud) GetZone(ctx context.Context) (cloudprovider.Zone, error) {
|
|||||||
// This is particularly useful in external cloud providers where the kubelet
|
// This is particularly useful in external cloud providers where the kubelet
|
||||||
// does not initialize node data.
|
// does not initialize node data.
|
||||||
func (az *Cloud) GetZoneByProviderID(ctx context.Context, providerID string) (cloudprovider.Zone, error) {
|
func (az *Cloud) GetZoneByProviderID(ctx context.Context, providerID string) (cloudprovider.Zone, error) {
|
||||||
|
if providerID == "" {
|
||||||
|
return cloudprovider.Zone{}, errNodeNotInitialized
|
||||||
|
}
|
||||||
|
|
||||||
// Returns nil for unmanaged nodes because azure cloud provider couldn't fetch information for them.
|
// Returns nil for unmanaged nodes because azure cloud provider couldn't fetch information for them.
|
||||||
if az.IsNodeUnmanagedByProviderID(providerID) {
|
if az.IsNodeUnmanagedByProviderID(providerID) {
|
||||||
klog.V(2).Infof("GetZoneByProviderID: omitting unmanaged node %q", providerID)
|
klog.V(2).Infof("GetZoneByProviderID: omitting unmanaged node %q", providerID)
|
||||||
|
Loading…
Reference in New Issue
Block a user