mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-22 03:11:40 +00:00
Merge pull request #80419 from feiskyer/vmss-fix
Fix retry issues when the nodes are under deleting on Azure
This commit is contained in:
commit
c08a88a2d1
@ -19,6 +19,7 @@ package azure
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
|
||||
"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-03-01/compute"
|
||||
"github.com/Azure/azure-sdk-for-go/services/network/mgmt/2018-08-01/network"
|
||||
@ -32,6 +33,11 @@ import (
|
||||
"k8s.io/klog"
|
||||
)
|
||||
|
||||
const (
|
||||
// not active means the instance is under deleting from Azure VMSS.
|
||||
vmssVMNotActiveErrorMessage = "not an active Virtual Machine Scale Set VM instanceId"
|
||||
)
|
||||
|
||||
// RequestBackoff if backoff is disabled in cloud provider it
|
||||
// returns a new Backoff object steps = 1
|
||||
// This is to make sure that the requested command executes
|
||||
@ -133,10 +139,14 @@ func (az *Cloud) getPrivateIPsForMachineWithRetry(nodeName types.NodeName) ([]st
|
||||
var retryErr error
|
||||
privateIPs, retryErr = az.vmSet.GetPrivateIPsByNodeName(string(nodeName))
|
||||
if retryErr != nil {
|
||||
// won't retry since the instance doesn't exist on Azure.
|
||||
if retryErr == cloudprovider.InstanceNotFound {
|
||||
return true, retryErr
|
||||
}
|
||||
klog.Errorf("GetPrivateIPsByNodeName(%s): backoff failure, will retry,err=%v", nodeName, retryErr)
|
||||
return false, nil
|
||||
}
|
||||
klog.V(2).Infof("GetPrivateIPsByNodeName(%s): backoff success", nodeName)
|
||||
klog.V(3).Infof("GetPrivateIPsByNodeName(%s): backoff success", nodeName)
|
||||
return true, nil
|
||||
})
|
||||
return privateIPs, err
|
||||
@ -160,7 +170,7 @@ func (az *Cloud) GetIPForMachineWithRetry(name types.NodeName) (string, string,
|
||||
klog.Errorf("GetIPForMachineWithRetry(%s): backoff failure, will retry,err=%v", name, retryErr)
|
||||
return false, nil
|
||||
}
|
||||
klog.V(2).Infof("GetIPForMachineWithRetry(%s): backoff success", name)
|
||||
klog.V(3).Infof("GetIPForMachineWithRetry(%s): backoff success", name)
|
||||
return true, nil
|
||||
})
|
||||
return ip, publicIP, err
|
||||
@ -582,7 +592,15 @@ func (az *Cloud) UpdateVmssVMWithRetry(resourceGroupName string, VMScaleSetName
|
||||
defer cancel()
|
||||
|
||||
resp, err := az.VirtualMachineScaleSetVMsClient.Update(ctx, resourceGroupName, VMScaleSetName, instanceID, parameters, source)
|
||||
klog.V(10).Infof("VirtualMachinesClient.CreateOrUpdate(%s,%s): end", VMScaleSetName, instanceID)
|
||||
klog.V(10).Infof("UpdateVmssVMWithRetry: VirtualMachineScaleSetVMsClient.Update(%s,%s): end", VMScaleSetName, instanceID)
|
||||
|
||||
if strings.Contains(err.Error(), vmssVMNotActiveErrorMessage) {
|
||||
// When instances are under deleting, updating API would report "not an active Virtual Machine Scale Set VM instanceId" error.
|
||||
// Since they're under deleting, we shouldn't send more update requests for it.
|
||||
klog.V(3).Infof("UpdateVmssVMWithRetry: VirtualMachineScaleSetVMsClient.Update(%s,%s) gets error message %q, abort backoff because it's probably under deleting", VMScaleSetName, instanceID, vmssVMNotActiveErrorMessage)
|
||||
return true, nil
|
||||
}
|
||||
|
||||
return az.processHTTPRetryResponse(nil, "", resp, err)
|
||||
})
|
||||
}
|
||||
|
@ -34,6 +34,10 @@ const (
|
||||
vmPowerStateDeallocated = "deallocated"
|
||||
)
|
||||
|
||||
var (
|
||||
errNodeNotInitialized = fmt.Errorf("providerID is empty, the node is not initialized yet")
|
||||
)
|
||||
|
||||
// NodeAddresses returns the addresses of the specified instance.
|
||||
func (az *Cloud) NodeAddresses(ctx context.Context, name types.NodeName) ([]v1.NodeAddress, error) {
|
||||
// Returns nil for unmanaged nodes because azure cloud provider couldn't fetch information for them.
|
||||
@ -143,6 +147,10 @@ func (az *Cloud) NodeAddresses(ctx context.Context, name types.NodeName) ([]v1.N
|
||||
// This method will not be called from the node that is requesting this ID. i.e. metadata service
|
||||
// and other local methods cannot be used here
|
||||
func (az *Cloud) NodeAddressesByProviderID(ctx context.Context, providerID string) ([]v1.NodeAddress, error) {
|
||||
if providerID == "" {
|
||||
return nil, errNodeNotInitialized
|
||||
}
|
||||
|
||||
// Returns nil for unmanaged nodes because azure cloud provider couldn't fetch information for them.
|
||||
if az.IsNodeUnmanagedByProviderID(providerID) {
|
||||
klog.V(4).Infof("NodeAddressesByProviderID: omitting unmanaged node %q", providerID)
|
||||
@ -160,6 +168,10 @@ func (az *Cloud) NodeAddressesByProviderID(ctx context.Context, providerID strin
|
||||
// InstanceExistsByProviderID returns true if the instance with the given provider id still exists and is running.
|
||||
// If false is returned with no error, the instance will be immediately deleted by the cloud controller manager.
|
||||
func (az *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID string) (bool, error) {
|
||||
if providerID == "" {
|
||||
return false, errNodeNotInitialized
|
||||
}
|
||||
|
||||
// Returns true for unmanaged nodes because azure cloud provider always assumes them exists.
|
||||
if az.IsNodeUnmanagedByProviderID(providerID) {
|
||||
klog.V(4).Infof("InstanceExistsByProviderID: assuming unmanaged node %q exists", providerID)
|
||||
@ -187,13 +199,27 @@ func (az *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID stri
|
||||
|
||||
// InstanceShutdownByProviderID returns true if the instance is in safe state to detach volumes
|
||||
func (az *Cloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
|
||||
if providerID == "" {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
nodeName, err := az.vmSet.GetNodeNameByProviderID(providerID)
|
||||
if err != nil {
|
||||
// Returns false, so the controller manager will continue to check InstanceExistsByProviderID().
|
||||
if err == cloudprovider.InstanceNotFound {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
return false, err
|
||||
}
|
||||
|
||||
powerStatus, err := az.vmSet.GetPowerStatusByNodeName(string(nodeName))
|
||||
if err != nil {
|
||||
// Returns false, so the controller manager will continue to check InstanceExistsByProviderID().
|
||||
if err == cloudprovider.InstanceNotFound {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
return false, err
|
||||
}
|
||||
klog.V(5).Infof("InstanceShutdownByProviderID gets power status %q for node %q", powerStatus, nodeName)
|
||||
@ -283,6 +309,10 @@ func (az *Cloud) InstanceID(ctx context.Context, name types.NodeName) (string, e
|
||||
// This method will not be called from the node that is requesting this ID. i.e. metadata service
|
||||
// and other local methods cannot be used here
|
||||
func (az *Cloud) InstanceTypeByProviderID(ctx context.Context, providerID string) (string, error) {
|
||||
if providerID == "" {
|
||||
return "", errNodeNotInitialized
|
||||
}
|
||||
|
||||
// Returns "" for unmanaged nodes because azure cloud provider couldn't fetch information for them.
|
||||
if az.IsNodeUnmanagedByProviderID(providerID) {
|
||||
klog.V(4).Infof("InstanceTypeByProviderID: omitting unmanaged node %q", providerID)
|
||||
|
@ -204,10 +204,10 @@ func TestInstanceShutdownByProviderID(t *testing.T) {
|
||||
expected: false,
|
||||
},
|
||||
{
|
||||
name: "InstanceShutdownByProviderID should report error if VM doesn't exist",
|
||||
vmList: map[string]string{"vm1": "PowerState/running"},
|
||||
nodeName: "vm8",
|
||||
expectError: true,
|
||||
name: "InstanceShutdownByProviderID should return false if VM doesn't exist",
|
||||
vmList: map[string]string{"vm1": "PowerState/running"},
|
||||
nodeName: "vm8",
|
||||
expected: false,
|
||||
},
|
||||
}
|
||||
|
||||
|
@ -366,7 +366,9 @@ func (as *availabilitySet) GetPowerStatusByNodeName(name string) (powerState str
|
||||
}
|
||||
}
|
||||
|
||||
return "", fmt.Errorf("failed to get power status for node %q", name)
|
||||
// vm.InstanceView or vm.InstanceView.Statuses are nil when the VM is under deleting.
|
||||
klog.V(3).Infof("InstanceView for node %q is nil, assuming it's stopped", name)
|
||||
return vmPowerStateStopped, nil
|
||||
}
|
||||
|
||||
// GetNodeNameByProviderID gets the node name by provider ID.
|
||||
|
@ -144,7 +144,9 @@ func (ss *scaleSet) GetPowerStatusByNodeName(name string) (powerState string, er
|
||||
}
|
||||
}
|
||||
|
||||
return "", fmt.Errorf("failed to get power status for node %q", name)
|
||||
// vm.InstanceView or vm.InstanceView.Statuses are nil when the VM is under deleting.
|
||||
klog.V(3).Infof("InstanceView for node %q is nil, assuming it's stopped", name)
|
||||
return vmPowerStateStopped, nil
|
||||
}
|
||||
|
||||
// getCachedVirtualMachineByInstanceID gets scaleSetVMInfo from cache.
|
||||
@ -589,8 +591,15 @@ func (ss *scaleSet) GetPrimaryInterface(nodeName string) (network.Interface, err
|
||||
defer cancel()
|
||||
nic, err := ss.InterfacesClient.GetVirtualMachineScaleSetNetworkInterface(ctx, resourceGroup, ssName, instanceID, nicName, "")
|
||||
if err != nil {
|
||||
klog.Errorf("error: ss.GetPrimaryInterface(%s), ss.GetVirtualMachineScaleSetNetworkInterface.Get(%s, %s, %s), err=%v", nodeName, resourceGroup, ssName, nicName, err)
|
||||
return network.Interface{}, err
|
||||
exists, _, realErr := checkResourceExistsFromError(err)
|
||||
if realErr != nil {
|
||||
klog.Errorf("error: ss.GetPrimaryInterface(%s), ss.GetVirtualMachineScaleSetNetworkInterface.Get(%s, %s, %s), err=%v", nodeName, resourceGroup, ssName, nicName, realErr)
|
||||
return network.Interface{}, err
|
||||
}
|
||||
|
||||
if !exists {
|
||||
return network.Interface{}, cloudprovider.InstanceNotFound
|
||||
}
|
||||
}
|
||||
|
||||
// Fix interface's location, which is required when updating the interface.
|
||||
@ -767,20 +776,24 @@ func (ss *scaleSet) EnsureHostsInPool(service *v1.Service, nodes []*v1.Node, bac
|
||||
}
|
||||
|
||||
f := func() error {
|
||||
// VMAS nodes should also be added to the SLB backends.
|
||||
if ss.useStandardLoadBalancer() {
|
||||
// Check whether the node is VMAS virtual machine.
|
||||
managedByAS, err := ss.isNodeManagedByAvailabilitySet(localNodeName)
|
||||
if err != nil {
|
||||
klog.Errorf("Failed to check isNodeManagedByAvailabilitySet(%s): %v", localNodeName, err)
|
||||
return err
|
||||
}
|
||||
if managedByAS {
|
||||
return ss.availabilitySet.EnsureHostInPool(service, types.NodeName(localNodeName), backendPoolID, vmSetName, isInternal)
|
||||
}
|
||||
// Check whether the node is VMAS virtual machine.
|
||||
managedByAS, err := ss.isNodeManagedByAvailabilitySet(localNodeName)
|
||||
if err != nil {
|
||||
klog.Errorf("Failed to check isNodeManagedByAvailabilitySet(%s): %v", localNodeName, err)
|
||||
return err
|
||||
}
|
||||
|
||||
err := ss.EnsureHostInPool(service, types.NodeName(localNodeName), backendPoolID, vmSetName, isInternal)
|
||||
if managedByAS {
|
||||
// VMAS nodes should also be added to the SLB backends.
|
||||
if ss.useStandardLoadBalancer() {
|
||||
return ss.availabilitySet.EnsureHostInPool(service, types.NodeName(localNodeName), backendPoolID, vmSetName, isInternal)
|
||||
}
|
||||
|
||||
klog.V(3).Infof("EnsureHostsInPool skips node %s because VMAS nodes couldn't be added to basic LB with VMSS backends", localNodeName)
|
||||
return nil
|
||||
}
|
||||
|
||||
err = ss.EnsureHostInPool(service, types.NodeName(localNodeName), backendPoolID, vmSetName, isInternal)
|
||||
if err != nil {
|
||||
return fmt.Errorf("EnsureHostInPool(%s): backendPoolID(%s) - failed to ensure host in pool: %q", getServiceName(service), backendPoolID, err)
|
||||
}
|
||||
|
@ -89,6 +89,10 @@ func (az *Cloud) GetZone(ctx context.Context) (cloudprovider.Zone, error) {
|
||||
// This is particularly useful in external cloud providers where the kubelet
|
||||
// does not initialize node data.
|
||||
func (az *Cloud) GetZoneByProviderID(ctx context.Context, providerID string) (cloudprovider.Zone, error) {
|
||||
if providerID == "" {
|
||||
return cloudprovider.Zone{}, errNodeNotInitialized
|
||||
}
|
||||
|
||||
// Returns nil for unmanaged nodes because azure cloud provider couldn't fetch information for them.
|
||||
if az.IsNodeUnmanagedByProviderID(providerID) {
|
||||
klog.V(2).Infof("GetZoneByProviderID: omitting unmanaged node %q", providerID)
|
||||
|
Loading…
Reference in New Issue
Block a user