Fix retry issues when the nodes are under deleting on Azure
commit 2a62bc74ad
parent f74b610036
@@ -19,6 +19,7 @@ package azure
 import (
 	"fmt"
 	"net/http"
+	"strings"
 
 	"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-03-01/compute"
 	"github.com/Azure/azure-sdk-for-go/services/network/mgmt/2018-08-01/network"
@@ -32,6 +33,11 @@ import (
 	"k8s.io/klog"
 )
 
+const (
+	// not active means the instance is under deleting from Azure VMSS.
+	vmssVMNotActiveErrorMessage = "not an active Virtual Machine Scale Set VM instanceId"
+)
+
 // RequestBackoff if backoff is disabled in cloud provider it
 // returns a new Backoff object steps = 1
 // This is to make sure that the requested command executes
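
The truncated RequestBackoff comment above refers to falling back to a single-step backoff when retries are disabled. Below is a minimal sketch, assuming the standard k8s.io/apimachinery/pkg/util/wait package (not code from this commit), of what Steps = 1 means in practice: the condition runs exactly once and the helper then gives up.

package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	// Steps = 1: at most one attempt, so a "disabled" backoff still runs the request once.
	noRetry := wait.Backoff{Duration: time.Second, Factor: 2.0, Steps: 1}

	attempts := 0
	err := wait.ExponentialBackoff(noRetry, func() (bool, error) {
		attempts++
		return false, nil // pretend the request has not succeeded yet
	})
	fmt.Println(attempts, err) // 1 attempt, then the wait package's timeout error
}
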
@@ -133,10 +139,14 @@ func (az *Cloud) getPrivateIPsForMachineWithRetry(nodeName types.NodeName) ([]st
 		var retryErr error
 		privateIPs, retryErr = az.vmSet.GetPrivateIPsByNodeName(string(nodeName))
 		if retryErr != nil {
+			// won't retry since the instance doesn't exist on Azure.
+			if retryErr == cloudprovider.InstanceNotFound {
+				return true, retryErr
+			}
 			klog.Errorf("GetPrivateIPsByNodeName(%s): backoff failure, will retry,err=%v", nodeName, retryErr)
 			return false, nil
 		}
-		klog.V(2).Infof("GetPrivateIPsByNodeName(%s): backoff success", nodeName)
+		klog.V(3).Infof("GetPrivateIPsByNodeName(%s): backoff success", nodeName)
 		return true, nil
 	})
 	return privateIPs, err
@@ -160,7 +170,7 @@ func (az *Cloud) GetIPForMachineWithRetry(name types.NodeName) (string, string,
 			klog.Errorf("GetIPForMachineWithRetry(%s): backoff failure, will retry,err=%v", name, retryErr)
 			return false, nil
 		}
-		klog.V(2).Infof("GetIPForMachineWithRetry(%s): backoff success", name)
+		klog.V(3).Infof("GetIPForMachineWithRetry(%s): backoff success", name)
 		return true, nil
 	})
 	return ip, publicIP, err
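
The getPrivateIPsForMachineWithRetry hunk changes what the backoff condition returns: on cloudprovider.InstanceNotFound it now returns (true, err) to abort the loop instead of retrying against a VM that is being deleted. A hedged sketch of that ConditionFunc contract follows; lookupPrivateIPs and errInstanceNotFound are illustrative stand-ins for GetPrivateIPsByNodeName and cloudprovider.InstanceNotFound.

package main

import (
	"errors"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

// errInstanceNotFound stands in for cloudprovider.InstanceNotFound.
var errInstanceNotFound = errors.New("instance not found")

// lookupPrivateIPs simulates GetPrivateIPsByNodeName for a VM that is already
// being deleted on Azure.
func lookupPrivateIPs(node string) ([]string, error) {
	return nil, errInstanceNotFound
}

func main() {
	var ips []string
	backoff := wait.Backoff{Duration: 100 * time.Millisecond, Factor: 2.0, Steps: 4}
	err := wait.ExponentialBackoff(backoff, func() (bool, error) {
		var retryErr error
		ips, retryErr = lookupPrivateIPs("vm-0")
		if retryErr != nil {
			if errors.Is(retryErr, errInstanceNotFound) {
				// (true, err): stop retrying and surface the error, as in the
				// new InstanceNotFound branch above.
				return true, retryErr
			}
			// (false, nil): transient failure, let the backoff retry.
			return false, nil
		}
		// (true, nil): success, stop retrying.
		return true, nil
	})
	fmt.Println(ips, err) // nil slice, "instance not found", after a single attempt
}
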
@@ -582,7 +592,15 @@ func (az *Cloud) UpdateVmssVMWithRetry(resourceGroupName string, VMScaleSetName
 		defer cancel()
 
 		resp, err := az.VirtualMachineScaleSetVMsClient.Update(ctx, resourceGroupName, VMScaleSetName, instanceID, parameters, source)
-		klog.V(10).Infof("VirtualMachinesClient.CreateOrUpdate(%s,%s): end", VMScaleSetName, instanceID)
+		klog.V(10).Infof("UpdateVmssVMWithRetry: VirtualMachineScaleSetVMsClient.Update(%s,%s): end", VMScaleSetName, instanceID)
+
+		if strings.Contains(err.Error(), vmssVMNotActiveErrorMessage) {
+			// When instances are under deleting, updating API would report "not an active Virtual Machine Scale Set VM instanceId" error.
+			// Since they're under deleting, we shouldn't send more update requests for it.
+			klog.V(3).Infof("UpdateVmssVMWithRetry: VirtualMachineScaleSetVMsClient.Update(%s,%s) gets error message %q, abort backoff because it's probably under deleting", VMScaleSetName, instanceID, vmssVMNotActiveErrorMessage)
+			return true, nil
+		}
+
 		return az.processHTTPRetryResponse(nil, "", resp, err)
 	})
 }
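
UpdateVmssVMWithRetry now matches the VMSS error message against the vmssVMNotActiveErrorMessage constant and stops the backoff when it hits. The sketch below shows the same classification in isolation; isTerminalVMSSError is a hypothetical helper (it also guards against a nil error) and is not part of the provider.

package main

import (
	"errors"
	"fmt"
	"strings"
)

// vmssVMNotActiveErrorMessage mirrors the constant introduced in this commit.
const vmssVMNotActiveErrorMessage = "not an active Virtual Machine Scale Set VM instanceId"

// isTerminalVMSSError (hypothetical) reports whether an update error means the
// instance is being deleted, in which case further update retries are pointless.
func isTerminalVMSSError(err error) bool {
	return err != nil && strings.Contains(err.Error(), vmssVMNotActiveErrorMessage)
}

func main() {
	// Simulated error text containing the message Azure returns for deleting instances.
	err := errors.New("instance 3 is not an active Virtual Machine Scale Set VM instanceId")
	fmt.Println(isTerminalVMSSError(err)) // true: abort the backoff instead of retrying
	fmt.Println(isTerminalVMSSError(nil)) // false: nothing to classify
}
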
@@ -205,8 +205,7 @@ func (az *Cloud) InstanceShutdownByProviderID(ctx context.Context, providerID st
 
 	nodeName, err := az.vmSet.GetNodeNameByProviderID(providerID)
 	if err != nil {
-		// returns false, because otherwise node is not deleted from cluster
-		// false means that it will continue to check InstanceExistsByProviderID
+		// Returns false, so the controller manager will continue to check InstanceExistsByProviderID().
 		if err == cloudprovider.InstanceNotFound {
 			return false, nil
 		}
@@ -216,8 +215,7 @@ func (az *Cloud) InstanceShutdownByProviderID(ctx context.Context, providerID st
 
 	powerStatus, err := az.vmSet.GetPowerStatusByNodeName(string(nodeName))
 	if err != nil {
-		// returns false, because otherwise node is not deleted from cluster
-		// false means that it will continue to check InstanceExistsByProviderID
+		// Returns false, so the controller manager will continue to check InstanceExistsByProviderID().
 		if err == cloudprovider.InstanceNotFound {
 			return false, nil
 		}
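
The rewritten comments describe a caller-side contract: reporting (false, nil) on InstanceNotFound lets the controller manager fall through to InstanceExistsByProviderID() and eventually delete the node object. The sketch below illustrates that flow with a simplified stand-in for the cloud-provider Instances interface; classifyNode and fakeAzure are hypothetical and only loosely mimic the node lifecycle controller.

package main

import (
	"context"
	"fmt"
)

// instanceChecker is a simplified stand-in for the cloud-provider Instances interface.
type instanceChecker interface {
	InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error)
	InstanceExistsByProviderID(ctx context.Context, providerID string) (bool, error)
}

// classifyNode loosely mimics a node lifecycle controller: shut-down instances
// get tainted, instances that no longer exist lead to node deletion.
func classifyNode(ctx context.Context, c instanceChecker, providerID string) (string, error) {
	shutdown, err := c.InstanceShutdownByProviderID(ctx, providerID)
	if err != nil {
		return "", err
	}
	if shutdown {
		return "taint-as-shutdown", nil
	}
	exists, err := c.InstanceExistsByProviderID(ctx, providerID)
	if err != nil {
		return "", err
	}
	if !exists {
		return "delete-node", nil
	}
	return "keep", nil
}

// fakeAzure models a VM that is already gone: the shutdown check answers
// (false, nil), as in the diff above, and the existence check answers false.
type fakeAzure struct{}

func (fakeAzure) InstanceShutdownByProviderID(context.Context, string) (bool, error) {
	return false, nil
}

func (fakeAzure) InstanceExistsByProviderID(context.Context, string) (bool, error) {
	return false, nil
}

func main() {
	action, _ := classifyNode(context.Background(), fakeAzure{}, "azure:///deleted-vm")
	fmt.Println(action) // delete-node
}
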
@@ -204,10 +204,10 @@ func TestInstanceShutdownByProviderID(t *testing.T) {
 			expected: false,
 		},
 		{
-			name:        "InstanceShutdownByProviderID should report error if VM doesn't exist",
-			vmList:      map[string]string{"vm1": "PowerState/running"},
-			nodeName:    "vm8",
-			expectError: true,
+			name:     "InstanceShutdownByProviderID should return false if VM doesn't exist",
+			vmList:   map[string]string{"vm1": "PowerState/running"},
+			nodeName: "vm8",
+			expected: false,
 		},
 	}
 
@@ -366,7 +366,9 @@ func (as *availabilitySet) GetPowerStatusByNodeName(name string) (powerState str
 		}
 	}
 
-	return "", fmt.Errorf("failed to get power status for node %q", name)
+	// vm.InstanceView or vm.InstanceView.Statuses are nil when the VM is under deleting.
+	klog.V(3).Infof("InstanceView for node %q is nil, assuming it's stopped", name)
+	return vmPowerStateStopped, nil
 }
 
 // GetNodeNameByProviderID gets the node name by provider ID.
@@ -144,7 +144,9 @@ func (ss *scaleSet) GetPowerStatusByNodeName(name string) (powerState string, er
 		}
 	}
 
-	return "", fmt.Errorf("failed to get power status for node %q", name)
+	// vm.InstanceView or vm.InstanceView.Statuses are nil when the VM is under deleting.
+	klog.V(3).Infof("InstanceView for node %q is nil, assuming it's stopped", name)
+	return vmPowerStateStopped, nil
 }
 
 // getCachedVirtualMachineByInstanceID gets scaleSetVMInfo from cache.
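
Both GetPowerStatusByNodeName implementations now treat a missing InstanceView as a stopped VM rather than an error. A small sketch of that fallback, using local structs that stand in for the Azure compute SDK's instance-view types and assuming vmPowerStateStopped is the provider's "stopped" constant:

package main

import (
	"fmt"
	"strings"
)

// vmPowerStateStopped is assumed to be the provider's "stopped" power state.
const vmPowerStateStopped = "stopped"

// instanceViewStatus and instanceView stand in for the compute SDK's
// instance-view structs; only the pointer shape matters for this sketch.
type instanceViewStatus struct{ Code *string }

type instanceView struct{ Statuses *[]instanceViewStatus }

// powerState mirrors the pattern in GetPowerStatusByNodeName: scan the
// "PowerState/..." status codes and treat a missing instance view (a VM that
// is being deleted) as stopped instead of returning an error.
func powerState(view *instanceView) string {
	if view != nil && view.Statuses != nil {
		for _, s := range *view.Statuses {
			if s.Code != nil && strings.HasPrefix(*s.Code, "PowerState/") {
				return strings.TrimPrefix(*s.Code, "PowerState/")
			}
		}
	}
	return vmPowerStateStopped
}

func main() {
	running := "PowerState/running"
	fmt.Println(powerState(&instanceView{Statuses: &[]instanceViewStatus{{Code: &running}}})) // running
	fmt.Println(powerState(nil))                                                              // stopped: the VM is under deleting
}
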
@@ -589,8 +591,15 @@ func (ss *scaleSet) GetPrimaryInterface(nodeName string) (network.Interface, err
 	defer cancel()
 	nic, err := ss.InterfacesClient.GetVirtualMachineScaleSetNetworkInterface(ctx, resourceGroup, ssName, instanceID, nicName, "")
 	if err != nil {
-		klog.Errorf("error: ss.GetPrimaryInterface(%s), ss.GetVirtualMachineScaleSetNetworkInterface.Get(%s, %s, %s), err=%v", nodeName, resourceGroup, ssName, nicName, err)
-		return network.Interface{}, err
+		exists, _, realErr := checkResourceExistsFromError(err)
+		if realErr != nil {
+			klog.Errorf("error: ss.GetPrimaryInterface(%s), ss.GetVirtualMachineScaleSetNetworkInterface.Get(%s, %s, %s), err=%v", nodeName, resourceGroup, ssName, nicName, realErr)
+			return network.Interface{}, err
+		}
+
+		if !exists {
+			return network.Interface{}, cloudprovider.InstanceNotFound
+		}
 	}
 
 	// Fix interface's location, which is required when updating the interface.
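
GetPrimaryInterface now funnels the SDK error through checkResourceExistsFromError so that a plain 404 becomes cloudprovider.InstanceNotFound. The sketch below captures only the idea; httpError, notFound, and getPrimaryInterface are illustrative, and the real helper inspects the Azure SDK's detailed error rather than this toy type.

package main

import (
	"errors"
	"fmt"
	"net/http"
)

// errInstanceNotFound stands in for cloudprovider.InstanceNotFound.
var errInstanceNotFound = errors.New("instance not found")

// httpError is a toy error carrying a status code; the real code inspects the
// Azure SDK's detailed error instead.
type httpError struct {
	StatusCode int
	Msg        string
}

func (e *httpError) Error() string { return e.Msg }

// notFound reports whether err is an HTTP 404, i.e. the resource is gone.
func notFound(err error) bool {
	var he *httpError
	return errors.As(err, &he) && he.StatusCode == http.StatusNotFound
}

// getPrimaryInterface shows the shape of the new error handling: real errors
// propagate, while a 404 is mapped to InstanceNotFound so callers stop retrying.
func getPrimaryInterface(fetch func() error) error {
	if err := fetch(); err != nil {
		if notFound(err) {
			return errInstanceNotFound
		}
		return err
	}
	return nil
}

func main() {
	gone := func() error { return &httpError{StatusCode: http.StatusNotFound, Msg: "nic not found"} }
	fmt.Println(getPrimaryInterface(gone)) // instance not found
}
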
@@ -767,20 +776,24 @@ func (ss *scaleSet) EnsureHostsInPool(service *v1.Service, nodes []*v1.Node, bac
 		}
 
 		f := func() error {
-			// VMAS nodes should also be added to the SLB backends.
-			if ss.useStandardLoadBalancer() {
-				// Check whether the node is VMAS virtual machine.
-				managedByAS, err := ss.isNodeManagedByAvailabilitySet(localNodeName)
-				if err != nil {
-					klog.Errorf("Failed to check isNodeManagedByAvailabilitySet(%s): %v", localNodeName, err)
-					return err
-				}
-				if managedByAS {
-					return ss.availabilitySet.EnsureHostInPool(service, types.NodeName(localNodeName), backendPoolID, vmSetName, isInternal)
-				}
+			// Check whether the node is VMAS virtual machine.
+			managedByAS, err := ss.isNodeManagedByAvailabilitySet(localNodeName)
+			if err != nil {
+				klog.Errorf("Failed to check isNodeManagedByAvailabilitySet(%s): %v", localNodeName, err)
+				return err
 			}
 
-			err := ss.EnsureHostInPool(service, types.NodeName(localNodeName), backendPoolID, vmSetName, isInternal)
+			if managedByAS {
+				// VMAS nodes should also be added to the SLB backends.
+				if ss.useStandardLoadBalancer() {
+					return ss.availabilitySet.EnsureHostInPool(service, types.NodeName(localNodeName), backendPoolID, vmSetName, isInternal)
+				}
+
+				klog.V(3).Infof("EnsureHostsInPool skips node %s because VMAS nodes couldn't be added to basic LB with VMSS backends", localNodeName)
+				return nil
+			}
+
+			err = ss.EnsureHostInPool(service, types.NodeName(localNodeName), backendPoolID, vmSetName, isInternal)
 			if err != nil {
 				return fmt.Errorf("EnsureHostInPool(%s): backendPoolID(%s) - failed to ensure host in pool: %q", getServiceName(service), backendPoolID, err)
 			}
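
The reordered EnsureHostsInPool first classifies the node, then routes it: VMAS-managed nodes join the availability-set backend pool when a standard load balancer is used, are skipped on a basic load balancer, and everything else takes the VMSS path. A condensed, purely illustrative sketch of that decision (names are not the provider's):

package main

import "fmt"

// backendPoolTarget mirrors only the branching of the new code path above.
func backendPoolTarget(managedByAvailabilitySet, useStandardLB bool) string {
	if managedByAvailabilitySet {
		if useStandardLB {
			return "availability-set backend pool"
		}
		// Basic LBs with VMSS backends cannot also reference VMAS nodes.
		return "skipped"
	}
	return "VMSS backend pool"
}

func main() {
	fmt.Println(backendPoolTarget(true, true))   // availability-set backend pool
	fmt.Println(backendPoolTarget(true, false))  // skipped
	fmt.Println(backendPoolTarget(false, false)) // VMSS backend pool
}
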