mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-23 19:56:01 +00:00
Azure: fix node removal race condition on VMSS deletion
When a VMSS is being deleted, instances are removed first. The VMSS itself will disappear once empty. That delay is generally enough for kube-controller-manager to delete the corresponding k8s nodes, but might not when busy or throttled (for instance). If kubernetes nodes remains after their backing VMSS were removed, Azure cloud-provider will fail listing that VMSS VMs, and downstream callers (ie. `InstanceExistsByProviderID`) won't account those errors for a missing instance. The nodes will remain (still considered as "existing"), and controller-manager will indefinitely retry to VMSS VMs list it, draining API calls quotas, potentially causing throttling. In practice a missing scale set implies instances attributed to that VMSS don't exists either: `InstanceExistsByProviderID` (part of the general cloud provider interface) should return false in that case.
This commit is contained in:
parent
0d1ac16ca4
commit
ee7cd252e8
@ -38,6 +38,8 @@ import (
|
||||
"k8s.io/legacy-cloud-providers/azure/clients/interfaceclient/mockinterfaceclient"
|
||||
"k8s.io/legacy-cloud-providers/azure/clients/publicipclient/mockpublicipclient"
|
||||
"k8s.io/legacy-cloud-providers/azure/clients/vmclient/mockvmclient"
|
||||
"k8s.io/legacy-cloud-providers/azure/clients/vmssclient/mockvmssclient"
|
||||
"k8s.io/legacy-cloud-providers/azure/clients/vmssvmclient/mockvmssvmclient"
|
||||
"k8s.io/legacy-cloud-providers/azure/retry"
|
||||
)
|
||||
|
||||
@ -633,6 +635,58 @@ func TestInstanceExistsByProviderID(t *testing.T) {
|
||||
assert.Equal(t, test.expectedErrMsg, err, test.name)
|
||||
assert.Equal(t, test.expected, exist, test.name)
|
||||
}
|
||||
|
||||
vmssTestCases := []struct {
|
||||
name string
|
||||
providerID string
|
||||
scaleSet string
|
||||
vmList []string
|
||||
expected bool
|
||||
rerr *retry.Error
|
||||
}{
|
||||
{
|
||||
name: "InstanceExistsByProviderID should return true if VMSS and VM exist",
|
||||
providerID: "azure:///subscriptions/script/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/vmssee6c2/virtualMachines/0",
|
||||
scaleSet: "vmssee6c2",
|
||||
vmList: []string{"vmssee6c2000000"},
|
||||
expected: true,
|
||||
},
|
||||
{
|
||||
name: "InstanceExistsByProviderID should return false if VMSS exist but VM doesn't",
|
||||
providerID: "azure:///subscriptions/script/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/vmssee6c2/virtualMachines/0",
|
||||
scaleSet: "vmssee6c2",
|
||||
expected: false,
|
||||
},
|
||||
{
|
||||
name: "InstanceExistsByProviderID should return false if VMSS doesn't exist",
|
||||
providerID: "azure:///subscriptions/script/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/missing-vmss/virtualMachines/0",
|
||||
rerr: &retry.Error{HTTPStatusCode: 404},
|
||||
expected: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range vmssTestCases {
|
||||
ss, err := newTestScaleSet(ctrl)
|
||||
assert.NoError(t, err, test.name)
|
||||
cloud.VMSet = ss
|
||||
|
||||
mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
|
||||
mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
|
||||
ss.cloud.VirtualMachineScaleSetsClient = mockVMSSClient
|
||||
ss.cloud.VirtualMachineScaleSetVMsClient = mockVMSSVMClient
|
||||
|
||||
expectedScaleSet := buildTestVMSS(test.scaleSet, test.scaleSet)
|
||||
mockVMSSClient.EXPECT().List(gomock.Any(), gomock.Any()).Return([]compute.VirtualMachineScaleSet{expectedScaleSet}, test.rerr).AnyTimes()
|
||||
|
||||
expectedVMs, _, _ := buildTestVirtualMachineEnv(ss.cloud, test.scaleSet, "", 0, test.vmList, "succeeded", false)
|
||||
mockVMSSVMClient.EXPECT().List(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(expectedVMs, test.rerr).AnyTimes()
|
||||
|
||||
mockVMsClient := ss.cloud.VirtualMachinesClient.(*mockvmclient.MockInterface)
|
||||
mockVMsClient.EXPECT().List(gomock.Any(), gomock.Any()).Return([]compute.VirtualMachine{}, nil).AnyTimes()
|
||||
|
||||
exist, _ := cloud.InstanceExistsByProviderID(context.Background(), test.providerID)
|
||||
assert.Equal(t, test.expected, exist, test.name)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNodeAddressesByProviderID(t *testing.T) {
|
||||
|
@ -697,6 +697,9 @@ func (ss *scaleSet) listScaleSetVMs(scaleSetName, resourceGroup string) ([]compu
|
||||
allVMs, rerr := ss.VirtualMachineScaleSetVMsClient.List(ctx, resourceGroup, scaleSetName, string(compute.InstanceView))
|
||||
if rerr != nil {
|
||||
klog.Errorf("VirtualMachineScaleSetVMsClient.List failed: %v", rerr)
|
||||
if rerr.IsNotFound() {
|
||||
return nil, cloudprovider.InstanceNotFound
|
||||
}
|
||||
return nil, rerr.Error()
|
||||
}
|
||||
|
||||
|
@ -89,6 +89,15 @@ func (err *Error) IsThrottled() bool {
|
||||
return err.HTTPStatusCode == http.StatusTooManyRequests || err.RetryAfter.After(now())
|
||||
}
|
||||
|
||||
// IsNotFound returns true the if the requested object wasn't found
|
||||
func (err *Error) IsNotFound() bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
return err.HTTPStatusCode == http.StatusNotFound
|
||||
}
|
||||
|
||||
// NewError creates a new Error.
|
||||
func NewError(retriable bool, err error) *Error {
|
||||
return &Error{
|
||||
|
Loading…
Reference in New Issue
Block a user