Azure: fix node removal race condition on VMSS deletion

When a VMSS is being deleted, instances are removed first. The VMSS
itself will disappear once empty. That delay is generally enough for
kube-controller-manager to delete the corresponding k8s nodes, but
might not when busy or throttled (for instance).

If kubernetes nodes remains after their backing VMSS were removed, Azure
cloud-provider will fail listing that VMSS VMs, and downstream callers
(ie. `InstanceExistsByProviderID`) won't account those errors for a
missing instance. The nodes will remain (still considered as "existing"),
and controller-manager will indefinitely retry to VMSS VMs list it,
draining API calls quotas, potentially causing throttling.

In practice a missing scale set implies instances attributed to that
VMSS don't exists either: `InstanceExistsByProviderID` (part of the
general cloud provider interface) should return false in that case.
This commit is contained in:
Benjamin Pineau 2020-10-04 10:59:19 +02:00
parent 0d1ac16ca4
commit ee7cd252e8
3 changed files with 66 additions and 0 deletions

View File

@ -38,6 +38,8 @@ import (
"k8s.io/legacy-cloud-providers/azure/clients/interfaceclient/mockinterfaceclient"
"k8s.io/legacy-cloud-providers/azure/clients/publicipclient/mockpublicipclient"
"k8s.io/legacy-cloud-providers/azure/clients/vmclient/mockvmclient"
"k8s.io/legacy-cloud-providers/azure/clients/vmssclient/mockvmssclient"
"k8s.io/legacy-cloud-providers/azure/clients/vmssvmclient/mockvmssvmclient"
"k8s.io/legacy-cloud-providers/azure/retry"
)
@ -633,6 +635,58 @@ func TestInstanceExistsByProviderID(t *testing.T) {
assert.Equal(t, test.expectedErrMsg, err, test.name)
assert.Equal(t, test.expected, exist, test.name)
}
vmssTestCases := []struct {
name string
providerID string
scaleSet string
vmList []string
expected bool
rerr *retry.Error
}{
{
name: "InstanceExistsByProviderID should return true if VMSS and VM exist",
providerID: "azure:///subscriptions/script/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/vmssee6c2/virtualMachines/0",
scaleSet: "vmssee6c2",
vmList: []string{"vmssee6c2000000"},
expected: true,
},
{
name: "InstanceExistsByProviderID should return false if VMSS exist but VM doesn't",
providerID: "azure:///subscriptions/script/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/vmssee6c2/virtualMachines/0",
scaleSet: "vmssee6c2",
expected: false,
},
{
name: "InstanceExistsByProviderID should return false if VMSS doesn't exist",
providerID: "azure:///subscriptions/script/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/missing-vmss/virtualMachines/0",
rerr: &retry.Error{HTTPStatusCode: 404},
expected: false,
},
}
for _, test := range vmssTestCases {
ss, err := newTestScaleSet(ctrl)
assert.NoError(t, err, test.name)
cloud.VMSet = ss
mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
ss.cloud.VirtualMachineScaleSetsClient = mockVMSSClient
ss.cloud.VirtualMachineScaleSetVMsClient = mockVMSSVMClient
expectedScaleSet := buildTestVMSS(test.scaleSet, test.scaleSet)
mockVMSSClient.EXPECT().List(gomock.Any(), gomock.Any()).Return([]compute.VirtualMachineScaleSet{expectedScaleSet}, test.rerr).AnyTimes()
expectedVMs, _, _ := buildTestVirtualMachineEnv(ss.cloud, test.scaleSet, "", 0, test.vmList, "succeeded", false)
mockVMSSVMClient.EXPECT().List(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(expectedVMs, test.rerr).AnyTimes()
mockVMsClient := ss.cloud.VirtualMachinesClient.(*mockvmclient.MockInterface)
mockVMsClient.EXPECT().List(gomock.Any(), gomock.Any()).Return([]compute.VirtualMachine{}, nil).AnyTimes()
exist, _ := cloud.InstanceExistsByProviderID(context.Background(), test.providerID)
assert.Equal(t, test.expected, exist, test.name)
}
}
func TestNodeAddressesByProviderID(t *testing.T) {

View File

@ -697,6 +697,9 @@ func (ss *scaleSet) listScaleSetVMs(scaleSetName, resourceGroup string) ([]compu
allVMs, rerr := ss.VirtualMachineScaleSetVMsClient.List(ctx, resourceGroup, scaleSetName, string(compute.InstanceView))
if rerr != nil {
klog.Errorf("VirtualMachineScaleSetVMsClient.List failed: %v", rerr)
if rerr.IsNotFound() {
return nil, cloudprovider.InstanceNotFound
}
return nil, rerr.Error()
}

View File

@ -89,6 +89,15 @@ func (err *Error) IsThrottled() bool {
return err.HTTPStatusCode == http.StatusTooManyRequests || err.RetryAfter.After(now())
}
// IsNotFound returns true the if the requested object wasn't found
func (err *Error) IsNotFound() bool {
if err == nil {
return false
}
return err.HTTPStatusCode == http.StatusNotFound
}
// NewError creates a new Error.
func NewError(retriable bool, err error) *Error {
return &Error{