From ee7cd252e88b94c7209366ff641e7e3eb1b07fc4 Mon Sep 17 00:00:00 2001 From: Benjamin Pineau Date: Sun, 4 Oct 2020 10:59:19 +0200 Subject: [PATCH] Azure: fix node removal race condition on VMSS deletion When a VMSS is being deleted, instances are removed first. The VMSS itself will disappear once empty. That delay is generally enough for kube-controller-manager to delete the corresponding k8s nodes, but might not when busy or throttled (for instance). If kubernetes nodes remains after their backing VMSS were removed, Azure cloud-provider will fail listing that VMSS VMs, and downstream callers (ie. `InstanceExistsByProviderID`) won't account those errors for a missing instance. The nodes will remain (still considered as "existing"), and controller-manager will indefinitely retry to VMSS VMs list it, draining API calls quotas, potentially causing throttling. In practice a missing scale set implies instances attributed to that VMSS don't exists either: `InstanceExistsByProviderID` (part of the general cloud provider interface) should return false in that case. --- .../azure/azure_instances_test.go | 54 +++++++++++++++++++ .../azure/azure_vmss.go | 3 ++ .../azure/retry/azure_error.go | 9 ++++ 3 files changed, 66 insertions(+) diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_instances_test.go b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_instances_test.go index 431a2d8660a..4602f45e4fe 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_instances_test.go +++ b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_instances_test.go @@ -38,6 +38,8 @@ import ( "k8s.io/legacy-cloud-providers/azure/clients/interfaceclient/mockinterfaceclient" "k8s.io/legacy-cloud-providers/azure/clients/publicipclient/mockpublicipclient" "k8s.io/legacy-cloud-providers/azure/clients/vmclient/mockvmclient" + "k8s.io/legacy-cloud-providers/azure/clients/vmssclient/mockvmssclient" + "k8s.io/legacy-cloud-providers/azure/clients/vmssvmclient/mockvmssvmclient" "k8s.io/legacy-cloud-providers/azure/retry" ) @@ -633,6 +635,58 @@ func TestInstanceExistsByProviderID(t *testing.T) { assert.Equal(t, test.expectedErrMsg, err, test.name) assert.Equal(t, test.expected, exist, test.name) } + + vmssTestCases := []struct { + name string + providerID string + scaleSet string + vmList []string + expected bool + rerr *retry.Error + }{ + { + name: "InstanceExistsByProviderID should return true if VMSS and VM exist", + providerID: "azure:///subscriptions/script/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/vmssee6c2/virtualMachines/0", + scaleSet: "vmssee6c2", + vmList: []string{"vmssee6c2000000"}, + expected: true, + }, + { + name: "InstanceExistsByProviderID should return false if VMSS exist but VM doesn't", + providerID: "azure:///subscriptions/script/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/vmssee6c2/virtualMachines/0", + scaleSet: "vmssee6c2", + expected: false, + }, + { + name: "InstanceExistsByProviderID should return false if VMSS doesn't exist", + providerID: "azure:///subscriptions/script/resourceGroups/rg/providers/Microsoft.Compute/virtualMachineScaleSets/missing-vmss/virtualMachines/0", + rerr: &retry.Error{HTTPStatusCode: 404}, + expected: false, + }, + } + + for _, test := range vmssTestCases { + ss, err := newTestScaleSet(ctrl) + assert.NoError(t, err, test.name) + cloud.VMSet = ss + + mockVMSSClient := mockvmssclient.NewMockInterface(ctrl) + mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl) + ss.cloud.VirtualMachineScaleSetsClient = mockVMSSClient + ss.cloud.VirtualMachineScaleSetVMsClient = mockVMSSVMClient + + expectedScaleSet := buildTestVMSS(test.scaleSet, test.scaleSet) + mockVMSSClient.EXPECT().List(gomock.Any(), gomock.Any()).Return([]compute.VirtualMachineScaleSet{expectedScaleSet}, test.rerr).AnyTimes() + + expectedVMs, _, _ := buildTestVirtualMachineEnv(ss.cloud, test.scaleSet, "", 0, test.vmList, "succeeded", false) + mockVMSSVMClient.EXPECT().List(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(expectedVMs, test.rerr).AnyTimes() + + mockVMsClient := ss.cloud.VirtualMachinesClient.(*mockvmclient.MockInterface) + mockVMsClient.EXPECT().List(gomock.Any(), gomock.Any()).Return([]compute.VirtualMachine{}, nil).AnyTimes() + + exist, _ := cloud.InstanceExistsByProviderID(context.Background(), test.providerID) + assert.Equal(t, test.expected, exist, test.name) + } } func TestNodeAddressesByProviderID(t *testing.T) { diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_vmss.go b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_vmss.go index 14c350d8440..c0512ff01eb 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_vmss.go +++ b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_vmss.go @@ -697,6 +697,9 @@ func (ss *scaleSet) listScaleSetVMs(scaleSetName, resourceGroup string) ([]compu allVMs, rerr := ss.VirtualMachineScaleSetVMsClient.List(ctx, resourceGroup, scaleSetName, string(compute.InstanceView)) if rerr != nil { klog.Errorf("VirtualMachineScaleSetVMsClient.List failed: %v", rerr) + if rerr.IsNotFound() { + return nil, cloudprovider.InstanceNotFound + } return nil, rerr.Error() } diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_error.go b/staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_error.go index a0cedbef2d4..ed130852ab0 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_error.go +++ b/staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_error.go @@ -89,6 +89,15 @@ func (err *Error) IsThrottled() bool { return err.HTTPStatusCode == http.StatusTooManyRequests || err.RetryAfter.After(now()) } +// IsNotFound returns true the if the requested object wasn't found +func (err *Error) IsNotFound() bool { + if err == nil { + return false + } + + return err.HTTPStatusCode == http.StatusNotFound +} + // NewError creates a new Error. func NewError(retriable bool, err error) *Error { return &Error{