From 46593daf2276dd42a6fadab70aa6268e970804ac Mon Sep 17 00:00:00 2001 From: Qi Ni Date: Wed, 21 Oct 2020 13:23:07 +0800 Subject: [PATCH] Update the PIP when it is not in the Succeeded provisioning state during the LB update. --- .../azure/azure_backoff.go | 48 ++++++++++++++++--- .../azure/azure_backoff_test.go | 15 ++++++ 2 files changed, 56 insertions(+), 7 deletions(-) diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_backoff.go b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_backoff.go index 6ac166c6a89..6c54a2cbe07 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_backoff.go +++ b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_backoff.go @@ -20,6 +20,7 @@ package azure import ( "net/http" + "regexp" "strings" "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-12-01/compute" @@ -44,6 +45,12 @@ const ( operationCanceledErrorMessage = "canceledandsupersededduetoanotheroperation" cannotDeletePublicIPErrorMessageCode = "PublicIPAddressCannotBeDeleted" + + referencedResourceNotProvisionedMessageCode = "ReferencedResourceNotProvisioned" +) + +var ( + pipErrorMessageRE = regexp.MustCompile(`(?:.*)/subscriptions/(?:.*)/resourceGroups/(.*)/providers/Microsoft.Network/publicIPAddresses/([^\s]+)(?:.*)`) ) // RequestBackoff if backoff is disabled in cloud provider it @@ -182,7 +189,7 @@ func (az *Cloud) CreateOrUpdateLB(service *v1.Service, lb network.LoadBalancer) defer cancel() rgName := az.getLoadBalancerResourceGroup() - rerr := az.LoadBalancerClient.CreateOrUpdate(ctx, rgName, *lb.Name, lb, to.String(lb.Etag)) + rerr := az.LoadBalancerClient.CreateOrUpdate(ctx, rgName, to.String(lb.Name), lb, to.String(lb.Etag)) klog.V(10).Infof("LoadBalancerClient.CreateOrUpdate(%s): end", *lb.Name) if rerr == nil { // Invalidate the cache right after updating @@ -192,12 +199,39 @@ func (az *Cloud) CreateOrUpdateLB(service *v1.Service, lb network.LoadBalancer) // Invalidate the cache because ETAG precondition mismatch. if rerr.HTTPStatusCode == http.StatusPreconditionFailed { - klog.V(3).Infof("LoadBalancer cache for %s is cleanup because of http.StatusPreconditionFailed", *lb.Name) + klog.V(3).Infof("LoadBalancer cache for %s is cleanup because of http.StatusPreconditionFailed", to.String(lb.Name)) az.lbCache.Delete(*lb.Name) } + + retryErrorMessage := rerr.Error().Error() // Invalidate the cache because another new operation has canceled the current request. - if strings.Contains(strings.ToLower(rerr.Error().Error()), operationCanceledErrorMessage) { - klog.V(3).Infof("LoadBalancer cache for %s is cleanup because CreateOrUpdate is canceled by another operation", *lb.Name) + if strings.Contains(strings.ToLower(retryErrorMessage), operationCanceledErrorMessage) { + klog.V(3).Infof("LoadBalancer cache for %s is cleanup because CreateOrUpdate is canceled by another operation", to.String(lb.Name)) + az.lbCache.Delete(*lb.Name) + } + + // The LB update may fail because the referenced PIP is not in the Succeeded provisioning state + if strings.Contains(strings.ToLower(retryErrorMessage), strings.ToLower(referencedResourceNotProvisionedMessageCode)) { + matches := pipErrorMessageRE.FindStringSubmatch(retryErrorMessage) + if len(matches) != 3 { + klog.Warningf("Failed to parse the retry error message %s", retryErrorMessage) + return rerr.Error() + } + pipRG, pipName := matches[1], matches[2] + klog.V(3).Infof("The public IP %s referenced by load balancer %s is not in Succeeded provisioning state, will try to update it", pipName, to.String(lb.Name)) + pip, _, err := az.getPublicIPAddress(pipRG, pipName) + if err != nil { + klog.Warningf("Failed to get the public IP %s in resource group %s: %v", pipName, pipRG, err) + return rerr.Error() + } + // Perform a dummy update to fix the provisioning state + err = az.CreateOrUpdatePIP(service, pipRG, pip) + if err != nil { + klog.Warningf("Failed to update the public IP %s in resource group %s: %v", pipName, pipRG, err) + return rerr.Error() + } + // Invalidate the LB cache, return the error, and the controller manager + // would retry the LB update in the next reconcile loop az.lbCache.Delete(*lb.Name) } @@ -241,10 +275,10 @@ func (az *Cloud) CreateOrUpdatePIP(service *v1.Service, pipResourceGroup string, ctx, cancel := getContextWithCancel() defer cancel() - rerr := az.PublicIPAddressesClient.CreateOrUpdate(ctx, pipResourceGroup, *pip.Name, pip) - klog.V(10).Infof("PublicIPAddressesClient.CreateOrUpdate(%s, %s): end", pipResourceGroup, *pip.Name) + rerr := az.PublicIPAddressesClient.CreateOrUpdate(ctx, pipResourceGroup, to.String(pip.Name), pip) + klog.V(10).Infof("PublicIPAddressesClient.CreateOrUpdate(%s, %s): end", pipResourceGroup, to.String(pip.Name)) if rerr != nil { - klog.Errorf("PublicIPAddressesClient.CreateOrUpdate(%s, %s) failed: %s", pipResourceGroup, *pip.Name, rerr.Error().Error()) + klog.Errorf("PublicIPAddressesClient.CreateOrUpdate(%s, %s) failed: %s", pipResourceGroup, to.String(pip.Name), rerr.Error().Error()) az.Event(service, v1.EventTypeWarning, "CreateOrUpdatePublicIPAddress", rerr.Error().Error()) return rerr.Error() } diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_backoff_test.go b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_backoff_test.go index 85388857aab..5dd41d08834 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_backoff_test.go +++ b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_backoff_test.go @@ -240,6 +240,8 @@ func TestCreateOrUpdateLB(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() + referencedResourceNotProvisionedRawErrorString := `Code="ReferencedResourceNotProvisioned" Message="Cannot proceed with operation because resource /subscriptions/sub/resourceGroups/rg/providers/Microsoft.Network/publicIPAddresses/pip used by resource /subscriptions/sub/resourceGroups/rg/providers/Microsoft.Network/loadBalancers/lb is not in Succeeded state. Resource is in Failed state and the last operation that updated/is updating the resource is PutPublicIpAddressOperation."` + tests := []struct { clientErr *retry.Error expectedErr error @@ -252,6 +254,10 @@ func TestCreateOrUpdateLB(t *testing.T) { clientErr: &retry.Error{RawError: fmt.Errorf(operationCanceledErrorMessage)}, expectedErr: fmt.Errorf("Retriable: false, RetryAfter: 0s, HTTPStatusCode: 0, RawError: canceledandsupersededduetoanotheroperation"), }, + { + clientErr: &retry.Error{RawError: fmt.Errorf(referencedResourceNotProvisionedRawErrorString)}, + expectedErr: fmt.Errorf("Retriable: false, RetryAfter: 0s, HTTPStatusCode: 0, RawError: %s", referencedResourceNotProvisionedRawErrorString), + }, } for _, test := range tests { @@ -262,6 +268,15 @@ func TestCreateOrUpdateLB(t *testing.T) { mockLBClient.EXPECT().CreateOrUpdate(gomock.Any(), az.ResourceGroup, gomock.Any(), gomock.Any(), gomock.Any()).Return(test.clientErr) mockLBClient.EXPECT().Get(gomock.Any(), az.ResourceGroup, "lb", gomock.Any()).Return(network.LoadBalancer{}, nil) + mockPIPClient := az.PublicIPAddressesClient.(*mockpublicipclient.MockInterface) + mockPIPClient.EXPECT().CreateOrUpdate(gomock.Any(), az.ResourceGroup, "pip", gomock.Any()).Return(nil).AnyTimes() + mockPIPClient.EXPECT().Get(gomock.Any(), az.ResourceGroup, "pip", gomock.Any()).Return(network.PublicIPAddress{ + Name: to.StringPtr("pip"), + PublicIPAddressPropertiesFormat: &network.PublicIPAddressPropertiesFormat{ + ProvisioningState: to.StringPtr("Succeeded"), + }, + }, nil).AnyTimes() + err := az.CreateOrUpdateLB(&v1.Service{}, network.LoadBalancer{ Name: to.StringPtr("lb"), Etag: to.StringPtr("etag"),