From 6a487723eda902d46a1e2b5003e4de2d84d08354 Mon Sep 17 00:00:00 2001 From: Pengfei Ni Date: Tue, 11 Feb 2020 21:23:59 +0800 Subject: [PATCH] Make Azure clients only retries on specified HTTP status codes --- .../legacy-cloud-providers/azure/azure.go | 16 +++---- .../azure/azure_test.go | 2 +- .../clients/armclient/azure_armclient_test.go | 4 +- .../azure/retry/azure_error.go | 42 ++++++++++++++++--- .../azure/retry/azure_error_test.go | 8 ++-- .../azure/retry/azure_retry.go | 23 +++++----- .../azure/retry/azure_retry_test.go | 38 +++++++++++------ 7 files changed, 86 insertions(+), 47 deletions(-) diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/azure.go b/staging/src/k8s.io/legacy-cloud-providers/azure/azure.go index c1457be15d6..74342f42ad5 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/azure/azure.go +++ b/staging/src/k8s.io/legacy-cloud-providers/azure/azure.go @@ -504,23 +504,17 @@ func (az *Cloud) InitializeCloudFromConfig(config *Config, fromSecret bool) erro az.VirtualMachineSizesClient = vmsizeclient.New(azClientConfig.WithRateLimiter(config.VirtualMachineSizeRateLimit)) az.SnapshotsClient = snapshotclient.New(azClientConfig.WithRateLimiter(config.SnapshotRateLimit)) az.StorageAccountClient = storageaccountclient.New(azClientConfig.WithRateLimiter(config.StorageAccountRateLimit)) - + az.VirtualMachinesClient = vmclient.New(azClientConfig.WithRateLimiter(config.VirtualMachineRateLimit)) + az.DisksClient = diskclient.New(azClientConfig.WithRateLimiter(config.DiskRateLimit)) // fileClient is not based on armclient, but it's still backoff retried. - az.FileClient = newAzureFileClient(env, azClientConfig.Backoff.WithNonRetriableErrors([]string{}, []int{http.StatusNotFound})) - - vmClientConfig := azClientConfig.WithRateLimiter(config.VirtualMachineRateLimit) - vmClientConfig.Backoff = vmClientConfig.Backoff.WithNonRetriableErrors([]string{}, []int{http.StatusConflict}) - az.VirtualMachinesClient = vmclient.New(vmClientConfig) + az.FileClient = newAzureFileClient(env, azClientConfig.Backoff) // Error "not an active Virtual Machine Scale Set VM" is not retriable for VMSS VM. + // But http.StatusNotFound is retriable because of ARM replication latency. vmssVMClientConfig := azClientConfig.WithRateLimiter(config.VirtualMachineScaleSetRateLimit) - vmssVMClientConfig.Backoff = vmssVMClientConfig.Backoff.WithNonRetriableErrors([]string{vmssVMNotActiveErrorMessage}, []int{http.StatusConflict}) + vmssVMClientConfig.Backoff = vmssVMClientConfig.Backoff.WithNonRetriableErrors([]string{vmssVMNotActiveErrorMessage}).WithRetriableHTTPStatusCodes([]int{http.StatusNotFound}) az.VirtualMachineScaleSetVMsClient = vmssvmclient.New(vmssVMClientConfig) - disksClientConfig := azClientConfig.WithRateLimiter(config.DiskRateLimit) - disksClientConfig.Backoff = disksClientConfig.Backoff.WithNonRetriableErrors([]string{}, []int{http.StatusNotFound, http.StatusConflict}) - az.DisksClient = diskclient.New(disksClientConfig) - if az.MaximumLoadBalancerRuleCount == 0 { az.MaximumLoadBalancerRuleCount = maximumLoadBalancerRuleCount } diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_test.go b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_test.go index 924c9ba33cd..58e597107a7 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/azure/azure_test.go +++ b/staging/src/k8s.io/legacy-cloud-providers/azure/azure_test.go @@ -801,7 +801,7 @@ func TestReconcileSecurityGroupEtagMismatch(t *testing.T) { HTTPStatusCode: http.StatusPreconditionFailed, RawError: errPreconditionFailedEtagMismatch, } - assert.Equal(t, err, expectedError.Error()) + assert.Equal(t, expectedError.Error(), err) } func TestReconcilePublicIPWithNewService(t *testing.T) { diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/clients/armclient/azure_armclient_test.go b/staging/src/k8s.io/legacy-cloud-providers/azure/clients/armclient/azure_armclient_test.go index 3554e3eeaf4..1d34600801b 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/azure/clients/armclient/azure_armclient_test.go +++ b/staging/src/k8s.io/legacy-cloud-providers/azure/clients/armclient/azure_armclient_test.go @@ -154,7 +154,7 @@ func TestSendAsync(t *testing.T) { assert.Nil(t, response) assert.Equal(t, 1, count) assert.NotNil(t, rerr) - assert.Equal(t, true, rerr.Retriable) + assert.Equal(t, false, rerr.Retriable) } func TestNormalizeAzureRegion(t *testing.T) { @@ -236,7 +236,7 @@ func TestDeleteResourceAsync(t *testing.T) { count := 0 server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { count++ - http.Error(w, "failed", http.StatusForbidden) + http.Error(w, "failed", http.StatusInternalServerError) })) backoff := &retry.Backoff{Steps: 3} diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_error.go b/staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_error.go index bf0ec8df851..e2fffdbcdf9 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_error.go +++ b/staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_error.go @@ -38,6 +38,15 @@ const ( var ( // The function to get current time. now = time.Now + + // StatusCodesForRetry are a defined group of status code for which the client will retry. + StatusCodesForRetry = []int{ + http.StatusRequestTimeout, // 408 + http.StatusInternalServerError, // 500 + http.StatusBadGateway, // 502 + http.StatusServiceUnavailable, // 503 + http.StatusGatewayTimeout, // 504 + } ) // Error indicates an error returned by Azure APIs. @@ -184,18 +193,21 @@ func getHTTPStatusCode(resp *http.Response) int { // shouldRetryHTTPRequest determines if the request is retriable. func shouldRetryHTTPRequest(resp *http.Response, err error) bool { if resp != nil { - // HTTP 412 (StatusPreconditionFailed) means etag mismatch - // HTTP 400 (BadRequest) means the request cannot be accepted, hence we shouldn't retry. - if resp.StatusCode == http.StatusPreconditionFailed || resp.StatusCode == http.StatusBadRequest { - return false + for _, code := range StatusCodesForRetry { + if resp.StatusCode == code { + return true + } } - // HTTP 4xx (except 412) or 5xx suggests we should retry. - if 399 < resp.StatusCode && resp.StatusCode < 600 { + // should retry on <200, error>. + if isSuccessHTTPResponse(resp) && err != nil { return true } + + return false } + // should retry when error is not nil and no http.Response. if err != nil { return true } @@ -224,6 +236,24 @@ func getRetryAfter(resp *http.Response) time.Duration { return dur } +// GetErrorWithRetriableHTTPStatusCodes gets an error with RetriableHTTPStatusCodes. +// It is used to retry on some HTTPStatusCodes. +func GetErrorWithRetriableHTTPStatusCodes(resp *http.Response, err error, retriableHTTPStatusCodes []int) *Error { + rerr := GetError(resp, err) + if rerr == nil { + return nil + } + + for _, code := range retriableHTTPStatusCodes { + if rerr.HTTPStatusCode == code { + rerr.Retriable = true + break + } + } + + return rerr +} + // GetStatusNotFoundAndForbiddenIgnoredError gets an error with StatusNotFound and StatusForbidden ignored. // It is only used in DELETE operations. func GetStatusNotFoundAndForbiddenIgnoredError(resp *http.Response, err error) *Error { diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_error_test.go b/staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_error_test.go index 4eafe565b17..1dcb42488f5 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_error_test.go +++ b/staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_error_test.go @@ -73,7 +73,7 @@ func TestGetError(t *testing.T) { code: http.StatusSeeOther, err: fmt.Errorf("some error"), expected: &Error{ - Retriable: true, + Retriable: false, HTTPStatusCode: http.StatusSeeOther, RawError: fmt.Errorf("some error"), }, @@ -82,7 +82,7 @@ func TestGetError(t *testing.T) { code: http.StatusTooManyRequests, retryAfter: 100, expected: &Error{ - Retriable: true, + Retriable: false, HTTPStatusCode: http.StatusTooManyRequests, RetryAfter: now().Add(100 * time.Second), RawError: fmt.Errorf("some error"), @@ -156,7 +156,7 @@ func TestGetStatusNotFoundAndForbiddenIgnoredError(t *testing.T) { code: http.StatusSeeOther, err: fmt.Errorf("some error"), expected: &Error{ - Retriable: true, + Retriable: false, HTTPStatusCode: http.StatusSeeOther, RawError: fmt.Errorf("some error"), }, @@ -165,7 +165,7 @@ func TestGetStatusNotFoundAndForbiddenIgnoredError(t *testing.T) { code: http.StatusTooManyRequests, retryAfter: 100, expected: &Error{ - Retriable: true, + Retriable: false, HTTPStatusCode: http.StatusTooManyRequests, RetryAfter: now().Add(100 * time.Second), RawError: fmt.Errorf("some error"), diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_retry.go b/staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_retry.go index 209a0c2bec0..64d9f0fa461 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_retry.go +++ b/staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_retry.go @@ -58,8 +58,8 @@ type Backoff struct { Cap time.Duration // The errors indicate that the request shouldn't do more retrying. NonRetriableErrors []string - // The HTTPStatusCode indicates that the request shouldn't do more retrying. - NonRetriableHTTPStatusCodes []int + // The RetriableHTTPStatusCodes indicates that the HTTPStatusCode should do more retrying. + RetriableHTTPStatusCodes []int } // NewBackoff creates a new Backoff. @@ -73,11 +73,17 @@ func NewBackoff(duration time.Duration, factor float64, jitter float64, steps in } } -// WithNonRetriableErrors returns a new *Backoff with NonRetriableErrors, NonRetriableHTTPStatusCodes assigned. -func (b *Backoff) WithNonRetriableErrors(errs []string, httpStatusCodes []int) *Backoff { +// WithNonRetriableErrors returns a new *Backoff with NonRetriableErrors assigned. +func (b *Backoff) WithNonRetriableErrors(errs []string) *Backoff { newBackoff := *b newBackoff.NonRetriableErrors = errs - newBackoff.NonRetriableHTTPStatusCodes = httpStatusCodes + return &newBackoff +} + +// WithRetriableHTTPStatusCodes returns a new *Backoff with RetriableHTTPStatusCode assigned. +func (b *Backoff) WithRetriableHTTPStatusCodes(httpStatusCodes []int) *Backoff { + newBackoff := *b + newBackoff.RetriableHTTPStatusCodes = httpStatusCodes return &newBackoff } @@ -93,11 +99,6 @@ func (b *Backoff) isNonRetriableError(rerr *Error) bool { } } - for _, code := range b.NonRetriableHTTPStatusCodes { - if rerr.HTTPStatusCode == code { - return true - } - } return false } @@ -162,7 +163,7 @@ func doBackoffRetry(s autorest.Sender, r *http.Request, backoff *Backoff) (resp return } resp, err = s.Do(rr.Request()) - rerr := GetError(resp, err) + rerr := GetErrorWithRetriableHTTPStatusCodes(resp, err, backoff.RetriableHTTPStatusCodes) // Abort retries in the following scenarios: // 1) request succeed // 2) request is not retriable diff --git a/staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_retry_test.go b/staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_retry_test.go index 17401b2f314..b9a7cca34c7 100644 --- a/staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_retry_test.go +++ b/staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_retry_test.go @@ -75,11 +75,6 @@ func TestStep(t *testing.T) { } func TestDoBackoffRetry(t *testing.T) { - backoff := &Backoff{ - Factor: 1.0, - Steps: 3, - NonRetriableHTTPStatusCodes: []int{http.StatusNotFound}, - } fakeRequest := &http.Request{ URL: &url.URL{ Host: "localhost", @@ -96,7 +91,7 @@ func TestDoBackoffRetry(t *testing.T) { HTTPStatusCode: 500, RawError: fmt.Errorf("HTTP status code (500)"), } - resp, err := doBackoffRetry(client, fakeRequest, backoff) + resp, err := doBackoffRetry(client, fakeRequest, &Backoff{Factor: 1.0, Steps: 3}) assert.NotNil(t, resp) assert.Equal(t, 500, resp.StatusCode) assert.Equal(t, expectedErr.Error(), err) @@ -106,7 +101,7 @@ func TestDoBackoffRetry(t *testing.T) { r = mocks.NewResponseWithStatus("200 OK", http.StatusOK) client = mocks.NewSender() client.AppendAndRepeatResponse(r, 1) - resp, err = doBackoffRetry(client, fakeRequest, backoff) + resp, err = doBackoffRetry(client, fakeRequest, &Backoff{Factor: 1.0, Steps: 3}) assert.Nil(t, err) assert.Equal(t, 1, client.Attempts()) assert.NotNil(t, resp) @@ -117,28 +112,47 @@ func TestDoBackoffRetry(t *testing.T) { client = mocks.NewSender() client.AppendAndRepeatResponse(r, 1) expectedErr = &Error{ - Retriable: true, + Retriable: false, HTTPStatusCode: 429, RawError: fmt.Errorf("HTTP status code (429)"), } - resp, err = doBackoffRetry(client, fakeRequest, backoff) + resp, err = doBackoffRetry(client, fakeRequest, &Backoff{Factor: 1.0, Steps: 3}) assert.Equal(t, expectedErr.Error(), err) assert.Equal(t, 1, client.Attempts()) assert.NotNil(t, resp) assert.Equal(t, 429, resp.StatusCode) - // retry on non retriable error + // don't retry on non retriable error r = mocks.NewResponseWithStatus("404 StatusNotFound", http.StatusNotFound) client = mocks.NewSender() client.AppendAndRepeatResponse(r, 1) expectedErr = &Error{ - Retriable: true, + Retriable: false, HTTPStatusCode: 404, RawError: fmt.Errorf("HTTP status code (404)"), } - resp, err = doBackoffRetry(client, fakeRequest, backoff) + resp, err = doBackoffRetry(client, fakeRequest, &Backoff{Factor: 1.0, Steps: 3}) assert.NotNil(t, resp) assert.Equal(t, 404, resp.StatusCode) assert.Equal(t, expectedErr.Error(), err) assert.Equal(t, 1, client.Attempts()) + + // retry on RetriableHTTPStatusCodes + r = mocks.NewResponseWithStatus("102 StatusProcessing", http.StatusProcessing) + client = mocks.NewSender() + client.AppendAndRepeatResponse(r, 3) + expectedErr = &Error{ + Retriable: true, + HTTPStatusCode: 102, + RawError: fmt.Errorf("HTTP status code (102)"), + } + resp, err = doBackoffRetry(client, fakeRequest, &Backoff{ + Factor: 1.0, + Steps: 3, + RetriableHTTPStatusCodes: []int{http.StatusProcessing}, + }) + assert.NotNil(t, resp) + assert.Equal(t, 102, resp.StatusCode) + assert.Equal(t, expectedErr.Error(), err) + assert.Equal(t, 3, client.Attempts()) }