From 9f80e7a6f8eacca91ce53dd3bd6997bc29f73c16 Mon Sep 17 00:00:00 2001 From: Michael Gugino Date: Mon, 14 Oct 2019 11:04:39 -0400 Subject: [PATCH 1/3] Allow deletion of pending pods when using PDBS Currently, if you have a PDB set, it is possible for a pod stuck in pending state to be prevented from deletion even though there are in fact enough healthy replicas. This commit allows pods in Pending state to be removed. This commit also adds associated unit tests. related-bug: #80389 --- pkg/registry/core/pod/storage/BUILD | 3 + pkg/registry/core/pod/storage/eviction.go | 83 +++++- .../core/pod/storage/eviction_test.go | 273 +++++++++++++++++- 3 files changed, 341 insertions(+), 18 deletions(-) diff --git a/pkg/registry/core/pod/storage/BUILD b/pkg/registry/core/pod/storage/BUILD index 9ee6edd4f0f..fb3586bba99 100644 --- a/pkg/registry/core/pod/storage/BUILD +++ b/pkg/registry/core/pod/storage/BUILD @@ -22,11 +22,14 @@ go_test( "//staging/src/k8s.io/api/policy/v1beta1:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/api/equality:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/apis/meta/internalversion:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/fields:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/runtime:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/runtime/schema:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/util/diff:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/watch:go_default_library", "//staging/src/k8s.io/apiserver/pkg/endpoints/request:go_default_library", "//staging/src/k8s.io/apiserver/pkg/registry/generic:go_default_library", "//staging/src/k8s.io/apiserver/pkg/registry/generic/registry:go_default_library", diff --git a/pkg/registry/core/pod/storage/eviction.go b/pkg/registry/core/pod/storage/eviction.go index d396cc7ef22..4c553400e43 100644 --- a/pkg/registry/core/pod/storage/eviction.go +++ b/pkg/registry/core/pod/storage/eviction.go @@ -29,7 +29,6 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/util/wait" - genericregistry "k8s.io/apiserver/pkg/registry/generic/registry" "k8s.io/apiserver/pkg/registry/rest" "k8s.io/apiserver/pkg/util/dryrun" policyclient "k8s.io/client-go/kubernetes/typed/policy/v1beta1" @@ -46,24 +45,25 @@ const ( // This situation should self-correct because the PDB controller removes // entries from the map automatically after the PDB DeletionTimeout regardless. MaxDisruptedPodSize = 2000 + retrySteps = 20 ) // EvictionsRetry is the retry for a conflict where multiple clients // are making changes to the same resource. var EvictionsRetry = wait.Backoff{ - Steps: 20, + Steps: retrySteps, Duration: 500 * time.Millisecond, Factor: 1.0, Jitter: 0.1, } -func newEvictionStorage(store *genericregistry.Store, podDisruptionBudgetClient policyclient.PodDisruptionBudgetsGetter) *EvictionREST { +func newEvictionStorage(store rest.StandardStorage, podDisruptionBudgetClient policyclient.PodDisruptionBudgetsGetter) *EvictionREST { return &EvictionREST{store: store, podDisruptionBudgetClient: podDisruptionBudgetClient} } // EvictionREST implements the REST endpoint for evicting pods from nodes type EvictionREST struct { - store *genericregistry.Store + store rest.StandardStorage podDisruptionBudgetClient policyclient.PodDisruptionBudgetsGetter } @@ -130,13 +130,55 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje // Evicting a terminal pod should result in direct deletion of pod as it already caused disruption by the time we are evicting. // There is no need to check for pdb. - if pod.Status.Phase == api.PodSucceeded || pod.Status.Phase == api.PodFailed { - _, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deletionOptions) - if err != nil { - return nil, err + if canIgnorePDB(pod) { + deleteRefresh := false + continueToPDBs := false + // Preserve current deletionOptions if we need to fall through to checking PDBs later. + preservedDeletionOptions := deletionOptions.DeepCopy() + err := retry.RetryOnConflict(EvictionsRetry, func() error { + if deleteRefresh { + // If we hit a conflict error, get the latest pod + obj, err = r.store.Get(ctx, eviction.Name, &metav1.GetOptions{}) + if err != nil { + return err + } + pod = obj.(*api.Pod) + if !canIgnorePDB(pod) { + // Pod is no longer in a state where we can skip checking + // PDBs, continue to PDB checks. + continueToPDBs = true + // restore original deletion options because we may have + // modified them. + deletionOptions = preservedDeletionOptions + return nil + } + } + if shouldEnforceResourceVersion(pod) && resourceVersionIsUnset(preservedDeletionOptions) { + // Set deletionOptions.Preconditions.ResourceVersion to ensure we're not + // racing with another PDB-impacting process elsewhere. + if deletionOptions.Preconditions == nil { + deletionOptions.Preconditions = &metav1.Preconditions{} + } + deletionOptions.Preconditions.ResourceVersion = &pod.ResourceVersion + } else { + // restore original deletion options because we may have + // modified them. + deletionOptions = preservedDeletionOptions + } + _, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deletionOptions) + if err != nil { + deleteRefresh = true + return err + } + return nil + }) + if !continueToPDBs { + if err != nil { + return nil, err + } + return &metav1.Status{ + Status: metav1.StatusSuccess}, nil } - return &metav1.Status{ - Status: metav1.StatusSuccess}, nil } var rtStatus *metav1.Status var pdbName string @@ -203,6 +245,27 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje return &metav1.Status{Status: metav1.StatusSuccess}, nil } +// canIgnorePDB returns true for pod conditions that allow the pod to be deleted +// without checking PDBs. +func canIgnorePDB(pod *api.Pod) bool { + if pod.Status.Phase == api.PodSucceeded || pod.Status.Phase == api.PodFailed || pod.Status.Phase == api.PodPending { + return true + } + return false +} + +func shouldEnforceResourceVersion(pod *api.Pod) bool { + // Only pods that may be included as health in PDBs in the future need to be checked. + if pod.Status.Phase == api.PodPending { + return true + } + return false +} + +func resourceVersionIsUnset(options *metav1.DeleteOptions) bool { + return options.Preconditions == nil || options.Preconditions.ResourceVersion == nil +} + // checkAndDecrement checks if the provided PodDisruptionBudget allows any disruption. func (r *EvictionREST) checkAndDecrement(namespace string, podName string, pdb policyv1beta1.PodDisruptionBudget, dryRun bool) error { if pdb.Status.ObservedGeneration < pdb.Generation { diff --git a/pkg/registry/core/pod/storage/eviction_test.go b/pkg/registry/core/pod/storage/eviction_test.go index 3898175aa93..3c4e4cb4fbe 100644 --- a/pkg/registry/core/pod/storage/eviction_test.go +++ b/pkg/registry/core/pod/storage/eviction_test.go @@ -17,13 +17,19 @@ limitations under the License. package storage import ( + "context" + "errors" "testing" policyv1beta1 "k8s.io/api/policy/v1beta1" apierrors "k8s.io/apimachinery/pkg/api/errors" + metainternalversion "k8s.io/apimachinery/pkg/apis/meta/internalversion" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/watch" genericapirequest "k8s.io/apiserver/pkg/endpoints/request" + "k8s.io/apiserver/pkg/registry/rest" "k8s.io/client-go/kubernetes/fake" api "k8s.io/kubernetes/pkg/apis/core" "k8s.io/kubernetes/pkg/apis/policy" @@ -39,16 +45,59 @@ func TestEviction(t *testing.T) { expectError bool expectDeleted bool + podPhase api.PodPhase + podName string }{ { - name: "matching pdbs with no disruptions allowed", + name: "matching pdbs with no disruptions allowed, pod running", pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{ ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"}, Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}}, Status: policyv1beta1.PodDisruptionBudgetStatus{DisruptionsAllowed: 0}, }}, - eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)}, + eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t1", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)}, expectError: true, + podPhase: api.PodRunning, + podName: "t1", + }, + { + name: "matching pdbs with no disruptions allowed, pod pending", + pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"}, + Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}}, + Status: policyv1beta1.PodDisruptionBudgetStatus{DisruptionsAllowed: 0}, + }}, + eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t2", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)}, + expectError: false, + podPhase: api.PodPending, + expectDeleted: true, + podName: "t2", + }, + { + name: "matching pdbs with no disruptions allowed, pod succeeded", + pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"}, + Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}}, + Status: policyv1beta1.PodDisruptionBudgetStatus{DisruptionsAllowed: 0}, + }}, + eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t3", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)}, + expectError: false, + podPhase: api.PodSucceeded, + expectDeleted: true, + podName: "t3", + }, + { + name: "matching pdbs with no disruptions allowed, pod failed", + pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"}, + Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}}, + Status: policyv1beta1.PodDisruptionBudgetStatus{DisruptionsAllowed: 0}, + }}, + eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t4", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)}, + expectError: false, + podPhase: api.PodFailed, + expectDeleted: true, + podName: "t4", }, { name: "matching pdbs with disruptions allowed", @@ -57,8 +106,9 @@ func TestEviction(t *testing.T) { Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}}, Status: policyv1beta1.PodDisruptionBudgetStatus{DisruptionsAllowed: 1}, }}, - eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)}, + eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t5", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)}, expectDeleted: true, + podName: "t5", }, { name: "non-matching pdbs", @@ -67,8 +117,9 @@ func TestEviction(t *testing.T) { Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"b": "true"}}}, Status: policyv1beta1.PodDisruptionBudgetStatus{DisruptionsAllowed: 0}, }}, - eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)}, + eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t6", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)}, expectDeleted: true, + podName: "t6", }, { name: "matching pdbs with disruptions allowed but bad name in Url", @@ -78,26 +129,35 @@ func TestEviction(t *testing.T) { Status: policyv1beta1.PodDisruptionBudgetStatus{DisruptionsAllowed: 1}, }}, badNameInURL: true, - eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)}, + eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t7", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)}, expectError: true, + podName: "t7", }, } for _, tc := range testcases { t.Run(tc.name, func(t *testing.T) { testContext := genericapirequest.WithNamespace(genericapirequest.NewContext(), metav1.NamespaceDefault) - storage, _, _, server := newStorage(t) + storage, _, statusStorage, server := newStorage(t) defer server.Terminate(t) defer storage.Store.DestroyFunc() pod := validNewPod() + pod.Name = tc.podName pod.Labels = map[string]string{"a": "true"} pod.Spec.NodeName = "foo" - if _, err := storage.Create(testContext, pod, nil, &metav1.CreateOptions{}); err != nil { t.Error(err) } + if tc.podPhase != "" { + pod.Status.Phase = tc.podPhase + _, _, err := statusStorage.Update(testContext, pod.Name, rest.DefaultUpdatedObjectInfo(pod), rest.ValidateAllObjectFunc, rest.ValidateAllObjectUpdateFunc, false, &metav1.UpdateOptions{}) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + } + client := fake.NewSimpleClientset(tc.pdbs...) evictionRest := newEvictionStorage(storage.Store, client.PolicyV1beta1()) @@ -105,9 +165,11 @@ func TestEviction(t *testing.T) { if tc.badNameInURL { name += "bad-name" } + _, err := evictionRest.Create(testContext, name, tc.eviction, nil, &metav1.CreateOptions{}) + //_, err = evictionRest.Create(testContext, name, tc.eviction, nil, &metav1.CreateOptions{}) if (err != nil) != tc.expectError { - t.Errorf("expected error=%v, got %v", tc.expectError, err) + t.Errorf("expected error=%v, got %v; name %v", tc.expectError, err, pod.Name) return } if tc.badNameInURL { @@ -146,6 +208,122 @@ func TestEviction(t *testing.T) { } } +func TestEvictionIngorePDB(t *testing.T) { + testcases := []struct { + name string + pdbs []runtime.Object + eviction *policy.Eviction + + expectError bool + podPhase api.PodPhase + podName string + expectedDeleteCount int + }{ + { + name: "pdbs No disruptions allowed, pod pending, first delete conflict, pod still pending, pod deleted successfully", + pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"}, + Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}}, + Status: policyv1beta1.PodDisruptionBudgetStatus{DisruptionsAllowed: 0}, + }}, + eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t1", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)}, + expectError: false, + podPhase: api.PodPending, + podName: "t1", + expectedDeleteCount: 3, + }, + // This test case is critical. If it is removed or broken we may + // regress and allow a pod to be deleted without checking PDBs when the + // pod should not be deleted. + { + name: "pdbs No disruptions allowed, pod pending, first delete conflict, pod becomes running, continueToPDBs", + pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"}, + Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}}, + Status: policyv1beta1.PodDisruptionBudgetStatus{DisruptionsAllowed: 0}, + }}, + eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t2", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)}, + expectError: true, + podPhase: api.PodPending, + podName: "t2", + expectedDeleteCount: 1, + }, + { + name: "pdbs disruptions allowed, pod pending, first delete conflict, pod becomes running, continueToPDBs", + pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"}, + Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}}, + Status: policyv1beta1.PodDisruptionBudgetStatus{DisruptionsAllowed: 1}, + }}, + eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t3", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)}, + expectError: false, + podPhase: api.PodPending, + podName: "t3", + expectedDeleteCount: 2, + }, + { + name: "pod pending, always conflict on delete", + pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"}, + Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}}, + Status: policyv1beta1.PodDisruptionBudgetStatus{DisruptionsAllowed: 0}, + }}, + eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t4", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)}, + expectError: true, + podPhase: api.PodPending, + podName: "t4", + expectedDeleteCount: retrySteps, + }, + { + name: "pod pending, always conflict on delete, user provided ResourceVersion constraint", + pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"}, + Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}}, + Status: policyv1beta1.PodDisruptionBudgetStatus{DisruptionsAllowed: 0}, + }}, + eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t5", Namespace: "default"}, DeleteOptions: metav1.NewRVDeletionPrecondition("userProvided")}, + expectError: true, + podPhase: api.PodPending, + podName: "t5", + expectedDeleteCount: retrySteps, + }, + } + + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + testContext := genericapirequest.WithNamespace(genericapirequest.NewContext(), metav1.NamespaceDefault) + ms := &mockStore{ + deleteCount: 0, + } + + pod := validNewPod() + pod.Name = tc.podName + pod.Labels = map[string]string{"a": "true"} + pod.Spec.NodeName = "foo" + if tc.podPhase != "" { + pod.Status.Phase = tc.podPhase + } + + client := fake.NewSimpleClientset(tc.pdbs...) + evictionRest := newEvictionStorage(ms, client.PolicyV1beta1()) + + name := pod.Name + ms.pod = pod + + _, err := evictionRest.Create(testContext, name, tc.eviction, nil, &metav1.CreateOptions{}) + if (err != nil) != tc.expectError { + t.Errorf("expected error=%v, got %v; name %v", tc.expectError, err, pod.Name) + return + } + + if tc.expectedDeleteCount != ms.deleteCount { + t.Errorf("expected delete count=%v, got %v; name %v", tc.expectedDeleteCount, ms.deleteCount, pod.Name) + } + + }) + } +} + func TestEvictionDryRun(t *testing.T) { testcases := []struct { name string @@ -204,3 +382,82 @@ func TestEvictionDryRun(t *testing.T) { }) } } + +func resource(resource string) schema.GroupResource { + return schema.GroupResource{Group: "", Resource: resource} +} + +type mockStore struct { + deleteCount int + pod *api.Pod +} + +func (ms *mockStore) mutatorDeleteFunc(count int, options *metav1.DeleteOptions) (runtime.Object, bool, error) { + if ms.pod.Name == "t4" { + // Always return error for this pod + return nil, false, apierrors.NewConflict(resource("tests"), "2", errors.New("message")) + } + if count == 1 { + // This is a hack to ensure that some test pods don't change phase + // but do change resource version + if ms.pod.Name != "t1" && ms.pod.Name != "t5" { + ms.pod.Status.Phase = api.PodRunning + } + ms.pod.ResourceVersion = "999" + // Always return conflict on the first attempt + return nil, false, apierrors.NewConflict(resource("tests"), "2", errors.New("message")) + } + // Compare enforce deletionOptions + if options == nil || options.Preconditions == nil || options.Preconditions.ResourceVersion == nil { + return nil, true, nil + } else if *options.Preconditions.ResourceVersion != "1000" { + // Here we're simulating that the pod has changed resource version again + // pod "t4" should make it here, this validates we're getting the latest + // resourceVersion of the pod and successfully delete on the next deletion + // attempt after this one. + ms.pod.ResourceVersion = "1000" + return nil, false, apierrors.NewConflict(resource("tests"), "2", errors.New("message")) + } + return nil, true, nil +} + +func (ms *mockStore) Delete(ctx context.Context, name string, deleteValidation rest.ValidateObjectFunc, options *metav1.DeleteOptions) (runtime.Object, bool, error) { + ms.deleteCount++ + return ms.mutatorDeleteFunc(ms.deleteCount, options) +} + +func (ms *mockStore) Watch(ctx context.Context, options *metainternalversion.ListOptions) (watch.Interface, error) { + return nil, nil +} + +func (ms *mockStore) Update(ctx context.Context, name string, objInfo rest.UpdatedObjectInfo, createValidation rest.ValidateObjectFunc, updateValidation rest.ValidateObjectUpdateFunc, forceAllowCreate bool, options *metav1.UpdateOptions) (runtime.Object, bool, error) { + return nil, false, nil +} + +func (ms *mockStore) Get(ctx context.Context, name string, options *metav1.GetOptions) (runtime.Object, error) { + return ms.pod, nil +} + +func (ms *mockStore) New() runtime.Object { + return nil +} + +func (ms *mockStore) Create(ctx context.Context, obj runtime.Object, createValidation rest.ValidateObjectFunc, options *metav1.CreateOptions) (runtime.Object, error) { + return nil, nil +} + +func (ms *mockStore) DeleteCollection(ctx context.Context, deleteValidation rest.ValidateObjectFunc, options *metav1.DeleteOptions, listOptions *metainternalversion.ListOptions) (runtime.Object, error) { + return nil, nil +} + +func (ms *mockStore) List(ctx context.Context, options *metainternalversion.ListOptions) (runtime.Object, error) { + return nil, nil +} + +func (ms *mockStore) NewList() runtime.Object { + return nil +} + +func (ms *mockStore) ConvertToTable(ctx context.Context, object runtime.Object, tableOptions runtime.Object) (*metav1.Table, error) { + return nil, nil +} From 4522141f0a4e5490ca0af4f46bd517425bebe34d Mon Sep 17 00:00:00 2001 From: David Eads Date: Tue, 12 May 2020 14:27:52 -0400 Subject: [PATCH 2/3] reduce complexity in pdb refactor --- pkg/registry/core/pod/storage/eviction.go | 108 ++++++++++------------ 1 file changed, 49 insertions(+), 59 deletions(-) diff --git a/pkg/registry/core/pod/storage/eviction.go b/pkg/registry/core/pod/storage/eviction.go index 4c553400e43..f353e9f8568 100644 --- a/pkg/registry/core/pod/storage/eviction.go +++ b/pkg/registry/core/pod/storage/eviction.go @@ -111,75 +111,65 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje return nil, errors.NewBadRequest("name in URL does not match name in Eviction object") } - deletionOptions, err := propagateDryRun(eviction, options) + originalDeleteOptions, err := propagateDryRun(eviction, options) if err != nil { return nil, err } - obj, err = r.store.Get(ctx, eviction.Name, &metav1.GetOptions{}) - if err != nil { - return nil, err - } - pod := obj.(*api.Pod) - if createValidation != nil { if err := createValidation(ctx, eviction.DeepCopyObject()); err != nil { return nil, err } } - // Evicting a terminal pod should result in direct deletion of pod as it already caused disruption by the time we are evicting. - // There is no need to check for pdb. - if canIgnorePDB(pod) { - deleteRefresh := false - continueToPDBs := false - // Preserve current deletionOptions if we need to fall through to checking PDBs later. - preservedDeletionOptions := deletionOptions.DeepCopy() - err := retry.RetryOnConflict(EvictionsRetry, func() error { - if deleteRefresh { - // If we hit a conflict error, get the latest pod - obj, err = r.store.Get(ctx, eviction.Name, &metav1.GetOptions{}) - if err != nil { - return err - } - pod = obj.(*api.Pod) - if !canIgnorePDB(pod) { - // Pod is no longer in a state where we can skip checking - // PDBs, continue to PDB checks. - continueToPDBs = true - // restore original deletion options because we may have - // modified them. - deletionOptions = preservedDeletionOptions - return nil - } - } - if shouldEnforceResourceVersion(pod) && resourceVersionIsUnset(preservedDeletionOptions) { - // Set deletionOptions.Preconditions.ResourceVersion to ensure we're not - // racing with another PDB-impacting process elsewhere. - if deletionOptions.Preconditions == nil { - deletionOptions.Preconditions = &metav1.Preconditions{} - } - deletionOptions.Preconditions.ResourceVersion = &pod.ResourceVersion - } else { - // restore original deletion options because we may have - // modified them. - deletionOptions = preservedDeletionOptions - } - _, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deletionOptions) - if err != nil { - deleteRefresh = true - return err - } - return nil - }) - if !continueToPDBs { - if err != nil { - return nil, err - } - return &metav1.Status{ - Status: metav1.StatusSuccess}, nil + var pod *api.Pod + deletedPod := false + err = retry.RetryOnConflict(EvictionsRetry, func() error { + obj, err = r.store.Get(ctx, eviction.Name, &metav1.GetOptions{}) + if err != nil { + return err } + pod = obj.(*api.Pod) + + // Evicting a terminal pod should result in direct deletion of pod as it already caused disruption by the time we are evicting. + // There is no need to check for pdb. + if !canIgnorePDB(pod) { + // Pod is not in a state where we can skip checking PDBs, exit the loop, and continue to PDB checks. + return nil + } + + // the PDB can be ignored, so delete the pod + deletionOptions := originalDeleteOptions.DeepCopy() + if shouldEnforceResourceVersion(pod) && resourceVersionIsUnset(originalDeleteOptions) { + // Set deletionOptions.Preconditions.ResourceVersion to ensure we're not + // racing with another PDB-impacting process elsewhere. + if deletionOptions.Preconditions == nil { + deletionOptions.Preconditions = &metav1.Preconditions{} + } + deletionOptions.Preconditions.ResourceVersion = &pod.ResourceVersion + } + _, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deletionOptions) + if err != nil { + return err + } + deletedPod = true + return nil + }) + switch { + case err != nil: + // this can happen in cases where the PDB can be ignored, but there was a problem issuing the pod delete: + // maybe we conflicted two many times or we didn't have permission or something else weird. + return nil, err + + case deletedPod: + // this happens when we successfully deleted the pod. In this case, we're done executing because we've evicted/deleted the pod + return &metav1.Status{Status: metav1.StatusSuccess}, nil + + default: + // this happens when we didn't have an error and we didn't delete the pod. The only branch that happens on is when + // we cannot ignored the PDB for this pod, so this is the fall through case. } + var rtStatus *metav1.Status var pdbName string err = func() error { @@ -214,7 +204,7 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje // If it was false already, or if it becomes false during the course of our retries, // raise an error marked as a 429. - if err = r.checkAndDecrement(pod.Namespace, pod.Name, *pdb, dryrun.IsDryRun(deletionOptions.DryRun)); err != nil { + if err = r.checkAndDecrement(pod.Namespace, pod.Name, *pdb, dryrun.IsDryRun(originalDeleteOptions.DryRun)); err != nil { refresh = true return err } @@ -236,7 +226,7 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje // At this point there was either no PDB or we succeeded in decrementing // Try the delete - _, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deletionOptions) + _, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, originalDeleteOptions.DeepCopy()) if err != nil { return nil, err } From 047b0cee719465de05fd0563f5236bd94c002788 Mon Sep 17 00:00:00 2001 From: Michael Gugino Date: Thu, 28 May 2020 14:13:00 -0400 Subject: [PATCH 3/3] Quit retrying early with user supplied resourceVersion This commit also updates tests and comments. --- pkg/registry/core/pod/storage/eviction.go | 27 +++++++++++++------ .../core/pod/storage/eviction_test.go | 4 +-- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/pkg/registry/core/pod/storage/eviction.go b/pkg/registry/core/pod/storage/eviction.go index f353e9f8568..7096e18c97d 100644 --- a/pkg/registry/core/pod/storage/eviction.go +++ b/pkg/registry/core/pod/storage/eviction.go @@ -45,13 +45,12 @@ const ( // This situation should self-correct because the PDB controller removes // entries from the map automatically after the PDB DeletionTimeout regardless. MaxDisruptedPodSize = 2000 - retrySteps = 20 ) // EvictionsRetry is the retry for a conflict where multiple clients // are making changes to the same resource. var EvictionsRetry = wait.Backoff{ - Steps: retrySteps, + Steps: 20, Duration: 500 * time.Millisecond, Factor: 1.0, Jitter: 0.1, @@ -124,7 +123,14 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje var pod *api.Pod deletedPod := false - err = retry.RetryOnConflict(EvictionsRetry, func() error { + // by default, retry conflict errors + shouldRetry := errors.IsConflict + if !resourceVersionIsUnset(originalDeleteOptions) { + // if the original options included a resourceVersion precondition, don't retry + shouldRetry = func(err error) bool { return false } + } + + err = retry.OnError(EvictionsRetry, shouldRetry, func() error { obj, err = r.store.Get(ctx, eviction.Name, &metav1.GetOptions{}) if err != nil { return err @@ -140,6 +146,9 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje // the PDB can be ignored, so delete the pod deletionOptions := originalDeleteOptions.DeepCopy() + // We should check if resourceVersion is already set by the requestor + // as it might be older than the pod we just fetched and should be + // honored. if shouldEnforceResourceVersion(pod) && resourceVersionIsUnset(originalDeleteOptions) { // Set deletionOptions.Preconditions.ResourceVersion to ensure we're not // racing with another PDB-impacting process elsewhere. @@ -158,7 +167,7 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje switch { case err != nil: // this can happen in cases where the PDB can be ignored, but there was a problem issuing the pod delete: - // maybe we conflicted two many times or we didn't have permission or something else weird. + // maybe we conflicted too many times or we didn't have permission or something else weird. return nil, err case deletedPod: @@ -245,11 +254,13 @@ func canIgnorePDB(pod *api.Pod) bool { } func shouldEnforceResourceVersion(pod *api.Pod) bool { - // Only pods that may be included as health in PDBs in the future need to be checked. - if pod.Status.Phase == api.PodPending { - return true + // We don't need to enforce ResourceVersion for terminal pods + if pod.Status.Phase == api.PodSucceeded || pod.Status.Phase == api.PodFailed { + return false } - return false + // Return true for all other pods to ensure we don't race against a pod becoming + // ready and violating PDBs. + return true } func resourceVersionIsUnset(options *metav1.DeleteOptions) bool { diff --git a/pkg/registry/core/pod/storage/eviction_test.go b/pkg/registry/core/pod/storage/eviction_test.go index 3c4e4cb4fbe..d21d527179d 100644 --- a/pkg/registry/core/pod/storage/eviction_test.go +++ b/pkg/registry/core/pod/storage/eviction_test.go @@ -272,7 +272,7 @@ func TestEvictionIngorePDB(t *testing.T) { expectError: true, podPhase: api.PodPending, podName: "t4", - expectedDeleteCount: retrySteps, + expectedDeleteCount: EvictionsRetry.Steps, }, { name: "pod pending, always conflict on delete, user provided ResourceVersion constraint", @@ -285,7 +285,7 @@ func TestEvictionIngorePDB(t *testing.T) { expectError: true, podPhase: api.PodPending, podName: "t5", - expectedDeleteCount: retrySteps, + expectedDeleteCount: 1, }, }