diff --git a/pkg/registry/core/pod/storage/BUILD b/pkg/registry/core/pod/storage/BUILD
index fb3586bba99..d209cfb07ab 100644
--- a/pkg/registry/core/pod/storage/BUILD
+++ b/pkg/registry/core/pod/storage/BUILD
@@ -14,6 +14,7 @@ go_test(
     ],
     embed = [":go_default_library"],
     deps = [
+        "//pkg/api/pod:go_default_library",
         "//pkg/apis/core:go_default_library",
         "//pkg/apis/policy:go_default_library",
         "//pkg/registry/registrytest:go_default_library",
diff --git a/pkg/registry/core/pod/storage/eviction.go b/pkg/registry/core/pod/storage/eviction.go
index 7ed0875af02..28bf90e380a 100644
--- a/pkg/registry/core/pod/storage/eviction.go
+++ b/pkg/registry/core/pod/storage/eviction.go
@@ -33,6 +33,7 @@ import (
 	"k8s.io/apiserver/pkg/util/dryrun"
 	policyclient "k8s.io/client-go/kubernetes/typed/policy/v1beta1"
 	"k8s.io/client-go/util/retry"
+	podutil "k8s.io/kubernetes/pkg/api/pod"
 	api "k8s.io/kubernetes/pkg/apis/core"
 	"k8s.io/kubernetes/pkg/apis/policy"
 )
@@ -145,19 +146,18 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
 		}
 
 		// the PDB can be ignored, so delete the pod
-		deletionOptions := originalDeleteOptions.DeepCopy()
+		deleteOptions := originalDeleteOptions
+
 		// We should check if resourceVersion is already set by the requestor
 		// as it might be older than the pod we just fetched and should be
 		// honored.
 		if shouldEnforceResourceVersion(pod) && resourceVersionIsUnset(originalDeleteOptions) {
-			// Set deletionOptions.Preconditions.ResourceVersion to ensure we're not
+			// Set deleteOptions.Preconditions.ResourceVersion to ensure we're not
 			// racing with another PDB-impacting process elsewhere.
-			if deletionOptions.Preconditions == nil {
-				deletionOptions.Preconditions = &metav1.Preconditions{}
-			}
-			deletionOptions.Preconditions.ResourceVersion = &pod.ResourceVersion
+			deleteOptions = deleteOptions.DeepCopy()
+			setPreconditionsResourceVersion(deleteOptions, &pod.ResourceVersion)
 		}
-		_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deletionOptions)
+		_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deleteOptions)
 		if err != nil {
 			return err
 		}
@@ -181,6 +181,8 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
 	var rtStatus *metav1.Status
 	var pdbName string
+
+	updateDeletionOptions := false
+
 	err = func() error {
 		pdbs, err := r.getPodDisruptionBudgets(ctx, pod)
 		if err != nil {
@@ -201,6 +203,13 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
 		pdb := &pdbs[0]
 		pdbName = pdb.Name
+
+		// If the pod is not ready, it doesn't count towards healthy and we should not decrement
+		if !podutil.IsPodReady(pod) && pdb.Status.CurrentHealthy >= pdb.Status.DesiredHealthy && pdb.Status.DesiredHealthy > 0 {
+			updateDeletionOptions = true
+			return nil
+		}
+
 		refresh := false
 		err = retry.RetryOnConflict(EvictionsRetry, func() error {
 			if refresh {
@@ -232,11 +241,29 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
 		return rtStatus, nil
 	}
 
-	// At this point there was either no PDB or we succeeded in decrementing
+	// At this point there was either no PDB or we succeeded in decrementing or
+	// the pod was unready and we have enough healthy replicas
+
+	deleteOptions := originalDeleteOptions
+
+	// Set deleteOptions.Preconditions.ResourceVersion to ensure
+	// the pod hasn't been considered ready since we calculated the PDB status above.
+	if updateDeletionOptions {
+		// Take a copy so we can compare to client-provided Options later.
+		deleteOptions = deleteOptions.DeepCopy()
+		setPreconditionsResourceVersion(deleteOptions, &pod.ResourceVersion)
+	}
 
 	// Try the delete
-	_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, originalDeleteOptions.DeepCopy())
+	_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deleteOptions)
 	if err != nil {
+		if errors.IsConflict(err) && updateDeletionOptions &&
+			(originalDeleteOptions.Preconditions == nil || originalDeleteOptions.Preconditions.ResourceVersion == nil) {
+			// If we encounter a resource conflict error, we updated the deletion options to include a
+			// resourceVersion precondition and the original deletion options did not specify one, so we
+			// send back TooManyRequests so clients will retry.
+			return nil, createTooManyRequestsError(pdbName)
+		}
 		return nil, err
 	}
@@ -244,6 +271,13 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
 	return &metav1.Status{Status: metav1.StatusSuccess}, nil
 }
 
+func setPreconditionsResourceVersion(deleteOptions *metav1.DeleteOptions, resourceVersion *string) {
+	if deleteOptions.Preconditions == nil {
+		deleteOptions.Preconditions = &metav1.Preconditions{}
+	}
+	deleteOptions.Preconditions.ResourceVersion = resourceVersion
+}
+
 // canIgnorePDB returns true for pod conditions that allow the pod to be deleted
 // without checking PDBs.
 func canIgnorePDB(pod *api.Pod) bool {
@@ -268,16 +302,21 @@ func resourceVersionIsUnset(options *metav1.DeleteOptions) bool {
 	return options.Preconditions == nil || options.Preconditions.ResourceVersion == nil
 }
 
+func createTooManyRequestsError(name string) error {
+	// TODO(mml): Add a Retry-After header. Once there are time-based
+	// budgets, we can sometimes compute a sensible suggested value. But
+	// even without that, we can give a suggestion (10 minutes?) that
+	// prevents well-behaved clients from hammering us.
+	err := errors.NewTooManyRequests("Cannot evict pod as it would violate the pod's disruption budget.", 0)
+	err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{Type: "DisruptionBudget", Message: fmt.Sprintf("The disruption budget %s is still being processed by the server.", name)})
+	return err
+}
+
 // checkAndDecrement checks if the provided PodDisruptionBudget allows any disruption.
 func (r *EvictionREST) checkAndDecrement(namespace string, podName string, pdb policyv1beta1.PodDisruptionBudget, dryRun bool) error {
 	if pdb.Status.ObservedGeneration < pdb.Generation {
-		// TODO(mml): Add a Retry-After header. Once there are time-based
-		// budgets, we can sometimes compute a sensible suggested value. But
-		// even without that, we can give a suggestion (10 minutes?) that
-		// prevents well-behaved clients from hammering us.
-		err := errors.NewTooManyRequests("Cannot evict pod as it would violate the pod's disruption budget.", 0)
-		err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{Type: "DisruptionBudget", Message: fmt.Sprintf("The disruption budget %s is still being processed by the server.", pdb.Name)})
-		return err
+
+		return createTooManyRequestsError(pdb.Name)
 	}
 	if pdb.Status.DisruptionsAllowed < 0 {
 		return errors.NewForbidden(policy.Resource("poddisruptionbudget"), pdb.Name, fmt.Errorf("pdb disruptions allowed is negative"))
diff --git a/pkg/registry/core/pod/storage/eviction_test.go b/pkg/registry/core/pod/storage/eviction_test.go
index 5ea203b3bc3..8107506abd1 100644
--- a/pkg/registry/core/pod/storage/eviction_test.go
+++ b/pkg/registry/core/pod/storage/eviction_test.go
@@ -31,6 +31,7 @@ import (
 	genericapirequest "k8s.io/apiserver/pkg/endpoints/request"
 	"k8s.io/apiserver/pkg/registry/rest"
 	"k8s.io/client-go/kubernetes/fake"
+	podapi "k8s.io/kubernetes/pkg/api/pod"
 	api "k8s.io/kubernetes/pkg/apis/core"
 	"k8s.io/kubernetes/pkg/apis/policy"
 )
@@ -219,6 +220,7 @@ func TestEvictionIngorePDB(t *testing.T) {
 		podName             string
 		expectedDeleteCount int
 		podTerminating      bool
+		prc                 *api.PodCondition
 	}{
 		{
 			name: "pdbs No disruptions allowed, pod pending, first delete conflict, pod still pending, pod deleted successfully",
@@ -301,6 +303,100 @@ func TestEvictionIngorePDB(t *testing.T) {
 			expectedDeleteCount: 1,
 			podTerminating:      true,
 		},
+		{
+			name: "matching pdbs with no disruptions allowed, pod running, pod healthy, unhealthy pod not ours",
+			pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+				ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+				Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+				Status: policyv1beta1.PodDisruptionBudgetStatus{
+					// This simulates 3 pods desired, our pod healthy, unhealthy pod is not ours.
+					DisruptionsAllowed: 0,
+					CurrentHealthy:     2,
+					DesiredHealthy:     2,
+				},
+			}},
+			eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t7", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+			expectError:         true,
+			podName:             "t7",
+			expectedDeleteCount: 0,
+			podTerminating:      false,
+			podPhase:            api.PodRunning,
+			prc: &api.PodCondition{
+				Type:   api.PodReady,
+				Status: api.ConditionTrue,
+			},
+		},
+		{
+			name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours",
+			pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+				ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+				Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+				Status: policyv1beta1.PodDisruptionBudgetStatus{
+					// This simulates 3 pods desired, our pod unhealthy
+					DisruptionsAllowed: 0,
+					CurrentHealthy:     2,
+					DesiredHealthy:     2,
+				},
+			}},
+			eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t8", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+			expectError:         false,
+			podName:             "t8",
+			expectedDeleteCount: 1,
+			podTerminating:      false,
+			podPhase:            api.PodRunning,
+			prc: &api.PodCondition{
+				Type:   api.PodReady,
+				Status: api.ConditionFalse,
+			},
+		},
+		{
+			// This case should return the 429 retry error.
+ name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours, resource version conflict", + pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"}, + Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}}, + Status: policyv1beta1.PodDisruptionBudgetStatus{ + // This simulates 3 pods desired, our pod unhealthy + DisruptionsAllowed: 0, + CurrentHealthy: 2, + DesiredHealthy: 2, + }, + }}, + eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t9", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)}, + expectError: true, + podName: "t9", + expectedDeleteCount: 1, + podTerminating: false, + podPhase: api.PodRunning, + prc: &api.PodCondition{ + Type: api.PodReady, + Status: api.ConditionFalse, + }, + }, + { + // This case should return the 529 retry error. + name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours, other error on delete", + pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{ + ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"}, + Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}}, + Status: policyv1beta1.PodDisruptionBudgetStatus{ + // This simulates 3 pods desired, our pod unhealthy + DisruptionsAllowed: 0, + CurrentHealthy: 2, + DesiredHealthy: 2, + }, + }}, + eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t10", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)}, + expectError: true, + podName: "t10", + expectedDeleteCount: 1, + podTerminating: false, + podPhase: api.PodRunning, + prc: &api.PodCondition{ + Type: api.PodReady, + Status: api.ConditionFalse, + }, + }, } for _, tc := range testcases { @@ -323,6 +419,13 @@ func TestEvictionIngorePDB(t *testing.T) { pod.ObjectMeta.DeletionTimestamp = ¤tTime } + // Setup pod condition + if tc.prc != nil { + if !podapi.UpdatePodCondition(&pod.Status, tc.prc) { + t.Fatalf("Unable to update pod ready condition") + } + } + client := fake.NewSimpleClientset(tc.pdbs...) evictionRest := newEvictionStorage(ms, client.PolicyV1beta1()) @@ -416,10 +519,14 @@ func (ms *mockStore) mutatorDeleteFunc(count int, options *metav1.DeleteOptions) // Always return error for this pod return nil, false, apierrors.NewConflict(resource("tests"), "2", errors.New("message")) } - if ms.pod.Name == "t6" { - // This pod has a deletionTimestamp and should not raise conflict on delete + if ms.pod.Name == "t6" || ms.pod.Name == "t8" { + // t6: This pod has a deletionTimestamp and should not raise conflict on delete + // t8: This pod should not have a resource conflict. return nil, true, nil } + if ms.pod.Name == "t10" { + return nil, false, apierrors.NewBadRequest("test designed to error") + } if count == 1 { // This is a hack to ensure that some test pods don't change phase // but do change resource version