Allow deletion of unhealthy pods if enough healthy

Currently, if you have a PDB with 0 disruptions
available and you attempt to evict an unhealthy
pod, the eviction request will always fail.  This
is because the eviction API does not take into
account that the pod being removed is itself the
unhealthy one.

This commit allows evicting an unhealthy pod as
long as there are enough healthy pods to satisfy
the PDB's requirements.  To protect against race
conditions, a ResourceVersion precondition is
enforced on the delete: if the target pod becomes
healthy in the meantime, the delete fails rather
than disrupting too many pods at once.
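
A minimal sketch of that precondition, for illustration only (the
helper name below is hypothetical; in the commit itself the registry
calls setPreconditionsResourceVersion on the delete options):

package example

import (
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// deleteOptionsForUnreadyPod pins the delete to the ResourceVersion
// observed while the pod was unready.  If the pod becomes Ready (and
// its ResourceVersion changes) before the delete executes, the
// precondition fails with a conflict instead of disrupting more pods
// than the PDB allows.
func deleteOptionsForUnreadyPod(observedResourceVersion string) *metav1.DeleteOptions {
    opts := metav1.NewDeleteOptions(0) // grace period 0, as in the tests below
    opts.Preconditions = &metav1.Preconditions{
        ResourceVersion: &observedResourceVersion,
    }
    return opts
}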

This commit also eliminates superfluous calls to
DeepCopy for the deleteOptions struct.
Michael Gugino 2020-08-31 16:22:56 -04:00
parent b8ecff28d3
commit 717be0cd44
3 changed files with 165 additions and 18 deletions

View File

@@ -14,6 +14,7 @@ go_test(
],
embed = [":go_default_library"],
deps = [
"//pkg/api/pod:go_default_library",
"//pkg/apis/core:go_default_library",
"//pkg/apis/policy:go_default_library",
"//pkg/registry/registrytest:go_default_library",

View File

@@ -33,6 +33,7 @@ import (
"k8s.io/apiserver/pkg/util/dryrun"
policyclient "k8s.io/client-go/kubernetes/typed/policy/v1beta1"
"k8s.io/client-go/util/retry"
podutil "k8s.io/kubernetes/pkg/api/pod"
api "k8s.io/kubernetes/pkg/apis/core"
"k8s.io/kubernetes/pkg/apis/policy"
)
@@ -145,19 +146,18 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
}
// the PDB can be ignored, so delete the pod
deletionOptions := originalDeleteOptions.DeepCopy()
deleteOptions := originalDeleteOptions
// We should check if resourceVersion is already set by the requestor
// as it might be older than the pod we just fetched and should be
// honored.
if shouldEnforceResourceVersion(pod) && resourceVersionIsUnset(originalDeleteOptions) {
// Set deletionOptions.Preconditions.ResourceVersion to ensure we're not
// Set deleteOptions.Preconditions.ResourceVersion to ensure we're not
// racing with another PDB-impacting process elsewhere.
if deletionOptions.Preconditions == nil {
deletionOptions.Preconditions = &metav1.Preconditions{}
}
deletionOptions.Preconditions.ResourceVersion = &pod.ResourceVersion
deleteOptions = deleteOptions.DeepCopy()
setPreconditionsResourceVersion(deleteOptions, &pod.ResourceVersion)
}
_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deletionOptions)
_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deleteOptions)
if err != nil {
return err
}
@@ -181,6 +181,8 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
var rtStatus *metav1.Status
var pdbName string
updateDeletionOptions := false
err = func() error {
pdbs, err := r.getPodDisruptionBudgets(ctx, pod)
if err != nil {
@@ -201,6 +203,13 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
pdb := &pdbs[0]
pdbName = pdb.Name
// If the pod is not ready, it doesn't count towards healthy and we should not decrement
if !podutil.IsPodReady(pod) && pdb.Status.CurrentHealthy >= pdb.Status.DesiredHealthy && pdb.Status.DesiredHealthy > 0 {
updateDeletionOptions = true
return nil
}
refresh := false
err = retry.RetryOnConflict(EvictionsRetry, func() error {
if refresh {
@@ -232,11 +241,29 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
return rtStatus, nil
}
// At this point there was either no PDB or we succeeded in decrementing
// At this point there was either no PDB or we succeeded in decrementing or
// the pod was unready and we have enough healthy replicas
deleteOptions := originalDeleteOptions
// Set deleteOptions.Preconditions.ResourceVersion to ensure the pod
// hasn't become ready since we calculated the PDB's healthy counts above
if updateDeletionOptions {
// Take a copy so we can compare to client-provided Options later.
deleteOptions = deleteOptions.DeepCopy()
setPreconditionsResourceVersion(deleteOptions, &pod.ResourceVersion)
}
// Try the delete
_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, originalDeleteOptions.DeepCopy())
_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deleteOptions)
if err != nil {
if errors.IsConflict(err) && updateDeletionOptions &&
(originalDeleteOptions.Preconditions == nil || originalDeleteOptions.Preconditions.ResourceVersion == nil) {
// If we encounter a resource conflict error, we updated the deletion options to include a
// ResourceVersion precondition, and the original deletion options did not specify
// ResourceVersion, we send back TooManyRequests so clients will retry.
return nil, createTooManyRequestsError(pdbName)
}
return nil, err
}
@@ -244,6 +271,13 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
return &metav1.Status{Status: metav1.StatusSuccess}, nil
}
func setPreconditionsResourceVersion(deleteOptions *metav1.DeleteOptions, resourceVersion *string) {
if deleteOptions.Preconditions == nil {
deleteOptions.Preconditions = &metav1.Preconditions{}
}
deleteOptions.Preconditions.ResourceVersion = resourceVersion
}
// canIgnorePDB returns true for pod conditions that allow the pod to be deleted
// without checking PDBs.
func canIgnorePDB(pod *api.Pod) bool {
@@ -268,16 +302,21 @@ func resourceVersionIsUnset(options *metav1.DeleteOptions) bool {
return options.Preconditions == nil || options.Preconditions.ResourceVersion == nil
}
func createTooManyRequestsError(name string) error {
// TODO(mml): Add a Retry-After header. Once there are time-based
// budgets, we can sometimes compute a sensible suggested value. But
// even without that, we can give a suggestion (10 minutes?) that
// prevents well-behaved clients from hammering us.
err := errors.NewTooManyRequests("Cannot evict pod as it would violate the pod's disruption budget.", 0)
err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{Type: "DisruptionBudget", Message: fmt.Sprintf("The disruption budget %s is still being processed by the server.", name)})
return err
}
// checkAndDecrement checks if the provided PodDisruptionBudget allows any disruption.
func (r *EvictionREST) checkAndDecrement(namespace string, podName string, pdb policyv1beta1.PodDisruptionBudget, dryRun bool) error {
if pdb.Status.ObservedGeneration < pdb.Generation {
// TODO(mml): Add a Retry-After header. Once there are time-based
// budgets, we can sometimes compute a sensible suggested value. But
// even without that, we can give a suggestion (10 minutes?) that
// prevents well-behaved clients from hammering us.
err := errors.NewTooManyRequests("Cannot evict pod as it would violate the pod's disruption budget.", 0)
err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{Type: "DisruptionBudget", Message: fmt.Sprintf("The disruption budget %s is still being processed by the server.", pdb.Name)})
return err
return createTooManyRequestsError(pdb.Name)
}
if pdb.Status.DisruptionsAllowed < 0 {
return errors.NewForbidden(policy.Resource("poddisruptionbudget"), pdb.Name, fmt.Errorf("pdb disruptions allowed is negative"))
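
For callers, the net effect of the conflict handling above is that a
failed ResourceVersion precondition comes back as 429 TooManyRequests.
A minimal sketch of how a client might react, assuming a standard
client-go clientset (this wiring is not part of the commit):

package example

import (
    "context"

    policyv1beta1 "k8s.io/api/policy/v1beta1"
    apierrors "k8s.io/apimachinery/pkg/api/errors"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
)

// evictWithRetryHint issues an eviction and reports whether the caller
// should back off and retry: a 429 means the PDB is still being
// processed or the eviction lost the race on the injected
// ResourceVersion precondition.
func evictWithRetryHint(ctx context.Context, cs kubernetes.Interface, namespace, name string) (bool, error) {
    eviction := &policyv1beta1.Eviction{
        ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace},
    }
    err := cs.CoreV1().Pods(namespace).Evict(ctx, eviction)
    if apierrors.IsTooManyRequests(err) {
        return true, nil // retry later; the budget may permit the eviction then
    }
    return false, err
}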

View File

@@ -31,6 +31,7 @@ import (
genericapirequest "k8s.io/apiserver/pkg/endpoints/request"
"k8s.io/apiserver/pkg/registry/rest"
"k8s.io/client-go/kubernetes/fake"
podapi "k8s.io/kubernetes/pkg/api/pod"
api "k8s.io/kubernetes/pkg/apis/core"
"k8s.io/kubernetes/pkg/apis/policy"
)
@@ -219,6 +220,7 @@ func TestEvictionIngorePDB(t *testing.T) {
podName string
expectedDeleteCount int
podTerminating bool
prc *api.PodCondition
}{
{
name: "pdbs No disruptions allowed, pod pending, first delete conflict, pod still pending, pod deleted successfully",
@@ -301,6 +303,100 @@ func TestEvictionIngorePDB(t *testing.T) {
expectedDeleteCount: 1,
podTerminating: true,
},
{
name: "matching pdbs with no disruptions allowed, pod running, pod healthy, unhealthy pod not ours",
pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
Status: policyv1beta1.PodDisruptionBudgetStatus{
// This simulates 3 pods desired, our pod healthy, unhealthy pod is not ours.
DisruptionsAllowed: 0,
CurrentHealthy: 2,
DesiredHealthy: 2,
},
}},
eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t7", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
expectError: true,
podName: "t7",
expectedDeleteCount: 0,
podTerminating: false,
podPhase: api.PodRunning,
prc: &api.PodCondition{
Type: api.PodReady,
Status: api.ConditionTrue,
},
},
{
name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours",
pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
Status: policyv1beta1.PodDisruptionBudgetStatus{
// This simulates 3 pods desired, our pod unhealthy
DisruptionsAllowed: 0,
CurrentHealthy: 2,
DesiredHealthy: 2,
},
}},
eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t8", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
expectError: false,
podName: "t8",
expectedDeleteCount: 1,
podTerminating: false,
podPhase: api.PodRunning,
prc: &api.PodCondition{
Type: api.PodReady,
Status: api.ConditionFalse,
},
},
{
// This case should return the 429 retry error (TooManyRequests).
name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours, resource version conflict",
pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
Status: policyv1beta1.PodDisruptionBudgetStatus{
// This simulates 3 pods desired, our pod unhealthy
DisruptionsAllowed: 0,
CurrentHealthy: 2,
DesiredHealthy: 2,
},
}},
eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t9", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
expectError: true,
podName: "t9",
expectedDeleteCount: 1,
podTerminating: false,
podPhase: api.PodRunning,
prc: &api.PodCondition{
Type: api.PodReady,
Status: api.ConditionFalse,
},
},
{
// This case should surface the delete error itself, not the 429 retry error.
name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours, other error on delete",
pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
Status: policyv1beta1.PodDisruptionBudgetStatus{
// This simulates 3 pods desired, our pod unhealthy
DisruptionsAllowed: 0,
CurrentHealthy: 2,
DesiredHealthy: 2,
},
}},
eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t10", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
expectError: true,
podName: "t10",
expectedDeleteCount: 1,
podTerminating: false,
podPhase: api.PodRunning,
prc: &api.PodCondition{
Type: api.PodReady,
Status: api.ConditionFalse,
},
},
}
for _, tc := range testcases {
@@ -323,6 +419,13 @@ func TestEvictionIngorePDB(t *testing.T) {
pod.ObjectMeta.DeletionTimestamp = &currentTime
}
// Setup pod condition
if tc.prc != nil {
if !podapi.UpdatePodCondition(&pod.Status, tc.prc) {
t.Fatalf("Unable to update pod ready condition")
}
}
client := fake.NewSimpleClientset(tc.pdbs...)
evictionRest := newEvictionStorage(ms, client.PolicyV1beta1())
@@ -416,10 +519,14 @@ func (ms *mockStore) mutatorDeleteFunc(count int, options *metav1.DeleteOptions)
// Always return error for this pod
return nil, false, apierrors.NewConflict(resource("tests"), "2", errors.New("message"))
}
if ms.pod.Name == "t6" {
// This pod has a deletionTimestamp and should not raise conflict on delete
if ms.pod.Name == "t6" || ms.pod.Name == "t8" {
// t6: This pod has a deletionTimestamp and should not raise conflict on delete
// t8: This pod should not have a resource conflict.
return nil, true, nil
}
if ms.pod.Name == "t10" {
return nil, false, apierrors.NewBadRequest("test designed to error")
}
if count == 1 {
// This is a hack to ensure that some test pods don't change phase
// but do change resource version