Merge pull request #94381 from mgugino-upstream-stage/eviction-disrupted-pods

Allow deletion of unhealthy pods if enough healthy pods remain
Kubernetes Prow Robot authored on 2020-10-21 02:24:20 -07:00; committed by GitHub
commit 7509c4eb47
3 changed files with 165 additions and 18 deletions
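In short: when the pod being evicted is itself unready and its PodDisruptionBudget already reports at least as many healthy pods as it wants, the eviction no longer needs to decrement the budget; the delete is instead guarded by a ResourceVersion precondition so a pod that became ready in the meantime is not removed. Below is a minimal sketch of the new rule, condensed from the EvictionREST.Create hunks that follow (the helper name is hypothetical and not part of the change):

func canBypassPDBForUnreadyPod(podReady bool, currentHealthy, desiredHealthy int32) bool {
	// An unready pod does not count toward CurrentHealthy, so deleting it cannot
	// push the budget below its target as long as the PDB already has at least
	// as many healthy pods as it wants (and wants at least one).
	return !podReady && currentHealthy >= desiredHealthy && desiredHealthy > 0
}

When this condition holds, the handler sets updateDeletionOptions and issues the delete with Preconditions.ResourceVersion pinned to the pod's current resourceVersion; if that precondition fails and the client did not supply its own ResourceVersion, the resulting conflict is translated into a 429 TooManyRequests so the client retries.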


@@ -14,6 +14,7 @@ go_test(
],
embed = [":go_default_library"],
deps = [
"//pkg/api/pod:go_default_library",
"//pkg/apis/core:go_default_library",
"//pkg/apis/policy:go_default_library",
"//pkg/registry/registrytest:go_default_library",


@@ -33,6 +33,7 @@ import (
"k8s.io/apiserver/pkg/util/dryrun"
policyclient "k8s.io/client-go/kubernetes/typed/policy/v1beta1"
"k8s.io/client-go/util/retry"
podutil "k8s.io/kubernetes/pkg/api/pod"
api "k8s.io/kubernetes/pkg/apis/core"
"k8s.io/kubernetes/pkg/apis/policy"
)
@@ -145,19 +146,18 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
}
// the PDB can be ignored, so delete the pod
deletionOptions := originalDeleteOptions.DeepCopy()
deleteOptions := originalDeleteOptions
// We should check if resourceVersion is already set by the requestor
// as it might be older than the pod we just fetched and should be
// honored.
if shouldEnforceResourceVersion(pod) && resourceVersionIsUnset(originalDeleteOptions) {
// Set deletionOptions.Preconditions.ResourceVersion to ensure we're not
// Set deleteOptions.Preconditions.ResourceVersion to ensure we're not
// racing with another PDB-impacting process elsewhere.
if deletionOptions.Preconditions == nil {
deletionOptions.Preconditions = &metav1.Preconditions{}
}
deletionOptions.Preconditions.ResourceVersion = &pod.ResourceVersion
deleteOptions = deleteOptions.DeepCopy()
setPreconditionsResourceVersion(deleteOptions, &pod.ResourceVersion)
}
_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deletionOptions)
_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deleteOptions)
if err != nil {
return err
}
@@ -181,6 +181,8 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
var rtStatus *metav1.Status
var pdbName string
updateDeletionOptions := false
err = func() error {
pdbs, err := r.getPodDisruptionBudgets(ctx, pod)
if err != nil {
@@ -201,6 +203,13 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
pdb := &pdbs[0]
pdbName = pdb.Name
// If the pod is not ready, it doesn't count towards healthy and we should not decrement
if !podutil.IsPodReady(pod) && pdb.Status.CurrentHealthy >= pdb.Status.DesiredHealthy && pdb.Status.DesiredHealthy > 0 {
updateDeletionOptions = true
return nil
}
refresh := false
err = retry.RetryOnConflict(EvictionsRetry, func() error {
if refresh {
@@ -232,11 +241,29 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
return rtStatus, nil
}
// At this point there was either no PDB or we succeeded in decrementing
// At this point there was either no PDB or we succeeded in decrementing or
// the pod was unready and we have enough healthy replicas
deleteOptions := originalDeleteOptions
// Set deleteOptions.Preconditions.ResourceVersion to ensure the pod has not
// changed (e.g. become ready) since we decided it was safe to delete
if updateDeletionOptions {
// Take a copy so we can compare to client-provided Options later.
deleteOptions = deleteOptions.DeepCopy()
setPreconditionsResourceVersion(deleteOptions, &pod.ResourceVersion)
}
// Try the delete
_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, originalDeleteOptions.DeepCopy())
_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deleteOptions)
if err != nil {
if errors.IsConflict(err) && updateDeletionOptions &&
(originalDeleteOptions.Preconditions == nil || originalDeleteOptions.Preconditions.ResourceVersion == nil) {
// If we encounter a resource conflict error, the ResourceVersion precondition
// was added by us (updateDeletionOptions), and the client's original delete
// options did not specify one, send back TooManyRequests so clients will retry.
return nil, createTooManyRequestsError(pdbName)
}
return nil, err
}
@@ -244,6 +271,13 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
return &metav1.Status{Status: metav1.StatusSuccess}, nil
}
func setPreconditionsResourceVersion(deleteOptions *metav1.DeleteOptions, resourceVersion *string) {
if deleteOptions.Preconditions == nil {
deleteOptions.Preconditions = &metav1.Preconditions{}
}
deleteOptions.Preconditions.ResourceVersion = resourceVersion
}
// canIgnorePDB returns true for pod conditions that allow the pod to be deleted
// without checking PDBs.
func canIgnorePDB(pod *api.Pod) bool {
@@ -268,16 +302,21 @@ func resourceVersionIsUnset(options *metav1.DeleteOptions) bool {
return options.Preconditions == nil || options.Preconditions.ResourceVersion == nil
}
func createTooManyRequestsError(name string) error {
// TODO(mml): Add a Retry-After header. Once there are time-based
// budgets, we can sometimes compute a sensible suggested value. But
// even without that, we can give a suggestion (10 minutes?) that
// prevents well-behaved clients from hammering us.
err := errors.NewTooManyRequests("Cannot evict pod as it would violate the pod's disruption budget.", 0)
err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{Type: "DisruptionBudget", Message: fmt.Sprintf("The disruption budget %s is still being processed by the server.", name)})
return err
}
// checkAndDecrement checks if the provided PodDisruptionBudget allows any disruption.
func (r *EvictionREST) checkAndDecrement(namespace string, podName string, pdb policyv1beta1.PodDisruptionBudget, dryRun bool) error {
if pdb.Status.ObservedGeneration < pdb.Generation {
// TODO(mml): Add a Retry-After header. Once there are time-based
// budgets, we can sometimes compute a sensible suggested value. But
// even without that, we can give a suggestion (10 minutes?) that
// prevents well-behaved clients from hammering us.
err := errors.NewTooManyRequests("Cannot evict pod as it would violate the pod's disruption budget.", 0)
err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{Type: "DisruptionBudget", Message: fmt.Sprintf("The disruption budget %s is still being processed by the server.", pdb.Name)})
return err
return createTooManyRequestsError(pdb.Name)
}
if pdb.Status.DisruptionsAllowed < 0 {
return errors.NewForbidden(policy.Resource("poddisruptionbudget"), pdb.Name, fmt.Errorf("pdb disruptions allowed is negative"))
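
The options produced by setPreconditionsResourceVersion look like the following. This is an illustrative, self-contained sketch, not code from the PR: the literal resourceVersion value is made up, and it assumes the k8s.io/apimachinery module is available. The apiserver rejects such a delete with a Conflict if the pod's resourceVersion has changed since it was read, which is exactly the race the eviction handler guards against:

package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// Hypothetical resourceVersion read from the pod we intend to evict.
	rv := "4711"
	opts := metav1.DeleteOptions{
		Preconditions: &metav1.Preconditions{ResourceVersion: &rv},
	}
	// A delete issued with these options only succeeds while the pod still has
	// resourceVersion "4711"; any concurrent update (for example the pod becoming
	// Ready) makes the apiserver return a Conflict instead.
	fmt.Printf("delete precondition: resourceVersion=%s\n", *opts.Preconditions.ResourceVersion)
}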


@@ -31,6 +31,7 @@ import (
genericapirequest "k8s.io/apiserver/pkg/endpoints/request"
"k8s.io/apiserver/pkg/registry/rest"
"k8s.io/client-go/kubernetes/fake"
podapi "k8s.io/kubernetes/pkg/api/pod"
api "k8s.io/kubernetes/pkg/apis/core"
"k8s.io/kubernetes/pkg/apis/policy"
)
@@ -219,6 +220,7 @@ func TestEvictionIngorePDB(t *testing.T) {
podName string
expectedDeleteCount int
podTerminating bool
prc *api.PodCondition
}{
{
name: "pdbs No disruptions allowed, pod pending, first delete conflict, pod still pending, pod deleted successfully",
@@ -301,6 +303,100 @@ func TestEvictionIngorePDB(t *testing.T) {
expectedDeleteCount: 1,
podTerminating: true,
},
{
name: "matching pdbs with no disruptions allowed, pod running, pod healthy, unhealthy pod not ours",
pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
Status: policyv1beta1.PodDisruptionBudgetStatus{
// This simulates 3 pods desired, our pod healthy, unhealthy pod is not ours.
DisruptionsAllowed: 0,
CurrentHealthy: 2,
DesiredHealthy: 2,
},
}},
eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t7", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
expectError: true,
podName: "t7",
expectedDeleteCount: 0,
podTerminating: false,
podPhase: api.PodRunning,
prc: &api.PodCondition{
Type: api.PodReady,
Status: api.ConditionTrue,
},
},
{
name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours",
pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
Status: policyv1beta1.PodDisruptionBudgetStatus{
// This simulates 3 pods desired, our pod unhealthy
DisruptionsAllowed: 0,
CurrentHealthy: 2,
DesiredHealthy: 2,
},
}},
eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t8", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
expectError: false,
podName: "t8",
expectedDeleteCount: 1,
podTerminating: false,
podPhase: api.PodRunning,
prc: &api.PodCondition{
Type: api.PodReady,
Status: api.ConditionFalse,
},
},
{
// This case should return the 429 TooManyRequests retry error.
name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours, resource version conflict",
pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
Status: policyv1beta1.PodDisruptionBudgetStatus{
// This simulates 3 pods desired, our pod unhealthy
DisruptionsAllowed: 0,
CurrentHealthy: 2,
DesiredHealthy: 2,
},
}},
eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t9", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
expectError: true,
podName: "t9",
expectedDeleteCount: 1,
podTerminating: false,
podPhase: api.PodRunning,
prc: &api.PodCondition{
Type: api.PodReady,
Status: api.ConditionFalse,
},
},
{
// This case should return the underlying delete error, not the 429 retry error.
name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours, other error on delete",
pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
Status: policyv1beta1.PodDisruptionBudgetStatus{
// This simulates 3 pods desired, our pod unhealthy
DisruptionsAllowed: 0,
CurrentHealthy: 2,
DesiredHealthy: 2,
},
}},
eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t10", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
expectError: true,
podName: "t10",
expectedDeleteCount: 1,
podTerminating: false,
podPhase: api.PodRunning,
prc: &api.PodCondition{
Type: api.PodReady,
Status: api.ConditionFalse,
},
},
}
for _, tc := range testcases {
@@ -323,6 +419,13 @@ func TestEvictionIngorePDB(t *testing.T) {
pod.ObjectMeta.DeletionTimestamp = &currentTime
}
// Setup pod condition
if tc.prc != nil {
if !podapi.UpdatePodCondition(&pod.Status, tc.prc) {
t.Fatalf("Unable to update pod ready condition")
}
}
client := fake.NewSimpleClientset(tc.pdbs...)
evictionRest := newEvictionStorage(ms, client.PolicyV1beta1())
@@ -416,10 +519,14 @@ func (ms *mockStore) mutatorDeleteFunc(count int, options *metav1.DeleteOptions)
// Always return error for this pod
return nil, false, apierrors.NewConflict(resource("tests"), "2", errors.New("message"))
}
if ms.pod.Name == "t6" {
// This pod has a deletionTimestamp and should not raise conflict on delete
if ms.pod.Name == "t6" || ms.pod.Name == "t8" {
// t6: This pod has a deletionTimestamp and should not raise conflict on delete
// t8: This pod should not have a resource conflict.
return nil, true, nil
}
if ms.pod.Name == "t10" {
return nil, false, apierrors.NewBadRequest("test designed to error")
}
if count == 1 {
// This is a hack to ensure that some test pods don't change phase
// but do change resource version