Mirror of https://github.com/k3s-io/kubernetes.git
Allow deletion of unhealthy pods if enough healthy
Currently, if you have a PDB with 0 disruptions allowed and you attempt to evict an unhealthy pod, the eviction request will always fail. This is because the eviction API does not take into account that the pod being removed is itself the unhealthy one. This commit allows evicting an unhealthy pod as long as there are enough healthy pods to satisfy the PDB's requirements. To protect against races, a ResourceVersion precondition is enforced on the delete: if the target pod becomes healthy between the PDB check and the delete, the delete fails rather than disrupting too many pods at once. This commit also eliminates superfluous calls to DeepCopy for the deleteOptions struct.
parent b8ecff28d3
commit 717be0cd44
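For context, a minimal client-go sketch of the behavior this commit changes (the helper name, namespace, and pod name are illustrative, not part of this commit; it uses the context-free Evict signature of client-go from this era):

package example

import (
	policyv1beta1 "k8s.io/api/policy/v1beta1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// evictPod posts to the pods/eviction subresource handled by EvictionREST.
// Before this commit, a PDB with DisruptionsAllowed: 0 made this fail even
// when the target pod was itself unhealthy; with this change it can succeed
// as long as CurrentHealthy >= DesiredHealthy.
func evictPod(client kubernetes.Interface, namespace, name string) error {
	return client.PolicyV1beta1().Evictions(namespace).Evict(&policyv1beta1.Eviction{
		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace},
	})
}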
pkg/registry/core/pod/rest/BUILD

@@ -14,6 +14,7 @@ go_test(
     ],
     embed = [":go_default_library"],
     deps = [
+        "//pkg/api/pod:go_default_library",
         "//pkg/apis/core:go_default_library",
         "//pkg/apis/policy:go_default_library",
         "//pkg/registry/registrytest:go_default_library",
pkg/registry/core/pod/rest/eviction.go

@@ -33,6 +33,7 @@ import (
 	"k8s.io/apiserver/pkg/util/dryrun"
 	policyclient "k8s.io/client-go/kubernetes/typed/policy/v1beta1"
 	"k8s.io/client-go/util/retry"
+	podutil "k8s.io/kubernetes/pkg/api/pod"
 	api "k8s.io/kubernetes/pkg/apis/core"
 	"k8s.io/kubernetes/pkg/apis/policy"
 )
@@ -145,19 +146,18 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
 	}
 
 	// the PDB can be ignored, so delete the pod
-	deletionOptions := originalDeleteOptions.DeepCopy()
+	deleteOptions := originalDeleteOptions
 
 	// We should check if resourceVersion is already set by the requestor
 	// as it might be older than the pod we just fetched and should be
 	// honored.
 	if shouldEnforceResourceVersion(pod) && resourceVersionIsUnset(originalDeleteOptions) {
-		// Set deletionOptions.Preconditions.ResourceVersion to ensure we're not
+		// Set deleteOptions.Preconditions.ResourceVersion to ensure we're not
 		// racing with another PDB-impacting process elsewhere.
-		if deletionOptions.Preconditions == nil {
-			deletionOptions.Preconditions = &metav1.Preconditions{}
-		}
-		deletionOptions.Preconditions.ResourceVersion = &pod.ResourceVersion
+		deleteOptions = deleteOptions.DeepCopy()
+		setPreconditionsResourceVersion(deleteOptions, &pod.ResourceVersion)
 	}
-	_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deletionOptions)
+	_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deleteOptions)
 	if err != nil {
 		return err
 	}
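For illustration, a minimal sketch (not part of this commit; the helper name and parameters are assumptions, and it uses the context-free Delete signature of client-go from this era) of what the ResourceVersion precondition above buys us:

package example

import (
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// deleteIfUnchanged deletes a pod only if it still has the ResourceVersion we
// observed. If the pod changed after we read it (for example, it became
// ready), the stale ResourceVersion makes the delete fail with a Conflict
// instead of silently disrupting one pod too many.
func deleteIfUnchanged(client kubernetes.Interface, namespace, name, observedRV string) (bool, error) {
	err := client.CoreV1().Pods(namespace).Delete(name, &metav1.DeleteOptions{
		Preconditions: &metav1.Preconditions{ResourceVersion: &observedRV},
	})
	if apierrors.IsConflict(err) {
		// The pod was modified since we inspected it; the caller should
		// re-evaluate the PDB before retrying.
		return false, nil
	}
	return err == nil, err
}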
@@ -181,6 +181,8 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
 
 	var rtStatus *metav1.Status
 	var pdbName string
+	updateDeletionOptions := false
+
 	err = func() error {
 		pdbs, err := r.getPodDisruptionBudgets(ctx, pod)
 		if err != nil {
@@ -201,6 +203,13 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
 
 		pdb := &pdbs[0]
 		pdbName = pdb.Name
+
+		// If the pod is not ready, it doesn't count towards healthy and we should not decrement
+		if !podutil.IsPodReady(pod) && pdb.Status.CurrentHealthy >= pdb.Status.DesiredHealthy && pdb.Status.DesiredHealthy > 0 {
+			updateDeletionOptions = true
+			return nil
+		}
+
 		refresh := false
 		err = retry.RetryOnConflict(EvictionsRetry, func() error {
 			if refresh {
@@ -232,11 +241,29 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
 		return rtStatus, nil
 	}
 
-	// At this point there was either no PDB or we succeeded in decrementing
+	// At this point there was either no PDB or we succeeded in decrementing or
+	// the pod was unready and we have enough healthy replicas
+
+	deleteOptions := originalDeleteOptions
+
+	// Set deleteOptions.Preconditions.ResourceVersion to ensure
+	// the pod hasn't been considered ready since we calculated
+	if updateDeletionOptions {
+		// Take a copy so we can compare to client-provided Options later.
+		deleteOptions = deleteOptions.DeepCopy()
+		setPreconditionsResourceVersion(deleteOptions, &pod.ResourceVersion)
+	}
 
 	// Try the delete
-	_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, originalDeleteOptions.DeepCopy())
+	_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deleteOptions)
 	if err != nil {
+		if errors.IsConflict(err) && updateDeletionOptions &&
+			(originalDeleteOptions.Preconditions == nil || originalDeleteOptions.Preconditions.ResourceVersion == nil) {
+			// If we hit a resource conflict error after adding the
+			// ResourceVersion precondition ourselves, and the original
+			// deletion options did not specify one, we send back
+			// TooManyRequests so clients will retry.
+			return nil, createTooManyRequestsError(pdbName)
+		}
 		return nil, err
 	}
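A hypothetical caller-side loop for the 429 path above (the helper name, interval, and timeout are illustrative; it assumes the era-appropriate, context-free client-go signatures):

package example

import (
	"time"

	policyv1beta1 "k8s.io/api/policy/v1beta1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
)

// evictWithRetry keeps retrying an eviction while the server answers
// TooManyRequests, which is what EvictionREST now returns when its own
// ResourceVersion precondition races with a pod update.
func evictWithRetry(client kubernetes.Interface, namespace, name string) error {
	return wait.PollImmediate(5*time.Second, 2*time.Minute, func() (bool, error) {
		err := client.PolicyV1beta1().Evictions(namespace).Evict(&policyv1beta1.Eviction{
			ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace},
		})
		switch {
		case err == nil:
			return true, nil
		case apierrors.IsTooManyRequests(err):
			// The budget is still being processed by the server; retry.
			return false, nil
		default:
			return false, err
		}
	})
}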
@@ -244,6 +271,13 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
 	return &metav1.Status{Status: metav1.StatusSuccess}, nil
 }
 
+func setPreconditionsResourceVersion(deleteOptions *metav1.DeleteOptions, resourceVersion *string) {
+	if deleteOptions.Preconditions == nil {
+		deleteOptions.Preconditions = &metav1.Preconditions{}
+	}
+	deleteOptions.Preconditions.ResourceVersion = resourceVersion
+}
+
 // canIgnorePDB returns true for pod conditions that allow the pod to be deleted
 // without checking PDBs.
 func canIgnorePDB(pod *api.Pod) bool {
@@ -268,16 +302,21 @@ func resourceVersionIsUnset(options *metav1.DeleteOptions) bool {
 	return options.Preconditions == nil || options.Preconditions.ResourceVersion == nil
 }
 
+func createTooManyRequestsError(name string) error {
+	// TODO(mml): Add a Retry-After header. Once there are time-based
+	// budgets, we can sometimes compute a sensible suggested value. But
+	// even without that, we can give a suggestion (10 minutes?) that
+	// prevents well-behaved clients from hammering us.
+	err := errors.NewTooManyRequests("Cannot evict pod as it would violate the pod's disruption budget.", 0)
+	err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{Type: "DisruptionBudget", Message: fmt.Sprintf("The disruption budget %s is still being processed by the server.", name)})
+	return err
+}
+
 // checkAndDecrement checks if the provided PodDisruptionBudget allows any disruption.
 func (r *EvictionREST) checkAndDecrement(namespace string, podName string, pdb policyv1beta1.PodDisruptionBudget, dryRun bool) error {
 	if pdb.Status.ObservedGeneration < pdb.Generation {
-		// TODO(mml): Add a Retry-After header. Once there are time-based
-		// budgets, we can sometimes compute a sensible suggested value. But
-		// even without that, we can give a suggestion (10 minutes?) that
-		// prevents well-behaved clients from hammering us.
-		err := errors.NewTooManyRequests("Cannot evict pod as it would violate the pod's disruption budget.", 0)
-		err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{Type: "DisruptionBudget", Message: fmt.Sprintf("The disruption budget %s is still being processed by the server.", pdb.Name)})
-		return err
+		return createTooManyRequestsError(pdb.Name)
 	}
 	if pdb.Status.DisruptionsAllowed < 0 {
 		return errors.NewForbidden(policy.Resource("poddisruptionbudget"), pdb.Name, fmt.Errorf("pdb disruptions allowed is negative"))
pkg/registry/core/pod/rest/eviction_test.go

@@ -31,6 +31,7 @@ import (
 	genericapirequest "k8s.io/apiserver/pkg/endpoints/request"
 	"k8s.io/apiserver/pkg/registry/rest"
 	"k8s.io/client-go/kubernetes/fake"
+	podapi "k8s.io/kubernetes/pkg/api/pod"
 	api "k8s.io/kubernetes/pkg/apis/core"
 	"k8s.io/kubernetes/pkg/apis/policy"
 )
@@ -219,6 +220,7 @@ func TestEvictionIngorePDB(t *testing.T) {
 		podName             string
 		expectedDeleteCount int
 		podTerminating      bool
+		prc                 *api.PodCondition
 	}{
 		{
 			name: "pdbs No disruptions allowed, pod pending, first delete conflict, pod still pending, pod deleted successfully",
@@ -301,6 +303,100 @@ func TestEvictionIngorePDB(t *testing.T) {
 			expectedDeleteCount: 1,
 			podTerminating:      true,
 		},
+		{
+			name: "matching pdbs with no disruptions allowed, pod running, pod healthy, unhealthy pod not ours",
+			pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+				ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+				Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+				Status: policyv1beta1.PodDisruptionBudgetStatus{
+					// This simulates 3 pods desired, our pod healthy, unhealthy pod is not ours.
+					DisruptionsAllowed: 0,
+					CurrentHealthy:     2,
+					DesiredHealthy:     2,
+				},
+			}},
+			eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t7", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+			expectError:         true,
+			podName:             "t7",
+			expectedDeleteCount: 0,
+			podTerminating:      false,
+			podPhase:            api.PodRunning,
+			prc: &api.PodCondition{
+				Type:   api.PodReady,
+				Status: api.ConditionTrue,
+			},
+		},
+		{
+			name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours",
+			pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+				ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+				Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+				Status: policyv1beta1.PodDisruptionBudgetStatus{
+					// This simulates 3 pods desired, our pod unhealthy
+					DisruptionsAllowed: 0,
+					CurrentHealthy:     2,
+					DesiredHealthy:     2,
+				},
+			}},
+			eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t8", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+			expectError:         false,
+			podName:             "t8",
+			expectedDeleteCount: 1,
+			podTerminating:      false,
+			podPhase:            api.PodRunning,
+			prc: &api.PodCondition{
+				Type:   api.PodReady,
+				Status: api.ConditionFalse,
+			},
+		},
+		{
+			// This case should return the 429 retry error.
+			name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours, resource version conflict",
+			pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+				ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+				Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+				Status: policyv1beta1.PodDisruptionBudgetStatus{
+					// This simulates 3 pods desired, our pod unhealthy
+					DisruptionsAllowed: 0,
+					CurrentHealthy:     2,
+					DesiredHealthy:     2,
+				},
+			}},
+			eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t9", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+			expectError:         true,
+			podName:             "t9",
+			expectedDeleteCount: 1,
+			podTerminating:      false,
+			podPhase:            api.PodRunning,
+			prc: &api.PodCondition{
+				Type:   api.PodReady,
+				Status: api.ConditionFalse,
+			},
+		},
+		{
+			// This case should return the underlying delete error, not the 429 retry error.
+			name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours, other error on delete",
+			pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+				ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+				Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+				Status: policyv1beta1.PodDisruptionBudgetStatus{
+					// This simulates 3 pods desired, our pod unhealthy
+					DisruptionsAllowed: 0,
+					CurrentHealthy:     2,
+					DesiredHealthy:     2,
+				},
+			}},
+			eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t10", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+			expectError:         true,
+			podName:             "t10",
+			expectedDeleteCount: 1,
+			podTerminating:      false,
+			podPhase:            api.PodRunning,
+			prc: &api.PodCondition{
+				Type:   api.PodReady,
+				Status: api.ConditionFalse,
+			},
+		},
 	}
 
 	for _, tc := range testcases {
@@ -323,6 +419,13 @@ func TestEvictionIngorePDB(t *testing.T) {
 			pod.ObjectMeta.DeletionTimestamp = &currentTime
 		}
 
+		// Setup pod condition
+		if tc.prc != nil {
+			if !podapi.UpdatePodCondition(&pod.Status, tc.prc) {
+				t.Fatalf("Unable to update pod ready condition")
+			}
+		}
+
 		client := fake.NewSimpleClientset(tc.pdbs...)
 		evictionRest := newEvictionStorage(ms, client.PolicyV1beta1())
@@ -416,10 +519,14 @@ func (ms *mockStore) mutatorDeleteFunc(count int, options *metav1.DeleteOptions)
 		// Always return error for this pod
 		return nil, false, apierrors.NewConflict(resource("tests"), "2", errors.New("message"))
 	}
-	if ms.pod.Name == "t6" {
-		// This pod has a deletionTimestamp and should not raise conflict on delete
+	if ms.pod.Name == "t6" || ms.pod.Name == "t8" {
+		// t6: This pod has a deletionTimestamp and should not raise conflict on delete
+		// t8: This pod should not have a resource conflict.
 		return nil, true, nil
 	}
+	if ms.pod.Name == "t10" {
+		return nil, false, apierrors.NewBadRequest("test designed to error")
+	}
 	if count == 1 {
 		// This is a hack to ensure that some test pods don't change phase
 		// but do change resource version