Mirror of https://github.com/k3s-io/kubernetes.git, synced 2025-08-01 15:58:37 +00:00
Merge pull request #94381 from mgugino-upstream-stage/eviction-disrupted-pods
Allow deletion of unhealthy pods if enough healthy
This commit is contained in:
commit 7509c4eb47
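The change: the eviction handler may now bypass a PodDisruptionBudget when the pod being evicted is itself unready and the budget already counts enough healthy pods, since deleting an unhealthy pod cannot push the budget below its target. To guard the race where the pod becomes ready after the check, the server stamps the pod's resourceVersion into the delete preconditions and translates the resulting 409 Conflict into a 429 TooManyRequests. A minimal client-side sketch of the intended interaction (not part of this PR; the helper name and the 5-second backoff are assumptions):

```go
package main

import (
	"context"
	"time"

	policyv1beta1 "k8s.io/api/policy/v1beta1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// evictWithRetry posts an Eviction and retries while the server answers
// 429, which after this PR can also mean "the eviction raced with a pod
// update; try again".
func evictWithRetry(ctx context.Context, c kubernetes.Interface, namespace, podName string) error {
	ev := &policyv1beta1.Eviction{
		ObjectMeta: metav1.ObjectMeta{Name: podName, Namespace: namespace},
	}
	for {
		err := c.PolicyV1beta1().Evictions(namespace).Evict(ctx, ev)
		if err == nil || !apierrors.IsTooManyRequests(err) {
			return err
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(5 * time.Second): // assumed backoff; no Retry-After header is sent yet
		}
	}
}
```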
@@ -14,6 +14,7 @@ go_test(
     ],
     embed = [":go_default_library"],
     deps = [
+        "//pkg/api/pod:go_default_library",
         "//pkg/apis/core:go_default_library",
         "//pkg/apis/policy:go_default_library",
         "//pkg/registry/registrytest:go_default_library",
@@ -33,6 +33,7 @@ import (
 	"k8s.io/apiserver/pkg/util/dryrun"
 	policyclient "k8s.io/client-go/kubernetes/typed/policy/v1beta1"
 	"k8s.io/client-go/util/retry"
+	podutil "k8s.io/kubernetes/pkg/api/pod"
 	api "k8s.io/kubernetes/pkg/apis/core"
 	"k8s.io/kubernetes/pkg/apis/policy"
 )
@@ -145,19 +146,18 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Object
 	}

 	// the PDB can be ignored, so delete the pod
-	deletionOptions := originalDeleteOptions.DeepCopy()
+	deleteOptions := originalDeleteOptions

 	// We should check if resourceVersion is already set by the requestor
 	// as it might be older than the pod we just fetched and should be
 	// honored.
 	if shouldEnforceResourceVersion(pod) && resourceVersionIsUnset(originalDeleteOptions) {
-		// Set deletionOptions.Preconditions.ResourceVersion to ensure we're not
+		// Set deleteOptions.Preconditions.ResourceVersion to ensure we're not
 		// racing with another PDB-impacting process elsewhere.
-		if deletionOptions.Preconditions == nil {
-			deletionOptions.Preconditions = &metav1.Preconditions{}
-		}
-		deletionOptions.Preconditions.ResourceVersion = &pod.ResourceVersion
+		deleteOptions = deleteOptions.DeepCopy()
+		setPreconditionsResourceVersion(deleteOptions, &pod.ResourceVersion)
 	}
-	_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deletionOptions)
+	_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deleteOptions)
 	if err != nil {
 		return err
 	}
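The hunk above also changes the copying discipline: instead of always working on a DeepCopy, the handler copies the client's options only when it actually needs to inject a precondition. A sketch of the precondition mechanics this relies on, mirroring the setPreconditionsResourceVersion helper added later in this diff (the function name here is ours, not the PR's):

```go
import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// withResourceVersionPrecondition returns delete options that succeed only
// if the object still has the given resourceVersion; otherwise the
// apiserver rejects the delete with 409 Conflict. A nil Preconditions (or
// nil ResourceVersion) means "delete whatever is current", which is why
// resourceVersionIsUnset checks both fields.
func withResourceVersionPrecondition(base *metav1.DeleteOptions, rv string) *metav1.DeleteOptions {
	opts := base.DeepCopy() // never mutate the caller's options
	if opts.Preconditions == nil {
		opts.Preconditions = &metav1.Preconditions{}
	}
	opts.Preconditions.ResourceVersion = &rv
	return opts
}
```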
@@ -181,6 +181,8 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Object

 	var rtStatus *metav1.Status
 	var pdbName string
+	updateDeletionOptions := false
+
 	err = func() error {
 		pdbs, err := r.getPodDisruptionBudgets(ctx, pod)
 		if err != nil {
@@ -201,6 +203,13 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Object

 		pdb := &pdbs[0]
 		pdbName = pdb.Name
+
+		// If the pod is not ready, it doesn't count towards healthy and we should not decrement
+		if !podutil.IsPodReady(pod) && pdb.Status.CurrentHealthy >= pdb.Status.DesiredHealthy && pdb.Status.DesiredHealthy > 0 {
+			updateDeletionOptions = true
+			return nil
+		}
+
 		refresh := false
 		err = retry.RetryOnConflict(EvictionsRetry, func() error {
 			if refresh {
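This condition is the heart of the change: an unready pod does not count toward CurrentHealthy, so when CurrentHealthy already meets DesiredHealthy, deleting the unready pod cannot push the budget below its target, and the DisruptionsAllowed check can be skipped entirely. Restated as a standalone predicate (a sketch, not code from this PR):

```go
// canBypassPDB mirrors the gate above. The DesiredHealthy > 0 clause
// excludes budgets whose healthy target is zero, where bypassing is moot.
func canBypassPDB(podReady bool, currentHealthy, desiredHealthy int32) bool {
	return !podReady && currentHealthy >= desiredHealthy && desiredHealthy > 0
}
```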
@@ -232,11 +241,29 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Object
 		return rtStatus, nil
 	}

-	// At this point there was either no PDB or we succeeded in decrementing
+	// At this point there was either no PDB or we succeeded in decrementing or
+	// the pod was unready and we have enough healthy replicas
+
+	deleteOptions := originalDeleteOptions
+
+	// Set deleteOptions.Preconditions.ResourceVersion to ensure
+	// the pod hasn't become ready since we calculated the PDB state above.
+	if updateDeletionOptions {
+		// Take a copy so we can compare to client-provided Options later.
+		deleteOptions = deleteOptions.DeepCopy()
+		setPreconditionsResourceVersion(deleteOptions, &pod.ResourceVersion)
+	}

 	// Try the delete
-	_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, originalDeleteOptions.DeepCopy())
+	_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deleteOptions)
 	if err != nil {
+		if errors.IsConflict(err) && updateDeletionOptions &&
+			(originalDeleteOptions.Preconditions == nil || originalDeleteOptions.Preconditions.ResourceVersion == nil) {
+			// If we encounter a resource conflict error, we updated the deletion options
+			// to include a resourceVersion precondition, and the original deletion options
+			// did not specify ResourceVersion, we send back TooManyRequests so clients will retry.
+			return nil, createTooManyRequestsError(pdbName)
+		}
		return nil, err
 	}
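The error translation above distinguishes who set the precondition: a 409 caused by the resourceVersion the server injected is rewritten to 429 so clients simply retry, while a 409 against a client-supplied resourceVersion is surfaced unchanged. A sketch of that rule in isolation (hypothetical helper; the real code inlines this logic):

```go
import (
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// translateEvictionConflict rewrites a conflict to 429 only when the
// precondition was server-injected (serverInjected) and the client did
// not pin a resourceVersion of its own.
func translateEvictionConflict(err error, serverInjected bool, clientOpts *metav1.DeleteOptions) error {
	clientPinnedRV := clientOpts != nil && clientOpts.Preconditions != nil && clientOpts.Preconditions.ResourceVersion != nil
	if apierrors.IsConflict(err) && serverInjected && !clientPinnedRV {
		// 0 = no Retry-After suggestion, matching the TODO in this file.
		return apierrors.NewTooManyRequests("the eviction raced with a pod update; please retry", 0)
	}
	return err
}
```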
@@ -244,6 +271,13 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Object
 	return &metav1.Status{Status: metav1.StatusSuccess}, nil
 }

+func setPreconditionsResourceVersion(deleteOptions *metav1.DeleteOptions, resourceVersion *string) {
+	if deleteOptions.Preconditions == nil {
+		deleteOptions.Preconditions = &metav1.Preconditions{}
+	}
+	deleteOptions.Preconditions.ResourceVersion = resourceVersion
+}
+
 // canIgnorePDB returns true for pod conditions that allow the pod to be deleted
 // without checking PDBs.
 func canIgnorePDB(pod *api.Pod) bool {
@@ -268,16 +302,21 @@ func resourceVersionIsUnset(options *metav1.DeleteOptions) bool {
 	return options.Preconditions == nil || options.Preconditions.ResourceVersion == nil
 }

+func createTooManyRequestsError(name string) error {
+	// TODO(mml): Add a Retry-After header. Once there are time-based
+	// budgets, we can sometimes compute a sensible suggested value. But
+	// even without that, we can give a suggestion (10 minutes?) that
+	// prevents well-behaved clients from hammering us.
+	err := errors.NewTooManyRequests("Cannot evict pod as it would violate the pod's disruption budget.", 0)
+	err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{Type: "DisruptionBudget", Message: fmt.Sprintf("The disruption budget %s is still being processed by the server.", name)})
+	return err
+}
+
 // checkAndDecrement checks if the provided PodDisruptionBudget allows any disruption.
 func (r *EvictionREST) checkAndDecrement(namespace string, podName string, pdb policyv1beta1.PodDisruptionBudget, dryRun bool) error {
 	if pdb.Status.ObservedGeneration < pdb.Generation {
-		// TODO(mml): Add a Retry-After header. Once there are time-based
-		// budgets, we can sometimes compute a sensible suggested value. But
-		// even without that, we can give a suggestion (10 minutes?) that
-		// prevents well-behaved clients from hammering us.
-		err := errors.NewTooManyRequests("Cannot evict pod as it would violate the pod's disruption budget.", 0)
-		err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{Type: "DisruptionBudget", Message: fmt.Sprintf("The disruption budget %s is still being processed by the server.", pdb.Name)})
-		return err
+		return createTooManyRequestsError(pdb.Name)
 	}
 	if pdb.Status.DisruptionsAllowed < 0 {
 		return errors.NewForbidden(policy.Resource("poddisruptionbudget"), pdb.Name, fmt.Errorf("pdb disruptions allowed is negative"))
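The refactor extracts the duplicated 429 construction into createTooManyRequestsError. Appending to err.ErrStatus.Details.Causes without a nil check is safe because apimachinery's NewTooManyRequests always allocates Details (it carries RetryAfterSeconds). A caller-side sketch of fishing the cause back out (hypothetical helper, not part of this PR):

```go
import (
	apierrors "k8s.io/apimachinery/pkg/api/errors"
)

// disruptionBudgetCause reports whether an eviction failure carries the
// "DisruptionBudget" cause attached by the server, and returns its message.
func disruptionBudgetCause(err error) (string, bool) {
	status, ok := err.(apierrors.APIStatus)
	if !ok || status.Status().Details == nil {
		return "", false
	}
	for _, cause := range status.Status().Details.Causes {
		if string(cause.Type) == "DisruptionBudget" {
			return cause.Message, true
		}
	}
	return "", false
}
```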
@@ -31,6 +31,7 @@ import (
 	genericapirequest "k8s.io/apiserver/pkg/endpoints/request"
 	"k8s.io/apiserver/pkg/registry/rest"
 	"k8s.io/client-go/kubernetes/fake"
+	podapi "k8s.io/kubernetes/pkg/api/pod"
 	api "k8s.io/kubernetes/pkg/apis/core"
 	"k8s.io/kubernetes/pkg/apis/policy"
 )
@@ -219,6 +220,7 @@ func TestEvictionIngorePDB(t *testing.T) {
 		podName             string
 		expectedDeleteCount int
 		podTerminating      bool
+		prc                 *api.PodCondition
 	}{
 		{
 			name: "pdbs No disruptions allowed, pod pending, first delete conflict, pod still pending, pod deleted successfully",
@@ -301,6 +303,100 @@ func TestEvictionIngorePDB(t *testing.T) {
 			expectedDeleteCount: 1,
 			podTerminating:      true,
 		},
+		{
+			name: "matching pdbs with no disruptions allowed, pod running, pod healthy, unhealthy pod not ours",
+			pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+				ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+				Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+				Status: policyv1beta1.PodDisruptionBudgetStatus{
+					// This simulates 3 pods desired, our pod healthy, unhealthy pod is not ours.
+					DisruptionsAllowed: 0,
+					CurrentHealthy:     2,
+					DesiredHealthy:     2,
+				},
+			}},
+			eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t7", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+			expectError:         true,
+			podName:             "t7",
+			expectedDeleteCount: 0,
+			podTerminating:      false,
+			podPhase:            api.PodRunning,
+			prc: &api.PodCondition{
+				Type:   api.PodReady,
+				Status: api.ConditionTrue,
+			},
+		},
+		{
+			name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours",
+			pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+				ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+				Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+				Status: policyv1beta1.PodDisruptionBudgetStatus{
+					// This simulates 3 pods desired, our pod unhealthy
+					DisruptionsAllowed: 0,
+					CurrentHealthy:     2,
+					DesiredHealthy:     2,
+				},
+			}},
+			eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t8", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+			expectError:         false,
+			podName:             "t8",
+			expectedDeleteCount: 1,
+			podTerminating:      false,
+			podPhase:            api.PodRunning,
+			prc: &api.PodCondition{
+				Type:   api.PodReady,
+				Status: api.ConditionFalse,
+			},
+		},
+		{
+			// This case should return the 429 retry error.
+			name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours, resource version conflict",
+			pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+				ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+				Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+				Status: policyv1beta1.PodDisruptionBudgetStatus{
+					// This simulates 3 pods desired, our pod unhealthy
+					DisruptionsAllowed: 0,
+					CurrentHealthy:     2,
+					DesiredHealthy:     2,
+				},
+			}},
+			eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t9", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+			expectError:         true,
+			podName:             "t9",
+			expectedDeleteCount: 1,
+			podTerminating:      false,
+			podPhase:            api.PodRunning,
+			prc: &api.PodCondition{
+				Type:   api.PodReady,
+				Status: api.ConditionFalse,
+			},
+		},
+		{
+			// This case should surface the underlying delete error.
+			name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours, other error on delete",
+			pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+				ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+				Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+				Status: policyv1beta1.PodDisruptionBudgetStatus{
+					// This simulates 3 pods desired, our pod unhealthy
+					DisruptionsAllowed: 0,
+					CurrentHealthy:     2,
+					DesiredHealthy:     2,
+				},
+			}},
+			eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t10", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+			expectError:         true,
+			podName:             "t10",
+			expectedDeleteCount: 1,
+			podTerminating:      false,
+			podPhase:            api.PodRunning,
+			prc: &api.PodCondition{
+				Type:   api.PodReady,
+				Status: api.ConditionFalse,
+			},
+		},
 	}

 	for _, tc := range testcases {
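All four new cases share one budget shape: DisruptionsAllowed: 0, CurrentHealthy: 2, DesiredHealthy: 2 (three pods total, one unhealthy). The arithmetic: evicting the ready pod t7 would drop healthy to 1 < 2, so eviction is refused; t8's pod is the unready one, so healthy stays at 2 and the delete proceeds; t9 hits the injected precondition and expects the 429 retry error; t10 checks that a non-conflict delete error passes through. Evaluated with the canBypassPDB sketch from earlier (illustration only):

```go
_ = canBypassPDB(true, 2, 2)  // t7: pod ready         -> false, PDB enforced, eviction refused
_ = canBypassPDB(false, 2, 2) // t8/t9/t10: pod unready -> true, bypass with precondition
```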
@@ -323,6 +419,13 @@ func TestEvictionIngorePDB(t *testing.T) {
 			pod.ObjectMeta.DeletionTimestamp = &currentTime
 		}

+		// Setup pod condition
+		if tc.prc != nil {
+			if !podapi.UpdatePodCondition(&pod.Status, tc.prc) {
+				t.Fatalf("Unable to update pod ready condition")
+			}
+		}
+
 		client := fake.NewSimpleClientset(tc.pdbs...)
 		evictionRest := newEvictionStorage(ms, client.PolicyV1beta1())
@@ -416,10 +519,14 @@ func (ms *mockStore) mutatorDeleteFunc(count int, options *metav1.DeleteOptions)
 		// Always return error for this pod
 		return nil, false, apierrors.NewConflict(resource("tests"), "2", errors.New("message"))
 	}
-	if ms.pod.Name == "t6" {
-		// This pod has a deletionTimestamp and should not raise conflict on delete
+	if ms.pod.Name == "t6" || ms.pod.Name == "t8" {
+		// t6: This pod has a deletionTimestamp and should not raise conflict on delete
+		// t8: This pod should not have a resource conflict.
 		return nil, true, nil
 	}
+	if ms.pod.Name == "t10" {
+		return nil, false, apierrors.NewBadRequest("test designed to error")
+	}
 	if count == 1 {
 		// This is a hack to ensure that some test pods don't change phase
 		// but do change resource version