Merge pull request #94381 from mgugino-upstream-stage/eviction-disrupted-pods

Allow deletion of unhealthy pods if enough healthy

Commit: 7509c4eb47
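In short: the eviction subresource may now delete a pod that is not ready without spending the PodDisruptionBudget, as long as the budget's healthy count already meets the desired count. A minimal, self-contained Go sketch of that rule (the type and function names are illustrative stand-ins, not the merged code, which uses podutil.IsPodReady and the policy/v1beta1 status fields shown in the diff below):

package main

import "fmt"

// pdbStatus stands in for the policy/v1beta1 PodDisruptionBudgetStatus fields
// the new check reads.
type pdbStatus struct {
    DisruptionsAllowed int32
    CurrentHealthy     int32
    DesiredHealthy     int32
}

// canBypassBudget mirrors the new check in EvictionREST.Create: deleting a
// pod that is not ready cannot lower CurrentHealthy, so the eviction is safe
// whenever enough healthy replicas remain.
func canBypassBudget(podReady bool, st pdbStatus) bool {
    return !podReady && st.CurrentHealthy >= st.DesiredHealthy && st.DesiredHealthy > 0
}

func main() {
    st := pdbStatus{DisruptionsAllowed: 0, CurrentHealthy: 2, DesiredHealthy: 2}
    fmt.Println(canBypassBudget(false, st)) // true: unready pod, healthy quota already met
    fmt.Println(canBypassBudget(true, st))  // false: evicting a ready pod still needs budget
}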

pkg/registry/core/pod/rest/BUILD
@@ -14,6 +14,7 @@ go_test(
     ],
     embed = [":go_default_library"],
     deps = [
+        "//pkg/api/pod:go_default_library",
         "//pkg/apis/core:go_default_library",
         "//pkg/apis/policy:go_default_library",
         "//pkg/registry/registrytest:go_default_library",

pkg/registry/core/pod/rest/eviction.go
@@ -33,6 +33,7 @@ import (
     "k8s.io/apiserver/pkg/util/dryrun"
     policyclient "k8s.io/client-go/kubernetes/typed/policy/v1beta1"
     "k8s.io/client-go/util/retry"
+    podutil "k8s.io/kubernetes/pkg/api/pod"
     api "k8s.io/kubernetes/pkg/apis/core"
     "k8s.io/kubernetes/pkg/apis/policy"
 )
@@ -145,19 +146,18 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
         }
 
         // the PDB can be ignored, so delete the pod
-        deletionOptions := originalDeleteOptions.DeepCopy()
+        deleteOptions := originalDeleteOptions
+
         // We should check if resourceVersion is already set by the requestor
         // as it might be older than the pod we just fetched and should be
         // honored.
         if shouldEnforceResourceVersion(pod) && resourceVersionIsUnset(originalDeleteOptions) {
-            // Set deletionOptions.Preconditions.ResourceVersion to ensure we're not
+            // Set deleteOptions.Preconditions.ResourceVersion to ensure we're not
             // racing with another PDB-impacting process elsewhere.
-            if deletionOptions.Preconditions == nil {
-                deletionOptions.Preconditions = &metav1.Preconditions{}
-            }
-            deletionOptions.Preconditions.ResourceVersion = &pod.ResourceVersion
+            deleteOptions = deleteOptions.DeepCopy()
+            setPreconditionsResourceVersion(deleteOptions, &pod.ResourceVersion)
         }
-        _, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deletionOptions)
+        _, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deleteOptions)
         if err != nil {
             return err
         }
@@ -181,6 +181,8 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
 
     var rtStatus *metav1.Status
     var pdbName string
+    updateDeletionOptions := false
+
     err = func() error {
         pdbs, err := r.getPodDisruptionBudgets(ctx, pod)
         if err != nil {
@@ -201,6 +203,13 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
 
         pdb := &pdbs[0]
         pdbName = pdb.Name
+
+        // If the pod is not ready, it doesn't count towards healthy and we should not decrement
+        if !podutil.IsPodReady(pod) && pdb.Status.CurrentHealthy >= pdb.Status.DesiredHealthy && pdb.Status.DesiredHealthy > 0 {
+            updateDeletionOptions = true
+            return nil
+        }
+
         refresh := false
         err = retry.RetryOnConflict(EvictionsRetry, func() error {
             if refresh {
@@ -232,11 +241,29 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
         return rtStatus, nil
     }
 
-    // At this point there was either no PDB or we succeeded in decrementing
+    // At this point there was either no PDB or we succeeded in decrementing or
+    // the pod was unready and we have enough healthy replicas
+
+    deleteOptions := originalDeleteOptions
+
+    // Set deleteOptions.Preconditions.ResourceVersion to ensure
+    // the pod hasn't been considered ready since we calculated
+    if updateDeletionOptions {
+        // Take a copy so we can compare to client-provided Options later.
+        deleteOptions = deleteOptions.DeepCopy()
+        setPreconditionsResourceVersion(deleteOptions, &pod.ResourceVersion)
+    }
 
     // Try the delete
-    _, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, originalDeleteOptions.DeepCopy())
+    _, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deleteOptions)
     if err != nil {
+        if errors.IsConflict(err) && updateDeletionOptions &&
+            (originalDeleteOptions.Preconditions == nil || originalDeleteOptions.Preconditions.ResourceVersion == nil) {
+            // If we encounter a resource conflict error, we updated the deletion options to include them,
+            // and the original deletion options did not specify ResourceVersion, we send back
+            // TooManyRequests so clients will retry.
+            return nil, createTooManyRequestsError(pdbName)
+        }
         return nil, err
     }
 
@@ -244,6 +271,13 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
     return &metav1.Status{Status: metav1.StatusSuccess}, nil
 }
 
+func setPreconditionsResourceVersion(deleteOptions *metav1.DeleteOptions, resourceVersion *string) {
+    if deleteOptions.Preconditions == nil {
+        deleteOptions.Preconditions = &metav1.Preconditions{}
+    }
+    deleteOptions.Preconditions.ResourceVersion = resourceVersion
+}
+
 // canIgnorePDB returns true for pod conditions that allow the pod to be deleted
 // without checking PDBs.
 func canIgnorePDB(pod *api.Pod) bool {
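
As an aside, the new setPreconditionsResourceVersion helper allocates Preconditions lazily and pins the pod's ResourceVersion, so a later delete fails with 409 Conflict if the pod changed after it was read. A small usage sketch (the main function and the sample ResourceVersion value are ours, not from the PR):

package main

import (
    "fmt"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Same shape as the helper added by this PR: allocate Preconditions on
// demand, then pin the given ResourceVersion.
func setPreconditionsResourceVersion(deleteOptions *metav1.DeleteOptions, resourceVersion *string) {
    if deleteOptions.Preconditions == nil {
        deleteOptions.Preconditions = &metav1.Preconditions{}
    }
    deleteOptions.Preconditions.ResourceVersion = resourceVersion
}

func main() {
    opts := metav1.NewDeleteOptions(0) // grace period 0; no preconditions yet
    rv := "12345"                      // assumed ResourceVersion read from the pod
    setPreconditionsResourceVersion(opts, &rv)
    fmt.Println(*opts.Preconditions.ResourceVersion) // "12345"
}
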
@@ -268,16 +302,21 @@ func resourceVersionIsUnset(options *metav1.DeleteOptions) bool {
     return options.Preconditions == nil || options.Preconditions.ResourceVersion == nil
 }
 
+func createTooManyRequestsError(name string) error {
+    // TODO(mml): Add a Retry-After header. Once there are time-based
+    // budgets, we can sometimes compute a sensible suggested value. But
+    // even without that, we can give a suggestion (10 minutes?) that
+    // prevents well-behaved clients from hammering us.
+    err := errors.NewTooManyRequests("Cannot evict pod as it would violate the pod's disruption budget.", 0)
+    err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{Type: "DisruptionBudget", Message: fmt.Sprintf("The disruption budget %s is still being processed by the server.", name)})
+    return err
+}
+
 // checkAndDecrement checks if the provided PodDisruptionBudget allows any disruption.
 func (r *EvictionREST) checkAndDecrement(namespace string, podName string, pdb policyv1beta1.PodDisruptionBudget, dryRun bool) error {
     if pdb.Status.ObservedGeneration < pdb.Generation {
-        // TODO(mml): Add a Retry-After header. Once there are time-based
-        // budgets, we can sometimes compute a sensible suggested value. But
-        // even without that, we can give a suggestion (10 minutes?) that
-        // prevents well-behaved clients from hammering us.
-        err := errors.NewTooManyRequests("Cannot evict pod as it would violate the pod's disruption budget.", 0)
-        err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{Type: "DisruptionBudget", Message: fmt.Sprintf("The disruption budget %s is still being processed by the server.", pdb.Name)})
-        return err
+
+        return createTooManyRequestsError(pdb.Name)
     }
     if pdb.Status.DisruptionsAllowed < 0 {
         return errors.NewForbidden(policy.Resource("poddisruptionbudget"), pdb.Name, fmt.Errorf("pdb disruptions allowed is negative"))
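
One practical consequence of the eviction.go changes above: when the unready-pod path pins a ResourceVersion and the delete then conflicts, the endpoint answers 429 TooManyRequests instead of 409, signalling the client to retry. A hedged client-side sketch (the clientset cs, the retry count, and the backoff are assumptions, not part of this PR):

package main

import (
    "context"
    "fmt"
    "time"

    policyv1beta1 "k8s.io/api/policy/v1beta1"
    apierrors "k8s.io/apimachinery/pkg/api/errors"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
)

// evictWithRetry posts an Eviction and retries only on 429, which is what
// the server now returns when the unready-pod ResourceVersion precondition
// races with another update.
func evictWithRetry(ctx context.Context, cs kubernetes.Interface, ns, name string) error {
    eviction := &policyv1beta1.Eviction{
        ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns},
    }
    for i := 0; i < 5; i++ {
        err := cs.PolicyV1beta1().Evictions(ns).Evict(ctx, eviction)
        if err == nil || !apierrors.IsTooManyRequests(err) {
            return err // success, or a non-retryable error
        }
        time.Sleep(time.Second << i) // back off before retrying the 429
    }
    return fmt.Errorf("eviction of %s/%s kept hitting the disruption budget", ns, name)
}
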
@ -31,6 +31,7 @@ import (
|
|||||||
genericapirequest "k8s.io/apiserver/pkg/endpoints/request"
|
genericapirequest "k8s.io/apiserver/pkg/endpoints/request"
|
||||||
"k8s.io/apiserver/pkg/registry/rest"
|
"k8s.io/apiserver/pkg/registry/rest"
|
||||||
"k8s.io/client-go/kubernetes/fake"
|
"k8s.io/client-go/kubernetes/fake"
|
||||||
|
podapi "k8s.io/kubernetes/pkg/api/pod"
|
||||||
api "k8s.io/kubernetes/pkg/apis/core"
|
api "k8s.io/kubernetes/pkg/apis/core"
|
||||||
"k8s.io/kubernetes/pkg/apis/policy"
|
"k8s.io/kubernetes/pkg/apis/policy"
|
||||||
)
|
)
|
||||||
@@ -219,6 +220,7 @@ func TestEvictionIngorePDB(t *testing.T) {
         podName             string
         expectedDeleteCount int
         podTerminating      bool
+        prc                 *api.PodCondition
     }{
         {
             name: "pdbs No disruptions allowed, pod pending, first delete conflict, pod still pending, pod deleted successfully",
@@ -301,6 +303,100 @@ func TestEvictionIngorePDB(t *testing.T) {
             expectedDeleteCount: 1,
             podTerminating:      true,
         },
+        {
+            name: "matching pdbs with no disruptions allowed, pod running, pod healthy, unhealthy pod not ours",
+            pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+                ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+                Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+                Status: policyv1beta1.PodDisruptionBudgetStatus{
+                    // This simulates 3 pods desired, our pod healthy, unhealthy pod is not ours.
+                    DisruptionsAllowed: 0,
+                    CurrentHealthy:     2,
+                    DesiredHealthy:     2,
+                },
+            }},
+            eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t7", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+            expectError:         true,
+            podName:             "t7",
+            expectedDeleteCount: 0,
+            podTerminating:      false,
+            podPhase:            api.PodRunning,
+            prc: &api.PodCondition{
+                Type:   api.PodReady,
+                Status: api.ConditionTrue,
+            },
+        },
+        {
+            name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours",
+            pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+                ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+                Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+                Status: policyv1beta1.PodDisruptionBudgetStatus{
+                    // This simulates 3 pods desired, our pod unhealthy
+                    DisruptionsAllowed: 0,
+                    CurrentHealthy:     2,
+                    DesiredHealthy:     2,
+                },
+            }},
+            eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t8", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+            expectError:         false,
+            podName:             "t8",
+            expectedDeleteCount: 1,
+            podTerminating:      false,
+            podPhase:            api.PodRunning,
+            prc: &api.PodCondition{
+                Type:   api.PodReady,
+                Status: api.ConditionFalse,
+            },
+        },
+        {
+            // This case should return the 429 retry error.
+            name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours, resource version conflict",
+            pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+                ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+                Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+                Status: policyv1beta1.PodDisruptionBudgetStatus{
+                    // This simulates 3 pods desired, our pod unhealthy
+                    DisruptionsAllowed: 0,
+                    CurrentHealthy:     2,
+                    DesiredHealthy:     2,
+                },
+            }},
+            eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t9", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+            expectError:         true,
+            podName:             "t9",
+            expectedDeleteCount: 1,
+            podTerminating:      false,
+            podPhase:            api.PodRunning,
+            prc: &api.PodCondition{
+                Type:   api.PodReady,
+                Status: api.ConditionFalse,
+            },
+        },
+        {
+            // This case should surface the underlying delete error rather than a retry.
+            name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours, other error on delete",
+            pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
+                ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
+                Spec:       policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
+                Status: policyv1beta1.PodDisruptionBudgetStatus{
+                    // This simulates 3 pods desired, our pod unhealthy
+                    DisruptionsAllowed: 0,
+                    CurrentHealthy:     2,
+                    DesiredHealthy:     2,
+                },
+            }},
+            eviction:            &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t10", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
+            expectError:         true,
+            podName:             "t10",
+            expectedDeleteCount: 1,
+            podTerminating:      false,
+            podPhase:            api.PodRunning,
+            prc: &api.PodCondition{
+                Type:   api.PodReady,
+                Status: api.ConditionFalse,
+            },
+        },
     }
 
     for _, tc := range testcases {
@@ -323,6 +419,13 @@ func TestEvictionIngorePDB(t *testing.T) {
             pod.ObjectMeta.DeletionTimestamp = &currentTime
         }
 
+        // Setup pod condition
+        if tc.prc != nil {
+            if !podapi.UpdatePodCondition(&pod.Status, tc.prc) {
+                t.Fatalf("Unable to update pod ready condition")
+            }
+        }
+
         client := fake.NewSimpleClientset(tc.pdbs...)
         evictionRest := newEvictionStorage(ms, client.PolicyV1beta1())
 
@@ -416,10 +519,14 @@ func (ms *mockStore) mutatorDeleteFunc(count int, options *metav1.DeleteOptions)
         // Always return error for this pod
         return nil, false, apierrors.NewConflict(resource("tests"), "2", errors.New("message"))
     }
-    if ms.pod.Name == "t6" {
-        // This pod has a deletionTimestamp and should not raise conflict on delete
+    if ms.pod.Name == "t6" || ms.pod.Name == "t8" {
+        // t6: This pod has a deletionTimestamp and should not raise conflict on delete
+        // t8: This pod should not have a resource conflict.
         return nil, true, nil
     }
+    if ms.pod.Name == "t10" {
+        return nil, false, apierrors.NewBadRequest("test designed to error")
+    }
     if count == 1 {
         // This is a hack to ensure that some test pods don't change phase
         // but do change resource version