Mirror of https://github.com/k3s-io/kubernetes.git
Add pod disruption conditions for kubelet initiated failures
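The e2e_node changes below assert that pods terminated by kubelet-initiated disruptions (PID-pressure eviction and graceful node shutdown) carry the DisruptionTarget pod condition. As a rough, illustrative sketch only (not part of this diff; the Reason and Message values are placeholders, not the exact strings the kubelet writes), the condition the tests look for has this shape:

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

func main() {
	// Illustrative only: the shape of the DisruptionTarget condition that the
	// tests below expect on a pod terminated by the kubelet. Reason and Message
	// are placeholders, not the exact values written by the kubelet.
	cond := v1.PodCondition{
		Type:    v1.AlphaNoCompatGuaranteeDisruptionTarget, // alpha name of the DisruptionTarget condition type
		Status:  v1.ConditionTrue,
		Reason:  "TerminationByKubelet",
		Message: "placeholder: pod terminated by a kubelet-initiated eviction or node shutdown",
	}
	fmt.Printf("%s=%s (%s)\n", cond.Type, cond.Status, cond.Reason)
}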
@@ -31,6 +31,7 @@ import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	kubeletstatsv1alpha1 "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
	"k8s.io/kubernetes/pkg/features"
	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
	"k8s.io/kubernetes/pkg/kubelet/eviction"
	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
@@ -500,6 +501,28 @@ var _ = SIGDescribe("PriorityPidEvictionOrdering [Slow] [Serial] [Disruptive][No
		specs[2].pod.Spec.PriorityClassName = highPriorityClassName
		runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logPidMetrics, specs)
	})

	ginkgo.Context(fmt.Sprintf(testContextFmt, expectedNodeCondition)+"; PodDisruptionConditions enabled [NodeFeature:PodDisruptionConditions]", func() {
		tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
			pidsConsumed := int64(10000)
			summary := eventuallyGetSummary()
			availablePids := *(summary.Node.Rlimit.MaxPID) - *(summary.Node.Rlimit.NumOfRunningProcesses)
			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalPIDAvailable): fmt.Sprintf("%d", availablePids-pidsConsumed)}
			initialConfig.EvictionMinimumReclaim = map[string]string{}
			initialConfig.FeatureGates = map[string]bool{
				string(features.PodDisruptionConditions): true,
			}
		})
		disruptionTarget := v1.AlphaNoCompatGuaranteeDisruptionTarget
		specs := []podEvictSpec{
			{
				evictionPriority: 1,
				pod: pidConsumingPod("fork-bomb-container", 30000),
				wantPodDisruptionCondition: &disruptionTarget,
			},
		}
		runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logPidMetrics, specs)
	})
})

// Struct used by runEvictionTest that specifies the pod, and when that pod should be evicted, relative to other pods
@@ -507,8 +530,9 @@ type podEvictSpec struct {
	// P0 should never be evicted, P1 shouldn't evict before P2, etc.
	// If two are ranked at P1, either is permitted to fail before the other.
	// The test ends when all pods other than p0 have been evicted
	evictionPriority int
	pod *v1.Pod
	evictionPriority int
	pod *v1.Pod
	wantPodDisruptionCondition *v1.PodConditionType
}

// runEvictionTest sets up a testing environment given the provided pods, and checks a few things:
@@ -560,6 +584,9 @@ func runEvictionTest(f *framework.Framework, pressureTimeout time.Duration, expe
				return verifyEvictionOrdering(f, testSpecs)
			}, pressureTimeout, evictionPollInterval).Should(gomega.BeNil())

			ginkgo.By("checking for the expected pod conditions for evicted pods")
			verifyPodConditions(f, testSpecs)

			// We observe pressure from the API server. The eviction manager observes pressure from the kubelet internal stats.
			// This means the eviction manager will observe pressure before we will, creating a delay between when the eviction manager
			// evicts a pod, and when we observe the pressure by querying the API server. Add a delay here to account for this delay
@@ -725,6 +752,21 @@ func verifyEvictionOrdering(f *framework.Framework, testSpecs []podEvictSpec) er
	return fmt.Errorf("pods that should be evicted are still running: %#v", pendingPods)
}

func verifyPodConditions(f *framework.Framework, testSpecs []podEvictSpec) {
	for _, spec := range testSpecs {
		if spec.wantPodDisruptionCondition != nil {
			pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(context.TODO(), spec.pod.Name, metav1.GetOptions{})
			framework.ExpectNoError(err, "Failed to get the recent pod object for name: %q", pod.Name)

			cType := *spec.wantPodDisruptionCondition
			podDisruptionCondition := e2epod.FindPodConditionByType(&pod.Status, cType)
			if podDisruptionCondition == nil {
				framework.Failf("pod %q should have the condition: %q, pod status: %v", pod.Name, cType, pod.Status)
			}
		}
	}
}

func verifyEvictionEvents(f *framework.Framework, testSpecs []podEvictSpec, expectedStarvedResource v1.ResourceName) {
	for _, spec := range testSpecs {
		pod := spec.pod
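As an aside (not part of this commit), verifyPodConditions above only requires that the DisruptionTarget condition is present; a stricter variant could also require the condition's status to be True. A minimal sketch, reusing the e2e framework helpers already imported by this test file (the helper name requireDisruptionTargetTrue is hypothetical):

package e2enode

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/kubernetes/test/e2e/framework"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
)

// requireDisruptionTargetTrue is a hypothetical, stricter variant of the check in
// verifyPodConditions: it fails unless the DisruptionTarget condition exists and
// its status is True.
func requireDisruptionTargetTrue(pod *v1.Pod) {
	cond := e2epod.FindPodConditionByType(&pod.Status, v1.AlphaNoCompatGuaranteeDisruptionTarget)
	if cond == nil || cond.Status != v1.ConditionTrue {
		framework.Failf("pod %q should have condition %q with status %q, pod status: %v",
			pod.Name, v1.AlphaNoCompatGuaranteeDisruptionTarget, v1.ConditionTrue, pod.Status)
	}
}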
@@ -55,6 +55,109 @@ import (
var _ = SIGDescribe("GracefulNodeShutdown [Serial] [NodeFeature:GracefulNodeShutdown] [NodeFeature:GracefulNodeShutdownBasedOnPodPriority]", func() {
	f := framework.NewDefaultFramework("graceful-node-shutdown")
	f.NamespacePodSecurityEnforceLevel = admissionapi.LevelPrivileged

	ginkgo.Context("graceful node shutdown when PodDisruptionConditions are enabled [NodeFeature:PodDisruptionConditions]", func() {

		const (
			pollInterval = 1 * time.Second
			podStatusUpdateTimeout = 30 * time.Second
			nodeStatusUpdateTimeout = 30 * time.Second
			nodeShutdownGracePeriod = 30 * time.Second
		)

		tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
			initialConfig.FeatureGates = map[string]bool{
				string(features.GracefulNodeShutdown): true,
				string(features.PodDisruptionConditions): true,
				string(features.GracefulNodeShutdownBasedOnPodPriority): false,
			}
			initialConfig.ShutdownGracePeriod = metav1.Duration{Duration: nodeShutdownGracePeriod}
		})

		ginkgo.BeforeEach(func() {
			ginkgo.By("Wait for the node to be ready")
			waitForNodeReady()
		})

		ginkgo.AfterEach(func() {
			ginkgo.By("Emitting Shutdown false signal; cancelling the shutdown")
			err := emitSignalPrepareForShutdown(false)
			framework.ExpectNoError(err)
		})

		ginkgo.It("should add the DisruptionTarget pod failure condition to the evicted pods", func() {
			nodeName := getNodeName(f)
			nodeSelector := fields.Set{
				"spec.nodeName": nodeName,
			}.AsSelector().String()

			// Define test pods
			pods := []*v1.Pod{
				getGracePeriodOverrideTestPod("pod-to-evict", nodeName, 5, ""),
			}

			ctx, cancel := context.WithCancel(context.Background())
			defer cancel()

			ginkgo.By("Creating batch pods")
			e2epod.NewPodClient(f).CreateBatch(pods)

			list, err := e2epod.NewPodClient(f).List(ctx, metav1.ListOptions{
				FieldSelector: nodeSelector,
			})

			framework.ExpectNoError(err)
			framework.ExpectEqual(len(list.Items), len(pods), "the number of pods is not as expected")

			list, err = e2epod.NewPodClient(f).List(ctx, metav1.ListOptions{
				FieldSelector: nodeSelector,
			})
			if err != nil {
				framework.Failf("Failed to start batch pod: %q", err)
			}
			framework.ExpectEqual(len(list.Items), len(pods), "the number of pods is not as expected")

			for _, pod := range list.Items {
				framework.Logf("Pod %q status conditions: %q", pod.Name, &pod.Status.Conditions)
			}

			ginkgo.By("Verifying batch pods are running")
			for _, pod := range list.Items {
				if podReady, err := testutils.PodRunningReady(&pod); err != nil || !podReady {
					framework.Failf("Failed to start batch pod: %v", pod.Name)
				}
			}

			ginkgo.By("Emitting shutdown signal")
			err = emitSignalPrepareForShutdown(true)
			framework.ExpectNoError(err)

			ginkgo.By("Verifying that all pods are shutdown")
			// All pods should be shut down
			gomega.Eventually(func() error {
				list, err = e2epod.NewPodClient(f).List(ctx, metav1.ListOptions{
					FieldSelector: nodeSelector,
				})
				if err != nil {
					return err
				}
				framework.ExpectEqual(len(list.Items), len(pods), "the number of pods is not as expected")

				for _, pod := range list.Items {
					if !isPodShutdown(&pod) {
						framework.Logf("Expecting pod to be shutdown, but it's not currently: Pod: %q, Pod Status %+v", pod.Name, pod.Status)
						return fmt.Errorf("pod should be shutdown, phase: %s", pod.Status.Phase)
					}
					podDisruptionCondition := e2epod.FindPodConditionByType(&pod.Status, v1.AlphaNoCompatGuaranteeDisruptionTarget)
					if podDisruptionCondition == nil {
						framework.Failf("pod %q should have the condition: %q, pod status: %v", pod.Name, v1.AlphaNoCompatGuaranteeDisruptionTarget, pod.Status)
					}
				}
				return nil
			}, podStatusUpdateTimeout+(nodeShutdownGracePeriod), pollInterval).Should(gomega.BeNil())
		})
	})

	ginkgo.Context("when gracefully shutting down", func() {

		const (