	Merge pull request #42204 from dashpole/allocatable_eviction
Automatic merge from submit-queue

Eviction Manager Enforces Allocatable Thresholds

This PR modifies the eviction manager to enforce node allocatable thresholds for memory, as described in kubernetes/community#348. It should be merged after #41234.

cc @kubernetes/sig-node-pr-reviews @kubernetes/sig-node-feature-requests @vishh

**Why is this a bug/regression?**

The kubelet uses `oom_score_adj` to enforce QoS policies, but `oom_score_adj` is derived from the overall memory requested. A Burstable pod that requests a lot of memory can therefore lead to OOM kills of Guaranteed pods, which violates QoS. Even worse, we have observed system daemons such as the kubelet or kube-proxy being killed by the OOM killer. Without this PR, v1.6 will have node stability issues and regressions in out-of-resource handling, an existing GA feature.
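For context on the QoS problem described above, the sketch below illustrates the kind of heuristic at play: a kubelet-style policy derives a Burstable pod's `oom_score_adj` from the fraction of node memory it requests, so a pod with a very large request receives a score close to that of Guaranteed pods even though its actual usage may far exceed the request. The function name, constants, and clamping bounds are illustrative assumptions, not a verbatim copy of the kubelet's policy.

package main

import "fmt"

// oomScoreAdjSketch approximates how a kubelet-style QoS policy might map a
// pod's memory request to an oom_score_adj value. Constants and clamping
// bounds here are assumptions for illustration only.
func oomScoreAdjSketch(qosClass string, memoryRequestBytes, nodeCapacityBytes int64) int64 {
	switch qosClass {
	case "Guaranteed":
		return -998 // strongly protected from the OOM killer
	case "BestEffort":
		return 1000 // first candidate for the OOM killer
	}
	// Burstable: scale inversely with the fraction of node memory requested.
	adj := 1000 - (1000*memoryRequestBytes)/nodeCapacityBytes
	if adj < 2 {
		adj = 2 // never protected as strongly as Guaranteed pods
	}
	if adj > 999 {
		adj = 999 // never as exposed as BestEffort pods
	}
	return adj
}

func main() {
	capacity := int64(16 << 30) // assume a 16Gi node
	// A Burstable pod requesting nearly all node memory gets a very low score,
	// so the kernel prefers other victims when memory runs out.
	fmt.Println(oomScoreAdjSketch("Burstable", 15<<30, capacity)) // 63
	fmt.Println(oomScoreAdjSketch("Burstable", 1<<30, capacity))  // 938
}

Because a memory-hog pod's actual usage can greatly exceed its request, relying on `oom_score_adj` alone leaves Guaranteed pods and node daemons exposed; enforcing allocatable thresholds lets the eviction manager evict the offending pod before the kernel OOM killer has to pick a victim.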
@@ -57,6 +57,7 @@ go_library(
go_test(
    name = "go_default_test",
    srcs = [
        "allocatable_eviction_test.go",
        "apparmor_test.go",
        "container_manager_test.go",
        "critical_pod_test.go",

test/e2e_node/allocatable_eviction_test.go (new file, 104 lines)
@@ -0,0 +1,104 @@
/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e_node

import (
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/kubernetes/pkg/api/v1"
	"k8s.io/kubernetes/pkg/apis/componentconfig"
	"k8s.io/kubernetes/pkg/kubelet/cm"
	"k8s.io/kubernetes/test/e2e/framework"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
)

// Eviction Policy is described here:
// https://github.com/kubernetes/kubernetes/blob/master/docs/proposals/kubelet-eviction.md

var _ = framework.KubeDescribe("AllocatableEviction [Slow] [Serial] [Disruptive] [Flaky]", func() {
	f := framework.NewDefaultFramework("allocatable-eviction-test")

	podTestSpecs := []podTestSpec{
		{
			evictionPriority: 1, // This pod should be evicted before the innocent pod
			pod:              *getMemhogPod("memory-hog-pod", "memory-hog", v1.ResourceRequirements{}),
		},
		{
			evictionPriority: 0, // This pod should never be evicted
			pod: v1.Pod{
				ObjectMeta: metav1.ObjectMeta{Name: "innocent-pod"},
				Spec: v1.PodSpec{
					RestartPolicy: v1.RestartPolicyNever,
					Containers: []v1.Container{
						{
							Image: "gcr.io/google_containers/busybox:1.24",
							Name:  "normal-memory-usage-container",
							Command: []string{
								"sh",
								"-c", //make one big (5 Gb) file
								"dd if=/dev/urandom of=largefile bs=5000000000 count=1; while true; do sleep 5; done",
							},
						},
					},
				},
			},
		},
	}
	evictionTestTimeout := 40 * time.Minute
	testCondition := "Memory Pressure"
	kubeletConfigUpdate := func(initialConfig *componentconfig.KubeletConfiguration) {
		initialConfig.EvictionHard = "memory.available<10%"
		// Set large system and kube reserved values to trigger allocatable thresholds far before hard eviction thresholds.
		initialConfig.SystemReserved = componentconfig.ConfigurationMap(map[string]string{"memory": "1Gi"})
		initialConfig.KubeReserved = componentconfig.ConfigurationMap(map[string]string{"memory": "1Gi"})
		initialConfig.EnforceNodeAllocatable = []string{cm.NodeAllocatableEnforcementKey}
		initialConfig.ExperimentalNodeAllocatableIgnoreEvictionThreshold = false
		initialConfig.CgroupsPerQOS = true
	}
	runEvictionTest(f, testCondition, podTestSpecs, evictionTestTimeout, hasMemoryPressure, kubeletConfigUpdate)
})

// Returns TRUE if the node has Memory Pressure, FALSE otherwise
func hasMemoryPressure(f *framework.Framework, testCondition string) (bool, error) {
	localNodeStatus := getLocalNode(f).Status
	_, pressure := v1.GetNodeCondition(&localNodeStatus, v1.NodeMemoryPressure)
	Expect(pressure).NotTo(BeNil())
	hasPressure := pressure.Status == v1.ConditionTrue
	By(fmt.Sprintf("checking if pod has %s: %v", testCondition, hasPressure))

	// Additional Logging relating to Memory
	summary, err := getNodeSummary()
	if err != nil {
		return false, err
	}
	if summary.Node.Memory != nil && summary.Node.Memory.WorkingSetBytes != nil && summary.Node.Memory.AvailableBytes != nil {
		framework.Logf("Node.Memory.WorkingSetBytes: %d, summary.Node.Memory.AvailableBytes: %d", *summary.Node.Memory.WorkingSetBytes, *summary.Node.Memory.AvailableBytes)
	}
	for _, pod := range summary.Pods {
		framework.Logf("Pod: %s", pod.PodRef.Name)
		for _, container := range pod.Containers {
			if container.Memory != nil && container.Memory.WorkingSetBytes != nil {
				framework.Logf("--- summary Container: %s WorkingSetBytes: %d", container.Name, *container.Memory.WorkingSetBytes)
			}
		}
	}
	return hasPressure, nil
}
@@ -22,6 +22,7 @@ import (

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/kubernetes/pkg/api/v1"
	"k8s.io/kubernetes/pkg/apis/componentconfig"
	"k8s.io/kubernetes/test/e2e/framework"

	. "github.com/onsi/ginkgo"
@@ -112,10 +113,11 @@ var _ = framework.KubeDescribe("InodeEviction [Slow] [Serial] [Disruptive] [Flak
	}
	evictionTestTimeout := 30 * time.Minute
	testCondition := "Disk Pressure due to Inodes"
	// Set the EvictionHard threshold lower to decrease test time
	evictionHardLimit := "nodefs.inodesFree<50%"
	kubeletConfigUpdate := func(initialConfig *componentconfig.KubeletConfiguration) {
		initialConfig.EvictionHard = "nodefs.inodesFree<50%"
	}

	runEvictionTest(f, testCondition, podTestSpecs, evictionHardLimit, evictionTestTimeout, hasInodePressure)
	runEvictionTest(f, testCondition, podTestSpecs, evictionTestTimeout, hasInodePressure, kubeletConfigUpdate)
})

// Struct used by runEvictionTest that specifies the pod, and when that pod should be evicted, relative to other pods
@@ -133,12 +135,12 @@ type podTestSpec struct {
//		It ensures that lower evictionPriority pods are always evicted before higher evictionPriority pods (2 evicted before 1, etc.)
//		It ensures that all lower evictionPriority pods are eventually evicted.
// runEvictionTest then cleans up the testing environment by deleting provided nodes, and ensures that testCondition no longer exists
func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs []podTestSpec, evictionHard string,
	evictionTestTimeout time.Duration, hasPressureCondition func(*framework.Framework, string) (bool, error)) {
func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs []podTestSpec, evictionTestTimeout time.Duration,
	hasPressureCondition func(*framework.Framework, string) (bool, error), updateFunction func(initialConfig *componentconfig.KubeletConfiguration)) {

	Context(fmt.Sprintf("when we run containers that should cause %s", testCondition), func() {

		tempSetEvictionHard(f, evictionHard)
		tempSetCurrentKubeletConfig(f, updateFunction)
		BeforeEach(func() {
			By("seting up pods to be used by tests")
			for _, spec := range podTestSpecs {
@@ -148,6 +150,11 @@ func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs
		})

		It(fmt.Sprintf("should eventually see %s, and then evict all of the correct pods", testCondition), func() {
			configEnabled, err := isKubeletConfigEnabled(f)
			framework.ExpectNoError(err)
			if !configEnabled {
				framework.Skipf("Dynamic kubelet config must be enabled for this test to run.")
			}
			Eventually(func() error {
				hasPressure, err := hasPressureCondition(f, testCondition)
				if err != nil {
@@ -299,14 +306,8 @@ func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs

// Returns TRUE if the node has disk pressure due to inodes exists on the node, FALSE otherwise
func hasInodePressure(f *framework.Framework, testCondition string) (bool, error) {

	nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
	framework.ExpectNoError(err, "getting node list")
	if len(nodeList.Items) != 1 {
		return false, fmt.Errorf("expected 1 node, but see %d. List: %v", len(nodeList.Items), nodeList.Items)
	}

	_, pressure := v1.GetNodeCondition(&nodeList.Items[0].Status, v1.NodeDiskPressure)
	localNodeStatus := getLocalNode(f).Status
	_, pressure := v1.GetNodeCondition(&localNodeStatus, v1.NodeDiskPressure)
	Expect(pressure).NotTo(BeNil())
	hasPressure := pressure.Status == v1.ConditionTrue
	By(fmt.Sprintf("checking if pod has %s: %v", testCondition, hasPressure))
@@ -136,7 +136,7 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", fu
				By("creating a guaranteed pod, a burstable pod, and a besteffort pod.")

				// A pod is guaranteed only when requests and limits are specified for all the containers and they are equal.
				guaranteed := createMemhogPod(f, "guaranteed-", "guaranteed", v1.ResourceRequirements{
				guaranteed := getMemhogPod("guaranteed-pod", "guaranteed", v1.ResourceRequirements{
					Requests: v1.ResourceList{
						"cpu":    resource.MustParse("100m"),
						"memory": resource.MustParse("100Mi"),
@@ -145,16 +145,22 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", fu
						"cpu":    resource.MustParse("100m"),
						"memory": resource.MustParse("100Mi"),
					}})
				guaranteed = f.PodClient().CreateSync(guaranteed)
				glog.Infof("pod created with name: %s", guaranteed.Name)

				// A pod is burstable if limits and requests do not match across all containers.
				burstable := createMemhogPod(f, "burstable-", "burstable", v1.ResourceRequirements{
				burstable := getMemhogPod("burstable-pod", "burstable", v1.ResourceRequirements{
					Requests: v1.ResourceList{
						"cpu":    resource.MustParse("100m"),
						"memory": resource.MustParse("100Mi"),
					}})
				burstable = f.PodClient().CreateSync(burstable)
				glog.Infof("pod created with name: %s", burstable.Name)

				// A pod is besteffort if none of its containers have specified any requests or limits.
				besteffort := createMemhogPod(f, "besteffort-", "besteffort", v1.ResourceRequirements{})
				// A pod is besteffort if none of its containers have specified any requests or limits	.
				besteffort := getMemhogPod("besteffort-pod", "besteffort", v1.ResourceRequirements{})
				besteffort = f.PodClient().CreateSync(besteffort)
				glog.Infof("pod created with name: %s", besteffort.Name)

				// We poll until timeout or all pods are killed.
				// Inside the func, we check that all pods are in a valid phase with
@@ -232,7 +238,7 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", fu

})

func createMemhogPod(f *framework.Framework, genName string, ctnName string, res v1.ResourceRequirements) *v1.Pod {
func getMemhogPod(podName string, ctnName string, res v1.ResourceRequirements) *v1.Pod {
	env := []v1.EnvVar{
		{
			Name: "MEMORY_LIMIT",
@@ -256,9 +262,9 @@ func createMemhogPod(f *framework.Framework, genName string, ctnName string, res
		memLimit = "$(MEMORY_LIMIT)"
	}

	pod := &v1.Pod{
	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			GenerateName: genName,
			Name: podName,
		},
		Spec: v1.PodSpec{
			RestartPolicy: v1.RestartPolicyNever,
@@ -277,8 +283,4 @@ func createMemhogPod(f *framework.Framework, genName string, ctnName string, res
			},
		},
	}
	// The generated pod.Name will be on the pod spec returned by CreateSync
	pod = f.PodClient().CreateSync(pod)
	glog.Infof("pod created with name: %s", pod.Name)
	return pod
}
@@ -86,13 +86,6 @@ func getCurrentKubeletConfig() (*componentconfig.KubeletConfiguration, error) {
	return kubeCfg, nil
}

// Convenience method to set the evictionHard threshold during the current context.
func tempSetEvictionHard(f *framework.Framework, evictionHard string) {
	tempSetCurrentKubeletConfig(f, func(initialConfig *componentconfig.KubeletConfiguration) {
		initialConfig.EvictionHard = evictionHard
	})
}

// Must be called within a Context. Allows the function to modify the KubeletConfiguration during the BeforeEach of the context.
// The change is reverted in the AfterEach of the context.
// Returns true on success.
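To make the signature change above concrete, here is a hypothetical sketch of how a node e2e test might call the updated runEvictionTest helper after this PR: the separate EvictionHard string argument is gone and a kubelet-configuration update closure is passed instead. The describe string, framework name, pod spec, threshold, and timeout below are illustrative assumptions, not code from this PR.

// Hypothetical caller of the new runEvictionTest signature; values are examples only.
var _ = framework.KubeDescribe("ExampleEviction [Slow] [Serial] [Disruptive]", func() {
	f := framework.NewDefaultFramework("example-eviction-test")
	podTestSpecs := []podTestSpec{
		// evictionPriority 1: expected to be evicted before any priority-0 pod.
		{evictionPriority: 1, pod: *getMemhogPod("example-hog-pod", "example-hog", v1.ResourceRequirements{})},
	}
	kubeletConfigUpdate := func(initialConfig *componentconfig.KubeletConfiguration) {
		// Tighten the hard eviction threshold so the test triggers quickly (illustrative value).
		initialConfig.EvictionHard = "memory.available<20%"
	}
	runEvictionTest(f, "Memory Pressure", podTestSpecs, 30*time.Minute, hasMemoryPressure, kubeletConfigUpdate)
})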