Merge pull request #42204 from dashpole/allocatable_eviction

Automatic merge from submit-queue

Eviction Manager Enforces Allocatable Thresholds

This PR modifies the eviction manager to enforce node allocatable thresholds for memory as described in kubernetes/community#348.
This PR should be merged after #41234. 

cc @kubernetes/sig-node-pr-reviews @kubernetes/sig-node-feature-requests @vishh 

**Why is this a bug/regression?**

The kubelet uses `oom_score_adj` to enforce QoS policies, but `oom_score_adj` is based on the overall memory requested. This means a Burstable pod that requests a lot of memory can cause OOM kills of Guaranteed pods, which violates QoS. Even worse, we have observed system daemons such as the kubelet or kube-proxy being killed by the OOM killer.
Without this PR, v1.6 will have node stability issues and a regression in an existing GA feature, `out of resource` handling.
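
Below is a minimal, self-contained sketch of the arithmetic behind the new `allocatableMemory.available` signal, as implemented in `makeSignalObservations` further down in this diff: take the node's allocatable memory and subtract the memory working set of every pod. The helper name, example values, and the `k8s.io/apimachinery/pkg/api/resource` import path are illustrative assumptions, not code from this PR.

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// allocatableMemoryAvailable mirrors the observation added to makeSignalObservations:
// start from the node's allocatable memory and subtract each pod's memory working set.
// (Hypothetical helper for illustration only.)
func allocatableMemoryAvailable(allocatable resource.Quantity, podWorkingSets []resource.Quantity) *resource.Quantity {
	available := allocatable.Copy()
	for _, ws := range podWorkingSets {
		available.Sub(ws)
	}
	return available
}

func main() {
	// Example: 2Gi allocatable, two pods with 500Mi and 700Mi working sets.
	allocatable := resource.MustParse("2Gi")
	pods := []resource.Quantity{resource.MustParse("500Mi"), resource.MustParse("700Mi")}
	fmt.Println(allocatableMemoryAvailable(allocatable, pods).String()) // 848Mi remaining
}
```

When `EnforceNodeAllocatable` includes `cm.NodeAllocatableEnforcementKey`, `getAllocatableThreshold` (see the helpers.go hunk below) registers a hard `allocatableMemory.available` threshold of 0 with zero min-reclaim, so eviction kicks in as soon as pod memory usage exceeds the node's allocatable memory.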
Kubernetes Submit Queue 2017-03-03 20:20:12 -08:00 committed by GitHub
commit 2d319bd406
14 changed files with 474 additions and 83 deletions


@ -520,7 +520,7 @@ func run(s *options.KubeletServer, kubeDeps *kubelet.KubeletDeps) (err error) {
var hardEvictionThresholds []evictionapi.Threshold
// If the user requested to ignore eviction thresholds, then do not set valid values for hardEvictionThresholds here.
if !s.ExperimentalNodeAllocatableIgnoreEvictionThreshold {
hardEvictionThresholds, err = eviction.ParseThresholdConfig(s.EvictionHard, "", "", "")
hardEvictionThresholds, err = eviction.ParseThresholdConfig([]string{}, s.EvictionHard, "", "", "")
if err != nil {
return err
}


@ -33,6 +33,7 @@ go_test(
"//pkg/api:go_default_library",
"//pkg/api/v1:go_default_library",
"//pkg/kubelet/api/v1alpha1/stats:go_default_library",
"//pkg/kubelet/cm:go_default_library",
"//pkg/kubelet/eviction/api:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/types:go_default_library",


@ -36,6 +36,8 @@ const (
SignalImageFsAvailable Signal = "imagefs.available"
// SignalImageFsInodesFree is amount of inodes available on filesystem that container runtime uses for storing images and container writeable layers.
SignalImageFsInodesFree Signal = "imagefs.inodesFree"
// SignalAllocatableMemoryAvailable is the amount of memory available for pod allocation (i.e. allocatable - workingSet (of pods)), in bytes.
SignalAllocatableMemoryAvailable Signal = "allocatableMemory.available"
)
// ThresholdOperator is the operator used to express a Threshold.


@ -134,9 +134,9 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
}
// Start starts the control loop to observe and respond to low compute resources.
func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, monitoringInterval time.Duration) {
func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration) {
// start the eviction manager monitoring
go wait.Until(func() { m.synchronize(diskInfoProvider, podFunc) }, monitoringInterval, wait.NeverStop)
go wait.Until(func() { m.synchronize(diskInfoProvider, podFunc, nodeProvider) }, monitoringInterval, wait.NeverStop)
}
// IsUnderMemoryPressure returns true if the node is under memory pressure.
@ -187,7 +187,7 @@ func startMemoryThresholdNotifier(thresholds []evictionapi.Threshold, observatio
}
// synchronize is the main control loop that enforces eviction thresholds.
func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) {
func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) {
// if we have nothing to do, just return
thresholds := m.config.Thresholds
if len(thresholds) == 0 {
@ -209,7 +209,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
}
// make observations and get a function to derive pod usage stats relative to those observations.
observations, statsFunc, err := makeSignalObservations(m.summaryProvider)
observations, statsFunc, err := makeSignalObservations(m.summaryProvider, nodeProvider)
if err != nil {
glog.Errorf("eviction manager: unexpected err: %v", err)
return
@ -224,7 +224,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
err = startMemoryThresholdNotifier(m.config.Thresholds, observations, false, func(desc string) {
glog.Infof("soft memory eviction threshold crossed at %s", desc)
// TODO wait grace period for soft memory limit
m.synchronize(diskInfoProvider, podFunc)
m.synchronize(diskInfoProvider, podFunc, nodeProvider)
})
if err != nil {
glog.Warningf("eviction manager: failed to create hard memory threshold notifier: %v", err)
@ -232,7 +232,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// start hard memory notification
err = startMemoryThresholdNotifier(m.config.Thresholds, observations, true, func(desc string) {
glog.Infof("hard memory eviction threshold crossed at %s", desc)
m.synchronize(diskInfoProvider, podFunc)
m.synchronize(diskInfoProvider, podFunc, nodeProvider)
})
if err != nil {
glog.Warningf("eviction manager: failed to create soft memory threshold notifier: %v", err)


@ -59,6 +59,24 @@ func (m *mockDiskInfoProvider) HasDedicatedImageFs() (bool, error) {
return m.dedicatedImageFs, nil
}
func newMockNodeProvider(allocatableCapacity v1.ResourceList) *mockNodeProvider {
return &mockNodeProvider{
node: v1.Node{
Status: v1.NodeStatus{
Allocatable: allocatableCapacity,
},
},
}
}
type mockNodeProvider struct {
node v1.Node
}
func (m *mockNodeProvider) GetNode() (*v1.Node, error) {
return &m.node, nil
}
// mockImageGC is used to simulate invoking image garbage collection.
type mockImageGC struct {
err error
@ -175,6 +193,7 @@ func TestMemoryPressure(t *testing.T) {
fakeClock := clock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGC := &mockImageGC{freed: int64(0), err: nil}
nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
@ -217,7 +236,7 @@ func TestMemoryPressure(t *testing.T) {
burstablePodToAdmit, _ := podMaker("burst-admit", newResourceList("100m", "100Mi"), newResourceList("200m", "200Mi"), "0Gi")
// synchronize
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should not have memory pressure
if manager.IsUnderMemoryPressure() {
@ -235,7 +254,7 @@ func TestMemoryPressure(t *testing.T) {
// induce soft threshold
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("1500Mi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have memory pressure
if !manager.IsUnderMemoryPressure() {
@ -250,7 +269,7 @@ func TestMemoryPressure(t *testing.T) {
// step forward in time pass the grace period
fakeClock.Step(3 * time.Minute)
summaryProvider.result = summaryStatsMaker("1500Mi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have memory pressure
if !manager.IsUnderMemoryPressure() {
@ -275,7 +294,7 @@ func TestMemoryPressure(t *testing.T) {
// remove memory pressure
fakeClock.Step(20 * time.Minute)
summaryProvider.result = summaryStatsMaker("3Gi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should not have memory pressure
if manager.IsUnderMemoryPressure() {
@ -285,7 +304,7 @@ func TestMemoryPressure(t *testing.T) {
// induce memory pressure!
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("500Mi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have memory pressure
if !manager.IsUnderMemoryPressure() {
@ -313,7 +332,7 @@ func TestMemoryPressure(t *testing.T) {
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("2Gi", podStats)
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have memory pressure (because transition period not yet met)
if !manager.IsUnderMemoryPressure() {
@ -337,7 +356,7 @@ func TestMemoryPressure(t *testing.T) {
fakeClock.Step(5 * time.Minute)
summaryProvider.result = summaryStatsMaker("2Gi", podStats)
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should not have memory pressure (because transition period met)
if manager.IsUnderMemoryPressure() {
@ -392,6 +411,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
fakeClock := clock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGC := &mockImageGC{freed: int64(0), err: nil}
nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
@ -433,7 +453,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
podToAdmit, _ := podMaker("pod-to-admit", newResourceList("", ""), newResourceList("", ""), "0Gi", "0Gi", "0Gi")
// synchronize
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should not have disk pressure
if manager.IsUnderDiskPressure() {
@ -448,7 +468,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
// induce soft threshold
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("1.5Gi", "200Gi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have disk pressure
if !manager.IsUnderDiskPressure() {
@ -463,7 +483,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
// step forward in time pass the grace period
fakeClock.Step(3 * time.Minute)
summaryProvider.result = summaryStatsMaker("1.5Gi", "200Gi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have disk pressure
if !manager.IsUnderDiskPressure() {
@ -488,7 +508,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
// remove disk pressure
fakeClock.Step(20 * time.Minute)
summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should not have disk pressure
if manager.IsUnderDiskPressure() {
@ -498,7 +518,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
// induce disk pressure!
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("500Mi", "200Gi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have disk pressure
if !manager.IsUnderDiskPressure() {
@ -523,7 +543,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have disk pressure (because transition period not yet met)
if !manager.IsUnderDiskPressure() {
@ -544,7 +564,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
fakeClock.Step(5 * time.Minute)
summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should not have disk pressure (because transition period met)
if manager.IsUnderDiskPressure() {
@ -589,6 +609,7 @@ func TestMinReclaim(t *testing.T) {
fakeClock := clock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGC := &mockImageGC{freed: int64(0), err: nil}
nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
@ -622,7 +643,7 @@ func TestMinReclaim(t *testing.T) {
}
// synchronize
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should not have memory pressure
if manager.IsUnderMemoryPressure() {
@ -632,7 +653,7 @@ func TestMinReclaim(t *testing.T) {
// induce memory pressure!
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("500Mi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have memory pressure
if !manager.IsUnderMemoryPressure() {
@ -652,7 +673,7 @@ func TestMinReclaim(t *testing.T) {
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("1.2Gi", podStats)
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have memory pressure (because transition period not yet met)
if !manager.IsUnderMemoryPressure() {
@ -672,7 +693,7 @@ func TestMinReclaim(t *testing.T) {
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("2Gi", podStats)
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have memory pressure (because transition period not yet met)
if !manager.IsUnderMemoryPressure() {
@ -688,7 +709,7 @@ func TestMinReclaim(t *testing.T) {
fakeClock.Step(5 * time.Minute)
summaryProvider.result = summaryStatsMaker("2Gi", podStats)
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should not have memory pressure (because transition period met)
if manager.IsUnderMemoryPressure() {
@ -727,6 +748,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
fakeClock := clock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGcFree := resource.MustParse("700Mi")
imageGC := &mockImageGC{freed: imageGcFree.Value(), err: nil}
nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
@ -761,7 +783,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
}
// synchronize
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should not have disk pressure
if manager.IsUnderDiskPressure() {
@ -771,7 +793,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
// induce hard threshold
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker(".9Gi", "200Gi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have disk pressure
if !manager.IsUnderDiskPressure() {
@ -794,7 +816,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
// remove disk pressure
fakeClock.Step(20 * time.Minute)
summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should not have disk pressure
if manager.IsUnderDiskPressure() {
@ -804,7 +826,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
// induce disk pressure!
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("400Mi", "200Gi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have disk pressure
if !manager.IsUnderDiskPressure() {
@ -830,7 +852,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
imageGC.invoked = false // reset state
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have disk pressure (because transition period not yet met)
if !manager.IsUnderDiskPressure() {
@ -852,7 +874,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
imageGC.invoked = false // reset state
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should not have disk pressure (because transition period met)
if manager.IsUnderDiskPressure() {
@ -920,6 +942,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
fakeClock := clock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGC := &mockImageGC{freed: int64(0), err: nil}
nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
@ -961,7 +984,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
podToAdmit, _ := podMaker("pod-to-admit", newResourceList("", ""), newResourceList("", ""), "0", "0", "0")
// synchronize
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should not have disk pressure
if manager.IsUnderDiskPressure() {
@ -976,7 +999,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
// induce soft threshold
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have disk pressure
if !manager.IsUnderDiskPressure() {
@ -991,7 +1014,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
// step forward in time pass the grace period
fakeClock.Step(3 * time.Minute)
summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have disk pressure
if !manager.IsUnderDiskPressure() {
@ -1016,7 +1039,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
// remove inode pressure
fakeClock.Step(20 * time.Minute)
summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should not have disk pressure
if manager.IsUnderDiskPressure() {
@ -1026,7 +1049,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
// induce inode pressure!
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("0.5Mi", "4Mi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have disk pressure
if !manager.IsUnderDiskPressure() {
@ -1051,7 +1074,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats)
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have disk pressure (because transition period not yet met)
if !manager.IsUnderDiskPressure() {
@ -1072,7 +1095,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
fakeClock.Step(5 * time.Minute)
summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats)
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should not have disk pressure (because transition period met)
if manager.IsUnderDiskPressure() {
@ -1120,6 +1143,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) {
fakeClock := clock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGC := &mockImageGC{freed: int64(0), err: nil}
nodeRef := &clientv1.ObjectReference{
Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: "",
@ -1164,7 +1188,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) {
// induce soft threshold
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("1500Mi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have memory pressure
if !manager.IsUnderMemoryPressure() {
@ -1179,7 +1203,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) {
// step forward in time pass the grace period
fakeClock.Step(3 * time.Minute)
summaryProvider.result = summaryStatsMaker("1500Mi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have memory pressure
if !manager.IsUnderMemoryPressure() {
@ -1197,7 +1221,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) {
// remove memory pressure
fakeClock.Step(20 * time.Minute)
summaryProvider.result = summaryStatsMaker("3Gi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should not have memory pressure
if manager.IsUnderMemoryPressure() {
@ -1210,7 +1234,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) {
// induce memory pressure!
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("500Mi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have memory pressure
if !manager.IsUnderMemoryPressure() {
@ -1222,3 +1246,167 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
}
// TestAllocatableMemoryPressure
func TestAllocatableMemoryPressure(t *testing.T) {
podMaker := makePodWithMemoryStats
summaryStatsMaker := makeMemoryStats
constantCapacity := "4Gi"
podsToMake := []podToMake{
{name: "guaranteed-low", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), memoryWorkingSet: "200Mi"},
{name: "guaranteed-high", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), memoryWorkingSet: "400Mi"},
{name: "burstable-low", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi"), memoryWorkingSet: "300Mi"},
{name: "burstable-high", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi"), memoryWorkingSet: "500Mi"},
{name: "best-effort-low", requests: newResourceList("", ""), limits: newResourceList("", ""), memoryWorkingSet: "100Mi"},
{name: "best-effort-high", requests: newResourceList("", ""), limits: newResourceList("", ""), memoryWorkingSet: "200Mi"},
}
pods := []*v1.Pod{}
podStats := map[*v1.Pod]statsapi.PodStats{}
for _, podToMake := range podsToMake {
pod, podStat := podMaker(podToMake.name, podToMake.requests, podToMake.limits, podToMake.memoryWorkingSet)
pods = append(pods, pod)
podStats[pod] = podStat
}
podToEvict := pods[5]
activePodsFunc := func() []*v1.Pod {
return pods
}
fakeClock := clock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
imageGC := &mockImageGC{freed: int64(0), err: nil}
nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
MaxPodGracePeriodSeconds: 5,
PressureTransitionPeriod: time.Minute * 5,
Thresholds: []evictionapi.Threshold{
{
Signal: evictionapi.SignalAllocatableMemoryAvailable,
Operator: evictionapi.OpLessThan,
Value: evictionapi.ThresholdValue{
Quantity: quantityMustParse("1Ki"),
},
},
},
}
summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker(constantCapacity, podStats)}
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: imageGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
}
// create a best effort pod to test admission
bestEffortPodToAdmit, _ := podMaker("best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi")
burstablePodToAdmit, _ := podMaker("burst-admit", newResourceList("100m", "100Mi"), newResourceList("200m", "200Mi"), "0Gi")
// synchronize
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should not have memory pressure
if manager.IsUnderMemoryPressure() {
t.Errorf("Manager should not report memory pressure")
}
// try to admit our pods (they should succeed)
expected := []bool{true, true}
for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
}
}
// induce memory pressure!
fakeClock.Step(1 * time.Minute)
pod, podStat := podMaker("guaranteed-high-2", newResourceList("100m", "1Gi"), newResourceList("100m", "1Gi"), "1Gi")
podStats[pod] = podStat
summaryProvider.result = summaryStatsMaker(constantCapacity, podStats)
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have memory pressure
if !manager.IsUnderMemoryPressure() {
t.Errorf("Manager should report memory pressure")
}
// check the right pod was killed
if podKiller.pod != podToEvict {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod := *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
}
// reset state
podKiller.pod = nil
podKiller.gracePeriodOverride = nil
// the best-effort pod should not admit, burstable should
expected = []bool{false, true}
for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
}
}
// reduce memory pressure
fakeClock.Step(1 * time.Minute)
for pod := range podStats {
if pod.Name == "guaranteed-high-2" {
delete(podStats, pod)
}
}
summaryProvider.result = summaryStatsMaker(constantCapacity, podStats)
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should have memory pressure (because transition period not yet met)
if !manager.IsUnderMemoryPressure() {
t.Errorf("Manager should report memory pressure")
}
// no pod should have been killed
if podKiller.pod != nil {
t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name)
}
// the best-effort pod should not admit, burstable should
expected = []bool{false, true}
for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
}
}
// move the clock past transition period to ensure that we stop reporting pressure
fakeClock.Step(5 * time.Minute)
summaryProvider.result = summaryStatsMaker(constantCapacity, podStats)
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
// we should not have memory pressure (because transition period met)
if manager.IsUnderMemoryPressure() {
t.Errorf("Manager should not report memory pressure")
}
// no pod should have been killed
if podKiller.pod != nil {
t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name)
}
// all pods should admit now
expected = []bool{true, true}
for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
}
}
}


@ -29,6 +29,7 @@ import (
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/v1"
statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats"
"k8s.io/kubernetes/pkg/kubelet/cm"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
"k8s.io/kubernetes/pkg/kubelet/qos"
"k8s.io/kubernetes/pkg/kubelet/server/stats"
@ -68,6 +69,7 @@ func init() {
// map eviction signals to node conditions
signalToNodeCondition = map[evictionapi.Signal]v1.NodeConditionType{}
signalToNodeCondition[evictionapi.SignalMemoryAvailable] = v1.NodeMemoryPressure
signalToNodeCondition[evictionapi.SignalAllocatableMemoryAvailable] = v1.NodeMemoryPressure
signalToNodeCondition[evictionapi.SignalImageFsAvailable] = v1.NodeDiskPressure
signalToNodeCondition[evictionapi.SignalNodeFsAvailable] = v1.NodeDiskPressure
signalToNodeCondition[evictionapi.SignalImageFsInodesFree] = v1.NodeDiskPressure
@ -76,6 +78,7 @@ func init() {
// map signals to resources (and vice-versa)
signalToResource = map[evictionapi.Signal]v1.ResourceName{}
signalToResource[evictionapi.SignalMemoryAvailable] = v1.ResourceMemory
signalToResource[evictionapi.SignalAllocatableMemoryAvailable] = v1.ResourceMemory
signalToResource[evictionapi.SignalImageFsAvailable] = resourceImageFs
signalToResource[evictionapi.SignalImageFsInodesFree] = resourceImageFsInodes
signalToResource[evictionapi.SignalNodeFsAvailable] = resourceNodeFs
@ -93,8 +96,10 @@ func validSignal(signal evictionapi.Signal) bool {
}
// ParseThresholdConfig parses the flags for thresholds.
func ParseThresholdConfig(evictionHard, evictionSoft, evictionSoftGracePeriod, evictionMinimumReclaim string) ([]evictionapi.Threshold, error) {
func ParseThresholdConfig(allocatableConfig []string, evictionHard, evictionSoft, evictionSoftGracePeriod, evictionMinimumReclaim string) ([]evictionapi.Threshold, error) {
results := []evictionapi.Threshold{}
allocatableThresholds := getAllocatableThreshold(allocatableConfig)
results = append(results, allocatableThresholds...)
hardThresholds, err := parseThresholdStatements(evictionHard)
if err != nil {
@ -214,6 +219,27 @@ func parseThresholdStatement(statement string) (evictionapi.Threshold, error) {
}, nil
}
// getAllocatableThreshold returns the thresholds applicable for the allocatable configuration
func getAllocatableThreshold(allocatableConfig []string) []evictionapi.Threshold {
for _, key := range allocatableConfig {
if key == cm.NodeAllocatableEnforcementKey {
return []evictionapi.Threshold{
{
Signal: evictionapi.SignalAllocatableMemoryAvailable,
Operator: evictionapi.OpLessThan,
Value: evictionapi.ThresholdValue{
Quantity: resource.NewQuantity(int64(0), resource.BinarySI),
},
MinReclaim: &evictionapi.ThresholdValue{
Quantity: resource.NewQuantity(int64(0), resource.BinarySI),
},
},
}
}
}
return []evictionapi.Threshold{}
}
// parsePercentage parses a string representing a percentage value
func parsePercentage(input string) (float32, error) {
value, err := strconv.ParseFloat(strings.TrimRight(input, "%"), 32)
@ -611,11 +637,15 @@ func (a byEvictionPriority) Less(i, j int) bool {
}
// makeSignalObservations derives observations using the specified summary provider.
func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObservations, statsFunc, error) {
func makeSignalObservations(summaryProvider stats.SummaryProvider, nodeProvider NodeProvider) (signalObservations, statsFunc, error) {
summary, err := summaryProvider.Get()
if err != nil {
return nil, nil, err
}
node, err := nodeProvider.GetNode()
if err != nil {
return nil, nil, err
}
// build the function to work against for pod stats
statsFunc := cachedStatsFunc(summary.Pods)
@ -663,6 +693,19 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObserv
}
}
}
if memoryAllocatableCapacity, ok := node.Status.Allocatable[v1.ResourceMemory]; ok {
memoryAllocatableAvailable := memoryAllocatableCapacity.Copy()
for _, pod := range summary.Pods {
mu, err := podMemoryUsage(pod)
if err == nil {
memoryAllocatableAvailable.Sub(mu[v1.ResourceMemory])
}
}
result[evictionapi.SignalAllocatableMemoryAvailable] = signalObservation{
available: memoryAllocatableAvailable,
capacity: memoryAllocatableCapacity.Copy(),
}
}
return result, statsFunc, nil
}


@ -28,6 +28,7 @@ import (
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/v1"
statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats"
"k8s.io/kubernetes/pkg/kubelet/cm"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
"k8s.io/kubernetes/pkg/quota"
)
@ -40,6 +41,7 @@ func quantityMustParse(value string) *resource.Quantity {
func TestParseThresholdConfig(t *testing.T) {
gracePeriod, _ := time.ParseDuration("30s")
testCases := map[string]struct {
allocatableConfig []string
evictionHard string
evictionSoft string
evictionSoftGracePeriod string
@ -48,6 +50,7 @@ func TestParseThresholdConfig(t *testing.T) {
expectThresholds []evictionapi.Threshold
}{
"no values": {
allocatableConfig: []string{},
evictionHard: "",
evictionSoft: "",
evictionSoftGracePeriod: "",
@ -56,12 +59,23 @@ func TestParseThresholdConfig(t *testing.T) {
expectThresholds: []evictionapi.Threshold{},
},
"all flag values": {
allocatableConfig: []string{cm.NodeAllocatableEnforcementKey},
evictionHard: "memory.available<150Mi",
evictionSoft: "memory.available<300Mi",
evictionSoftGracePeriod: "memory.available=30s",
evictionMinReclaim: "memory.available=0",
expectErr: false,
expectThresholds: []evictionapi.Threshold{
{
Signal: evictionapi.SignalAllocatableMemoryAvailable,
Operator: evictionapi.OpLessThan,
Value: evictionapi.ThresholdValue{
Quantity: quantityMustParse("0"),
},
MinReclaim: &evictionapi.ThresholdValue{
Quantity: quantityMustParse("0"),
},
},
{
Signal: evictionapi.SignalMemoryAvailable,
Operator: evictionapi.OpLessThan,
@ -86,6 +100,7 @@ func TestParseThresholdConfig(t *testing.T) {
},
},
"all flag values in percentages": {
allocatableConfig: []string{},
evictionHard: "memory.available<10%",
evictionSoft: "memory.available<30%",
evictionSoftGracePeriod: "memory.available=30s",
@ -116,6 +131,7 @@ func TestParseThresholdConfig(t *testing.T) {
},
},
"disk flag values": {
allocatableConfig: []string{},
evictionHard: "imagefs.available<150Mi,nodefs.available<100Mi",
evictionSoft: "imagefs.available<300Mi,nodefs.available<200Mi",
evictionSoftGracePeriod: "imagefs.available=30s,nodefs.available=30s",
@ -167,6 +183,7 @@ func TestParseThresholdConfig(t *testing.T) {
},
},
"disk flag values in percentages": {
allocatableConfig: []string{},
evictionHard: "imagefs.available<15%,nodefs.available<10.5%",
evictionSoft: "imagefs.available<30%,nodefs.available<20.5%",
evictionSoftGracePeriod: "imagefs.available=30s,nodefs.available=30s",
@ -218,6 +235,7 @@ func TestParseThresholdConfig(t *testing.T) {
},
},
"inode flag values": {
allocatableConfig: []string{},
evictionHard: "imagefs.inodesFree<150Mi,nodefs.inodesFree<100Mi",
evictionSoft: "imagefs.inodesFree<300Mi,nodefs.inodesFree<200Mi",
evictionSoftGracePeriod: "imagefs.inodesFree=30s,nodefs.inodesFree=30s",
@ -269,6 +287,7 @@ func TestParseThresholdConfig(t *testing.T) {
},
},
"invalid-signal": {
allocatableConfig: []string{},
evictionHard: "mem.available<150Mi",
evictionSoft: "",
evictionSoftGracePeriod: "",
@ -277,6 +296,7 @@ func TestParseThresholdConfig(t *testing.T) {
expectThresholds: []evictionapi.Threshold{},
},
"hard-signal-negative": {
allocatableConfig: []string{},
evictionHard: "memory.available<-150Mi",
evictionSoft: "",
evictionSoftGracePeriod: "",
@ -285,6 +305,7 @@ func TestParseThresholdConfig(t *testing.T) {
expectThresholds: []evictionapi.Threshold{},
},
"hard-signal-negative-percentage": {
allocatableConfig: []string{},
evictionHard: "memory.available<-15%",
evictionSoft: "",
evictionSoftGracePeriod: "",
@ -293,6 +314,7 @@ func TestParseThresholdConfig(t *testing.T) {
expectThresholds: []evictionapi.Threshold{},
},
"soft-signal-negative": {
allocatableConfig: []string{},
evictionHard: "",
evictionSoft: "memory.available<-150Mi",
evictionSoftGracePeriod: "",
@ -301,6 +323,7 @@ func TestParseThresholdConfig(t *testing.T) {
expectThresholds: []evictionapi.Threshold{},
},
"duplicate-signal": {
allocatableConfig: []string{},
evictionHard: "memory.available<150Mi,memory.available<100Mi",
evictionSoft: "",
evictionSoftGracePeriod: "",
@ -309,6 +332,7 @@ func TestParseThresholdConfig(t *testing.T) {
expectThresholds: []evictionapi.Threshold{},
},
"valid-and-invalid-signal": {
allocatableConfig: []string{},
evictionHard: "memory.available<150Mi,invalid.foo<150Mi",
evictionSoft: "",
evictionSoftGracePeriod: "",
@ -317,6 +341,7 @@ func TestParseThresholdConfig(t *testing.T) {
expectThresholds: []evictionapi.Threshold{},
},
"soft-no-grace-period": {
allocatableConfig: []string{},
evictionHard: "",
evictionSoft: "memory.available<150Mi",
evictionSoftGracePeriod: "",
@ -325,6 +350,7 @@ func TestParseThresholdConfig(t *testing.T) {
expectThresholds: []evictionapi.Threshold{},
},
"soft-neg-grace-period": {
allocatableConfig: []string{},
evictionHard: "",
evictionSoft: "memory.available<150Mi",
evictionSoftGracePeriod: "memory.available=-30s",
@ -333,6 +359,7 @@ func TestParseThresholdConfig(t *testing.T) {
expectThresholds: []evictionapi.Threshold{},
},
"neg-reclaim": {
allocatableConfig: []string{},
evictionHard: "",
evictionSoft: "",
evictionSoftGracePeriod: "",
@ -341,6 +368,7 @@ func TestParseThresholdConfig(t *testing.T) {
expectThresholds: []evictionapi.Threshold{},
},
"duplicate-reclaim": {
allocatableConfig: []string{},
evictionHard: "",
evictionSoft: "",
evictionSoftGracePeriod: "",
@ -350,7 +378,7 @@ func TestParseThresholdConfig(t *testing.T) {
},
}
for testName, testCase := range testCases {
thresholds, err := ParseThresholdConfig(testCase.evictionHard, testCase.evictionSoft, testCase.evictionSoftGracePeriod, testCase.evictionMinReclaim)
thresholds, err := ParseThresholdConfig(testCase.allocatableConfig, testCase.evictionHard, testCase.evictionSoft, testCase.evictionSoftGracePeriod, testCase.evictionMinReclaim)
if testCase.expectErr != (err != nil) {
t.Errorf("Err not as expected, test: %v, error expected: %v, actual: %v", testName, testCase.expectErr, err)
}
@ -699,6 +727,7 @@ func TestMakeSignalObservations(t *testing.T) {
}
nodeAvailableBytes := uint64(1024 * 1024 * 1024)
nodeWorkingSetBytes := uint64(1024 * 1024 * 1024)
allocatableMemoryCapacity := uint64(5 * 1024 * 1024 * 1024)
imageFsAvailableBytes := uint64(1024 * 1024)
imageFsCapacityBytes := uint64(1024 * 1024 * 2)
nodeFsAvailableBytes := uint64(1024)
@ -738,15 +767,31 @@ func TestMakeSignalObservations(t *testing.T) {
podMaker("pod1", "ns2", "uuid2", 1),
podMaker("pod3", "ns3", "uuid3", 1),
}
containerWorkingSetBytes := int64(1024 * 1024)
containerWorkingSetBytes := int64(1024 * 1024 * 1024)
for _, pod := range pods {
fakeStats.Pods = append(fakeStats.Pods, newPodStats(pod, containerWorkingSetBytes))
}
actualObservations, statsFunc, err := makeSignalObservations(provider)
res := quantityMustParse("5Gi")
nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *res})
// Allocatable thresholds are always 100%. Verify that Threshold == Capacity.
if res.CmpInt64(int64(allocatableMemoryCapacity)) != 0 {
t.Errorf("Expected Threshold %v to be equal to value %v", res.Value(), allocatableMemoryCapacity)
}
actualObservations, statsFunc, err := makeSignalObservations(provider, nodeProvider)
if err != nil {
t.Errorf("Unexpected err: %v", err)
}
allocatableMemQuantity, found := actualObservations[evictionapi.SignalAllocatableMemoryAvailable]
if !found {
t.Errorf("Expected allocatable memory observation, but didnt find one")
}
if allocatableMemQuantity.available.Value() != 2*containerWorkingSetBytes {
t.Errorf("Expected %v, actual: %v", containerWorkingSetBytes, allocatableMemQuantity.available.Value())
}
if expectedBytes := int64(allocatableMemoryCapacity); allocatableMemQuantity.capacity.Value() != expectedBytes {
t.Errorf("Expected %v, actual: %v", expectedBytes, allocatableMemQuantity.capacity.Value())
}
memQuantity, found := actualObservations[evictionapi.SignalMemoryAvailable]
if !found {
t.Errorf("Expected available memory observation: %v", err)


@ -53,7 +53,7 @@ type Config struct {
// Manager evaluates when an eviction threshold for node stability has been met on the node.
type Manager interface {
// Start starts the control loop to monitor eviction thresholds at specified interval.
Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, monitoringInterval time.Duration)
Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration)
// IsUnderMemoryPressure returns true if the node is under memory pressure.
IsUnderMemoryPressure() bool
@ -68,6 +68,12 @@ type DiskInfoProvider interface {
HasDedicatedImageFs() (bool, error)
}
// NodeProvider is responsible for providing the node api object describing this node
type NodeProvider interface {
// GetNode returns the node info for this node
GetNode() (*v1.Node, error)
}
// ImageGC is responsible for performing garbage collection of unused images.
type ImageGC interface {
// DeleteUnusedImages deletes unused images and returns the number of bytes freed, or an error.


@ -349,7 +349,12 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
RootFreeDiskMB: int(kubeCfg.LowDiskSpaceThresholdMB),
}
thresholds, err := eviction.ParseThresholdConfig(kubeCfg.EvictionHard, kubeCfg.EvictionSoft, kubeCfg.EvictionSoftGracePeriod, kubeCfg.EvictionMinimumReclaim)
enforceNodeAllocatable := kubeCfg.EnforceNodeAllocatable
if kubeCfg.ExperimentalNodeAllocatableIgnoreEvictionThreshold {
// Do not provide kubeCfg.EnforceNodeAllocatable to eviction threshold parsing if we are not enforcing Evictions
enforceNodeAllocatable = []string{}
}
thresholds, err := eviction.ParseThresholdConfig(enforceNodeAllocatable, kubeCfg.EvictionHard, kubeCfg.EvictionSoft, kubeCfg.EvictionSoftGracePeriod, kubeCfg.EvictionMinimumReclaim)
if err != nil {
return nil, err
}
@ -1222,7 +1227,7 @@ func (kl *Kubelet) initializeRuntimeDependentModules() {
glog.Fatalf("Failed to start cAdvisor %v", err)
}
// eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs
kl.evictionManager.Start(kl, kl.getActivePods, evictionMonitoringPeriod)
kl.evictionManager.Start(kl, kl.getActivePods, kl, evictionMonitoringPeriod)
}
// Run starts the kubelet reacting to config updates


@ -57,6 +57,7 @@ go_library(
go_test(
name = "go_default_test",
srcs = [
"allocatable_eviction_test.go",
"apparmor_test.go",
"container_manager_test.go",
"critical_pod_test.go",


@ -0,0 +1,104 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2e_node
import (
"fmt"
"time"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/kubernetes/pkg/api/v1"
"k8s.io/kubernetes/pkg/apis/componentconfig"
"k8s.io/kubernetes/pkg/kubelet/cm"
"k8s.io/kubernetes/test/e2e/framework"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
)
// Eviction Policy is described here:
// https://github.com/kubernetes/kubernetes/blob/master/docs/proposals/kubelet-eviction.md
var _ = framework.KubeDescribe("AllocatableEviction [Slow] [Serial] [Disruptive] [Flaky]", func() {
f := framework.NewDefaultFramework("allocatable-eviction-test")
podTestSpecs := []podTestSpec{
{
evictionPriority: 1, // This pod should be evicted before the innocent pod
pod: *getMemhogPod("memory-hog-pod", "memory-hog", v1.ResourceRequirements{}),
},
{
evictionPriority: 0, // This pod should never be evicted
pod: v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: "innocent-pod"},
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyNever,
Containers: []v1.Container{
{
Image: "gcr.io/google_containers/busybox:1.24",
Name: "normal-memory-usage-container",
Command: []string{
"sh",
"-c", //make one big (5 Gb) file
"dd if=/dev/urandom of=largefile bs=5000000000 count=1; while true; do sleep 5; done",
},
},
},
},
},
},
}
evictionTestTimeout := 40 * time.Minute
testCondition := "Memory Pressure"
kubeletConfigUpdate := func(initialConfig *componentconfig.KubeletConfiguration) {
initialConfig.EvictionHard = "memory.available<10%"
// Set large system and kube reserved values to trigger allocatable thresholds far before hard eviction thresholds.
initialConfig.SystemReserved = componentconfig.ConfigurationMap(map[string]string{"memory": "1Gi"})
initialConfig.KubeReserved = componentconfig.ConfigurationMap(map[string]string{"memory": "1Gi"})
initialConfig.EnforceNodeAllocatable = []string{cm.NodeAllocatableEnforcementKey}
initialConfig.ExperimentalNodeAllocatableIgnoreEvictionThreshold = false
initialConfig.CgroupsPerQOS = true
}
runEvictionTest(f, testCondition, podTestSpecs, evictionTestTimeout, hasMemoryPressure, kubeletConfigUpdate)
})
// Returns TRUE if the node has Memory Pressure, FALSE otherwise
func hasMemoryPressure(f *framework.Framework, testCondition string) (bool, error) {
localNodeStatus := getLocalNode(f).Status
_, pressure := v1.GetNodeCondition(&localNodeStatus, v1.NodeMemoryPressure)
Expect(pressure).NotTo(BeNil())
hasPressure := pressure.Status == v1.ConditionTrue
By(fmt.Sprintf("checking if pod has %s: %v", testCondition, hasPressure))
// Additional Logging relating to Memory
summary, err := getNodeSummary()
if err != nil {
return false, err
}
if summary.Node.Memory != nil && summary.Node.Memory.WorkingSetBytes != nil && summary.Node.Memory.AvailableBytes != nil {
framework.Logf("Node.Memory.WorkingSetBytes: %d, summary.Node.Memory.AvailableBytes: %d", *summary.Node.Memory.WorkingSetBytes, *summary.Node.Memory.AvailableBytes)
}
for _, pod := range summary.Pods {
framework.Logf("Pod: %s", pod.PodRef.Name)
for _, container := range pod.Containers {
if container.Memory != nil && container.Memory.WorkingSetBytes != nil {
framework.Logf("--- summary Container: %s WorkingSetBytes: %d", container.Name, *container.Memory.WorkingSetBytes)
}
}
}
return hasPressure, nil
}


@ -22,6 +22,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/kubernetes/pkg/api/v1"
"k8s.io/kubernetes/pkg/apis/componentconfig"
"k8s.io/kubernetes/test/e2e/framework"
. "github.com/onsi/ginkgo"
@ -112,10 +113,11 @@ var _ = framework.KubeDescribe("InodeEviction [Slow] [Serial] [Disruptive] [Flak
}
evictionTestTimeout := 30 * time.Minute
testCondition := "Disk Pressure due to Inodes"
// Set the EvictionHard threshold lower to decrease test time
evictionHardLimit := "nodefs.inodesFree<50%"
kubeletConfigUpdate := func(initialConfig *componentconfig.KubeletConfiguration) {
initialConfig.EvictionHard = "nodefs.inodesFree<50%"
}
runEvictionTest(f, testCondition, podTestSpecs, evictionHardLimit, evictionTestTimeout, hasInodePressure)
runEvictionTest(f, testCondition, podTestSpecs, evictionTestTimeout, hasInodePressure, kubeletConfigUpdate)
})
// Struct used by runEvictionTest that specifies the pod, and when that pod should be evicted, relative to other pods
@ -133,12 +135,12 @@ type podTestSpec struct {
// It ensures that lower evictionPriority pods are always evicted before higher evictionPriority pods (2 evicted before 1, etc.)
// It ensures that all lower evictionPriority pods are eventually evicted.
// runEvictionTest then cleans up the testing environment by deleting provided nodes, and ensures that testCondition no longer exists
func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs []podTestSpec, evictionHard string,
evictionTestTimeout time.Duration, hasPressureCondition func(*framework.Framework, string) (bool, error)) {
func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs []podTestSpec, evictionTestTimeout time.Duration,
hasPressureCondition func(*framework.Framework, string) (bool, error), updateFunction func(initialConfig *componentconfig.KubeletConfiguration)) {
Context(fmt.Sprintf("when we run containers that should cause %s", testCondition), func() {
tempSetEvictionHard(f, evictionHard)
tempSetCurrentKubeletConfig(f, updateFunction)
BeforeEach(func() {
By("seting up pods to be used by tests")
for _, spec := range podTestSpecs {
@ -148,6 +150,11 @@ func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs
})
It(fmt.Sprintf("should eventually see %s, and then evict all of the correct pods", testCondition), func() {
configEnabled, err := isKubeletConfigEnabled(f)
framework.ExpectNoError(err)
if !configEnabled {
framework.Skipf("Dynamic kubelet config must be enabled for this test to run.")
}
Eventually(func() error {
hasPressure, err := hasPressureCondition(f, testCondition)
if err != nil {
@ -299,14 +306,8 @@ func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs
// Returns TRUE if the node has disk pressure due to inodes, FALSE otherwise
func hasInodePressure(f *framework.Framework, testCondition string) (bool, error) {
nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
framework.ExpectNoError(err, "getting node list")
if len(nodeList.Items) != 1 {
return false, fmt.Errorf("expected 1 node, but see %d. List: %v", len(nodeList.Items), nodeList.Items)
}
_, pressure := v1.GetNodeCondition(&nodeList.Items[0].Status, v1.NodeDiskPressure)
localNodeStatus := getLocalNode(f).Status
_, pressure := v1.GetNodeCondition(&localNodeStatus, v1.NodeDiskPressure)
Expect(pressure).NotTo(BeNil())
hasPressure := pressure.Status == v1.ConditionTrue
By(fmt.Sprintf("checking if pod has %s: %v", testCondition, hasPressure))


@ -136,7 +136,7 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", fu
By("creating a guaranteed pod, a burstable pod, and a besteffort pod.")
// A pod is guaranteed only when requests and limits are specified for all the containers and they are equal.
guaranteed := createMemhogPod(f, "guaranteed-", "guaranteed", v1.ResourceRequirements{
guaranteed := getMemhogPod("guaranteed-pod", "guaranteed", v1.ResourceRequirements{
Requests: v1.ResourceList{
"cpu": resource.MustParse("100m"),
"memory": resource.MustParse("100Mi"),
@ -145,16 +145,22 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", fu
"cpu": resource.MustParse("100m"),
"memory": resource.MustParse("100Mi"),
}})
guaranteed = f.PodClient().CreateSync(guaranteed)
glog.Infof("pod created with name: %s", guaranteed.Name)
// A pod is burstable if limits and requests do not match across all containers.
burstable := createMemhogPod(f, "burstable-", "burstable", v1.ResourceRequirements{
burstable := getMemhogPod("burstable-pod", "burstable", v1.ResourceRequirements{
Requests: v1.ResourceList{
"cpu": resource.MustParse("100m"),
"memory": resource.MustParse("100Mi"),
}})
burstable = f.PodClient().CreateSync(burstable)
glog.Infof("pod created with name: %s", burstable.Name)
// A pod is besteffort if none of its containers have specified any requests or limits.
besteffort := createMemhogPod(f, "besteffort-", "besteffort", v1.ResourceRequirements{})
// A pod is besteffort if none of its containers have specified any requests or limits .
besteffort := getMemhogPod("besteffort-pod", "besteffort", v1.ResourceRequirements{})
besteffort = f.PodClient().CreateSync(besteffort)
glog.Infof("pod created with name: %s", besteffort.Name)
// We poll until timeout or all pods are killed.
// Inside the func, we check that all pods are in a valid phase with
@ -232,7 +238,7 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", fu
})
func createMemhogPod(f *framework.Framework, genName string, ctnName string, res v1.ResourceRequirements) *v1.Pod {
func getMemhogPod(podName string, ctnName string, res v1.ResourceRequirements) *v1.Pod {
env := []v1.EnvVar{
{
Name: "MEMORY_LIMIT",
@ -256,9 +262,9 @@ func createMemhogPod(f *framework.Framework, genName string, ctnName string, res
memLimit = "$(MEMORY_LIMIT)"
}
pod := &v1.Pod{
return &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
GenerateName: genName,
Name: podName,
},
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyNever,
@ -277,8 +283,4 @@ func createMemhogPod(f *framework.Framework, genName string, ctnName string, res
},
},
}
// The generated pod.Name will be on the pod spec returned by CreateSync
pod = f.PodClient().CreateSync(pod)
glog.Infof("pod created with name: %s", pod.Name)
return pod
}


@ -86,13 +86,6 @@ func getCurrentKubeletConfig() (*componentconfig.KubeletConfiguration, error) {
return kubeCfg, nil
}
// Convenience method to set the evictionHard threshold during the current context.
func tempSetEvictionHard(f *framework.Framework, evictionHard string) {
tempSetCurrentKubeletConfig(f, func(initialConfig *componentconfig.KubeletConfiguration) {
initialConfig.EvictionHard = evictionHard
})
}
// Must be called within a Context. Allows the function to modify the KubeletConfiguration during the BeforeEach of the context.
// The change is reverted in the AfterEach of the context.
// Returns true on success.