From ac612eab8ec9fb466f135a3501079657e1aa2350 Mon Sep 17 00:00:00 2001 From: David Ashpole Date: Wed, 1 Mar 2017 17:56:24 -0800 Subject: [PATCH] eviction manager changes for allocatable --- cmd/kubelet/app/server.go | 2 +- pkg/kubelet/eviction/BUILD | 1 + pkg/kubelet/eviction/api/types.go | 2 + pkg/kubelet/eviction/eviction_manager.go | 12 +- pkg/kubelet/eviction/eviction_manager_test.go | 260 +++++++++++++++--- pkg/kubelet/eviction/helpers.go | 47 +++- pkg/kubelet/eviction/helpers_test.go | 51 +++- pkg/kubelet/eviction/types.go | 8 +- pkg/kubelet/kubelet.go | 9 +- test/e2e_node/BUILD | 1 + test/e2e_node/allocatable_eviction_test.go | 104 +++++++ test/e2e_node/inode_eviction_test.go | 29 +- test/e2e_node/memory_eviction_test.go | 24 +- test/e2e_node/util.go | 7 - 14 files changed, 474 insertions(+), 83 deletions(-) create mode 100644 test/e2e_node/allocatable_eviction_test.go diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go index 89f5639ca20..a8a46ad712e 100644 --- a/cmd/kubelet/app/server.go +++ b/cmd/kubelet/app/server.go @@ -520,7 +520,7 @@ func run(s *options.KubeletServer, kubeDeps *kubelet.KubeletDeps) (err error) { var hardEvictionThresholds []evictionapi.Threshold // If the user requested to ignore eviction thresholds, then do not set valid values for hardEvictionThresholds here. if !s.ExperimentalNodeAllocatableIgnoreEvictionThreshold { - hardEvictionThresholds, err = eviction.ParseThresholdConfig(s.EvictionHard, "", "", "") + hardEvictionThresholds, err = eviction.ParseThresholdConfig([]string{}, s.EvictionHard, "", "", "") if err != nil { return err } diff --git a/pkg/kubelet/eviction/BUILD b/pkg/kubelet/eviction/BUILD index 350e3083458..f37d12b6f2c 100644 --- a/pkg/kubelet/eviction/BUILD +++ b/pkg/kubelet/eviction/BUILD @@ -33,6 +33,7 @@ go_test( "//pkg/api:go_default_library", "//pkg/api/v1:go_default_library", "//pkg/kubelet/api/v1alpha1/stats:go_default_library", + "//pkg/kubelet/cm:go_default_library", "//pkg/kubelet/eviction/api:go_default_library", "//pkg/kubelet/lifecycle:go_default_library", "//pkg/kubelet/types:go_default_library", diff --git a/pkg/kubelet/eviction/api/types.go b/pkg/kubelet/eviction/api/types.go index 306b4d787ca..2928688a120 100644 --- a/pkg/kubelet/eviction/api/types.go +++ b/pkg/kubelet/eviction/api/types.go @@ -36,6 +36,8 @@ const ( SignalImageFsAvailable Signal = "imagefs.available" // SignalImageFsInodesFree is amount of inodes available on filesystem that container runtime uses for storing images and container writeable layers. SignalImageFsInodesFree Signal = "imagefs.inodesFree" + // SignalAllocatableMemoryAvailable is amount of memory available for pod allocation (i.e. allocatable - workingSet (of pods), in bytes. + SignalAllocatableMemoryAvailable Signal = "allocatableMemory.available" ) // ThresholdOperator is the operator used to express a Threshold. diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index 72404874348..a03601eef80 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -134,9 +134,9 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd } // Start starts the control loop to observe and response to low compute resources. 
-func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, monitoringInterval time.Duration) { +func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration) { // start the eviction manager monitoring - go wait.Until(func() { m.synchronize(diskInfoProvider, podFunc) }, monitoringInterval, wait.NeverStop) + go wait.Until(func() { m.synchronize(diskInfoProvider, podFunc, nodeProvider) }, monitoringInterval, wait.NeverStop) } // IsUnderMemoryPressure returns true if the node is under memory pressure. @@ -187,7 +187,7 @@ func startMemoryThresholdNotifier(thresholds []evictionapi.Threshold, observatio } // synchronize is the main control loop that enforces eviction thresholds. -func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) { +func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) { // if we have nothing to do, just return thresholds := m.config.Thresholds if len(thresholds) == 0 { @@ -209,7 +209,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act } // make observations and get a function to derive pod usage stats relative to those observations. - observations, statsFunc, err := makeSignalObservations(m.summaryProvider) + observations, statsFunc, err := makeSignalObservations(m.summaryProvider, nodeProvider) if err != nil { glog.Errorf("eviction manager: unexpected err: %v", err) return @@ -224,7 +224,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act err = startMemoryThresholdNotifier(m.config.Thresholds, observations, false, func(desc string) { glog.Infof("soft memory eviction threshold crossed at %s", desc) // TODO wait grace period for soft memory limit - m.synchronize(diskInfoProvider, podFunc) + m.synchronize(diskInfoProvider, podFunc, nodeProvider) }) if err != nil { glog.Warningf("eviction manager: failed to create hard memory threshold notifier: %v", err) @@ -232,7 +232,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act // start hard memory notification err = startMemoryThresholdNotifier(m.config.Thresholds, observations, true, func(desc string) { glog.Infof("hard memory eviction threshold crossed at %s", desc) - m.synchronize(diskInfoProvider, podFunc) + m.synchronize(diskInfoProvider, podFunc, nodeProvider) }) if err != nil { glog.Warningf("eviction manager: failed to create soft memory threshold notifier: %v", err) diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go index 8a3f9c8437a..8607acc5c2c 100644 --- a/pkg/kubelet/eviction/eviction_manager_test.go +++ b/pkg/kubelet/eviction/eviction_manager_test.go @@ -59,6 +59,24 @@ func (m *mockDiskInfoProvider) HasDedicatedImageFs() (bool, error) { return m.dedicatedImageFs, nil } +func newMockNodeProvider(allocatableCapacity v1.ResourceList) *mockNodeProvider { + return &mockNodeProvider{ + node: v1.Node{ + Status: v1.NodeStatus{ + Allocatable: allocatableCapacity, + }, + }, + } +} + +type mockNodeProvider struct { + node v1.Node +} + +func (m *mockNodeProvider) GetNode() (*v1.Node, error) { + return &m.node, nil +} + // mockImageGC is used to simulate invoking image garbage collection. 
type mockImageGC struct { err error @@ -175,6 +193,7 @@ func TestMemoryPressure(t *testing.T) { fakeClock := clock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} + nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")}) imageGC := &mockImageGC{freed: int64(0), err: nil} nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} @@ -217,7 +236,7 @@ func TestMemoryPressure(t *testing.T) { burstablePodToAdmit, _ := podMaker("burst-admit", newResourceList("100m", "100Mi"), newResourceList("200m", "200Mi"), "0Gi") // synchronize - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should not have memory pressure if manager.IsUnderMemoryPressure() { @@ -235,7 +254,7 @@ func TestMemoryPressure(t *testing.T) { // induce soft threshold fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -250,7 +269,7 @@ func TestMemoryPressure(t *testing.T) { // step forward in time pass the grace period fakeClock.Step(3 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -275,7 +294,7 @@ func TestMemoryPressure(t *testing.T) { // remove memory pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker("3Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should not have memory pressure if manager.IsUnderMemoryPressure() { @@ -285,7 +304,7 @@ func TestMemoryPressure(t *testing.T) { // induce memory pressure! 
fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -313,7 +332,7 @@ func TestMemoryPressure(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have memory pressure (because transition period not yet met) if !manager.IsUnderMemoryPressure() { @@ -337,7 +356,7 @@ func TestMemoryPressure(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should not have memory pressure (because transition period met) if manager.IsUnderMemoryPressure() { @@ -392,6 +411,7 @@ func TestDiskPressureNodeFs(t *testing.T) { fakeClock := clock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} + nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")}) imageGC := &mockImageGC{freed: int64(0), err: nil} nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} @@ -433,7 +453,7 @@ func TestDiskPressureNodeFs(t *testing.T) { podToAdmit, _ := podMaker("pod-to-admit", newResourceList("", ""), newResourceList("", ""), "0Gi", "0Gi", "0Gi") // synchronize - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should not have disk pressure if manager.IsUnderDiskPressure() { @@ -448,7 +468,7 @@ func TestDiskPressureNodeFs(t *testing.T) { // induce soft threshold fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("1.5Gi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have disk pressure if !manager.IsUnderDiskPressure() { @@ -463,7 +483,7 @@ func TestDiskPressureNodeFs(t *testing.T) { // step forward in time pass the grace period fakeClock.Step(3 * time.Minute) summaryProvider.result = summaryStatsMaker("1.5Gi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have disk pressure if !manager.IsUnderDiskPressure() { @@ -488,7 +508,7 @@ func TestDiskPressureNodeFs(t *testing.T) { // remove disk pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should not have disk pressure if manager.IsUnderDiskPressure() { @@ -498,7 +518,7 @@ func TestDiskPressureNodeFs(t *testing.T) { // induce disk pressure! 
fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("500Mi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have disk pressure if !manager.IsUnderDiskPressure() { @@ -523,7 +543,7 @@ func TestDiskPressureNodeFs(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have disk pressure (because transition period not yet met) if !manager.IsUnderDiskPressure() { @@ -544,7 +564,7 @@ func TestDiskPressureNodeFs(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should not have disk pressure (because transition period met) if manager.IsUnderDiskPressure() { @@ -589,6 +609,7 @@ func TestMinReclaim(t *testing.T) { fakeClock := clock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} + nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")}) imageGC := &mockImageGC{freed: int64(0), err: nil} nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} @@ -622,7 +643,7 @@ func TestMinReclaim(t *testing.T) { } // synchronize - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should not have memory pressure if manager.IsUnderMemoryPressure() { @@ -632,7 +653,7 @@ func TestMinReclaim(t *testing.T) { // induce memory pressure! 
fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -652,7 +673,7 @@ func TestMinReclaim(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("1.2Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have memory pressure (because transition period not yet met) if !manager.IsUnderMemoryPressure() { @@ -672,7 +693,7 @@ func TestMinReclaim(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have memory pressure (because transition period not yet met) if !manager.IsUnderMemoryPressure() { @@ -688,7 +709,7 @@ func TestMinReclaim(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should not have memory pressure (because transition period met) if manager.IsUnderMemoryPressure() { @@ -727,6 +748,7 @@ func TestNodeReclaimFuncs(t *testing.T) { fakeClock := clock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} + nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")}) imageGcFree := resource.MustParse("700Mi") imageGC := &mockImageGC{freed: imageGcFree.Value(), err: nil} nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} @@ -761,7 +783,7 @@ func TestNodeReclaimFuncs(t *testing.T) { } // synchronize - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should not have disk pressure if manager.IsUnderDiskPressure() { @@ -771,7 +793,7 @@ func TestNodeReclaimFuncs(t *testing.T) { // induce hard threshold fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker(".9Gi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have disk pressure if !manager.IsUnderDiskPressure() { @@ -794,7 +816,7 @@ func TestNodeReclaimFuncs(t *testing.T) { // remove disk pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should not have disk pressure if manager.IsUnderDiskPressure() { @@ -804,7 +826,7 @@ func TestNodeReclaimFuncs(t *testing.T) { // induce disk pressure! 
fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("400Mi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have disk pressure if !manager.IsUnderDiskPressure() { @@ -830,7 +852,7 @@ func TestNodeReclaimFuncs(t *testing.T) { summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats) imageGC.invoked = false // reset state podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have disk pressure (because transition period not yet met) if !manager.IsUnderDiskPressure() { @@ -852,7 +874,7 @@ func TestNodeReclaimFuncs(t *testing.T) { summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats) imageGC.invoked = false // reset state podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should not have disk pressure (because transition period met) if manager.IsUnderDiskPressure() { @@ -920,6 +942,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) { fakeClock := clock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} + nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")}) imageGC := &mockImageGC{freed: int64(0), err: nil} nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} @@ -961,7 +984,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) { podToAdmit, _ := podMaker("pod-to-admit", newResourceList("", ""), newResourceList("", ""), "0", "0", "0") // synchronize - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should not have disk pressure if manager.IsUnderDiskPressure() { @@ -976,7 +999,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) { // induce soft threshold fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have disk pressure if !manager.IsUnderDiskPressure() { @@ -991,7 +1014,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) { // step forward in time pass the grace period fakeClock.Step(3 * time.Minute) summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have disk pressure if !manager.IsUnderDiskPressure() { @@ -1016,7 +1039,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) { // remove inode pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should not have disk pressure if manager.IsUnderDiskPressure() { @@ -1026,7 +1049,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) { // induce inode pressure! 
fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("0.5Mi", "4Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have disk pressure if !manager.IsUnderDiskPressure() { @@ -1051,7 +1074,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have disk pressure (because transition period not yet met) if !manager.IsUnderDiskPressure() { @@ -1072,7 +1095,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should not have disk pressure (because transition period met) if manager.IsUnderDiskPressure() { @@ -1120,6 +1143,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) { fakeClock := clock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} + nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")}) imageGC := &mockImageGC{freed: int64(0), err: nil} nodeRef := &clientv1.ObjectReference{ Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: "", @@ -1164,7 +1188,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) { // induce soft threshold fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -1179,7 +1203,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) { // step forward in time pass the grace period fakeClock.Step(3 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -1197,7 +1221,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) { // remove memory pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker("3Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should not have memory pressure if manager.IsUnderMemoryPressure() { @@ -1210,7 +1234,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) { // induce memory pressure! 
fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -1222,3 +1246,167 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) { t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name) } } + +// TestAllocatableMemoryPressure +func TestAllocatableMemoryPressure(t *testing.T) { + podMaker := makePodWithMemoryStats + summaryStatsMaker := makeMemoryStats + constantCapacity := "4Gi" + podsToMake := []podToMake{ + {name: "guaranteed-low", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), memoryWorkingSet: "200Mi"}, + {name: "guaranteed-high", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), memoryWorkingSet: "400Mi"}, + {name: "burstable-low", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi"), memoryWorkingSet: "300Mi"}, + {name: "burstable-high", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi"), memoryWorkingSet: "500Mi"}, + {name: "best-effort-low", requests: newResourceList("", ""), limits: newResourceList("", ""), memoryWorkingSet: "100Mi"}, + {name: "best-effort-high", requests: newResourceList("", ""), limits: newResourceList("", ""), memoryWorkingSet: "200Mi"}, + } + pods := []*v1.Pod{} + podStats := map[*v1.Pod]statsapi.PodStats{} + for _, podToMake := range podsToMake { + pod, podStat := podMaker(podToMake.name, podToMake.requests, podToMake.limits, podToMake.memoryWorkingSet) + pods = append(pods, pod) + podStats[pod] = podStat + } + podToEvict := pods[5] + activePodsFunc := func() []*v1.Pod { + return pods + } + + fakeClock := clock.NewFakeClock(time.Now()) + podKiller := &mockPodKiller{} + diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} + nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")}) + imageGC := &mockImageGC{freed: int64(0), err: nil} + nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} + + config := Config{ + MaxPodGracePeriodSeconds: 5, + PressureTransitionPeriod: time.Minute * 5, + Thresholds: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalAllocatableMemoryAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1Ki"), + }, + }, + }, + } + summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker(constantCapacity, podStats)} + manager := &managerImpl{ + clock: fakeClock, + killPodFunc: podKiller.killPodNow, + imageGC: imageGC, + config: config, + recorder: &record.FakeRecorder{}, + summaryProvider: summaryProvider, + nodeRef: nodeRef, + nodeConditionsLastObservedAt: nodeConditionsObservedAt{}, + thresholdsFirstObservedAt: thresholdsObservedAt{}, + } + + // create a best effort pod to test admission + bestEffortPodToAdmit, _ := podMaker("best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi") + burstablePodToAdmit, _ := podMaker("burst-admit", newResourceList("100m", "100Mi"), newResourceList("200m", "200Mi"), "0Gi") + + // synchronize + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) + + // we should not have memory pressure + if manager.IsUnderMemoryPressure() { + t.Errorf("Manager should not report memory pressure") + } + + // try to 
admit our pods (they should succeed) + expected := []bool{true, true} + for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} { + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit { + t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit) + } + } + + // induce memory pressure! + fakeClock.Step(1 * time.Minute) + pod, podStat := podMaker("guaranteed-high-2", newResourceList("100m", "1Gi"), newResourceList("100m", "1Gi"), "1Gi") + podStats[pod] = podStat + summaryProvider.result = summaryStatsMaker(constantCapacity, podStats) + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) + + // we should have memory pressure + if !manager.IsUnderMemoryPressure() { + t.Errorf("Manager should report memory pressure") + } + + // check the right pod was killed + if podKiller.pod != podToEvict { + t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name) + } + observedGracePeriod := *podKiller.gracePeriodOverride + if observedGracePeriod != int64(0) { + t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod) + } + // reset state + podKiller.pod = nil + podKiller.gracePeriodOverride = nil + + // the best-effort pod should not admit, burstable should + expected = []bool{false, true} + for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} { + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit { + t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit) + } + } + + // reduce memory pressure + fakeClock.Step(1 * time.Minute) + for pod := range podStats { + if pod.Name == "guaranteed-high-2" { + delete(podStats, pod) + } + } + summaryProvider.result = summaryStatsMaker(constantCapacity, podStats) + podKiller.pod = nil // reset state + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) + + // we should have memory pressure (because transition period not yet met) + if !manager.IsUnderMemoryPressure() { + t.Errorf("Manager should report memory pressure") + } + + // no pod should have been killed + if podKiller.pod != nil { + t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) + } + + // the best-effort pod should not admit, burstable should + expected = []bool{false, true} + for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} { + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit { + t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit) + } + } + + // move the clock past transition period to ensure that we stop reporting pressure + fakeClock.Step(5 * time.Minute) + summaryProvider.result = summaryStatsMaker(constantCapacity, podStats) + podKiller.pod = nil // reset state + manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider) + + // we should not have memory pressure (because transition period met) + if manager.IsUnderMemoryPressure() { + t.Errorf("Manager should not report memory pressure") + } + + // no pod should have been killed + if podKiller.pod != nil { + t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) + } + + // all pods should admit now + expected = []bool{true, true} + for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} { + if result := 
manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit { + t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit) + } + } +} diff --git a/pkg/kubelet/eviction/helpers.go b/pkg/kubelet/eviction/helpers.go index 86059a23da2..3d5582cbc8d 100644 --- a/pkg/kubelet/eviction/helpers.go +++ b/pkg/kubelet/eviction/helpers.go @@ -29,6 +29,7 @@ import ( "k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api/v1" statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats" + "k8s.io/kubernetes/pkg/kubelet/cm" evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" "k8s.io/kubernetes/pkg/kubelet/qos" "k8s.io/kubernetes/pkg/kubelet/server/stats" @@ -68,6 +69,7 @@ func init() { // map eviction signals to node conditions signalToNodeCondition = map[evictionapi.Signal]v1.NodeConditionType{} signalToNodeCondition[evictionapi.SignalMemoryAvailable] = v1.NodeMemoryPressure + signalToNodeCondition[evictionapi.SignalAllocatableMemoryAvailable] = v1.NodeMemoryPressure signalToNodeCondition[evictionapi.SignalImageFsAvailable] = v1.NodeDiskPressure signalToNodeCondition[evictionapi.SignalNodeFsAvailable] = v1.NodeDiskPressure signalToNodeCondition[evictionapi.SignalImageFsInodesFree] = v1.NodeDiskPressure @@ -76,6 +78,7 @@ func init() { // map signals to resources (and vice-versa) signalToResource = map[evictionapi.Signal]v1.ResourceName{} signalToResource[evictionapi.SignalMemoryAvailable] = v1.ResourceMemory + signalToResource[evictionapi.SignalAllocatableMemoryAvailable] = v1.ResourceMemory signalToResource[evictionapi.SignalImageFsAvailable] = resourceImageFs signalToResource[evictionapi.SignalImageFsInodesFree] = resourceImageFsInodes signalToResource[evictionapi.SignalNodeFsAvailable] = resourceNodeFs @@ -93,8 +96,10 @@ func validSignal(signal evictionapi.Signal) bool { } // ParseThresholdConfig parses the flags for thresholds. -func ParseThresholdConfig(evictionHard, evictionSoft, evictionSoftGracePeriod, evictionMinimumReclaim string) ([]evictionapi.Threshold, error) { +func ParseThresholdConfig(allocatableConfig []string, evictionHard, evictionSoft, evictionSoftGracePeriod, evictionMinimumReclaim string) ([]evictionapi.Threshold, error) { results := []evictionapi.Threshold{} + allocatableThresholds := getAllocatableThreshold(allocatableConfig) + results = append(results, allocatableThresholds...) 
hardThresholds, err := parseThresholdStatements(evictionHard) if err != nil { @@ -214,6 +219,27 @@ func parseThresholdStatement(statement string) (evictionapi.Threshold, error) { }, nil } +// getAllocatableThreshold returns the thresholds applicable for the allocatable configuration +func getAllocatableThreshold(allocatableConfig []string) []evictionapi.Threshold { + for _, key := range allocatableConfig { + if key == cm.NodeAllocatableEnforcementKey { + return []evictionapi.Threshold{ + { + Signal: evictionapi.SignalAllocatableMemoryAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: resource.NewQuantity(int64(0), resource.BinarySI), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: resource.NewQuantity(int64(0), resource.BinarySI), + }, + }, + } + } + } + return []evictionapi.Threshold{} +} + // parsePercentage parses a string representing a percentage value func parsePercentage(input string) (float32, error) { value, err := strconv.ParseFloat(strings.TrimRight(input, "%"), 32) @@ -611,11 +637,15 @@ func (a byEvictionPriority) Less(i, j int) bool { } // makeSignalObservations derives observations using the specified summary provider. -func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObservations, statsFunc, error) { +func makeSignalObservations(summaryProvider stats.SummaryProvider, nodeProvider NodeProvider) (signalObservations, statsFunc, error) { summary, err := summaryProvider.Get() if err != nil { return nil, nil, err } + node, err := nodeProvider.GetNode() + if err != nil { + return nil, nil, err + } // build the function to work against for pod stats statsFunc := cachedStatsFunc(summary.Pods) @@ -663,6 +693,19 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObserv } } } + if memoryAllocatableCapacity, ok := node.Status.Allocatable[v1.ResourceMemory]; ok { + memoryAllocatableAvailable := memoryAllocatableCapacity.Copy() + for _, pod := range summary.Pods { + mu, err := podMemoryUsage(pod) + if err == nil { + memoryAllocatableAvailable.Sub(mu[v1.ResourceMemory]) + } + } + result[evictionapi.SignalAllocatableMemoryAvailable] = signalObservation{ + available: memoryAllocatableAvailable, + capacity: memoryAllocatableCapacity.Copy(), + } + } return result, statsFunc, nil } diff --git a/pkg/kubelet/eviction/helpers_test.go b/pkg/kubelet/eviction/helpers_test.go index 67d2c16ffe2..2ba9c00a720 100644 --- a/pkg/kubelet/eviction/helpers_test.go +++ b/pkg/kubelet/eviction/helpers_test.go @@ -28,6 +28,7 @@ import ( "k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api/v1" statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats" + "k8s.io/kubernetes/pkg/kubelet/cm" evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" "k8s.io/kubernetes/pkg/quota" ) @@ -40,6 +41,7 @@ func quantityMustParse(value string) *resource.Quantity { func TestParseThresholdConfig(t *testing.T) { gracePeriod, _ := time.ParseDuration("30s") testCases := map[string]struct { + allocatableConfig []string evictionHard string evictionSoft string evictionSoftGracePeriod string @@ -48,6 +50,7 @@ func TestParseThresholdConfig(t *testing.T) { expectThresholds []evictionapi.Threshold }{ "no values": { + allocatableConfig: []string{}, evictionHard: "", evictionSoft: "", evictionSoftGracePeriod: "", @@ -56,12 +59,23 @@ func TestParseThresholdConfig(t *testing.T) { expectThresholds: []evictionapi.Threshold{}, }, "all flag values": { + allocatableConfig: []string{cm.NodeAllocatableEnforcementKey}, evictionHard: 
"memory.available<150Mi", evictionSoft: "memory.available<300Mi", evictionSoftGracePeriod: "memory.available=30s", evictionMinReclaim: "memory.available=0", expectErr: false, expectThresholds: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalAllocatableMemoryAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("0"), + }, + MinReclaim: &evictionapi.ThresholdValue{ + Quantity: quantityMustParse("0"), + }, + }, { Signal: evictionapi.SignalMemoryAvailable, Operator: evictionapi.OpLessThan, @@ -86,6 +100,7 @@ func TestParseThresholdConfig(t *testing.T) { }, }, "all flag values in percentages": { + allocatableConfig: []string{}, evictionHard: "memory.available<10%", evictionSoft: "memory.available<30%", evictionSoftGracePeriod: "memory.available=30s", @@ -116,6 +131,7 @@ func TestParseThresholdConfig(t *testing.T) { }, }, "disk flag values": { + allocatableConfig: []string{}, evictionHard: "imagefs.available<150Mi,nodefs.available<100Mi", evictionSoft: "imagefs.available<300Mi,nodefs.available<200Mi", evictionSoftGracePeriod: "imagefs.available=30s,nodefs.available=30s", @@ -167,6 +183,7 @@ func TestParseThresholdConfig(t *testing.T) { }, }, "disk flag values in percentages": { + allocatableConfig: []string{}, evictionHard: "imagefs.available<15%,nodefs.available<10.5%", evictionSoft: "imagefs.available<30%,nodefs.available<20.5%", evictionSoftGracePeriod: "imagefs.available=30s,nodefs.available=30s", @@ -218,6 +235,7 @@ func TestParseThresholdConfig(t *testing.T) { }, }, "inode flag values": { + allocatableConfig: []string{}, evictionHard: "imagefs.inodesFree<150Mi,nodefs.inodesFree<100Mi", evictionSoft: "imagefs.inodesFree<300Mi,nodefs.inodesFree<200Mi", evictionSoftGracePeriod: "imagefs.inodesFree=30s,nodefs.inodesFree=30s", @@ -269,6 +287,7 @@ func TestParseThresholdConfig(t *testing.T) { }, }, "invalid-signal": { + allocatableConfig: []string{}, evictionHard: "mem.available<150Mi", evictionSoft: "", evictionSoftGracePeriod: "", @@ -277,6 +296,7 @@ func TestParseThresholdConfig(t *testing.T) { expectThresholds: []evictionapi.Threshold{}, }, "hard-signal-negative": { + allocatableConfig: []string{}, evictionHard: "memory.available<-150Mi", evictionSoft: "", evictionSoftGracePeriod: "", @@ -285,6 +305,7 @@ func TestParseThresholdConfig(t *testing.T) { expectThresholds: []evictionapi.Threshold{}, }, "hard-signal-negative-percentage": { + allocatableConfig: []string{}, evictionHard: "memory.available<-15%", evictionSoft: "", evictionSoftGracePeriod: "", @@ -293,6 +314,7 @@ func TestParseThresholdConfig(t *testing.T) { expectThresholds: []evictionapi.Threshold{}, }, "soft-signal-negative": { + allocatableConfig: []string{}, evictionHard: "", evictionSoft: "memory.available<-150Mi", evictionSoftGracePeriod: "", @@ -301,6 +323,7 @@ func TestParseThresholdConfig(t *testing.T) { expectThresholds: []evictionapi.Threshold{}, }, "duplicate-signal": { + allocatableConfig: []string{}, evictionHard: "memory.available<150Mi,memory.available<100Mi", evictionSoft: "", evictionSoftGracePeriod: "", @@ -309,6 +332,7 @@ func TestParseThresholdConfig(t *testing.T) { expectThresholds: []evictionapi.Threshold{}, }, "valid-and-invalid-signal": { + allocatableConfig: []string{}, evictionHard: "memory.available<150Mi,invalid.foo<150Mi", evictionSoft: "", evictionSoftGracePeriod: "", @@ -317,6 +341,7 @@ func TestParseThresholdConfig(t *testing.T) { expectThresholds: []evictionapi.Threshold{}, }, "soft-no-grace-period": { + allocatableConfig: 
[]string{}, evictionHard: "", evictionSoft: "memory.available<150Mi", evictionSoftGracePeriod: "", @@ -325,6 +350,7 @@ func TestParseThresholdConfig(t *testing.T) { expectThresholds: []evictionapi.Threshold{}, }, "soft-neg-grace-period": { + allocatableConfig: []string{}, evictionHard: "", evictionSoft: "memory.available<150Mi", evictionSoftGracePeriod: "memory.available=-30s", @@ -333,6 +359,7 @@ func TestParseThresholdConfig(t *testing.T) { expectThresholds: []evictionapi.Threshold{}, }, "neg-reclaim": { + allocatableConfig: []string{}, evictionHard: "", evictionSoft: "", evictionSoftGracePeriod: "", @@ -341,6 +368,7 @@ func TestParseThresholdConfig(t *testing.T) { expectThresholds: []evictionapi.Threshold{}, }, "duplicate-reclaim": { + allocatableConfig: []string{}, evictionHard: "", evictionSoft: "", evictionSoftGracePeriod: "", @@ -350,7 +378,7 @@ func TestParseThresholdConfig(t *testing.T) { }, } for testName, testCase := range testCases { - thresholds, err := ParseThresholdConfig(testCase.evictionHard, testCase.evictionSoft, testCase.evictionSoftGracePeriod, testCase.evictionMinReclaim) + thresholds, err := ParseThresholdConfig(testCase.allocatableConfig, testCase.evictionHard, testCase.evictionSoft, testCase.evictionSoftGracePeriod, testCase.evictionMinReclaim) if testCase.expectErr != (err != nil) { t.Errorf("Err not as expected, test: %v, error expected: %v, actual: %v", testName, testCase.expectErr, err) } @@ -699,6 +727,7 @@ func TestMakeSignalObservations(t *testing.T) { } nodeAvailableBytes := uint64(1024 * 1024 * 1024) nodeWorkingSetBytes := uint64(1024 * 1024 * 1024) + allocatableMemoryCapacity := uint64(5 * 1024 * 1024 * 1024) imageFsAvailableBytes := uint64(1024 * 1024) imageFsCapacityBytes := uint64(1024 * 1024 * 2) nodeFsAvailableBytes := uint64(1024) @@ -738,15 +767,31 @@ func TestMakeSignalObservations(t *testing.T) { podMaker("pod1", "ns2", "uuid2", 1), podMaker("pod3", "ns3", "uuid3", 1), } - containerWorkingSetBytes := int64(1024 * 1024) + containerWorkingSetBytes := int64(1024 * 1024 * 1024) for _, pod := range pods { fakeStats.Pods = append(fakeStats.Pods, newPodStats(pod, containerWorkingSetBytes)) } - actualObservations, statsFunc, err := makeSignalObservations(provider) + res := quantityMustParse("5Gi") + nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *res}) + // Allocatable thresholds are always 100%. Verify that Threshold == Capacity. 
+	if res.CmpInt64(int64(allocatableMemoryCapacity)) != 0 {
+		t.Errorf("Expected Threshold %v to be equal to value %v", res.Value(), allocatableMemoryCapacity)
+	}
+	actualObservations, statsFunc, err := makeSignalObservations(provider, nodeProvider)
 	if err != nil {
 		t.Errorf("Unexpected err: %v", err)
 	}
+	allocatableMemQuantity, found := actualObservations[evictionapi.SignalAllocatableMemoryAvailable]
+	if !found {
+		t.Errorf("Expected allocatable memory observation, but didn't find one")
+	}
+	if allocatableMemQuantity.available.Value() != 2*containerWorkingSetBytes {
+		t.Errorf("Expected %v, actual: %v", 2*containerWorkingSetBytes, allocatableMemQuantity.available.Value())
+	}
+	if expectedBytes := int64(allocatableMemoryCapacity); allocatableMemQuantity.capacity.Value() != expectedBytes {
+		t.Errorf("Expected %v, actual: %v", expectedBytes, allocatableMemQuantity.capacity.Value())
+	}
 	memQuantity, found := actualObservations[evictionapi.SignalMemoryAvailable]
 	if !found {
 		t.Errorf("Expected available memory observation: %v", err)
diff --git a/pkg/kubelet/eviction/types.go b/pkg/kubelet/eviction/types.go
index c1c7d1f60ae..977eb06a264 100644
--- a/pkg/kubelet/eviction/types.go
+++ b/pkg/kubelet/eviction/types.go
@@ -53,7 +53,7 @@ type Config struct {
 // Manager evaluates when an eviction threshold for node stability has been met on the node.
 type Manager interface {
 	// Start starts the control loop to monitor eviction thresholds at specified interval.
-	Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, monitoringInterval time.Duration)
+	Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration)

 	// IsUnderMemoryPressure returns true if the node is under memory pressure.
 	IsUnderMemoryPressure() bool
@@ -68,6 +68,12 @@ type DiskInfoProvider interface {
 	HasDedicatedImageFs() (bool, error)
 }

+// NodeProvider is responsible for providing the node api object describing this node
+type NodeProvider interface {
+	// GetNode returns the node info for this node
+	GetNode() (*v1.Node, error)
+}
+
 // ImageGC is responsible for performing garbage collection of unused images.
 type ImageGC interface {
 	// DeleteUnusedImages deletes unused images and returns the number of bytes freed, or an error.
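Editorial aside (not part of the patch): the arithmetic behind the new allocatableMemory.available signal is simply "node allocatable memory (from NodeProvider.GetNode()) minus the sum of the pods' working sets (from the summary API)". The following is a minimal, self-contained sketch of that calculation; the package layout and the apimachinery resource import path are assumptions for illustration only.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource" // assumed import path for resource.Quantity
)

func main() {
	// Node allocatable memory, as reported by NodeProvider.GetNode().
	allocatable := resource.MustParse("2Gi")

	// Working-set memory of the active pods, as reported by the summary API.
	podWorkingSets := []resource.Quantity{
		resource.MustParse("500Mi"),
		resource.MustParse("700Mi"),
	}

	// allocatableMemory.available = allocatable - sum(pod working sets)
	available := allocatable.Copy()
	for i := range podWorkingSets {
		available.Sub(podWorkingSets[i])
	}

	// When the pods enforcement key is configured, getAllocatableThreshold installs
	// a hard threshold of allocatableMemory.available < 0, so memory pressure is
	// only reported once pod usage exceeds the node's allocatable memory.
	fmt.Printf("available: %s, under pressure: %v\n", available.String(), available.Sign() < 0)
	// available: 848Mi, under pressure: false
}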
diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 0ea11a92600..b37428ec75a 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -349,7 +349,12 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub RootFreeDiskMB: int(kubeCfg.LowDiskSpaceThresholdMB), } - thresholds, err := eviction.ParseThresholdConfig(kubeCfg.EvictionHard, kubeCfg.EvictionSoft, kubeCfg.EvictionSoftGracePeriod, kubeCfg.EvictionMinimumReclaim) + enforceNodeAllocatable := kubeCfg.EnforceNodeAllocatable + if kubeCfg.ExperimentalNodeAllocatableIgnoreEvictionThreshold { + // Do not provide kubeCfg.EnforceNodeAllocatable to eviction threshold parsing if we are not enforcing Evictions + enforceNodeAllocatable = []string{} + } + thresholds, err := eviction.ParseThresholdConfig(enforceNodeAllocatable, kubeCfg.EvictionHard, kubeCfg.EvictionSoft, kubeCfg.EvictionSoftGracePeriod, kubeCfg.EvictionMinimumReclaim) if err != nil { return nil, err } @@ -1222,7 +1227,7 @@ func (kl *Kubelet) initializeRuntimeDependentModules() { glog.Fatalf("Failed to start cAdvisor %v", err) } // eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs - kl.evictionManager.Start(kl, kl.getActivePods, evictionMonitoringPeriod) + kl.evictionManager.Start(kl, kl.getActivePods, kl, evictionMonitoringPeriod) } // Run starts the kubelet reacting to config updates diff --git a/test/e2e_node/BUILD b/test/e2e_node/BUILD index 5408bad9009..e363574d1f6 100644 --- a/test/e2e_node/BUILD +++ b/test/e2e_node/BUILD @@ -52,6 +52,7 @@ go_library( go_test( name = "go_default_test", srcs = [ + "allocatable_eviction_test.go", "apparmor_test.go", "container_manager_test.go", "critical_pod_test.go", diff --git a/test/e2e_node/allocatable_eviction_test.go b/test/e2e_node/allocatable_eviction_test.go new file mode 100644 index 00000000000..2cb03468200 --- /dev/null +++ b/test/e2e_node/allocatable_eviction_test.go @@ -0,0 +1,104 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e_node + +import ( + "fmt" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/kubernetes/pkg/api/v1" + "k8s.io/kubernetes/pkg/apis/componentconfig" + "k8s.io/kubernetes/pkg/kubelet/cm" + "k8s.io/kubernetes/test/e2e/framework" + + . "github.com/onsi/ginkgo" + . 
"github.com/onsi/gomega" +) + +// Eviction Policy is described here: +// https://github.com/kubernetes/kubernetes/blob/master/docs/proposals/kubelet-eviction.md + +var _ = framework.KubeDescribe("AllocatableEviction [Slow] [Serial] [Disruptive] [Flaky]", func() { + f := framework.NewDefaultFramework("allocatable-eviction-test") + + podTestSpecs := []podTestSpec{ + { + evictionPriority: 1, // This pod should be evicted before the innocent pod + pod: *getMemhogPod("memory-hog-pod", "memory-hog", v1.ResourceRequirements{}), + }, + { + evictionPriority: 0, // This pod should never be evicted + pod: v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "innocent-pod"}, + Spec: v1.PodSpec{ + RestartPolicy: v1.RestartPolicyNever, + Containers: []v1.Container{ + { + Image: "gcr.io/google_containers/busybox:1.24", + Name: "normal-memory-usage-container", + Command: []string{ + "sh", + "-c", //make one big (5 Gb) file + "dd if=/dev/urandom of=largefile bs=5000000000 count=1; while true; do sleep 5; done", + }, + }, + }, + }, + }, + }, + } + evictionTestTimeout := 40 * time.Minute + testCondition := "Memory Pressure" + kubeletConfigUpdate := func(initialConfig *componentconfig.KubeletConfiguration) { + initialConfig.EvictionHard = "memory.available<10%" + // Set large system and kube reserved values to trigger allocatable thresholds far before hard eviction thresholds. + initialConfig.SystemReserved = componentconfig.ConfigurationMap(map[string]string{"memory": "1Gi"}) + initialConfig.KubeReserved = componentconfig.ConfigurationMap(map[string]string{"memory": "1Gi"}) + initialConfig.EnforceNodeAllocatable = []string{cm.NodeAllocatableEnforcementKey} + initialConfig.ExperimentalNodeAllocatableIgnoreEvictionThreshold = false + initialConfig.CgroupsPerQOS = true + } + runEvictionTest(f, testCondition, podTestSpecs, evictionTestTimeout, hasMemoryPressure, kubeletConfigUpdate) +}) + +// Returns TRUE if the node has Memory Pressure, FALSE otherwise +func hasMemoryPressure(f *framework.Framework, testCondition string) (bool, error) { + localNodeStatus := getLocalNode(f).Status + _, pressure := v1.GetNodeCondition(&localNodeStatus, v1.NodeMemoryPressure) + Expect(pressure).NotTo(BeNil()) + hasPressure := pressure.Status == v1.ConditionTrue + By(fmt.Sprintf("checking if pod has %s: %v", testCondition, hasPressure)) + + // Additional Logging relating to Memory + summary, err := getNodeSummary() + if err != nil { + return false, err + } + if summary.Node.Memory != nil && summary.Node.Memory.WorkingSetBytes != nil && summary.Node.Memory.AvailableBytes != nil { + framework.Logf("Node.Memory.WorkingSetBytes: %d, summary.Node.Memory.AvailableBytes: %d", *summary.Node.Memory.WorkingSetBytes, *summary.Node.Memory.AvailableBytes) + } + for _, pod := range summary.Pods { + framework.Logf("Pod: %s", pod.PodRef.Name) + for _, container := range pod.Containers { + if container.Memory != nil && container.Memory.WorkingSetBytes != nil { + framework.Logf("--- summary Container: %s WorkingSetBytes: %d", container.Name, *container.Memory.WorkingSetBytes) + } + } + } + return hasPressure, nil +} diff --git a/test/e2e_node/inode_eviction_test.go b/test/e2e_node/inode_eviction_test.go index 8ad63de1334..5d262cf93dd 100644 --- a/test/e2e_node/inode_eviction_test.go +++ b/test/e2e_node/inode_eviction_test.go @@ -22,6 +22,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/kubernetes/pkg/api/v1" + "k8s.io/kubernetes/pkg/apis/componentconfig" "k8s.io/kubernetes/test/e2e/framework" . 
"github.com/onsi/ginkgo" @@ -112,10 +113,11 @@ var _ = framework.KubeDescribe("InodeEviction [Slow] [Serial] [Disruptive] [Flak } evictionTestTimeout := 30 * time.Minute testCondition := "Disk Pressure due to Inodes" - // Set the EvictionHard threshold lower to decrease test time - evictionHardLimit := "nodefs.inodesFree<50%" + kubeletConfigUpdate := func(initialConfig *componentconfig.KubeletConfiguration) { + initialConfig.EvictionHard = "nodefs.inodesFree<50%" + } - runEvictionTest(f, testCondition, podTestSpecs, evictionHardLimit, evictionTestTimeout, hasInodePressure) + runEvictionTest(f, testCondition, podTestSpecs, evictionTestTimeout, hasInodePressure, kubeletConfigUpdate) }) // Struct used by runEvictionTest that specifies the pod, and when that pod should be evicted, relative to other pods @@ -133,12 +135,12 @@ type podTestSpec struct { // It ensures that lower evictionPriority pods are always evicted before higher evictionPriority pods (2 evicted before 1, etc.) // It ensures that all lower evictionPriority pods are eventually evicted. // runEvictionTest then cleans up the testing environment by deleting provided nodes, and ensures that testCondition no longer exists -func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs []podTestSpec, evictionHard string, - evictionTestTimeout time.Duration, hasPressureCondition func(*framework.Framework, string) (bool, error)) { +func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs []podTestSpec, evictionTestTimeout time.Duration, + hasPressureCondition func(*framework.Framework, string) (bool, error), updateFunction func(initialConfig *componentconfig.KubeletConfiguration)) { Context(fmt.Sprintf("when we run containers that should cause %s", testCondition), func() { - tempSetEvictionHard(f, evictionHard) + tempSetCurrentKubeletConfig(f, updateFunction) BeforeEach(func() { By("seting up pods to be used by tests") for _, spec := range podTestSpecs { @@ -148,6 +150,11 @@ func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs }) It(fmt.Sprintf("should eventually see %s, and then evict all of the correct pods", testCondition), func() { + configEnabled, err := isKubeletConfigEnabled(f) + framework.ExpectNoError(err) + if !configEnabled { + framework.Skipf("Dynamic kubelet config must be enabled for this test to run.") + } Eventually(func() error { hasPressure, err := hasPressureCondition(f, testCondition) if err != nil { @@ -299,14 +306,8 @@ func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs // Returns TRUE if the node has disk pressure due to inodes exists on the node, FALSE otherwise func hasInodePressure(f *framework.Framework, testCondition string) (bool, error) { - - nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{}) - framework.ExpectNoError(err, "getting node list") - if len(nodeList.Items) != 1 { - return false, fmt.Errorf("expected 1 node, but see %d. 
List: %v", len(nodeList.Items), nodeList.Items) - } - - _, pressure := v1.GetNodeCondition(&nodeList.Items[0].Status, v1.NodeDiskPressure) + localNodeStatus := getLocalNode(f).Status + _, pressure := v1.GetNodeCondition(&localNodeStatus, v1.NodeDiskPressure) Expect(pressure).NotTo(BeNil()) hasPressure := pressure.Status == v1.ConditionTrue By(fmt.Sprintf("checking if pod has %s: %v", testCondition, hasPressure)) diff --git a/test/e2e_node/memory_eviction_test.go b/test/e2e_node/memory_eviction_test.go index eaa572c5a70..41c32c0eb15 100644 --- a/test/e2e_node/memory_eviction_test.go +++ b/test/e2e_node/memory_eviction_test.go @@ -136,7 +136,7 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", fu By("creating a guaranteed pod, a burstable pod, and a besteffort pod.") // A pod is guaranteed only when requests and limits are specified for all the containers and they are equal. - guaranteed := createMemhogPod(f, "guaranteed-", "guaranteed", v1.ResourceRequirements{ + guaranteed := getMemhogPod("guaranteed-pod", "guaranteed", v1.ResourceRequirements{ Requests: v1.ResourceList{ "cpu": resource.MustParse("100m"), "memory": resource.MustParse("100Mi"), @@ -145,16 +145,22 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", fu "cpu": resource.MustParse("100m"), "memory": resource.MustParse("100Mi"), }}) + guaranteed = f.PodClient().CreateSync(guaranteed) + glog.Infof("pod created with name: %s", guaranteed.Name) // A pod is burstable if limits and requests do not match across all containers. - burstable := createMemhogPod(f, "burstable-", "burstable", v1.ResourceRequirements{ + burstable := getMemhogPod("burstable-pod", "burstable", v1.ResourceRequirements{ Requests: v1.ResourceList{ "cpu": resource.MustParse("100m"), "memory": resource.MustParse("100Mi"), }}) + burstable = f.PodClient().CreateSync(burstable) + glog.Infof("pod created with name: %s", burstable.Name) - // A pod is besteffort if none of its containers have specified any requests or limits. - besteffort := createMemhogPod(f, "besteffort-", "besteffort", v1.ResourceRequirements{}) + // A pod is besteffort if none of its containers have specified any requests or limits . + besteffort := getMemhogPod("besteffort-pod", "besteffort", v1.ResourceRequirements{}) + besteffort = f.PodClient().CreateSync(besteffort) + glog.Infof("pod created with name: %s", besteffort.Name) // We poll until timeout or all pods are killed. 
// Inside the func, we check that all pods are in a valid phase with @@ -232,7 +238,7 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", fu }) -func createMemhogPod(f *framework.Framework, genName string, ctnName string, res v1.ResourceRequirements) *v1.Pod { +func getMemhogPod(podName string, ctnName string, res v1.ResourceRequirements) *v1.Pod { env := []v1.EnvVar{ { Name: "MEMORY_LIMIT", @@ -256,9 +262,9 @@ func createMemhogPod(f *framework.Framework, genName string, ctnName string, res memLimit = "$(MEMORY_LIMIT)" } - pod := &v1.Pod{ + return &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ - GenerateName: genName, + Name: podName, }, Spec: v1.PodSpec{ RestartPolicy: v1.RestartPolicyNever, @@ -277,8 +283,4 @@ func createMemhogPod(f *framework.Framework, genName string, ctnName string, res }, }, } - // The generated pod.Name will be on the pod spec returned by CreateSync - pod = f.PodClient().CreateSync(pod) - glog.Infof("pod created with name: %s", pod.Name) - return pod } diff --git a/test/e2e_node/util.go b/test/e2e_node/util.go index 6f6361091e6..ff50b6bb0be 100644 --- a/test/e2e_node/util.go +++ b/test/e2e_node/util.go @@ -86,13 +86,6 @@ func getCurrentKubeletConfig() (*componentconfig.KubeletConfiguration, error) { return kubeCfg, nil } -// Convenience method to set the evictionHard threshold during the current context. -func tempSetEvictionHard(f *framework.Framework, evictionHard string) { - tempSetCurrentKubeletConfig(f, func(initialConfig *componentconfig.KubeletConfiguration) { - initialConfig.EvictionHard = evictionHard - }) -} - // Must be called within a Context. Allows the function to modify the KubeletConfiguration during the BeforeEach of the context. // The change is reverted in the AfterEach of the context. // Returns true on success.
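Closing usage note (a hedged sketch, not code from the patch): callers of eviction.ParseThresholdConfig now pass the enforce-node-allocatable keys as a new first argument, mirroring the server.go and kubelet.go call-site changes above. An empty slice keeps the previous behaviour; including cm.NodeAllocatableEnforcementKey adds the implicit allocatableMemory.available threshold. The threshold strings below are placeholder values, not defaults.

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/kubelet/cm"
	"k8s.io/kubernetes/pkg/kubelet/eviction"
)

func main() {
	// New first argument: enforce-node-allocatable keys. Use []string{} to opt out
	// of the implicit allocatableMemory.available<0 threshold.
	thresholds, err := eviction.ParseThresholdConfig(
		[]string{cm.NodeAllocatableEnforcementKey},
		"memory.available<100Mi", // evictionHard (placeholder value)
		"",                       // evictionSoft
		"",                       // evictionSoftGracePeriod
		"",                       // evictionMinimumReclaim
	)
	if err != nil {
		fmt.Println("failed to parse thresholds:", err)
		return
	}
	// Expect the allocatable threshold plus the hard memory threshold.
	fmt.Printf("parsed %d thresholds\n", len(thresholds))
}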