Mirror of https://github.com/k3s-io/kubernetes.git, synced 2025-07-23 03:41:45 +00:00
Merge pull request #42204 from dashpole/allocatable_eviction
Automatic merge from submit-queue

Eviction Manager Enforces Allocatable Thresholds

This PR modifies the eviction manager to enforce node allocatable thresholds for memory as described in kubernetes/community#348. This PR should be merged after #41234.

cc @kubernetes/sig-node-pr-reviews @kubernetes/sig-node-feature-requests @vishh

**Why is this a bug/regression?**

The kubelet uses `oom_score_adj` to enforce QoS policies, but `oom_score_adj` is based on overall memory requested. This means a Burstable pod that requested a lot of memory can cause OOM kills of Guaranteed pods, which violates QoS. Even worse, we have observed system daemons like the kubelet or kube-proxy being killed by the OOM killer. Without this PR, v1.6 will have node-stability issues and regressions in an existing GA feature, `Out of Resource` handling.
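For readers skimming the diff: the core of the change is a new eviction signal, `allocatableMemory.available`, computed as the node's allocatable memory minus the memory working sets of all pods (see the `makeSignalObservations` hunk below). A minimal self-contained sketch of that computation, assuming a simplified hypothetical `podStats` type in place of the kubelet's summary API:

```go
package main

import "fmt"

// podStats is a hypothetical stand-in for the summary API's per-pod stats;
// the real code reads *resource.Quantity values from statsapi.PodStats.
type podStats struct {
	name            string
	workingSetBytes int64
}

// allocatableMemoryAvailable mirrors the new signal added by this PR:
// allocatableMemory.available = allocatable - sum of pod memory working sets.
func allocatableMemoryAvailable(allocatableBytes int64, pods []podStats) int64 {
	available := allocatableBytes
	for _, p := range pods {
		available -= p.workingSetBytes
	}
	return available
}

func main() {
	pods := []podStats{
		{name: "guaranteed-high", workingSetBytes: 400 << 20}, // 400Mi
		{name: "burstable-high", workingSetBytes: 500 << 20},  // 500Mi
	}
	// 2Gi allocatable, matching the tests' newMockNodeProvider value below.
	fmt.Printf("allocatableMemory.available = %d bytes\n", allocatableMemoryAvailable(2<<30, pods))
}
```

When this observation drops below the configured threshold (a zero-quantity threshold when `cm.NodeAllocatableEnforcementKey` is present, or `1Ki` in the unit test below), the manager reports memory pressure and evicts using the usual QoS-aware ranking.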
This commit is contained in: commit 2d319bd406
@@ -520,7 +520,7 @@ func run(s *options.KubeletServer, kubeDeps *kubelet.KubeletDeps) (err error) {
 	var hardEvictionThresholds []evictionapi.Threshold
 	// If the user requested to ignore eviction thresholds, then do not set valid values for hardEvictionThresholds here.
 	if !s.ExperimentalNodeAllocatableIgnoreEvictionThreshold {
-		hardEvictionThresholds, err = eviction.ParseThresholdConfig(s.EvictionHard, "", "", "")
+		hardEvictionThresholds, err = eviction.ParseThresholdConfig([]string{}, s.EvictionHard, "", "", "")
 		if err != nil {
 			return err
 		}
@@ -33,6 +33,7 @@ go_test(
         "//pkg/api:go_default_library",
         "//pkg/api/v1:go_default_library",
         "//pkg/kubelet/api/v1alpha1/stats:go_default_library",
+        "//pkg/kubelet/cm:go_default_library",
         "//pkg/kubelet/eviction/api:go_default_library",
         "//pkg/kubelet/lifecycle:go_default_library",
         "//pkg/kubelet/types:go_default_library",
@@ -36,6 +36,8 @@ const (
 	SignalImageFsAvailable Signal = "imagefs.available"
 	// SignalImageFsInodesFree is the number of inodes available on the filesystem that the container runtime uses for storing images and container writable layers.
 	SignalImageFsInodesFree Signal = "imagefs.inodesFree"
+	// SignalAllocatableMemoryAvailable is the amount of memory available for pod allocation (i.e. allocatable - workingSet (of pods)), in bytes.
+	SignalAllocatableMemoryAvailable Signal = "allocatableMemory.available"
 )

 // ThresholdOperator is the operator used to express a Threshold.
@@ -134,9 +134,9 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
 }

 // Start starts the control loop to observe and respond to low compute resources.
-func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, monitoringInterval time.Duration) {
+func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration) {
 	// start the eviction manager monitoring
-	go wait.Until(func() { m.synchronize(diskInfoProvider, podFunc) }, monitoringInterval, wait.NeverStop)
+	go wait.Until(func() { m.synchronize(diskInfoProvider, podFunc, nodeProvider) }, monitoringInterval, wait.NeverStop)
 }

 // IsUnderMemoryPressure returns true if the node is under memory pressure.

@@ -187,7 +187,7 @@ func startMemoryThresholdNotifier(thresholds []evictionapi.Threshold, observations signalObservations,
 }

 // synchronize is the main control loop that enforces eviction thresholds.
-func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) {
+func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) {
 	// if we have nothing to do, just return
 	thresholds := m.config.Thresholds
 	if len(thresholds) == 0 {

@@ -209,7 +209,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc,
 	}

 	// make observations and get a function to derive pod usage stats relative to those observations.
-	observations, statsFunc, err := makeSignalObservations(m.summaryProvider)
+	observations, statsFunc, err := makeSignalObservations(m.summaryProvider, nodeProvider)
 	if err != nil {
 		glog.Errorf("eviction manager: unexpected err: %v", err)
 		return

@@ -224,7 +224,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc,
 	err = startMemoryThresholdNotifier(m.config.Thresholds, observations, false, func(desc string) {
 		glog.Infof("soft memory eviction threshold crossed at %s", desc)
 		// TODO wait grace period for soft memory limit
-		m.synchronize(diskInfoProvider, podFunc)
+		m.synchronize(diskInfoProvider, podFunc, nodeProvider)
 	})
 	if err != nil {
 		glog.Warningf("eviction manager: failed to create hard memory threshold notifier: %v", err)

@@ -232,7 +232,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc,
 	// start hard memory notification
 	err = startMemoryThresholdNotifier(m.config.Thresholds, observations, true, func(desc string) {
 		glog.Infof("hard memory eviction threshold crossed at %s", desc)
-		m.synchronize(diskInfoProvider, podFunc)
+		m.synchronize(diskInfoProvider, podFunc, nodeProvider)
 	})
 	if err != nil {
 		glog.Warningf("eviction manager: failed to create soft memory threshold notifier: %v", err)
@@ -59,6 +59,24 @@ func (m *mockDiskInfoProvider) HasDedicatedImageFs() (bool, error) {
 	return m.dedicatedImageFs, nil
 }

+func newMockNodeProvider(allocatableCapacity v1.ResourceList) *mockNodeProvider {
+	return &mockNodeProvider{
+		node: v1.Node{
+			Status: v1.NodeStatus{
+				Allocatable: allocatableCapacity,
+			},
+		},
+	}
+}
+
+type mockNodeProvider struct {
+	node v1.Node
+}
+
+func (m *mockNodeProvider) GetNode() (*v1.Node, error) {
+	return &m.node, nil
+}
+
 // mockImageGC is used to simulate invoking image garbage collection.
 type mockImageGC struct {
 	err error
@@ -175,6 +193,7 @@ func TestMemoryPressure(t *testing.T) {
 	fakeClock := clock.NewFakeClock(time.Now())
 	podKiller := &mockPodKiller{}
 	diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
+	nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
 	imageGC := &mockImageGC{freed: int64(0), err: nil}
 	nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}

@@ -217,7 +236,7 @@ func TestMemoryPressure(t *testing.T) {
 	burstablePodToAdmit, _ := podMaker("burst-admit", newResourceList("100m", "100Mi"), newResourceList("200m", "200Mi"), "0Gi")

 	// synchronize
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should not have memory pressure
 	if manager.IsUnderMemoryPressure() {

@@ -235,7 +254,7 @@ func TestMemoryPressure(t *testing.T) {
 	// induce soft threshold
 	fakeClock.Step(1 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("1500Mi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have memory pressure
 	if !manager.IsUnderMemoryPressure() {

@@ -250,7 +269,7 @@ func TestMemoryPressure(t *testing.T) {
 	// step forward in time past the grace period
 	fakeClock.Step(3 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("1500Mi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have memory pressure
 	if !manager.IsUnderMemoryPressure() {

@@ -275,7 +294,7 @@ func TestMemoryPressure(t *testing.T) {
 	// remove memory pressure
 	fakeClock.Step(20 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("3Gi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should not have memory pressure
 	if manager.IsUnderMemoryPressure() {

@@ -285,7 +304,7 @@ func TestMemoryPressure(t *testing.T) {
 	// induce memory pressure!
 	fakeClock.Step(1 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("500Mi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have memory pressure
 	if !manager.IsUnderMemoryPressure() {

@@ -313,7 +332,7 @@ func TestMemoryPressure(t *testing.T) {
 	fakeClock.Step(1 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("2Gi", podStats)
 	podKiller.pod = nil // reset state
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have memory pressure (because transition period not yet met)
 	if !manager.IsUnderMemoryPressure() {

@@ -337,7 +356,7 @@ func TestMemoryPressure(t *testing.T) {
 	fakeClock.Step(5 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("2Gi", podStats)
 	podKiller.pod = nil // reset state
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should not have memory pressure (because transition period met)
 	if manager.IsUnderMemoryPressure() {
@@ -392,6 +411,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
 	fakeClock := clock.NewFakeClock(time.Now())
 	podKiller := &mockPodKiller{}
 	diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
+	nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
 	imageGC := &mockImageGC{freed: int64(0), err: nil}
 	nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}

@@ -433,7 +453,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
 	podToAdmit, _ := podMaker("pod-to-admit", newResourceList("", ""), newResourceList("", ""), "0Gi", "0Gi", "0Gi")

 	// synchronize
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should not have disk pressure
 	if manager.IsUnderDiskPressure() {

@@ -448,7 +468,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
 	// induce soft threshold
 	fakeClock.Step(1 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("1.5Gi", "200Gi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have disk pressure
 	if !manager.IsUnderDiskPressure() {

@@ -463,7 +483,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
 	// step forward in time past the grace period
 	fakeClock.Step(3 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("1.5Gi", "200Gi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have disk pressure
 	if !manager.IsUnderDiskPressure() {

@@ -488,7 +508,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
 	// remove disk pressure
 	fakeClock.Step(20 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should not have disk pressure
 	if manager.IsUnderDiskPressure() {

@@ -498,7 +518,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
 	// induce disk pressure!
 	fakeClock.Step(1 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("500Mi", "200Gi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have disk pressure
 	if !manager.IsUnderDiskPressure() {

@@ -523,7 +543,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
 	fakeClock.Step(1 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
 	podKiller.pod = nil // reset state
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have disk pressure (because transition period not yet met)
 	if !manager.IsUnderDiskPressure() {

@@ -544,7 +564,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
 	fakeClock.Step(5 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
 	podKiller.pod = nil // reset state
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should not have disk pressure (because transition period met)
 	if manager.IsUnderDiskPressure() {
@@ -589,6 +609,7 @@ func TestMinReclaim(t *testing.T) {
 	fakeClock := clock.NewFakeClock(time.Now())
 	podKiller := &mockPodKiller{}
 	diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
+	nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
 	imageGC := &mockImageGC{freed: int64(0), err: nil}
 	nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}

@@ -622,7 +643,7 @@ func TestMinReclaim(t *testing.T) {
 	}

 	// synchronize
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should not have memory pressure
 	if manager.IsUnderMemoryPressure() {

@@ -632,7 +653,7 @@ func TestMinReclaim(t *testing.T) {
 	// induce memory pressure!
 	fakeClock.Step(1 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("500Mi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have memory pressure
 	if !manager.IsUnderMemoryPressure() {

@@ -652,7 +673,7 @@ func TestMinReclaim(t *testing.T) {
 	fakeClock.Step(1 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("1.2Gi", podStats)
 	podKiller.pod = nil // reset state
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have memory pressure (because transition period not yet met)
 	if !manager.IsUnderMemoryPressure() {

@@ -672,7 +693,7 @@ func TestMinReclaim(t *testing.T) {
 	fakeClock.Step(1 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("2Gi", podStats)
 	podKiller.pod = nil // reset state
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have memory pressure (because transition period not yet met)
 	if !manager.IsUnderMemoryPressure() {

@@ -688,7 +709,7 @@ func TestMinReclaim(t *testing.T) {
 	fakeClock.Step(5 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("2Gi", podStats)
 	podKiller.pod = nil // reset state
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should not have memory pressure (because transition period met)
 	if manager.IsUnderMemoryPressure() {
@@ -727,6 +748,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
 	fakeClock := clock.NewFakeClock(time.Now())
 	podKiller := &mockPodKiller{}
 	diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
+	nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
 	imageGcFree := resource.MustParse("700Mi")
 	imageGC := &mockImageGC{freed: imageGcFree.Value(), err: nil}
 	nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}

@@ -761,7 +783,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
 	}

 	// synchronize
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should not have disk pressure
 	if manager.IsUnderDiskPressure() {

@@ -771,7 +793,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
 	// induce hard threshold
 	fakeClock.Step(1 * time.Minute)
 	summaryProvider.result = summaryStatsMaker(".9Gi", "200Gi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have disk pressure
 	if !manager.IsUnderDiskPressure() {

@@ -794,7 +816,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
 	// remove disk pressure
 	fakeClock.Step(20 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should not have disk pressure
 	if manager.IsUnderDiskPressure() {

@@ -804,7 +826,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
 	// induce disk pressure!
 	fakeClock.Step(1 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("400Mi", "200Gi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have disk pressure
 	if !manager.IsUnderDiskPressure() {

@@ -830,7 +852,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
 	summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
 	imageGC.invoked = false // reset state
 	podKiller.pod = nil     // reset state
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have disk pressure (because transition period not yet met)
 	if !manager.IsUnderDiskPressure() {

@@ -852,7 +874,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
 	summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
 	imageGC.invoked = false // reset state
 	podKiller.pod = nil     // reset state
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should not have disk pressure (because transition period met)
 	if manager.IsUnderDiskPressure() {
@@ -920,6 +942,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
 	fakeClock := clock.NewFakeClock(time.Now())
 	podKiller := &mockPodKiller{}
 	diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
+	nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
 	imageGC := &mockImageGC{freed: int64(0), err: nil}
 	nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}

@@ -961,7 +984,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
 	podToAdmit, _ := podMaker("pod-to-admit", newResourceList("", ""), newResourceList("", ""), "0", "0", "0")

 	// synchronize
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should not have disk pressure
 	if manager.IsUnderDiskPressure() {

@@ -976,7 +999,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
 	// induce soft threshold
 	fakeClock.Step(1 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have disk pressure
 	if !manager.IsUnderDiskPressure() {

@@ -991,7 +1014,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
 	// step forward in time past the grace period
 	fakeClock.Step(3 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have disk pressure
 	if !manager.IsUnderDiskPressure() {

@@ -1016,7 +1039,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
 	// remove inode pressure
 	fakeClock.Step(20 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should not have disk pressure
 	if manager.IsUnderDiskPressure() {

@@ -1026,7 +1049,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
 	// induce inode pressure!
 	fakeClock.Step(1 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("0.5Mi", "4Mi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have disk pressure
 	if !manager.IsUnderDiskPressure() {

@@ -1051,7 +1074,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
 	fakeClock.Step(1 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats)
 	podKiller.pod = nil // reset state
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have disk pressure (because transition period not yet met)
 	if !manager.IsUnderDiskPressure() {

@@ -1072,7 +1095,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
 	fakeClock.Step(5 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats)
 	podKiller.pod = nil // reset state
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should not have disk pressure (because transition period met)
 	if manager.IsUnderDiskPressure() {
@@ -1120,6 +1143,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) {
 	fakeClock := clock.NewFakeClock(time.Now())
 	podKiller := &mockPodKiller{}
 	diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
+	nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
 	imageGC := &mockImageGC{freed: int64(0), err: nil}
 	nodeRef := &clientv1.ObjectReference{
 		Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: "",

@@ -1164,7 +1188,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) {
 	// induce soft threshold
 	fakeClock.Step(1 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("1500Mi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have memory pressure
 	if !manager.IsUnderMemoryPressure() {

@@ -1179,7 +1203,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) {
 	// step forward in time past the grace period
 	fakeClock.Step(3 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("1500Mi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have memory pressure
 	if !manager.IsUnderMemoryPressure() {

@@ -1197,7 +1221,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) {
 	// remove memory pressure
 	fakeClock.Step(20 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("3Gi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should not have memory pressure
 	if manager.IsUnderMemoryPressure() {

@@ -1210,7 +1234,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) {
 	// induce memory pressure!
 	fakeClock.Step(1 * time.Minute)
 	summaryProvider.result = summaryStatsMaker("500Mi", podStats)
-	manager.synchronize(diskInfoProvider, activePodsFunc)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)

 	// we should have memory pressure
 	if !manager.IsUnderMemoryPressure() {
@@ -1222,3 +1246,167 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) {
 		t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
 	}
 }
+
+// TestAllocatableMemoryPressure
+func TestAllocatableMemoryPressure(t *testing.T) {
+	podMaker := makePodWithMemoryStats
+	summaryStatsMaker := makeMemoryStats
+	constantCapacity := "4Gi"
+	podsToMake := []podToMake{
+		{name: "guaranteed-low", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), memoryWorkingSet: "200Mi"},
+		{name: "guaranteed-high", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), memoryWorkingSet: "400Mi"},
+		{name: "burstable-low", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi"), memoryWorkingSet: "300Mi"},
+		{name: "burstable-high", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi"), memoryWorkingSet: "500Mi"},
+		{name: "best-effort-low", requests: newResourceList("", ""), limits: newResourceList("", ""), memoryWorkingSet: "100Mi"},
+		{name: "best-effort-high", requests: newResourceList("", ""), limits: newResourceList("", ""), memoryWorkingSet: "200Mi"},
+	}
+	pods := []*v1.Pod{}
+	podStats := map[*v1.Pod]statsapi.PodStats{}
+	for _, podToMake := range podsToMake {
+		pod, podStat := podMaker(podToMake.name, podToMake.requests, podToMake.limits, podToMake.memoryWorkingSet)
+		pods = append(pods, pod)
+		podStats[pod] = podStat
+	}
+	podToEvict := pods[5]
+	activePodsFunc := func() []*v1.Pod {
+		return pods
+	}
+
+	fakeClock := clock.NewFakeClock(time.Now())
+	podKiller := &mockPodKiller{}
+	diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
+	nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("2Gi")})
+	imageGC := &mockImageGC{freed: int64(0), err: nil}
+	nodeRef := &clientv1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
+
+	config := Config{
+		MaxPodGracePeriodSeconds: 5,
+		PressureTransitionPeriod: time.Minute * 5,
+		Thresholds: []evictionapi.Threshold{
+			{
+				Signal:   evictionapi.SignalAllocatableMemoryAvailable,
+				Operator: evictionapi.OpLessThan,
+				Value: evictionapi.ThresholdValue{
+					Quantity: quantityMustParse("1Ki"),
+				},
+			},
+		},
+	}
+	summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker(constantCapacity, podStats)}
+	manager := &managerImpl{
+		clock:                        fakeClock,
+		killPodFunc:                  podKiller.killPodNow,
+		imageGC:                      imageGC,
+		config:                       config,
+		recorder:                     &record.FakeRecorder{},
+		summaryProvider:              summaryProvider,
+		nodeRef:                      nodeRef,
+		nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
+		thresholdsFirstObservedAt:    thresholdsObservedAt{},
+	}
+
+	// create a best effort pod to test admission
+	bestEffortPodToAdmit, _ := podMaker("best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi")
+	burstablePodToAdmit, _ := podMaker("burst-admit", newResourceList("100m", "100Mi"), newResourceList("200m", "200Mi"), "0Gi")
+
+	// synchronize
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
+
+	// we should not have memory pressure
+	if manager.IsUnderMemoryPressure() {
+		t.Errorf("Manager should not report memory pressure")
+	}
+
+	// try to admit our pods (they should succeed)
+	expected := []bool{true, true}
+	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
+		if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
+			t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
+		}
+	}
+
+	// induce memory pressure!
+	fakeClock.Step(1 * time.Minute)
+	pod, podStat := podMaker("guaranteed-high-2", newResourceList("100m", "1Gi"), newResourceList("100m", "1Gi"), "1Gi")
+	podStats[pod] = podStat
+	summaryProvider.result = summaryStatsMaker(constantCapacity, podStats)
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
+
+	// we should have memory pressure
+	if !manager.IsUnderMemoryPressure() {
+		t.Errorf("Manager should report memory pressure")
+	}
+
+	// check the right pod was killed
+	if podKiller.pod != podToEvict {
+		t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
+	}
+	observedGracePeriod := *podKiller.gracePeriodOverride
+	if observedGracePeriod != int64(0) {
+		t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
+	}
+	// reset state
+	podKiller.pod = nil
+	podKiller.gracePeriodOverride = nil
+
+	// the best-effort pod should not admit, burstable should
+	expected = []bool{false, true}
+	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
+		if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
+			t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
+		}
+	}
+
+	// reduce memory pressure
+	fakeClock.Step(1 * time.Minute)
+	for pod := range podStats {
+		if pod.Name == "guaranteed-high-2" {
+			delete(podStats, pod)
+		}
+	}
+	summaryProvider.result = summaryStatsMaker(constantCapacity, podStats)
+	podKiller.pod = nil // reset state
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
+
+	// we should have memory pressure (because transition period not yet met)
+	if !manager.IsUnderMemoryPressure() {
+		t.Errorf("Manager should report memory pressure")
+	}
+
+	// no pod should have been killed
+	if podKiller.pod != nil {
+		t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name)
+	}
+
+	// the best-effort pod should not admit, burstable should
+	expected = []bool{false, true}
+	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
+		if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
+			t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
+		}
+	}
+
+	// move the clock past transition period to ensure that we stop reporting pressure
+	fakeClock.Step(5 * time.Minute)
+	summaryProvider.result = summaryStatsMaker(constantCapacity, podStats)
+	podKiller.pod = nil // reset state
+	manager.synchronize(diskInfoProvider, activePodsFunc, nodeProvider)
+
+	// we should not have memory pressure (because transition period met)
+	if manager.IsUnderMemoryPressure() {
+		t.Errorf("Manager should not report memory pressure")
+	}
+
+	// no pod should have been killed
+	if podKiller.pod != nil {
+		t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name)
+	}
+
+	// all pods should admit now
+	expected = []bool{true, true}
+	for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
+		if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
+			t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
+		}
+	}
+}
@@ -29,6 +29,7 @@ import (
 	"k8s.io/kubernetes/pkg/api"
 	"k8s.io/kubernetes/pkg/api/v1"
 	statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats"
+	"k8s.io/kubernetes/pkg/kubelet/cm"
 	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
 	"k8s.io/kubernetes/pkg/kubelet/qos"
 	"k8s.io/kubernetes/pkg/kubelet/server/stats"

@@ -68,6 +69,7 @@ func init() {
 	// map eviction signals to node conditions
 	signalToNodeCondition = map[evictionapi.Signal]v1.NodeConditionType{}
 	signalToNodeCondition[evictionapi.SignalMemoryAvailable] = v1.NodeMemoryPressure
+	signalToNodeCondition[evictionapi.SignalAllocatableMemoryAvailable] = v1.NodeMemoryPressure
 	signalToNodeCondition[evictionapi.SignalImageFsAvailable] = v1.NodeDiskPressure
 	signalToNodeCondition[evictionapi.SignalNodeFsAvailable] = v1.NodeDiskPressure
 	signalToNodeCondition[evictionapi.SignalImageFsInodesFree] = v1.NodeDiskPressure

@@ -76,6 +78,7 @@ func init() {
 	// map signals to resources (and vice-versa)
 	signalToResource = map[evictionapi.Signal]v1.ResourceName{}
 	signalToResource[evictionapi.SignalMemoryAvailable] = v1.ResourceMemory
+	signalToResource[evictionapi.SignalAllocatableMemoryAvailable] = v1.ResourceMemory
 	signalToResource[evictionapi.SignalImageFsAvailable] = resourceImageFs
 	signalToResource[evictionapi.SignalImageFsInodesFree] = resourceImageFsInodes
 	signalToResource[evictionapi.SignalNodeFsAvailable] = resourceNodeFs

@@ -93,8 +96,10 @@ func validSignal(signal evictionapi.Signal) bool {
 }

 // ParseThresholdConfig parses the flags for thresholds.
-func ParseThresholdConfig(evictionHard, evictionSoft, evictionSoftGracePeriod, evictionMinimumReclaim string) ([]evictionapi.Threshold, error) {
+func ParseThresholdConfig(allocatableConfig []string, evictionHard, evictionSoft, evictionSoftGracePeriod, evictionMinimumReclaim string) ([]evictionapi.Threshold, error) {
 	results := []evictionapi.Threshold{}
+	allocatableThresholds := getAllocatableThreshold(allocatableConfig)
+	results = append(results, allocatableThresholds...)

 	hardThresholds, err := parseThresholdStatements(evictionHard)
 	if err != nil {

@@ -214,6 +219,27 @@ func parseThresholdStatement(statement string) (evictionapi.Threshold, error) {
 	}, nil
 }

+// getAllocatableThreshold returns the thresholds applicable for the allocatable configuration
+func getAllocatableThreshold(allocatableConfig []string) []evictionapi.Threshold {
+	for _, key := range allocatableConfig {
+		if key == cm.NodeAllocatableEnforcementKey {
+			return []evictionapi.Threshold{
+				{
+					Signal:   evictionapi.SignalAllocatableMemoryAvailable,
+					Operator: evictionapi.OpLessThan,
+					Value: evictionapi.ThresholdValue{
+						Quantity: resource.NewQuantity(int64(0), resource.BinarySI),
+					},
+					MinReclaim: &evictionapi.ThresholdValue{
+						Quantity: resource.NewQuantity(int64(0), resource.BinarySI),
+					},
+				},
+			}
+		}
+	}
+	return []evictionapi.Threshold{}
+}
+
 // parsePercentage parses a string representing a percentage value
 func parsePercentage(input string) (float32, error) {
 	value, err := strconv.ParseFloat(strings.TrimRight(input, "%"), 32)

@@ -611,11 +637,15 @@ func (a byEvictionPriority) Less(i, j int) bool {
 }

 // makeSignalObservations derives observations using the specified summary provider.
-func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObservations, statsFunc, error) {
+func makeSignalObservations(summaryProvider stats.SummaryProvider, nodeProvider NodeProvider) (signalObservations, statsFunc, error) {
 	summary, err := summaryProvider.Get()
 	if err != nil {
 		return nil, nil, err
 	}
+	node, err := nodeProvider.GetNode()
+	if err != nil {
+		return nil, nil, err
+	}

 	// build the function to work against for pod stats
 	statsFunc := cachedStatsFunc(summary.Pods)

@@ -663,6 +693,19 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObservations, statsFunc, error) {
 			}
 		}
 	}
+	if memoryAllocatableCapacity, ok := node.Status.Allocatable[v1.ResourceMemory]; ok {
+		memoryAllocatableAvailable := memoryAllocatableCapacity.Copy()
+		for _, pod := range summary.Pods {
+			mu, err := podMemoryUsage(pod)
+			if err == nil {
+				memoryAllocatableAvailable.Sub(mu[v1.ResourceMemory])
+			}
+		}
+		result[evictionapi.SignalAllocatableMemoryAvailable] = signalObservation{
+			available: memoryAllocatableAvailable,
+			capacity:  memoryAllocatableCapacity.Copy(),
+		}
+	}
 	return result, statsFunc, nil
 }
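To make the new entry point concrete, here is a hedged usage sketch of the changed `ParseThresholdConfig` signature; the flag values are illustrative (taken from the test table below), and the import paths assume the in-tree layout at the time of this PR:

```go
package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/kubelet/cm"
	"k8s.io/kubernetes/pkg/kubelet/eviction"
)

func main() {
	// Passing cm.NodeAllocatableEnforcementKey in the new first argument makes
	// getAllocatableThreshold prepend a zero-quantity allocatableMemory.available
	// threshold to the parsed hard/soft thresholds.
	thresholds, err := eviction.ParseThresholdConfig(
		[]string{cm.NodeAllocatableEnforcementKey}, // allocatableConfig
		"memory.available<150Mi",                   // eviction-hard
		"",                                         // eviction-soft
		"",                                         // eviction-soft grace periods
		"",                                         // eviction minimum reclaim
	)
	if err != nil {
		panic(err)
	}
	fmt.Printf("parsed %d thresholds\n", len(thresholds)) // 2 with the values above
}
```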
@@ -28,6 +28,7 @@ import (
 	"k8s.io/kubernetes/pkg/api"
 	"k8s.io/kubernetes/pkg/api/v1"
 	statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats"
+	"k8s.io/kubernetes/pkg/kubelet/cm"
 	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
 	"k8s.io/kubernetes/pkg/quota"
 )

@@ -40,6 +41,7 @@ func quantityMustParse(value string) *resource.Quantity {
 func TestParseThresholdConfig(t *testing.T) {
 	gracePeriod, _ := time.ParseDuration("30s")
 	testCases := map[string]struct {
+		allocatableConfig       []string
 		evictionHard            string
 		evictionSoft            string
 		evictionSoftGracePeriod string

@@ -48,6 +50,7 @@ func TestParseThresholdConfig(t *testing.T) {
 		expectThresholds []evictionapi.Threshold
 	}{
 		"no values": {
+			allocatableConfig:       []string{},
 			evictionHard:            "",
 			evictionSoft:            "",
 			evictionSoftGracePeriod: "",

@@ -56,12 +59,23 @@ func TestParseThresholdConfig(t *testing.T) {
 			expectThresholds: []evictionapi.Threshold{},
 		},
 		"all flag values": {
+			allocatableConfig:       []string{cm.NodeAllocatableEnforcementKey},
 			evictionHard:            "memory.available<150Mi",
 			evictionSoft:            "memory.available<300Mi",
 			evictionSoftGracePeriod: "memory.available=30s",
 			evictionMinReclaim:      "memory.available=0",
 			expectErr:               false,
 			expectThresholds: []evictionapi.Threshold{
+				{
+					Signal:   evictionapi.SignalAllocatableMemoryAvailable,
+					Operator: evictionapi.OpLessThan,
+					Value: evictionapi.ThresholdValue{
+						Quantity: quantityMustParse("0"),
+					},
+					MinReclaim: &evictionapi.ThresholdValue{
+						Quantity: quantityMustParse("0"),
+					},
+				},
 				{
 					Signal:   evictionapi.SignalMemoryAvailable,
 					Operator: evictionapi.OpLessThan,

@@ -86,6 +100,7 @@ func TestParseThresholdConfig(t *testing.T) {
 			},
 		},
 		"all flag values in percentages": {
+			allocatableConfig:       []string{},
 			evictionHard:            "memory.available<10%",
 			evictionSoft:            "memory.available<30%",
 			evictionSoftGracePeriod: "memory.available=30s",

@@ -116,6 +131,7 @@ func TestParseThresholdConfig(t *testing.T) {
 			},
 		},
 		"disk flag values": {
+			allocatableConfig:       []string{},
 			evictionHard:            "imagefs.available<150Mi,nodefs.available<100Mi",
 			evictionSoft:            "imagefs.available<300Mi,nodefs.available<200Mi",
 			evictionSoftGracePeriod: "imagefs.available=30s,nodefs.available=30s",

@@ -167,6 +183,7 @@ func TestParseThresholdConfig(t *testing.T) {
 			},
 		},
 		"disk flag values in percentages": {
+			allocatableConfig:       []string{},
 			evictionHard:            "imagefs.available<15%,nodefs.available<10.5%",
 			evictionSoft:            "imagefs.available<30%,nodefs.available<20.5%",
 			evictionSoftGracePeriod: "imagefs.available=30s,nodefs.available=30s",

@@ -218,6 +235,7 @@ func TestParseThresholdConfig(t *testing.T) {
 			},
 		},
 		"inode flag values": {
+			allocatableConfig:       []string{},
 			evictionHard:            "imagefs.inodesFree<150Mi,nodefs.inodesFree<100Mi",
 			evictionSoft:            "imagefs.inodesFree<300Mi,nodefs.inodesFree<200Mi",
 			evictionSoftGracePeriod: "imagefs.inodesFree=30s,nodefs.inodesFree=30s",

@@ -269,6 +287,7 @@ func TestParseThresholdConfig(t *testing.T) {
 			},
 		},
 		"invalid-signal": {
+			allocatableConfig:       []string{},
 			evictionHard:            "mem.available<150Mi",
 			evictionSoft:            "",
 			evictionSoftGracePeriod: "",

@@ -277,6 +296,7 @@ func TestParseThresholdConfig(t *testing.T) {
 			expectThresholds: []evictionapi.Threshold{},
 		},
 		"hard-signal-negative": {
+			allocatableConfig:       []string{},
 			evictionHard:            "memory.available<-150Mi",
 			evictionSoft:            "",
 			evictionSoftGracePeriod: "",

@@ -285,6 +305,7 @@ func TestParseThresholdConfig(t *testing.T) {
 			expectThresholds: []evictionapi.Threshold{},
 		},
 		"hard-signal-negative-percentage": {
+			allocatableConfig:       []string{},
 			evictionHard:            "memory.available<-15%",
 			evictionSoft:            "",
 			evictionSoftGracePeriod: "",

@@ -293,6 +314,7 @@ func TestParseThresholdConfig(t *testing.T) {
 			expectThresholds: []evictionapi.Threshold{},
 		},
 		"soft-signal-negative": {
+			allocatableConfig:       []string{},
 			evictionHard:            "",
 			evictionSoft:            "memory.available<-150Mi",
 			evictionSoftGracePeriod: "",

@@ -301,6 +323,7 @@ func TestParseThresholdConfig(t *testing.T) {
 			expectThresholds: []evictionapi.Threshold{},
 		},
 		"duplicate-signal": {
+			allocatableConfig:       []string{},
 			evictionHard:            "memory.available<150Mi,memory.available<100Mi",
 			evictionSoft:            "",
 			evictionSoftGracePeriod: "",

@@ -309,6 +332,7 @@ func TestParseThresholdConfig(t *testing.T) {
 			expectThresholds: []evictionapi.Threshold{},
 		},
 		"valid-and-invalid-signal": {
+			allocatableConfig:       []string{},
 			evictionHard:            "memory.available<150Mi,invalid.foo<150Mi",
 			evictionSoft:            "",
 			evictionSoftGracePeriod: "",

@@ -317,6 +341,7 @@ func TestParseThresholdConfig(t *testing.T) {
 			expectThresholds: []evictionapi.Threshold{},
 		},
 		"soft-no-grace-period": {
+			allocatableConfig:       []string{},
 			evictionHard:            "",
 			evictionSoft:            "memory.available<150Mi",
 			evictionSoftGracePeriod: "",

@@ -325,6 +350,7 @@ func TestParseThresholdConfig(t *testing.T) {
 			expectThresholds: []evictionapi.Threshold{},
 		},
 		"soft-neg-grace-period": {
+			allocatableConfig:       []string{},
 			evictionHard:            "",
 			evictionSoft:            "memory.available<150Mi",
 			evictionSoftGracePeriod: "memory.available=-30s",

@@ -333,6 +359,7 @@ func TestParseThresholdConfig(t *testing.T) {
 			expectThresholds: []evictionapi.Threshold{},
 		},
 		"neg-reclaim": {
+			allocatableConfig:       []string{},
 			evictionHard:            "",
 			evictionSoft:            "",
 			evictionSoftGracePeriod: "",

@@ -341,6 +368,7 @@ func TestParseThresholdConfig(t *testing.T) {
 			expectThresholds: []evictionapi.Threshold{},
 		},
 		"duplicate-reclaim": {
+			allocatableConfig:       []string{},
 			evictionHard:            "",
 			evictionSoft:            "",
 			evictionSoftGracePeriod: "",

@@ -350,7 +378,7 @@ func TestParseThresholdConfig(t *testing.T) {
 		},
 	}
 	for testName, testCase := range testCases {
-		thresholds, err := ParseThresholdConfig(testCase.evictionHard, testCase.evictionSoft, testCase.evictionSoftGracePeriod, testCase.evictionMinReclaim)
+		thresholds, err := ParseThresholdConfig(testCase.allocatableConfig, testCase.evictionHard, testCase.evictionSoft, testCase.evictionSoftGracePeriod, testCase.evictionMinReclaim)
 		if testCase.expectErr != (err != nil) {
 			t.Errorf("Err not as expected, test: %v, error expected: %v, actual: %v", testName, testCase.expectErr, err)
 		}

@@ -699,6 +727,7 @@ func TestMakeSignalObservations(t *testing.T) {
 	}
 	nodeAvailableBytes := uint64(1024 * 1024 * 1024)
 	nodeWorkingSetBytes := uint64(1024 * 1024 * 1024)
+	allocatableMemoryCapacity := uint64(5 * 1024 * 1024 * 1024)
 	imageFsAvailableBytes := uint64(1024 * 1024)
 	imageFsCapacityBytes := uint64(1024 * 1024 * 2)
 	nodeFsAvailableBytes := uint64(1024)

@@ -738,15 +767,31 @@ func TestMakeSignalObservations(t *testing.T) {
 		podMaker("pod1", "ns2", "uuid2", 1),
 		podMaker("pod3", "ns3", "uuid3", 1),
 	}
-	containerWorkingSetBytes := int64(1024 * 1024)
+	containerWorkingSetBytes := int64(1024 * 1024 * 1024)
 	for _, pod := range pods {
 		fakeStats.Pods = append(fakeStats.Pods, newPodStats(pod, containerWorkingSetBytes))
 	}
-	actualObservations, statsFunc, err := makeSignalObservations(provider)
+	res := quantityMustParse("5Gi")
+	nodeProvider := newMockNodeProvider(v1.ResourceList{v1.ResourceMemory: *res})
+	// Allocatable thresholds are always 100%. Verify that Threshold == Capacity.
+	if res.CmpInt64(int64(allocatableMemoryCapacity)) != 0 {
+		t.Errorf("Expected Threshold %v to be equal to value %v", res.Value(), allocatableMemoryCapacity)
+	}
+	actualObservations, statsFunc, err := makeSignalObservations(provider, nodeProvider)

 	if err != nil {
 		t.Errorf("Unexpected err: %v", err)
 	}
+	allocatableMemQuantity, found := actualObservations[evictionapi.SignalAllocatableMemoryAvailable]
+	if !found {
+		t.Errorf("Expected allocatable memory observation, but didn't find one")
+	}
+	if allocatableMemQuantity.available.Value() != 2*containerWorkingSetBytes {
+		t.Errorf("Expected %v, actual: %v", containerWorkingSetBytes, allocatableMemQuantity.available.Value())
+	}
+	if expectedBytes := int64(allocatableMemoryCapacity); allocatableMemQuantity.capacity.Value() != expectedBytes {
+		t.Errorf("Expected %v, actual: %v", expectedBytes, allocatableMemQuantity.capacity.Value())
+	}
 	memQuantity, found := actualObservations[evictionapi.SignalMemoryAvailable]
 	if !found {
 		t.Errorf("Expected available memory observation: %v", err)
@@ -53,7 +53,7 @@ type Config struct {
 // Manager evaluates when an eviction threshold for node stability has been met on the node.
 type Manager interface {
 	// Start starts the control loop to monitor eviction thresholds at specified interval.
-	Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, monitoringInterval time.Duration)
+	Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider, monitoringInterval time.Duration)

 	// IsUnderMemoryPressure returns true if the node is under memory pressure.
 	IsUnderMemoryPressure() bool

@@ -68,6 +68,12 @@ type DiskInfoProvider interface {
 	HasDedicatedImageFs() (bool, error)
 }

+// NodeProvider is responsible for providing the node api object describing this node
+type NodeProvider interface {
+	// GetNode returns the node info for this node
+	GetNode() (*v1.Node, error)
+}
+
 // ImageGC is responsible for performing garbage collection of unused images.
 type ImageGC interface {
 	// DeleteUnusedImages deletes unused images and returns the number of bytes freed, or an error.
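Any type with a matching `GetNode` method satisfies `NodeProvider`: the kubelet change below passes the `Kubelet` itself as the provider, and the unit tests use the `mockNodeProvider` shown earlier. A minimal hypothetical implementation for illustration only (assuming the same `pkg/api/v1` import as the surrounding code):

```go
// staticNodeProvider serves a fixed node object (hypothetical, not part of
// this PR); it is the smallest possible NodeProvider.
type staticNodeProvider struct {
	node *v1.Node
}

func (p *staticNodeProvider) GetNode() (*v1.Node, error) {
	return p.node, nil
}
```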
@@ -349,7 +349,12 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *KubeletDeps,
 		RootFreeDiskMB: int(kubeCfg.LowDiskSpaceThresholdMB),
 	}

-	thresholds, err := eviction.ParseThresholdConfig(kubeCfg.EvictionHard, kubeCfg.EvictionSoft, kubeCfg.EvictionSoftGracePeriod, kubeCfg.EvictionMinimumReclaim)
+	enforceNodeAllocatable := kubeCfg.EnforceNodeAllocatable
+	if kubeCfg.ExperimentalNodeAllocatableIgnoreEvictionThreshold {
+		// Do not provide kubeCfg.EnforceNodeAllocatable to eviction threshold parsing if we are not enforcing Evictions
+		enforceNodeAllocatable = []string{}
+	}
+	thresholds, err := eviction.ParseThresholdConfig(enforceNodeAllocatable, kubeCfg.EvictionHard, kubeCfg.EvictionSoft, kubeCfg.EvictionSoftGracePeriod, kubeCfg.EvictionMinimumReclaim)
 	if err != nil {
 		return nil, err
 	}

@@ -1222,7 +1227,7 @@ func (kl *Kubelet) initializeRuntimeDependentModules() {
 		glog.Fatalf("Failed to start cAdvisor %v", err)
 	}
 	// eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs
-	kl.evictionManager.Start(kl, kl.getActivePods, evictionMonitoringPeriod)
+	kl.evictionManager.Start(kl, kl.getActivePods, kl, evictionMonitoringPeriod)
 }

 // Run starts the kubelet reacting to config updates
@@ -57,6 +57,7 @@ go_library(
 go_test(
     name = "go_default_test",
     srcs = [
+        "allocatable_eviction_test.go",
         "apparmor_test.go",
        "container_manager_test.go",
        "critical_pod_test.go",
test/e2e_node/allocatable_eviction_test.go (new file, 104 lines)
@@ -0,0 +1,104 @@
/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e_node

import (
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/kubernetes/pkg/api/v1"
	"k8s.io/kubernetes/pkg/apis/componentconfig"
	"k8s.io/kubernetes/pkg/kubelet/cm"
	"k8s.io/kubernetes/test/e2e/framework"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
)

// Eviction Policy is described here:
// https://github.com/kubernetes/kubernetes/blob/master/docs/proposals/kubelet-eviction.md

var _ = framework.KubeDescribe("AllocatableEviction [Slow] [Serial] [Disruptive] [Flaky]", func() {
	f := framework.NewDefaultFramework("allocatable-eviction-test")

	podTestSpecs := []podTestSpec{
		{
			evictionPriority: 1, // This pod should be evicted before the innocent pod
			pod:              *getMemhogPod("memory-hog-pod", "memory-hog", v1.ResourceRequirements{}),
		},
		{
			evictionPriority: 0, // This pod should never be evicted
			pod: v1.Pod{
				ObjectMeta: metav1.ObjectMeta{Name: "innocent-pod"},
				Spec: v1.PodSpec{
					RestartPolicy: v1.RestartPolicyNever,
					Containers: []v1.Container{
						{
							Image: "gcr.io/google_containers/busybox:1.24",
							Name:  "normal-memory-usage-container",
							Command: []string{
								"sh",
								"-c", // make one big (5 GB) file
								"dd if=/dev/urandom of=largefile bs=5000000000 count=1; while true; do sleep 5; done",
							},
						},
					},
				},
			},
		},
	}
	evictionTestTimeout := 40 * time.Minute
	testCondition := "Memory Pressure"
	kubeletConfigUpdate := func(initialConfig *componentconfig.KubeletConfiguration) {
		initialConfig.EvictionHard = "memory.available<10%"
		// Set large system and kube reserved values to trigger allocatable thresholds far before hard eviction thresholds.
		initialConfig.SystemReserved = componentconfig.ConfigurationMap(map[string]string{"memory": "1Gi"})
		initialConfig.KubeReserved = componentconfig.ConfigurationMap(map[string]string{"memory": "1Gi"})
		initialConfig.EnforceNodeAllocatable = []string{cm.NodeAllocatableEnforcementKey}
		initialConfig.ExperimentalNodeAllocatableIgnoreEvictionThreshold = false
		initialConfig.CgroupsPerQOS = true
	}
	runEvictionTest(f, testCondition, podTestSpecs, evictionTestTimeout, hasMemoryPressure, kubeletConfigUpdate)
})

// Returns TRUE if the node has Memory Pressure, FALSE otherwise
func hasMemoryPressure(f *framework.Framework, testCondition string) (bool, error) {
	localNodeStatus := getLocalNode(f).Status
	_, pressure := v1.GetNodeCondition(&localNodeStatus, v1.NodeMemoryPressure)
	Expect(pressure).NotTo(BeNil())
	hasPressure := pressure.Status == v1.ConditionTrue
	By(fmt.Sprintf("checking if pod has %s: %v", testCondition, hasPressure))

	// Additional logging relating to memory
	summary, err := getNodeSummary()
	if err != nil {
		return false, err
	}
	if summary.Node.Memory != nil && summary.Node.Memory.WorkingSetBytes != nil && summary.Node.Memory.AvailableBytes != nil {
		framework.Logf("Node.Memory.WorkingSetBytes: %d, summary.Node.Memory.AvailableBytes: %d", *summary.Node.Memory.WorkingSetBytes, *summary.Node.Memory.AvailableBytes)
	}
	for _, pod := range summary.Pods {
		framework.Logf("Pod: %s", pod.PodRef.Name)
		for _, container := range pod.Containers {
			if container.Memory != nil && container.Memory.WorkingSetBytes != nil {
				framework.Logf("--- summary Container: %s WorkingSetBytes: %d", container.Name, *container.Memory.WorkingSetBytes)
			}
		}
	}
	return hasPressure, nil
}
@@ -22,6 +22,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/kubernetes/pkg/api/v1"
+	"k8s.io/kubernetes/pkg/apis/componentconfig"
 	"k8s.io/kubernetes/test/e2e/framework"

 	. "github.com/onsi/ginkgo"

@@ -112,10 +113,11 @@ var _ = framework.KubeDescribe("InodeEviction [Slow] [Serial] [Disruptive] [Flaky]", func() {
 	}
 	evictionTestTimeout := 30 * time.Minute
 	testCondition := "Disk Pressure due to Inodes"
-	// Set the EvictionHard threshold lower to decrease test time
-	evictionHardLimit := "nodefs.inodesFree<50%"
+	kubeletConfigUpdate := func(initialConfig *componentconfig.KubeletConfiguration) {
+		initialConfig.EvictionHard = "nodefs.inodesFree<50%"
+	}

-	runEvictionTest(f, testCondition, podTestSpecs, evictionHardLimit, evictionTestTimeout, hasInodePressure)
+	runEvictionTest(f, testCondition, podTestSpecs, evictionTestTimeout, hasInodePressure, kubeletConfigUpdate)
 })

 // Struct used by runEvictionTest that specifies the pod, and when that pod should be evicted, relative to other pods

@@ -133,12 +135,12 @@ type podTestSpec struct {
 // It ensures that lower evictionPriority pods are always evicted before higher evictionPriority pods (2 evicted before 1, etc.)
 // It ensures that all lower evictionPriority pods are eventually evicted.
 // runEvictionTest then cleans up the testing environment by deleting provided nodes, and ensures that testCondition no longer exists
-func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs []podTestSpec, evictionHard string,
-	evictionTestTimeout time.Duration, hasPressureCondition func(*framework.Framework, string) (bool, error)) {
+func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs []podTestSpec, evictionTestTimeout time.Duration,
+	hasPressureCondition func(*framework.Framework, string) (bool, error), updateFunction func(initialConfig *componentconfig.KubeletConfiguration)) {

 	Context(fmt.Sprintf("when we run containers that should cause %s", testCondition), func() {

-		tempSetEvictionHard(f, evictionHard)
+		tempSetCurrentKubeletConfig(f, updateFunction)
 		BeforeEach(func() {
 			By("setting up pods to be used by tests")
 			for _, spec := range podTestSpecs {

@@ -148,6 +150,11 @@ func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs []podTestSpec,
 		})

 		It(fmt.Sprintf("should eventually see %s, and then evict all of the correct pods", testCondition), func() {
+			configEnabled, err := isKubeletConfigEnabled(f)
+			framework.ExpectNoError(err)
+			if !configEnabled {
+				framework.Skipf("Dynamic kubelet config must be enabled for this test to run.")
+			}
 			Eventually(func() error {
 				hasPressure, err := hasPressureCondition(f, testCondition)
 				if err != nil {

@@ -299,14 +306,8 @@ func runEvictionTest(f *framework.Framework, testCondition string, podTestSpecs []podTestSpec,

 // Returns TRUE if the node has disk pressure due to inodes, FALSE otherwise
 func hasInodePressure(f *framework.Framework, testCondition string) (bool, error) {
-
-	nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{})
-	framework.ExpectNoError(err, "getting node list")
-	if len(nodeList.Items) != 1 {
-		return false, fmt.Errorf("expected 1 node, but see %d. List: %v", len(nodeList.Items), nodeList.Items)
-	}
-
-	_, pressure := v1.GetNodeCondition(&nodeList.Items[0].Status, v1.NodeDiskPressure)
+	localNodeStatus := getLocalNode(f).Status
+	_, pressure := v1.GetNodeCondition(&localNodeStatus, v1.NodeDiskPressure)
 	Expect(pressure).NotTo(BeNil())
 	hasPressure := pressure.Status == v1.ConditionTrue
 	By(fmt.Sprintf("checking if pod has %s: %v", testCondition, hasPressure))
@@ -136,7 +136,7 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", func() {
 	By("creating a guaranteed pod, a burstable pod, and a besteffort pod.")

 	// A pod is guaranteed only when requests and limits are specified for all the containers and they are equal.
-	guaranteed := createMemhogPod(f, "guaranteed-", "guaranteed", v1.ResourceRequirements{
+	guaranteed := getMemhogPod("guaranteed-pod", "guaranteed", v1.ResourceRequirements{
 		Requests: v1.ResourceList{
 			"cpu":    resource.MustParse("100m"),
 			"memory": resource.MustParse("100Mi"),

@@ -145,16 +145,22 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", func() {
 			"cpu":    resource.MustParse("100m"),
 			"memory": resource.MustParse("100Mi"),
 		}})
+	guaranteed = f.PodClient().CreateSync(guaranteed)
+	glog.Infof("pod created with name: %s", guaranteed.Name)

 	// A pod is burstable if limits and requests do not match across all containers.
-	burstable := createMemhogPod(f, "burstable-", "burstable", v1.ResourceRequirements{
+	burstable := getMemhogPod("burstable-pod", "burstable", v1.ResourceRequirements{
 		Requests: v1.ResourceList{
 			"cpu":    resource.MustParse("100m"),
 			"memory": resource.MustParse("100Mi"),
 		}})
+	burstable = f.PodClient().CreateSync(burstable)
+	glog.Infof("pod created with name: %s", burstable.Name)

-	// A pod is besteffort if none of its containers have specified any requests or limits.
-	besteffort := createMemhogPod(f, "besteffort-", "besteffort", v1.ResourceRequirements{})
+	// A pod is besteffort if none of its containers have specified any requests or limits.
+	besteffort := getMemhogPod("besteffort-pod", "besteffort", v1.ResourceRequirements{})
+	besteffort = f.PodClient().CreateSync(besteffort)
+	glog.Infof("pod created with name: %s", besteffort.Name)

 	// We poll until timeout or all pods are killed.
 	// Inside the func, we check that all pods are in a valid phase with

@@ -232,7 +238,7 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", func() {

 })

-func createMemhogPod(f *framework.Framework, genName string, ctnName string, res v1.ResourceRequirements) *v1.Pod {
+func getMemhogPod(podName string, ctnName string, res v1.ResourceRequirements) *v1.Pod {
 	env := []v1.EnvVar{
 		{
 			Name: "MEMORY_LIMIT",

@@ -256,9 +262,9 @@ func createMemhogPod(f *framework.Framework, genName string, ctnName string, res v1.ResourceRequirements) *v1.Pod {
 		memLimit = "$(MEMORY_LIMIT)"
 	}

-	pod := &v1.Pod{
+	return &v1.Pod{
 		ObjectMeta: metav1.ObjectMeta{
-			GenerateName: genName,
+			Name: podName,
 		},
 		Spec: v1.PodSpec{
 			RestartPolicy: v1.RestartPolicyNever,

@@ -277,8 +283,4 @@ func createMemhogPod(f *framework.Framework, genName string, ctnName string, res v1.ResourceRequirements) *v1.Pod {
 			},
 		},
 	}
-	// The generated pod.Name will be on the pod spec returned by CreateSync
-	pod = f.PodClient().CreateSync(pod)
-	glog.Infof("pod created with name: %s", pod.Name)
-	return pod
 }
@@ -86,13 +86,6 @@ func getCurrentKubeletConfig() (*componentconfig.KubeletConfiguration, error) {
 	return kubeCfg, nil
 }

-// Convenience method to set the evictionHard threshold during the current context.
-func tempSetEvictionHard(f *framework.Framework, evictionHard string) {
-	tempSetCurrentKubeletConfig(f, func(initialConfig *componentconfig.KubeletConfiguration) {
-		initialConfig.EvictionHard = evictionHard
-	})
-}
-
 // Must be called within a Context. Allows the function to modify the KubeletConfiguration during the BeforeEach of the context.
 // The change is reverted in the AfterEach of the context.
 // Returns true on success.