diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index adb4ee85bd4..c6fa59e0fa2 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -40,6 +40,8 @@ type managerImpl struct { config Config // the function to invoke to kill a pod killPodFunc KillPodFunc + // the interface that knows how to do image gc + imageGC ImageGC // protects access to internal state sync.RWMutex // node conditions are the set of conditions present @@ -54,8 +56,12 @@ type managerImpl struct { summaryProvider stats.SummaryProvider // records when a threshold was first observed thresholdsFirstObservedAt thresholdsObservedAt + // records the set of thresholds that have been met (including graceperiod) but not yet resolved + thresholdsMet []Threshold // resourceToRankFunc maps a resource to ranking function for that resource. resourceToRankFunc map[api.ResourceName]rankFunc + // resourceToNodeReclaimFuncs maps a resource to an ordered list of functions that know how to reclaim that resource. + resourceToNodeReclaimFuncs map[api.ResourceName]nodeReclaimFuncs } // ensure it implements the required interface @@ -66,12 +72,14 @@ func NewManager( summaryProvider stats.SummaryProvider, config Config, killPodFunc KillPodFunc, + imageGC ImageGC, recorder record.EventRecorder, nodeRef *api.ObjectReference, clock clock.Clock) (Manager, lifecycle.PodAdmitHandler, error) { manager := &managerImpl{ clock: clock, killPodFunc: killPodFunc, + imageGC: imageGC, config: config, recorder: recorder, summaryProvider: summaryProvider, @@ -138,13 +146,14 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act // build the ranking functions (if not yet known) // TODO: have a function in cadvisor that lets us know if global housekeeping has completed - if len(m.resourceToRankFunc) == 0 { + if len(m.resourceToRankFunc) == 0 || len(m.resourceToNodeReclaimFuncs) == 0 { // this may error if cadvisor has yet to complete housekeeping, so we will just try again in next pass. hasDedicatedImageFs, err := diskInfoProvider.HasDedicatedImageFs() if err != nil { return } m.resourceToRankFunc = buildResourceToRankFunc(hasDedicatedImageFs) + m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, hasDedicatedImageFs) } // make observations and get a function to derive pod usage stats relative to those observations. 
@@ -158,7 +167,13 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act now := m.clock.Now() // determine the set of thresholds met independent of grace period - thresholds = thresholdsMet(thresholds, observations) + thresholds = thresholdsMet(thresholds, observations, false) + + // determine the set of thresholds previously met that have not yet satisfied the associated min-reclaim + if len(m.thresholdsMet) > 0 { + thresholdsNotYetResolved := thresholdsMet(m.thresholdsMet, observations, true) + thresholds = mergeThresholds(thresholds, thresholdsNotYetResolved) + } // track when a threshold was first observed thresholdsFirstObservedAt := thresholdsFirstObservedAt(thresholds, m.thresholdsFirstObservedAt, now) @@ -180,10 +195,11 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act m.nodeConditions = nodeConditions m.thresholdsFirstObservedAt = thresholdsFirstObservedAt m.nodeConditionsLastObservedAt = nodeConditionsLastObservedAt + m.thresholdsMet = thresholds m.Unlock() // determine the set of resources under starvation - starvedResources := reclaimResources(thresholds) + starvedResources := getStarvedResources(thresholds) if len(starvedResources) == 0 { glog.V(3).Infof("eviction manager: no resources are starved") return @@ -200,6 +216,14 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act // record an event about the resources we are now attempting to reclaim via eviction m.recorder.Eventf(m.nodeRef, api.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim) + // check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods. + if m.reclaimNodeLevelResources(resourceToReclaim, observations) { + glog.Infof("eviction manager: able to reduce %v pressure without evicting pods.", resourceToReclaim) + return + } + + glog.Infof("eviction manager: must evict pod(s) to reclaim %v", resourceToReclaim) + // rank the pods for eviction rank, ok := m.resourceToRankFunc[resourceToReclaim] if !ok { @@ -245,3 +269,31 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act } glog.Infof("eviction manager: unable to evict any pods from the node") } + +// reclaimNodeLevelResources attempts to reclaim node level resources. returns true if thresholds were satisfied and no pod eviction is required. +func (m *managerImpl) reclaimNodeLevelResources(resourceToReclaim api.ResourceName, observations signalObservations) bool { + nodeReclaimFuncs := m.resourceToNodeReclaimFuncs[resourceToReclaim] + for _, nodeReclaimFunc := range nodeReclaimFuncs { + // attempt to reclaim the pressured resource. + reclaimed, err := nodeReclaimFunc() + if err == nil { + // update our local observations based on the amount reported to have been reclaimed. + // note: this is optimistic, other things could have been still consuming the pressured resource in the interim. 
+ signal := resourceToSignal[resourceToReclaim] + value, ok := observations[signal] + if !ok { + glog.Errorf("eviction manager: unable to find value associated with signal %v", signal) + continue + } + value.Add(*reclaimed) + + // evaluate all current thresholds to see if with adjusted observations, we think we have met min reclaim goals + if len(thresholdsMet(m.thresholdsMet, observations, true)) == 0 { + return true + } + } else { + glog.Errorf("eviction manager: unexpected error when attempting to reduce %v pressure: %v", resourceToReclaim, err) + } + } + return false +} diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go index 0d8c365838e..9870a654f19 100644 --- a/pkg/kubelet/eviction/eviction_manager_test.go +++ b/pkg/kubelet/eviction/eviction_manager_test.go @@ -54,6 +54,19 @@ func (m *mockDiskInfoProvider) HasDedicatedImageFs() (bool, error) { return m.dedicatedImageFs, nil } +// mockImageGC is used to simulate invoking image garbage collection. +type mockImageGC struct { + err error + freed int64 + invoked bool +} + +// DeleteUnusedImages returns the mocked values. +func (m *mockImageGC) DeleteUnusedImages() (int64, error) { + m.invoked = true + return m.freed, m.err +} + // TestMemoryPressure func TestMemoryPressure(t *testing.T) { podMaker := func(name string, requests api.ResourceList, limits api.ResourceList, memoryWorkingSet string) (*api.Pod, statsapi.PodStats) { @@ -106,6 +119,7 @@ func TestMemoryPressure(t *testing.T) { fakeClock := clock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} + imageGC := &mockImageGC{freed: int64(0), err: nil} nodeRef := &api.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} config := Config{ @@ -129,6 +143,7 @@ func TestMemoryPressure(t *testing.T) { manager := &managerImpl{ clock: fakeClock, killPodFunc: podKiller.killPodNow, + imageGC: imageGC, config: config, recorder: &record.FakeRecorder{}, summaryProvider: summaryProvider, @@ -351,6 +366,7 @@ func TestDiskPressureNodeFs(t *testing.T) { fakeClock := clock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} + imageGC := &mockImageGC{freed: int64(0), err: nil} nodeRef := &api.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} config := Config{ @@ -374,6 +390,7 @@ func TestDiskPressureNodeFs(t *testing.T) { manager := &managerImpl{ clock: fakeClock, killPodFunc: podKiller.killPodNow, + imageGC: imageGC, config: config, recorder: &record.FakeRecorder{}, summaryProvider: summaryProvider, @@ -514,3 +531,362 @@ func TestDiskPressureNodeFs(t *testing.T) { t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) } } + +// TestMinReclaim verifies that min-reclaim works as desired. 
+func TestMinReclaim(t *testing.T) { + podMaker := func(name string, requests api.ResourceList, limits api.ResourceList, memoryWorkingSet string) (*api.Pod, statsapi.PodStats) { + pod := newPod(name, []api.Container{ + newContainer(name, requests, limits), + }, nil) + podStats := newPodMemoryStats(pod, resource.MustParse(memoryWorkingSet)) + return pod, podStats + } + summaryStatsMaker := func(nodeAvailableBytes string, podStats map[*api.Pod]statsapi.PodStats) *statsapi.Summary { + val := resource.MustParse(nodeAvailableBytes) + availableBytes := uint64(val.Value()) + result := &statsapi.Summary{ + Node: statsapi.NodeStats{ + Memory: &statsapi.MemoryStats{ + AvailableBytes: &availableBytes, + }, + }, + Pods: []statsapi.PodStats{}, + } + for _, podStat := range podStats { + result.Pods = append(result.Pods, podStat) + } + return result + } + podsToMake := []struct { + name string + requests api.ResourceList + limits api.ResourceList + memoryWorkingSet string + }{ + {name: "best-effort-high", requests: newResourceList("", ""), limits: newResourceList("", ""), memoryWorkingSet: "500Mi"}, + {name: "best-effort-low", requests: newResourceList("", ""), limits: newResourceList("", ""), memoryWorkingSet: "300Mi"}, + {name: "burstable-high", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi"), memoryWorkingSet: "800Mi"}, + {name: "burstable-low", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi"), memoryWorkingSet: "300Mi"}, + {name: "guaranteed-high", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), memoryWorkingSet: "800Mi"}, + {name: "guaranteed-low", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), memoryWorkingSet: "200Mi"}, + } + pods := []*api.Pod{} + podStats := map[*api.Pod]statsapi.PodStats{} + for _, podToMake := range podsToMake { + pod, podStat := podMaker(podToMake.name, podToMake.requests, podToMake.limits, podToMake.memoryWorkingSet) + pods = append(pods, pod) + podStats[pod] = podStat + } + activePodsFunc := func() []*api.Pod { + return pods + } + + fakeClock := clock.NewFakeClock(time.Now()) + podKiller := &mockPodKiller{} + diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} + imageGC := &mockImageGC{freed: int64(0), err: nil} + nodeRef := &api.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} + + config := Config{ + MaxPodGracePeriodSeconds: 5, + PressureTransitionPeriod: time.Minute * 5, + Thresholds: []Threshold{ + { + Signal: SignalMemoryAvailable, + Operator: OpLessThan, + Value: quantityMustParse("1Gi"), + MinReclaim: quantityMustParse("500Mi"), + }, + }, + } + summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("2Gi", podStats)} + manager := &managerImpl{ + clock: fakeClock, + killPodFunc: podKiller.killPodNow, + imageGC: imageGC, + config: config, + recorder: &record.FakeRecorder{}, + summaryProvider: summaryProvider, + nodeRef: nodeRef, + nodeConditionsLastObservedAt: nodeConditionsObservedAt{}, + thresholdsFirstObservedAt: thresholdsObservedAt{}, + } + + // synchronize + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should not have memory pressure + if manager.IsUnderMemoryPressure() { + t.Errorf("Manager should not report memory pressure") + } + + // induce memory pressure! 
+ fakeClock.Step(1 * time.Minute) + summaryProvider.result = summaryStatsMaker("500Mi", podStats) + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should have memory pressure + if !manager.IsUnderMemoryPressure() { + t.Errorf("Manager should report memory pressure") + } + + // check the right pod was killed + if podKiller.pod != pods[0] { + t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, pods[0]) + } + observedGracePeriod := *podKiller.gracePeriodOverride + if observedGracePeriod != int64(0) { + t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod) + } + + // reduce memory pressure, but not below the min-reclaim amount + fakeClock.Step(1 * time.Minute) + summaryProvider.result = summaryStatsMaker("1.2Gi", podStats) + podKiller.pod = nil // reset state + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should have memory pressure (because transition period not yet met) + if !manager.IsUnderMemoryPressure() { + t.Errorf("Manager should report memory pressure") + } + + // check the right pod was killed + if podKiller.pod != pods[0] { + t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, pods[0]) + } + observedGracePeriod = *podKiller.gracePeriodOverride + if observedGracePeriod != int64(0) { + t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod) + } + + // reduce memory pressure and ensure the min-reclaim amount + fakeClock.Step(1 * time.Minute) + summaryProvider.result = summaryStatsMaker("2Gi", podStats) + podKiller.pod = nil // reset state + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should have memory pressure (because transition period not yet met) + if !manager.IsUnderMemoryPressure() { + t.Errorf("Manager should report memory pressure") + } + + // no pod should have been killed + if podKiller.pod != nil { + t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod) + } + + // move the clock past transition period to ensure that we stop reporting pressure + fakeClock.Step(5 * time.Minute) + summaryProvider.result = summaryStatsMaker("2Gi", podStats) + podKiller.pod = nil // reset state + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should not have memory pressure (because transition period met) + if manager.IsUnderMemoryPressure() { + t.Errorf("Manager should not report memory pressure") + } + + // no pod should have been killed + if podKiller.pod != nil { + t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod) + } +} + +func TestNodeReclaimFuncs(t *testing.T) { + podMaker := func(name string, requests api.ResourceList, limits api.ResourceList, rootFsUsed, logsUsed, perLocalVolumeUsed string) (*api.Pod, statsapi.PodStats) { + pod := newPod(name, []api.Container{ + newContainer(name, requests, limits), + }, nil) + podStats := newPodDiskStats(pod, parseQuantity(rootFsUsed), parseQuantity(logsUsed), parseQuantity(perLocalVolumeUsed)) + return pod, podStats + } + summaryStatsMaker := func(rootFsAvailableBytes, imageFsAvailableBytes string, podStats map[*api.Pod]statsapi.PodStats) *statsapi.Summary { + rootFsVal := resource.MustParse(rootFsAvailableBytes) + rootFsBytes := uint64(rootFsVal.Value()) + imageFsVal := resource.MustParse(imageFsAvailableBytes) + imageFsBytes := uint64(imageFsVal.Value()) + result := &statsapi.Summary{ + Node: statsapi.NodeStats{ + 
Fs: &statsapi.FsStats{ + AvailableBytes: &rootFsBytes, + }, + Runtime: &statsapi.RuntimeStats{ + ImageFs: &statsapi.FsStats{ + AvailableBytes: &imageFsBytes, + }, + }, + }, + Pods: []statsapi.PodStats{}, + } + for _, podStat := range podStats { + result.Pods = append(result.Pods, podStat) + } + return result + } + podsToMake := []struct { + name string + requests api.ResourceList + limits api.ResourceList + rootFsUsed string + logsFsUsed string + perLocalVolumeUsed string + }{ + {name: "best-effort-high", requests: newResourceList("", ""), limits: newResourceList("", ""), rootFsUsed: "500Mi"}, + {name: "best-effort-low", requests: newResourceList("", ""), limits: newResourceList("", ""), perLocalVolumeUsed: "300Mi"}, + {name: "burstable-high", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi"), rootFsUsed: "800Mi"}, + {name: "burstable-low", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi"), logsFsUsed: "300Mi"}, + {name: "guaranteed-high", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), rootFsUsed: "800Mi"}, + {name: "guaranteed-low", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), rootFsUsed: "200Mi"}, + } + pods := []*api.Pod{} + podStats := map[*api.Pod]statsapi.PodStats{} + for _, podToMake := range podsToMake { + pod, podStat := podMaker(podToMake.name, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed) + pods = append(pods, pod) + podStats[pod] = podStat + } + activePodsFunc := func() []*api.Pod { + return pods + } + + fakeClock := clock.NewFakeClock(time.Now()) + podKiller := &mockPodKiller{} + diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} + imageGcFree := resource.MustParse("700Mi") + imageGC := &mockImageGC{freed: imageGcFree.Value(), err: nil} + nodeRef := &api.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} + + config := Config{ + MaxPodGracePeriodSeconds: 5, + PressureTransitionPeriod: time.Minute * 5, + Thresholds: []Threshold{ + { + Signal: SignalNodeFsAvailable, + Operator: OpLessThan, + Value: quantityMustParse("1Gi"), + MinReclaim: quantityMustParse("500Mi"), + }, + }, + } + summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("16Gi", "200Gi", podStats)} + manager := &managerImpl{ + clock: fakeClock, + killPodFunc: podKiller.killPodNow, + imageGC: imageGC, + config: config, + recorder: &record.FakeRecorder{}, + summaryProvider: summaryProvider, + nodeRef: nodeRef, + nodeConditionsLastObservedAt: nodeConditionsObservedAt{}, + thresholdsFirstObservedAt: thresholdsObservedAt{}, + } + + // synchronize + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should not have disk pressure + if manager.IsUnderDiskPressure() { + t.Errorf("Manager should not report disk pressure") + } + + // induce hard threshold + fakeClock.Step(1 * time.Minute) + summaryProvider.result = summaryStatsMaker(".9Gi", "200Gi", podStats) + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should have disk pressure + if !manager.IsUnderDiskPressure() { + t.Errorf("Manager should report disk pressure since hard threshold was met") + } + + // verify image gc was invoked + if !imageGC.invoked { + t.Errorf("Manager should have invoked image gc") + } + + // verify no pod was killed because image gc was sufficient + if podKiller.pod != nil { + t.Errorf("Manager should not have killed a pod, but killed: %v", podKiller.pod) + } + 
+ // reset state + imageGC.invoked = false + + // remove disk pressure + fakeClock.Step(20 * time.Minute) + summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats) + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should not have disk pressure + if manager.IsUnderDiskPressure() { + t.Errorf("Manager should not report disk pressure") + } + + // induce disk pressure! + fakeClock.Step(1 * time.Minute) + summaryProvider.result = summaryStatsMaker("400Mi", "200Gi", podStats) + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should have disk pressure + if !manager.IsUnderDiskPressure() { + t.Errorf("Manager should report disk pressure") + } + + // ensure image gc was invoked + if !imageGC.invoked { + t.Errorf("Manager should have invoked image gc") + } + + // check the right pod was killed + if podKiller.pod != pods[0] { + t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, pods[0]) + } + observedGracePeriod := *podKiller.gracePeriodOverride + if observedGracePeriod != int64(0) { + t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod) + } + + // reduce disk pressure + fakeClock.Step(1 * time.Minute) + summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats) + imageGC.invoked = false // reset state + podKiller.pod = nil // reset state + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should have disk pressure (because transition period not yet met) + if !manager.IsUnderDiskPressure() { + t.Errorf("Manager should report disk pressure") + } + + // no image gc should have occurred + if imageGC.invoked { + t.Errorf("Manager chose to perform image gc when it was not needed") + } + + // no pod should have been killed + if podKiller.pod != nil { + t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod) + } + + // move the clock past transition period to ensure that we stop reporting pressure + fakeClock.Step(5 * time.Minute) + summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats) + imageGC.invoked = false // reset state + podKiller.pod = nil // reset state + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should not have disk pressure (because transition period met) + if manager.IsUnderDiskPressure() { + t.Errorf("Manager should not report disk pressure") + } + + // no image gc should have occurred + if imageGC.invoked { + t.Errorf("Manager chose to perform image gc when it was not needed") + } + + // no pod should have been killed + if podKiller.pod != nil { + t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod) + } +} diff --git a/pkg/kubelet/eviction/helpers.go b/pkg/kubelet/eviction/helpers.go index 2709153cd8c..6f04d600a8c 100644 --- a/pkg/kubelet/eviction/helpers.go +++ b/pkg/kubelet/eviction/helpers.go @@ -47,18 +47,31 @@ const ( resourceNodeFs api.ResourceName = "nodefs" ) -// signalToNodeCondition maps a signal to the node condition to report if threshold is met. -var signalToNodeCondition = map[Signal]api.NodeConditionType{ - SignalMemoryAvailable: api.NodeMemoryPressure, - SignalImageFsAvailable: api.NodeDiskPressure, - SignalNodeFsAvailable: api.NodeDiskPressure, -} +var ( + // signalToNodeCondition maps a signal to the node condition to report if threshold is met. + signalToNodeCondition map[Signal]api.NodeConditionType + // signalToResource maps a Signal to its associated Resource. 
+ signalToResource map[Signal]api.ResourceName + // resourceToSignal maps a Resource to its associated Signal + resourceToSignal map[api.ResourceName]Signal +) -// signalToResource maps a Signal to its associated Resource. -var signalToResource = map[Signal]api.ResourceName{ - SignalMemoryAvailable: api.ResourceMemory, - SignalImageFsAvailable: resourceImageFs, - SignalNodeFsAvailable: resourceNodeFs, +func init() { + // map eviction signals to node conditions + signalToNodeCondition = map[Signal]api.NodeConditionType{} + signalToNodeCondition[SignalMemoryAvailable] = api.NodeMemoryPressure + signalToNodeCondition[SignalImageFsAvailable] = api.NodeDiskPressure + signalToNodeCondition[SignalNodeFsAvailable] = api.NodeDiskPressure + + // map signals to resources (and vice-versa) + signalToResource = map[Signal]api.ResourceName{} + signalToResource[SignalMemoryAvailable] = api.ResourceMemory + signalToResource[SignalImageFsAvailable] = resourceImageFs + signalToResource[SignalNodeFsAvailable] = resourceNodeFs + resourceToSignal = map[api.ResourceName]Signal{} + for key, value := range signalToResource { + resourceToSignal[value] = key + } } // validSignal returns true if the signal is supported. @@ -534,7 +547,7 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObserv } // thresholdsMet returns the set of thresholds that were met independent of grace period -func thresholdsMet(thresholds []Threshold, observations signalObservations) []Threshold { +func thresholdsMet(thresholds []Threshold, observations signalObservations, enforceMinReclaim bool) []Threshold { results := []Threshold{} for i := range thresholds { threshold := thresholds[i] @@ -545,7 +558,12 @@ func thresholdsMet(thresholds []Threshold, observations signalObservations) []Th } // determine if we have met the specified threshold thresholdMet := false - thresholdResult := threshold.Value.Cmp(*observed) + quantity := threshold.Value.Copy() + // if enforceMinReclaim is specified, we compare relative to value + MinReclaim so the threshold stays met until at least MinReclaim has been reclaimed + if enforceMinReclaim && threshold.MinReclaim != nil { + quantity.Add(*threshold.MinReclaim) + } + thresholdResult := quantity.Cmp(*observed) switch threshold.Operator { case OpLessThan: thresholdMet = thresholdResult > 0 @@ -589,7 +607,9 @@ func nodeConditions(thresholds []Threshold) []api.NodeConditionType { results := []api.NodeConditionType{} for _, threshold := range thresholds { if nodeCondition, found := signalToNodeCondition[threshold.Signal]; found { - results = append(results, nodeCondition) + if !hasNodeCondition(results, nodeCondition) { + results = append(results, nodeCondition) + } } } return results @@ -644,7 +664,18 @@ func hasNodeCondition(inputs []api.NodeConditionType, item api.NodeConditionType return false } -// hasThreshold returns true if the node condition is in the input list +// mergeThresholds will merge both threshold lists eliminating duplicates. 
+func mergeThresholds(inputsA []Threshold, inputsB []Threshold) []Threshold { + results := inputsA + for _, threshold := range inputsB { + if !hasThreshold(results, threshold) { + results = append(results, threshold) + } + } + return results +} + +// hasThreshold returns true if the threshold is in the input list func hasThreshold(inputs []Threshold, item Threshold) bool { for _, input := range inputs { if input.GracePeriod == item.GracePeriod && input.Operator == item.Operator && input.Signal == item.Signal && input.Value.Cmp(*item.Value) == 0 { @@ -654,8 +685,8 @@ func hasThreshold(inputs []Threshold, item Threshold) bool { return false } -// reclaimResources returns the set of resources that are starved based on thresholds met. -func reclaimResources(thresholds []Threshold) []api.ResourceName { +// getStarvedResources returns the set of resources that are starved based on thresholds met. +func getStarvedResources(thresholds []Threshold) []api.ResourceName { results := []api.ResourceName{} for _, threshold := range thresholds { if starvedResource, found := signalToResource[threshold.Signal]; found { @@ -699,3 +730,39 @@ func buildResourceToRankFunc(withImageFs bool) map[api.ResourceName]rankFunc { func PodIsEvicted(podStatus api.PodStatus) bool { return podStatus.Phase == api.PodFailed && podStatus.Reason == reason } + +// buildResourceToNodeReclaimFuncs returns reclaim functions associated with resources. +func buildResourceToNodeReclaimFuncs(imageGC ImageGC, withImageFs bool) map[api.ResourceName]nodeReclaimFuncs { + resourceToReclaimFunc := map[api.ResourceName]nodeReclaimFuncs{} + // usage of an imagefs is optional + if withImageFs { + // with an imagefs, nodefs pressure should just delete logs + resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs()} + // with an imagefs, imagefs pressure should delete unused images + resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteImages(imageGC)} + } else { + // without an imagefs, nodefs pressure should delete logs, and unused images + resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC)} + } + return resourceToReclaimFunc +} + +// deleteLogs will delete logs to free up disk pressure. +func deleteLogs() nodeReclaimFunc { + return func() (*resource.Quantity, error) { + // TODO: not yet supported. + return resource.NewQuantity(int64(0), resource.BinarySI), nil + } +} + +// deleteImages will delete unused images to free up disk pressure. 
+func deleteImages(imageGC ImageGC) nodeReclaimFunc { + return func() (*resource.Quantity, error) { + glog.Infof("eviction manager: attempting to delete unused images") + reclaimed, err := imageGC.DeleteUnusedImages() + if err != nil { + return nil, err + } + return resource.NewQuantity(reclaimed, resource.BinarySI), nil + } +} diff --git a/pkg/kubelet/eviction/helpers_test.go b/pkg/kubelet/eviction/helpers_test.go index 93713ddb7f1..aeb6cd0ea09 100644 --- a/pkg/kubelet/eviction/helpers_test.go +++ b/pkg/kubelet/eviction/helpers_test.go @@ -585,29 +585,50 @@ func TestMakeSignalObservations(t *testing.T) { func TestThresholdsMet(t *testing.T) { hardThreshold := Threshold{ - Signal: SignalMemoryAvailable, - Operator: OpLessThan, - Value: quantityMustParse("1Gi"), + Signal: SignalMemoryAvailable, + Operator: OpLessThan, + Value: quantityMustParse("1Gi"), + MinReclaim: quantityMustParse("500Mi"), } testCases := map[string]struct { - thresholds []Threshold - observations signalObservations - result []Threshold + enforceMinReclaim bool + thresholds []Threshold + observations signalObservations + result []Threshold }{ "empty": { - thresholds: []Threshold{}, - observations: signalObservations{}, - result: []Threshold{}, + enforceMinReclaim: false, + thresholds: []Threshold{}, + observations: signalObservations{}, + result: []Threshold{}, }, "threshold-met": { - thresholds: []Threshold{hardThreshold}, + enforceMinReclaim: false, + thresholds: []Threshold{hardThreshold}, observations: signalObservations{ SignalMemoryAvailable: quantityMustParse("500Mi"), }, result: []Threshold{hardThreshold}, }, "threshold-not-met": { - thresholds: []Threshold{hardThreshold}, + enforceMinReclaim: false, + thresholds: []Threshold{hardThreshold}, + observations: signalObservations{ + SignalMemoryAvailable: quantityMustParse("2Gi"), + }, + result: []Threshold{}, + }, + "threshold-met-with-min-reclaim": { + enforceMinReclaim: true, + thresholds: []Threshold{hardThreshold}, + observations: signalObservations{ + SignalMemoryAvailable: quantityMustParse("1.05Gi"), + }, + result: []Threshold{hardThreshold}, + }, + "threshold-not-met-with-min-reclaim": { + enforceMinReclaim: true, + thresholds: []Threshold{hardThreshold}, observations: signalObservations{ SignalMemoryAvailable: quantityMustParse("2Gi"), }, @@ -615,7 +636,7 @@ func TestThresholdsMet(t *testing.T) { }, } for testName, testCase := range testCases { - actual := thresholdsMet(testCase.thresholds, testCase.observations) + actual := thresholdsMet(testCase.thresholds, testCase.observations, testCase.enforceMinReclaim) if !thresholdList(actual).Equal(thresholdList(testCase.result)) { t.Errorf("Test case: %s, expected: %v, actual: %v", testName, testCase.result, actual) } @@ -851,7 +872,7 @@ func TestHasNodeConditions(t *testing.T) { } } -func TestReclaimResources(t *testing.T) { +func TestGetStarvedResources(t *testing.T) { testCases := map[string]struct { inputs []Threshold result []api.ResourceName @@ -876,7 +897,7 @@ func TestReclaimResources(t *testing.T) { }, } for testName, testCase := range testCases { - actual := reclaimResources(testCase.inputs) + actual := getStarvedResources(testCase.inputs) actualSet := quota.ToSet(actual) expectedSet := quota.ToSet(testCase.result) if !actualSet.Equal(expectedSet) { diff --git a/pkg/kubelet/eviction/types.go b/pkg/kubelet/eviction/types.go index 50085439dd5..897aae3e93d 100644 --- a/pkg/kubelet/eviction/types.go +++ b/pkg/kubelet/eviction/types.go @@ -98,6 +98,12 @@ type DiskInfoProvider interface { HasDedicatedImageFs() 
(bool, error) } +// ImageGC is responsible for performing garbage collection of unused images. +type ImageGC interface { + // DeleteUnusedImages deletes unused images and returns the number of bytes freed, or an error. + DeleteUnusedImages() (int64, error) +} + // KillPodFunc kills a pod. // The pod status is updated, and then it is killed with the specified grace period. // This function must block until either the pod is killed or an error is encountered. @@ -124,3 +130,9 @@ type thresholdsObservedAt map[Threshold]time.Time // nodeConditionsObservedAt maps a node condition to a time that it was observed type nodeConditionsObservedAt map[api.NodeConditionType]time.Time + +// nodeReclaimFunc is a function that knows how to reclaim a resource from the node without impacting pods. +type nodeReclaimFunc func() (*resource.Quantity, error) + +// nodeReclaimFuncs is an ordered list of nodeReclaimFunc +type nodeReclaimFuncs []nodeReclaimFunc diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 4959032a5ca..526f1a92c92 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -537,7 +537,7 @@ func NewMainKubelet( klet.setNodeStatusFuncs = klet.defaultNodeStatusFuncs() // setup eviction manager - evictionManager, evictionAdmitHandler, err := eviction.NewManager(klet.resourceAnalyzer, evictionConfig, killPodNow(klet.podWorkers), recorder, nodeRef, klet.clock) + evictionManager, evictionAdmitHandler, err := eviction.NewManager(klet.resourceAnalyzer, evictionConfig, killPodNow(klet.podWorkers), klet.imageManager, recorder, nodeRef, klet.clock) if err != nil { return nil, fmt.Errorf("failed to initialize eviction manager: %v", err) } diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go index c6a5a737dda..a7df5585412 100644 --- a/pkg/kubelet/kubelet_test.go +++ b/pkg/kubelet/kubelet_test.go @@ -225,7 +225,7 @@ func newTestKubeletWithImageList( Namespace: "", } // setup eviction manager - evictionManager, evictionAdmitHandler, err := eviction.NewManager(kubelet.resourceAnalyzer, eviction.Config{}, killPodNow(kubelet.podWorkers), fakeRecorder, nodeRef, kubelet.clock) + evictionManager, evictionAdmitHandler, err := eviction.NewManager(kubelet.resourceAnalyzer, eviction.Config{}, killPodNow(kubelet.podWorkers), kubelet.imageManager, fakeRecorder, nodeRef, kubelet.clock) if err != nil { t.Fatalf("failed to initialize eviction manager: %v", err) } diff --git a/pkg/kubelet/runonce_test.go b/pkg/kubelet/runonce_test.go index 56a8bd6fa6d..7c922a4feec 100644 --- a/pkg/kubelet/runonce_test.go +++ b/pkg/kubelet/runonce_test.go @@ -114,7 +114,7 @@ func TestRunOnce(t *testing.T) { fakeKillPodFunc := func(pod *api.Pod, podStatus api.PodStatus, gracePeriodOverride *int64) error { return nil } - evictionManager, evictionAdmitHandler, err := eviction.NewManager(kb.resourceAnalyzer, eviction.Config{}, fakeKillPodFunc, kb.recorder, nodeRef, kb.clock) + evictionManager, evictionAdmitHandler, err := eviction.NewManager(kb.resourceAnalyzer, eviction.Config{}, fakeKillPodFunc, nil, kb.recorder, nodeRef, kb.clock) if err != nil { t.Fatalf("failed to initialize eviction manager: %v", err) }
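
A minimal standalone sketch (not part of the patch) of the min-reclaim comparison introduced by the thresholdsMet change above, using hypothetical simplified types (threshold, metOpLessThan) and plain byte counts in place of resource.Quantity: once a threshold has been met, it keeps reporting as met until the observed available amount rises past value + MinReclaim.

package main

import "fmt"

// threshold is a hypothetical, simplified stand-in for the eviction Threshold type.
type threshold struct {
	value      int64 // available bytes must drop below this for the threshold to be met (OpLessThan)
	minReclaim int64 // once met, it stays met until available climbs back above value+minReclaim
}

// metOpLessThan mirrors the enforceMinReclaim branch added to thresholdsMet, reduced to byte counts.
func metOpLessThan(t threshold, observedAvailable int64, enforceMinReclaim bool) bool {
	limit := t.value
	if enforceMinReclaim {
		limit += t.minReclaim
	}
	return observedAvailable < limit
}

func main() {
	t := threshold{value: 1 << 30 /* 1Gi */, minReclaim: 500 << 20 /* 500Mi */}
	fmt.Println(metOpLessThan(t, 500<<20, false)) // true: below the 1Gi threshold
	fmt.Println(metOpLessThan(t, 1200<<20, true)) // true: above 1Gi, but the 500Mi min-reclaim is not yet satisfied
	fmt.Println(metOpLessThan(t, 2<<30, true))    // false: value+minReclaim satisfied, pressure resolved
}

This is the behavior TestMinReclaim exercises: at 1.2Gi available the manager still evicts a pod, while at 2Gi it does not.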
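
The ImageGC interface added in types.go is small, so test doubles other than mockImageGC are easy to build; below is a sketch in which the interface is copied from this patch but the fixedImageGC type is hypothetical, a stub that always reports a fixed number of bytes freed.

package main

import "fmt"

// ImageGC mirrors the interface added in pkg/kubelet/eviction/types.go by this patch.
type ImageGC interface {
	// DeleteUnusedImages deletes unused images and returns the number of bytes freed, or an error.
	DeleteUnusedImages() (int64, error)
}

// fixedImageGC is a hypothetical stub useful in tests or prototypes: it always
// reports that a fixed number of bytes was reclaimed.
type fixedImageGC struct {
	freed int64
}

func (f *fixedImageGC) DeleteUnusedImages() (int64, error) {
	return f.freed, nil
}

func main() {
	var gc ImageGC = &fixedImageGC{freed: 700 << 20} // pretend 700Mi was freed, matching the mock in the tests
	freed, err := gc.DeleteUnusedImages()
	fmt.Println(freed, err)
}

Any value satisfying this interface can be passed as the new imageGC argument to eviction.NewManager; the kubelet wires in klet.imageManager, and runonce_test.go passes nil.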
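
Finally, a compressed sketch of the ordering that reclaimNodeLevelResources establishes: the ordered node-level reclaim functions (log deletion, then image GC when there is no dedicated imagefs) run first, and pods are ranked and evicted only if the shortfall remains. The reclaimOrEvict helper and its arguments are hypothetical simplifications, not the manager's actual API.

package main

import "fmt"

// nodeReclaimFunc mirrors the type added in types.go: reclaim a node-level
// resource and report how many bytes were (optimistically) freed.
type nodeReclaimFunc func() (freedBytes int64, err error)

// reclaimOrEvict runs node-level reclaim first and falls back to pod eviction
// only when the remaining shortfall has not been covered.
func reclaimOrEvict(shortfall int64, reclaimFuncs []nodeReclaimFunc, evictPod func()) {
	for _, reclaim := range reclaimFuncs {
		freed, err := reclaim()
		if err != nil {
			fmt.Println("node-level reclaim failed:", err)
			continue
		}
		shortfall -= freed
		if shortfall <= 0 {
			fmt.Println("pressure relieved without evicting pods")
			return
		}
	}
	fmt.Println("must evict pod(s) to reclaim resource")
	evictPod()
}

func main() {
	deleteLogs := func() (int64, error) { return 0, nil }      // log deletion is a no-op in this patch (TODO upstream)
	imageGC := func() (int64, error) { return 700 << 20, nil } // pretend image GC freed 700Mi
	reclaimOrEvict(500<<20, []nodeReclaimFunc{deleteLogs, imageGC}, func() { fmt.Println("evicted") })
}

In the real manager the "shortfall" check is done by adding the reclaimed quantity to the signal observation and re-running thresholdsMet with min-reclaim enforced, rather than by subtracting bytes directly.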