kubelet eviction manager support for min-reclaim

derekwaynecarr 2016-08-02 22:58:48 -04:00
parent 8ab06a3f86
commit 611c127f0d
4 changed files with 223 additions and 17 deletions
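
For context: with this change, a threshold that specifies MinReclaim stays in effect after it first fires until the observed signal has recovered past the threshold value by at least the min-reclaim amount, so the eviction manager keeps reclaiming instead of stopping the moment the signal crosses back over the raw threshold. The snippet below is an illustrative sketch only, using the Threshold fields and the quantityMustParse test helper that appear in this diff:

threshold := Threshold{
	Signal:     SignalMemoryAvailable,
	Operator:   OpLessThan,
	Value:      quantityMustParse("1Gi"),
	MinReclaim: quantityMustParse("500Mi"),
}
// Eviction starts when memory.available drops below 1Gi and, with this commit,
// continues until at least 1.5Gi (Value + MinReclaim) is available again.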


@@ -54,6 +54,8 @@ type managerImpl struct {
summaryProvider stats.SummaryProvider
// records when a threshold was first observed
thresholdsFirstObservedAt thresholdsObservedAt
// records the set of thresholds that have been met (including grace period) but not yet resolved
thresholdsMet []Threshold
// resourceToRankFunc maps a resource to ranking function for that resource.
resourceToRankFunc map[api.ResourceName]rankFunc
}
@@ -158,7 +160,13 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
now := m.clock.Now()
// determine the set of thresholds met independent of grace period
thresholds = thresholdsMet(thresholds, observations)
thresholds = thresholdsMet(thresholds, observations, false)
// determine the set of thresholds previously met that have not yet satisfied the associated min-reclaim
if len(m.thresholdsMet) > 0 {
thresholdsNotYetResolved := thresholdsMet(m.thresholdsMet, observations, true)
thresholds = mergeThresholds(thresholds, thresholdsNotYetResolved)
}
// track when a threshold was first observed
thresholdsFirstObservedAt := thresholdsFirstObservedAt(thresholds, m.thresholdsFirstObservedAt, now)
@@ -180,6 +188,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
m.nodeConditions = nodeConditions
m.thresholdsFirstObservedAt = thresholdsFirstObservedAt
m.nodeConditionsLastObservedAt = nodeConditionsLastObservedAt
m.thresholdsMet = thresholds
m.Unlock()
// determine the set of resources under starvation

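The synchronize() change above reads as: compute the thresholds freshly met this pass, merge in any previously met thresholds that have not yet recovered past value plus min-reclaim, and remember the merged set for the next pass. The following standalone sketch (simplified to plain byte counts, not the real Threshold and signalObservations types) walks the same sequence of availability values used by TestMinReclaim further down:

// Simplified carry-forward sketch: a previously met threshold stays active
// until availability recovers past value + min-reclaim.
package main

import "fmt"

const mi = int64(1) << 20

func main() {
	value := 1024 * mi     // threshold: memory.available < 1Gi
	minReclaim := 500 * mi // min-reclaim: 500Mi
	previouslyMet := false // stands in for m.thresholdsMet

	for _, available := range []int64{2048 * mi, 500 * mi, 1229 * mi, 2048 * mi} {
		freshlyMet := available < value
		unresolved := previouslyMet && available < value+minReclaim
		met := freshlyMet || unresolved // mergeThresholds() in the real code
		fmt.Printf("available=%4dMi met=%v\n", available/mi, met)
		previouslyMet = met
	}
	// prints false, true, true, false: at ~1.2Gi the threshold is still met
	// because not enough has been reclaimed, matching the eviction decisions
	// in TestMinReclaim.
}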

@@ -514,3 +514,161 @@ func TestDiskPressureNodeFs(t *testing.T) {
t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit)
}
}
// TestMinReclaim verifies that min-reclaim works as desired.
func TestMinReclaim(t *testing.T) {
podMaker := func(name string, requests api.ResourceList, limits api.ResourceList, memoryWorkingSet string) (*api.Pod, statsapi.PodStats) {
pod := newPod(name, []api.Container{
newContainer(name, requests, limits),
}, nil)
podStats := newPodMemoryStats(pod, resource.MustParse(memoryWorkingSet))
return pod, podStats
}
summaryStatsMaker := func(nodeAvailableBytes string, podStats map[*api.Pod]statsapi.PodStats) *statsapi.Summary {
val := resource.MustParse(nodeAvailableBytes)
availableBytes := uint64(val.Value())
result := &statsapi.Summary{
Node: statsapi.NodeStats{
Memory: &statsapi.MemoryStats{
AvailableBytes: &availableBytes,
},
},
Pods: []statsapi.PodStats{},
}
for _, podStat := range podStats {
result.Pods = append(result.Pods, podStat)
}
return result
}
podsToMake := []struct {
name string
requests api.ResourceList
limits api.ResourceList
memoryWorkingSet string
}{
{name: "best-effort-high", requests: newResourceList("", ""), limits: newResourceList("", ""), memoryWorkingSet: "500Mi"},
{name: "best-effort-low", requests: newResourceList("", ""), limits: newResourceList("", ""), memoryWorkingSet: "300Mi"},
{name: "burstable-high", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi"), memoryWorkingSet: "800Mi"},
{name: "burstable-low", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi"), memoryWorkingSet: "300Mi"},
{name: "guaranteed-high", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), memoryWorkingSet: "800Mi"},
{name: "guaranteed-low", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), memoryWorkingSet: "200Mi"},
}
pods := []*api.Pod{}
podStats := map[*api.Pod]statsapi.PodStats{}
for _, podToMake := range podsToMake {
pod, podStat := podMaker(podToMake.name, podToMake.requests, podToMake.limits, podToMake.memoryWorkingSet)
pods = append(pods, pod)
podStats[pod] = podStat
}
activePodsFunc := func() []*api.Pod {
return pods
}
fakeClock := clock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
nodeRef := &api.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
MaxPodGracePeriodSeconds: 5,
PressureTransitionPeriod: time.Minute * 5,
Thresholds: []Threshold{
{
Signal: SignalMemoryAvailable,
Operator: OpLessThan,
Value: quantityMustParse("1Gi"),
MinReclaim: quantityMustParse("500Mi"),
},
},
}
summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("2Gi", podStats)}
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
}
// synchronize
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should not have memory pressure
if manager.IsUnderMemoryPressure() {
t.Errorf("Manager should not report memory pressure")
}
// induce memory pressure!
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("500Mi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should have memory pressure
if !manager.IsUnderMemoryPressure() {
t.Errorf("Manager should report memory pressure")
}
// check the right pod was killed
if podKiller.pod != pods[0] {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, pods[0])
}
observedGracePeriod := *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
}
// reduce memory pressure, but not enough to satisfy the min-reclaim amount
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("1.2Gi", podStats)
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should still have memory pressure (because the min-reclaim amount has not yet been reclaimed)
if !manager.IsUnderMemoryPressure() {
t.Errorf("Manager should report memory pressure")
}
// check the right pod was killed
if podKiller.pod != pods[0] {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, pods[0])
}
observedGracePeriod = *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
}
// reduce memory pressure past the min-reclaim amount
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("2Gi", podStats)
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should have memory pressure (because transition period not yet met)
if !manager.IsUnderMemoryPressure() {
t.Errorf("Manager should report memory pressure")
}
// no pod should have been killed
if podKiller.pod != nil {
t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod)
}
// move the clock past transition period to ensure that we stop reporting pressure
fakeClock.Step(5 * time.Minute)
summaryProvider.result = summaryStatsMaker("2Gi", podStats)
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should not have memory pressure (because transition period met)
if manager.IsUnderMemoryPressure() {
t.Errorf("Manager should not report memory pressure")
}
// no pod should have been killed
if podKiller.pod != nil {
t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod)
}
}


@@ -534,7 +534,7 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObserv
}
// thresholdsMet returns the set of thresholds that were met independent of grace period
func thresholdsMet(thresholds []Threshold, observations signalObservations) []Threshold {
func thresholdsMet(thresholds []Threshold, observations signalObservations, enforceMinReclaim bool) []Threshold {
results := []Threshold{}
for i := range thresholds {
threshold := thresholds[i]
@@ -545,7 +545,12 @@ func thresholdsMet(thresholds []Threshold, observations signalObservations) []Th
}
// determine if we have met the specified threshold
thresholdMet := false
thresholdResult := threshold.Value.Cmp(*observed)
quantity := threshold.Value.Copy()
// if enforceMinReclaim is specified, we compare relative to value + min-reclaim
if enforceMinReclaim && threshold.MinReclaim != nil {
quantity.Add(*threshold.MinReclaim)
}
thresholdResult := quantity.Cmp(*observed)
switch threshold.Operator {
case OpLessThan:
thresholdMet = thresholdResult > 0
@@ -589,7 +594,9 @@ func nodeConditions(thresholds []Threshold) []api.NodeConditionType {
results := []api.NodeConditionType{}
for _, threshold := range thresholds {
if nodeCondition, found := signalToNodeCondition[threshold.Signal]; found {
results = append(results, nodeCondition)
if !hasNodeCondition(results, nodeCondition) {
results = append(results, nodeCondition)
}
}
}
return results
@@ -644,7 +651,18 @@ func hasNodeCondition(inputs []api.NodeConditionType, item api.NodeConditionType
return false
}
// hasThreshold returns true if the node condition is in the input list
// mergeThresholds will merge both threshold lists eliminating duplicates.
func mergeThresholds(inputsA []Threshold, inputsB []Threshold) []Threshold {
results := inputsA
for _, threshold := range inputsB {
if !hasThreshold(results, threshold) {
results = append(results, threshold)
}
}
return results
}
// hasThreshold returns true if the threshold is in the input list
func hasThreshold(inputs []Threshold, item Threshold) bool {
for _, input := range inputs {
if input.GracePeriod == item.GracePeriod && input.Operator == item.Operator && input.Signal == item.Signal && input.Value.Cmp(*item.Value) == 0 {

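In the thresholdsMet() change above, the enforceMinReclaim path simply shifts the comparison point from Value to Value + MinReclaim before calling Cmp. A minimal sketch of that arithmetic, assuming the resource.Quantity API this package already relies on (imported here from its current apimachinery location; at the time of this commit the package lived under pkg/api/resource):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// With OpLessThan, a 1Gi threshold with 500Mi MinReclaim stays met
	// until the observed value reaches 1.5Gi.
	quantity := resource.MustParse("1Gi")
	quantity.Add(resource.MustParse("500Mi"))

	for _, s := range []string{"1.05Gi", "1.2Gi", "2Gi"} {
		observed := resource.MustParse(s)
		stillMet := quantity.Cmp(observed) > 0 // quantity > observed
		fmt.Printf("observed=%s stillMet=%v\n", s, stillMet)
	}
	// prints true, true, false; the 1.05Gi and 2Gi cases mirror the
	// min-reclaim test cases in TestThresholdsMet below.
}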

@@ -585,29 +585,50 @@ func TestMakeSignalObservations(t *testing.T) {
func TestThresholdsMet(t *testing.T) {
hardThreshold := Threshold{
Signal: SignalMemoryAvailable,
Operator: OpLessThan,
Value: quantityMustParse("1Gi"),
Signal: SignalMemoryAvailable,
Operator: OpLessThan,
Value: quantityMustParse("1Gi"),
MinReclaim: quantityMustParse("500Mi"),
}
testCases := map[string]struct {
thresholds []Threshold
observations signalObservations
result []Threshold
enforceMinReclaim bool
thresholds []Threshold
observations signalObservations
result []Threshold
}{
"empty": {
thresholds: []Threshold{},
observations: signalObservations{},
result: []Threshold{},
enforceMinReclaim: false,
thresholds: []Threshold{},
observations: signalObservations{},
result: []Threshold{},
},
"threshold-met": {
thresholds: []Threshold{hardThreshold},
enforceMinReclaim: false,
thresholds: []Threshold{hardThreshold},
observations: signalObservations{
SignalMemoryAvailable: quantityMustParse("500Mi"),
},
result: []Threshold{hardThreshold},
},
"threshold-not-met": {
thresholds: []Threshold{hardThreshold},
enforceMinReclaim: false,
thresholds: []Threshold{hardThreshold},
observations: signalObservations{
SignalMemoryAvailable: quantityMustParse("2Gi"),
},
result: []Threshold{},
},
"threshold-met-with-min-reclaim": {
enforceMinReclaim: true,
thresholds: []Threshold{hardThreshold},
observations: signalObservations{
SignalMemoryAvailable: quantityMustParse("1.05Gi"),
},
result: []Threshold{hardThreshold},
},
"threshold-not-met-with-min-reclaim": {
enforceMinReclaim: true,
thresholds: []Threshold{hardThreshold},
observations: signalObservations{
SignalMemoryAvailable: quantityMustParse("2Gi"),
},
@@ -615,7 +636,7 @@ func TestThresholdsMet(t *testing.T) {
},
}
for testName, testCase := range testCases {
actual := thresholdsMet(testCase.thresholds, testCase.observations)
actual := thresholdsMet(testCase.thresholds, testCase.observations, testCase.enforceMinReclaim)
if !thresholdList(actual).Equal(thresholdList(testCase.result)) {
t.Errorf("Test case: %s, expected: %v, actual: %v", testName, testCase.result, actual)
}