diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go index 336b8e69bde..24d9213cdfb 100644 --- a/pkg/kubelet/eviction/eviction_manager_test.go +++ b/pkg/kubelet/eviction/eviction_manager_test.go @@ -169,11 +169,6 @@ func makePIDStats(nodeAvailablePIDs string, numberOfRunningProcesses string, pod MaxPID: &availablePIDs, NumOfRunningProcesses: &NumberOfRunningProcesses, }, - SystemContainers: []statsapi.ContainerStats{ - { - Name: statsapi.SystemContainerPods, - }, - }, }, Pods: []statsapi.PodStats{}, } @@ -402,8 +397,8 @@ func TestPIDPressure_VerifyPodStatus(t *testing.T) { podMaker := makePodWithPIDStats summaryStatsMaker := makePIDStats podsToMake := []podToMake{ - {name: "below-requests"}, - {name: "above-requests"}, + {}, + {}, } pods := []*v1.Pod{} podStats := map[*v1.Pod]statsapi.PodStats{} @@ -924,229 +919,253 @@ func makeContainersByQOS(class v1.PodQOSClass) []v1.Container { } func TestPIDPressure(t *testing.T) { - podMaker := makePodWithPIDStats - summaryStatsMaker := makePIDStats - podsToMake := []podToMake{ - {name: "high-priority-high-usage", priority: highPriority, pidUsage: 900}, - {name: "default-priority-low-usage", priority: defaultPriority, pidUsage: 100}, - {name: "default-priority-medium-usage", priority: defaultPriority, pidUsage: 400}, - {name: "low-priority-high-usage", priority: lowPriority, pidUsage: 600}, - {name: "low-priority-low-usage", priority: lowPriority, pidUsage: 50}, - } - pods := []*v1.Pod{} - podStats := map[*v1.Pod]statsapi.PodStats{} - for _, podToMake := range podsToMake { - pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.pidUsage) - pods = append(pods, pod) - podStats[pod] = podStat - } - podToEvict := pods[3] - activePodsFunc := func() []*v1.Pod { - return pods - } - fakeClock := testingclock.NewFakeClock(time.Now()) - podKiller := &mockPodKiller{} - diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} - diskGC := &mockDiskGC{err: nil} - nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} - - config := Config{ - MaxPodGracePeriodSeconds: 5, - PressureTransitionPeriod: time.Minute * 5, - Thresholds: []evictionapi.Threshold{ - { - Signal: evictionapi.SignalPIDAvailable, - Operator: evictionapi.OpLessThan, - Value: evictionapi.ThresholdValue{ - Quantity: quantityMustParse("1200"), - }, - }, - { - Signal: evictionapi.SignalPIDAvailable, - Operator: evictionapi.OpLessThan, - Value: evictionapi.ThresholdValue{ - Quantity: quantityMustParse("1500"), - }, - GracePeriod: time.Minute * 2, + // Define test cases + testCases := []struct { + name string + podsToMake []podToMake + evictPodIndex int + noPressurePIDUsage string + pressurePIDUsageWithGracePeriod string + pressurePIDUsageWithoutGracePeriod string + totalPID string + }{ + { + name: "eviction due to pid pressure", + podsToMake: []podToMake{ + {name: "high-priority-high-usage", priority: highPriority, pidUsage: 900}, + {name: "default-priority-low-usage", priority: defaultPriority, pidUsage: 100}, + {name: "default-priority-medium-usage", priority: defaultPriority, pidUsage: 400}, + {name: "low-priority-high-usage", priority: lowPriority, pidUsage: 600}, + {name: "low-priority-low-usage", priority: lowPriority, pidUsage: 50}, }, + evictPodIndex: 3, // we expect the low-priority-high-usage pod to be evicted + noPressurePIDUsage: "300", + pressurePIDUsageWithGracePeriod: "700", + pressurePIDUsageWithoutGracePeriod: "1200", + totalPID: "2000", }, } - summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("2000", "300", podStats)} - manager := &managerImpl{ - clock: fakeClock, - killPodFunc: podKiller.killPodNow, - imageGC: diskGC, - containerGC: diskGC, - config: config, - recorder: &record.FakeRecorder{}, - summaryProvider: summaryProvider, - nodeRef: nodeRef, - nodeConditionsLastObservedAt: nodeConditionsObservedAt{}, - thresholdsFirstObservedAt: thresholdsObservedAt{}, - } - // create a best effort pod to test admission - podToAdmit, _ := podMaker("pod-to-admit", defaultPriority, 50) + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + podMaker := makePodWithPIDStats + summaryStatsMaker := makePIDStats + pods := []*v1.Pod{} + podStats := map[*v1.Pod]statsapi.PodStats{} + for _, podToMake := range tc.podsToMake { + pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.pidUsage) + pods = append(pods, pod) + podStats[pod] = podStat + } + podToEvict := pods[tc.evictPodIndex] + activePodsFunc := func() []*v1.Pod { return pods } - // synchronize - _, err := manager.synchronize(diskInfoProvider, activePodsFunc) + fakeClock := testingclock.NewFakeClock(time.Now()) + podKiller := &mockPodKiller{} + diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)} + diskGC := &mockDiskGC{err: nil} + nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} - if err != nil { - t.Fatalf("Manager expects no error but got %v", err) - } + config := Config{ + MaxPodGracePeriodSeconds: 5, + PressureTransitionPeriod: time.Minute * 5, + Thresholds: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalPIDAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1200"), + }, + }, + { + Signal: evictionapi.SignalPIDAvailable, + Operator: evictionapi.OpLessThan, + Value: evictionapi.ThresholdValue{ + Quantity: quantityMustParse("1500"), + }, + GracePeriod: time.Minute * 2, + }, + }, + } - // we should not have disk pressure - if manager.IsUnderDiskPressure() { - t.Fatalf("Manager should not report disk pressure") - } + summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker(tc.totalPID, tc.noPressurePIDUsage, podStats)} + manager := &managerImpl{ + clock: fakeClock, + killPodFunc: podKiller.killPodNow, + imageGC: diskGC, + containerGC: diskGC, + config: config, + recorder: &record.FakeRecorder{}, + summaryProvider: summaryProvider, + nodeRef: nodeRef, + nodeConditionsLastObservedAt: nodeConditionsObservedAt{}, + thresholdsFirstObservedAt: thresholdsObservedAt{}, + } - // try to admit our pod (should succeed) - if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); !result.Admit { - t.Fatalf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) - } + // create a pod to test admission + podToAdmit, _ := podMaker("pod-to-admit", defaultPriority, 50) - // induce soft threshold for PID pressure - fakeClock.Step(1 * time.Minute) - summaryProvider.result = summaryStatsMaker("2000", "700", podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + // synchronize + _, err := manager.synchronize(diskInfoProvider, activePodsFunc) - if err != nil { - t.Fatalf("Manager expects no error but got %v", err) - } + if err != nil { + t.Fatalf("Manager expects no error but got %v", err) + } - // now, we should have PID pressure - if !manager.IsUnderPIDPressure() { - t.Errorf("Manager should report PID pressure since soft threshold was met") - } + // we should not have PID pressure + if manager.IsUnderPIDPressure() { + t.Fatalf("Manager should not report PID pressure") + } - // verify no pod was yet killed because there has not yet been enough time passed - if podKiller.pod != nil { - t.Errorf("Manager should not have killed a pod yet, but killed: %v", podKiller.pod.Name) - } + // try to admit our pod (should succeed) + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); !result.Admit { + t.Fatalf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) + } - // step forward in time past the grace period - fakeClock.Step(3 * time.Minute) - // no change in PID stats to simulate continued pressure - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + // induce soft threshold for PID pressure + fakeClock.Step(1 * time.Minute) + summaryProvider.result = summaryStatsMaker(tc.totalPID, tc.pressurePIDUsageWithGracePeriod, podStats) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) - if err != nil { - t.Fatalf("Manager expects no error but got %v", err) - } + if err != nil { + t.Fatalf("Manager expects no error but got %v", err) + } - // verify PID pressure is still reported - if !manager.IsUnderPIDPressure() { - t.Errorf("Manager should still report PID pressure") - } + // now, we should have PID pressure + if !manager.IsUnderPIDPressure() { + t.Errorf("Manager should report PID pressure since soft threshold was met") + } - // verify the right pod was killed with the right grace period. - if podKiller.pod != podToEvict { - t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name) - } - if podKiller.gracePeriodOverride == nil { - t.Errorf("Manager chose to kill pod but should have had a grace period override.") - } - observedGracePeriod := *podKiller.gracePeriodOverride - if observedGracePeriod != manager.config.MaxPodGracePeriodSeconds { - t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", manager.config.MaxPodGracePeriodSeconds, observedGracePeriod) - } + // verify no pod was yet killed because there has not yet been enough time passed + if podKiller.pod != nil { + t.Errorf("Manager should not have killed a pod yet, but killed: %v", podKiller.pod.Name) + } - // reset state - podKiller.pod = nil - podKiller.gracePeriodOverride = nil + // step forward in time past the grace period + fakeClock.Step(3 * time.Minute) + // no change in PID stats to simulate continued pressure + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) - // remove PID pressure by simulating increased PID availability - fakeClock.Step(20 * time.Minute) - summaryProvider.result = summaryStatsMaker("2000", "300", podStats) // Simulate increased PID availability - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + if err != nil { + t.Fatalf("Manager expects no error but got %v", err) + } - if err != nil { - t.Fatalf("Manager expects no error but got %v", err) - } + // verify PID pressure is still reported + if !manager.IsUnderPIDPressure() { + t.Errorf("Manager should still report PID pressure") + } - // verify PID pressure is resolved - if manager.IsUnderPIDPressure() { - t.Errorf("Manager should not report PID pressure") - } + // verify the right pod was killed with the right grace period. + if podKiller.pod != podToEvict { + t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name) + } + if podKiller.gracePeriodOverride == nil { + t.Errorf("Manager chose to kill pod but should have had a grace period override.") + } + observedGracePeriod := *podKiller.gracePeriodOverride + if observedGracePeriod != manager.config.MaxPodGracePeriodSeconds { + t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", manager.config.MaxPodGracePeriodSeconds, observedGracePeriod) + } - // re-induce PID pressure - fakeClock.Step(1 * time.Minute) - summaryProvider.result = summaryStatsMaker("2000", "1200", podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + // reset state + podKiller.pod = nil + podKiller.gracePeriodOverride = nil - if err != nil { - t.Fatalf("Manager expects no error but got %v", err) - } + // remove PID pressure by simulating increased PID availability + fakeClock.Step(20 * time.Minute) + summaryProvider.result = summaryStatsMaker(tc.totalPID, tc.noPressurePIDUsage, podStats) // Simulate increased PID availability + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) - // verify PID pressure is reported again - if !manager.IsUnderPIDPressure() { - t.Errorf("Manager should report PID pressure") - } + if err != nil { + t.Fatalf("Manager expects no error but got %v", err) + } - // verify the right pod was killed with the right grace period. - if podKiller.pod != podToEvict { - t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name) - } - if podKiller.gracePeriodOverride == nil { - t.Errorf("Manager chose to kill pod but should have had a grace period override.") - } - observedGracePeriod = *podKiller.gracePeriodOverride - if observedGracePeriod != int64(0) { - t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod) - } + // verify PID pressure is resolved + if manager.IsUnderPIDPressure() { + t.Errorf("Manager should not report PID pressure") + } - // try to admit our pod (should fail) - if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); result.Admit { - t.Fatalf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit) - } + // re-induce PID pressure + fakeClock.Step(1 * time.Minute) + summaryProvider.result = summaryStatsMaker(tc.totalPID, tc.pressurePIDUsageWithoutGracePeriod, podStats) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) - // reduce PID pressure - fakeClock.Step(1 * time.Minute) - summaryProvider.result = summaryStatsMaker("2000", "300", podStats) - podKiller.pod = nil // reset state - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + if err != nil { + t.Fatalf("Manager expects no error but got %v", err) + } - if err != nil { - t.Fatalf("Manager expects no error but got %v", err) - } + // verify PID pressure is reported again + if !manager.IsUnderPIDPressure() { + t.Errorf("Manager should report PID pressure") + } - // we should have PID pressure (because transition period not yet met) - if !manager.IsUnderPIDPressure() { - t.Errorf("Manager should report PID pressure") - } + // verify the right pod was killed with the right grace period. + if podKiller.pod != podToEvict { + t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name) + } + if podKiller.gracePeriodOverride == nil { + t.Errorf("Manager chose to kill pod but should have had a grace period override.") + } + observedGracePeriod = *podKiller.gracePeriodOverride + if observedGracePeriod != int64(0) { + t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod) + } - // no pod should have been killed - if podKiller.pod != nil { - t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) - } + // try to admit our pod (should fail) + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); result.Admit { + t.Fatalf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit) + } - // try to admit our pod (should fail) - if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); result.Admit { - t.Fatalf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit) - } + // reduce PID pressure + fakeClock.Step(1 * time.Minute) + summaryProvider.result = summaryStatsMaker(tc.totalPID, tc.noPressurePIDUsage, podStats) + podKiller.pod = nil // reset state + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) - // move the clock past the transition period - fakeClock.Step(5 * time.Minute) - summaryProvider.result = summaryStatsMaker("2000", "300", podStats) - _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + if err != nil { + t.Fatalf("Manager expects no error but got %v", err) + } - if err != nil { - t.Fatalf("Manager expects no error but got %v", err) - } + // we should have PID pressure (because transition period not yet met) + if !manager.IsUnderPIDPressure() { + t.Errorf("Manager should report PID pressure") + } - // we should not have PID pressure (because transition period met) - if manager.IsUnderPIDPressure() { - t.Errorf("Manager should not report PID pressure") - } + // no pod should have been killed + if podKiller.pod != nil { + t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) + } - // no pod should have been killed - if podKiller.pod != nil { - t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) - } + // try to admit our pod (should fail) + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); result.Admit { + t.Fatalf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit) + } - // try to admit our pod (should succeed) - if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); !result.Admit { - t.Fatalf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) + // move the clock past the transition period + fakeClock.Step(5 * time.Minute) + summaryProvider.result = summaryStatsMaker(tc.totalPID, tc.noPressurePIDUsage, podStats) + _, err = manager.synchronize(diskInfoProvider, activePodsFunc) + + if err != nil { + t.Fatalf("Manager expects no error but got %v", err) + } + + // we should not have PID pressure (because transition period met) + if manager.IsUnderPIDPressure() { + t.Errorf("Manager should not report PID pressure") + } + + // no pod should have been killed + if podKiller.pod != nil { + t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) + } + + // try to admit our pod (should succeed) + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); !result.Admit { + t.Fatalf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) + } + }) } }