Add support to invoke image gc in response to disk eviction thresholds

derekwaynecarr 2016-08-03 16:55:52 -04:00
parent 611c127f0d
commit 68bc47ecc6
8 changed files with 342 additions and 20 deletions

View File

@@ -40,6 +40,8 @@ type managerImpl struct {
config Config
// the function to invoke to kill a pod
killPodFunc KillPodFunc
// the interface that knows how to do image gc
imageGC ImageGC
// protects access to internal state
sync.RWMutex
// node conditions are the set of conditions present
@@ -58,6 +60,8 @@ type managerImpl struct {
thresholdsMet []Threshold
// resourceToRankFunc maps a resource to ranking function for that resource.
resourceToRankFunc map[api.ResourceName]rankFunc
// resourceToNodeReclaimFuncs maps a resource to an ordered list of functions that know how to reclaim that resource.
resourceToNodeReclaimFuncs map[api.ResourceName]nodeReclaimFuncs
}
// ensure it implements the required interface
@@ -68,12 +72,14 @@ func NewManager(
summaryProvider stats.SummaryProvider,
config Config,
killPodFunc KillPodFunc,
imageGC ImageGC,
recorder record.EventRecorder,
nodeRef *api.ObjectReference,
clock clock.Clock) (Manager, lifecycle.PodAdmitHandler, error) {
manager := &managerImpl{
clock: clock,
killPodFunc: killPodFunc,
imageGC: imageGC,
config: config,
recorder: recorder,
summaryProvider: summaryProvider,
@@ -140,13 +146,14 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// build the ranking functions and node-level reclaim functions (if not yet known)
// TODO: have a function in cadvisor that lets us know if global housekeeping has completed
if len(m.resourceToRankFunc) == 0 {
if len(m.resourceToRankFunc) == 0 || len(m.resourceToNodeReclaimFuncs) == 0 {
// this may error if cadvisor has yet to complete housekeeping, so we will just try again in the next pass.
hasDedicatedImageFs, err := diskInfoProvider.HasDedicatedImageFs()
if err != nil {
return
}
m.resourceToRankFunc = buildResourceToRankFunc(hasDedicatedImageFs)
m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, hasDedicatedImageFs)
}
// make observations and get a function to derive pod usage stats relative to those observations.
@@ -192,7 +199,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
m.Unlock()
// determine the set of resources under starvation
starvedResources := reclaimResources(thresholds)
starvedResources := getStarvedResources(thresholds)
if len(starvedResources) == 0 {
glog.V(3).Infof("eviction manager: no resources are starved")
return
@@ -209,6 +216,14 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// record an event about the resources we are now attempting to reclaim via eviction
m.recorder.Eventf(m.nodeRef, api.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim)
// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
if m.reclaimNodeLevelResources(resourceToReclaim, observations) {
glog.Infof("eviction manager: able to reduce %v pressure without evicting pods.", resourceToReclaim)
return
}
glog.Infof("eviction manager: must evict pod(s) to reclaim %v", resourceToReclaim)
// rank the pods for eviction
rank, ok := m.resourceToRankFunc[resourceToReclaim]
if !ok {
@@ -254,3 +269,31 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
}
glog.Infof("eviction manager: unable to evict any pods from the node")
}
// reclaimNodeLevelResources attempts to reclaim node-level resources. It returns true if thresholds were satisfied and no pod eviction is required.
func (m *managerImpl) reclaimNodeLevelResources(resourceToReclaim api.ResourceName, observations signalObservations) bool {
nodeReclaimFuncs := m.resourceToNodeReclaimFuncs[resourceToReclaim]
for _, nodeReclaimFunc := range nodeReclaimFuncs {
// attempt to reclaim the pressured resource.
reclaimed, err := nodeReclaimFunc()
if err == nil {
// update our local observations based on the amount reported to have been reclaimed.
// note: this is optimistic; other things could still have been consuming the pressured resource in the interim.
signal := resourceToSignal[resourceToReclaim]
value, ok := observations[signal]
if !ok {
glog.Errorf("eviction manager: unable to find value associated with signal %v", signal)
continue
}
value.Add(*reclaimed)
// evaluate all current thresholds to see if, with the adjusted observations, we have met our min reclaim goals
if len(thresholdsMet(m.thresholdsMet, observations, true)) == 0 {
return true
}
} else {
glog.Errorf("eviction manager: unexpected error when attempting to reduce %v pressure: %v", resourceToReclaim, err)
}
}
return false
}
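The loop above is the heart of the change: each starved resource maps to an ordered list of node-level reclaim functions, and pod eviction proceeds only if none of them brings the adjusted observations back over the thresholds. Below is a minimal standalone sketch of the same pattern, with plain int64 byte counts standing in for resource.Quantity; the reclaim functions and threshold values here are illustrative assumptions, not code from this commit.

package main

import (
	"errors"
	"fmt"
)

// reclaimFunc mirrors nodeReclaimFunc: reclaim what it can, report bytes freed.
type reclaimFunc func() (int64, error)

// reclaimNodeLevel tries each function in order, optimistically credits the
// reported bytes against the observed headroom, and stops once the threshold clears.
func reclaimNodeLevel(funcs []reclaimFunc, availableBytes, thresholdBytes int64) bool {
	for _, f := range funcs {
		freed, err := f()
		if err != nil {
			fmt.Println("reclaim error:", err) // log and try the next function
			continue
		}
		// optimistic: other consumers may eat into this in the interim
		availableBytes += freed
		if availableBytes >= thresholdBytes {
			return true // pressure relieved; no pod eviction required
		}
	}
	return false // caller falls back to ranking and evicting pods
}

func main() {
	pruneLogs := func() (int64, error) { return 0, errors.New("not yet supported") }
	deleteImages := func() (int64, error) { return 700 << 20, nil } // 700Mi freed
	relieved := reclaimNodeLevel([]reclaimFunc{pruneLogs, deleteImages}, 400<<20, 1<<30)
	fmt.Println("relieved without eviction:", relieved) // true: 400Mi + 700Mi > 1Gi
}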

View File

@@ -54,6 +54,19 @@ func (m *mockDiskInfoProvider) HasDedicatedImageFs() (bool, error) {
return m.dedicatedImageFs, nil
}
// mockImageGC is used to simulate invoking image garbage collection.
type mockImageGC struct {
err error
freed int64
invoked bool
}
// DeleteUnusedImages returns the mocked values.
func (m *mockImageGC) DeleteUnusedImages() (int64, error) {
m.invoked = true
return m.freed, m.err
}
// TestMemoryPressure
func TestMemoryPressure(t *testing.T) {
podMaker := func(name string, requests api.ResourceList, limits api.ResourceList, memoryWorkingSet string) (*api.Pod, statsapi.PodStats) {
@@ -106,6 +119,7 @@ func TestMemoryPressure(t *testing.T) {
fakeClock := clock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
imageGC := &mockImageGC{freed: int64(0), err: nil}
nodeRef := &api.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
@@ -129,6 +143,7 @@ func TestMemoryPressure(t *testing.T) {
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: imageGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
@@ -351,6 +366,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
fakeClock := clock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
imageGC := &mockImageGC{freed: int64(0), err: nil}
nodeRef := &api.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
@@ -374,6 +390,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: imageGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
@@ -567,6 +584,7 @@ func TestMinReclaim(t *testing.T) {
fakeClock := clock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
imageGC := &mockImageGC{freed: int64(0), err: nil}
nodeRef := &api.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
@@ -585,6 +603,7 @@ func TestMinReclaim(t *testing.T) {
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: imageGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
@@ -672,3 +691,202 @@ func TestMinReclaim(t *testing.T) {
t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod)
}
}
func TestNodeReclaimFuncs(t *testing.T) {
podMaker := func(name string, requests api.ResourceList, limits api.ResourceList, rootFsUsed, logsUsed, perLocalVolumeUsed string) (*api.Pod, statsapi.PodStats) {
pod := newPod(name, []api.Container{
newContainer(name, requests, limits),
}, nil)
podStats := newPodDiskStats(pod, parseQuantity(rootFsUsed), parseQuantity(logsUsed), parseQuantity(perLocalVolumeUsed))
return pod, podStats
}
summaryStatsMaker := func(rootFsAvailableBytes, imageFsAvailableBytes string, podStats map[*api.Pod]statsapi.PodStats) *statsapi.Summary {
rootFsVal := resource.MustParse(rootFsAvailableBytes)
rootFsBytes := uint64(rootFsVal.Value())
imageFsVal := resource.MustParse(imageFsAvailableBytes)
imageFsBytes := uint64(imageFsVal.Value())
result := &statsapi.Summary{
Node: statsapi.NodeStats{
Fs: &statsapi.FsStats{
AvailableBytes: &rootFsBytes,
},
Runtime: &statsapi.RuntimeStats{
ImageFs: &statsapi.FsStats{
AvailableBytes: &imageFsBytes,
},
},
},
Pods: []statsapi.PodStats{},
}
for _, podStat := range podStats {
result.Pods = append(result.Pods, podStat)
}
return result
}
podsToMake := []struct {
name string
requests api.ResourceList
limits api.ResourceList
rootFsUsed string
logsFsUsed string
perLocalVolumeUsed string
}{
{name: "best-effort-high", requests: newResourceList("", ""), limits: newResourceList("", ""), rootFsUsed: "500Mi"},
{name: "best-effort-low", requests: newResourceList("", ""), limits: newResourceList("", ""), perLocalVolumeUsed: "300Mi"},
{name: "burstable-high", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi"), rootFsUsed: "800Mi"},
{name: "burstable-low", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi"), logsFsUsed: "300Mi"},
{name: "guaranteed-high", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), rootFsUsed: "800Mi"},
{name: "guaranteed-low", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), rootFsUsed: "200Mi"},
}
pods := []*api.Pod{}
podStats := map[*api.Pod]statsapi.PodStats{}
for _, podToMake := range podsToMake {
pod, podStat := podMaker(podToMake.name, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed)
pods = append(pods, pod)
podStats[pod] = podStat
}
activePodsFunc := func() []*api.Pod {
return pods
}
fakeClock := clock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
imageGcFree := resource.MustParse("700Mi")
imageGC := &mockImageGC{freed: imageGcFree.Value(), err: nil}
nodeRef := &api.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
config := Config{
MaxPodGracePeriodSeconds: 5,
PressureTransitionPeriod: time.Minute * 5,
Thresholds: []Threshold{
{
Signal: SignalNodeFsAvailable,
Operator: OpLessThan,
Value: quantityMustParse("1Gi"),
MinReclaim: quantityMustParse("500Mi"),
},
},
}
summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("16Gi", "200Gi", podStats)}
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: imageGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
}
// synchronize
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should not have disk pressure
if manager.IsUnderDiskPressure() {
t.Errorf("Manager should not report disk pressure")
}
// induce hard threshold
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker(".9Gi", "200Gi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should have disk pressure
if !manager.IsUnderDiskPressure() {
t.Errorf("Manager should report disk pressure since soft threshold was met")
}
// verify image gc was invoked
if !imageGC.invoked {
t.Errorf("Manager should have invoked image gc")
}
// verify no pod was killed because image gc was sufficient
if podKiller.pod != nil {
t.Errorf("Manager should not have killed a pod, but killed: %v", podKiller.pod)
}
// reset state
imageGC.invoked = false
// remove disk pressure
fakeClock.Step(20 * time.Minute)
summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should not have disk pressure
if manager.IsUnderDiskPressure() {
t.Errorf("Manager should not report disk pressure")
}
// induce disk pressure!
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("400Mi", "200Gi", podStats)
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should have disk pressure
if !manager.IsUnderDiskPressure() {
t.Errorf("Manager should report disk pressure")
}
// ensure image gc was invoked
if !imageGC.invoked {
t.Errorf("Manager should have invoked image gc")
}
// check the right pod was killed
if podKiller.pod != pods[0] {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, pods[0])
}
observedGracePeriod := *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
}
// reduce disk pressure
fakeClock.Step(1 * time.Minute)
summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
imageGC.invoked = false // reset state
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should have disk pressure (because transition period not yet met)
if !manager.IsUnderDiskPressure() {
t.Errorf("Manager should report disk pressure")
}
// no image gc should have occurred
if imageGC.invoked {
t.Errorf("Manager chose to perform image gc when it was not neeed")
}
// no pod should have been killed
if podKiller.pod != nil {
t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod)
}
// move the clock past transition period to ensure that we stop reporting pressure
fakeClock.Step(5 * time.Minute)
summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats)
imageGC.invoked = false // reset state
podKiller.pod = nil // reset state
manager.synchronize(diskInfoProvider, activePodsFunc)
// we should not have disk pressure (because transition period met)
if manager.IsUnderDiskPressure() {
t.Errorf("Manager should not report disk pressure")
}
// no image gc should have occurred
if imageGC.invoked {
t.Errorf("Manager chose to perform image gc when it was not neeed")
}
// no pod should have been killed
if podKiller.pod != nil {
t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod)
}
}
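The test above covers a successful reclaim and the quiescent cases; the failure path follows directly from reclaimNodeLevelResources, which logs a reclaim error and falls through to pod eviction. A hedged fragment showing how that case could be exercised with the same scaffolding (this exact test is not part of the commit, and it assumes the fmt package is imported in this test file):

// Hypothetical variant of the setup above: image gc fails, so the manager
// must still evict a pod to relieve nodefs pressure.
imageGC := &mockImageGC{freed: int64(0), err: fmt.Errorf("image gc failed")}
// ... construct the manager and induce disk pressure exactly as above, then:
manager.synchronize(diskInfoProvider, activePodsFunc)
if !imageGC.invoked {
	t.Errorf("Manager should have attempted image gc")
}
if podKiller.pod == nil {
	t.Errorf("Manager should have evicted a pod after image gc failed")
}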

View File

@@ -47,18 +47,31 @@ const (
resourceNodeFs api.ResourceName = "nodefs"
)
// signalToNodeCondition maps a signal to the node condition to report if threshold is met.
var signalToNodeCondition = map[Signal]api.NodeConditionType{
SignalMemoryAvailable: api.NodeMemoryPressure,
SignalImageFsAvailable: api.NodeDiskPressure,
SignalNodeFsAvailable: api.NodeDiskPressure,
}
var (
// signalToNodeCondition maps a signal to the node condition to report if threshold is met.
signalToNodeCondition map[Signal]api.NodeConditionType
// signalToResource maps a Signal to its associated Resource.
signalToResource map[Signal]api.ResourceName
// resourceToSignal maps a Resource to its associated Signal.
resourceToSignal map[api.ResourceName]Signal
)
// signalToResource maps a Signal to its associated Resource.
var signalToResource = map[Signal]api.ResourceName{
SignalMemoryAvailable: api.ResourceMemory,
SignalImageFsAvailable: resourceImageFs,
SignalNodeFsAvailable: resourceNodeFs,
}
func init() {
// map eviction signals to node conditions
signalToNodeCondition = map[Signal]api.NodeConditionType{}
signalToNodeCondition[SignalMemoryAvailable] = api.NodeMemoryPressure
signalToNodeCondition[SignalImageFsAvailable] = api.NodeDiskPressure
signalToNodeCondition[SignalNodeFsAvailable] = api.NodeDiskPressure
// map signals to resources (and vice-versa)
signalToResource = map[Signal]api.ResourceName{}
signalToResource[SignalMemoryAvailable] = api.ResourceMemory
signalToResource[SignalImageFsAvailable] = resourceImageFs
signalToResource[SignalNodeFsAvailable] = resourceNodeFs
resourceToSignal = map[api.ResourceName]Signal{}
for key, value := range signalToResource {
resourceToSignal[value] = key
}
}
// validSignal returns true if the signal is supported.
@@ -672,8 +685,8 @@ func hasThreshold(inputs []Threshold, item Threshold) bool {
return false
}
// reclaimResources returns the set of resources that are starved based on thresholds met.
func reclaimResources(thresholds []Threshold) []api.ResourceName {
// getStarvedResources returns the set of resources that are starved based on thresholds met.
func getStarvedResources(thresholds []Threshold) []api.ResourceName {
results := []api.ResourceName{}
for _, threshold := range thresholds {
if starvedResource, found := signalToResource[threshold.Signal]; found {
@@ -717,3 +730,39 @@ func buildResourceToRankFunc(withImageFs bool) map[api.ResourceName]rankFunc {
func PodIsEvicted(podStatus api.PodStatus) bool {
return podStatus.Phase == api.PodFailed && podStatus.Reason == reason
}
// buildResourceToNodeReclaimFuncs returns reclaim functions associated with resources.
func buildResourceToNodeReclaimFuncs(imageGC ImageGC, withImageFs bool) map[api.ResourceName]nodeReclaimFuncs {
resourceToReclaimFunc := map[api.ResourceName]nodeReclaimFuncs{}
// usage of an imagefs is optional
if withImageFs {
// with an imagefs, nodefs pressure should just delete logs
resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs()}
// with an imagefs, imagefs pressure should delete unused images
resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteImages(imageGC)}
} else {
// without an imagefs, nodefs pressure should delete logs, and unused images
resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC)}
}
return resourceToReclaimFunc
}
// deleteLogs will delete logs to free up disk pressure.
func deleteLogs() nodeReclaimFunc {
return func() (*resource.Quantity, error) {
// TODO: not yet supported.
return resource.NewQuantity(int64(0), resource.BinarySI), nil
}
}
// deleteImages will delete unused images to free up disk pressure.
func deleteImages(imageGC ImageGC) nodeReclaimFunc {
return func() (*resource.Quantity, error) {
glog.Infof("eviction manager: attempting to delete unused images")
reclaimed, err := imageGC.DeleteUnusedImages()
if err != nil {
return nil, err
}
return resource.NewQuantity(reclaimed, resource.BinarySI), nil
}
}
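Note how the reported quantity interacts with min reclaim: reclaimNodeLevelResources re-evaluates thresholds with enforceMinReclaim set, so node-level reclaim only counts as sufficient once availability clears the threshold by at least MinReclaim. A standalone sketch of that arithmetic, with values echoing TestNodeReclaimFuncs above (1Gi threshold, 500Mi min reclaim) and int64 bytes standing in for resource.Quantity:

package main

import "fmt"

// stillUnderPressure approximates thresholdsMet with enforceMinReclaim=true for
// an "available is less than quantity" threshold: pressure persists until
// availability clears the threshold by at least minReclaim.
func stillUnderPressure(availableBytes, thresholdBytes, minReclaimBytes int64) bool {
	return availableBytes < thresholdBytes+minReclaimBytes
}

func main() {
	threshold := int64(1 << 30)    // 1Gi
	minReclaim := int64(500 << 20) // 500Mi
	observed := int64(921 << 20)   // ~.9Gi available when the threshold fired
	freed := int64(700 << 20)      // 700Mi reported freed by image gc
	// ~1621Mi available >= 1524Mi required, so no pod eviction is needed.
	fmt.Println(stillUnderPressure(observed+freed, threshold, minReclaim)) // false
}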

View File

@@ -872,7 +872,7 @@ func TestHasNodeConditions(t *testing.T) {
}
}
func TestReclaimResources(t *testing.T) {
func TestGetStarvedResources(t *testing.T) {
testCases := map[string]struct {
inputs []Threshold
result []api.ResourceName
@@ -897,7 +897,7 @@ func TestReclaimResources(t *testing.T) {
},
}
for testName, testCase := range testCases {
actual := reclaimResources(testCase.inputs)
actual := getStarvedResources(testCase.inputs)
actualSet := quota.ToSet(actual)
expectedSet := quota.ToSet(testCase.result)
if !actualSet.Equal(expectedSet) {

View File

@@ -98,6 +98,12 @@ type DiskInfoProvider interface {
HasDedicatedImageFs() (bool, error)
}
// ImageGC is responsible for performing garbage collection of unused images.
type ImageGC interface {
// DeleteUnusedImages deletes unused images and returns the number of bytes freed, or an error.
DeleteUnusedImages() (int64, error)
}
// KillPodFunc kills a pod.
// The pod status is updated, and then it is killed with the specified grace period.
// This function must block until either the pod is killed or an error is encountered.
@@ -124,3 +130,9 @@ type thresholdsObservedAt map[Threshold]time.Time
// nodeConditionsObservedAt maps a node condition to a time that it was observed
type nodeConditionsObservedAt map[api.NodeConditionType]time.Time
// nodeReclaimFunc is a function that knows how to reclaim a resource from the node without impacting pods.
type nodeReclaimFunc func() (*resource.Quantity, error)
// nodeReclaimFuncs is an ordered list of nodeReclaimFunc.
type nodeReclaimFuncs []nodeReclaimFunc
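Because Go interfaces are satisfied structurally, anything with a DeleteUnusedImages() (int64, error) method can serve as the ImageGC here, which is how the kubelet passes its image manager in below. A minimal sketch in the spirit of this package; fakeImageGC and asReclaimFunc are hypothetical names, not part of the commit:

// fakeImageGC satisfies ImageGC implicitly; handy for tests or dry runs.
type fakeImageGC struct{ freedBytes int64 }

func (f *fakeImageGC) DeleteUnusedImages() (int64, error) {
	return f.freedBytes, nil
}

// asReclaimFunc adapts an ImageGC into a nodeReclaimFunc, as deleteImages does.
func asReclaimFunc(gc ImageGC) nodeReclaimFunc {
	return func() (*resource.Quantity, error) {
		freed, err := gc.DeleteUnusedImages()
		if err != nil {
			return nil, err
		}
		return resource.NewQuantity(freed, resource.BinarySI), nil
	}
}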

View File

@@ -537,7 +537,7 @@ func NewMainKubelet(
klet.setNodeStatusFuncs = klet.defaultNodeStatusFuncs()
// setup eviction manager
evictionManager, evictionAdmitHandler, err := eviction.NewManager(klet.resourceAnalyzer, evictionConfig, killPodNow(klet.podWorkers), recorder, nodeRef, klet.clock)
evictionManager, evictionAdmitHandler, err := eviction.NewManager(klet.resourceAnalyzer, evictionConfig, killPodNow(klet.podWorkers), klet.imageManager, recorder, nodeRef, klet.clock)
if err != nil {
return nil, fmt.Errorf("failed to initialize eviction manager: %v", err)
}

View File

@@ -225,7 +225,7 @@ func newTestKubeletWithImageList(
Namespace: "",
}
// setup eviction manager
evictionManager, evictionAdmitHandler, err := eviction.NewManager(kubelet.resourceAnalyzer, eviction.Config{}, killPodNow(kubelet.podWorkers), fakeRecorder, nodeRef, kubelet.clock)
evictionManager, evictionAdmitHandler, err := eviction.NewManager(kubelet.resourceAnalyzer, eviction.Config{}, killPodNow(kubelet.podWorkers), kubelet.imageManager, fakeRecorder, nodeRef, kubelet.clock)
if err != nil {
t.Fatalf("failed to initialize eviction manager: %v", err)
}

View File

@@ -114,7 +114,7 @@ func TestRunOnce(t *testing.T) {
fakeKillPodFunc := func(pod *api.Pod, podStatus api.PodStatus, gracePeriodOverride *int64) error {
return nil
}
evictionManager, evictionAdmitHandler, err := eviction.NewManager(kb.resourceAnalyzer, eviction.Config{}, fakeKillPodFunc, kb.recorder, nodeRef, kb.clock)
evictionManager, evictionAdmitHandler, err := eviction.NewManager(kb.resourceAnalyzer, eviction.Config{}, fakeKillPodFunc, nil, kb.recorder, nodeRef, kb.clock)
if err != nil {
t.Fatalf("failed to initialize eviction manager: %v", err)
}