From db99c20a9a9dc7db82d236e7cc2ff08c65c269ef Mon Sep 17 00:00:00 2001 From: David Ashpole Date: Fri, 4 May 2018 10:58:54 -0700 Subject: [PATCH] cleanup eviction events --- pkg/kubelet/eviction/BUILD | 1 - pkg/kubelet/eviction/api/types.go | 14 +- pkg/kubelet/eviction/eviction_manager.go | 87 ++++------ pkg/kubelet/eviction/helpers.go | 204 +++++++++++------------ pkg/kubelet/eviction/helpers_test.go | 126 ++++++++------ 5 files changed, 222 insertions(+), 210 deletions(-) diff --git a/pkg/kubelet/eviction/BUILD b/pkg/kubelet/eviction/BUILD index 03d662a5d05..6a3ebb3fa08 100644 --- a/pkg/kubelet/eviction/BUILD +++ b/pkg/kubelet/eviction/BUILD @@ -20,7 +20,6 @@ go_test( "//pkg/kubelet/eviction/api:go_default_library", "//pkg/kubelet/lifecycle:go_default_library", "//pkg/kubelet/types:go_default_library", - "//pkg/quota:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library", "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", diff --git a/pkg/kubelet/eviction/api/types.go b/pkg/kubelet/eviction/api/types.go index 3e16d249c97..d32ba9b30f0 100644 --- a/pkg/kubelet/eviction/api/types.go +++ b/pkg/kubelet/eviction/api/types.go @@ -38,8 +38,6 @@ const ( SignalImageFsInodesFree Signal = "imagefs.inodesFree" // SignalAllocatableMemoryAvailable is amount of memory available for pod allocation (i.e. allocatable - workingSet (of pods), in bytes. SignalAllocatableMemoryAvailable Signal = "allocatableMemory.available" - // SignalAllocatableNodeFsAvailable is amount of local storage available for pod allocation - SignalAllocatableNodeFsAvailable Signal = "allocatableNodeFs.available" // SignalPIDAvailable is amount of PID available for pod allocation SignalPIDAvailable Signal = "pid.available" ) @@ -60,13 +58,11 @@ const ( // from either above or below, never both). There is thus no reason to expose the // operator in the Kubelet's public API. Instead, we internally map signal types to operators. var OpForSignal = map[Signal]ThresholdOperator{ - SignalMemoryAvailable: OpLessThan, - SignalNodeFsAvailable: OpLessThan, - SignalNodeFsInodesFree: OpLessThan, - SignalImageFsAvailable: OpLessThan, - SignalImageFsInodesFree: OpLessThan, - SignalAllocatableMemoryAvailable: OpLessThan, - SignalAllocatableNodeFsAvailable: OpLessThan, + SignalMemoryAvailable: OpLessThan, + SignalNodeFsAvailable: OpLessThan, + SignalNodeFsInodesFree: OpLessThan, + SignalImageFsAvailable: OpLessThan, + SignalImageFsInodesFree: OpLessThan, } // ThresholdValue is a value holder that abstracts literal versus percentage based quantity diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index f580e2951b6..fe7800da4ca 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -76,10 +76,10 @@ type managerImpl struct { thresholdsFirstObservedAt thresholdsObservedAt // records the set of thresholds that have been met (including graceperiod) but not yet resolved thresholdsMet []evictionapi.Threshold - // resourceToRankFunc maps a resource to ranking function for that resource. - resourceToRankFunc map[v1.ResourceName]rankFunc - // resourceToNodeReclaimFuncs maps a resource to an ordered list of functions that know how to reclaim that resource. - resourceToNodeReclaimFuncs map[v1.ResourceName]nodeReclaimFuncs + // signalToRankFunc maps a resource to ranking function for that resource. + signalToRankFunc map[evictionapi.Signal]rankFunc + // signalToNodeReclaimFuncs maps a resource to an ordered list of functions that know how to reclaim that resource. + signalToNodeReclaimFuncs map[evictionapi.Signal]nodeReclaimFuncs // last observations from synchronize lastObservations signalObservations // notifierStopCh is a channel used to stop all thresholdNotifiers @@ -239,8 +239,8 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act return nil } m.dedicatedImageFs = &hasImageFs - m.resourceToRankFunc = buildResourceToRankFunc(hasImageFs) - m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, m.containerGC, hasImageFs) + m.signalToRankFunc = buildSignalToRankFunc(hasImageFs) + m.signalToNodeReclaimFuncs = buildSignalToNodeReclaimFuncs(m.imageGC, m.containerGC, hasImageFs) } activePods := podFunc() @@ -333,26 +333,26 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act } } - // determine the set of resources under starvation - starvedResources := getStarvedResources(thresholds) - if len(starvedResources) == 0 { + if len(thresholds) == 0 { glog.V(3).Infof("eviction manager: no resources are starved") return nil } - // rank the resources to reclaim by eviction priority - sort.Sort(byEvictionPriority(starvedResources)) - resourceToReclaim := starvedResources[0] + // rank the thresholds by eviction priority + sort.Sort(byEvictionPriority(thresholds)) + thresholdToReclaim := thresholds[0] + resourceToReclaim, found := signalToResource[thresholdToReclaim.Signal] + if !found { + glog.V(3).Infof("eviction manager: threshold %s was crossed, but reclaim is not implemented for this threshold.", thresholdToReclaim.Signal) + return nil + } glog.Warningf("eviction manager: attempting to reclaim %v", resourceToReclaim) - // determine if this is a soft or hard eviction associated with the resource - softEviction := isSoftEvictionThresholds(thresholds, resourceToReclaim) - // record an event about the resources we are now attempting to reclaim via eviction m.recorder.Eventf(m.nodeRef, v1.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim) // check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods. - if m.reclaimNodeLevelResources(resourceToReclaim) { + if m.reclaimNodeLevelResources(thresholdToReclaim.Signal, resourceToReclaim) { glog.Infof("eviction manager: able to reduce %v pressure without evicting pods.", resourceToReclaim) return nil } @@ -360,9 +360,9 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act glog.Infof("eviction manager: must evict pod(s) to reclaim %v", resourceToReclaim) // rank the pods for eviction - rank, ok := m.resourceToRankFunc[resourceToReclaim] + rank, ok := m.signalToRankFunc[thresholdToReclaim.Signal] if !ok { - glog.Errorf("eviction manager: no ranking function for resource %s", resourceToReclaim) + glog.Errorf("eviction manager: no ranking function for signal %s", thresholdToReclaim.Signal) return nil } @@ -388,30 +388,13 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act // we kill at most a single pod during each eviction interval for i := range activePods { pod := activePods[i] - // If the pod is marked as critical and static, and support for critical pod annotations is enabled, - // do not evict such pods. Static pods are not re-admitted after evictions. - // https://github.com/kubernetes/kubernetes/issues/40573 has more details. - if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) && - kubelettypes.IsCriticalPod(pod) && kubepod.IsStaticPod(pod) { - continue - } - status := v1.PodStatus{ - Phase: v1.PodFailed, - Message: fmt.Sprintf(message, resourceToReclaim), - Reason: reason, - } - // record that we are evicting the pod - m.recorder.Eventf(pod, v1.EventTypeWarning, reason, fmt.Sprintf(message, resourceToReclaim)) gracePeriodOverride := int64(0) - if softEviction { + if !isHardEvictionThreshold(thresholdToReclaim) { gracePeriodOverride = m.config.MaxPodGracePeriodSeconds } - // this is a blocking call and should only return when the pod and its containers are killed. - err := m.killPodFunc(pod, status, &gracePeriodOverride) - if err != nil { - glog.Warningf("eviction manager: error while evicting pod %s: %v", format.Pod(pod), err) + if m.evictPod(pod, gracePeriodOverride, evictionMessage(resourceToReclaim, pod, statsFunc)) { + return []*v1.Pod{pod} } - return []*v1.Pod{pod} } glog.Infof("eviction manager: unable to evict any pods from the node") return nil @@ -440,8 +423,8 @@ func (m *managerImpl) waitForPodsCleanup(podCleanedUpFunc PodCleanedUpFunc, pods } // reclaimNodeLevelResources attempts to reclaim node level resources. returns true if thresholds were satisfied and no pod eviction is required. -func (m *managerImpl) reclaimNodeLevelResources(resourceToReclaim v1.ResourceName) bool { - nodeReclaimFuncs := m.resourceToNodeReclaimFuncs[resourceToReclaim] +func (m *managerImpl) reclaimNodeLevelResources(signalToReclaim evictionapi.Signal, resourceToReclaim v1.ResourceName) bool { + nodeReclaimFuncs := m.signalToNodeReclaimFuncs[signalToReclaim] for _, nodeReclaimFunc := range nodeReclaimFuncs { // attempt to reclaim the pressured resource. if err := nodeReclaimFunc(); err != nil { @@ -512,7 +495,7 @@ func (m *managerImpl) emptyDirLimitEviction(podStats statsapi.PodStats, pod *v1. used := podVolumeUsed[pod.Spec.Volumes[i].Name] if used != nil && size != nil && size.Sign() == 1 && used.Cmp(*size) > 0 { // the emptyDir usage exceeds the size limit, evict the pod - return m.evictPod(pod, v1.ResourceName("EmptyDir"), fmt.Sprintf("emptyDir usage exceeds the limit %q", size.String())) + return m.evictPod(pod, 0, fmt.Sprintf(emptyDirMessage, pod.Spec.Volumes[i].Name, size.String())) } } } @@ -540,10 +523,11 @@ func (m *managerImpl) podEphemeralStorageLimitEviction(podStats statsapi.PodStat return false } - podEphemeralStorageTotalUsage.Add(podEphemeralUsage[resourceDisk]) - if podEphemeralStorageTotalUsage.Cmp(podLimits[v1.ResourceEphemeralStorage]) > 0 { + podEphemeralStorageTotalUsage.Add(podEphemeralUsage[v1.ResourceEphemeralStorage]) + podEphemeralStorageLimit := podLimits[v1.ResourceEphemeralStorage] + if podEphemeralStorageTotalUsage.Cmp(podEphemeralStorageLimit) > 0 { // the total usage of pod exceeds the total size limit of containers, evict the pod - return m.evictPod(pod, v1.ResourceEphemeralStorage, fmt.Sprintf("pod ephemeral local storage usage exceeds the total limit of containers %v", podLimits[v1.ResourceEphemeralStorage])) + return m.evictPod(pod, 0, fmt.Sprintf(podEphemeralStorageMessage, podEphemeralStorageLimit.String())) } return false } @@ -565,7 +549,7 @@ func (m *managerImpl) containerEphemeralStorageLimitEviction(podStats statsapi.P if ephemeralStorageThreshold, ok := thresholdsMap[containerStat.Name]; ok { if ephemeralStorageThreshold.Cmp(*containerUsed) < 0 { - return m.evictPod(pod, v1.ResourceEphemeralStorage, fmt.Sprintf("container's ephemeral local storage usage exceeds the limit %q", ephemeralStorageThreshold.String())) + return m.evictPod(pod, 0, fmt.Sprintf(containerEphemeralStorageMessage, containerStat.Name, ephemeralStorageThreshold.String())) } } @@ -573,21 +557,24 @@ func (m *managerImpl) containerEphemeralStorageLimitEviction(podStats statsapi.P return false } -func (m *managerImpl) evictPod(pod *v1.Pod, resourceName v1.ResourceName, evictMsg string) bool { +func (m *managerImpl) evictPod(pod *v1.Pod, gracePeriodOverride int64, evictMsg string) bool { + // If the pod is marked as critical and static, and support for critical pod annotations is enabled, + // do not evict such pods. Static pods are not re-admitted after evictions. + // https://github.com/kubernetes/kubernetes/issues/40573 has more details. if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) && kubelettypes.IsCriticalPod(pod) && kubepod.IsStaticPod(pod) { - glog.Errorf("eviction manager: cannot evict a critical pod %s", format.Pod(pod)) + glog.Errorf("eviction manager: cannot evict a critical static pod %s", format.Pod(pod)) return false } status := v1.PodStatus{ Phase: v1.PodFailed, - Message: fmt.Sprintf(message, resourceName), + Message: evictMsg, Reason: reason, } // record that we are evicting the pod m.recorder.Eventf(pod, v1.EventTypeWarning, reason, evictMsg) - gracePeriod := int64(0) - err := m.killPodFunc(pod, status, &gracePeriod) + // this is a blocking call and should only return when the pod and its containers are killed. + err := m.killPodFunc(pod, status, &gracePeriodOverride) if err != nil { glog.Errorf("eviction manager: pod %s failed to evict %v", format.Pod(pod), err) } else { diff --git a/pkg/kubelet/eviction/helpers.go b/pkg/kubelet/eviction/helpers.go index 444dbbd1729..6f75a84244d 100644 --- a/pkg/kubelet/eviction/helpers.go +++ b/pkg/kubelet/eviction/helpers.go @@ -41,19 +41,17 @@ const ( // the reason reported back in status. reason = "Evicted" // the message associated with the reason. - message = "The node was low on resource: %v." - // disk, in bytes. internal to this module, used to account for local disk usage. - resourceDisk v1.ResourceName = "disk" + message = "The node was low on resource: %v. " + // additional information for containers exceeding requests + containerMessage = "Container %s was using %s, which exceeds its request of %s. " + // additional information for containers which have exceeded their ES limit + containerEphemeralStorageMessage = "Container %s exceeded its local ephemeral storage limit %q. " + // additional information for pods which have exceeded their ES limit + podEphemeralStorageMessage = "Pod ephemeral local storage usage exceeds the total limit of containers %s. " + // additional information for empty-dir volumes which have exceeded their size limit + emptyDirMessage = "Usage of EmptyDir volume %q exceeds the limit %q. " // inodes, number. internal to this module, used to account for local disk inode consumption. resourceInodes v1.ResourceName = "inodes" - // imagefs, in bytes. internal to this module, used to account for local image filesystem usage. - resourceImageFs v1.ResourceName = "imagefs" - // imagefs inodes, number. internal to this module, used to account for local image filesystem inodes. - resourceImageFsInodes v1.ResourceName = "imagefsInodes" - // nodefs, in bytes. internal to this module, used to account for local node root filesystem usage. - resourceNodeFs v1.ResourceName = "nodefs" - // nodefs inodes, number. internal to this module, used to account for local node root filesystem inodes. - resourceNodeFsInodes v1.ResourceName = "nodefsInodes" // this prevents constantly updating the memcg notifier if synchronize // is run frequently. notifierRefreshInterval = 10 * time.Second @@ -64,8 +62,6 @@ var ( signalToNodeCondition map[evictionapi.Signal]v1.NodeConditionType // signalToResource maps a Signal to its associated Resource. signalToResource map[evictionapi.Signal]v1.ResourceName - // resourceClaimToSignal maps a Resource that can be reclaimed to its associated Signal - resourceClaimToSignal map[v1.ResourceName][]evictionapi.Signal ) func init() { @@ -83,17 +79,10 @@ func init() { signalToResource = map[evictionapi.Signal]v1.ResourceName{} signalToResource[evictionapi.SignalMemoryAvailable] = v1.ResourceMemory signalToResource[evictionapi.SignalAllocatableMemoryAvailable] = v1.ResourceMemory - signalToResource[evictionapi.SignalImageFsAvailable] = resourceImageFs - signalToResource[evictionapi.SignalImageFsInodesFree] = resourceImageFsInodes - signalToResource[evictionapi.SignalNodeFsAvailable] = resourceNodeFs - signalToResource[evictionapi.SignalNodeFsInodesFree] = resourceNodeFsInodes - - // maps resource to signals (the following resource could be reclaimed) - resourceClaimToSignal = map[v1.ResourceName][]evictionapi.Signal{} - resourceClaimToSignal[resourceNodeFs] = []evictionapi.Signal{evictionapi.SignalNodeFsAvailable} - resourceClaimToSignal[resourceImageFs] = []evictionapi.Signal{evictionapi.SignalImageFsAvailable} - resourceClaimToSignal[resourceNodeFsInodes] = []evictionapi.Signal{evictionapi.SignalNodeFsInodesFree} - resourceClaimToSignal[resourceImageFsInodes] = []evictionapi.Signal{evictionapi.SignalImageFsInodesFree} + signalToResource[evictionapi.SignalImageFsAvailable] = v1.ResourceEphemeralStorage + signalToResource[evictionapi.SignalImageFsInodesFree] = resourceInodes + signalToResource[evictionapi.SignalNodeFsAvailable] = v1.ResourceEphemeralStorage + signalToResource[evictionapi.SignalNodeFsInodesFree] = resourceInodes } // validSignal returns true if the signal is supported. @@ -310,10 +299,10 @@ func diskUsage(fsStats *statsapi.FsStats) *resource.Quantity { // inodeUsage converts inodes consumed into a resource quantity. func inodeUsage(fsStats *statsapi.FsStats) *resource.Quantity { if fsStats == nil || fsStats.InodesUsed == nil { - return &resource.Quantity{Format: resource.BinarySI} + return &resource.Quantity{Format: resource.DecimalSI} } usage := int64(*fsStats.InodesUsed) - return resource.NewQuantity(usage, resource.BinarySI) + return resource.NewQuantity(usage, resource.DecimalSI) } // memoryUsage converts working set into a resource quantity. @@ -343,7 +332,7 @@ func localVolumeNames(pod *v1.Pod) []string { // containerUsage aggregates container disk usage and inode consumption for the specified stats to measure. func containerUsage(podStats statsapi.PodStats, statsToMeasure []fsStatsType) v1.ResourceList { disk := resource.Quantity{Format: resource.BinarySI} - inodes := resource.Quantity{Format: resource.BinarySI} + inodes := resource.Quantity{Format: resource.DecimalSI} for _, container := range podStats.Containers { if hasFsStatsType(statsToMeasure, fsStatsRoot) { disk.Add(*diskUsage(container.Rootfs)) @@ -355,15 +344,15 @@ func containerUsage(podStats statsapi.PodStats, statsToMeasure []fsStatsType) v1 } } return v1.ResourceList{ - resourceDisk: disk, - resourceInodes: inodes, + v1.ResourceEphemeralStorage: disk, + resourceInodes: inodes, } } // podLocalVolumeUsage aggregates pod local volumes disk usage and inode consumption for the specified stats to measure. func podLocalVolumeUsage(volumeNames []string, podStats statsapi.PodStats) v1.ResourceList { disk := resource.Quantity{Format: resource.BinarySI} - inodes := resource.Quantity{Format: resource.BinarySI} + inodes := resource.Quantity{Format: resource.DecimalSI} for _, volumeName := range volumeNames { for _, volumeStats := range podStats.VolumeStats { if volumeStats.Name == volumeName { @@ -374,29 +363,29 @@ func podLocalVolumeUsage(volumeNames []string, podStats statsapi.PodStats) v1.Re } } return v1.ResourceList{ - resourceDisk: disk, - resourceInodes: inodes, + v1.ResourceEphemeralStorage: disk, + resourceInodes: inodes, } } // podDiskUsage aggregates pod disk usage and inode consumption for the specified stats to measure. func podDiskUsage(podStats statsapi.PodStats, pod *v1.Pod, statsToMeasure []fsStatsType) (v1.ResourceList, error) { disk := resource.Quantity{Format: resource.BinarySI} - inodes := resource.Quantity{Format: resource.BinarySI} + inodes := resource.Quantity{Format: resource.DecimalSI} containerUsageList := containerUsage(podStats, statsToMeasure) - disk.Add(containerUsageList[resourceDisk]) + disk.Add(containerUsageList[v1.ResourceEphemeralStorage]) inodes.Add(containerUsageList[resourceInodes]) if hasFsStatsType(statsToMeasure, fsStatsLocalVolumeSource) { volumeNames := localVolumeNames(pod) podLocalVolumeUsageList := podLocalVolumeUsage(volumeNames, podStats) - disk.Add(podLocalVolumeUsageList[resourceDisk]) + disk.Add(podLocalVolumeUsageList[v1.ResourceEphemeralStorage]) inodes.Add(podLocalVolumeUsageList[resourceInodes]) } return v1.ResourceList{ - resourceDisk: disk, - resourceInodes: inodes, + v1.ResourceEphemeralStorage: disk, + resourceInodes: inodes, }, nil } @@ -426,21 +415,21 @@ func localEphemeralVolumeNames(pod *v1.Pod) []string { // podLocalEphemeralStorageUsage aggregates pod local ephemeral storage usage and inode consumption for the specified stats to measure. func podLocalEphemeralStorageUsage(podStats statsapi.PodStats, pod *v1.Pod, statsToMeasure []fsStatsType) (v1.ResourceList, error) { disk := resource.Quantity{Format: resource.BinarySI} - inodes := resource.Quantity{Format: resource.BinarySI} + inodes := resource.Quantity{Format: resource.DecimalSI} containerUsageList := containerUsage(podStats, statsToMeasure) - disk.Add(containerUsageList[resourceDisk]) + disk.Add(containerUsageList[v1.ResourceEphemeralStorage]) inodes.Add(containerUsageList[resourceInodes]) if hasFsStatsType(statsToMeasure, fsStatsLocalVolumeSource) { volumeNames := localEphemeralVolumeNames(pod) podLocalVolumeUsageList := podLocalVolumeUsage(volumeNames, podStats) - disk.Add(podLocalVolumeUsageList[resourceDisk]) + disk.Add(podLocalVolumeUsageList[v1.ResourceEphemeralStorage]) inodes.Add(podLocalVolumeUsageList[resourceInodes]) } return v1.ResourceList{ - resourceDisk: disk, - resourceInodes: inodes, + v1.ResourceEphemeralStorage: disk, + resourceInodes: inodes, }, nil } @@ -605,7 +594,7 @@ func memory(stats statsFunc) cmpFunc { // max(max of init container requests, sum of container requests) func podRequest(pod *v1.Pod, resourceName v1.ResourceName) resource.Quantity { containerValue := resource.Quantity{Format: resource.BinarySI} - if resourceName == resourceDisk && !utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) { + if resourceName == v1.ResourceEphemeralStorage && !utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) { // if the local storage capacity isolation feature gate is disabled, pods request 0 disk return containerValue } @@ -613,7 +602,7 @@ func podRequest(pod *v1.Pod, resourceName v1.ResourceName) resource.Quantity { switch resourceName { case v1.ResourceMemory: containerValue.Add(*pod.Spec.Containers[i].Resources.Requests.Memory()) - case resourceDisk: + case v1.ResourceEphemeralStorage: containerValue.Add(*pod.Spec.Containers[i].Resources.Requests.StorageEphemeral()) } } @@ -624,7 +613,7 @@ func podRequest(pod *v1.Pod, resourceName v1.ResourceName) resource.Quantity { if initValue.Cmp(*pod.Spec.InitContainers[i].Resources.Requests.Memory()) < 0 { initValue = *pod.Spec.InitContainers[i].Resources.Requests.Memory() } - case resourceDisk: + case v1.ResourceEphemeralStorage: if initValue.Cmp(*pod.Spec.InitContainers[i].Resources.Requests.StorageEphemeral()) < 0 { initValue = *pod.Spec.InitContainers[i].Resources.Requests.StorageEphemeral() } @@ -681,9 +670,9 @@ func disk(stats statsFunc, fsStatsToMeasure []fsStatsType, diskResource v1.Resou // adjust p1, p2 usage relative to the request (if any) p1Disk := p1Usage[diskResource] p2Disk := p2Usage[diskResource] - p1Request := podRequest(p1, resourceDisk) + p1Request := podRequest(p1, v1.ResourceEphemeralStorage) p1Disk.Sub(p1Request) - p2Request := podRequest(p2, resourceDisk) + p2Request := podRequest(p2, v1.ResourceEphemeralStorage) p2Disk.Sub(p2Request) // prioritize evicting the pod which has the larger consumption of disk return p2Disk.Cmp(p1Disk) @@ -716,14 +705,15 @@ func rankDiskPressureFunc(fsStatsToMeasure []fsStatsType, diskResource v1.Resour } // byEvictionPriority implements sort.Interface for []v1.ResourceName. -type byEvictionPriority []v1.ResourceName +type byEvictionPriority []evictionapi.Threshold func (a byEvictionPriority) Len() int { return len(a) } func (a byEvictionPriority) Swap(i, j int) { a[i], a[j] = a[j], a[i] } -// Less ranks memory before all other resources. +// Less ranks memory before all other resources, and ranks thresholds with no resource to reclaim last func (a byEvictionPriority) Less(i, j int) bool { - return a[i] == v1.ResourceMemory + _, jSignalHasResource := signalToResource[a[j].Signal] + return a[i].Signal == evictionapi.SignalMemoryAvailable || a[i].Signal == evictionapi.SignalAllocatableMemoryAvailable || !jSignalHasResource } // makeSignalObservations derives observations using the specified summary provider. @@ -761,8 +751,8 @@ func makeSignalObservations(summary *statsapi.Summary) (signalObservations, stat } if nodeFs.InodesFree != nil && nodeFs.Inodes != nil { result[evictionapi.SignalNodeFsInodesFree] = signalObservation{ - available: resource.NewQuantity(int64(*nodeFs.InodesFree), resource.BinarySI), - capacity: resource.NewQuantity(int64(*nodeFs.Inodes), resource.BinarySI), + available: resource.NewQuantity(int64(*nodeFs.InodesFree), resource.DecimalSI), + capacity: resource.NewQuantity(int64(*nodeFs.Inodes), resource.DecimalSI), time: nodeFs.Time, } } @@ -777,8 +767,8 @@ func makeSignalObservations(summary *statsapi.Summary) (signalObservations, stat } if imageFs.InodesFree != nil && imageFs.Inodes != nil { result[evictionapi.SignalImageFsInodesFree] = signalObservation{ - available: resource.NewQuantity(int64(*imageFs.InodesFree), resource.BinarySI), - capacity: resource.NewQuantity(int64(*imageFs.Inodes), resource.BinarySI), + available: resource.NewQuantity(int64(*imageFs.InodesFree), resource.DecimalSI), + capacity: resource.NewQuantity(int64(*imageFs.Inodes), resource.DecimalSI), time: imageFs.Time, } } @@ -1006,57 +996,34 @@ func compareThresholdValue(a evictionapi.ThresholdValue, b evictionapi.Threshold return a.Percentage == b.Percentage } -// getStarvedResources returns the set of resources that are starved based on thresholds met. -func getStarvedResources(thresholds []evictionapi.Threshold) []v1.ResourceName { - results := []v1.ResourceName{} - for _, threshold := range thresholds { - if starvedResource, found := signalToResource[threshold.Signal]; found { - results = append(results, starvedResource) - } - } - return results -} - -// isSoftEviction returns true if the thresholds met for the starved resource are only soft thresholds -func isSoftEvictionThresholds(thresholds []evictionapi.Threshold, starvedResource v1.ResourceName) bool { - for _, threshold := range thresholds { - if resourceToCheck := signalToResource[threshold.Signal]; resourceToCheck != starvedResource { - continue - } - if isHardEvictionThreshold(threshold) { - return false - } - } - return true -} - // isHardEvictionThreshold returns true if eviction should immediately occur func isHardEvictionThreshold(threshold evictionapi.Threshold) bool { return threshold.GracePeriod == time.Duration(0) } -// buildResourceToRankFunc returns ranking functions associated with resources -func buildResourceToRankFunc(withImageFs bool) map[v1.ResourceName]rankFunc { - resourceToRankFunc := map[v1.ResourceName]rankFunc{ - v1.ResourceMemory: rankMemoryPressure, +// buildSignalToRankFunc returns ranking functions associated with resources +func buildSignalToRankFunc(withImageFs bool) map[evictionapi.Signal]rankFunc { + signalToRankFunc := map[evictionapi.Signal]rankFunc{ + evictionapi.SignalMemoryAvailable: rankMemoryPressure, + evictionapi.SignalAllocatableMemoryAvailable: rankMemoryPressure, } // usage of an imagefs is optional if withImageFs { // with an imagefs, nodefs pod rank func for eviction only includes logs and local volumes - resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk) - resourceToRankFunc[resourceNodeFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes) + signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage) + signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes) // with an imagefs, imagefs pod rank func for eviction only includes rootfs - resourceToRankFunc[resourceImageFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceDisk) - resourceToRankFunc[resourceImageFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceInodes) + signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, v1.ResourceEphemeralStorage) + signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceInodes) } else { // without an imagefs, nodefs pod rank func for eviction looks at all fs stats. // since imagefs and nodefs share a common device, they share common ranking functions. - resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk) - resourceToRankFunc[resourceNodeFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes) - resourceToRankFunc[resourceImageFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk) - resourceToRankFunc[resourceImageFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes) + signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage) + signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes) + signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage) + signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes) } - return resourceToRankFunc + return signalToRankFunc } // PodIsEvicted returns true if the reported pod status is due to an eviction. @@ -1064,26 +1031,57 @@ func PodIsEvicted(podStatus v1.PodStatus) bool { return podStatus.Phase == v1.PodFailed && podStatus.Reason == reason } -// buildResourceToNodeReclaimFuncs returns reclaim functions associated with resources. -func buildResourceToNodeReclaimFuncs(imageGC ImageGC, containerGC ContainerGC, withImageFs bool) map[v1.ResourceName]nodeReclaimFuncs { - resourceToReclaimFunc := map[v1.ResourceName]nodeReclaimFuncs{} +// buildSignalToNodeReclaimFuncs returns reclaim functions associated with resources. +func buildSignalToNodeReclaimFuncs(imageGC ImageGC, containerGC ContainerGC, withImageFs bool) map[evictionapi.Signal]nodeReclaimFuncs { + signalToReclaimFunc := map[evictionapi.Signal]nodeReclaimFuncs{} // usage of an imagefs is optional if withImageFs { // with an imagefs, nodefs pressure should just delete logs - resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{} - resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{} + signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{} + signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{} // with an imagefs, imagefs pressure should delete unused images - resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages} - resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages} + signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages} + signalToReclaimFunc[evictionapi.SignalImageFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages} } else { // without an imagefs, nodefs pressure should delete logs, and unused images // since imagefs and nodefs share a common device, they share common reclaim functions - resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages} - resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages} - resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages} - resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages} + signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages} + signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages} + signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages} + signalToReclaimFunc[evictionapi.SignalImageFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages} } - return resourceToReclaimFunc + return signalToReclaimFunc +} + +// evictionMessage constructs a useful message about why an eviction occurred +func evictionMessage(resourceToReclaim v1.ResourceName, pod *v1.Pod, stats statsFunc) string { + message := fmt.Sprintf(message, resourceToReclaim) + podStats, ok := stats(pod) + if !ok { + return message + } + for _, containerStats := range podStats.Containers { + for _, container := range pod.Spec.Containers { + if container.Name == containerStats.Name { + requests := container.Resources.Requests[resourceToReclaim] + var usage *resource.Quantity + switch resourceToReclaim { + case v1.ResourceEphemeralStorage: + if containerStats.Rootfs != nil && containerStats.Rootfs.UsedBytes != nil && containerStats.Logs != nil && containerStats.Logs.UsedBytes != nil { + usage = resource.NewQuantity(int64(*containerStats.Rootfs.UsedBytes+*containerStats.Logs.UsedBytes), resource.BinarySI) + } + case v1.ResourceMemory: + if containerStats.Memory != nil && containerStats.Memory.WorkingSetBytes != nil { + usage = resource.NewQuantity(int64(*containerStats.Memory.WorkingSetBytes), resource.BinarySI) + } + } + if usage != nil && usage.Cmp(requests) > 0 { + message += fmt.Sprintf(containerMessage, container.Name, usage.String(), requests.String()) + } + } + } + } + return message } // thresholdStopCh is a ThresholdStopCh which can only be closed after notifierRefreshInterval time has passed diff --git a/pkg/kubelet/eviction/helpers_test.go b/pkg/kubelet/eviction/helpers_test.go index 094940bea98..d7ce3c8586f 100644 --- a/pkg/kubelet/eviction/helpers_test.go +++ b/pkg/kubelet/eviction/helpers_test.go @@ -19,6 +19,7 @@ package eviction import ( "fmt" "reflect" + "sort" "sync" "testing" "time" @@ -29,12 +30,10 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/clock" utilfeature "k8s.io/apiserver/pkg/util/feature" - api "k8s.io/kubernetes/pkg/apis/core" "k8s.io/kubernetes/pkg/features" statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1" evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" - "k8s.io/kubernetes/pkg/quota" ) func quantityMustParse(value string) *resource.Quantity { @@ -470,7 +469,7 @@ func TestOrderedByExceedsRequestDisk(t *testing.T) { return result, found } pods := []*v1.Pod{below, exceeds} - orderedBy(exceedDiskRequests(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)).Sort(pods) + orderedBy(exceedDiskRequests(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)).Sort(pods) expected := []*v1.Pod{exceeds, below} for i := range expected { @@ -584,7 +583,7 @@ func TestOrderedbyDisk(t *testing.T) { return result, found } pods := []*v1.Pod{pod1, pod2, pod3, pod4, pod5, pod6} - orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)).Sort(pods) + orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)).Sort(pods) expected := []*v1.Pod{pod1, pod3, pod2, pod4, pod5, pod6} for i := range expected { if pods[i] != expected[i] { @@ -651,7 +650,7 @@ func TestOrderedbyDiskDisableLocalStorage(t *testing.T) { return result, found } pods := []*v1.Pod{pod1, pod3, pod2, pod4, pod5, pod6} - orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)).Sort(pods) + orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)).Sort(pods) expected := []*v1.Pod{pod5, pod3, pod1, pod6, pod4, pod2} for i := range expected { if pods[i] != expected[i] { @@ -780,7 +779,7 @@ func TestOrderedByPriorityDisk(t *testing.T) { pods := []*v1.Pod{pod8, pod7, pod6, pod5, pod4, pod3, pod2, pod1} expected := []*v1.Pod{pod1, pod2, pod3, pod4, pod5, pod6, pod7, pod8} fsStatsToMeasure := []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource} - orderedBy(exceedDiskRequests(statsFn, fsStatsToMeasure, resourceDisk), priority, disk(statsFn, fsStatsToMeasure, resourceDisk)).Sort(pods) + orderedBy(exceedDiskRequests(statsFn, fsStatsToMeasure, v1.ResourceEphemeralStorage), priority, disk(statsFn, fsStatsToMeasure, v1.ResourceEphemeralStorage)).Sort(pods) for i := range expected { if pods[i] != expected[i] { t.Errorf("Expected pod[%d]: %s, but got: %s", i, expected[i].Name, pods[i].Name) @@ -932,6 +931,80 @@ func TestOrderedByPriorityMemory(t *testing.T) { } } +func TestSortByEvictionPriority(t *testing.T) { + for _, tc := range []struct { + name string + thresholds []evictionapi.Threshold + expected []evictionapi.Threshold + }{ + { + name: "empty threshold list", + thresholds: []evictionapi.Threshold{}, + expected: []evictionapi.Threshold{}, + }, + { + name: "memory first, PID last", + thresholds: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalPIDAvailable, + }, + { + Signal: evictionapi.SignalNodeFsAvailable, + }, + { + Signal: evictionapi.SignalMemoryAvailable, + }, + }, + expected: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalMemoryAvailable, + }, + { + Signal: evictionapi.SignalNodeFsAvailable, + }, + { + Signal: evictionapi.SignalPIDAvailable, + }, + }, + }, + { + name: "allocatable memory first, PID last", + thresholds: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalPIDAvailable, + }, + { + Signal: evictionapi.SignalNodeFsAvailable, + }, + { + Signal: evictionapi.SignalAllocatableMemoryAvailable, + }, + }, + expected: []evictionapi.Threshold{ + { + Signal: evictionapi.SignalAllocatableMemoryAvailable, + }, + { + Signal: evictionapi.SignalNodeFsAvailable, + }, + { + Signal: evictionapi.SignalPIDAvailable, + }, + }, + }, + } { + t.Run(tc.name, func(t *testing.T) { + sort.Sort(byEvictionPriority(tc.thresholds)) + for i := range tc.expected { + if tc.thresholds[i].Signal != tc.expected[i].Signal { + t.Errorf("At index %d, expected threshold with signal %s, but got %s", i, tc.expected[i].Signal, tc.thresholds[i].Signal) + } + } + + }) + } +} + type fakeSummaryProvider struct { result *statsapi.Summary } @@ -1622,47 +1695,6 @@ func TestHasNodeConditions(t *testing.T) { } } -func TestGetStarvedResources(t *testing.T) { - testCases := map[string]struct { - inputs []evictionapi.Threshold - result []v1.ResourceName - }{ - "memory.available": { - inputs: []evictionapi.Threshold{ - {Signal: evictionapi.SignalMemoryAvailable}, - }, - result: []v1.ResourceName{v1.ResourceMemory}, - }, - "imagefs.available": { - inputs: []evictionapi.Threshold{ - {Signal: evictionapi.SignalImageFsAvailable}, - }, - result: []v1.ResourceName{resourceImageFs}, - }, - "nodefs.available": { - inputs: []evictionapi.Threshold{ - {Signal: evictionapi.SignalNodeFsAvailable}, - }, - result: []v1.ResourceName{resourceNodeFs}, - }, - } - var internalResourceNames = func(in []v1.ResourceName) []api.ResourceName { - var out []api.ResourceName - for _, name := range in { - out = append(out, api.ResourceName(name)) - } - return out - } - for testName, testCase := range testCases { - actual := getStarvedResources(testCase.inputs) - actualSet := quota.ToSet(internalResourceNames(actual)) - expectedSet := quota.ToSet(internalResourceNames(testCase.result)) - if !actualSet.Equal(expectedSet) { - t.Errorf("Test case: %s, expected: %v, actual: %v", testName, expectedSet, actualSet) - } - } -} - func TestParsePercentage(t *testing.T) { testCases := map[string]struct { hasError bool