mirror of
				https://github.com/k3s-io/kubernetes.git
				synced 2025-10-31 05:40:42 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			593 lines
		
	
	
		
			22 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			593 lines
		
	
	
		
			22 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| /*
 | |
| Copyright 2016 The Kubernetes Authors.
 | |
| 
 | |
| Licensed under the Apache License, Version 2.0 (the "License");
 | |
| you may not use this file except in compliance with the License.
 | |
| You may obtain a copy of the License at
 | |
| 
 | |
|     http://www.apache.org/licenses/LICENSE-2.0
 | |
| 
 | |
| Unless required by applicable law or agreed to in writing, software
 | |
| distributed under the License is distributed on an "AS IS" BASIS,
 | |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| See the License for the specific language governing permissions and
 | |
| limitations under the License.
 | |
| */
 | |
| 
 | |
| package eviction
 | |
| 
 | |
| import (
 | |
| 	"fmt"
 | |
| 	"sort"
 | |
| 	"sync"
 | |
| 	"time"
 | |
| 
 | |
| 	"k8s.io/klog/v2"
 | |
| 
 | |
| 	v1 "k8s.io/api/core/v1"
 | |
| 	"k8s.io/apimachinery/pkg/api/resource"
 | |
| 	"k8s.io/apimachinery/pkg/types"
 | |
| 	"k8s.io/apimachinery/pkg/util/clock"
 | |
| 	utilfeature "k8s.io/apiserver/pkg/util/feature"
 | |
| 	"k8s.io/client-go/tools/record"
 | |
| 	apiv1resource "k8s.io/kubernetes/pkg/api/v1/resource"
 | |
| 	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
 | |
| 	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
 | |
| 	"k8s.io/kubernetes/pkg/features"
 | |
| 	statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
 | |
| 	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
 | |
| 	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
 | |
| 	"k8s.io/kubernetes/pkg/kubelet/metrics"
 | |
| 	"k8s.io/kubernetes/pkg/kubelet/server/stats"
 | |
| 	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
 | |
| 	"k8s.io/kubernetes/pkg/kubelet/util/format"
 | |
| )
 | |
| 
 | |
// Timing parameters used by waitForPodsCleanup after pods have been evicted.
const (
	// podCleanupTimeout is the maximum time to wait for evicted pods to be cleaned up.
	podCleanupTimeout = 30 * time.Second
	// podCleanupPollFreq is how often the cleanup state of evicted pods is polled.
	podCleanupPollFreq = time.Second
)
 | |
| 
 | |
// Label values recorded on the evictions metric when a pod is evicted for
// exceeding a local storage limit (as opposed to a node-level eviction signal).
const (
	// signalEphemeralContainerFsLimit labels evictions caused by a container
	// exceeding its ephemeral-storage limit.
	signalEphemeralContainerFsLimit string = "ephemeralcontainerfs.limit"
	// signalEphemeralPodFsLimit labels evictions caused by a pod exceeding its
	// total ephemeral-storage limit.
	signalEphemeralPodFsLimit string = "ephemeralpodfs.limit"
	// signalEmptyDirFsLimit labels evictions caused by an emptyDir volume
	// exceeding its size limit.
	signalEmptyDirFsLimit string = "emptydirfs.limit"
)
 | |
| 
 | |
| // managerImpl implements Manager
 | |
| type managerImpl struct {
 | |
| 	//  used to track time
 | |
| 	clock clock.Clock
 | |
| 	// config is how the manager is configured
 | |
| 	config Config
 | |
| 	// the function to invoke to kill a pod
 | |
| 	killPodFunc KillPodFunc
 | |
| 	// the function to get the mirror pod by a given statid pod
 | |
| 	mirrorPodFunc MirrorPodFunc
 | |
| 	// the interface that knows how to do image gc
 | |
| 	imageGC ImageGC
 | |
| 	// the interface that knows how to do container gc
 | |
| 	containerGC ContainerGC
 | |
| 	// protects access to internal state
 | |
| 	sync.RWMutex
 | |
| 	// node conditions are the set of conditions present
 | |
| 	nodeConditions []v1.NodeConditionType
 | |
| 	// captures when a node condition was last observed based on a threshold being met
 | |
| 	nodeConditionsLastObservedAt nodeConditionsObservedAt
 | |
| 	// nodeRef is a reference to the node
 | |
| 	nodeRef *v1.ObjectReference
 | |
| 	// used to record events about the node
 | |
| 	recorder record.EventRecorder
 | |
| 	// used to measure usage stats on system
 | |
| 	summaryProvider stats.SummaryProvider
 | |
| 	// records when a threshold was first observed
 | |
| 	thresholdsFirstObservedAt thresholdsObservedAt
 | |
| 	// records the set of thresholds that have been met (including graceperiod) but not yet resolved
 | |
| 	thresholdsMet []evictionapi.Threshold
 | |
| 	// signalToRankFunc maps a resource to ranking function for that resource.
 | |
| 	signalToRankFunc map[evictionapi.Signal]rankFunc
 | |
| 	// signalToNodeReclaimFuncs maps a resource to an ordered list of functions that know how to reclaim that resource.
 | |
| 	signalToNodeReclaimFuncs map[evictionapi.Signal]nodeReclaimFuncs
 | |
| 	// last observations from synchronize
 | |
| 	lastObservations signalObservations
 | |
| 	// dedicatedImageFs indicates if imagefs is on a separate device from the rootfs
 | |
| 	dedicatedImageFs *bool
 | |
| 	// thresholdNotifiers is a list of memory threshold notifiers which each notify for a memory eviction threshold
 | |
| 	thresholdNotifiers []ThresholdNotifier
 | |
| 	// thresholdsLastUpdated is the last time the thresholdNotifiers were updated.
 | |
| 	thresholdsLastUpdated time.Time
 | |
| 	// etcHostsPath is a function that will get the etc-hosts file's path for a pod given its UID
 | |
| 	etcHostsPath func(podUID types.UID) string
 | |
| }
 | |
| 
 | |
| // ensure it implements the required interface
 | |
| var _ Manager = &managerImpl{}
 | |
| 
 | |
| // NewManager returns a configured Manager and an associated admission handler to enforce eviction configuration.
 | |
| func NewManager(
 | |
| 	summaryProvider stats.SummaryProvider,
 | |
| 	config Config,
 | |
| 	killPodFunc KillPodFunc,
 | |
| 	mirrorPodFunc MirrorPodFunc,
 | |
| 	imageGC ImageGC,
 | |
| 	containerGC ContainerGC,
 | |
| 	recorder record.EventRecorder,
 | |
| 	nodeRef *v1.ObjectReference,
 | |
| 	clock clock.Clock,
 | |
| 	etcHostsPath func(types.UID) string,
 | |
| ) (Manager, lifecycle.PodAdmitHandler) {
 | |
| 	manager := &managerImpl{
 | |
| 		clock:                        clock,
 | |
| 		killPodFunc:                  killPodFunc,
 | |
| 		mirrorPodFunc:                mirrorPodFunc,
 | |
| 		imageGC:                      imageGC,
 | |
| 		containerGC:                  containerGC,
 | |
| 		config:                       config,
 | |
| 		recorder:                     recorder,
 | |
| 		summaryProvider:              summaryProvider,
 | |
| 		nodeRef:                      nodeRef,
 | |
| 		nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
 | |
| 		thresholdsFirstObservedAt:    thresholdsObservedAt{},
 | |
| 		dedicatedImageFs:             nil,
 | |
| 		thresholdNotifiers:           []ThresholdNotifier{},
 | |
| 		etcHostsPath:                 etcHostsPath,
 | |
| 	}
 | |
| 	return manager, manager
 | |
| }
 | |
| 
 | |
| // Admit rejects a pod if its not safe to admit for node stability.
 | |
| func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
 | |
| 	m.RLock()
 | |
| 	defer m.RUnlock()
 | |
| 	if len(m.nodeConditions) == 0 {
 | |
| 		return lifecycle.PodAdmitResult{Admit: true}
 | |
| 	}
 | |
| 	// Admit Critical pods even under resource pressure since they are required for system stability.
 | |
| 	// https://github.com/kubernetes/kubernetes/issues/40573 has more details.
 | |
| 	if kubelettypes.IsCriticalPod(attrs.Pod) {
 | |
| 		return lifecycle.PodAdmitResult{Admit: true}
 | |
| 	}
 | |
| 
 | |
| 	// Conditions other than memory pressure reject all pods
 | |
| 	nodeOnlyHasMemoryPressureCondition := hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure) && len(m.nodeConditions) == 1
 | |
| 	if nodeOnlyHasMemoryPressureCondition {
 | |
| 		notBestEffort := v1.PodQOSBestEffort != v1qos.GetPodQOS(attrs.Pod)
 | |
| 		if notBestEffort {
 | |
| 			return lifecycle.PodAdmitResult{Admit: true}
 | |
| 		}
 | |
| 
 | |
| 		// When node has memory pressure, check BestEffort Pod's toleration:
 | |
| 		// admit it if tolerates memory pressure taint, fail for other tolerations, e.g. DiskPressure.
 | |
| 		if v1helper.TolerationsTolerateTaint(attrs.Pod.Spec.Tolerations, &v1.Taint{
 | |
| 			Key:    v1.TaintNodeMemoryPressure,
 | |
| 			Effect: v1.TaintEffectNoSchedule,
 | |
| 		}) {
 | |
| 			return lifecycle.PodAdmitResult{Admit: true}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// reject pods when under memory pressure (if pod is best effort), or if under disk pressure.
 | |
| 	klog.Warningf("Failed to admit pod %s - node has conditions: %v", format.Pod(attrs.Pod), m.nodeConditions)
 | |
| 	return lifecycle.PodAdmitResult{
 | |
| 		Admit:   false,
 | |
| 		Reason:  Reason,
 | |
| 		Message: fmt.Sprintf(nodeConditionMessageFmt, m.nodeConditions),
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // Start starts the control loop to observe and response to low compute resources.
 | |
| func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) {
 | |
| 	thresholdHandler := func(message string) {
 | |
| 		klog.Infof(message)
 | |
| 		m.synchronize(diskInfoProvider, podFunc)
 | |
| 	}
 | |
| 	if m.config.KernelMemcgNotification {
 | |
| 		for _, threshold := range m.config.Thresholds {
 | |
| 			if threshold.Signal == evictionapi.SignalMemoryAvailable || threshold.Signal == evictionapi.SignalAllocatableMemoryAvailable {
 | |
| 				notifier, err := NewMemoryThresholdNotifier(threshold, m.config.PodCgroupRoot, &CgroupNotifierFactory{}, thresholdHandler)
 | |
| 				if err != nil {
 | |
| 					klog.Warningf("eviction manager: failed to create memory threshold notifier: %v", err)
 | |
| 				} else {
 | |
| 					go notifier.Start()
 | |
| 					m.thresholdNotifiers = append(m.thresholdNotifiers, notifier)
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	// start the eviction manager monitoring
 | |
| 	go func() {
 | |
| 		for {
 | |
| 			if evictedPods := m.synchronize(diskInfoProvider, podFunc); evictedPods != nil {
 | |
| 				klog.Infof("eviction manager: pods %s evicted, waiting for pod to be cleaned up", format.Pods(evictedPods))
 | |
| 				m.waitForPodsCleanup(podCleanedUpFunc, evictedPods)
 | |
| 			} else {
 | |
| 				time.Sleep(monitoringInterval)
 | |
| 			}
 | |
| 		}
 | |
| 	}()
 | |
| }
 | |
| 
 | |
| // IsUnderMemoryPressure returns true if the node is under memory pressure.
 | |
| func (m *managerImpl) IsUnderMemoryPressure() bool {
 | |
| 	m.RLock()
 | |
| 	defer m.RUnlock()
 | |
| 	return hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure)
 | |
| }
 | |
| 
 | |
| // IsUnderDiskPressure returns true if the node is under disk pressure.
 | |
| func (m *managerImpl) IsUnderDiskPressure() bool {
 | |
| 	m.RLock()
 | |
| 	defer m.RUnlock()
 | |
| 	return hasNodeCondition(m.nodeConditions, v1.NodeDiskPressure)
 | |
| }
 | |
| 
 | |
| // IsUnderPIDPressure returns true if the node is under PID pressure.
 | |
| func (m *managerImpl) IsUnderPIDPressure() bool {
 | |
| 	m.RLock()
 | |
| 	defer m.RUnlock()
 | |
| 	return hasNodeCondition(m.nodeConditions, v1.NodePIDPressure)
 | |
| }
 | |
| 
 | |
| // synchronize is the main control loop that enforces eviction thresholds.
 | |
| // Returns the pod that was killed, or nil if no pod was killed.
 | |
| func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) []*v1.Pod {
 | |
| 	// if we have nothing to do, just return
 | |
| 	thresholds := m.config.Thresholds
 | |
| 	if len(thresholds) == 0 && !utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) {
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	klog.V(3).Infof("eviction manager: synchronize housekeeping")
 | |
| 	// build the ranking functions (if not yet known)
 | |
| 	// TODO: have a function in cadvisor that lets us know if global housekeeping has completed
 | |
| 	if m.dedicatedImageFs == nil {
 | |
| 		hasImageFs, ok := diskInfoProvider.HasDedicatedImageFs()
 | |
| 		if ok != nil {
 | |
| 			return nil
 | |
| 		}
 | |
| 		m.dedicatedImageFs = &hasImageFs
 | |
| 		m.signalToRankFunc = buildSignalToRankFunc(hasImageFs)
 | |
| 		m.signalToNodeReclaimFuncs = buildSignalToNodeReclaimFuncs(m.imageGC, m.containerGC, hasImageFs)
 | |
| 	}
 | |
| 
 | |
| 	activePods := podFunc()
 | |
| 	updateStats := true
 | |
| 	summary, err := m.summaryProvider.Get(updateStats)
 | |
| 	if err != nil {
 | |
| 		klog.Errorf("eviction manager: failed to get summary stats: %v", err)
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	if m.clock.Since(m.thresholdsLastUpdated) > notifierRefreshInterval {
 | |
| 		m.thresholdsLastUpdated = m.clock.Now()
 | |
| 		for _, notifier := range m.thresholdNotifiers {
 | |
| 			if err := notifier.UpdateThreshold(summary); err != nil {
 | |
| 				klog.Warningf("eviction manager: failed to update %s: %v", notifier.Description(), err)
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// make observations and get a function to derive pod usage stats relative to those observations.
 | |
| 	observations, statsFunc := makeSignalObservations(summary)
 | |
| 	debugLogObservations("observations", observations)
 | |
| 
 | |
| 	// determine the set of thresholds met independent of grace period
 | |
| 	thresholds = thresholdsMet(thresholds, observations, false)
 | |
| 	debugLogThresholdsWithObservation("thresholds - ignoring grace period", thresholds, observations)
 | |
| 
 | |
| 	// determine the set of thresholds previously met that have not yet satisfied the associated min-reclaim
 | |
| 	if len(m.thresholdsMet) > 0 {
 | |
| 		thresholdsNotYetResolved := thresholdsMet(m.thresholdsMet, observations, true)
 | |
| 		thresholds = mergeThresholds(thresholds, thresholdsNotYetResolved)
 | |
| 	}
 | |
| 	debugLogThresholdsWithObservation("thresholds - reclaim not satisfied", thresholds, observations)
 | |
| 
 | |
| 	// track when a threshold was first observed
 | |
| 	now := m.clock.Now()
 | |
| 	thresholdsFirstObservedAt := thresholdsFirstObservedAt(thresholds, m.thresholdsFirstObservedAt, now)
 | |
| 
 | |
| 	// the set of node conditions that are triggered by currently observed thresholds
 | |
| 	nodeConditions := nodeConditions(thresholds)
 | |
| 	if len(nodeConditions) > 0 {
 | |
| 		klog.V(3).Infof("eviction manager: node conditions - observed: %v", nodeConditions)
 | |
| 	}
 | |
| 
 | |
| 	// track when a node condition was last observed
 | |
| 	nodeConditionsLastObservedAt := nodeConditionsLastObservedAt(nodeConditions, m.nodeConditionsLastObservedAt, now)
 | |
| 
 | |
| 	// node conditions report true if it has been observed within the transition period window
 | |
| 	nodeConditions = nodeConditionsObservedSince(nodeConditionsLastObservedAt, m.config.PressureTransitionPeriod, now)
 | |
| 	if len(nodeConditions) > 0 {
 | |
| 		klog.V(3).Infof("eviction manager: node conditions - transition period not met: %v", nodeConditions)
 | |
| 	}
 | |
| 
 | |
| 	// determine the set of thresholds we need to drive eviction behavior (i.e. all grace periods are met)
 | |
| 	thresholds = thresholdsMetGracePeriod(thresholdsFirstObservedAt, now)
 | |
| 	debugLogThresholdsWithObservation("thresholds - grace periods satisfied", thresholds, observations)
 | |
| 
 | |
| 	// update internal state
 | |
| 	m.Lock()
 | |
| 	m.nodeConditions = nodeConditions
 | |
| 	m.thresholdsFirstObservedAt = thresholdsFirstObservedAt
 | |
| 	m.nodeConditionsLastObservedAt = nodeConditionsLastObservedAt
 | |
| 	m.thresholdsMet = thresholds
 | |
| 
 | |
| 	// determine the set of thresholds whose stats have been updated since the last sync
 | |
| 	thresholds = thresholdsUpdatedStats(thresholds, observations, m.lastObservations)
 | |
| 	debugLogThresholdsWithObservation("thresholds - updated stats", thresholds, observations)
 | |
| 
 | |
| 	m.lastObservations = observations
 | |
| 	m.Unlock()
 | |
| 
 | |
| 	// evict pods if there is a resource usage violation from local volume temporary storage
 | |
| 	// If eviction happens in localStorageEviction function, skip the rest of eviction action
 | |
| 	if utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) {
 | |
| 		if evictedPods := m.localStorageEviction(activePods, statsFunc); len(evictedPods) > 0 {
 | |
| 			return evictedPods
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	if len(thresholds) == 0 {
 | |
| 		klog.V(3).Infof("eviction manager: no resources are starved")
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	// rank the thresholds by eviction priority
 | |
| 	sort.Sort(byEvictionPriority(thresholds))
 | |
| 	thresholdToReclaim, resourceToReclaim, foundAny := getReclaimableThreshold(thresholds)
 | |
| 	if !foundAny {
 | |
| 		return nil
 | |
| 	}
 | |
| 	klog.Warningf("eviction manager: attempting to reclaim %v", resourceToReclaim)
 | |
| 
 | |
| 	// record an event about the resources we are now attempting to reclaim via eviction
 | |
| 	m.recorder.Eventf(m.nodeRef, v1.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim)
 | |
| 
 | |
| 	// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
 | |
| 	if m.reclaimNodeLevelResources(thresholdToReclaim.Signal, resourceToReclaim) {
 | |
| 		klog.Infof("eviction manager: able to reduce %v pressure without evicting pods.", resourceToReclaim)
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	klog.Infof("eviction manager: must evict pod(s) to reclaim %v", resourceToReclaim)
 | |
| 
 | |
| 	// rank the pods for eviction
 | |
| 	rank, ok := m.signalToRankFunc[thresholdToReclaim.Signal]
 | |
| 	if !ok {
 | |
| 		klog.Errorf("eviction manager: no ranking function for signal %s", thresholdToReclaim.Signal)
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	// the only candidates viable for eviction are those pods that had anything running.
 | |
| 	if len(activePods) == 0 {
 | |
| 		klog.Errorf("eviction manager: eviction thresholds have been met, but no pods are active to evict")
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	// rank the running pods for eviction for the specified resource
 | |
| 	rank(activePods, statsFunc)
 | |
| 
 | |
| 	klog.Infof("eviction manager: pods ranked for eviction: %s", format.Pods(activePods))
 | |
| 
 | |
| 	//record age of metrics for met thresholds that we are using for evictions.
 | |
| 	for _, t := range thresholds {
 | |
| 		timeObserved := observations[t.Signal].time
 | |
| 		if !timeObserved.IsZero() {
 | |
| 			metrics.EvictionStatsAge.WithLabelValues(string(t.Signal)).Observe(metrics.SinceInSeconds(timeObserved.Time))
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// we kill at most a single pod during each eviction interval
 | |
| 	for i := range activePods {
 | |
| 		pod := activePods[i]
 | |
| 		gracePeriodOverride := int64(0)
 | |
| 		if !isHardEvictionThreshold(thresholdToReclaim) {
 | |
| 			gracePeriodOverride = m.config.MaxPodGracePeriodSeconds
 | |
| 		}
 | |
| 		message, annotations := evictionMessage(resourceToReclaim, pod, statsFunc)
 | |
| 		if m.evictPod(pod, gracePeriodOverride, message, annotations) {
 | |
| 			metrics.Evictions.WithLabelValues(string(thresholdToReclaim.Signal)).Inc()
 | |
| 			return []*v1.Pod{pod}
 | |
| 		}
 | |
| 	}
 | |
| 	klog.Infof("eviction manager: unable to evict any pods from the node")
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func (m *managerImpl) waitForPodsCleanup(podCleanedUpFunc PodCleanedUpFunc, pods []*v1.Pod) {
 | |
| 	timeout := m.clock.NewTimer(podCleanupTimeout)
 | |
| 	defer timeout.Stop()
 | |
| 	ticker := m.clock.NewTicker(podCleanupPollFreq)
 | |
| 	defer ticker.Stop()
 | |
| 	for {
 | |
| 		select {
 | |
| 		case <-timeout.C():
 | |
| 			klog.Warningf("eviction manager: timed out waiting for pods %s to be cleaned up", format.Pods(pods))
 | |
| 			return
 | |
| 		case <-ticker.C():
 | |
| 			for i, pod := range pods {
 | |
| 				if !podCleanedUpFunc(pod) {
 | |
| 					break
 | |
| 				}
 | |
| 				if i == len(pods)-1 {
 | |
| 					klog.Infof("eviction manager: pods %s successfully cleaned up", format.Pods(pods))
 | |
| 					return
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // reclaimNodeLevelResources attempts to reclaim node level resources.  returns true if thresholds were satisfied and no pod eviction is required.
 | |
| func (m *managerImpl) reclaimNodeLevelResources(signalToReclaim evictionapi.Signal, resourceToReclaim v1.ResourceName) bool {
 | |
| 	nodeReclaimFuncs := m.signalToNodeReclaimFuncs[signalToReclaim]
 | |
| 	for _, nodeReclaimFunc := range nodeReclaimFuncs {
 | |
| 		// attempt to reclaim the pressured resource.
 | |
| 		if err := nodeReclaimFunc(); err != nil {
 | |
| 			klog.Warningf("eviction manager: unexpected error when attempting to reduce %v pressure: %v", resourceToReclaim, err)
 | |
| 		}
 | |
| 
 | |
| 	}
 | |
| 	if len(nodeReclaimFuncs) > 0 {
 | |
| 		summary, err := m.summaryProvider.Get(true)
 | |
| 		if err != nil {
 | |
| 			klog.Errorf("eviction manager: failed to get summary stats after resource reclaim: %v", err)
 | |
| 			return false
 | |
| 		}
 | |
| 
 | |
| 		// make observations and get a function to derive pod usage stats relative to those observations.
 | |
| 		observations, _ := makeSignalObservations(summary)
 | |
| 		debugLogObservations("observations after resource reclaim", observations)
 | |
| 
 | |
| 		// determine the set of thresholds met independent of grace period
 | |
| 		thresholds := thresholdsMet(m.config.Thresholds, observations, false)
 | |
| 		debugLogThresholdsWithObservation("thresholds after resource reclaim - ignoring grace period", thresholds, observations)
 | |
| 
 | |
| 		if len(thresholds) == 0 {
 | |
| 			return true
 | |
| 		}
 | |
| 	}
 | |
| 	return false
 | |
| }
 | |
| 
 | |
| // localStorageEviction checks the EmptyDir volume usage for each pod and determine whether it exceeds the specified limit and needs
 | |
| // to be evicted. It also checks every container in the pod, if the container overlay usage exceeds the limit, the pod will be evicted too.
 | |
| func (m *managerImpl) localStorageEviction(pods []*v1.Pod, statsFunc statsFunc) []*v1.Pod {
 | |
| 	evicted := []*v1.Pod{}
 | |
| 	for _, pod := range pods {
 | |
| 		podStats, ok := statsFunc(pod)
 | |
| 		if !ok {
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		if m.emptyDirLimitEviction(podStats, pod) {
 | |
| 			evicted = append(evicted, pod)
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		if m.podEphemeralStorageLimitEviction(podStats, pod) {
 | |
| 			evicted = append(evicted, pod)
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		if m.containerEphemeralStorageLimitEviction(podStats, pod) {
 | |
| 			evicted = append(evicted, pod)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return evicted
 | |
| }
 | |
| 
 | |
| func (m *managerImpl) emptyDirLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
 | |
| 	podVolumeUsed := make(map[string]*resource.Quantity)
 | |
| 	for _, volume := range podStats.VolumeStats {
 | |
| 		podVolumeUsed[volume.Name] = resource.NewQuantity(int64(*volume.UsedBytes), resource.BinarySI)
 | |
| 	}
 | |
| 	for i := range pod.Spec.Volumes {
 | |
| 		source := &pod.Spec.Volumes[i].VolumeSource
 | |
| 		if source.EmptyDir != nil {
 | |
| 			size := source.EmptyDir.SizeLimit
 | |
| 			used := podVolumeUsed[pod.Spec.Volumes[i].Name]
 | |
| 			if used != nil && size != nil && size.Sign() == 1 && used.Cmp(*size) > 0 {
 | |
| 				// the emptyDir usage exceeds the size limit, evict the pod
 | |
| 				if m.evictPod(pod, 0, fmt.Sprintf(emptyDirMessageFmt, pod.Spec.Volumes[i].Name, size.String()), nil) {
 | |
| 					metrics.Evictions.WithLabelValues(signalEmptyDirFsLimit).Inc()
 | |
| 					return true
 | |
| 				}
 | |
| 				return false
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return false
 | |
| }
 | |
| 
 | |
| func (m *managerImpl) podEphemeralStorageLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
 | |
| 	_, podLimits := apiv1resource.PodRequestsAndLimits(pod)
 | |
| 	_, found := podLimits[v1.ResourceEphemeralStorage]
 | |
| 	if !found {
 | |
| 		return false
 | |
| 	}
 | |
| 
 | |
| 	podEphemeralStorageTotalUsage := &resource.Quantity{}
 | |
| 	var fsStatsSet []fsStatsType
 | |
| 	if *m.dedicatedImageFs {
 | |
| 		fsStatsSet = []fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}
 | |
| 	} else {
 | |
| 		fsStatsSet = []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}
 | |
| 	}
 | |
| 	podEphemeralUsage, err := podLocalEphemeralStorageUsage(podStats, pod, fsStatsSet, m.etcHostsPath(pod.UID))
 | |
| 	if err != nil {
 | |
| 		klog.Errorf("eviction manager: error getting pod disk usage %v", err)
 | |
| 		return false
 | |
| 	}
 | |
| 
 | |
| 	podEphemeralStorageTotalUsage.Add(podEphemeralUsage[v1.ResourceEphemeralStorage])
 | |
| 	podEphemeralStorageLimit := podLimits[v1.ResourceEphemeralStorage]
 | |
| 	if podEphemeralStorageTotalUsage.Cmp(podEphemeralStorageLimit) > 0 {
 | |
| 		// the total usage of pod exceeds the total size limit of containers, evict the pod
 | |
| 		if m.evictPod(pod, 0, fmt.Sprintf(podEphemeralStorageMessageFmt, podEphemeralStorageLimit.String()), nil) {
 | |
| 			metrics.Evictions.WithLabelValues(signalEphemeralPodFsLimit).Inc()
 | |
| 			return true
 | |
| 		}
 | |
| 		return false
 | |
| 	}
 | |
| 	return false
 | |
| }
 | |
| 
 | |
| func (m *managerImpl) containerEphemeralStorageLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
 | |
| 	thresholdsMap := make(map[string]*resource.Quantity)
 | |
| 	for _, container := range pod.Spec.Containers {
 | |
| 		ephemeralLimit := container.Resources.Limits.StorageEphemeral()
 | |
| 		if ephemeralLimit != nil && ephemeralLimit.Value() != 0 {
 | |
| 			thresholdsMap[container.Name] = ephemeralLimit
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	for _, containerStat := range podStats.Containers {
 | |
| 		containerUsed := diskUsage(containerStat.Logs)
 | |
| 		if !*m.dedicatedImageFs {
 | |
| 			containerUsed.Add(*diskUsage(containerStat.Rootfs))
 | |
| 		}
 | |
| 
 | |
| 		if ephemeralStorageThreshold, ok := thresholdsMap[containerStat.Name]; ok {
 | |
| 			if ephemeralStorageThreshold.Cmp(*containerUsed) < 0 {
 | |
| 				if m.evictPod(pod, 0, fmt.Sprintf(containerEphemeralStorageMessageFmt, containerStat.Name, ephemeralStorageThreshold.String()), nil) {
 | |
| 					metrics.Evictions.WithLabelValues(signalEphemeralContainerFsLimit).Inc()
 | |
| 					return true
 | |
| 				}
 | |
| 				return false
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	return false
 | |
| }
 | |
| 
 | |
| func (m *managerImpl) evictPod(pod *v1.Pod, gracePeriodOverride int64, evictMsg string, annotations map[string]string) bool {
 | |
| 	// If the pod is marked as critical and static, and support for critical pod annotations is enabled,
 | |
| 	// do not evict such pods. Static pods are not re-admitted after evictions.
 | |
| 	// https://github.com/kubernetes/kubernetes/issues/40573 has more details.
 | |
| 	if kubelettypes.IsCriticalPod(pod) {
 | |
| 		klog.Errorf("eviction manager: cannot evict a critical pod %s", format.Pod(pod))
 | |
| 		return false
 | |
| 	}
 | |
| 	status := v1.PodStatus{
 | |
| 		Phase:   v1.PodFailed,
 | |
| 		Message: evictMsg,
 | |
| 		Reason:  Reason,
 | |
| 	}
 | |
| 	// record that we are evicting the pod
 | |
| 	m.recorder.AnnotatedEventf(pod, annotations, v1.EventTypeWarning, Reason, evictMsg)
 | |
| 	// this is a blocking call and should only return when the pod and its containers are killed.
 | |
| 	err := m.killPodFunc(pod, status, &gracePeriodOverride)
 | |
| 	if err != nil {
 | |
| 		klog.Errorf("eviction manager: pod %s failed to evict %v", format.Pod(pod), err)
 | |
| 	} else {
 | |
| 		klog.Infof("eviction manager: pod %s is evicted successfully", format.Pod(pod))
 | |
| 	}
 | |
| 	return true
 | |
| }
 |