Add local storage (scratch space) allocatable support

This PR adds support for allocatable local storage (scratch space).
The feature covers only the root file system, which is shared by Kubernetes
components, users' containers, and/or images. Users can use the
--kube-reserved flag to reserve storage for kube system components.
If the storage allocatable to users' pods is exhausted, some pods are
evicted to free storage.
Jing Xu
2017-05-25 12:29:19 -07:00
parent 68dd748ba1
commit dd67e96c01
11 changed files with 461 additions and 17 deletions

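To make the arithmetic in the commit message concrete, here is a minimal sketch (not part of this change; the quantities and the --kube-reserved storage key are assumed for illustration) of how a storage reservation and a hard eviction threshold reduce what is allocatable to pods:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// Hypothetical numbers: rootfs capacity reported by cadvisor, storage
	// reserved for kube system components, and a hard eviction threshold.
	capacity := resource.MustParse("100Gi")
	kubeReserved := resource.MustParse("10Gi") // e.g. via --kube-reserved=storage=10Gi (key name assumed)
	evictionHard := resource.MustParse("1Gi")

	// Storage allocatable to users' pods = capacity - reserved - eviction threshold.
	allocatable := capacity.Copy()
	allocatable.Sub(kubeReserved)
	allocatable.Sub(evictionHard)
	fmt.Printf("storage allocatable for pods: %s\n", allocatable.String()) // 89Gi
}
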
@@ -18,6 +18,7 @@ package cadvisor
import (
cadvisorapi "github.com/google/cadvisor/info/v1"
cadvisorapi2 "github.com/google/cadvisor/info/v2"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/kubernetes/pkg/api/v1"
)
@@ -33,3 +34,12 @@ func CapacityFromMachineInfo(info *cadvisorapi.MachineInfo) v1.ResourceList {
}
return c
}
func StorageScratchCapacityFromFsInfo(info cadvisorapi2.FsInfo) v1.ResourceList {
c := v1.ResourceList{
v1.ResourceStorage: *resource.NewQuantity(
int64(info.Capacity),
resource.BinarySI),
}
return c
}

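A rough usage sketch for the new helper (hypothetical values; it assumes the helper lives in the kubelet's cadvisor util package alongside CapacityFromMachineInfo, as the diff suggests):

package main

import (
	"fmt"

	cadvisorapi2 "github.com/google/cadvisor/info/v2"
	"k8s.io/kubernetes/pkg/kubelet/cadvisor"
)

func main() {
	// Hypothetical rootfs info, shaped like what cadvisor's RootFsInfo() returns.
	rootfs := cadvisorapi2.FsInfo{Capacity: 100 * 1024 * 1024 * 1024} // 100 GiB

	// The helper reports the rootfs capacity as the node's scratch storage.
	for name, qty := range cadvisor.StorageScratchCapacityFromFsInfo(rootfs) {
		fmt.Printf("%v: %s\n", name, qty.String())
	}
}
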
@@ -29,6 +29,7 @@ import (
"k8s.io/apimachinery/pkg/types"
clientv1 "k8s.io/client-go/pkg/api/v1"
"k8s.io/kubernetes/pkg/api/v1"
"k8s.io/kubernetes/pkg/kubelet/cadvisor"
"k8s.io/kubernetes/pkg/kubelet/events"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
)
@@ -180,9 +181,18 @@ func (cm *containerManagerImpl) getNodeAllocatableAbsolute() v1.ResourceList {
}
// GetNodeAllocatable returns amount of compute resource that have to be reserved on this node from scheduling.
// GetNodeAllocatable returns amount of compute or storage resource that have to be reserved on this node from scheduling.
func (cm *containerManagerImpl) GetNodeAllocatableReservation() v1.ResourceList {
evictionReservation := hardEvictionReservation(cm.HardEvictionThresholds, cm.capacity)
if _, ok := cm.capacity[v1.ResourceStorage]; !ok {
if rootfs, err := cm.cadvisorInterface.RootFsInfo(); err == nil {
for rName, rCap := range cadvisor.StorageScratchCapacityFromFsInfo(rootfs) {
cm.capacity[rName] = rCap
}
} else {
glog.Warningf("Error getting rootfs info: %v", err)
}
}
result := make(v1.ResourceList)
for k := range cm.capacity {
value := resource.NewQuantity(0, resource.DecimalSI)

@@ -38,6 +38,8 @@ const (
SignalImageFsInodesFree Signal = "imagefs.inodesFree"
// SignalAllocatableMemoryAvailable is amount of memory available for pod allocation (i.e. allocatable - workingSet (of pods)), in bytes.
SignalAllocatableMemoryAvailable Signal = "allocatableMemory.available"
// SignalAllocatableNodeFsAvailable is amount of local storage available for pod allocation
SignalAllocatableNodeFsAvailable Signal = "allocatableNodeFs.available"
)
// ThresholdOperator is the operator used to express a Threshold.

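For illustration, a sketch of how the new signal could be expressed as a hard eviction threshold, mirroring the Threshold shape used later in this PR (the 1Gi value is made up):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
)

func main() {
	// Evict when less than 1Gi of the node's allocatable local storage remains.
	threshold := evictionapi.Threshold{
		Signal:   evictionapi.SignalAllocatableNodeFsAvailable,
		Operator: evictionapi.OpLessThan,
		Value: evictionapi.ThresholdValue{
			Quantity: resource.NewQuantity(1<<30, resource.BinarySI), // 1Gi, illustrative
		},
	}
	fmt.Printf("threshold: %+v\n", threshold)
}
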
@@ -82,6 +82,8 @@ type managerImpl struct {
lastObservations signalObservations
// notifiersInitialized indicates if the threshold notifiers have been initialized (i.e. synchronize() has been called once)
notifiersInitialized bool
// dedicatedImageFs indicates if imagefs is on a separate device from the rootfs
dedicatedImageFs *bool
}
// ensure it implements the required interface
@@ -106,6 +108,7 @@ func NewManager(
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
dedicatedImageFs: nil,
}
return manager, manager
}
@@ -211,21 +214,22 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
}
glog.V(3).Infof("eviction manager: synchronize housekeeping")
// build the ranking functions (if not yet known)
// TODO: have a function in cadvisor that lets us know if global housekeeping has completed
if len(m.resourceToRankFunc) == 0 || len(m.resourceToNodeReclaimFuncs) == 0 {
// this may error if cadvisor has yet to complete housekeeping, so we will just try again in next pass.
hasDedicatedImageFs, err := diskInfoProvider.HasDedicatedImageFs()
if err != nil {
if m.dedicatedImageFs == nil {
hasImageFs, ok := diskInfoProvider.HasDedicatedImageFs()
if ok != nil {
return nil
}
m.resourceToRankFunc = buildResourceToRankFunc(hasDedicatedImageFs)
m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, hasDedicatedImageFs)
m.dedicatedImageFs = &hasImageFs
m.resourceToRankFunc = buildResourceToRankFunc(hasImageFs)
m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, hasImageFs)
}
activePods := podFunc()
// make observations and get a function to derive pod usage stats relative to those observations.
observations, statsFunc, err := makeSignalObservations(m.summaryProvider, nodeProvider)
observations, statsFunc, err := makeSignalObservations(m.summaryProvider, nodeProvider, activePods, *m.dedicatedImageFs)
if err != nil {
glog.Errorf("eviction manager: unexpected err: %v", err)
return nil
@@ -336,7 +340,11 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
}
// the only candidates viable for eviction are those pods that had anything running.
activePods := podFunc()
if len(activePods) == 0 {
glog.Errorf("eviction manager: eviction thresholds have been met, but no pods are active to evict")
return nil
}
// rank the running pods for eviction for the specified resource
rank(activePods, statsFunc)

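A stripped-down sketch of the caching pattern introduced here: a *bool distinguishes "not yet determined" (nil) from a cached answer, so the cadvisor query is retried only until it succeeds once (names below are illustrative, not from the PR):

package main

import (
	"errors"
	"fmt"
)

// hasDedicatedImageFs stands in for diskInfoProvider.HasDedicatedImageFs(),
// which can fail until cadvisor has finished its first housekeeping pass.
func hasDedicatedImageFs() (bool, error) {
	return false, errors.New("cadvisor has not completed housekeeping yet")
}

type manager struct {
	// nil means "not yet determined"; once set, the answer is reused on every sync.
	dedicatedImageFs *bool
}

func (m *manager) synchronizeOnce() {
	if m.dedicatedImageFs == nil {
		hasImageFs, err := hasDedicatedImageFs()
		if err != nil {
			fmt.Println("imagefs unknown, will retry next pass:", err)
			return
		}
		m.dedicatedImageFs = &hasImageFs
	}
	fmt.Println("dedicated imagefs:", *m.dedicatedImageFs)
}

func main() {
	m := &manager{}
	m.synchronizeOnce() // first pass: cadvisor not ready, nothing cached
	m.synchronizeOnce() // cache still nil, so the query is retried
}
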
@@ -54,6 +54,8 @@ const (
resourceNodeFs v1.ResourceName = "nodefs"
// nodefs inodes, number. internal to this module, used to account for local node root filesystem inodes.
resourceNodeFsInodes v1.ResourceName = "nodefsInodes"
// container overlay storage, in bytes. internal to this module, used to account for local disk usage for container overlay.
resourceOverlay v1.ResourceName = "overlay"
)
var (
@@ -74,19 +76,25 @@ func init() {
signalToNodeCondition[evictionapi.SignalNodeFsAvailable] = v1.NodeDiskPressure
signalToNodeCondition[evictionapi.SignalImageFsInodesFree] = v1.NodeDiskPressure
signalToNodeCondition[evictionapi.SignalNodeFsInodesFree] = v1.NodeDiskPressure
signalToNodeCondition[evictionapi.SignalAllocatableNodeFsAvailable] = v1.NodeDiskPressure
// map signals to resources (and vice-versa)
signalToResource = map[evictionapi.Signal]v1.ResourceName{}
signalToResource[evictionapi.SignalMemoryAvailable] = v1.ResourceMemory
signalToResource[evictionapi.SignalAllocatableMemoryAvailable] = v1.ResourceMemory
signalToResource[evictionapi.SignalAllocatableNodeFsAvailable] = resourceNodeFs
signalToResource[evictionapi.SignalImageFsAvailable] = resourceImageFs
signalToResource[evictionapi.SignalImageFsInodesFree] = resourceImageFsInodes
signalToResource[evictionapi.SignalNodeFsAvailable] = resourceNodeFs
signalToResource[evictionapi.SignalNodeFsInodesFree] = resourceNodeFsInodes
resourceToSignal = map[v1.ResourceName]evictionapi.Signal{}
for key, value := range signalToResource {
resourceToSignal[value] = key
}
// Hard-code here to make sure resourceNodeFs maps to evictionapi.SignalNodeFsAvailable
// TODO: resourceToSignal should be a map from resource name to a list of signals
resourceToSignal[resourceNodeFs] = evictionapi.SignalNodeFsAvailable
}
// validSignal returns true if the signal is supported.
@@ -234,6 +242,16 @@ func getAllocatableThreshold(allocatableConfig []string) []evictionapi.Threshold
Quantity: resource.NewQuantity(int64(0), resource.BinarySI),
},
},
{
Signal: evictionapi.SignalAllocatableNodeFsAvailable,
Operator: evictionapi.OpLessThan,
Value: evictionapi.ThresholdValue{
Quantity: resource.NewQuantity(int64(0), resource.BinarySI),
},
MinReclaim: &evictionapi.ThresholdValue{
Quantity: resource.NewQuantity(int64(0), resource.BinarySI),
},
},
}
}
}
@@ -382,10 +400,12 @@ func localVolumeNames(pod *v1.Pod) []string {
func podDiskUsage(podStats statsapi.PodStats, pod *v1.Pod, statsToMeasure []fsStatsType) (v1.ResourceList, error) {
disk := resource.Quantity{Format: resource.BinarySI}
inodes := resource.Quantity{Format: resource.BinarySI}
overlay := resource.Quantity{Format: resource.BinarySI}
for _, container := range podStats.Containers {
if hasFsStatsType(statsToMeasure, fsStatsRoot) {
disk.Add(*diskUsage(container.Rootfs))
inodes.Add(*inodeUsage(container.Rootfs))
overlay.Add(*diskUsage(container.Rootfs))
}
if hasFsStatsType(statsToMeasure, fsStatsLogs) {
disk.Add(*diskUsage(container.Logs))
@@ -405,8 +425,9 @@ func podDiskUsage(podStats statsapi.PodStats, pod *v1.Pod, statsToMeasure []fsSt
}
}
return v1.ResourceList{
resourceDisk: disk,
resourceInodes: inodes,
resourceDisk: disk,
resourceInodes: inodes,
resourceOverlay: overlay,
}, nil
}
@@ -637,7 +658,7 @@ func (a byEvictionPriority) Less(i, j int) bool {
}
// makeSignalObservations derives observations using the specified summary provider.
func makeSignalObservations(summaryProvider stats.SummaryProvider, nodeProvider NodeProvider) (signalObservations, statsFunc, error) {
func makeSignalObservations(summaryProvider stats.SummaryProvider, nodeProvider NodeProvider, pods []*v1.Pod, withImageFs bool) (signalObservations, statsFunc, error) {
summary, err := summaryProvider.Get()
if err != nil {
return nil, nil, err
@@ -706,6 +727,37 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider, nodeProvider
capacity: memoryAllocatableCapacity.Copy(),
}
}
if storageScratchAllocatableCapacity, ok := node.Status.Allocatable[v1.ResourceStorage]; ok {
storageScratchAllocatable := storageScratchAllocatableCapacity.Copy()
for _, pod := range pods {
podStat, ok := statsFunc(pod)
if !ok {
continue
}
usage, err := podDiskUsage(podStat, pod, []fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource, fsStatsRoot})
if err != nil {
glog.Warningf("eviction manager: error getting pod disk usage %v", err)
continue
}
// If there is a separate imagefs set up for the container runtime, the scratch disk usage from nodefs should exclude the overlay usage
if withImageFs {
diskUsage := usage[resourceDisk]
diskUsageCopy := diskUsage.Copy()
diskUsageCopy.Sub(usage[resourceOverlay])
storageScratchAllocatable.Sub(*diskUsageCopy)
} else {
storageScratchAllocatable.Sub(usage[resourceDisk])
}
}
result[evictionapi.SignalAllocatableNodeFsAvailable] = signalObservation{
available: storageScratchAllocatable,
capacity: storageScratchAllocatableCapacity.Copy(),
}
}
return result, statsFunc, nil
}

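To make the accounting above concrete, a small sketch (the helper and the string values of the internal resource names are assumed, not taken from the PR) of how a pod's chargeable nodefs usage would be derived from the ResourceList that podDiskUsage returns when a dedicated imagefs exists:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/kubernetes/pkg/api/v1"
)

// Internal resource names used by the eviction helpers (string values assumed).
const (
	resourceDisk    v1.ResourceName = "disk"
	resourceOverlay v1.ResourceName = "overlay"
)

// chargeableNodeFsUsage is a hypothetical helper, not in the PR: when a
// dedicated imagefs exists, the container writable-layer (overlay) usage
// lives on the imagefs, so it is subtracted before charging the nodefs.
func chargeableNodeFsUsage(usage v1.ResourceList, withImageFs bool) resource.Quantity {
	disk := usage[resourceDisk]
	if !withImageFs {
		return disk
	}
	charged := disk.Copy()
	charged.Sub(usage[resourceOverlay])
	return *charged
}

func main() {
	usage := v1.ResourceList{
		resourceDisk:    resource.MustParse("5Gi"), // logs + local volumes + writable layers
		resourceOverlay: resource.MustParse("2Gi"), // writable layers only
	}
	charged := chargeableNodeFsUsage(usage, true)
	fmt.Printf("charged against nodefs: %s\n", charged.String()) // 3Gi
}
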
@@ -37,6 +37,7 @@ import (
clientgoclientset "k8s.io/client-go/kubernetes"
cadvisorapi "github.com/google/cadvisor/info/v1"
cadvisorapiv2 "github.com/google/cadvisor/info/v2"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
@@ -927,6 +928,9 @@ type Kubelet struct {
// Cached MachineInfo returned by cadvisor.
machineInfo *cadvisorapi.MachineInfo
// Cached RootFsInfo returned by cadvisor.
rootfsInfo *cadvisorapiv2.FsInfo
// Handles certificate rotations.
serverCertificateManager certificate.Manager

@@ -100,3 +100,15 @@ func (kl *Kubelet) GetCachedMachineInfo() (*cadvisorapi.MachineInfo, error) {
}
return kl.machineInfo, nil
}
// GetCachedRootFsInfo assumes that the rootfs info can't change without a reboot
func (kl *Kubelet) GetCachedRootFsInfo() (cadvisorapiv2.FsInfo, error) {
if kl.rootfsInfo == nil {
info, err := kl.cadvisor.RootFsInfo()
if err != nil {
return cadvisorapiv2.FsInfo{}, err
}
kl.rootfsInfo = &info
}
return *kl.rootfsInfo, nil
}

@@ -551,6 +551,26 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
node.Status.NodeInfo.BootID = info.BootID
}
rootfs, err := kl.GetCachedRootFsInfo()
if err != nil {
node.Status.Capacity[v1.ResourceStorage] = resource.MustParse("0Gi")
} else {
for rName, rCap := range cadvisor.StorageScratchCapacityFromFsInfo(rootfs) {
node.Status.Capacity[rName] = rCap
}
}
if hasDedicatedImageFs, _ := kl.HasDedicatedImageFs(); hasDedicatedImageFs {
imagesfs, err := kl.ImagesFsInfo()
if err != nil {
node.Status.Capacity[v1.ResourceStorageOverlay] = resource.MustParse("0Gi")
} else {
for rName, rCap := range cadvisor.StorageOverlayCapacityFromFsInfo(imagesfs) {
node.Status.Capacity[rName] = rCap
}
}
}
// Set Allocatable.
if node.Status.Allocatable == nil {
node.Status.Allocatable = make(v1.ResourceList)