mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-22 03:11:40 +00:00
Merge pull request #30311 from derekwaynecarr/inode_eviction
Automatic merge from submit-queue kubelet eviction on inode exhaustion Add support for kubelet to monitor for inode exhaustion of either image or rootfs, and in response, attempt to reclaim node level resources and/or evict pods.
This commit is contained in:
commit
ff58d04a34
@ -478,9 +478,19 @@ for eviction. Instead `DaemonSet` should ideally include Guaranteed pods only.
|
||||
|
||||
## Known issues
|
||||
|
||||
### kubelet may evict more pods than needed
|
||||
|
||||
The kubelet may evict more pods than needed because of a timing gap in stats collection. This can be mitigated by adding
|
||||
the ability to get root container stats on an on-demand basis (https://github.com/google/cadvisor/issues/1247) in the future.
|
||||
|
||||
### How kubelet ranks pods for eviction in response to inode exhaustion
|
||||
|
||||
At this time, it is not possible to know how many inodes were consumed by a particular container. If the `kubelet` observes
|
||||
inode exhaustion, it will evict pods by ranking them by quality of service. The following issue has been opened in cadvisor
|
||||
to track per container inode consumption (https://github.com/google/cadvisor/issues/1422) which would allow us to rank pods
|
||||
by inode consumption. For example, this would let us identify a container that created large numbers of 0 byte files, and evict
|
||||
that pod over others.
|
||||
|
||||
<!-- BEGIN MUNGE: GENERATED_ANALYTICS -->
|
||||
[]()
|
||||
<!-- END MUNGE: GENERATED_ANALYTICS -->
|
||||
|
@ -96,6 +96,7 @@ pkg/credentialprovider/aws
|
||||
pkg/hyperkube
|
||||
pkg/kubelet/api
|
||||
pkg/kubelet/container
|
||||
pkg/kubelet/eviction
|
||||
pkg/kubelet/envvars
|
||||
pkg/kubelet/util/format
|
||||
pkg/kubelet/util/ioutils
|
||||
|
@ -914,3 +914,228 @@ func TestNodeReclaimFuncs(t *testing.T) {
|
||||
t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiskPressureNodeFsInodes(t *testing.T) {
|
||||
// TODO: we need to know inodes used when cadvisor supports per container stats
|
||||
podMaker := func(name string, requests api.ResourceList, limits api.ResourceList) (*api.Pod, statsapi.PodStats) {
|
||||
pod := newPod(name, []api.Container{
|
||||
newContainer(name, requests, limits),
|
||||
}, nil)
|
||||
podStats := newPodInodeStats(pod)
|
||||
return pod, podStats
|
||||
}
|
||||
summaryStatsMaker := func(rootFsInodesFree, rootFsInodes string, podStats map[*api.Pod]statsapi.PodStats) *statsapi.Summary {
|
||||
rootFsInodesFreeVal := resource.MustParse(rootFsInodesFree)
|
||||
internalRootFsInodesFree := uint64(rootFsInodesFreeVal.Value())
|
||||
rootFsInodesVal := resource.MustParse(rootFsInodes)
|
||||
internalRootFsInodes := uint64(rootFsInodesVal.Value())
|
||||
result := &statsapi.Summary{
|
||||
Node: statsapi.NodeStats{
|
||||
Fs: &statsapi.FsStats{
|
||||
InodesFree: &internalRootFsInodesFree,
|
||||
Inodes: &internalRootFsInodes,
|
||||
},
|
||||
},
|
||||
Pods: []statsapi.PodStats{},
|
||||
}
|
||||
for _, podStat := range podStats {
|
||||
result.Pods = append(result.Pods, podStat)
|
||||
}
|
||||
return result
|
||||
}
|
||||
// TODO: pass inodes used in future when supported by cadvisor.
|
||||
podsToMake := []struct {
|
||||
name string
|
||||
requests api.ResourceList
|
||||
limits api.ResourceList
|
||||
}{
|
||||
{name: "best-effort-high", requests: newResourceList("", ""), limits: newResourceList("", "")},
|
||||
{name: "best-effort-low", requests: newResourceList("", ""), limits: newResourceList("", "")},
|
||||
{name: "burstable-high", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi")},
|
||||
{name: "burstable-low", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi")},
|
||||
{name: "guaranteed-high", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi")},
|
||||
{name: "guaranteed-low", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi")},
|
||||
}
|
||||
pods := []*api.Pod{}
|
||||
podStats := map[*api.Pod]statsapi.PodStats{}
|
||||
for _, podToMake := range podsToMake {
|
||||
pod, podStat := podMaker(podToMake.name, podToMake.requests, podToMake.limits)
|
||||
pods = append(pods, pod)
|
||||
podStats[pod] = podStat
|
||||
}
|
||||
activePodsFunc := func() []*api.Pod {
|
||||
return pods
|
||||
}
|
||||
|
||||
fakeClock := clock.NewFakeClock(time.Now())
|
||||
podKiller := &mockPodKiller{}
|
||||
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
|
||||
imageGC := &mockImageGC{freed: int64(0), err: nil}
|
||||
nodeRef := &api.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""}
|
||||
|
||||
config := Config{
|
||||
MaxPodGracePeriodSeconds: 5,
|
||||
PressureTransitionPeriod: time.Minute * 5,
|
||||
Thresholds: []Threshold{
|
||||
{
|
||||
Signal: SignalNodeFsInodesFree,
|
||||
Operator: OpLessThan,
|
||||
Value: ThresholdValue{
|
||||
Quantity: quantityMustParse("1Mi"),
|
||||
},
|
||||
},
|
||||
{
|
||||
Signal: SignalNodeFsInodesFree,
|
||||
Operator: OpLessThan,
|
||||
Value: ThresholdValue{
|
||||
Quantity: quantityMustParse("2Mi"),
|
||||
},
|
||||
GracePeriod: time.Minute * 2,
|
||||
},
|
||||
},
|
||||
}
|
||||
summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("3Mi", "4Mi", podStats)}
|
||||
manager := &managerImpl{
|
||||
clock: fakeClock,
|
||||
killPodFunc: podKiller.killPodNow,
|
||||
imageGC: imageGC,
|
||||
config: config,
|
||||
recorder: &record.FakeRecorder{},
|
||||
summaryProvider: summaryProvider,
|
||||
nodeRef: nodeRef,
|
||||
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
|
||||
thresholdsFirstObservedAt: thresholdsObservedAt{},
|
||||
}
|
||||
|
||||
// create a best effort pod to test admission
|
||||
podToAdmit, _ := podMaker("pod-to-admit", newResourceList("", ""), newResourceList("", ""))
|
||||
|
||||
// synchronize
|
||||
manager.synchronize(diskInfoProvider, activePodsFunc)
|
||||
|
||||
// we should not have disk pressure
|
||||
if manager.IsUnderDiskPressure() {
|
||||
t.Errorf("Manager should not report disk pressure")
|
||||
}
|
||||
|
||||
// try to admit our pod (should succeed)
|
||||
if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); !result.Admit {
|
||||
t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit)
|
||||
}
|
||||
|
||||
// induce soft threshold
|
||||
fakeClock.Step(1 * time.Minute)
|
||||
summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats)
|
||||
manager.synchronize(diskInfoProvider, activePodsFunc)
|
||||
|
||||
// we should have disk pressure
|
||||
if !manager.IsUnderDiskPressure() {
|
||||
t.Errorf("Manager should report disk pressure since soft threshold was met")
|
||||
}
|
||||
|
||||
// verify no pod was yet killed because there has not yet been enough time passed.
|
||||
if podKiller.pod != nil {
|
||||
t.Errorf("Manager should not have killed a pod yet, but killed: %v", podKiller.pod)
|
||||
}
|
||||
|
||||
// step forward in time pass the grace period
|
||||
fakeClock.Step(3 * time.Minute)
|
||||
summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats)
|
||||
manager.synchronize(diskInfoProvider, activePodsFunc)
|
||||
|
||||
// we should have disk pressure
|
||||
if !manager.IsUnderDiskPressure() {
|
||||
t.Errorf("Manager should report disk pressure since soft threshold was met")
|
||||
}
|
||||
|
||||
// verify the right pod was killed with the right grace period.
|
||||
if podKiller.pod != pods[0] {
|
||||
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, pods[0])
|
||||
}
|
||||
if podKiller.gracePeriodOverride == nil {
|
||||
t.Errorf("Manager chose to kill pod but should have had a grace period override.")
|
||||
}
|
||||
observedGracePeriod := *podKiller.gracePeriodOverride
|
||||
if observedGracePeriod != manager.config.MaxPodGracePeriodSeconds {
|
||||
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", manager.config.MaxPodGracePeriodSeconds, observedGracePeriod)
|
||||
}
|
||||
// reset state
|
||||
podKiller.pod = nil
|
||||
podKiller.gracePeriodOverride = nil
|
||||
|
||||
// remove disk pressure
|
||||
fakeClock.Step(20 * time.Minute)
|
||||
summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats)
|
||||
manager.synchronize(diskInfoProvider, activePodsFunc)
|
||||
|
||||
// we should not have disk pressure
|
||||
if manager.IsUnderDiskPressure() {
|
||||
t.Errorf("Manager should not report disk pressure")
|
||||
}
|
||||
|
||||
// induce disk pressure!
|
||||
fakeClock.Step(1 * time.Minute)
|
||||
summaryProvider.result = summaryStatsMaker("0.5Mi", "4Mi", podStats)
|
||||
manager.synchronize(diskInfoProvider, activePodsFunc)
|
||||
|
||||
// we should have disk pressure
|
||||
if !manager.IsUnderDiskPressure() {
|
||||
t.Errorf("Manager should report disk pressure")
|
||||
}
|
||||
|
||||
// check the right pod was killed
|
||||
if podKiller.pod != pods[0] {
|
||||
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, pods[0])
|
||||
}
|
||||
observedGracePeriod = *podKiller.gracePeriodOverride
|
||||
if observedGracePeriod != int64(0) {
|
||||
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
|
||||
}
|
||||
|
||||
// try to admit our pod (should fail)
|
||||
if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); result.Admit {
|
||||
t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit)
|
||||
}
|
||||
|
||||
// reduce disk pressure
|
||||
fakeClock.Step(1 * time.Minute)
|
||||
summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats)
|
||||
podKiller.pod = nil // reset state
|
||||
manager.synchronize(diskInfoProvider, activePodsFunc)
|
||||
|
||||
// we should have disk pressure (because transition period not yet met)
|
||||
if !manager.IsUnderDiskPressure() {
|
||||
t.Errorf("Manager should report disk pressure")
|
||||
}
|
||||
|
||||
// no pod should have been killed
|
||||
if podKiller.pod != nil {
|
||||
t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod)
|
||||
}
|
||||
|
||||
// try to admit our pod (should fail)
|
||||
if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); result.Admit {
|
||||
t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit)
|
||||
}
|
||||
|
||||
// move the clock past transition period to ensure that we stop reporting pressure
|
||||
fakeClock.Step(5 * time.Minute)
|
||||
summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats)
|
||||
podKiller.pod = nil // reset state
|
||||
manager.synchronize(diskInfoProvider, activePodsFunc)
|
||||
|
||||
// we should not have disk pressure (because transition period met)
|
||||
if manager.IsUnderDiskPressure() {
|
||||
t.Errorf("Manager should not report disk pressure")
|
||||
}
|
||||
|
||||
// no pod should have been killed
|
||||
if podKiller.pod != nil {
|
||||
t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod)
|
||||
}
|
||||
|
||||
// try to admit our pod (should succeed)
|
||||
if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); !result.Admit {
|
||||
t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit)
|
||||
}
|
||||
}
|
||||
|
@ -41,10 +41,16 @@ const (
|
||||
message = "The node was low on compute resources."
|
||||
// disk, in bytes. internal to this module, used to account for local disk usage.
|
||||
resourceDisk api.ResourceName = "disk"
|
||||
// inodes, number. internal to this module, used to account for local disk inode consumption.
|
||||
resourceInodes api.ResourceName = "inodes"
|
||||
// imagefs, in bytes. internal to this module, used to account for local image filesystem usage.
|
||||
resourceImageFs api.ResourceName = "imagefs"
|
||||
// imagefs inodes, number. internal to this module, used to account for local image filesystem inodes.
|
||||
resourceImageFsInodes api.ResourceName = "imagefsInodes"
|
||||
// nodefs, in bytes. internal to this module, used to account for local node root filesystem usage.
|
||||
resourceNodeFs api.ResourceName = "nodefs"
|
||||
// nodefs inodes, number. internal to this module, used to account for local node root filesystem inodes.
|
||||
resourceNodeFsInodes api.ResourceName = "nodefsInodes"
|
||||
)
|
||||
|
||||
var (
|
||||
@ -62,12 +68,16 @@ func init() {
|
||||
signalToNodeCondition[SignalMemoryAvailable] = api.NodeMemoryPressure
|
||||
signalToNodeCondition[SignalImageFsAvailable] = api.NodeDiskPressure
|
||||
signalToNodeCondition[SignalNodeFsAvailable] = api.NodeDiskPressure
|
||||
signalToNodeCondition[SignalImageFsInodesFree] = api.NodeDiskPressure
|
||||
signalToNodeCondition[SignalNodeFsInodesFree] = api.NodeDiskPressure
|
||||
|
||||
// map signals to resources (and vice-versa)
|
||||
signalToResource = map[Signal]api.ResourceName{}
|
||||
signalToResource[SignalMemoryAvailable] = api.ResourceMemory
|
||||
signalToResource[SignalImageFsAvailable] = resourceImageFs
|
||||
signalToResource[SignalImageFsInodesFree] = resourceImageFsInodes
|
||||
signalToResource[SignalNodeFsAvailable] = resourceNodeFs
|
||||
signalToResource[SignalNodeFsInodesFree] = resourceNodeFsInodes
|
||||
resourceToSignal = map[api.ResourceName]Signal{}
|
||||
for key, value := range signalToResource {
|
||||
resourceToSignal[value] = key
|
||||
@ -185,22 +195,21 @@ func parseThresholdStatement(statement string) (Threshold, error) {
|
||||
Percentage: percentage,
|
||||
},
|
||||
}, nil
|
||||
} else {
|
||||
quantity, err := resource.ParseQuantity(quantityValue)
|
||||
if err != nil {
|
||||
return Threshold{}, err
|
||||
}
|
||||
if quantity.Sign() < 0 || quantity.IsZero() {
|
||||
return Threshold{}, fmt.Errorf("eviction threshold %v must be positive: %s", signal, &quantity)
|
||||
}
|
||||
return Threshold{
|
||||
Signal: signal,
|
||||
Operator: operator,
|
||||
Value: ThresholdValue{
|
||||
Quantity: &quantity,
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
quantity, err := resource.ParseQuantity(quantityValue)
|
||||
if err != nil {
|
||||
return Threshold{}, err
|
||||
}
|
||||
if quantity.Sign() < 0 || quantity.IsZero() {
|
||||
return Threshold{}, fmt.Errorf("eviction threshold %v must be positive: %s", signal, &quantity)
|
||||
}
|
||||
return Threshold{
|
||||
Signal: signal,
|
||||
Operator: operator,
|
||||
Value: ThresholdValue{
|
||||
Quantity: &quantity,
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// parsePercentage parses a string representing a percentage value
|
||||
@ -287,6 +296,18 @@ func diskUsage(fsStats *statsapi.FsStats) *resource.Quantity {
|
||||
return resource.NewQuantity(usage, resource.BinarySI)
|
||||
}
|
||||
|
||||
// inodeUsage converts inodes consumed into a resource quantity.
|
||||
func inodeUsage(fsStats *statsapi.FsStats) *resource.Quantity {
|
||||
// TODO: cadvisor needs to support inodes used per container
|
||||
// right now, cadvisor reports total inodes and inodes free per filesystem.
|
||||
// this is insufficient to know how many inodes are consumed by the container.
|
||||
// for example, with the overlay driver, the rootfs and each container filesystem
|
||||
// will report the same total inode and inode free values but no way of knowing
|
||||
// how many inodes consumed in that filesystem are charged to this container.
|
||||
// for now, we report 0 as inode usage pending support in cadvisor.
|
||||
return resource.NewQuantity(int64(0), resource.BinarySI)
|
||||
}
|
||||
|
||||
// memoryUsage converts working set into a resource quantity.
|
||||
func memoryUsage(memStats *statsapi.MemoryStats) *resource.Quantity {
|
||||
if memStats == nil || memStats.WorkingSetBytes == nil {
|
||||
@ -311,15 +332,18 @@ func localVolumeNames(pod *api.Pod) []string {
|
||||
return result
|
||||
}
|
||||
|
||||
// podDiskUsage aggregates pod disk usage for the specified stats to measure.
|
||||
// podDiskUsage aggregates pod disk usage and inode consumption for the specified stats to measure.
|
||||
func podDiskUsage(podStats statsapi.PodStats, pod *api.Pod, statsToMeasure []fsStatsType) (api.ResourceList, error) {
|
||||
disk := resource.Quantity{Format: resource.BinarySI}
|
||||
inodes := resource.Quantity{Format: resource.BinarySI}
|
||||
for _, container := range podStats.Containers {
|
||||
if hasFsStatsType(statsToMeasure, fsStatsRoot) {
|
||||
disk.Add(*diskUsage(container.Rootfs))
|
||||
inodes.Add(*inodeUsage(container.Rootfs))
|
||||
}
|
||||
if hasFsStatsType(statsToMeasure, fsStatsLogs) {
|
||||
disk.Add(*diskUsage(container.Logs))
|
||||
inodes.Add(*inodeUsage(container.Logs))
|
||||
}
|
||||
}
|
||||
if hasFsStatsType(statsToMeasure, fsStatsLocalVolumeSource) {
|
||||
@ -328,13 +352,15 @@ func podDiskUsage(podStats statsapi.PodStats, pod *api.Pod, statsToMeasure []fsS
|
||||
for _, volumeStats := range podStats.VolumeStats {
|
||||
if volumeStats.Name == volumeName {
|
||||
disk.Add(*diskUsage(&volumeStats.FsStats))
|
||||
inodes.Add(*inodeUsage(&volumeStats.FsStats))
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return api.ResourceList{
|
||||
resourceDisk: disk,
|
||||
resourceDisk: disk,
|
||||
resourceInodes: inodes,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@ -502,8 +528,8 @@ func memory(stats statsFunc) cmpFunc {
|
||||
}
|
||||
}
|
||||
|
||||
// disk compares pods by largest consumer of disk relative to request.
|
||||
func disk(stats statsFunc, fsStatsToMeasure []fsStatsType) cmpFunc {
|
||||
// disk compares pods by largest consumer of disk relative to request for the specified disk resource.
|
||||
func disk(stats statsFunc, fsStatsToMeasure []fsStatsType, diskResource api.ResourceName) cmpFunc {
|
||||
return func(p1, p2 *api.Pod) int {
|
||||
p1Stats, found := stats(p1)
|
||||
// if we have no usage stats for p1, we want p2 first
|
||||
@ -528,8 +554,8 @@ func disk(stats statsFunc, fsStatsToMeasure []fsStatsType) cmpFunc {
|
||||
|
||||
// disk is best effort, so we don't measure relative to a request.
|
||||
// TODO: add disk as a guaranteed resource
|
||||
p1Disk := p1Usage[resourceDisk]
|
||||
p2Disk := p2Usage[resourceDisk]
|
||||
p1Disk := p1Usage[diskResource]
|
||||
p2Disk := p2Usage[diskResource]
|
||||
// if p2 is using more than p1, we want p2 first
|
||||
return p2Disk.Cmp(p1Disk)
|
||||
}
|
||||
@ -541,9 +567,9 @@ func rankMemoryPressure(pods []*api.Pod, stats statsFunc) {
|
||||
}
|
||||
|
||||
// rankDiskPressureFunc returns a rankFunc that measures the specified fs stats.
|
||||
func rankDiskPressureFunc(fsStatsToMeasure []fsStatsType) rankFunc {
|
||||
func rankDiskPressureFunc(fsStatsToMeasure []fsStatsType, diskResource api.ResourceName) rankFunc {
|
||||
return func(pods []*api.Pod, stats statsFunc) {
|
||||
orderedBy(qosComparator, disk(stats, fsStatsToMeasure)).Sort(pods)
|
||||
orderedBy(qosComparator, disk(stats, fsStatsToMeasure, diskResource)).Sort(pods)
|
||||
}
|
||||
}
|
||||
|
||||
@ -564,6 +590,7 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObserv
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// build the function to work against for pod stats
|
||||
statsFunc := cachedStatsFunc(summary.Pods)
|
||||
// build an evaluation context for current eviction signals
|
||||
@ -575,17 +602,33 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObserv
|
||||
capacity: resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI),
|
||||
}
|
||||
}
|
||||
if nodeFs := summary.Node.Fs; nodeFs != nil && nodeFs.AvailableBytes != nil && nodeFs.CapacityBytes != nil {
|
||||
result[SignalNodeFsAvailable] = signalObservation{
|
||||
available: resource.NewQuantity(int64(*nodeFs.AvailableBytes), resource.BinarySI),
|
||||
capacity: resource.NewQuantity(int64(*nodeFs.CapacityBytes), resource.BinarySI),
|
||||
if nodeFs := summary.Node.Fs; nodeFs != nil {
|
||||
if nodeFs.AvailableBytes != nil && nodeFs.CapacityBytes != nil {
|
||||
result[SignalNodeFsAvailable] = signalObservation{
|
||||
available: resource.NewQuantity(int64(*nodeFs.AvailableBytes), resource.BinarySI),
|
||||
capacity: resource.NewQuantity(int64(*nodeFs.CapacityBytes), resource.BinarySI),
|
||||
}
|
||||
}
|
||||
if nodeFs.InodesFree != nil && nodeFs.Inodes != nil {
|
||||
result[SignalNodeFsInodesFree] = signalObservation{
|
||||
available: resource.NewQuantity(int64(*nodeFs.InodesFree), resource.BinarySI),
|
||||
capacity: resource.NewQuantity(int64(*nodeFs.Inodes), resource.BinarySI),
|
||||
}
|
||||
}
|
||||
}
|
||||
if summary.Node.Runtime != nil {
|
||||
if imageFs := summary.Node.Runtime.ImageFs; imageFs != nil && imageFs.AvailableBytes != nil && imageFs.CapacityBytes != nil {
|
||||
result[SignalImageFsAvailable] = signalObservation{
|
||||
available: resource.NewQuantity(int64(*imageFs.AvailableBytes), resource.BinarySI),
|
||||
capacity: resource.NewQuantity(int64(*imageFs.CapacityBytes), resource.BinarySI),
|
||||
if imageFs := summary.Node.Runtime.ImageFs; imageFs != nil {
|
||||
if imageFs.AvailableBytes != nil && imageFs.CapacityBytes != nil {
|
||||
result[SignalImageFsAvailable] = signalObservation{
|
||||
available: resource.NewQuantity(int64(*imageFs.AvailableBytes), resource.BinarySI),
|
||||
capacity: resource.NewQuantity(int64(*imageFs.CapacityBytes), resource.BinarySI),
|
||||
}
|
||||
if imageFs.InodesFree != nil && imageFs.Inodes != nil {
|
||||
result[SignalImageFsInodesFree] = signalObservation{
|
||||
available: resource.NewQuantity(int64(*imageFs.InodesFree), resource.BinarySI),
|
||||
capacity: resource.NewQuantity(int64(*imageFs.Inodes), resource.BinarySI),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -785,16 +828,20 @@ func buildResourceToRankFunc(withImageFs bool) map[api.ResourceName]rankFunc {
|
||||
// usage of an imagefs is optional
|
||||
if withImageFs {
|
||||
// with an imagefs, nodefs pod rank func for eviction only includes logs and local volumes
|
||||
resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource})
|
||||
resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)
|
||||
resourceToRankFunc[resourceNodeFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
|
||||
// with an imagefs, imagefs pod rank func for eviction only includes rootfs
|
||||
resourceToRankFunc[resourceImageFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot})
|
||||
resourceToRankFunc[resourceImageFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceDisk)
|
||||
resourceToRankFunc[resourceImageFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceInodes)
|
||||
} else {
|
||||
// without an imagefs, nodefs pod rank func for eviction looks at all fs stats
|
||||
resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource})
|
||||
resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)
|
||||
resourceToRankFunc[resourceNodeFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
|
||||
}
|
||||
return resourceToRankFunc
|
||||
}
|
||||
|
||||
// PodIsEvicted returns true if the reported pod status is due to an eviction.
|
||||
func PodIsEvicted(podStatus api.PodStatus) bool {
|
||||
return podStatus.Phase == api.PodFailed && podStatus.Reason == reason
|
||||
}
|
||||
@ -806,11 +853,14 @@ func buildResourceToNodeReclaimFuncs(imageGC ImageGC, withImageFs bool) map[api.
|
||||
if withImageFs {
|
||||
// with an imagefs, nodefs pressure should just delete logs
|
||||
resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs()}
|
||||
resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{deleteLogs()}
|
||||
// with an imagefs, imagefs pressure should delete unused images
|
||||
resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteImages(imageGC)}
|
||||
resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteImages(imageGC, true)}
|
||||
resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{deleteImages(imageGC, false)}
|
||||
} else {
|
||||
// without an imagefs, nodefs pressure should delete logs, and unused images
|
||||
resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC)}
|
||||
resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, true)}
|
||||
resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, false)}
|
||||
}
|
||||
return resourceToReclaimFunc
|
||||
}
|
||||
@ -824,13 +874,17 @@ func deleteLogs() nodeReclaimFunc {
|
||||
}
|
||||
|
||||
// deleteImages will delete unused images to free up disk pressure.
|
||||
func deleteImages(imageGC ImageGC) nodeReclaimFunc {
|
||||
func deleteImages(imageGC ImageGC, reportBytesFreed bool) nodeReclaimFunc {
|
||||
return func() (*resource.Quantity, error) {
|
||||
glog.Infof("eviction manager: attempting to delete unused images")
|
||||
reclaimed, err := imageGC.DeleteUnusedImages()
|
||||
bytesFreed, err := imageGC.DeleteUnusedImages()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
reclaimed := int64(0)
|
||||
if reportBytesFreed {
|
||||
reclaimed = bytesFreed
|
||||
}
|
||||
return resource.NewQuantity(reclaimed, resource.BinarySI), nil
|
||||
}
|
||||
}
|
||||
|
@ -191,6 +191,49 @@ func TestParseThresholdConfig(t *testing.T) {
|
||||
},
|
||||
},
|
||||
},
|
||||
"inode flag values": {
|
||||
evictionHard: "imagefs.inodesFree<150Mi,nodefs.inodesFree<100Mi",
|
||||
evictionSoft: "imagefs.inodesFree<300Mi,nodefs.inodesFree<200Mi",
|
||||
evictionSoftGracePeriod: "imagefs.inodesFree=30s,nodefs.inodesFree=30s",
|
||||
evictionMinReclaim: "imagefs.inodesFree=2Gi,nodefs.inodesFree=1Gi",
|
||||
expectErr: false,
|
||||
expectThresholds: []Threshold{
|
||||
{
|
||||
Signal: SignalImageFsInodesFree,
|
||||
Operator: OpLessThan,
|
||||
Value: ThresholdValue{
|
||||
Quantity: quantityMustParse("150Mi"),
|
||||
},
|
||||
MinReclaim: quantityMustParse("2Gi"),
|
||||
},
|
||||
{
|
||||
Signal: SignalNodeFsInodesFree,
|
||||
Operator: OpLessThan,
|
||||
Value: ThresholdValue{
|
||||
Quantity: quantityMustParse("100Mi"),
|
||||
},
|
||||
MinReclaim: quantityMustParse("1Gi"),
|
||||
},
|
||||
{
|
||||
Signal: SignalImageFsInodesFree,
|
||||
Operator: OpLessThan,
|
||||
Value: ThresholdValue{
|
||||
Quantity: quantityMustParse("300Mi"),
|
||||
},
|
||||
GracePeriod: gracePeriod,
|
||||
MinReclaim: quantityMustParse("2Gi"),
|
||||
},
|
||||
{
|
||||
Signal: SignalNodeFsInodesFree,
|
||||
Operator: OpLessThan,
|
||||
Value: ThresholdValue{
|
||||
Quantity: quantityMustParse("200Mi"),
|
||||
},
|
||||
GracePeriod: gracePeriod,
|
||||
MinReclaim: quantityMustParse("1Gi"),
|
||||
},
|
||||
},
|
||||
},
|
||||
"invalid-signal": {
|
||||
evictionHard: "mem.available<150Mi",
|
||||
evictionSoft: "",
|
||||
@ -400,7 +443,7 @@ func TestOrderedByDisk(t *testing.T) {
|
||||
return result, found
|
||||
}
|
||||
pods := []*api.Pod{pod1, pod2, pod3, pod4, pod5, pod6}
|
||||
orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource})).Sort(pods)
|
||||
orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)).Sort(pods)
|
||||
expected := []*api.Pod{pod6, pod5, pod4, pod3, pod2, pod1}
|
||||
for i := range expected {
|
||||
if pods[i] != expected[i] {
|
||||
@ -466,7 +509,7 @@ func TestOrderedByQoSDisk(t *testing.T) {
|
||||
return result, found
|
||||
}
|
||||
pods := []*api.Pod{pod1, pod2, pod3, pod4, pod5, pod6}
|
||||
orderedBy(qosComparator, disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource})).Sort(pods)
|
||||
orderedBy(qosComparator, disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)).Sort(pods)
|
||||
expected := []*api.Pod{pod2, pod1, pod4, pod3, pod6, pod5}
|
||||
for i := range expected {
|
||||
if pods[i] != expected[i] {
|
||||
@ -608,6 +651,10 @@ func TestMakeSignalObservations(t *testing.T) {
|
||||
imageFsCapacityBytes := uint64(1024 * 1024 * 2)
|
||||
nodeFsAvailableBytes := uint64(1024)
|
||||
nodeFsCapacityBytes := uint64(1024 * 2)
|
||||
imageFsInodesFree := uint64(1024)
|
||||
imageFsInodes := uint64(1024 * 1024)
|
||||
nodeFsInodesFree := uint64(1024)
|
||||
nodeFsInodes := uint64(1024 * 1024)
|
||||
fakeStats := &statsapi.Summary{
|
||||
Node: statsapi.NodeStats{
|
||||
Memory: &statsapi.MemoryStats{
|
||||
@ -618,11 +665,15 @@ func TestMakeSignalObservations(t *testing.T) {
|
||||
ImageFs: &statsapi.FsStats{
|
||||
AvailableBytes: &imageFsAvailableBytes,
|
||||
CapacityBytes: &imageFsCapacityBytes,
|
||||
InodesFree: &imageFsInodesFree,
|
||||
Inodes: &imageFsInodes,
|
||||
},
|
||||
},
|
||||
Fs: &statsapi.FsStats{
|
||||
AvailableBytes: &nodeFsAvailableBytes,
|
||||
CapacityBytes: &nodeFsCapacityBytes,
|
||||
InodesFree: &nodeFsInodesFree,
|
||||
Inodes: &nodeFsInodes,
|
||||
},
|
||||
},
|
||||
Pods: []statsapi.PodStats{},
|
||||
@ -664,6 +715,16 @@ func TestMakeSignalObservations(t *testing.T) {
|
||||
if expectedBytes := int64(nodeFsCapacityBytes); nodeFsQuantity.capacity.Value() != expectedBytes {
|
||||
t.Errorf("Expected %v, actual: %v", expectedBytes, nodeFsQuantity.capacity.Value())
|
||||
}
|
||||
nodeFsInodesQuantity, found := actualObservations[SignalNodeFsInodesFree]
|
||||
if !found {
|
||||
t.Errorf("Expected inodes free nodefs observation: %v", err)
|
||||
}
|
||||
if expected := int64(nodeFsInodesFree); nodeFsInodesQuantity.available.Value() != expected {
|
||||
t.Errorf("Expected %v, actual: %v", expected, nodeFsInodesQuantity.available.Value())
|
||||
}
|
||||
if expected := int64(nodeFsInodes); nodeFsInodesQuantity.capacity.Value() != expected {
|
||||
t.Errorf("Expected %v, actual: %v", expected, nodeFsInodesQuantity.capacity.Value())
|
||||
}
|
||||
imageFsQuantity, found := actualObservations[SignalImageFsAvailable]
|
||||
if !found {
|
||||
t.Errorf("Expected available imagefs observation: %v", err)
|
||||
@ -674,6 +735,16 @@ func TestMakeSignalObservations(t *testing.T) {
|
||||
if expectedBytes := int64(imageFsCapacityBytes); imageFsQuantity.capacity.Value() != expectedBytes {
|
||||
t.Errorf("Expected %v, actual: %v", expectedBytes, imageFsQuantity.capacity.Value())
|
||||
}
|
||||
imageFsInodesQuantity, found := actualObservations[SignalImageFsInodesFree]
|
||||
if !found {
|
||||
t.Errorf("Expected inodes free imagefs observation: %v", err)
|
||||
}
|
||||
if expected := int64(imageFsInodesFree); imageFsInodesQuantity.available.Value() != expected {
|
||||
t.Errorf("Expected %v, actual: %v", expected, imageFsInodesQuantity.available.Value())
|
||||
}
|
||||
if expected := int64(imageFsInodes); imageFsInodesQuantity.capacity.Value() != expected {
|
||||
t.Errorf("Expected %v, actual: %v", expected, imageFsInodesQuantity.capacity.Value())
|
||||
}
|
||||
for _, pod := range pods {
|
||||
podStats, found := statsFunc(pod)
|
||||
if !found {
|
||||
@ -1204,6 +1275,22 @@ func testCompareThresholdValue(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// newPodInodeStats returns stats with specified usage amounts.
|
||||
// TODO: in future, this should take a value for inodesUsed per container.
|
||||
func newPodInodeStats(pod *api.Pod) statsapi.PodStats {
|
||||
result := statsapi.PodStats{
|
||||
PodRef: statsapi.PodReference{
|
||||
Name: pod.Name, Namespace: pod.Namespace, UID: string(pod.UID),
|
||||
},
|
||||
}
|
||||
for range pod.Spec.Containers {
|
||||
result.Containers = append(result.Containers, statsapi.ContainerStats{
|
||||
Rootfs: &statsapi.FsStats{},
|
||||
})
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// newPodDiskStats returns stats with specified usage amounts.
|
||||
func newPodDiskStats(pod *api.Pod, rootFsUsed, logsUsed, perLocalVolumeUsed resource.Quantity) statsapi.PodStats {
|
||||
result := statsapi.PodStats{
|
||||
|
@ -32,8 +32,12 @@ const (
|
||||
SignalMemoryAvailable Signal = "memory.available"
|
||||
// SignalNodeFsAvailable is amount of storage available on filesystem that kubelet uses for volumes, daemon logs, etc.
|
||||
SignalNodeFsAvailable Signal = "nodefs.available"
|
||||
// SignalNodeFsInodesFree is the number of inodes free on the filesystem that kubelet uses for volumes, daemon logs, etc.
|
||||
SignalNodeFsInodesFree Signal = "nodefs.inodesFree"
|
||||
// SignalImageFsAvailable is amount of storage available on filesystem that container runtime uses for storing images and container writable layers.
|
||||
SignalImageFsAvailable Signal = "imagefs.available"
|
||||
// SignalImageFsInodesFree is the number of inodes free on the filesystem that container runtime uses for storing images and container writable layers.
|
||||
SignalImageFsInodesFree Signal = "imagefs.inodesFree"
|
||||
)
|
||||
|
||||
// fsStatsType defines the types of filesystem stats to collect.
|
||||
|
Loading…
Reference in New Issue
Block a user