diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index a101d865d1a..07f465bef9c 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -196,7 +196,7 @@ func (m *managerImpl) IsUnderPIDPressure() bool { return hasNodeCondition(m.nodeConditions, v1.NodePIDPressure) } -func (m *managerImpl) startMemoryThresholdNotifier(summary *statsapi.Summary, hard bool, handler thresholdNotifierHandlerFunc) error { +func (m *managerImpl) startMemoryThresholdNotifier(summary *statsapi.Summary, hard, allocatable bool, handler thresholdNotifierHandlerFunc) error { for _, threshold := range m.config.Thresholds { if threshold.Signal != evictionapi.SignalMemoryAvailable || hard != isHardEvictionThreshold(threshold) { continue @@ -205,19 +205,27 @@ func (m *managerImpl) startMemoryThresholdNotifier(summary *statsapi.Summary, ha if err != nil { return err } - // TODO add support for eviction from --cgroup-root cgpath, found := cgroups.MountPoints["memory"] if !found || len(cgpath) == 0 { return fmt.Errorf("memory cgroup mount point not found") } attribute := "memory.usage_in_bytes" - if summary.Node.Memory == nil || summary.Node.Memory.UsageBytes == nil || summary.Node.Memory.WorkingSetBytes == nil { + memoryStats := summary.Node.Memory + if allocatable { + cgpath += m.config.PodCgroupRoot + allocatableContainer, err := getSysContainer(summary.Node.SystemContainers, statsapi.SystemContainerPods) + if err != nil { + return err + } + memoryStats = allocatableContainer.Memory + } + if memoryStats == nil || memoryStats.UsageBytes == nil || memoryStats.WorkingSetBytes == nil || memoryStats.AvailableBytes == nil { return fmt.Errorf("summary was incomplete") } // Set threshold on usage to capacity - eviction_hard + inactive_file, // since we want to be notified when working_set = capacity - eviction_hard - inactiveFile := resource.NewQuantity(int64(*summary.Node.Memory.UsageBytes-*summary.Node.Memory.WorkingSetBytes), resource.BinarySI) - capacity := resource.NewQuantity(int64(*summary.Node.Memory.AvailableBytes+*summary.Node.Memory.WorkingSetBytes), resource.BinarySI) + inactiveFile := resource.NewQuantity(int64(*memoryStats.UsageBytes-*memoryStats.WorkingSetBytes), resource.BinarySI) + capacity := resource.NewQuantity(int64(*memoryStats.AvailableBytes+*memoryStats.WorkingSetBytes), resource.BinarySI) evictionThresholdQuantity := evictionapi.GetThresholdQuantity(threshold.Value, capacity) memcgThreshold := capacity.DeepCopy() memcgThreshold.Sub(*evictionThresholdQuantity) @@ -267,22 +275,38 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act if m.config.KernelMemcgNotification && m.notifierStopCh.Reset() { glog.V(4).Infof("eviction manager attempting to integrate with kernel memcg notification api") // start soft memory notification - err = m.startMemoryThresholdNotifier(summary, false, func(desc string) { + err = m.startMemoryThresholdNotifier(summary, false, false, func(desc string) { glog.Infof("soft memory eviction threshold crossed at %s", desc) // TODO wait grace period for soft memory limit m.synchronize(diskInfoProvider, podFunc) }) if err != nil { glog.Warningf("eviction manager: failed to create soft memory threshold notifier: %v", err) + } // start soft memory notification + err = m.startMemoryThresholdNotifier(summary, false, true, func(desc string) { + glog.Infof("soft allocatable memory eviction threshold crossed at %s", desc) + // TODO wait grace period for soft memory limit + m.synchronize(diskInfoProvider, podFunc) + }) + if err != nil { + glog.Warningf("eviction manager: failed to create allocatable soft memory threshold notifier: %v", err) } // start hard memory notification - err = m.startMemoryThresholdNotifier(summary, true, func(desc string) { + err = m.startMemoryThresholdNotifier(summary, true, false, func(desc string) { glog.Infof("hard memory eviction threshold crossed at %s", desc) m.synchronize(diskInfoProvider, podFunc) }) if err != nil { glog.Warningf("eviction manager: failed to create hard memory threshold notifier: %v", err) } + // start hard memory notification + err = m.startMemoryThresholdNotifier(summary, true, true, func(desc string) { + glog.Infof("hard allocatable memory eviction threshold crossed at %s", desc) + m.synchronize(diskInfoProvider, podFunc) + }) + if err != nil { + glog.Warningf("eviction manager: failed to create hard allocatable memory threshold notifier: %v", err) + } } // make observations and get a function to derive pod usage stats relative to those observations. diff --git a/pkg/kubelet/eviction/types.go b/pkg/kubelet/eviction/types.go index cc192b8b5e7..26e0432fd9e 100644 --- a/pkg/kubelet/eviction/types.go +++ b/pkg/kubelet/eviction/types.go @@ -48,6 +48,8 @@ type Config struct { Thresholds []evictionapi.Threshold // KernelMemcgNotification if true will integrate with the kernel memcg notification to determine if memory thresholds are crossed. KernelMemcgNotification bool + // PodCgroupRoot is the cgroup which contains all pods. + PodCgroupRoot string } // Manager evaluates when an eviction threshold for node stability has been met on the node. diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index fad4ffeaf65..2bae0115356 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -439,6 +439,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, MaxPodGracePeriodSeconds: int64(kubeCfg.EvictionMaxPodGracePeriod), Thresholds: thresholds, KernelMemcgNotification: experimentalKernelMemcgNotification, + PodCgroupRoot: kubeDeps.ContainerManager.GetPodCgroupRoot(), } serviceIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})