Merge pull request #63521 from dashpole/allocatable_memcg

Automatic merge from submit-queue (batch tested with PRs 63314, 63884, 63799, 63521, 62242). If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

Add memcg notifications for allocatable cgroup

**What this PR does / why we need it**:
Use memory cgroup (memcg) notifications to trigger the eviction manager when the allocatable eviction threshold is crossed. This lets the eviction manager react more quickly when available memory in the allocatable cgroup runs low. Evictions are preferable to cgroup OOM kills, since the kubelet can enforce its own priorities when choosing which pod to kill.
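
For context, the kernel-side mechanism (cgroup v1) works as follows: a threshold on a memcg attribute such as `memory.usage_in_bytes` is registered by writing `"<event_fd> <attribute_fd> <threshold>"` to the cgroup's `cgroup.event_control` file, and the kernel signals the eventfd once usage crosses the threshold. Below is a minimal, self-contained sketch of that mechanism (requires root and a cgroup-v1 memory mount); the cgroup path and threshold value are illustrative, not the kubelet's actual code:

```go
// Minimal sketch of the cgroup-v1 memcg threshold notification mechanism.
// The cgroup path and threshold value below are illustrative only.
package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

func main() {
	const cgroupDir = "/sys/fs/cgroup/memory/kubepods" // illustrative allocatable cgroup
	const threshold = 9 << 30                          // e.g. capacity - eviction_hard + inactive_file

	// eventfd that the kernel will signal when usage crosses the threshold.
	efd, err := unix.Eventfd(0, unix.EFD_CLOEXEC)
	if err != nil {
		panic(err)
	}
	defer unix.Close(efd)

	// fd of the attribute being watched.
	watchfd, err := unix.Open(cgroupDir+"/memory.usage_in_bytes", unix.O_RDONLY, 0)
	if err != nil {
		panic(err)
	}
	defer unix.Close(watchfd)

	// Arm the notification: "<event_fd> <attribute_fd> <threshold>".
	entry := fmt.Sprintf("%d %d %d", efd, watchfd, threshold)
	if err := os.WriteFile(cgroupDir+"/cgroup.event_control", []byte(entry), 0700); err != nil {
		panic(err)
	}

	// Block until the kernel signals the eventfd; the kubelet's handler
	// would invoke the eviction manager's synchronize loop at this point.
	buf := make([]byte, 8)
	if _, err := unix.Read(efd, buf); err != nil {
		panic(err)
	}
	fmt.Println("memcg threshold crossed")
}
```

Because the threshold value is derived from a point-in-time stats summary, the kubelet registers these notifiers from inside its `synchronize` loop (see the diff below) rather than once at startup.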

**Which issue(s) this PR fixes**:
Fixes https://github.com/kubernetes/kubernetes/issues/57901

**Special notes for your reviewer**:
This adds the allocatable cgroup (from the container manager) to the eviction manager's config.

**Release note**:
```release-note
NONE
```
/sig node
/priority important-soon
/kind feature

I would like this to be included in the 1.11 release.
Kubernetes Submit Queue 2018-05-15 19:55:15 -07:00 committed by GitHub
commit 6934c4f599
3 changed files with 34 additions and 7 deletions

```diff
@@ -196,7 +196,7 @@ func (m *managerImpl) IsUnderPIDPressure() bool {
 	return hasNodeCondition(m.nodeConditions, v1.NodePIDPressure)
 }
 
-func (m *managerImpl) startMemoryThresholdNotifier(summary *statsapi.Summary, hard bool, handler thresholdNotifierHandlerFunc) error {
+func (m *managerImpl) startMemoryThresholdNotifier(summary *statsapi.Summary, hard, allocatable bool, handler thresholdNotifierHandlerFunc) error {
 	for _, threshold := range m.config.Thresholds {
 		if threshold.Signal != evictionapi.SignalMemoryAvailable || hard != isHardEvictionThreshold(threshold) {
 			continue
@@ -205,19 +205,27 @@ func (m *managerImpl) startMemoryThresholdNotifier(summary *statsapi.Summary, ha
 		if err != nil {
 			return err
 		}
-		// TODO add support for eviction from --cgroup-root
 		cgpath, found := cgroups.MountPoints["memory"]
 		if !found || len(cgpath) == 0 {
 			return fmt.Errorf("memory cgroup mount point not found")
 		}
 		attribute := "memory.usage_in_bytes"
-		if summary.Node.Memory == nil || summary.Node.Memory.UsageBytes == nil || summary.Node.Memory.WorkingSetBytes == nil {
+		memoryStats := summary.Node.Memory
+		if allocatable {
+			cgpath += m.config.PodCgroupRoot
+			allocatableContainer, err := getSysContainer(summary.Node.SystemContainers, statsapi.SystemContainerPods)
+			if err != nil {
+				return err
+			}
+			memoryStats = allocatableContainer.Memory
+		}
+		if memoryStats == nil || memoryStats.UsageBytes == nil || memoryStats.WorkingSetBytes == nil || memoryStats.AvailableBytes == nil {
 			return fmt.Errorf("summary was incomplete")
 		}
 		// Set threshold on usage to capacity - eviction_hard + inactive_file,
 		// since we want to be notified when working_set = capacity - eviction_hard
-		inactiveFile := resource.NewQuantity(int64(*summary.Node.Memory.UsageBytes-*summary.Node.Memory.WorkingSetBytes), resource.BinarySI)
-		capacity := resource.NewQuantity(int64(*summary.Node.Memory.AvailableBytes+*summary.Node.Memory.WorkingSetBytes), resource.BinarySI)
+		inactiveFile := resource.NewQuantity(int64(*memoryStats.UsageBytes-*memoryStats.WorkingSetBytes), resource.BinarySI)
+		capacity := resource.NewQuantity(int64(*memoryStats.AvailableBytes+*memoryStats.WorkingSetBytes), resource.BinarySI)
 		evictionThresholdQuantity := evictionapi.GetThresholdQuantity(threshold.Value, capacity)
 		memcgThreshold := capacity.DeepCopy()
 		memcgThreshold.Sub(*evictionThresholdQuantity)
```
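
To make the threshold arithmetic above concrete (numbers illustrative): memcg notifications fire on `memory.usage_in_bytes`, but eviction decisions are based on the working set, where `working_set = usage - inactive_file`. With 10Gi capacity, a 1Gi hard eviction threshold, and 500Mi of inactive file pages, the notifier is armed at `10Gi - 1Gi + 500Mi = 9.5Gi` of usage; when it fires, the working set is `9.5Gi - 500Mi = 9Gi`, which is exactly `capacity - eviction_hard`.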
```diff
@@ -267,22 +275,38 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	if m.config.KernelMemcgNotification && m.notifierStopCh.Reset() {
 		glog.V(4).Infof("eviction manager attempting to integrate with kernel memcg notification api")
 		// start soft memory notification
-		err = m.startMemoryThresholdNotifier(summary, false, func(desc string) {
+		err = m.startMemoryThresholdNotifier(summary, false, false, func(desc string) {
 			glog.Infof("soft memory eviction threshold crossed at %s", desc)
 			// TODO wait grace period for soft memory limit
 			m.synchronize(diskInfoProvider, podFunc)
 		})
 		if err != nil {
 			glog.Warningf("eviction manager: failed to create soft memory threshold notifier: %v", err)
 		}
+		// start soft memory notification
+		err = m.startMemoryThresholdNotifier(summary, false, true, func(desc string) {
+			glog.Infof("soft allocatable memory eviction threshold crossed at %s", desc)
+			// TODO wait grace period for soft memory limit
+			m.synchronize(diskInfoProvider, podFunc)
+		})
+		if err != nil {
+			glog.Warningf("eviction manager: failed to create allocatable soft memory threshold notifier: %v", err)
+		}
 		// start hard memory notification
-		err = m.startMemoryThresholdNotifier(summary, true, func(desc string) {
+		err = m.startMemoryThresholdNotifier(summary, true, false, func(desc string) {
 			glog.Infof("hard memory eviction threshold crossed at %s", desc)
 			m.synchronize(diskInfoProvider, podFunc)
 		})
 		if err != nil {
 			glog.Warningf("eviction manager: failed to create hard memory threshold notifier: %v", err)
 		}
+		// start hard memory notification
+		err = m.startMemoryThresholdNotifier(summary, true, true, func(desc string) {
+			glog.Infof("hard allocatable memory eviction threshold crossed at %s", desc)
+			m.synchronize(diskInfoProvider, podFunc)
+		})
+		if err != nil {
+			glog.Warningf("eviction manager: failed to create hard allocatable memory threshold notifier: %v", err)
+		}
 	}
 
 	// make observations and get a function to derive pod usage stats relative to those observations.
```

```diff
@@ -48,6 +48,8 @@ type Config struct {
 	Thresholds []evictionapi.Threshold
 	// KernelMemcgNotification if true will integrate with the kernel memcg notification to determine if memory thresholds are crossed.
 	KernelMemcgNotification bool
+	// PodCgroupRoot is the cgroup which contains all pods.
+	PodCgroupRoot string
 }
 
 // Manager evaluates when an eviction threshold for node stability has been met on the node.
```
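
This new field feeds the `cgpath += m.config.PodCgroupRoot` line in the notifier above. For example (illustrative values), with the memory controller mounted at `/sys/fs/cgroup/memory` and a pod cgroup root of `/kubepods`, the allocatable notifier watches `/sys/fs/cgroup/memory/kubepods/memory.usage_in_bytes`.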

```diff
@@ -439,6 +439,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
 		MaxPodGracePeriodSeconds: int64(kubeCfg.EvictionMaxPodGracePeriod),
 		Thresholds:               thresholds,
 		KernelMemcgNotification:  experimentalKernelMemcgNotification,
+		PodCgroupRoot:            kubeDeps.ContainerManager.GetPodCgroupRoot(),
 	}
 
 	serviceIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})
```