mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-09-15 14:14:39 +00:00
kubelet: eviction: add memcg threshold notifier to improve eviction responsiveness
This commit is contained in:
@@ -25,6 +25,7 @@ go_library(
|
||||
"//pkg/api/unversioned:go_default_library",
|
||||
"//pkg/client/record:go_default_library",
|
||||
"//pkg/kubelet/api/v1alpha1/stats:go_default_library",
|
||||
"//pkg/kubelet/cm:go_default_library",
|
||||
"//pkg/kubelet/lifecycle:go_default_library",
|
||||
"//pkg/kubelet/qos:go_default_library",
|
||||
"//pkg/kubelet/server/stats:go_default_library",
|
||||
|
@@ -24,7 +24,9 @@ import (
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/kubernetes/pkg/api"
|
||||
"k8s.io/kubernetes/pkg/api/resource"
|
||||
"k8s.io/kubernetes/pkg/client/record"
|
||||
"k8s.io/kubernetes/pkg/kubelet/cm"
|
||||
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
|
||||
"k8s.io/kubernetes/pkg/kubelet/qos"
|
||||
"k8s.io/kubernetes/pkg/kubelet/server/stats"
|
||||
@@ -33,7 +35,7 @@ import (
|
||||
"k8s.io/kubernetes/pkg/util/wait"
|
||||
)
|
||||
|
||||
// managerImpl implements NodeStabilityManager
|
||||
// managerImpl implements Manager
|
||||
type managerImpl struct {
|
||||
// used to track time
|
||||
clock clock.Clock
|
||||
@@ -65,6 +67,8 @@ type managerImpl struct {
|
||||
resourceToNodeReclaimFuncs map[api.ResourceName]nodeReclaimFuncs
|
||||
// last observations from synchronize
|
||||
lastObservations signalObservations
|
||||
// notifiersInitialized indicates if the threshold notifiers have been initialized (i.e. synchronize() has been called once)
|
||||
notifiersInitialized bool
|
||||
}
|
||||
|
||||
// ensure it implements the required interface
|
||||
@@ -139,6 +143,39 @@ func (m *managerImpl) IsUnderDiskPressure() bool {
|
||||
return hasNodeCondition(m.nodeConditions, api.NodeDiskPressure)
|
||||
}
|
||||
|
||||
func startMemoryThresholdNotifier(thresholds []Threshold, observations signalObservations, hard bool, handler thresholdNotifierHandlerFunc) error {
|
||||
for _, threshold := range thresholds {
|
||||
if threshold.Signal != SignalMemoryAvailable || hard != isHardEvictionThreshold(threshold) {
|
||||
continue
|
||||
}
|
||||
observed, found := observations[SignalMemoryAvailable]
|
||||
if !found {
|
||||
continue
|
||||
}
|
||||
cgroups, err := cm.GetCgroupSubsystems()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// TODO add support for eviction from --cgroup-root
|
||||
cgpath, found := cgroups.MountPoints["memory"]
|
||||
if !found || len(cgpath) == 0 {
|
||||
return fmt.Errorf("memory cgroup mount point not found")
|
||||
}
|
||||
attribute := "memory.usage_in_bytes"
|
||||
quantity := getThresholdQuantity(threshold.Value, observed.capacity)
|
||||
usageThreshold := resource.NewQuantity(observed.capacity.Value(), resource.DecimalSI)
|
||||
usageThreshold.Sub(*quantity)
|
||||
description := fmt.Sprintf("<%s available", formatThresholdValue(threshold.Value))
|
||||
memcgThresholdNotifier, err := NewMemCGThresholdNotifier(cgpath, attribute, usageThreshold.String(), description, handler)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
go memcgThresholdNotifier.Start(wait.NeverStop)
|
||||
return nil
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// synchronize is the main control loop that enforces eviction thresholds.
|
||||
func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) {
|
||||
// if we have nothing to do, just return
|
||||
@@ -166,8 +203,27 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
|
||||
return
|
||||
}
|
||||
|
||||
// find the list of thresholds that are met independent of grace period
|
||||
now := m.clock.Now()
|
||||
// attempt to create a threshold notifier to improve eviction response time
|
||||
if !m.notifiersInitialized {
|
||||
m.notifiersInitialized = true
|
||||
// start soft memory notification
|
||||
err = startMemoryThresholdNotifier(m.config.Thresholds, observations, false, func(desc string) {
|
||||
glog.Infof("soft memory eviction threshold crossed at %s", desc)
|
||||
// TODO wait grace period for soft memory limit
|
||||
m.synchronize(diskInfoProvider, podFunc)
|
||||
})
|
||||
if err != nil {
|
||||
glog.Warningf("eviction manager: failed to create hard memory threshold notifier: %v", err)
|
||||
}
|
||||
// start hard memory notification
|
||||
err = startMemoryThresholdNotifier(m.config.Thresholds, observations, true, func(desc string) {
|
||||
glog.Infof("hard memory eviction threshold crossed at %s", desc)
|
||||
m.synchronize(diskInfoProvider, podFunc)
|
||||
})
|
||||
if err != nil {
|
||||
glog.Warningf("eviction manager: failed to create soft memory threshold notifier: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// determine the set of thresholds met independent of grace period
|
||||
thresholds = thresholdsMet(thresholds, observations, false)
|
||||
@@ -182,6 +238,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
|
||||
thresholds = thresholdsUpdatedStats(thresholds, observations, m.lastObservations)
|
||||
|
||||
// track when a threshold was first observed
|
||||
now := m.clock.Now()
|
||||
thresholdsFirstObservedAt := thresholdsFirstObservedAt(thresholds, m.thresholdsFirstObservedAt, now)
|
||||
|
||||
// the set of node conditions that are triggered by currently observed thresholds
|
||||
@@ -218,7 +275,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
|
||||
glog.Warningf("eviction manager: attempting to reclaim %v", resourceToReclaim)
|
||||
|
||||
// determine if this is a soft or hard eviction associated with the resource
|
||||
softEviction := isSoftEviction(thresholds, resourceToReclaim)
|
||||
softEviction := isSoftEvictionThresholds(thresholds, resourceToReclaim)
|
||||
|
||||
// record an event about the resources we are now attempting to reclaim via eviction
|
||||
m.recorder.Eventf(m.nodeRef, api.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim)
|
||||
|
@@ -848,18 +848,23 @@ func getStarvedResources(thresholds []Threshold) []api.ResourceName {
|
||||
}
|
||||
|
||||
// isSoftEviction returns true if the thresholds met for the starved resource are only soft thresholds
|
||||
func isSoftEviction(thresholds []Threshold, starvedResource api.ResourceName) bool {
|
||||
func isSoftEvictionThresholds(thresholds []Threshold, starvedResource api.ResourceName) bool {
|
||||
for _, threshold := range thresholds {
|
||||
if resourceToCheck := signalToResource[threshold.Signal]; resourceToCheck != starvedResource {
|
||||
continue
|
||||
}
|
||||
if threshold.GracePeriod == time.Duration(0) {
|
||||
if isHardEvictionThreshold(threshold) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// isSoftEviction returns true if the thresholds met for the starved resource are only soft thresholds
|
||||
func isHardEvictionThreshold(threshold Threshold) bool {
|
||||
return threshold.GracePeriod == time.Duration(0)
|
||||
}
|
||||
|
||||
// buildResourceToRankFunc returns ranking functions associated with resources
|
||||
func buildResourceToRankFunc(withImageFs bool) map[api.ResourceName]rankFunc {
|
||||
resourceToRankFunc := map[api.ResourceName]rankFunc{
|
||||
|
124
pkg/kubelet/eviction/threshold_notifier.go
Normal file
124
pkg/kubelet/eviction/threshold_notifier.go
Normal file
@@ -0,0 +1,124 @@
|
||||
/*
|
||||
Copyright 2016 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package eviction
|
||||
|
||||
/*
|
||||
#include <sys/eventfd.h>
|
||||
*/
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"syscall"
|
||||
|
||||
"github.com/golang/glog"
|
||||
)
|
||||
|
||||
// ThresholdNotifier notifies the user when an attribute crosses a threshold value
|
||||
type ThresholdNotifier interface {
|
||||
Start(stopCh <-chan struct{})
|
||||
}
|
||||
|
||||
type memcgThresholdNotifier struct {
|
||||
watchfd int
|
||||
controlfd int
|
||||
eventfd int
|
||||
handler thresholdNotifierHandlerFunc
|
||||
description string
|
||||
}
|
||||
|
||||
var _ ThresholdNotifier = &memcgThresholdNotifier{}
|
||||
|
||||
// NewMemCGThresholdNotifier sends notifications when a cgroup threshold
|
||||
// is crossed (in either direction) for a given cgroup attribute
|
||||
func NewMemCGThresholdNotifier(path, attribute, threshold, description string, handler thresholdNotifierHandlerFunc) (ThresholdNotifier, error) {
|
||||
watchfd, err := syscall.Open(fmt.Sprintf("%s/%s", path, attribute), syscall.O_RDONLY, 0)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer func() {
|
||||
if err != nil {
|
||||
syscall.Close(watchfd)
|
||||
}
|
||||
}()
|
||||
controlfd, err := syscall.Open(fmt.Sprintf("%s/cgroup.event_control", path), syscall.O_WRONLY, 0)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer func() {
|
||||
if err != nil {
|
||||
syscall.Close(controlfd)
|
||||
}
|
||||
}()
|
||||
efd, err := C.eventfd(0, C.EFD_CLOEXEC)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
eventfd := int(efd)
|
||||
if eventfd < 0 {
|
||||
err = fmt.Errorf("eventfd call failed")
|
||||
return nil, err
|
||||
}
|
||||
defer func() {
|
||||
if err != nil {
|
||||
syscall.Close(eventfd)
|
||||
}
|
||||
}()
|
||||
glog.V(2).Infof("eviction: setting notification threshold to %s", threshold)
|
||||
config := fmt.Sprintf("%d %d %s", eventfd, watchfd, threshold)
|
||||
_, err = syscall.Write(controlfd, []byte(config))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &memcgThresholdNotifier{
|
||||
watchfd: watchfd,
|
||||
controlfd: controlfd,
|
||||
eventfd: eventfd,
|
||||
handler: handler,
|
||||
description: description,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func getThresholdEvents(eventfd int, eventCh chan<- int) {
|
||||
for {
|
||||
buf := make([]byte, 8)
|
||||
_, err := syscall.Read(eventfd, buf)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
eventCh <- 0
|
||||
}
|
||||
}
|
||||
|
||||
func (n *memcgThresholdNotifier) Start(stopCh <-chan struct{}) {
|
||||
eventCh := make(chan int, 1)
|
||||
go getThresholdEvents(n.eventfd, eventCh)
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
glog.V(2).Infof("eviction: stopping threshold notifier")
|
||||
syscall.Close(n.watchfd)
|
||||
syscall.Close(n.controlfd)
|
||||
syscall.Close(n.eventfd)
|
||||
close(eventCh)
|
||||
return
|
||||
case <-eventCh:
|
||||
glog.V(2).Infof("eviction: threshold crossed")
|
||||
n.handler(n.description)
|
||||
}
|
||||
}
|
||||
}
|
@@ -161,3 +161,6 @@ type nodeReclaimFunc func() (*resource.Quantity, error)
|
||||
|
||||
// nodeReclaimFuncs is an ordered list of nodeReclaimFunc
|
||||
type nodeReclaimFuncs []nodeReclaimFunc
|
||||
|
||||
// thresholdNotifierHandlerFunc is a function that takes action in response to a crossed threshold
|
||||
type thresholdNotifierHandlerFunc func(thresholdDescription string)
|
||||
|
Reference in New Issue
Block a user