kubelet: enable qos-level memory request reservation

Seth Jennings 2017-02-28 15:03:06 -06:00
parent 01bfbb5fa0
commit cc50aa9dfb
19 changed files with 353 additions and 17 deletions

View File

@ -269,4 +269,6 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) {
fs.StringVar(&s.SystemReservedCgroup, "system-reserved-cgroup", s.SystemReservedCgroup, "Absolute name of the top level cgroup that is used to manage non-kubernetes components for which compute resources were reserved via '--system-reserved' flag. Ex. '/system-reserved'. [default='']")
fs.StringVar(&s.KubeReservedCgroup, "kube-reserved-cgroup", s.KubeReservedCgroup, "Absolute name of the top level cgroup that is used to manage kubernetes components for which compute resources were reserved via '--kube-reserved' flag. Ex. '/kube-reserved'. [default='']")
fs.BoolVar(&s.ExperimentalNodeAllocatableIgnoreEvictionThreshold, "experimental-allocatable-ignore-eviction", s.ExperimentalNodeAllocatableIgnoreEvictionThreshold, "When set to 'true', Hard Eviction Thresholds will be ignored while calculating Node Allocatable. See https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md for more details. [default=false]")
fs.Var(&s.ExperimentalQOSReserved, "experimental-qos-reserved", "A set of ResourceName=Percentage (e.g. memory=50%) pairs that describe how pod resource requests are reserved at the QoS level. Currently only memory is supported. [default=none]")
}
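For reference, a minimal sketch of the equivalent programmatic path for the new flag; the standalone `main` and the literal `memory=50%` value are illustrative, not part of this commit:

```go
package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/apis/componentconfig"
)

func main() {
	// Equivalent of --experimental-qos-reserved=memory=50% on the
	// kubelet command line; the flag machinery calls Set on the map.
	m := componentconfig.ConfigurationMap{}
	if err := m.Set("memory=50%"); err != nil {
		fmt.Println("parse error:", err)
		return
	}
	fmt.Println(m) // map[memory:50%]
}
```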

View File

@ -525,6 +525,10 @@ func run(s *options.KubeletServer, kubeDeps *kubelet.KubeletDeps) (err error) {
return err
}
}
experimentalQOSReserved, err := cm.ParseQOSReserved(s.ExperimentalQOSReserved)
if err != nil {
return err
}
kubeDeps.ContainerManager, err = cm.NewContainerManager(
kubeDeps.Mounter,
kubeDeps.CAdvisorInterface,
@ -546,6 +550,7 @@ func run(s *options.KubeletServer, kubeDeps *kubelet.KubeletDeps) (err error) {
SystemReserved: systemReserved,
HardEvictionThresholds: hardEvictionThresholds,
},
ExperimentalQOSReserved: *experimentalQOSReserved,
},
s.ExperimentalFailSwapOn,
kubeDeps.Recorder)

View File

@ -236,6 +236,7 @@ experimental-keystone-url
experimental-mounter-path
experimental-nvidia-gpus
experimental-prefix
experimental-qos-reserved
external-etcd-cafile
external-etcd-certfile
external-etcd-endpoints

View File

@ -440,6 +440,10 @@ type KubeletConfiguration struct {
// manage attachment/detachment of volumes scheduled to this node, and
// disables kubelet from executing any attach/detach operations
EnableControllerAttachDetach bool
// A set of ResourceName=Percentage (e.g. memory=50%) pairs that describe
// how pod resource requests are reserved at the QoS level.
// Currently only memory is supported. [default=none]"
ExperimentalQOSReserved ConfigurationMap
// Default behaviour for kernel tuning
ProtectKernelDefaults bool
// If true, Kubelet ensures a set of iptables rules are present on host.

View File

@ -385,6 +385,9 @@ func SetDefaults_KubeletConfiguration(obj *KubeletConfiguration) {
if obj.KubeReserved == nil {
obj.KubeReserved = make(map[string]string)
}
if obj.ExperimentalQOSReserved == nil {
obj.ExperimentalQOSReserved = make(map[string]string)
}
if obj.MakeIPTablesUtilChains == nil {
obj.MakeIPTablesUtilChains = boolVar(true)
}

View File

@ -476,6 +476,10 @@ type KubeletConfiguration struct {
// manage attachment/detachment of volumes scheduled to this node, and
// disables kubelet from executing any attach/detach operations
EnableControllerAttachDetach *bool `json:"enableControllerAttachDetach"`
// A set of ResourceName=Percentage (e.g. memory=50%) pairs that describe
// how pod resource requests are reserved at the QoS level.
// Currently only memory is supported. [default=none]"
ExperimentalQOSReserved map[string]string `json:"experimentalQOSReserved"`
// Default behaviour for kernel tuning
ProtectKernelDefaults bool `json:"protectKernelDefaults"`
// If true, Kubelet ensures a set of iptables rules are present on host.

View File

@ -395,6 +395,7 @@ func autoConvert_v1alpha1_KubeletConfiguration_To_componentconfig_KubeletConfigu
if err := v1.Convert_Pointer_bool_To_bool(&in.EnableControllerAttachDetach, &out.EnableControllerAttachDetach, s); err != nil {
return err
}
out.ExperimentalQOSReserved = *(*componentconfig.ConfigurationMap)(unsafe.Pointer(&in.ExperimentalQOSReserved))
out.ProtectKernelDefaults = in.ProtectKernelDefaults
if err := v1.Convert_Pointer_bool_To_bool(&in.MakeIPTablesUtilChains, &out.MakeIPTablesUtilChains, s); err != nil {
return err
@ -572,6 +573,7 @@ func autoConvert_componentconfig_KubeletConfiguration_To_v1alpha1_KubeletConfigu
if err := v1.Convert_bool_To_Pointer_bool(&in.EnableControllerAttachDetach, &out.EnableControllerAttachDetach, s); err != nil {
return err
}
out.ExperimentalQOSReserved = *(*map[string]string)(unsafe.Pointer(&in.ExperimentalQOSReserved))
out.ProtectKernelDefaults = in.ProtectKernelDefaults
if err := v1.Convert_bool_To_Pointer_bool(&in.MakeIPTablesUtilChains, &out.MakeIPTablesUtilChains, s); err != nil {
return err

View File

@ -266,6 +266,13 @@ func DeepCopy_v1alpha1_KubeletConfiguration(in interface{}, out interface{}, c *
*out = new(bool)
**out = **in
}
if in.ExperimentalQOSReserved != nil {
in, out := &in.ExperimentalQOSReserved, &out.ExperimentalQOSReserved
*out = make(map[string]string)
for key, val := range *in {
(*out)[key] = val
}
}
if in.MakeIPTablesUtilChains != nil {
in, out := &in.MakeIPTablesUtilChains, &out.MakeIPTablesUtilChains
*out = new(bool)

View File

@ -177,6 +177,13 @@ func DeepCopy_componentconfig_KubeletConfiguration(in interface{}, out interface
(*out)[key] = val
}
}
if in.ExperimentalQOSReserved != nil {
in, out := &in.ExperimentalQOSReserved, &out.ExperimentalQOSReserved
*out = make(ConfigurationMap)
for key, val := range *in {
(*out)[key] = val
}
}
if in.AllowedUnsafeSysctls != nil {
in, out := &in.AllowedUnsafeSysctls, &out.AllowedUnsafeSysctls
*out = make([]string, len(*in))

View File

@ -13838,6 +13838,20 @@ func GetOpenAPIDefinitions(ref openapi.ReferenceCallback) map[string]openapi.Ope
Format: "",
},
},
"experimentalQOSReserved": {
SchemaProps: spec.SchemaProps{
Description: "A set of ResourceName=Percentage (e.g. memory=50%) pairs that describe how pod resource requests are reserved at the QoS level. Currently only memory is supported. [default=none]\"",
Type: []string{"object"},
AdditionalProperties: &spec.SchemaOrBool{
Schema: &spec.Schema{
SchemaProps: spec.SchemaProps{
Type: []string{"string"},
Format: "",
},
},
},
},
},
"protectKernelDefaults": {
SchemaProps: spec.SchemaProps{
Description: "Default behaviour for kernel tuning",
@ -13979,7 +13993,7 @@ func GetOpenAPIDefinitions(ref openapi.ReferenceCallback) map[string]openapi.Ope
},
},
},
Required: []string{"podManifestPath", "syncFrequency", "fileCheckFrequency", "httpCheckFrequency", "manifestURL", "manifestURLHeader", "enableServer", "address", "port", "readOnlyPort", "tlsCertFile", "tlsPrivateKeyFile", "certDirectory", "authentication", "authorization", "hostnameOverride", "podInfraContainerImage", "dockerEndpoint", "rootDirectory", "seccompProfileRoot", "allowPrivileged", "hostNetworkSources", "hostPIDSources", "hostIPCSources", "registryPullQPS", "registryBurst", "eventRecordQPS", "eventBurst", "enableDebuggingHandlers", "minimumGCAge", "maxPerPodContainerCount", "maxContainerCount", "cAdvisorPort", "healthzPort", "healthzBindAddress", "oomScoreAdj", "registerNode", "clusterDomain", "masterServiceNamespace", "clusterDNS", "streamingConnectionIdleTimeout", "nodeStatusUpdateFrequency", "imageMinimumGCAge", "imageGCHighThresholdPercent", "imageGCLowThresholdPercent", "lowDiskSpaceThresholdMB", "volumeStatsAggPeriod", "networkPluginName", "networkPluginDir", "cniConfDir", "cniBinDir", "networkPluginMTU", "volumePluginDir", "cloudProvider", "cloudConfigFile", "kubeletCgroups", "runtimeCgroups", "systemCgroups", "cgroupRoot", "containerRuntime", "remoteRuntimeEndpoint", "remoteImageEndpoint", "runtimeRequestTimeout", "rktPath", "rktAPIEndpoint", "rktStage1Image", "lockFilePath", "exitOnLockContention", "hairpinMode", "babysitDaemons", "maxPods", "dockerExecHandlerName", "podCIDR", "resolvConf", "cpuCFSQuota", "containerized", "maxOpenFiles", "registerSchedulable", "registerWithTaints", "contentType", "kubeAPIQPS", "kubeAPIBurst", "serializeImagePulls", "outOfDiskTransitionFrequency", "nodeIP", "nodeLabels", "nonMasqueradeCIDR", "enableCustomMetrics", "evictionHard", "evictionSoft", "evictionSoftGracePeriod", "evictionPressureTransitionPeriod", "evictionMaxPodGracePeriod", "evictionMinimumReclaim", "experimentalKernelMemcgNotification", "podsPerCore", "enableControllerAttachDetach", "protectKernelDefaults", "makeIPTablesUtilChains", "iptablesMasqueradeBit", "iptablesDropBit", "systemReserved", "kubeReserved"},
Required: []string{"podManifestPath", "syncFrequency", "fileCheckFrequency", "httpCheckFrequency", "manifestURL", "manifestURLHeader", "enableServer", "address", "port", "readOnlyPort", "tlsCertFile", "tlsPrivateKeyFile", "certDirectory", "authentication", "authorization", "hostnameOverride", "podInfraContainerImage", "dockerEndpoint", "rootDirectory", "seccompProfileRoot", "allowPrivileged", "hostNetworkSources", "hostPIDSources", "hostIPCSources", "registryPullQPS", "registryBurst", "eventRecordQPS", "eventBurst", "enableDebuggingHandlers", "minimumGCAge", "maxPerPodContainerCount", "maxContainerCount", "cAdvisorPort", "healthzPort", "healthzBindAddress", "oomScoreAdj", "registerNode", "clusterDomain", "masterServiceNamespace", "clusterDNS", "streamingConnectionIdleTimeout", "nodeStatusUpdateFrequency", "imageMinimumGCAge", "imageGCHighThresholdPercent", "imageGCLowThresholdPercent", "lowDiskSpaceThresholdMB", "volumeStatsAggPeriod", "networkPluginName", "networkPluginDir", "cniConfDir", "cniBinDir", "networkPluginMTU", "volumePluginDir", "cloudProvider", "cloudConfigFile", "kubeletCgroups", "runtimeCgroups", "systemCgroups", "cgroupRoot", "containerRuntime", "remoteRuntimeEndpoint", "remoteImageEndpoint", "runtimeRequestTimeout", "rktPath", "rktAPIEndpoint", "rktStage1Image", "lockFilePath", "exitOnLockContention", "hairpinMode", "babysitDaemons", "maxPods", "dockerExecHandlerName", "podCIDR", "resolvConf", "cpuCFSQuota", "containerized", "maxOpenFiles", "registerSchedulable", "registerWithTaints", "contentType", "kubeAPIQPS", "kubeAPIBurst", "serializeImagePulls", "outOfDiskTransitionFrequency", "nodeIP", "nodeLabels", "nonMasqueradeCIDR", "enableCustomMetrics", "evictionHard", "evictionSoft", "evictionSoftGracePeriod", "evictionPressureTransitionPeriod", "evictionMaxPodGracePeriod", "evictionMinimumReclaim", "experimentalKernelMemcgNotification", "podsPerCore", "enableControllerAttachDetach", "experimentalQOSReserved", "protectKernelDefaults", "makeIPTablesUtilChains", "iptablesMasqueradeBit", "iptablesDropBit", "systemReserved", "kubeReserved"},
},
},
Dependencies: []string{

View File

@ -25,6 +25,7 @@ go_library(
tags = ["automanaged"],
deps = [
"//pkg/api/v1:go_default_library",
"//pkg/apis/componentconfig:go_default_library",
"//pkg/kubelet/cadvisor:go_default_library",
"//pkg/kubelet/cm/util:go_default_library",
"//pkg/kubelet/events:go_default_library",
@ -55,6 +56,7 @@ go_test(
name = "go_default_test",
srcs = [
"cgroup_manager_linux_test.go",
"cgroup_manager_test.go",
"container_manager_linux_test.go",
"helpers_linux_test.go",
"node_container_manager_test.go",
@ -63,6 +65,7 @@ go_test(
tags = ["automanaged"],
deps = [
"//pkg/api/v1:go_default_library",
"//pkg/apis/componentconfig:go_default_library",
"//pkg/kubelet/eviction/api:go_default_library",
"//pkg/util/mount:go_default_library",
"//vendor:github.com/stretchr/testify/assert",

View File

@ -276,6 +276,8 @@ type subsystem interface {
Name() string
// Set the cgroup represented by cgroup.
Set(path string, cgroup *libcontainerconfigs.Cgroup) error
// GetStats returns the statistics associated with the cgroup
GetStats(path string, stats *libcontainercgroups.Stats) error
}
// Cgroup subsystems we currently support
@ -465,3 +467,34 @@ func (m *cgroupManagerImpl) ReduceCPULimits(cgroupName CgroupName) error {
}
return m.Update(containerConfig)
}
func getStatsSupportedSubsystems(cgroupPaths map[string]string) (*libcontainercgroups.Stats, error) {
stats := libcontainercgroups.NewStats()
for _, sys := range supportedSubsystems {
if _, ok := cgroupPaths[sys.Name()]; !ok {
return nil, fmt.Errorf("failed to find subsystem mount for subsystem: %v", sys.Name())
}
if err := sys.GetStats(cgroupPaths[sys.Name()], stats); err != nil {
return nil, fmt.Errorf("failed to get stats for supported subsystems: %v", err)
}
}
return stats, nil
}
func toResourceStats(stats *libcontainercgroups.Stats) *ResourceStats {
return &ResourceStats{
MemoryStats: &MemoryStats{
Usage: int64(stats.MemoryStats.Usage.Usage),
},
}
}
// GetResourceStats returns statistics of the specified cgroup as read from the cgroup fs
func (m *cgroupManagerImpl) GetResourceStats(name CgroupName) (*ResourceStats, error) {
cgroupPaths := m.buildCgroupPaths(name)
stats, err := getStatsSupportedSubsystems(cgroupPaths)
if err != nil {
return nil, fmt.Errorf("failed to get stats for supported cgroup subsystems for cgroup %v: %v", name, err)
}
return toResourceStats(stats), nil
}
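A usage sketch for the new `GetResourceStats` method, assuming package-internal access; the Burstable cgroup path is an assumed example:

```go
// Illustrative in-package sketch: read the current memory usage of the
// Burstable top-level QoS cgroup through the new CgroupManager method.
func burstableMemoryUsage(m CgroupManager) (int64, error) {
	stats, err := m.GetResourceStats(CgroupName("/kubepods/burstable"))
	if err != nil {
		return 0, err
	}
	return stats.MemoryStats.Usage, nil
}
```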

View File

@ -0,0 +1,83 @@
// +build linux
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"reflect"
"testing"
"k8s.io/kubernetes/pkg/api/v1"
"k8s.io/kubernetes/pkg/apis/componentconfig"
)
func TestParseQOSReserved(t *testing.T) {
tests := []struct {
input string
expected *map[v1.ResourceName]int64
}{
{
input: "memory",
expected: nil,
},
{
input: "memory=a",
expected: nil,
},
{
input: "memory=a%",
expected: nil,
},
{
input: "memory=200%",
expected: nil,
},
{
input: "memory=0%",
expected: &map[v1.ResourceName]int64{
v1.ResourceMemory: 0,
},
},
{
input: "memory=100%",
expected: &map[v1.ResourceName]int64{
v1.ResourceMemory: 100,
},
},
{
// need to change this when CPU is added as a supported resource
input: "memory=100%,cpu=50%",
expected: nil,
},
}
for _, test := range tests {
m := componentconfig.ConfigurationMap{}
m.Set(test.input)
actual, err := ParseQOSReserved(m)
if actual != nil && test.expected == nil {
t.Errorf("Unexpected success, input: %v, expected: %v, actual: %v, err: %v", test.input, test.expected, actual, err)
continue
}
if actual == nil && test.expected != nil {
t.Errorf("Unexpected failure, input: %v, expected: %v, actual: %v, err: %v", test.input, test.expected, actual, err)
continue
}
if (actual == nil && test.expected == nil) || reflect.DeepEqual(*actual, *test.expected) {
continue
}
t.Errorf("Unexpected result, input: %v, expected: %v, actual: %v, err: %v", test.input, test.expected, actual, err)
}
}

View File

@ -54,6 +54,10 @@ func (m *unsupportedCgroupManager) Create(_ *CgroupConfig) error {
return fmt.Errorf("Cgroup Manager is not supported in this build")
}
func (m *unsupportedCgroupManager) GetResourceStats(name CgroupName) (*ResourceStats, error) {
return nil, fmt.Errorf("Cgroup Manager is not supported in this build")
}
func (m *unsupportedCgroupManager) Pids(_ CgroupName) []int {
return nil
}

View File

@ -20,7 +20,12 @@ import (
"k8s.io/apimachinery/pkg/util/sets"
// TODO: Migrate kubelet to either use its own internal objects or client library.
"k8s.io/kubernetes/pkg/api/v1"
"k8s.io/kubernetes/pkg/apis/componentconfig"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
"fmt"
"strconv"
"strings"
)
type ActivePodsFunc func() []*v1.Pod
@ -71,6 +76,7 @@ type NodeConfig struct {
ProtectKernelDefaults bool
EnableCRI bool
NodeAllocatableConfig
ExperimentalQOSReserved map[v1.ResourceName]int64
}
type NodeAllocatableConfig struct {
@ -93,3 +99,41 @@ const (
SystemReservedEnforcementKey = "system-reserved"
KubeReservedEnforcementKey = "kube-reserved"
)
// containerManager for the kubelet is currently an injected dependency.
// We need to parse the --experimental-qos-reserved option in
// cmd/kubelet/app/server.go and there isn't really a good place to put
// the code. If/When the kubelet dependency injection gets worked out,
// maybe there will be a better place for it.
func parsePercentage(v string) (int64, error) {
if !strings.HasSuffix(v, "%") {
return 0, fmt.Errorf("percentage expected, got '%s'", v)
}
percentage, err := strconv.ParseInt(strings.TrimSuffix(v, "%"), 10, 0)
if err != nil {
return 0, fmt.Errorf("invalid number in percentage '%s'", v)
}
if percentage < 0 || percentage > 100 {
return 0, fmt.Errorf("percentage must be between 0 and 100")
}
return percentage, nil
}
// ParseQOSReserved parses the --experimental-qos-reserved option
func ParseQOSReserved(m componentconfig.ConfigurationMap) (*map[v1.ResourceName]int64, error) {
reservations := make(map[v1.ResourceName]int64)
for k, v := range m {
switch v1.ResourceName(k) {
// Only memory resources are supported.
case v1.ResourceMemory:
q, err := parsePercentage(v)
if err != nil {
return nil, err
}
reservations[v1.ResourceName(k)] = q
default:
return nil, fmt.Errorf("cannot reserve %q resource", k)
}
}
return &reservations, nil
}
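A short in-package sketch of the parse step, mirroring the call added in cmd/kubelet/app/server.go above; the map literal and helper name are illustrative, and the sketch relies on the file's existing imports:

```go
// Illustrative only: "memory=50%" parses to a 50 percent reservation.
func exampleParseQOSReserved() (map[v1.ResourceName]int64, error) {
	m := componentconfig.ConfigurationMap{"memory": "50%"}
	reserved, err := ParseQOSReserved(m)
	if err != nil {
		return nil, err
	}
	// (*reserved)[v1.ResourceMemory] == 50
	return *reserved, nil
}
```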

View File

@ -274,7 +274,6 @@ func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
if cm.NodeConfig.CgroupsPerQOS {
return &podContainerManagerImpl{
qosContainersInfo: cm.GetQOSContainersInfo(),
nodeInfo: cm.nodeInfo,
subsystems: cm.subsystems,
cgroupManager: cm.cgroupManager,
}
@ -366,7 +365,7 @@ func (cm *containerManagerImpl) setupNode(activePods ActivePodsFunc) error {
if err := cm.createNodeAllocatableCgroups(); err != nil {
return err
}
err = cm.qosContainerManager.Start(cm.nodeInfo, activePods)
err = cm.qosContainerManager.Start(cm.getNodeAllocatableAbsolute, activePods)
if err != nil {
return fmt.Errorf("failed to initialise top level QOS containers: %v", err)
}

View File

@ -38,8 +38,6 @@ const (
// It is the general implementation which allows pod level container
// management if qos Cgroup is enabled.
type podContainerManagerImpl struct {
// nodeInfo stores information about the node resource capacity
nodeInfo *v1.Node
// qosContainersInfo hold absolute paths of the top level qos containers
qosContainersInfo QOSContainersInfo
// Stores the mounted cgroup subsystems

View File

@ -25,7 +25,6 @@ import (
"github.com/golang/glog"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/kubernetes/pkg/api/v1"
"k8s.io/kubernetes/pkg/kubelet/qos"
)
@ -37,19 +36,21 @@ const (
)
type QOSContainerManager interface {
Start(*v1.Node, ActivePodsFunc) error
Start(func() v1.ResourceList, ActivePodsFunc) error
GetQOSContainersInfo() QOSContainersInfo
UpdateCgroups() error
}
type qosContainerManagerImpl struct {
sync.Mutex
nodeInfo *v1.Node
qosContainersInfo QOSContainersInfo
subsystems *CgroupSubsystems
cgroupManager CgroupManager
activePods ActivePodsFunc
cgroupRoot string
nodeInfo *v1.Node
qosContainersInfo QOSContainersInfo
subsystems *CgroupSubsystems
cgroupManager CgroupManager
activePods ActivePodsFunc
getNodeAllocatable func() v1.ResourceList
cgroupRoot string
qosReserved map[v1.ResourceName]int64
}
func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot string, nodeConfig NodeConfig) (QOSContainerManager, error) {
@ -63,6 +64,7 @@ func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot string, nod
subsystems: subsystems,
cgroupManager: NewCgroupManager(subsystems, nodeConfig.CgroupDriver),
cgroupRoot: cgroupRoot,
qosReserved: nodeConfig.ExperimentalQOSReserved,
}, nil
}
@ -70,7 +72,7 @@ func (m *qosContainerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
return m.qosContainersInfo
}
func (m *qosContainerManagerImpl) Start(nodeInfo *v1.Node, activePods ActivePodsFunc) error {
func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceList, activePods ActivePodsFunc) error {
cm := m.cgroupManager
rootContainer := m.cgroupRoot
if !cm.Exists(CgroupName(rootContainer)) {
@ -115,7 +117,7 @@ func (m *qosContainerManagerImpl) Start(nodeInfo *v1.Node, activePods ActivePods
Burstable: path.Join(rootContainer, string(v1.PodQOSBurstable)),
BestEffort: path.Join(rootContainer, string(v1.PodQOSBestEffort)),
}
m.nodeInfo = nodeInfo
m.getNodeAllocatable = getNodeAllocatable
m.activePods = activePods
// update qos cgroup tiers on startup and in periodic intervals
@ -162,6 +164,85 @@ func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]
return nil
}
// setMemoryReserve sums the memory requests of all pods in a QoS class,
// calculates the QoS class memory limits, and sets those limits in the
// CgroupConfig for each QoS class.
func (m *qosContainerManagerImpl) setMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
qosMemoryRequests := map[v1.PodQOSClass]int64{
v1.PodQOSGuaranteed: 0,
v1.PodQOSBurstable: 0,
}
// Sum the memory requests of pods in each QoS class
pods := m.activePods()
for _, pod := range pods {
podMemoryRequest := int64(0)
qosClass := qos.GetPodQOS(pod)
if qosClass == v1.PodQOSBestEffort {
// memory requests are not set for Best Effort pods
continue
}
req, _, err := v1.PodRequestsAndLimits(pod)
if err != nil {
glog.V(2).Infof("[Container Manager] Pod resource requests/limits could not be determined. Not setting QOS memory limts.")
return
}
if request, found := req[v1.ResourceMemory]; found {
podMemoryRequest += request.Value()
}
qosMemoryRequests[qosClass] += podMemoryRequest
}
resources := m.getNodeAllocatable()
allocatableResource, ok := resources[v1.ResourceMemory]
if !ok {
glog.V(2).Infof("[Container Manager] Allocatable memory value could not be determined. Not setting QOS memory limts.")
return
}
allocatable := allocatableResource.Value()
if allocatable == 0 {
glog.V(2).Infof("[Container Manager] Memory allocatable reported as 0, might be in standalone mode. Not setting QOS memory limts.")
return
}
for qos, requests := range qosMemoryRequests {
glog.V(2).Infof("[Container Manager] %s pod requests total %d bytes (reserve %d%%)", qos, requests, percentReserve)
}
// Calculate QOS memory limits
burstableLimit := allocatable - (qosMemoryRequests[v1.PodQOSGuaranteed] * percentReserve / 100)
bestEffortLimit := burstableLimit - (qosMemoryRequests[v1.PodQOSBurstable] * percentReserve / 100)
configs[v1.PodQOSBurstable].ResourceParameters.Memory = &burstableLimit
configs[v1.PodQOSBestEffort].ResourceParameters.Memory = &bestEffortLimit
}
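// For example (assumed numbers, for illustration only): with 10 GiB
// allocatable, 2 GiB of Guaranteed requests, 1 GiB of Burstable requests,
// and memory=50%:
//   burstableLimit  = 10 GiB - (2 GiB * 50 / 100) = 9 GiB
//   bestEffortLimit =  9 GiB - (1 GiB * 50 / 100) = 8.5 GiB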
// retrySetMemoryReserve checks for any QoS cgroup whose memory usage is
// above the limit attempted in the first Update() and adjusts its memory
// limit to the current usage to prevent further growth.
func (m *qosContainerManagerImpl) retrySetMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
// Unreclaimable memory usage may have already exceeded the desired limit.
// Attempt to set the limit near the current usage to put pressure
// on the cgroup and prevent further growth.
for qos, config := range configs {
stats, err := m.cgroupManager.GetResourceStats(config.Name)
if err != nil {
glog.V(2).Infof("[Container Manager] %v", err)
return
}
usage := stats.MemoryStats.Usage
// Because there is no good way to determine if the original Update()
// on the memory resource was successful, we detect failure of the
// first attempt by checking whether the usage is above the limit we
// attempted to set. If it is, we assume the first attempt failed
// and try again setting the limit to the usage. Otherwise we leave
// the CgroupConfig as is.
if configs[qos].ResourceParameters.Memory != nil && usage > *configs[qos].ResourceParameters.Memory {
configs[qos].ResourceParameters.Memory = &usage
}
}
}
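// For example (assumed numbers): if the first Update() tried to lower the
// BestEffort limit to 8.5 GiB while the cgroup already reports 9 GiB of
// unreclaimable usage, the kernel may refuse the lower limit; the retry
// rewrites the config to the observed 9 GiB so the cgroup cannot grow further.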
func (m *qosContainerManagerImpl) UpdateCgroups() error {
m.Lock()
defer m.Unlock()
@ -182,6 +263,34 @@ func (m *qosContainerManagerImpl) UpdateCgroups() error {
return err
}
for resource, percentReserve := range m.qosReserved {
switch resource {
case v1.ResourceMemory:
m.setMemoryReserve(qosConfigs, percentReserve)
}
}
updateSuccess := true
for _, config := range qosConfigs {
err := m.cgroupManager.Update(config)
if err != nil {
updateSuccess = false
}
}
if updateSuccess {
glog.V(2).Infof("[ContainerManager]: Updated QoS cgroup configuration")
return nil
}
// If the resource can adjust the ResourceConfig to increase the
// likelihood of success, call the adjustment function here. Otherwise,
// Update() will be called again with the same values.
for resource, percentReserve := range m.qosReserved {
switch resource {
case v1.ResourceMemory:
m.retrySetMemoryReserve(qosConfigs, percentReserve)
}
}
for _, config := range qosConfigs {
err := m.cgroupManager.Update(config)
if err != nil {
@ -189,8 +298,8 @@ func (m *qosContainerManagerImpl) UpdateCgroups() error {
return err
}
}
glog.V(2).Infof("[ContainerManager]: Updated QoS cgroup configuration")
glog.V(2).Infof("[ContainerManager]: Updated QoS cgroup configuration on retry")
return nil
}
@ -204,7 +313,7 @@ func (m *qosContainerManagerNoop) GetQOSContainersInfo() QOSContainersInfo {
return QOSContainersInfo{}
}
func (m *qosContainerManagerNoop) Start(_ *v1.Node, _ ActivePodsFunc) error {
func (m *qosContainerManagerNoop) Start(_ func() v1.ResourceList, _ ActivePodsFunc) error {
return nil
}

View File

@ -47,6 +47,18 @@ type CgroupConfig struct {
ResourceParameters *ResourceConfig
}
// MemoryStats holds the on-demand statistics from the memory cgroup
type MemoryStats struct {
// Memory usage (in bytes).
Usage int64
}
// ResourceStats holds on-demand statistics from various cgroup subsystems
type ResourceStats struct {
// Memory statistics.
MemoryStats *MemoryStats
}
// CgroupManager allows for cgroup management.
// Supports cgroup creation, deletion and updates.
type CgroupManager interface {
@ -72,6 +84,8 @@ type CgroupManager interface {
Pids(name CgroupName) []int
// ReduceCPULimits reduces the CPU CFS values to the minimum amount of shares.
ReduceCPULimits(cgroupName CgroupName) error
// GetResourceStats returns statistics of the specified cgroup as read from the cgroup fs.
GetResourceStats(name CgroupName) (*ResourceStats, error)
}
// QOSContainersInfo stores the names of containers per qos