Mirror of https://github.com/k3s-io/kubernetes.git (synced 2025-07-22 19:31:44 +00:00)
Merge pull request #41149 from sjenning/qos-memory-limits
Automatic merge from submit-queue (batch tested with PRs 41919, 41149, 42350, 42351, 42285)

kubelet: enable qos-level memory limits

```release-note
Experimental support to reserve a pod's memory request from being utilized by pods in lower QoS tiers.
```

Enables the QoS-level memory cgroup limits described in https://github.com/kubernetes/community/pull/314

**Note: QoS level cgroups have to be enabled for any of this to take effect.**

Adds a new `--experimental-qos-reserved` flag that can be used to set the percentage of a resource to be reserved at the QoS level for pod resource requests.

For example, `--experimental-qos-reserved="memory=50%"` means that if a Guaranteed pod sets a memory request of 2Gi, the Burstable and BestEffort QoS memory cgroups will have their `memory.limit_in_bytes` set to `NodeAllocatable - (2Gi*50%)` to reserve 50% of the Guaranteed pod's request from being used by the lower QoS tiers. If a Burstable pod sets a request, its reserve is deducted from the BestEffort memory limit. The result is that:
- the Guaranteed limit matches the root cgroup and is not set by this code
- the Burstable limit is `NodeAllocatable - Guaranteed reserve`
- the BestEffort limit is `NodeAllocatable - Guaranteed reserve - Burstable reserve`

The only resource currently supported is `memory`; however, the code is generic enough that other resources can be added in the future.

@derekwaynecarr @vishh
Commit 9cc5480918
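To make the arithmetic concrete, here is a minimal, hypothetical sketch of the reserve computation described above (the node size and pod requests are assumed, not taken from the commit; the real logic is in `setMemoryReserve` in the diff below):

```go
package main

import "fmt"

func main() {
	// Assumed figures: 8Gi node allocatable, Guaranteed pods requesting 2Gi,
	// Burstable pods requesting 1Gi, and --experimental-qos-reserved="memory=50%".
	allocatable := int64(8 << 30)
	guaranteedRequests := int64(2 << 30)
	burstableRequests := int64(1 << 30)
	percentReserve := int64(50)

	// Burstable may not use memory reserved for Guaranteed requests;
	// BestEffort additionally may not use memory reserved for Burstable requests.
	burstableLimit := allocatable - guaranteedRequests*percentReserve/100
	bestEffortLimit := burstableLimit - burstableRequests*percentReserve/100

	fmt.Printf("Burstable memory.limit_in_bytes:  %d (7Gi)\n", burstableLimit)
	fmt.Printf("BestEffort memory.limit_in_bytes: %d (6.5Gi)\n", bestEffortLimit)
}
```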
@@ -269,4 +269,6 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) {
 	fs.StringVar(&s.SystemReservedCgroup, "system-reserved-cgroup", s.SystemReservedCgroup, "Absolute name of the top level cgroup that is used to manage non-kubernetes components for which compute resources were reserved via '--system-reserved' flag. Ex. '/system-reserved'. [default='']")
 	fs.StringVar(&s.KubeReservedCgroup, "kube-reserved-cgroup", s.KubeReservedCgroup, "Absolute name of the top level cgroup that is used to manage kubernetes components for which compute resources were reserved via '--kube-reserved' flag. Ex. '/kube-reserved'. [default='']")
 	fs.BoolVar(&s.ExperimentalNodeAllocatableIgnoreEvictionThreshold, "experimental-allocatable-ignore-eviction", s.ExperimentalNodeAllocatableIgnoreEvictionThreshold, "When set to 'true', Hard Eviction Thresholds will be ignored while calculating Node Allocatable. See https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md for more details. [default=false]")
+
+	fs.Var(&s.ExperimentalQOSReserved, "experimental-qos-reserved", "A set of ResourceName=Percentage (e.g. memory=50%) pairs that describe how pod resource requests are reserved at the QoS level. Currently only memory is supported. [default=none]")
 }
@@ -525,6 +525,10 @@ func run(s *options.KubeletServer, kubeDeps *kubelet.KubeletDeps) (err error) {
 			return err
 		}
 	}
+	experimentalQOSReserved, err := cm.ParseQOSReserved(s.ExperimentalQOSReserved)
+	if err != nil {
+		return err
+	}
 	kubeDeps.ContainerManager, err = cm.NewContainerManager(
 		kubeDeps.Mounter,
 		kubeDeps.CAdvisorInterface,
@@ -546,6 +550,7 @@ func run(s *options.KubeletServer, kubeDeps *kubelet.KubeletDeps) (err error) {
 				SystemReserved:         systemReserved,
 				HardEvictionThresholds: hardEvictionThresholds,
 			},
+			ExperimentalQOSReserved: *experimentalQOSReserved,
 		},
 		s.ExperimentalFailSwapOn,
 		kubeDeps.Recorder)
@@ -244,6 +244,7 @@ experimental-keystone-url
 experimental-mounter-path
 experimental-nvidia-gpus
 experimental-prefix
+experimental-qos-reserved
 external-etcd-cafile
 external-etcd-certfile
 external-etcd-endpoints
@@ -440,6 +440,10 @@ type KubeletConfiguration struct {
 	// manage attachment/detachment of volumes scheduled to this node, and
 	// disables kubelet from executing any attach/detach operations
 	EnableControllerAttachDetach bool
+	// A set of ResourceName=Percentage (e.g. memory=50%) pairs that describe
+	// how pod resource requests are reserved at the QoS level.
+	// Currently only memory is supported. [default=none]
+	ExperimentalQOSReserved ConfigurationMap
 	// Default behaviour for kernel tuning
 	ProtectKernelDefaults bool
 	// If true, Kubelet ensures a set of iptables rules are present on host.
@@ -385,6 +385,9 @@ func SetDefaults_KubeletConfiguration(obj *KubeletConfiguration) {
 	if obj.KubeReserved == nil {
 		obj.KubeReserved = make(map[string]string)
 	}
+	if obj.ExperimentalQOSReserved == nil {
+		obj.ExperimentalQOSReserved = make(map[string]string)
+	}
 	if obj.MakeIPTablesUtilChains == nil {
 		obj.MakeIPTablesUtilChains = boolVar(true)
 	}
@@ -476,6 +476,10 @@ type KubeletConfiguration struct {
 	// manage attachment/detachment of volumes scheduled to this node, and
 	// disables kubelet from executing any attach/detach operations
 	EnableControllerAttachDetach *bool `json:"enableControllerAttachDetach"`
+	// A set of ResourceName=Percentage (e.g. memory=50%) pairs that describe
+	// how pod resource requests are reserved at the QoS level.
+	// Currently only memory is supported. [default=none]
+	ExperimentalQOSReserved map[string]string `json:"experimentalQOSReserved"`
 	// Default behaviour for kernel tuning
 	ProtectKernelDefaults bool `json:"protectKernelDefaults"`
 	// If true, Kubelet ensures a set of iptables rules are present on host.
@@ -395,6 +395,7 @@ func autoConvert_v1alpha1_KubeletConfiguration_To_componentconfig_KubeletConfigu
 	if err := v1.Convert_Pointer_bool_To_bool(&in.EnableControllerAttachDetach, &out.EnableControllerAttachDetach, s); err != nil {
 		return err
 	}
+	out.ExperimentalQOSReserved = *(*componentconfig.ConfigurationMap)(unsafe.Pointer(&in.ExperimentalQOSReserved))
 	out.ProtectKernelDefaults = in.ProtectKernelDefaults
 	if err := v1.Convert_Pointer_bool_To_bool(&in.MakeIPTablesUtilChains, &out.MakeIPTablesUtilChains, s); err != nil {
 		return err
@@ -572,6 +573,7 @@ func autoConvert_componentconfig_KubeletConfiguration_To_v1alpha1_KubeletConfigu
 	if err := v1.Convert_bool_To_Pointer_bool(&in.EnableControllerAttachDetach, &out.EnableControllerAttachDetach, s); err != nil {
 		return err
 	}
+	out.ExperimentalQOSReserved = *(*map[string]string)(unsafe.Pointer(&in.ExperimentalQOSReserved))
 	out.ProtectKernelDefaults = in.ProtectKernelDefaults
 	if err := v1.Convert_bool_To_Pointer_bool(&in.MakeIPTablesUtilChains, &out.MakeIPTablesUtilChains, s); err != nil {
 		return err
@@ -266,6 +266,13 @@ func DeepCopy_v1alpha1_KubeletConfiguration(in interface{}, out interface{}, c *
 		*out = new(bool)
 		**out = **in
 	}
+	if in.ExperimentalQOSReserved != nil {
+		in, out := &in.ExperimentalQOSReserved, &out.ExperimentalQOSReserved
+		*out = make(map[string]string)
+		for key, val := range *in {
+			(*out)[key] = val
+		}
+	}
 	if in.MakeIPTablesUtilChains != nil {
 		in, out := &in.MakeIPTablesUtilChains, &out.MakeIPTablesUtilChains
 		*out = new(bool)
@@ -177,6 +177,13 @@ func DeepCopy_componentconfig_KubeletConfiguration(in interface{}, out interface
 			(*out)[key] = val
 		}
 	}
+	if in.ExperimentalQOSReserved != nil {
+		in, out := &in.ExperimentalQOSReserved, &out.ExperimentalQOSReserved
+		*out = make(ConfigurationMap)
+		for key, val := range *in {
+			(*out)[key] = val
+		}
+	}
 	if in.AllowedUnsafeSysctls != nil {
 		in, out := &in.AllowedUnsafeSysctls, &out.AllowedUnsafeSysctls
 		*out = make([]string, len(*in))
@@ -13838,6 +13838,20 @@ func GetOpenAPIDefinitions(ref openapi.ReferenceCallback) map[string]openapi.Ope
 							Format: "",
 						},
 					},
+					"experimentalQOSReserved": {
+						SchemaProps: spec.SchemaProps{
+							Description: "A set of ResourceName=Percentage (e.g. memory=50%) pairs that describe how pod resource requests are reserved at the QoS level. Currently only memory is supported. [default=none]",
+							Type:        []string{"object"},
+							AdditionalProperties: &spec.SchemaOrBool{
+								Schema: &spec.Schema{
+									SchemaProps: spec.SchemaProps{
+										Type:   []string{"string"},
+										Format: "",
+									},
+								},
+							},
+						},
+					},
 					"protectKernelDefaults": {
 						SchemaProps: spec.SchemaProps{
 							Description: "Default behaviour for kernel tuning",
@@ -13979,7 +13993,7 @@ func GetOpenAPIDefinitions(ref openapi.ReferenceCallback) map[string]openapi.Ope
 						},
 					},
 				},
-				Required: []string{"podManifestPath", "syncFrequency", "fileCheckFrequency", "httpCheckFrequency", "manifestURL", "manifestURLHeader", "enableServer", "address", "port", "readOnlyPort", "tlsCertFile", "tlsPrivateKeyFile", "certDirectory", "authentication", "authorization", "hostnameOverride", "podInfraContainerImage", "dockerEndpoint", "rootDirectory", "seccompProfileRoot", "allowPrivileged", "hostNetworkSources", "hostPIDSources", "hostIPCSources", "registryPullQPS", "registryBurst", "eventRecordQPS", "eventBurst", "enableDebuggingHandlers", "minimumGCAge", "maxPerPodContainerCount", "maxContainerCount", "cAdvisorPort", "healthzPort", "healthzBindAddress", "oomScoreAdj", "registerNode", "clusterDomain", "masterServiceNamespace", "clusterDNS", "streamingConnectionIdleTimeout", "nodeStatusUpdateFrequency", "imageMinimumGCAge", "imageGCHighThresholdPercent", "imageGCLowThresholdPercent", "lowDiskSpaceThresholdMB", "volumeStatsAggPeriod", "networkPluginName", "networkPluginDir", "cniConfDir", "cniBinDir", "networkPluginMTU", "volumePluginDir", "cloudProvider", "cloudConfigFile", "kubeletCgroups", "runtimeCgroups", "systemCgroups", "cgroupRoot", "containerRuntime", "remoteRuntimeEndpoint", "remoteImageEndpoint", "runtimeRequestTimeout", "rktPath", "rktAPIEndpoint", "rktStage1Image", "lockFilePath", "exitOnLockContention", "hairpinMode", "babysitDaemons", "maxPods", "dockerExecHandlerName", "podCIDR", "resolvConf", "cpuCFSQuota", "containerized", "maxOpenFiles", "registerSchedulable", "registerWithTaints", "contentType", "kubeAPIQPS", "kubeAPIBurst", "serializeImagePulls", "outOfDiskTransitionFrequency", "nodeIP", "nodeLabels", "nonMasqueradeCIDR", "enableCustomMetrics", "evictionHard", "evictionSoft", "evictionSoftGracePeriod", "evictionPressureTransitionPeriod", "evictionMaxPodGracePeriod", "evictionMinimumReclaim", "experimentalKernelMemcgNotification", "podsPerCore", "enableControllerAttachDetach", "protectKernelDefaults", "makeIPTablesUtilChains", "iptablesMasqueradeBit", "iptablesDropBit", "systemReserved", "kubeReserved"},
+				Required: []string{"podManifestPath", "syncFrequency", "fileCheckFrequency", "httpCheckFrequency", "manifestURL", "manifestURLHeader", "enableServer", "address", "port", "readOnlyPort", "tlsCertFile", "tlsPrivateKeyFile", "certDirectory", "authentication", "authorization", "hostnameOverride", "podInfraContainerImage", "dockerEndpoint", "rootDirectory", "seccompProfileRoot", "allowPrivileged", "hostNetworkSources", "hostPIDSources", "hostIPCSources", "registryPullQPS", "registryBurst", "eventRecordQPS", "eventBurst", "enableDebuggingHandlers", "minimumGCAge", "maxPerPodContainerCount", "maxContainerCount", "cAdvisorPort", "healthzPort", "healthzBindAddress", "oomScoreAdj", "registerNode", "clusterDomain", "masterServiceNamespace", "clusterDNS", "streamingConnectionIdleTimeout", "nodeStatusUpdateFrequency", "imageMinimumGCAge", "imageGCHighThresholdPercent", "imageGCLowThresholdPercent", "lowDiskSpaceThresholdMB", "volumeStatsAggPeriod", "networkPluginName", "networkPluginDir", "cniConfDir", "cniBinDir", "networkPluginMTU", "volumePluginDir", "cloudProvider", "cloudConfigFile", "kubeletCgroups", "runtimeCgroups", "systemCgroups", "cgroupRoot", "containerRuntime", "remoteRuntimeEndpoint", "remoteImageEndpoint", "runtimeRequestTimeout", "rktPath", "rktAPIEndpoint", "rktStage1Image", "lockFilePath", "exitOnLockContention", "hairpinMode", "babysitDaemons", "maxPods", "dockerExecHandlerName", "podCIDR", "resolvConf", "cpuCFSQuota", "containerized", "maxOpenFiles", "registerSchedulable", "registerWithTaints", "contentType", "kubeAPIQPS", "kubeAPIBurst", "serializeImagePulls", "outOfDiskTransitionFrequency", "nodeIP", "nodeLabels", "nonMasqueradeCIDR", "enableCustomMetrics", "evictionHard", "evictionSoft", "evictionSoftGracePeriod", "evictionPressureTransitionPeriod", "evictionMaxPodGracePeriod", "evictionMinimumReclaim", "experimentalKernelMemcgNotification", "podsPerCore", "enableControllerAttachDetach", "experimentalQOSReserved", "protectKernelDefaults", "makeIPTablesUtilChains", "iptablesMasqueradeBit", "iptablesDropBit", "systemReserved", "kubeReserved"},
 			},
 		},
 		Dependencies: []string{
@@ -25,6 +25,7 @@ go_library(
     tags = ["automanaged"],
     deps = [
         "//pkg/api/v1:go_default_library",
+        "//pkg/apis/componentconfig:go_default_library",
         "//pkg/kubelet/cadvisor:go_default_library",
         "//pkg/kubelet/cm/util:go_default_library",
         "//pkg/kubelet/events:go_default_library",
@@ -55,6 +56,7 @@ go_test(
     name = "go_default_test",
     srcs = [
         "cgroup_manager_linux_test.go",
+        "cgroup_manager_test.go",
        "container_manager_linux_test.go",
        "helpers_linux_test.go",
        "node_container_manager_test.go",
@@ -63,6 +65,7 @@ go_test(
    tags = ["automanaged"],
    deps = [
        "//pkg/api/v1:go_default_library",
+       "//pkg/apis/componentconfig:go_default_library",
        "//pkg/kubelet/eviction/api:go_default_library",
        "//pkg/util/mount:go_default_library",
        "//vendor:github.com/stretchr/testify/assert",
@@ -276,6 +276,8 @@ type subsystem interface {
 	Name() string
 	// Set the cgroup represented by cgroup.
 	Set(path string, cgroup *libcontainerconfigs.Cgroup) error
+	// GetStats returns the statistics associated with the cgroup
+	GetStats(path string, stats *libcontainercgroups.Stats) error
 }

 // Cgroup subsystems we currently support
@@ -465,3 +467,34 @@ func (m *cgroupManagerImpl) ReduceCPULimits(cgroupName CgroupName) error {
 	}
 	return m.Update(containerConfig)
 }
+
+func getStatsSupportedSubsytems(cgroupPaths map[string]string) (*libcontainercgroups.Stats, error) {
+	stats := libcontainercgroups.NewStats()
+	for _, sys := range supportedSubsystems {
+		if _, ok := cgroupPaths[sys.Name()]; !ok {
+			return nil, fmt.Errorf("failed to find subsystem mount for subsystem: %v", sys.Name())
+		}
+		if err := sys.GetStats(cgroupPaths[sys.Name()], stats); err != nil {
+			return nil, fmt.Errorf("failed to get stats for supported subsystems: %v", err)
+		}
+	}
+	return stats, nil
+}
+
+func toResourceStats(stats *libcontainercgroups.Stats) *ResourceStats {
+	return &ResourceStats{
+		MemoryStats: &MemoryStats{
+			Usage: int64(stats.MemoryStats.Usage.Usage),
+		},
+	}
+}
+
+// GetResourceStats returns the statistics of the specified cgroup as read from the cgroup fs
+func (m *cgroupManagerImpl) GetResourceStats(name CgroupName) (*ResourceStats, error) {
+	cgroupPaths := m.buildCgroupPaths(name)
+	stats, err := getStatsSupportedSubsytems(cgroupPaths)
+	if err != nil {
+		return nil, fmt.Errorf("failed to get stats supported cgroup subsystems for cgroup %v: %v", name, err)
+	}
+	return toResourceStats(stats), nil
+}
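As a usage sketch, the new `CgroupManager` method might be consumed like this (the helper and the cgroup name are hypothetical, for illustration only, not part of this commit):

```go
// logBurstableUsage shows one way the new GetResourceStats method could be
// called; "/kubepods/burstable" is an assumed cgroup name for illustration.
func logBurstableUsage(mgr CgroupManager) error {
	stats, err := mgr.GetResourceStats(CgroupName("/kubepods/burstable"))
	if err != nil {
		return err
	}
	// MemoryStats.Usage is the tier's current memory usage in bytes.
	glog.V(2).Infof("burstable memory usage: %d bytes", stats.MemoryStats.Usage)
	return nil
}
```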
pkg/kubelet/cm/cgroup_manager_test.go (new file, 83 lines)
@@ -0,0 +1,83 @@
+// +build linux
+
+/*
+Copyright 2016 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package cm
+
+import (
+	"reflect"
+	"testing"
+
+	"k8s.io/kubernetes/pkg/api/v1"
+	"k8s.io/kubernetes/pkg/apis/componentconfig"
+)
+
+func Test(t *testing.T) {
+	tests := []struct {
+		input    string
+		expected *map[v1.ResourceName]int64
+	}{
+		{
+			input:    "memory",
+			expected: nil,
+		},
+		{
+			input:    "memory=a",
+			expected: nil,
+		},
+		{
+			input:    "memory=a%",
+			expected: nil,
+		},
+		{
+			input:    "memory=200%",
+			expected: nil,
+		},
+		{
+			input: "memory=0%",
+			expected: &map[v1.ResourceName]int64{
+				v1.ResourceMemory: 0,
+			},
+		},
+		{
+			input: "memory=100%",
+			expected: &map[v1.ResourceName]int64{
+				v1.ResourceMemory: 100,
+			},
+		},
+		{
+			// need to change this when CPU is added as a supported resource
+			input:    "memory=100%,cpu=50%",
+			expected: nil,
+		},
+	}
+	for _, test := range tests {
+		m := componentconfig.ConfigurationMap{}
+		m.Set(test.input)
+		actual, err := ParseQOSReserved(m)
+		if actual != nil && test.expected == nil {
+			t.Errorf("Unexpected success, input: %v, expected: %v, actual: %v, err: %v", test.input, test.expected, actual, err)
+		}
+		if actual == nil && test.expected != nil {
+			t.Errorf("Unexpected failure, input: %v, expected: %v, actual: %v, err: %v", test.input, test.expected, actual, err)
+		}
+		if (actual == nil && test.expected == nil) || reflect.DeepEqual(*actual, *test.expected) {
+			continue
+		}
+		t.Errorf("Unexpected result, input: %v, expected: %v, actual: %v, err: %v", test.input, test.expected, actual, err)
+	}
+}
@@ -54,6 +54,10 @@ func (m *unsupportedCgroupManager) Create(_ *CgroupConfig) error {
 	return fmt.Errorf("Cgroup Manager is not supported in this build")
 }

+func (m *unsupportedCgroupManager) GetResourceStats(name CgroupName) (*ResourceStats, error) {
+	return nil, fmt.Errorf("Cgroup Manager is not supported in this build")
+}
+
 func (m *unsupportedCgroupManager) Pids(_ CgroupName) []int {
 	return nil
 }
@@ -20,7 +20,12 @@ import (
 	"k8s.io/apimachinery/pkg/util/sets"
 	// TODO: Migrate kubelet to either use its own internal objects or client library.
 	"k8s.io/kubernetes/pkg/api/v1"
+	"k8s.io/kubernetes/pkg/apis/componentconfig"
 	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
+
+	"fmt"
+	"strconv"
+	"strings"
 )

 type ActivePodsFunc func() []*v1.Pod
@@ -71,6 +76,7 @@ type NodeConfig struct {
 	ProtectKernelDefaults bool
 	EnableCRI             bool
 	NodeAllocatableConfig
+	ExperimentalQOSReserved map[v1.ResourceName]int64
 }

 type NodeAllocatableConfig struct {
@@ -93,3 +99,41 @@ const (
 	SystemReservedEnforcementKey = "system-reserved"
 	KubeReservedEnforcementKey   = "kube-reserved"
 )
+
+// containerManager for the kubelet is currently an injected dependency.
+// We need to parse the --experimental-qos-reserved option in
+// cmd/kubelet/app/server.go and there isn't really a good place to put
+// the code. If/When the kubelet dependency injection gets worked out,
+// maybe there will be a better place for it.
+func parsePercentage(v string) (int64, error) {
+	if !strings.HasSuffix(v, "%") {
+		return 0, fmt.Errorf("percentage expected, got '%s'", v)
+	}
+	percentage, err := strconv.ParseInt(strings.TrimRight(v, "%"), 10, 0)
+	if err != nil {
+		return 0, fmt.Errorf("invalid number in percentage '%s'", v)
+	}
+	if percentage < 0 || percentage > 100 {
+		return 0, fmt.Errorf("percentage must be between 0 and 100")
+	}
+	return percentage, nil
+}
+
+// ParseQOSReserved parses the --experimental-qos-reserved option
+func ParseQOSReserved(m componentconfig.ConfigurationMap) (*map[v1.ResourceName]int64, error) {
+	reservations := make(map[v1.ResourceName]int64)
+	for k, v := range m {
+		switch v1.ResourceName(k) {
+		// Only memory resources are supported.
+		case v1.ResourceMemory:
+			q, err := parsePercentage(v)
+			if err != nil {
+				return nil, err
+			}
+			reservations[v1.ResourceName(k)] = q
+		default:
+			return nil, fmt.Errorf("cannot reserve %q resource", k)
+		}
+	}
+	return &reservations, nil
+}
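For illustration, a hypothetical end-to-end parse showing how a command-line value reaches `ParseQOSReserved` (the helper function is invented for this sketch; `run()` in cmd/kubelet/app/server.go does the equivalent with the flag-bound `ConfigurationMap`):

```go
// parseQOSReservedExample mirrors the flag value's path from string to
// map[v1.ResourceName]int64; this helper is not part of the commit.
func parseQOSReservedExample() (*map[v1.ResourceName]int64, error) {
	m := componentconfig.ConfigurationMap{}
	if err := m.Set("memory=50%"); err != nil { // value as given on the command line
		return nil, err
	}
	// Yields {memory: 50}; unsupported keys (e.g. cpu) or malformed
	// percentages cause ParseQOSReserved to return an error instead.
	return ParseQOSReserved(m)
}
```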
@@ -274,7 +274,6 @@ func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
 	if cm.NodeConfig.CgroupsPerQOS {
 		return &podContainerManagerImpl{
 			qosContainersInfo: cm.GetQOSContainersInfo(),
-			nodeInfo:          cm.nodeInfo,
 			subsystems:        cm.subsystems,
 			cgroupManager:     cm.cgroupManager,
 		}
@@ -366,7 +365,7 @@ func (cm *containerManagerImpl) setupNode(activePods ActivePodsFunc) error {
 	if err := cm.createNodeAllocatableCgroups(); err != nil {
 		return err
 	}
-	err = cm.qosContainerManager.Start(cm.nodeInfo, activePods)
+	err = cm.qosContainerManager.Start(cm.getNodeAllocatableAbsolute, activePods)
 	if err != nil {
 		return fmt.Errorf("failed to initialise top level QOS containers: %v", err)
 	}
@@ -38,8 +38,6 @@
 // It is the general implementation which allows pod level container
 // management if qos Cgroup is enabled.
 type podContainerManagerImpl struct {
-	// nodeInfo stores information about the node resource capacity
-	nodeInfo *v1.Node
 	// qosContainersInfo hold absolute paths of the top level qos containers
 	qosContainersInfo QOSContainersInfo
 	// Stores the mounted cgroup subsystems
@@ -25,7 +25,6 @@ import (
 	"github.com/golang/glog"

 	"k8s.io/apimachinery/pkg/util/wait"

 	"k8s.io/kubernetes/pkg/api/v1"
 	"k8s.io/kubernetes/pkg/kubelet/qos"
 )
@@ -37,19 +36,21 @@ const (
 )

 type QOSContainerManager interface {
-	Start(*v1.Node, ActivePodsFunc) error
+	Start(func() v1.ResourceList, ActivePodsFunc) error
 	GetQOSContainersInfo() QOSContainersInfo
 	UpdateCgroups() error
 }

 type qosContainerManagerImpl struct {
 	sync.Mutex
-	nodeInfo          *v1.Node
-	qosContainersInfo QOSContainersInfo
-	subsystems        *CgroupSubsystems
-	cgroupManager     CgroupManager
-	activePods        ActivePodsFunc
-	cgroupRoot        string
+	nodeInfo           *v1.Node
+	qosContainersInfo  QOSContainersInfo
+	subsystems         *CgroupSubsystems
+	cgroupManager      CgroupManager
+	activePods         ActivePodsFunc
+	getNodeAllocatable func() v1.ResourceList
+	cgroupRoot         string
+	qosReserved        map[v1.ResourceName]int64
 }

 func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot string, nodeConfig NodeConfig) (QOSContainerManager, error) {
@@ -63,6 +64,7 @@ func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot string, nod
 		subsystems:    subsystems,
 		cgroupManager: NewCgroupManager(subsystems, nodeConfig.CgroupDriver),
 		cgroupRoot:    cgroupRoot,
+		qosReserved:   nodeConfig.ExperimentalQOSReserved,
 	}, nil
 }
@@ -70,7 +72,7 @@ func (m *qosContainerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
 	return m.qosContainersInfo
 }

-func (m *qosContainerManagerImpl) Start(nodeInfo *v1.Node, activePods ActivePodsFunc) error {
+func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceList, activePods ActivePodsFunc) error {
 	cm := m.cgroupManager
 	rootContainer := m.cgroupRoot
 	if !cm.Exists(CgroupName(rootContainer)) {
@@ -115,7 +117,7 @@ func (m *qosContainerManagerImpl) Start(nodeInfo *v1.Node, activePods ActivePods
 		Burstable:  path.Join(rootContainer, string(v1.PodQOSBurstable)),
 		BestEffort: path.Join(rootContainer, string(v1.PodQOSBestEffort)),
 	}
-	m.nodeInfo = nodeInfo
+	m.getNodeAllocatable = getNodeAllocatable
 	m.activePods = activePods

 	// update qos cgroup tiers on startup and in periodic intervals
@@ -162,6 +164,85 @@ func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]
 	return nil
 }

+// setMemoryReserve sums the memory requests of all pods in a QOS class,
+// calculates the QOS class memory limits, and sets those limits in the
+// CgroupConfig for each QOS class.
+func (m *qosContainerManagerImpl) setMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
+	qosMemoryRequests := map[v1.PodQOSClass]int64{
+		v1.PodQOSGuaranteed: 0,
+		v1.PodQOSBurstable:  0,
+	}
+
+	// Sum the pod memory requests for pods in each QOS class
+	pods := m.activePods()
+	for _, pod := range pods {
+		podMemoryRequest := int64(0)
+		qosClass := qos.GetPodQOS(pod)
+		if qosClass == v1.PodQOSBestEffort {
+			// limits are not set for Best Effort pods
+			continue
+		}
+		req, _, err := v1.PodRequestsAndLimits(pod)
+		if err != nil {
+			glog.V(2).Infof("[Container Manager] Pod resource requests/limits could not be determined. Not setting QOS memory limits.")
+			return
+		}
+		if request, found := req[v1.ResourceMemory]; found {
+			podMemoryRequest += request.Value()
+		}
+		qosMemoryRequests[qosClass] += podMemoryRequest
+	}
+
+	resources := m.getNodeAllocatable()
+	allocatableResource, ok := resources[v1.ResourceMemory]
+	if !ok {
+		glog.V(2).Infof("[Container Manager] Allocatable memory value could not be determined. Not setting QOS memory limits.")
+		return
+	}
+	allocatable := allocatableResource.Value()
+	if allocatable == 0 {
+		glog.V(2).Infof("[Container Manager] Memory allocatable reported as 0, might be in standalone mode. Not setting QOS memory limits.")
+		return
+	}
+
+	for qos, limits := range qosMemoryRequests {
+		glog.V(2).Infof("[Container Manager] %s pod requests total %d bytes (reserve %d%%)", qos, limits, percentReserve)
+	}
+
+	// Calculate QOS memory limits
+	burstableLimit := allocatable - (qosMemoryRequests[v1.PodQOSGuaranteed] * percentReserve / 100)
+	bestEffortLimit := burstableLimit - (qosMemoryRequests[v1.PodQOSBurstable] * percentReserve / 100)
+	configs[v1.PodQOSBurstable].ResourceParameters.Memory = &burstableLimit
+	configs[v1.PodQOSBestEffort].ResourceParameters.Memory = &bestEffortLimit
+}
+
+// retrySetMemoryReserve checks for any QoS cgroups over the limit
+// that was attempted to be set in the first Update() and adjusts
+// their memory limit to the usage to prevent further growth.
+func (m *qosContainerManagerImpl) retrySetMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
+	// Unreclaimable memory usage may already exceed the desired limit.
+	// Attempt to set the limit near the current usage to put pressure
+	// on the cgroup and prevent further growth.
+	for qos, config := range configs {
+		stats, err := m.cgroupManager.GetResourceStats(config.Name)
+		if err != nil {
+			glog.V(2).Infof("[Container Manager] %v", err)
+			return
+		}
+		usage := stats.MemoryStats.Usage
+
+		// Because there is no good way to determine if the original Update()
+		// on the memory resource was successful, we determine failure of the
+		// first attempt by checking if the usage is above the limit we attempted
+		// to set. If it is, we assume the first attempt to set the limit failed
+		// and try again, setting the limit to the usage. Otherwise we leave
+		// the CgroupConfig as is.
+		if configs[qos].ResourceParameters.Memory != nil && usage > *configs[qos].ResourceParameters.Memory {
+			configs[qos].ResourceParameters.Memory = &usage
+		}
+	}
+}
+
 func (m *qosContainerManagerImpl) UpdateCgroups() error {
 	m.Lock()
 	defer m.Unlock()
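A small worked example of the clamp rule above, with assumed numbers: if the Burstable tier already uses 7.5Gi of unreclaimable memory when a 7Gi limit is attempted, the kernel write cannot take effect, so the retry sets the limit to current usage instead:

```go
package main

import "fmt"

func main() {
	usage := int64(7680) << 20 // assumed current usage: 7.5Gi, in bytes
	limit := int64(7168) << 20 // desired limit from setMemoryReserve: 7Gi

	// Mirrors the rule in retrySetMemoryReserve: a limit below
	// unreclaimable usage is clamped to usage to stop further growth.
	if usage > limit {
		limit = usage
	}
	fmt.Printf("effective memory.limit_in_bytes: %d\n", limit)
}
```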
@@ -182,6 +263,34 @@ func (m *qosContainerManagerImpl) UpdateCgroups() error {
 		return err
 	}

+	for resource, percentReserve := range m.qosReserved {
+		switch resource {
+		case v1.ResourceMemory:
+			m.setMemoryReserve(qosConfigs, percentReserve)
+		}
+	}
+	updateSuccess := true
+	for _, config := range qosConfigs {
+		err := m.cgroupManager.Update(config)
+		if err != nil {
+			updateSuccess = false
+		}
+	}
+	if updateSuccess {
+		glog.V(2).Infof("[ContainerManager]: Updated QoS cgroup configuration")
+		return nil
+	}
+
+	// If the resource can adjust the ResourceConfig to increase likelihood of
+	// success, call the adjustment function here. Otherwise, the Update() will
+	// be called again with the same values.
+	for resource, percentReserve := range m.qosReserved {
+		switch resource {
+		case v1.ResourceMemory:
+			m.retrySetMemoryReserve(qosConfigs, percentReserve)
+		}
+	}
+
 	for _, config := range qosConfigs {
 		err := m.cgroupManager.Update(config)
 		if err != nil {
@@ -189,8 +298,8 @@ func (m *qosContainerManagerImpl) UpdateCgroups() error {
 			return err
 		}
 	}
-	glog.V(2).Infof("[ContainerManager]: Updated QoS cgroup configuration")

+	glog.V(2).Infof("[ContainerManager]: Updated QoS cgroup configuration on retry")
 	return nil
 }
@@ -204,7 +313,7 @@ func (m *qosContainerManagerNoop) GetQOSContainersInfo() QOSContainersInfo {
 	return QOSContainersInfo{}
 }

-func (m *qosContainerManagerNoop) Start(_ *v1.Node, _ ActivePodsFunc) error {
+func (m *qosContainerManagerNoop) Start(_ func() v1.ResourceList, _ ActivePodsFunc) error {
 	return nil
 }
@@ -47,6 +47,18 @@ type CgroupConfig struct {
 	ResourceParameters *ResourceConfig
 }

+// MemoryStats holds the on-demand statistics from the memory cgroup
+type MemoryStats struct {
+	// Memory usage (in bytes).
+	Usage int64
+}
+
+// ResourceStats holds on-demand statistics from various cgroup subsystems
+type ResourceStats struct {
+	// Memory statistics.
+	MemoryStats *MemoryStats
+}
+
 // CgroupManager allows for cgroup management.
 // Supports Cgroup Creation, Deletion and Updates.
 type CgroupManager interface {
@@ -72,6 +84,8 @@ type CgroupManager interface {
 	Pids(name CgroupName) []int
 	// ReduceCPULimits reduces the CPU CFS values to the minimum amount of shares.
 	ReduceCPULimits(cgroupName CgroupName) error
+	// GetResourceStats returns statistics of the specified cgroup as read from the cgroup fs.
+	GetResourceStats(name CgroupName) (*ResourceStats, error)
 }

 // QOSContainersInfo stores the names of containers per qos