diff --git a/cmd/kubelet/app/options/options.go b/cmd/kubelet/app/options/options.go index b9322fcd0ea..03c57e9cd22 100644 --- a/cmd/kubelet/app/options/options.go +++ b/cmd/kubelet/app/options/options.go @@ -269,4 +269,6 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) { fs.StringVar(&s.SystemReservedCgroup, "system-reserved-cgroup", s.SystemReservedCgroup, "Absolute name of the top level cgroup that is used to manage non-kubernetes components for which compute resources were reserved via '--system-reserved' flag. Ex. '/system-reserved'. [default='']") fs.StringVar(&s.KubeReservedCgroup, "kube-reserved-cgroup", s.KubeReservedCgroup, "Absolute name of the top level cgroup that is used to manage kubernetes components for which compute resources were reserved via '--kube-reserved' flag. Ex. '/kube-reserved'. [default='']") fs.BoolVar(&s.ExperimentalNodeAllocatableIgnoreEvictionThreshold, "experimental-allocatable-ignore-eviction", s.ExperimentalNodeAllocatableIgnoreEvictionThreshold, "When set to 'true', Hard Eviction Thresholds will be ignored while calculating Node Allocatable. See https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md for more details. [default=false]") + + fs.Var(&s.ExperimentalQOSReserved, "experimental-qos-reserved", "A set of ResourceName=Percentage (e.g. memory=50%) pairs that describe how pod resource requests are reserved at the QoS level. Currently only memory is supported. [default=none]") } diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go index 89f5639ca20..7d9028444ed 100644 --- a/cmd/kubelet/app/server.go +++ b/cmd/kubelet/app/server.go @@ -525,6 +525,10 @@ func run(s *options.KubeletServer, kubeDeps *kubelet.KubeletDeps) (err error) { return err } } + experimentalQOSReserved, err := cm.ParseQOSReserved(s.ExperimentalQOSReserved) + if err != nil { + return err + } kubeDeps.ContainerManager, err = cm.NewContainerManager( kubeDeps.Mounter, kubeDeps.CAdvisorInterface, @@ -546,6 +550,7 @@ func run(s *options.KubeletServer, kubeDeps *kubelet.KubeletDeps) (err error) { SystemReserved: systemReserved, HardEvictionThresholds: hardEvictionThresholds, }, + ExperimentalQOSReserved: *experimentalQOSReserved, }, s.ExperimentalFailSwapOn, kubeDeps.Recorder) diff --git a/hack/verify-flags/known-flags.txt b/hack/verify-flags/known-flags.txt index 843125de9ac..f2ca1e258ab 100644 --- a/hack/verify-flags/known-flags.txt +++ b/hack/verify-flags/known-flags.txt @@ -236,6 +236,7 @@ experimental-keystone-url experimental-mounter-path experimental-nvidia-gpus experimental-prefix +experimental-qos-reserved external-etcd-cafile external-etcd-certfile external-etcd-endpoints diff --git a/pkg/apis/componentconfig/types.go b/pkg/apis/componentconfig/types.go index e77f7686d70..af3812e2c47 100644 --- a/pkg/apis/componentconfig/types.go +++ b/pkg/apis/componentconfig/types.go @@ -440,6 +440,10 @@ type KubeletConfiguration struct { // manage attachment/detachment of volumes scheduled to this node, and // disables kubelet from executing any attach/detach operations EnableControllerAttachDetach bool + // A set of ResourceName=Percentage (e.g. memory=50%) pairs that describe + // how pod resource requests are reserved at the QoS level. + // Currently only memory is supported. [default=none] + ExperimentalQOSReserved ConfigurationMap // Default behaviour for kernel tuning ProtectKernelDefaults bool // If true, Kubelet ensures a set of iptables rules are present on host.
diff --git a/pkg/apis/componentconfig/v1alpha1/defaults.go b/pkg/apis/componentconfig/v1alpha1/defaults.go index 7a2f8b84d56..57e06dd5c87 100644 --- a/pkg/apis/componentconfig/v1alpha1/defaults.go +++ b/pkg/apis/componentconfig/v1alpha1/defaults.go @@ -385,6 +385,9 @@ func SetDefaults_KubeletConfiguration(obj *KubeletConfiguration) { if obj.KubeReserved == nil { obj.KubeReserved = make(map[string]string) } + if obj.ExperimentalQOSReserved == nil { + obj.ExperimentalQOSReserved = make(map[string]string) + } if obj.MakeIPTablesUtilChains == nil { obj.MakeIPTablesUtilChains = boolVar(true) } diff --git a/pkg/apis/componentconfig/v1alpha1/types.go b/pkg/apis/componentconfig/v1alpha1/types.go index 0284b244343..83436dd6f6c 100644 --- a/pkg/apis/componentconfig/v1alpha1/types.go +++ b/pkg/apis/componentconfig/v1alpha1/types.go @@ -476,6 +476,10 @@ type KubeletConfiguration struct { // manage attachment/detachment of volumes scheduled to this node, and // disables kubelet from executing any attach/detach operations EnableControllerAttachDetach *bool `json:"enableControllerAttachDetach"` + // A set of ResourceName=Percentage (e.g. memory=50%) pairs that describe + // how pod resource requests are reserved at the QoS level. + // Currently only memory is supported. [default=none] + ExperimentalQOSReserved map[string]string `json:"experimentalQOSReserved"` // Default behaviour for kernel tuning ProtectKernelDefaults bool `json:"protectKernelDefaults"` // If true, Kubelet ensures a set of iptables rules are present on host. diff --git a/pkg/apis/componentconfig/v1alpha1/zz_generated.conversion.go b/pkg/apis/componentconfig/v1alpha1/zz_generated.conversion.go index cc7deafc92b..f34355fc1ed 100644 --- a/pkg/apis/componentconfig/v1alpha1/zz_generated.conversion.go +++ b/pkg/apis/componentconfig/v1alpha1/zz_generated.conversion.go @@ -395,6 +395,7 @@ func autoConvert_v1alpha1_KubeletConfiguration_To_componentconfig_KubeletConfigu if err := v1.Convert_Pointer_bool_To_bool(&in.EnableControllerAttachDetach, &out.EnableControllerAttachDetach, s); err != nil { return err } + out.ExperimentalQOSReserved = *(*componentconfig.ConfigurationMap)(unsafe.Pointer(&in.ExperimentalQOSReserved)) out.ProtectKernelDefaults = in.ProtectKernelDefaults if err := v1.Convert_Pointer_bool_To_bool(&in.MakeIPTablesUtilChains, &out.MakeIPTablesUtilChains, s); err != nil { return err @@ -572,6 +573,7 @@ func autoConvert_componentconfig_KubeletConfiguration_To_v1alpha1_KubeletConfigu if err := v1.Convert_bool_To_Pointer_bool(&in.EnableControllerAttachDetach, &out.EnableControllerAttachDetach, s); err != nil { return err } + out.ExperimentalQOSReserved = *(*map[string]string)(unsafe.Pointer(&in.ExperimentalQOSReserved)) out.ProtectKernelDefaults = in.ProtectKernelDefaults if err := v1.Convert_bool_To_Pointer_bool(&in.MakeIPTablesUtilChains, &out.MakeIPTablesUtilChains, s); err != nil { return err diff --git a/pkg/apis/componentconfig/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/componentconfig/v1alpha1/zz_generated.deepcopy.go index c21f1f14dc5..c3d4fbee0dd 100644 --- a/pkg/apis/componentconfig/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/componentconfig/v1alpha1/zz_generated.deepcopy.go @@ -266,6 +266,13 @@ func DeepCopy_v1alpha1_KubeletConfiguration(in interface{}, out interface{}, c * *out = new(bool) **out = **in } + if in.ExperimentalQOSReserved != nil { + in, out := &in.ExperimentalQOSReserved, &out.ExperimentalQOSReserved + *out = make(map[string]string) + for key, val := range *in { + (*out)[key] = val + } + } if
in.MakeIPTablesUtilChains != nil { in, out := &in.MakeIPTablesUtilChains, &out.MakeIPTablesUtilChains *out = new(bool) diff --git a/pkg/apis/componentconfig/zz_generated.deepcopy.go b/pkg/apis/componentconfig/zz_generated.deepcopy.go index 55b646d61fb..733137549df 100644 --- a/pkg/apis/componentconfig/zz_generated.deepcopy.go +++ b/pkg/apis/componentconfig/zz_generated.deepcopy.go @@ -177,6 +177,13 @@ func DeepCopy_componentconfig_KubeletConfiguration(in interface{}, out interface (*out)[key] = val } } + if in.ExperimentalQOSReserved != nil { + in, out := &in.ExperimentalQOSReserved, &out.ExperimentalQOSReserved + *out = make(ConfigurationMap) + for key, val := range *in { + (*out)[key] = val + } + } if in.AllowedUnsafeSysctls != nil { in, out := &in.AllowedUnsafeSysctls, &out.AllowedUnsafeSysctls *out = make([]string, len(*in)) diff --git a/pkg/generated/openapi/zz_generated.openapi.go b/pkg/generated/openapi/zz_generated.openapi.go index 37ff83207ca..740b4f48f05 100644 --- a/pkg/generated/openapi/zz_generated.openapi.go +++ b/pkg/generated/openapi/zz_generated.openapi.go @@ -13838,6 +13838,20 @@ func GetOpenAPIDefinitions(ref openapi.ReferenceCallback) map[string]openapi.Ope Format: "", }, }, + "experimentalQOSReserved": { + SchemaProps: spec.SchemaProps{ + Description: "A set of ResourceName=Percentage (e.g. memory=50%) pairs that describe how pod resource requests are reserved at the QoS level. Currently only memory is supported. [default=none]", + Type: []string{"object"}, + AdditionalProperties: &spec.SchemaOrBool{ + Schema: &spec.Schema{ + SchemaProps: spec.SchemaProps{ + Type: []string{"string"}, + Format: "", + }, + }, + }, + }, + }, "protectKernelDefaults": { SchemaProps: spec.SchemaProps{ Description: "Default behaviour for kernel tuning", @@ -13979,7 +13993,7 @@ func GetOpenAPIDefinitions(ref openapi.ReferenceCallback) map[string]openapi.Ope }, }, }, - Required: []string{"podManifestPath", "syncFrequency", "fileCheckFrequency", "httpCheckFrequency", "manifestURL", "manifestURLHeader", "enableServer", "address", "port", "readOnlyPort", "tlsCertFile", "tlsPrivateKeyFile", "certDirectory", "authentication", "authorization", "hostnameOverride", "podInfraContainerImage", "dockerEndpoint", "rootDirectory", "seccompProfileRoot", "allowPrivileged", "hostNetworkSources", "hostPIDSources", "hostIPCSources", "registryPullQPS", "registryBurst", "eventRecordQPS", "eventBurst", "enableDebuggingHandlers", "minimumGCAge", "maxPerPodContainerCount", "maxContainerCount", "cAdvisorPort", "healthzPort", "healthzBindAddress", "oomScoreAdj", "registerNode", "clusterDomain", "masterServiceNamespace", "clusterDNS", "streamingConnectionIdleTimeout", "nodeStatusUpdateFrequency", "imageMinimumGCAge", "imageGCHighThresholdPercent", "imageGCLowThresholdPercent", "lowDiskSpaceThresholdMB", "volumeStatsAggPeriod", "networkPluginName", "networkPluginDir", "cniConfDir", "cniBinDir", "networkPluginMTU", "volumePluginDir", "cloudProvider", "cloudConfigFile", "kubeletCgroups", "runtimeCgroups", "systemCgroups", "cgroupRoot", "containerRuntime", "remoteRuntimeEndpoint", "remoteImageEndpoint", "runtimeRequestTimeout", "rktPath", "rktAPIEndpoint", "rktStage1Image", "lockFilePath", "exitOnLockContention", "hairpinMode", "babysitDaemons", "maxPods", "dockerExecHandlerName", "podCIDR", "resolvConf", "cpuCFSQuota", "containerized", "maxOpenFiles", "registerSchedulable", "registerWithTaints", "contentType", "kubeAPIQPS", "kubeAPIBurst", "serializeImagePulls", "outOfDiskTransitionFrequency", "nodeIP", "nodeLabels",
"nonMasqueradeCIDR", "enableCustomMetrics", "evictionHard", "evictionSoft", "evictionSoftGracePeriod", "evictionPressureTransitionPeriod", "evictionMaxPodGracePeriod", "evictionMinimumReclaim", "experimentalKernelMemcgNotification", "podsPerCore", "enableControllerAttachDetach", "protectKernelDefaults", "makeIPTablesUtilChains", "iptablesMasqueradeBit", "iptablesDropBit", "systemReserved", "kubeReserved"}, + Required: []string{"podManifestPath", "syncFrequency", "fileCheckFrequency", "httpCheckFrequency", "manifestURL", "manifestURLHeader", "enableServer", "address", "port", "readOnlyPort", "tlsCertFile", "tlsPrivateKeyFile", "certDirectory", "authentication", "authorization", "hostnameOverride", "podInfraContainerImage", "dockerEndpoint", "rootDirectory", "seccompProfileRoot", "allowPrivileged", "hostNetworkSources", "hostPIDSources", "hostIPCSources", "registryPullQPS", "registryBurst", "eventRecordQPS", "eventBurst", "enableDebuggingHandlers", "minimumGCAge", "maxPerPodContainerCount", "maxContainerCount", "cAdvisorPort", "healthzPort", "healthzBindAddress", "oomScoreAdj", "registerNode", "clusterDomain", "masterServiceNamespace", "clusterDNS", "streamingConnectionIdleTimeout", "nodeStatusUpdateFrequency", "imageMinimumGCAge", "imageGCHighThresholdPercent", "imageGCLowThresholdPercent", "lowDiskSpaceThresholdMB", "volumeStatsAggPeriod", "networkPluginName", "networkPluginDir", "cniConfDir", "cniBinDir", "networkPluginMTU", "volumePluginDir", "cloudProvider", "cloudConfigFile", "kubeletCgroups", "runtimeCgroups", "systemCgroups", "cgroupRoot", "containerRuntime", "remoteRuntimeEndpoint", "remoteImageEndpoint", "runtimeRequestTimeout", "rktPath", "rktAPIEndpoint", "rktStage1Image", "lockFilePath", "exitOnLockContention", "hairpinMode", "babysitDaemons", "maxPods", "dockerExecHandlerName", "podCIDR", "resolvConf", "cpuCFSQuota", "containerized", "maxOpenFiles", "registerSchedulable", "registerWithTaints", "contentType", "kubeAPIQPS", "kubeAPIBurst", "serializeImagePulls", "outOfDiskTransitionFrequency", "nodeIP", "nodeLabels", "nonMasqueradeCIDR", "enableCustomMetrics", "evictionHard", "evictionSoft", "evictionSoftGracePeriod", "evictionPressureTransitionPeriod", "evictionMaxPodGracePeriod", "evictionMinimumReclaim", "experimentalKernelMemcgNotification", "podsPerCore", "enableControllerAttachDetach", "experimentalQOSReserved", "protectKernelDefaults", "makeIPTablesUtilChains", "iptablesMasqueradeBit", "iptablesDropBit", "systemReserved", "kubeReserved"}, }, }, Dependencies: []string{ diff --git a/pkg/kubelet/cm/BUILD b/pkg/kubelet/cm/BUILD index 42cadd854b3..acf1ed53af2 100644 --- a/pkg/kubelet/cm/BUILD +++ b/pkg/kubelet/cm/BUILD @@ -25,6 +25,7 @@ go_library( tags = ["automanaged"], deps = [ "//pkg/api/v1:go_default_library", + "//pkg/apis/componentconfig:go_default_library", "//pkg/kubelet/cadvisor:go_default_library", "//pkg/kubelet/cm/util:go_default_library", "//pkg/kubelet/events:go_default_library", @@ -55,6 +56,7 @@ go_test( name = "go_default_test", srcs = [ "cgroup_manager_linux_test.go", + "cgroup_manager_test.go", "container_manager_linux_test.go", "helpers_linux_test.go", "node_container_manager_test.go", @@ -63,6 +65,7 @@ go_test( tags = ["automanaged"], deps = [ "//pkg/api/v1:go_default_library", + "//pkg/apis/componentconfig:go_default_library", "//pkg/kubelet/eviction/api:go_default_library", "//pkg/util/mount:go_default_library", "//vendor:github.com/stretchr/testify/assert", diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go 
index f7f44d5408c..e8fcca3ec36 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux.go +++ b/pkg/kubelet/cm/cgroup_manager_linux.go @@ -276,6 +276,8 @@ type subsystem interface { Name() string // Set the cgroup represented by cgroup. Set(path string, cgroup *libcontainerconfigs.Cgroup) error + // GetStats returns the statistics associated with the cgroup + GetStats(path string, stats *libcontainercgroups.Stats) error } // Cgroup subsystems we currently support @@ -465,3 +467,34 @@ func (m *cgroupManagerImpl) ReduceCPULimits(cgroupName CgroupName) error { } return m.Update(containerConfig) } + +func getStatsSupportedSubsystems(cgroupPaths map[string]string) (*libcontainercgroups.Stats, error) { + stats := libcontainercgroups.NewStats() + for _, sys := range supportedSubsystems { + if _, ok := cgroupPaths[sys.Name()]; !ok { + return nil, fmt.Errorf("failed to find subsystem mount for subsystem: %v", sys.Name()) + } + if err := sys.GetStats(cgroupPaths[sys.Name()], stats); err != nil { + return nil, fmt.Errorf("failed to get stats for supported subsystems: %v", err) + } + } + return stats, nil +} + +func toResourceStats(stats *libcontainercgroups.Stats) *ResourceStats { + return &ResourceStats{ + MemoryStats: &MemoryStats{ + Usage: int64(stats.MemoryStats.Usage.Usage), + }, + } +} + +// GetResourceStats returns statistics of the specified cgroup as read from the cgroup fs +func (m *cgroupManagerImpl) GetResourceStats(name CgroupName) (*ResourceStats, error) { + cgroupPaths := m.buildCgroupPaths(name) + stats, err := getStatsSupportedSubsystems(cgroupPaths) + if err != nil { + return nil, fmt.Errorf("failed to get stats supported cgroup subsystems for cgroup %v: %v", name, err) + } + return toResourceStats(stats), nil +} diff --git a/pkg/kubelet/cm/cgroup_manager_test.go b/pkg/kubelet/cm/cgroup_manager_test.go new file mode 100644 index 00000000000..442941b4c2f --- /dev/null +++ b/pkg/kubelet/cm/cgroup_manager_test.go @@ -0,0 +1,83 @@ +// +build linux + +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+*/ + +package cm + +import ( + "reflect" + "testing" + + "k8s.io/kubernetes/pkg/api/v1" + "k8s.io/kubernetes/pkg/apis/componentconfig" +) + +func TestParseQOSReserved(t *testing.T) { + tests := []struct { + input string + expected *map[v1.ResourceName]int64 + }{ + { + input: "memory", + expected: nil, + }, + { + input: "memory=a", + expected: nil, + }, + { + input: "memory=a%", + expected: nil, + }, + { + input: "memory=200%", + expected: nil, + }, + { + input: "memory=0%", + expected: &map[v1.ResourceName]int64{ + v1.ResourceMemory: 0, + }, + }, + { + input: "memory=100%", + expected: &map[v1.ResourceName]int64{ + v1.ResourceMemory: 100, + }, + }, + { + // need to change this when CPU is added as a supported resource + input: "memory=100%,cpu=50%", + expected: nil, + }, + } + for _, test := range tests { + m := componentconfig.ConfigurationMap{} + m.Set(test.input) + actual, err := ParseQOSReserved(m) + if actual != nil && test.expected == nil { + t.Errorf("Unexpected success, input: %v, expected: %v, actual: %v, err: %v", test.input, test.expected, actual, err) + } + if actual == nil && test.expected != nil { + t.Errorf("Unexpected failure, input: %v, expected: %v, actual: %v, err: %v", test.input, test.expected, actual, err) + } + if (actual == nil && test.expected == nil) || reflect.DeepEqual(*actual, *test.expected) { + continue + } + t.Errorf("Unexpected result, input: %v, expected: %v, actual: %v, err: %v", test.input, test.expected, actual, err) + } +} diff --git a/pkg/kubelet/cm/cgroup_manager_unsupported.go b/pkg/kubelet/cm/cgroup_manager_unsupported.go index b41c301c432..6a567e94b15 100644 --- a/pkg/kubelet/cm/cgroup_manager_unsupported.go +++ b/pkg/kubelet/cm/cgroup_manager_unsupported.go @@ -54,6 +54,10 @@ func (m *unsupportedCgroupManager) Create(_ *CgroupConfig) error { return fmt.Errorf("Cgroup Manager is not supported in this build") } +func (m *unsupportedCgroupManager) GetResourceStats(name CgroupName) (*ResourceStats, error) { + return nil, fmt.Errorf("Cgroup Manager is not supported in this build") +} + func (m *unsupportedCgroupManager) Pids(_ CgroupName) []int { return nil } diff --git a/pkg/kubelet/cm/container_manager.go b/pkg/kubelet/cm/container_manager.go index b2c6f8fc4e8..86d64b85140 100644 --- a/pkg/kubelet/cm/container_manager.go +++ b/pkg/kubelet/cm/container_manager.go @@ -20,7 +20,12 @@ import ( "k8s.io/apimachinery/pkg/util/sets" // TODO: Migrate kubelet to either use its own internal objects or client library. "k8s.io/kubernetes/pkg/api/v1" + "k8s.io/kubernetes/pkg/apis/componentconfig" evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" + + "fmt" + "strconv" + "strings" ) type ActivePodsFunc func() []*v1.Pod @@ -71,6 +76,7 @@ type NodeConfig struct { ProtectKernelDefaults bool EnableCRI bool NodeAllocatableConfig + ExperimentalQOSReserved map[v1.ResourceName]int64 } type NodeAllocatableConfig struct { @@ -93,3 +99,41 @@ const ( SystemReservedEnforcementKey = "system-reserved" KubeReservedEnforcementKey = "kube-reserved" ) + +// containerManager for the kubelet is currently an injected dependency. +// We need to parse the --experimental-qos-reserved option in +// cmd/kubelet/app/server.go and there isn't really a good place to put +// the code. If/When the kubelet dependency injection gets worked out, +// maybe there will be a better place for it.
+func parsePercentage(v string) (int64, error) { + if !strings.HasSuffix(v, "%") { + return 0, fmt.Errorf("percentage expected, got '%s'", v) + } + percentage, err := strconv.ParseInt(strings.TrimRight(v, "%"), 10, 0) + if err != nil { + return 0, fmt.Errorf("invalid number in percentage '%s'", v) + } + if percentage < 0 || percentage > 100 { + return 0, fmt.Errorf("percentage must be between 0 and 100") + } + return percentage, nil +} + +// ParseQOSReserved parses the --experimental-qos-reserved option +func ParseQOSReserved(m componentconfig.ConfigurationMap) (*map[v1.ResourceName]int64, error) { + reservations := make(map[v1.ResourceName]int64) + for k, v := range m { + switch v1.ResourceName(k) { + // Only memory resources are supported. + case v1.ResourceMemory: + q, err := parsePercentage(v) + if err != nil { + return nil, err + } + reservations[v1.ResourceName(k)] = q + default: + return nil, fmt.Errorf("cannot reserve %q resource", k) + } + } + return &reservations, nil +} diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go index d613910d48d..39b377ac0fb 100644 --- a/pkg/kubelet/cm/container_manager_linux.go +++ b/pkg/kubelet/cm/container_manager_linux.go @@ -274,7 +274,6 @@ func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager { if cm.NodeConfig.CgroupsPerQOS { return &podContainerManagerImpl{ qosContainersInfo: cm.GetQOSContainersInfo(), - nodeInfo: cm.nodeInfo, subsystems: cm.subsystems, cgroupManager: cm.cgroupManager, } @@ -366,7 +365,7 @@ func (cm *containerManagerImpl) setupNode(activePods ActivePodsFunc) error { if err := cm.createNodeAllocatableCgroups(); err != nil { return err } - err = cm.qosContainerManager.Start(cm.nodeInfo, activePods) + err = cm.qosContainerManager.Start(cm.getNodeAllocatableAbsolute, activePods) if err != nil { return fmt.Errorf("failed to initialise top level QOS containers: %v", err) } diff --git a/pkg/kubelet/cm/pod_container_manager_linux.go b/pkg/kubelet/cm/pod_container_manager_linux.go index 06573a0aa89..f59495f5f0f 100644 --- a/pkg/kubelet/cm/pod_container_manager_linux.go +++ b/pkg/kubelet/cm/pod_container_manager_linux.go @@ -38,8 +38,6 @@ const ( // It is the general implementation which allows pod level container // management if qos Cgroup is enabled.
type podContainerManagerImpl struct { - // nodeInfo stores information about the node resource capacity - nodeInfo *v1.Node // qosContainersInfo hold absolute paths of the top level qos containers qosContainersInfo QOSContainersInfo // Stores the mounted cgroup subsystems diff --git a/pkg/kubelet/cm/qos_container_manager_linux.go b/pkg/kubelet/cm/qos_container_manager_linux.go index bcf011995de..db4536b61eb 100644 --- a/pkg/kubelet/cm/qos_container_manager_linux.go +++ b/pkg/kubelet/cm/qos_container_manager_linux.go @@ -37,19 +37,20 @@ const ( ) type QOSContainerManager interface { - Start(*v1.Node, ActivePodsFunc) error + Start(func() v1.ResourceList, ActivePodsFunc) error GetQOSContainersInfo() QOSContainersInfo UpdateCgroups() error } type qosContainerManagerImpl struct { sync.Mutex - nodeInfo *v1.Node - qosContainersInfo QOSContainersInfo - subsystems *CgroupSubsystems - cgroupManager CgroupManager - activePods ActivePodsFunc - cgroupRoot string + qosContainersInfo  QOSContainersInfo + subsystems         *CgroupSubsystems + cgroupManager      CgroupManager + activePods         ActivePodsFunc + getNodeAllocatable func() v1.ResourceList + cgroupRoot         string + qosReserved        map[v1.ResourceName]int64 } func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot string, nodeConfig NodeConfig) (QOSContainerManager, error) { @@ -63,6 +64,7 @@ func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot string, nod subsystems: subsystems, cgroupManager: NewCgroupManager(subsystems, nodeConfig.CgroupDriver), cgroupRoot: cgroupRoot, + qosReserved: nodeConfig.ExperimentalQOSReserved, }, nil } @@ -70,7 +72,7 @@ func (m *qosContainerManagerImpl) GetQOSContainersInfo() QOSContainersInfo { return m.qosContainersInfo } -func (m *qosContainerManagerImpl) Start(nodeInfo *v1.Node, activePods ActivePodsFunc) error { +func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceList, activePods ActivePodsFunc) error { cm := m.cgroupManager rootContainer := m.cgroupRoot if !cm.Exists(CgroupName(rootContainer)) { @@ -115,7 +117,7 @@ func (m *qosContainerManagerImpl) Start(nodeInfo *v1.Node, activePods ActivePods Burstable: path.Join(rootContainer, string(v1.PodQOSBurstable)), BestEffort: path.Join(rootContainer, string(v1.PodQOSBestEffort)), } - m.nodeInfo = nodeInfo + m.getNodeAllocatable = getNodeAllocatable m.activePods = activePods // update qos cgroup tiers on startup and in periodic intervals @@ -162,6 +164,85 @@ func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass] return nil } +// setMemoryReserve sums the memory requests of all pods in a QOS class, +// calculates QOS class memory limits, and sets those limits in the +// CgroupConfig for each QOS class. +func (m *qosContainerManagerImpl) setMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) { + qosMemoryRequests := map[v1.PodQOSClass]int64{ + v1.PodQOSGuaranteed: 0, + v1.PodQOSBurstable: 0, + } + + // Sum the memory requests for pods in each QOS class + pods := m.activePods() + for _, pod := range pods { + podMemoryRequest := int64(0) + qosClass := qos.GetPodQOS(pod) + if qosClass == v1.PodQOSBestEffort { + // requests are not set for Best Effort pods + continue + } + req, _, err := v1.PodRequestsAndLimits(pod) + if err != nil { + glog.V(2).Infof("[Container Manager] Pod resource requests/limits could not be determined.
Not setting QOS memory limits.") + return + } + if request, found := req[v1.ResourceMemory]; found { + podMemoryRequest += request.Value() + } + qosMemoryRequests[qosClass] += podMemoryRequest + } + + resources := m.getNodeAllocatable() + allocatableResource, ok := resources[v1.ResourceMemory] + if !ok { + glog.V(2).Infof("[Container Manager] Allocatable memory value could not be determined. Not setting QOS memory limits.") + return + } + allocatable := allocatableResource.Value() + if allocatable == 0 { + glog.V(2).Infof("[Container Manager] Memory allocatable reported as 0, might be in standalone mode. Not setting QOS memory limits.") + return + } + + for qos, requests := range qosMemoryRequests { + glog.V(2).Infof("[Container Manager] %s pod requests total %d bytes (reserve %d%%)", qos, requests, percentReserve) + } + + // Calculate QOS memory limits + burstableLimit := allocatable - (qosMemoryRequests[v1.PodQOSGuaranteed] * percentReserve / 100) + bestEffortLimit := burstableLimit - (qosMemoryRequests[v1.PodQOSBurstable] * percentReserve / 100) + configs[v1.PodQOSBurstable].ResourceParameters.Memory = &burstableLimit + configs[v1.PodQOSBestEffort].ResourceParameters.Memory = &bestEffortLimit +} + +// retrySetMemoryReserve checks for any QoS cgroups whose memory usage +// exceeds the limit attempted in the first Update() and adjusts their +// memory limit to the current usage to prevent further growth. +func (m *qosContainerManagerImpl) retrySetMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) { + // Unreclaimable memory usage may have already exceeded the desired limit. + // Attempt to set the limit near the current usage to put pressure + // on the cgroup and prevent further growth. + for qos, config := range configs { + stats, err := m.cgroupManager.GetResourceStats(config.Name) + if err != nil { + glog.V(2).Infof("[Container Manager] %v", err) + return + } + usage := stats.MemoryStats.Usage + + // Because there is no good way to determine if the original Update() + // on the memory resource was successful, we determine failure of the + // first attempt by checking if the usage is above the limit we attempt + // to set. If it is, we assume the first attempt to set the limit failed + // and try again setting the limit to the usage. Otherwise we leave + // the CgroupConfig as is. + if configs[qos].ResourceParameters.Memory != nil && usage > *configs[qos].ResourceParameters.Memory { + configs[qos].ResourceParameters.Memory = &usage + } + } +} + func (m *qosContainerManagerImpl) UpdateCgroups() error { m.Lock() defer m.Unlock() @@ -182,6 +263,34 @@ func (m *qosContainerManagerImpl) UpdateCgroups() error { return err } + for resource, percentReserve := range m.qosReserved { + switch resource { + case v1.ResourceMemory: + m.setMemoryReserve(qosConfigs, percentReserve) + } + } + updateSuccess := true + for _, config := range qosConfigs { + err := m.cgroupManager.Update(config) + if err != nil { + updateSuccess = false + } + } + if updateSuccess { + glog.V(2).Infof("[ContainerManager]: Updated QoS cgroup configuration") + return nil + } + + // If the resource can adjust the ResourceConfig to increase likelihood of + // success, call the adjustment function here. Otherwise, the Update() will + // be called again with the same values.
+ for resource, percentReserve := range m.qosReserved { + switch resource { + case v1.ResourceMemory: + m.retrySetMemoryReserve(qosConfigs, percentReserve) + } + } + for _, config := range qosConfigs { err := m.cgroupManager.Update(config) if err != nil { @@ -189,8 +298,8 @@ return err } } - glog.V(2).Infof("[ContainerManager]: Updated QoS cgroup configuration") + glog.V(2).Infof("[ContainerManager]: Updated QoS cgroup configuration on retry") return nil } @@ -204,7 +313,7 @@ func (m *qosContainerManagerNoop) GetQOSContainersInfo() QOSContainersInfo { return QOSContainersInfo{} } -func (m *qosContainerManagerNoop) Start(_ *v1.Node, _ ActivePodsFunc) error { +func (m *qosContainerManagerNoop) Start(_ func() v1.ResourceList, _ ActivePodsFunc) error { return nil } diff --git a/pkg/kubelet/cm/types.go b/pkg/kubelet/cm/types.go index 5330ea7affe..a2dd30b7b36 100644 --- a/pkg/kubelet/cm/types.go +++ b/pkg/kubelet/cm/types.go @@ -47,6 +47,18 @@ type CgroupConfig struct { ResourceParameters *ResourceConfig } +// MemoryStats holds the on-demand statistics from the memory cgroup +type MemoryStats struct { + // Memory usage (in bytes). + Usage int64 +} + +// ResourceStats holds on-demand statistics from various cgroup subsystems +type ResourceStats struct { + // Memory statistics. + MemoryStats *MemoryStats +} + // CgroupManager allows for cgroup management. // Supports Cgroup Creation ,Deletion and Updates. type CgroupManager interface { @@ -72,6 +84,8 @@ type CgroupManager interface { Pids(name CgroupName) []int // ReduceCPULimits reduces the CPU CFS values to the minimum amount of shares. ReduceCPULimits(cgroupName CgroupName) error + // GetResourceStats returns statistics of the specified cgroup as read from the cgroup fs. + GetResourceStats(name CgroupName) (*ResourceStats, error) } // QOSContainersInfo stores the names of containers per qos
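
Usage note: the reservation only takes effect when the kubelet is managing the top-level QoS cgroup tiers (the code path is guarded by NodeConfig.CgroupsPerQOS). A hypothetical invocation reserving all requested memory from lower QoS tiers (the spelling of the pre-existing cgroups-per-QoS flag varies by release):

    kubelet --cgroups-per-qos=true --experimental-qos-reserved=memory=100% ...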
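To see how a flag value flows through the new parsing code, here is a minimal sketch using componentconfig.ConfigurationMap (whose Set method is the flag-parsing entry point, exactly as the new test exercises it) and cm.ParseQOSReserved; the main wrapper and panic-based error handling are purely illustrative:

    package main

    import (
    	"fmt"

    	"k8s.io/kubernetes/pkg/api/v1"
    	"k8s.io/kubernetes/pkg/apis/componentconfig"
    	"k8s.io/kubernetes/pkg/kubelet/cm"
    )

    func main() {
    	// Set receives the raw --experimental-qos-reserved value.
    	m := componentconfig.ConfigurationMap{}
    	if err := m.Set("memory=50%"); err != nil {
    		panic(err)
    	}
    	// ParseQOSReserved validates resource names and percentages;
    	// anything other than memory, or a value outside 0%-100%, is rejected.
    	reserved, err := cm.ParseQOSReserved(m)
    	if err != nil {
    		panic(err)
    	}
    	fmt.Println((*reserved)[v1.ResourceMemory]) // prints 50
    }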
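To make the cascading limit arithmetic in setMemoryReserve concrete, the same calculation with hypothetical numbers (an 8Gi-allocatable node; all values below are illustrative, not taken from the PR):

    // Inputs: node allocatable and per-QoS-class sums of pod memory requests.
    allocatable := int64(8 << 30)        // 8Gi node allocatable memory
    guaranteedRequests := int64(1 << 30) // requests of Guaranteed pods
    burstableRequests := int64(2 << 30)  // requests of Burstable pods
    percentReserve := int64(50)          // --experimental-qos-reserved=memory=50%

    // Burstable may use everything not reserved for Guaranteed requests.
    burstableLimit := allocatable - (guaranteedRequests * percentReserve / 100) // 7.5Gi
    // BestEffort may use everything not also reserved for Burstable requests.
    bestEffortLimit := burstableLimit - (burstableRequests * percentReserve / 100) // 6.5Gi

With memory=100% the tiers reserve the full request sums; with memory=0% both limits collapse back to allocatable, i.e. no reservation.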
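The retry path relies on the new CgroupManager.GetResourceStats method to read current memory usage back from the cgroup fs. A minimal sketch of that call (the mgr variable and the cgroup name are hypothetical; error handling shortened):

    // mgr is a cm.CgroupManager, e.g. obtained from cm.NewCgroupManager(subsystems, cgroupDriver).
    stats, err := mgr.GetResourceStats(cm.CgroupName("/kubepods/besteffort")) // hypothetical name
    if err != nil {
    	return err // e.g. a supported subsystem mount was not found
    }
    usage := stats.MemoryStats.Usage // current memory usage in bytes
    // retrySetMemoryReserve clamps a QoS cgroup's memory limit to this usage
    // whenever the first Update() could not push the limit below it.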