From 93accb51e4c24bc36b730812a44381b7780ca918 Mon Sep 17 00:00:00 2001 From: Artyom Lukianov Date: Thu, 8 Oct 2020 18:37:36 +0300 Subject: [PATCH] memory manager: add memory manager flag under kubelet options and kubelet config The commit also includes generated files after `make generated_files`. Signed-off-by: Byonggon Chun --- api/api-rules/violation_exceptions.list | 1 + cmd/kubelet/app/options/options.go | 5 ++ cmd/kubelet/app/server.go | 78 +++++++++++++++++-- pkg/features/kube_features.go | 6 ++ pkg/kubelet/apis/config/types.go | 9 +++ .../config/v1beta1/zz_generated.conversion.go | 4 + .../apis/config/zz_generated.deepcopy.go | 13 ++++ .../k8s.io/kubelet/config/v1beta1/types.go | 22 ++++++ .../config/v1beta1/zz_generated.deepcopy.go | 13 ++++ 9 files changed, 143 insertions(+), 8 deletions(-) diff --git a/api/api-rules/violation_exceptions.list b/api/api-rules/violation_exceptions.list index 65aea40c25e..665da4da15c 100644 --- a/api/api-rules/violation_exceptions.list +++ b/api/api-rules/violation_exceptions.list @@ -392,6 +392,7 @@ API rule violation: list_type_missing,k8s.io/kubelet/config/v1alpha1,CredentialP API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfiguration,AllowedUnsafeSysctls API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfiguration,ClusterDNS API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfiguration,EnforceNodeAllocatable +API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfiguration,ReservedMemory API rule violation: list_type_missing,k8s.io/kubelet/config/v1beta1,KubeletConfiguration,TLSCipherSuites API rule violation: list_type_missing,k8s.io/metrics/pkg/apis/metrics/v1alpha1,PodMetrics,Containers API rule violation: list_type_missing,k8s.io/metrics/pkg/apis/metrics/v1beta1,PodMetrics,Containers diff --git a/cmd/kubelet/app/options/options.go b/cmd/kubelet/app/options/options.go index 842386af3be..471d083f5a6 100644 --- 
a/cmd/kubelet/app/options/options.go +++ b/cmd/kubelet/app/options/options.go @@ -550,4 +550,9 @@ Runtime log sanitization may introduce significant computation overhead and ther // Graduated experimental flags, kept for backward compatibility fs.BoolVar(&c.KernelMemcgNotification, "experimental-kernel-memcg-notification", c.KernelMemcgNotification, "Use kernelMemcgNotification configuration, this flag will be removed in 1.23.") + + // Memory Manager Flags + fs.StringVar(&c.MemoryManagerPolicy, "memory-manager-policy", c.MemoryManagerPolicy, "Memory Manager policy to use. Possible values: 'none', 'static'. Default: 'none'") + // TODO: once documentation link is available, replace KEP link with the documentation one. + fs.Var(cliflag.NewBracketSeparatedSliceMapStringString(&c.ReservedMemory), "reserved-memory", "A comma separated list of bracket-enclosed configuration for memory manager (e.g. {numa-node=0, type=memory, limit=1Gi}, {numa-node=1, type=memory, limit=1Gi}). The total sum for each memory type should be equal to the sum of kube-reserved, system-reserved and eviction-threshold. 
See more details under https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/1769-memory-manager#reserved-memory-flag") } diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go index ff7e05feec1..127dfc84f8b 100644 --- a/cmd/kubelet/app/server.go +++ b/cmd/kubelet/app/server.go @@ -71,6 +71,7 @@ import ( "k8s.io/kubernetes/cmd/kubelet/app/options" "k8s.io/kubernetes/pkg/api/legacyscheme" api "k8s.io/kubernetes/pkg/apis/core" + corev1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" "k8s.io/kubernetes/pkg/capabilities" "k8s.io/kubernetes/pkg/credentialprovider" "k8s.io/kubernetes/pkg/features" @@ -687,6 +688,12 @@ func run(ctx context.Context, s *options.KubeletServer, kubeDeps *kubelet.Depend s.SystemReserved["cpu"] = strconv.Itoa(reservedSystemCPUs.Size()) klog.Infof("After cpu setting is overwritten, KubeReserved=\"%v\", SystemReserved=\"%v\"", s.KubeReserved, s.SystemReserved) } + + reservedMemory, err := parseReservedMemoryConfig(s.ReservedMemory) + if err != nil { + return err + } + kubeReserved, err := parseResourceList(s.KubeReserved) if err != nil { return err @@ -732,14 +739,16 @@ func run(ctx context.Context, s *options.KubeletServer, kubeDeps *kubelet.Depend ReservedSystemCPUs: reservedSystemCPUs, HardEvictionThresholds: hardEvictionThresholds, }, - QOSReserved: *experimentalQOSReserved, - ExperimentalCPUManagerPolicy: s.CPUManagerPolicy, - ExperimentalCPUManagerReconcilePeriod: s.CPUManagerReconcilePeriod.Duration, - ExperimentalPodPidsLimit: s.PodPidsLimit, - EnforceCPULimits: s.CPUCFSQuota, - CPUCFSQuotaPeriod: s.CPUCFSQuotaPeriod.Duration, - ExperimentalTopologyManagerPolicy: s.TopologyManagerPolicy, - ExperimentalTopologyManagerScope: s.TopologyManagerScope, + QOSReserved: *experimentalQOSReserved, + ExperimentalCPUManagerPolicy: s.CPUManagerPolicy, + ExperimentalCPUManagerReconcilePeriod: s.CPUManagerReconcilePeriod.Duration, + ExperimentalMemoryManagerPolicy: s.MemoryManagerPolicy, + 
ExperimentalMemoryManagerReservedMemory: reservedMemory, + ExperimentalPodPidsLimit: s.PodPidsLimit, + EnforceCPULimits: s.CPUCFSQuota, + CPUCFSQuotaPeriod: s.CPUCFSQuotaPeriod.Duration, + ExperimentalTopologyManagerPolicy: s.TopologyManagerPolicy, + ExperimentalTopologyManagerScope: s.TopologyManagerScope, }, s.FailSwapOn, devicePluginEnabled, @@ -1296,6 +1305,67 @@ func parseResourceList(m map[string]string) (v1.ResourceList, error) { return rl, nil } +// parseReservedMemoryConfig converts the entries of the reserved-memory flag into a map keyed by +// NUMA node index and then by memory resource name. Every entry must contain the numa-node, type and +// limit keys; the type must be the memory resource or a huge page resource whose size can be parsed, +// and the limit must be a positive quantity. An empty configuration yields a nil map and no error. +func parseReservedMemoryConfig(config []map[string]string) (map[int]map[v1.ResourceName]resource.Quantity, error) { + if len(config) == 0 { + return nil, nil + } + + const ( + indexKey = "numa-node" + typeKey = "type" + limitKey = "limit" + ) + + keys := []string{indexKey, typeKey, limitKey} + + // check whether all keys are present + for _, m := range config { + for _, key := range keys { + if _, exist := m[key]; !exist { + return nil, fmt.Errorf("key: %s is missing in given ReservedMemory flag: %v", key, config) + } + } + } + + parsed := make(map[int]map[v1.ResourceName]resource.Quantity, len(config)) + for _, m := range config { + idxInString := m[indexKey] + idx, err := strconv.Atoi(idxInString) + if err != nil || idx < 0 { + return nil, fmt.Errorf("NUMA index conversion error for value: \"%s\"", idxInString) + } + + typeInString := m[typeKey] + v1Type := v1.ResourceName(typeInString) + if v1Type != v1.ResourceMemory && !corev1helper.IsHugePageResourceName(v1Type) { + return nil, fmt.Errorf("memory type conversion error, unknown type: \"%s\"", typeInString) + } + if corev1helper.IsHugePageResourceName(v1Type) { + if _, err := corev1helper.HugePageSizeFromResourceName(v1Type); err != nil { + return nil, fmt.Errorf("memory type conversion error, unknown type: \"%s\"", typeInString) + } + } + + limitInString := m[limitKey] + limit, err := resource.ParseQuantity(limitInString) + if err != nil || limit.Sign() != 1 { + return nil, fmt.Errorf("memory limit conversion error for value \"%s\"", limitInString) +
} + // A NUMA node may appear in several entries (one per memory type), so only + // allocate the inner map the first time the node is seen. + if _, ok := parsed[idx]; !ok { + parsed[idx] = make(map[v1.ResourceName]resource.Quantity) + } + parsed[idx][v1Type] = limit + } + + return parsed, nil +} + // BootstrapKubeletConfigController constructs and bootstrap a configuration controller func BootstrapKubeletConfigController(dynamicConfigDir string, transform dynamickubeletconfig.TransformFunc) (*kubeletconfiginternal.KubeletConfiguration, *dynamickubeletconfig.Controller, error) { if !utilfeature.DefaultFeatureGate.Enabled(features.DynamicKubeletConfig) { diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index 122d1d11975..f1a47fbdb8c 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -123,6 +123,12 @@ const ( // Enable resource managers to make NUMA aligned decisions TopologyManager featuregate.Feature = "TopologyManager" + // owner: @cynepco3hahue(alukiano) @cezaryzukowski @k-wiatrzyk + // alpha: v1.20 + // + // Allows to set containers memory affinity according to NUMA topology + MemoryManager featuregate.Feature = "MemoryManager" + // owner: @sjenning // beta: v1.11 // diff --git a/pkg/kubelet/apis/config/types.go b/pkg/kubelet/apis/config/types.go index d518a6cf412..13a5fbbd280 100644 --- a/pkg/kubelet/apis/config/types.go +++ b/pkg/kubelet/apis/config/types.go @@ -224,6 +224,9 @@ type KubeletConfiguration struct { // CPU Manager reconciliation period. // Requires the CPUManager feature gate to be enabled. CPUManagerReconcilePeriod metav1.Duration + // MemoryManagerPolicy is the name of the policy to use. + // Requires the MemoryManager feature gate to be enabled. + MemoryManagerPolicy string // TopologyManagerPolicy is the name of the policy to use. // Policies other than "none" require the TopologyManager feature gate to be enabled. TopologyManagerPolicy string @@ -382,6 +385,12 @@ type KubeletConfiguration struct { // Defaults to 10 seconds, requires GracefulNodeShutdown feature gate to be enabled.
// For example, if ShutdownGracePeriod=30s, and ShutdownGracePeriodCriticalPods=10s, during a node shutdown the first 20 seconds would be reserved for gracefully terminating normal pods, and the last 10 seconds would be reserved for terminating critical pods. ShutdownGracePeriodCriticalPods metav1.Duration + // A comma separated list of bracket-enclosed configurations for memory manager. + // Each configuration describes pre-reserved memory for a particular memory type on a specific NUMA node. + // The Memory Manager validates whether total amount of pre-reserved memory is identical to reserved-memory by the Node Allocatable feature. + // The format is {numa-node=integer, type=string, limit=string} + // (e.g. {numa-node=0, type=memory, limit=1Gi}, {numa-node=1, type=memory, limit=1Gi}) + ReservedMemory []map[string]string } // KubeletAuthorizationMode denotes the authorization mode for the kubelet diff --git a/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go b/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go index 09aae527ecc..d0865b25746 100644 --- a/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go +++ b/pkg/kubelet/apis/config/v1beta1/zz_generated.conversion.go @@ -274,6 +274,7 @@ func autoConvert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(in out.CgroupDriver = in.CgroupDriver out.CPUManagerPolicy = in.CPUManagerPolicy out.CPUManagerReconcilePeriod = in.CPUManagerReconcilePeriod + out.MemoryManagerPolicy = in.MemoryManagerPolicy out.TopologyManagerPolicy = in.TopologyManagerPolicy out.TopologyManagerScope = in.TopologyManagerScope out.QOSReserved = *(*map[string]string)(unsafe.Pointer(&in.QOSReserved)) @@ -352,6 +353,7 @@ func autoConvert_v1beta1_KubeletConfiguration_To_config_KubeletConfiguration(in } out.ShutdownGracePeriod = in.ShutdownGracePeriod out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods + out.ReservedMemory = *(*[]map[string]string)(unsafe.Pointer(&in.ReservedMemory))
return nil } @@ -429,6 +431,7 @@ func autoConvert_config_KubeletConfiguration_To_v1beta1_KubeletConfiguration(in out.CgroupDriver = in.CgroupDriver out.CPUManagerPolicy = in.CPUManagerPolicy out.CPUManagerReconcilePeriod = in.CPUManagerReconcilePeriod + out.MemoryManagerPolicy = in.MemoryManagerPolicy out.TopologyManagerPolicy = in.TopologyManagerPolicy out.TopologyManagerScope = in.TopologyManagerScope out.QOSReserved = *(*map[string]string)(unsafe.Pointer(&in.QOSReserved)) @@ -505,6 +508,7 @@ func autoConvert_config_KubeletConfiguration_To_v1beta1_KubeletConfiguration(in } out.ShutdownGracePeriod = in.ShutdownGracePeriod out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods + out.ReservedMemory = *(*[]map[string]string)(unsafe.Pointer(&in.ReservedMemory)) return nil } diff --git a/pkg/kubelet/apis/config/zz_generated.deepcopy.go b/pkg/kubelet/apis/config/zz_generated.deepcopy.go index e458d832294..5dc85843b6e 100644 --- a/pkg/kubelet/apis/config/zz_generated.deepcopy.go +++ b/pkg/kubelet/apis/config/zz_generated.deepcopy.go @@ -273,6 +273,19 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) { out.Logging = in.Logging out.ShutdownGracePeriod = in.ShutdownGracePeriod out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods + if in.ReservedMemory != nil { + in, out := &in.ReservedMemory, &out.ReservedMemory + *out = make([]map[string]string, len(*in)) + for i := range *in { + if (*in)[i] != nil { + in, out := &(*in)[i], &(*out)[i] + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + } + } return } diff --git a/staging/src/k8s.io/kubelet/config/v1beta1/types.go b/staging/src/k8s.io/kubelet/config/v1beta1/types.go index aa37d6df280..c75d129f235 100644 --- a/staging/src/k8s.io/kubelet/config/v1beta1/types.go +++ b/staging/src/k8s.io/kubelet/config/v1beta1/types.go @@ -73,6 +73,13 @@ const ( // PodTopologyManagerScope represents that // topology policy is 
applied on a per-pod basis. PodTopologyManagerScope = "pod" + // NoneMemoryManagerPolicy is a memory manager none policy, under the none policy + // the memory manager will not pin containers memory of guaranteed pods + NoneMemoryManagerPolicy = "none" + // StaticMemoryManagerPolicy is a memory manager static policy, under the static policy + // the memory manager will try to pin containers memory of guaranteed pods to the smallest + // possible sub-set of NUMA nodes + StaticMemoryManagerPolicy = "static" ) // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object @@ -433,6 +440,13 @@ type KubeletConfiguration struct { // Default: "10s" // +optional CPUManagerReconcilePeriod metav1.Duration `json:"cpuManagerReconcilePeriod,omitempty"` + // MemoryManagerPolicy is the name of the policy to use by memory manager. + // Requires the MemoryManager feature gate to be enabled. + // Dynamic Kubelet Config (beta): This field should not be updated without a full node + // reboot. It is safest to keep this value the same as the local config. + // Default: "none" + // +optional + MemoryManagerPolicy string `json:"memoryManagerPolicy,omitempty"` // TopologyManagerPolicy is the name of the policy to use. // Policies other than "none" require the TopologyManager feature gate to be enabled. // Dynamic Kubelet Config (beta): This field should not be updated without a full node @@ -824,6 +838,14 @@ type KubeletConfiguration struct { // Default: "10s" // +optional ShutdownGracePeriodCriticalPods metav1.Duration `json:"shutdownGracePeriodCriticalPods,omitempty"` + // A comma separated list of bracket-enclosed configurations for memory manager. + // Each configuration describes pre-reserved memory for a certain memory type on a specific NUMA node. + // The Memory Manager validates whether total amount of pre-reserved memory is identical to reserved-memory by the Node Allocatable feature. + // The format is {numa-node=integer, type=string, limit=string} + // (e.g.
{numa-node=0, type=memory, limit=1Gi}, {numa-node=1, type=memory, limit=1Gi}) + // Default: nil + // +optional + ReservedMemory []map[string]string `json:"reservedMemory,omitempty"` } type KubeletAuthorizationMode string diff --git a/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go b/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go index a6ad075c9ad..a6c7f56023e 100644 --- a/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go +++ b/staging/src/k8s.io/kubelet/config/v1beta1/zz_generated.deepcopy.go @@ -303,6 +303,19 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) { } out.ShutdownGracePeriod = in.ShutdownGracePeriod out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods + if in.ReservedMemory != nil { + in, out := &in.ReservedMemory, &out.ReservedMemory + *out = make([]map[string]string, len(*in)) + for i := range *in { + if (*in)[i] != nil { + in, out := &(*in)[i], &(*out)[i] + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + } + } return }