diff --git a/cmd/kubelet/app/options/options.go b/cmd/kubelet/app/options/options.go index 9465856b1c6..069f4d05a49 100644 --- a/cmd/kubelet/app/options/options.go +++ b/cmd/kubelet/app/options/options.go @@ -225,8 +225,7 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) { fs.Float64Var(&s.ChaosChance, "chaos-chance", s.ChaosChance, "If > 0.0, introduce random client errors and latency. Intended for testing. [default=0.0]") fs.BoolVar(&s.Containerized, "containerized", s.Containerized, "Experimental support for running kubelet in a container. Intended for testing. [default=false]") fs.Int64Var(&s.MaxOpenFiles, "max-open-files", s.MaxOpenFiles, "Number of files that can be opened by Kubelet process. [default=1000000]") - fs.Var(&s.SystemReserved, "system-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs that describe resources reserved for non-kubernetes components. Currently only cpu and memory are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]") - fs.Var(&s.KubeReserved, "kube-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs that describe resources reserved for kubernetes system components. Currently only cpu and memory are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]") + fs.BoolVar(&s.RegisterSchedulable, "register-schedulable", s.RegisterSchedulable, "Register the node as schedulable. Won't have any effect if register-node is false. [default=true]") fs.MarkDeprecated("register-schedulable", "will be removed in a future version") fs.Var(utiltaints.NewTaintsVar(&s.RegisterWithTaints), "register-with-taints", "Register the node with the given list of taints (comma seperated \"=:\"). No-op if register-node is false.") @@ -264,4 +263,13 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) { fs.StringVar(&s.RemoteImageEndpoint, "image-service-endpoint", s.RemoteImageEndpoint, "[Experimental] The unix socket endpoint of remote image service. If not specified, it will be the same with container-runtime-endpoint by default. The endpoint is used only when CRI integration is enabled (--enable-cri)") fs.BoolVar(&s.ExperimentalCheckNodeCapabilitiesBeforeMount, "experimental-check-node-capabilities-before-mount", s.ExperimentalCheckNodeCapabilitiesBeforeMount, "[Experimental] if set true, the kubelet will check the underlying node for required componenets (binaries, etc.) before performing the mount") + + // Node Allocatable Flags + fs.Var(&s.SystemReserved, "system-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs that describe resources reserved for non-kubernetes components. Currently only cpu and memory are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]") + fs.Var(&s.KubeReserved, "kube-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs that describe resources reserved for kubernetes system components. Currently only cpu and memory are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]") + + fs.StringSliceVar(&s.EnforceNodeAllocatable, "enforce-node-allocatable", s.EnforceNodeAllocatable, "A comma separated list of levels of node allocatable enforcement to be enforced by kubelet. Acceptible options are 'pods', 'system-reserved' & 'kube-reserved'. If the latter two options are specified, '--system-reserved-cgroup' & '--kube-reserved-cgroup' must also be set respectively. See https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md for more details. [default='']") + fs.StringVar(&s.SystemReservedCgroup, "system-reserved-cgroup", s.SystemReservedCgroup, "Absolute name of the top level cgroup that is used to manage non-kubernetes components for which compute resources were reserved via '--system-reserved' flag. Ex. '/system-reserved'. [default='']") + fs.StringVar(&s.KubeReservedCgroup, "kube-reserved-cgroup", s.KubeReservedCgroup, "Absolute name of the top level cgroup that is used to manage kubernetes components for which compute resources were reserved via '--kube-reserved' flag. Ex. '/kube-reserved'. [default='']") + fs.BoolVar(&s.ExperimentalNodeAllocatableIgnoreEvictionThreshold, "experimental-node-allocatable-ignore-eviction-threshold", s.ExperimentalNodeAllocatableIgnoreEvictionThreshold, "When set to 'true', Hard Eviction Thresholds will be ignored while calculating Node Allocatable. See https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md for more details. [default=false]") } diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go index 6d88325d104..2bc489741e1 100644 --- a/cmd/kubelet/app/server.go +++ b/cmd/kubelet/app/server.go @@ -40,6 +40,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/apiserver/pkg/server/healthz" utilfeature "k8s.io/apiserver/pkg/util/feature" @@ -469,6 +470,14 @@ func run(s *options.KubeletServer, kubeDeps *kubelet.KubeletDeps) (err error) { CgroupDriver: s.CgroupDriver, ProtectKernelDefaults: s.ProtectKernelDefaults, EnableCRI: s.EnableCRI, + NodeAllocatableConfig: cm.NodeAllocatableConfig{ + KubeReservedCgroupName: s.KubeReservedCgroup, + SystemReservedCgroupName: s.SystemReservedCgroup, + EnforceNodeAllocatable: sets.NewString(s.EnforceNodeAllocatable...), + KubeReserved: kubeReserved, + SystemReserved: systemReserved, + HardEvictionThresholds: hardEvictionThresholds, + }, }, s.ExperimentalFailSwapOn) diff --git a/pkg/apis/componentconfig/types.go b/pkg/apis/componentconfig/types.go index 53b0ff3cc7f..88ea80e8f2d 100644 --- a/pkg/apis/componentconfig/types.go +++ b/pkg/apis/componentconfig/types.go @@ -442,16 +442,6 @@ type KubeletConfiguration struct { // manage attachment/detachment of volumes scheduled to this node, and // disables kubelet from executing any attach/detach operations EnableControllerAttachDetach bool - // A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs - // that describe resources reserved for non-kubernetes components. - // Currently only cpu and memory are supported. [default=none] - // See http://kubernetes.io/docs/user-guide/compute-resources for more detail. - SystemReserved ConfigurationMap - // A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs - // that describe resources reserved for kubernetes system components. - // Currently only cpu and memory are supported. [default=none] - // See http://kubernetes.io/docs/user-guide/compute-resources for more detail. - KubeReserved ConfigurationMap // Default behaviour for kernel tuning ProtectKernelDefaults bool // If true, Kubelet ensures a set of iptables rules are present on host. @@ -485,6 +475,32 @@ type KubeletConfiguration struct { // This flag, if set, instructs the kubelet to keep volumes from terminated pods mounted to the node. // This can be useful for debugging volume related issues. KeepTerminatedPodVolumes bool + + /* following flags are meant for Node Allocatable */ + + // A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs + // that describe resources reserved for non-kubernetes components. + // Currently only cpu and memory are supported. [default=none] + // See http://kubernetes.io/docs/user-guide/compute-resources for more detail. + SystemReserved ConfigurationMap + // A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs + // that describe resources reserved for kubernetes system components. + // Currently only cpu and memory are supported. [default=none] + // See http://kubernetes.io/docs/user-guide/compute-resources for more detail. + KubeReserved ConfigurationMap + // This flag helps kubelet identify absolute name of top level cgroup used to enforce `SystemReserved` compute resource reservation for OS system daemons. + // Refer to [Node Allocatable](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) doc for more information. + SystemReservedCgroup string + // This flag helps kubelet identify absolute name of top level cgroup used to enforce `KubeReserved` compute resource reservation for Kubernetes node system daemons. + // Refer to [Node Allocatable](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) doc for more information. + KubeReservedCgroup string + // This flag specifies the various Node Allocatable enforcements that Kubelet needs to perform. + // This flag accepts a list of options. Acceptible options are `pods`, `system-reserved` & `kube-reserved`. + // Refer to [Node Allocatable](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) doc for more information. + EnforceNodeAllocatable []string + // This flag, if set, will avoid including `EvictionHard` limits while computing Node Allocatable. + // Refer to [Node Allocatable](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) doc for more information. + ExperimentalNodeAllocatableIgnoreEvictionThreshold bool } type KubeletAuthorizationMode string diff --git a/pkg/apis/componentconfig/v1alpha1/defaults.go b/pkg/apis/componentconfig/v1alpha1/defaults.go index 9166d018c87..7a2f8b84d56 100644 --- a/pkg/apis/componentconfig/v1alpha1/defaults.go +++ b/pkg/apis/componentconfig/v1alpha1/defaults.go @@ -48,7 +48,12 @@ const ( defaultIPTablesDropBit = 15 ) -var zeroDuration = metav1.Duration{} +var ( + zeroDuration = metav1.Duration{} + // Refer to [Node Allocatable](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) doc for more information. + // TODO: Set the default to "pods" once cgroups per qos is turned on by default. + defaultNodeAllocatableEnforcement = []string{} +) func addDefaultingFuncs(scheme *kruntime.Scheme) error { RegisterDefaults(scheme) @@ -401,6 +406,9 @@ func SetDefaults_KubeletConfiguration(obj *KubeletConfiguration) { if obj.CgroupDriver == "" { obj.CgroupDriver = "cgroupfs" } + if obj.EnforceNodeAllocatable == nil { + obj.EnforceNodeAllocatable = defaultNodeAllocatableEnforcement + } if obj.EnableCRI == nil { obj.EnableCRI = boolVar(true) } diff --git a/pkg/apis/componentconfig/v1alpha1/types.go b/pkg/apis/componentconfig/v1alpha1/types.go index 5fcbcac5e3d..28b7499a3ef 100644 --- a/pkg/apis/componentconfig/v1alpha1/types.go +++ b/pkg/apis/componentconfig/v1alpha1/types.go @@ -478,16 +478,6 @@ type KubeletConfiguration struct { // manage attachment/detachment of volumes scheduled to this node, and // disables kubelet from executing any attach/detach operations EnableControllerAttachDetach *bool `json:"enableControllerAttachDetach"` - // A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs - // that describe resources reserved for non-kubernetes components. - // Currently only cpu and memory are supported. [default=none] - // See http://kubernetes.io/docs/user-guide/compute-resources for more detail. - SystemReserved map[string]string `json:"systemReserved"` - // A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs - // that describe resources reserved for kubernetes system components. - // Currently only cpu and memory are supported. [default=none] - // See http://kubernetes.io/docs/user-guide/compute-resources for more detail. - KubeReserved map[string]string `json:"kubeReserved"` // Default behaviour for kernel tuning ProtectKernelDefaults bool `json:"protectKernelDefaults"` // If true, Kubelet ensures a set of iptables rules are present on host. @@ -522,6 +512,33 @@ type KubeletConfiguration struct { // This flag, if set, instructs the kubelet to keep volumes from terminated pods mounted to the node. // This can be useful for debugging volume related issues. KeepTerminatedPodVolumes bool `json:"keepTerminatedPodVolumes,omitempty"` + + /* following flags are meant for Node Allocatable */ + + // A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs + // that describe resources reserved for non-kubernetes components. + // Currently only cpu and memory are supported. [default=none] + // See http://kubernetes.io/docs/user-guide/compute-resources for more detail. + SystemReserved map[string]string `json:"systemReserved"` + // A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs + // that describe resources reserved for kubernetes system components. + // Currently only cpu and memory are supported. [default=none] + // See http://kubernetes.io/docs/user-guide/compute-resources for more detail. + KubeReserved map[string]string `json:"kubeReserved"` + + // This flag helps kubelet identify absolute name of top level cgroup used to enforce `SystemReserved` compute resource reservation for OS system daemons. + // Refer to [Node Allocatable](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) doc for more information. + SystemReservedCgroup string `json:"systemReservedCgroup,omitempty"` + // This flag helps kubelet identify absolute name of top level cgroup used to enforce `KubeReserved` compute resource reservation for Kubernetes node system daemons. + // Refer to [Node Allocatable](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) doc for more information. + KubeReservedCgroup string `json:"kubeReservedCgroup,omitempty"` + // This flag specifies the various Node Allocatable enforcements that Kubelet needs to perform. + // This flag accepts a list of options. Acceptible options are `pods`, `system-reserved` & `kube-reserved`. + // Refer to [Node Allocatable](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) doc for more information. + EnforceNodeAllocatable []string `json:"enforceNodeAllocatable,omitempty"` + // This flag, if set, will avoid including `EvictionHard` limits while computing Node Allocatable. + // Refer to [Node Allocatable](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) doc for more information. + ExperimentalNodeAllocatableIgnoreEvictionThreshold bool `json:"experimentalNodeAllocatableIgnoreEvictionThreshold,omitempty"` } type KubeletAuthorizationMode string diff --git a/pkg/apis/componentconfig/v1alpha1/zz_generated.conversion.go b/pkg/apis/componentconfig/v1alpha1/zz_generated.conversion.go index 12469913e37..ac958c3bf93 100644 --- a/pkg/apis/componentconfig/v1alpha1/zz_generated.conversion.go +++ b/pkg/apis/componentconfig/v1alpha1/zz_generated.conversion.go @@ -396,8 +396,6 @@ func autoConvert_v1alpha1_KubeletConfiguration_To_componentconfig_KubeletConfigu if err := v1.Convert_Pointer_bool_To_bool(&in.EnableControllerAttachDetach, &out.EnableControllerAttachDetach, s); err != nil { return err } - out.SystemReserved = *(*componentconfig.ConfigurationMap)(unsafe.Pointer(&in.SystemReserved)) - out.KubeReserved = *(*componentconfig.ConfigurationMap)(unsafe.Pointer(&in.KubeReserved)) out.ProtectKernelDefaults = in.ProtectKernelDefaults if err := v1.Convert_Pointer_bool_To_bool(&in.MakeIPTablesUtilChains, &out.MakeIPTablesUtilChains, s); err != nil { return err @@ -416,6 +414,12 @@ func autoConvert_v1alpha1_KubeletConfiguration_To_componentconfig_KubeletConfigu out.ExperimentalFailSwapOn = in.ExperimentalFailSwapOn out.ExperimentalCheckNodeCapabilitiesBeforeMount = in.ExperimentalCheckNodeCapabilitiesBeforeMount out.KeepTerminatedPodVolumes = in.KeepTerminatedPodVolumes + out.SystemReserved = *(*componentconfig.ConfigurationMap)(unsafe.Pointer(&in.SystemReserved)) + out.KubeReserved = *(*componentconfig.ConfigurationMap)(unsafe.Pointer(&in.KubeReserved)) + out.SystemReservedCgroup = in.SystemReservedCgroup + out.KubeReservedCgroup = in.KubeReservedCgroup + out.EnforceNodeAllocatable = *(*[]string)(unsafe.Pointer(&in.EnforceNodeAllocatable)) + out.ExperimentalNodeAllocatableIgnoreEvictionThreshold = in.ExperimentalNodeAllocatableIgnoreEvictionThreshold return nil } @@ -570,8 +574,6 @@ func autoConvert_componentconfig_KubeletConfiguration_To_v1alpha1_KubeletConfigu if err := v1.Convert_bool_To_Pointer_bool(&in.EnableControllerAttachDetach, &out.EnableControllerAttachDetach, s); err != nil { return err } - out.SystemReserved = *(*map[string]string)(unsafe.Pointer(&in.SystemReserved)) - out.KubeReserved = *(*map[string]string)(unsafe.Pointer(&in.KubeReserved)) out.ProtectKernelDefaults = in.ProtectKernelDefaults if err := v1.Convert_bool_To_Pointer_bool(&in.MakeIPTablesUtilChains, &out.MakeIPTablesUtilChains, s); err != nil { return err @@ -590,6 +592,12 @@ func autoConvert_componentconfig_KubeletConfiguration_To_v1alpha1_KubeletConfigu out.ExperimentalFailSwapOn = in.ExperimentalFailSwapOn out.ExperimentalCheckNodeCapabilitiesBeforeMount = in.ExperimentalCheckNodeCapabilitiesBeforeMount out.KeepTerminatedPodVolumes = in.KeepTerminatedPodVolumes + out.SystemReserved = *(*map[string]string)(unsafe.Pointer(&in.SystemReserved)) + out.KubeReserved = *(*map[string]string)(unsafe.Pointer(&in.KubeReserved)) + out.SystemReservedCgroup = in.SystemReservedCgroup + out.KubeReservedCgroup = in.KubeReservedCgroup + out.EnforceNodeAllocatable = *(*[]string)(unsafe.Pointer(&in.EnforceNodeAllocatable)) + out.ExperimentalNodeAllocatableIgnoreEvictionThreshold = in.ExperimentalNodeAllocatableIgnoreEvictionThreshold return nil } diff --git a/pkg/apis/componentconfig/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/componentconfig/v1alpha1/zz_generated.deepcopy.go index 332e2823a04..c21f1f14dc5 100644 --- a/pkg/apis/componentconfig/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/componentconfig/v1alpha1/zz_generated.deepcopy.go @@ -266,20 +266,6 @@ func DeepCopy_v1alpha1_KubeletConfiguration(in interface{}, out interface{}, c * *out = new(bool) **out = **in } - if in.SystemReserved != nil { - in, out := &in.SystemReserved, &out.SystemReserved - *out = make(map[string]string) - for key, val := range *in { - (*out)[key] = val - } - } - if in.KubeReserved != nil { - in, out := &in.KubeReserved, &out.KubeReserved - *out = make(map[string]string) - for key, val := range *in { - (*out)[key] = val - } - } if in.MakeIPTablesUtilChains != nil { in, out := &in.MakeIPTablesUtilChains, &out.MakeIPTablesUtilChains *out = new(bool) @@ -305,6 +291,25 @@ func DeepCopy_v1alpha1_KubeletConfiguration(in interface{}, out interface{}, c * *out = new(bool) **out = **in } + if in.SystemReserved != nil { + in, out := &in.SystemReserved, &out.SystemReserved + *out = make(map[string]string) + for key, val := range *in { + (*out)[key] = val + } + } + if in.KubeReserved != nil { + in, out := &in.KubeReserved, &out.KubeReserved + *out = make(map[string]string) + for key, val := range *in { + (*out)[key] = val + } + } + if in.EnforceNodeAllocatable != nil { + in, out := &in.EnforceNodeAllocatable, &out.EnforceNodeAllocatable + *out = make([]string, len(*in)) + copy(*out, *in) + } return nil } } diff --git a/pkg/apis/componentconfig/zz_generated.deepcopy.go b/pkg/apis/componentconfig/zz_generated.deepcopy.go index 3f542578ccb..55b646d61fb 100644 --- a/pkg/apis/componentconfig/zz_generated.deepcopy.go +++ b/pkg/apis/componentconfig/zz_generated.deepcopy.go @@ -177,6 +177,11 @@ func DeepCopy_componentconfig_KubeletConfiguration(in interface{}, out interface (*out)[key] = val } } + if in.AllowedUnsafeSysctls != nil { + in, out := &in.AllowedUnsafeSysctls, &out.AllowedUnsafeSysctls + *out = make([]string, len(*in)) + copy(*out, *in) + } if in.SystemReserved != nil { in, out := &in.SystemReserved, &out.SystemReserved *out = make(ConfigurationMap) @@ -191,8 +196,8 @@ func DeepCopy_componentconfig_KubeletConfiguration(in interface{}, out interface (*out)[key] = val } } - if in.AllowedUnsafeSysctls != nil { - in, out := &in.AllowedUnsafeSysctls, &out.AllowedUnsafeSysctls + if in.EnforceNodeAllocatable != nil { + in, out := &in.EnforceNodeAllocatable, &out.EnforceNodeAllocatable *out = make([]string, len(*in)) copy(*out, *in) } diff --git a/pkg/generated/openapi/zz_generated.openapi.go b/pkg/generated/openapi/zz_generated.openapi.go index bf20ec9119d..1551eb38e69 100644 --- a/pkg/generated/openapi/zz_generated.openapi.go +++ b/pkg/generated/openapi/zz_generated.openapi.go @@ -13283,34 +13283,6 @@ func GetOpenAPIDefinitions(ref openapi.ReferenceCallback) map[string]openapi.Ope Format: "", }, }, - "systemReserved": { - SchemaProps: spec.SchemaProps{ - Description: "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs that describe resources reserved for non-kubernetes components. Currently only cpu and memory are supported. [default=none] See http://kubernetes.io/docs/user-guide/compute-resources for more detail.", - Type: []string{"object"}, - AdditionalProperties: &spec.SchemaOrBool{ - Schema: &spec.Schema{ - SchemaProps: spec.SchemaProps{ - Type: []string{"string"}, - Format: "", - }, - }, - }, - }, - }, - "kubeReserved": { - SchemaProps: spec.SchemaProps{ - Description: "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs that describe resources reserved for kubernetes system components. Currently only cpu and memory are supported. [default=none] See http://kubernetes.io/docs/user-guide/compute-resources for more detail.", - Type: []string{"object"}, - AdditionalProperties: &spec.SchemaOrBool{ - Schema: &spec.Schema{ - SchemaProps: spec.SchemaProps{ - Type: []string{"string"}, - Format: "", - }, - }, - }, - }, - }, "protectKernelDefaults": { SchemaProps: spec.SchemaProps{ Description: "Default behaviour for kernel tuning", @@ -13388,8 +13360,71 @@ func GetOpenAPIDefinitions(ref openapi.ReferenceCallback) map[string]openapi.Ope Format: "", }, }, + "systemReserved": { + SchemaProps: spec.SchemaProps{ + Description: "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs that describe resources reserved for non-kubernetes components. Currently only cpu and memory are supported. [default=none] See http://kubernetes.io/docs/user-guide/compute-resources for more detail.", + Type: []string{"object"}, + AdditionalProperties: &spec.SchemaOrBool{ + Schema: &spec.Schema{ + SchemaProps: spec.SchemaProps{ + Type: []string{"string"}, + Format: "", + }, + }, + }, + }, + }, + "kubeReserved": { + SchemaProps: spec.SchemaProps{ + Description: "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs that describe resources reserved for kubernetes system components. Currently only cpu and memory are supported. [default=none] See http://kubernetes.io/docs/user-guide/compute-resources for more detail.", + Type: []string{"object"}, + AdditionalProperties: &spec.SchemaOrBool{ + Schema: &spec.Schema{ + SchemaProps: spec.SchemaProps{ + Type: []string{"string"}, + Format: "", + }, + }, + }, + }, + }, + "systemReservedCgroup": { + SchemaProps: spec.SchemaProps{ + Description: "This flag helps kubelet identify absolute name of top level cgroup used to enforce `SystemReserved` compute resource reservation for OS system daemons. Refer to [Node Allocatable](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) doc for more information.", + Type: []string{"string"}, + Format: "", + }, + }, + "kubeReservedCgroup": { + SchemaProps: spec.SchemaProps{ + Description: "This flag helps kubelet identify absolute name of top level cgroup used to enforce `KubeReserved` compute resource reservation for Kubernetes node system daemons. Refer to [Node Allocatable](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) doc for more information.", + Type: []string{"string"}, + Format: "", + }, + }, + "enforceNodeAllocatable": { + SchemaProps: spec.SchemaProps{ + Description: "This flag specifies the various Node Allocatable enforcements that Kubelet needs to perform. This flag accepts a list of options. Acceptible options are `pods`, `system-reserved` & `kube-reserved`. Refer to [Node Allocatable](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) doc for more information.", + Type: []string{"array"}, + Items: &spec.SchemaOrArray{ + Schema: &spec.Schema{ + SchemaProps: spec.SchemaProps{ + Type: []string{"string"}, + Format: "", + }, + }, + }, + }, + }, + "experimentalNodeAllocatableIgnoreEvictionThreshold": { + SchemaProps: spec.SchemaProps{ + Description: "This flag, if set, will avoid including `EvictionHard` limits while computing Node Allocatable. Refer to [Node Allocatable](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) doc for more information.", + Type: []string{"boolean"}, + Format: "", + }, + }, }, - Required: []string{"podManifestPath", "syncFrequency", "fileCheckFrequency", "httpCheckFrequency", "manifestURL", "manifestURLHeader", "enableServer", "address", "port", "readOnlyPort", "tlsCertFile", "tlsPrivateKeyFile", "certDirectory", "authentication", "authorization", "hostnameOverride", "podInfraContainerImage", "dockerEndpoint", "rootDirectory", "seccompProfileRoot", "allowPrivileged", "hostNetworkSources", "hostPIDSources", "hostIPCSources", "registryPullQPS", "registryBurst", "eventRecordQPS", "eventBurst", "enableDebuggingHandlers", "minimumGCAge", "maxPerPodContainerCount", "maxContainerCount", "cAdvisorPort", "healthzPort", "healthzBindAddress", "oomScoreAdj", "registerNode", "clusterDomain", "masterServiceNamespace", "clusterDNS", "streamingConnectionIdleTimeout", "nodeStatusUpdateFrequency", "imageMinimumGCAge", "imageGCHighThresholdPercent", "imageGCLowThresholdPercent", "lowDiskSpaceThresholdMB", "volumeStatsAggPeriod", "networkPluginName", "networkPluginDir", "cniConfDir", "cniBinDir", "networkPluginMTU", "volumePluginDir", "cloudProvider", "cloudConfigFile", "kubeletCgroups", "runtimeCgroups", "systemCgroups", "cgroupRoot", "containerRuntime", "remoteRuntimeEndpoint", "remoteImageEndpoint", "runtimeRequestTimeout", "rktPath", "rktAPIEndpoint", "rktStage1Image", "lockFilePath", "exitOnLockContention", "hairpinMode", "babysitDaemons", "maxPods", "nvidiaGPUs", "dockerExecHandlerName", "podCIDR", "resolvConf", "cpuCFSQuota", "containerized", "maxOpenFiles", "registerSchedulable", "registerWithTaints", "contentType", "kubeAPIQPS", "kubeAPIBurst", "serializeImagePulls", "outOfDiskTransitionFrequency", "nodeIP", "nodeLabels", "nonMasqueradeCIDR", "enableCustomMetrics", "evictionHard", "evictionSoft", "evictionSoftGracePeriod", "evictionPressureTransitionPeriod", "evictionMaxPodGracePeriod", "evictionMinimumReclaim", "experimentalKernelMemcgNotification", "podsPerCore", "enableControllerAttachDetach", "systemReserved", "kubeReserved", "protectKernelDefaults", "makeIPTablesUtilChains", "iptablesMasqueradeBit", "iptablesDropBit"}, + Required: []string{"podManifestPath", "syncFrequency", "fileCheckFrequency", "httpCheckFrequency", "manifestURL", "manifestURLHeader", "enableServer", "address", "port", "readOnlyPort", "tlsCertFile", "tlsPrivateKeyFile", "certDirectory", "authentication", "authorization", "hostnameOverride", "podInfraContainerImage", "dockerEndpoint", "rootDirectory", "seccompProfileRoot", "allowPrivileged", "hostNetworkSources", "hostPIDSources", "hostIPCSources", "registryPullQPS", "registryBurst", "eventRecordQPS", "eventBurst", "enableDebuggingHandlers", "minimumGCAge", "maxPerPodContainerCount", "maxContainerCount", "cAdvisorPort", "healthzPort", "healthzBindAddress", "oomScoreAdj", "registerNode", "clusterDomain", "masterServiceNamespace", "clusterDNS", "streamingConnectionIdleTimeout", "nodeStatusUpdateFrequency", "imageMinimumGCAge", "imageGCHighThresholdPercent", "imageGCLowThresholdPercent", "lowDiskSpaceThresholdMB", "volumeStatsAggPeriod", "networkPluginName", "networkPluginDir", "cniConfDir", "cniBinDir", "networkPluginMTU", "volumePluginDir", "cloudProvider", "cloudConfigFile", "kubeletCgroups", "runtimeCgroups", "systemCgroups", "cgroupRoot", "containerRuntime", "remoteRuntimeEndpoint", "remoteImageEndpoint", "runtimeRequestTimeout", "rktPath", "rktAPIEndpoint", "rktStage1Image", "lockFilePath", "exitOnLockContention", "hairpinMode", "babysitDaemons", "maxPods", "nvidiaGPUs", "dockerExecHandlerName", "podCIDR", "resolvConf", "cpuCFSQuota", "containerized", "maxOpenFiles", "registerSchedulable", "registerWithTaints", "contentType", "kubeAPIQPS", "kubeAPIBurst", "serializeImagePulls", "outOfDiskTransitionFrequency", "nodeIP", "nodeLabels", "nonMasqueradeCIDR", "enableCustomMetrics", "evictionHard", "evictionSoft", "evictionSoftGracePeriod", "evictionPressureTransitionPeriod", "evictionMaxPodGracePeriod", "evictionMinimumReclaim", "experimentalKernelMemcgNotification", "podsPerCore", "enableControllerAttachDetach", "protectKernelDefaults", "makeIPTablesUtilChains", "iptablesMasqueradeBit", "iptablesDropBit", "systemReserved", "kubeReserved"}, }, }, Dependencies: []string{ diff --git a/pkg/kubelet/cm/container_manager.go b/pkg/kubelet/cm/container_manager.go index 3e743502658..62f42cfbd7e 100644 --- a/pkg/kubelet/cm/container_manager.go +++ b/pkg/kubelet/cm/container_manager.go @@ -16,7 +16,10 @@ limitations under the License. package cm -import "k8s.io/kubernetes/pkg/api/v1" +import ( + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/kubernetes/pkg/api/v1" +) // Manages the containers running on a machine. type ContainerManager interface { @@ -56,6 +59,16 @@ type NodeConfig struct { CgroupDriver string ProtectKernelDefaults bool EnableCRI bool + NodeAllocatableConfig +} + +type NodeAllocatableConfig struct { + KubeReservedCgroupName string + SystemReservedCgroupName string + EnforceNodeAllocatable sets.String + KubeReserved v1.ResourceList + SystemReserved v1.ResourceList + HardEvictionThresholds []evictionapi.Threshold } type Status struct { diff --git a/pkg/kubelet/cm/node_allocatable.go b/pkg/kubelet/cm/node_allocatable.go new file mode 100644 index 00000000000..f9b99147448 --- /dev/null +++ b/pkg/kubelet/cm/node_allocatable.go @@ -0,0 +1,164 @@ +// +build linux + +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cm + +import ( + "fmt" + + "github.com/golang/glog" + + "k8s.io/kubernetes/pkg/api/v1" + evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" +) + +const ( + defaultNodeAllocatableCgroupName = "/kubepods" + nodeAllocatableEnforcementKey = "pods" + systemReservedEnforcementKey = "system-reserved" + kubeReservedEnforcementKey = "kube-reserved" +) + +func createNodeAllocatableCgroups(nc NodeAllocatableConfig, nodeAllocatable v1.ResourceList, cgroupManager CgroupManager) error { + cgroupConfig := &CgroupConfig{ + Name: CgroupName(defaultNodeAllocatableCgroupName), + } + if err := cgroupManager.Create(cgroupConfig); err != nil { + glog.Errorf("Failed to create %q cgroup and apply limits") + return err + } + return nil +} + +// Enforce Node Allocatable Cgroup settings. +func enforceNodeAllocatableCgroups(nc NodeAllocatableConfig, nodeAllocatable v1.ResourceList, cgroupManager CgroupManager) error { + glog.V(4).Infof("Attempting to enforce Node Allocatable with config: %+v", nc) + glog.V(4).Infof("Node Allocatable resources: %+v", nodeAllocatable) + // Create top level cgroups for all pods if necessary. + if nc.EnforceNodeAllocatable.Has(nodeAllocatableEnforcementKey) { + cgroupConfig := &CgroupConfig{ + Name: CgroupName(defaultNodeAllocatableCgroupName), + ResourceParameters: getCgroupConfig(nodeAllocatable), + } + glog.V(4).Infof("Updating Node Allocatable cgroup with %d cpu shares and %d bytes of memory", cgroupConfig.ResourceParameters.CpuShares, cgroupConfig.ResourceParameters.Memory) + if err := cgroupManager.Update(cgroupConfig); err != nil { + glog.Errorf("Failed to create %q cgroup and apply limits") + return err + } + } + // Now apply kube reserved and system reserved limits if required. + if nc.EnforceNodeAllocatable.Has(systemReservedEnforcementKey) { + glog.V(2).Infof("Enforcing system reserved on cgroup %q with limits: %+v", nc.SystemReservedCgroupName, nc.SystemReserved) + if err := enforceExistingCgroup(cgroupManager, nc.SystemReservedCgroupName, nc.SystemReserved); err != nil { + return fmt.Errorf("failed to enforce System Reserved Cgroup Limits: %v", err) + } + } + if nc.EnforceNodeAllocatable.Has(kubeReservedEnforcementKey) { + glog.V(2).Infof("Enforcing kube reserved on cgroup %q with limits: %+v", nc.KubeReservedCgroupName, nc.KubeReserved) + if err := enforceExistingCgroup(cgroupManager, nc.KubeReservedCgroupName, nc.KubeReserved); err != nil { + return fmt.Errorf("failed to enforce Kube Reserved Cgroup Limits: %v", err) + } + } + return nil +} + +func enforceExistingCgroup(cgroupManager CgroupManager, cName string, rl v1.ResourceList) error { + cgroupConfig := &CgroupConfig{ + Name: CgroupName(cName), + ResourceParameters: getCgroupConfig(rl), + } + glog.V(4).Infof("Enforcing limits on cgroup %q with %d cpu shares and %d bytes of memory", cName, cgroupConfig.ResourceParameters.CpuShares, cgroupConfig.ResourceParameters.Memory) + if !cgroupManager.Exists(cgroupConfig.Name) { + return fmt.Errorf("%q cgroup does not exist", cgroupConfig.Name) + } + if err := cgroupManager.Update(cgroupConfig); err != nil { + return err + } + return nil +} + +func getCgroupConfig(rl v1.ResourceList) *ResourceConfig { + // TODO(vishh): Set CPU Quota if necessary. + if rl == nil { + return nil + } + var rc ResourceConfig + if q, exists := rl[v1.ResourceMemory]; exists { + // Memory is defined in bytes. + val := q.Value() + rc.Memory = &val + } + if q, exists := rl[v1.ResourceCPU]; exists { + // CPU is defined in milli-cores. + val := MilliCPUToShares(q.MilliValue()) + rc.CpuShares = &val + } + return &rc +} + +func (cm *containerManagerImpl) getNodeAllocatableInternal(includeHardEviction bool) v1.ResourceList { + var evictionReservation v1.ResourceList + if includeHardEviction { + evictionReservation = hardEvictionReservation(cm.HardEvictionThresholds, cm.capacity) + } + result := make(v1.ResourceList) + for k, v := range cm.capacity { + value := *(v.Copy()) + if cm.NodeConfig.SystemReserved != nil { + value.Sub(cm.NodeConfig.SystemReserved[k]) + } + if cm.NodeConfig.KubeReserved != nil { + value.Sub(cm.NodeConfig.KubeReserved[k]) + } + if evictionReservation != nil { + value.Sub(evictionReservation[k]) + } + if value.Sign() < 0 { + // Negative Allocatable resources don't make sense. + value.Set(0) + } + result[k] = value + } + return result + +} + +// GetNodeAllocatable returns amount of compute resource available for pods. +func (cm *containerManagerImpl) GetNodeAllocatable() v1.ResourceList { + return cm.getNodeAllocatableInternal(!cm.NodeConfig.IgnoreHardEvictionThreshold) +} + +// hardEvictionReservation returns a resourcelist that includes reservation of resources based on hard eviction thresholds. +func hardEvictionReservation(thresholds []evictionapi.Threshold, capacity v1.ResourceList) v1.ResourceList { + if len(thresholds) == 0 { + return nil + } + ret := v1.ResourceList{} + for _, threshold := range thresholds { + if threshold.Operator != evictionapi.OpLessThan { + continue + } + switch threshold.Signal { + case evictionapi.SignalMemoryAvailable: + memoryCapacity := capacity[v1.ResourceMemory] + value := evictionapi.GetThresholdQuantity(threshold.Value, &memoryCapacity) + ret[v1.ResourceMemory] = *value + } + } + return ret +}