From 1ec2a69d9a83be6372842614b1c73925f4b56d2a Mon Sep 17 00:00:00 2001
From: Derek Carr
Date: Thu, 17 Aug 2017 14:28:15 -0400
Subject: [PATCH] Kubelet changes to support hugepages

---
 pkg/kubelet/cadvisor/BUILD                    |  3 ++
 pkg/kubelet/cadvisor/util.go                  | 14 ++++++
 pkg/kubelet/cm/BUILD                          |  2 +
 pkg/kubelet/cm/cgroup_manager_linux.go        | 50 ++++++++++++++++---
 pkg/kubelet/cm/helpers_linux.go               | 29 +++++++++++
 pkg/kubelet/cm/node_container_manager.go      |  6 +++
 pkg/kubelet/cm/qos_container_manager_linux.go | 41 +++++++++++++++
 pkg/kubelet/cm/types.go                       |  2 +
 pkg/kubelet/kubelet_node_status.go            | 13 +++++
 9 files changed, 154 insertions(+), 6 deletions(-)

diff --git a/pkg/kubelet/cadvisor/BUILD b/pkg/kubelet/cadvisor/BUILD
index 15cea0e8d72..989103b82a3 100644
--- a/pkg/kubelet/cadvisor/BUILD
+++ b/pkg/kubelet/cadvisor/BUILD
@@ -23,11 +23,14 @@ go_library(
         "//conditions:default": [],
     }),
     deps = [
+        "//pkg/api/v1/helper:go_default_library",
+        "//pkg/features:go_default_library",
         "//vendor/github.com/google/cadvisor/events:go_default_library",
         "//vendor/github.com/google/cadvisor/info/v1:go_default_library",
         "//vendor/github.com/google/cadvisor/info/v2:go_default_library",
         "//vendor/k8s.io/api/core/v1:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
+        "//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library",
     ] + select({
         "@io_bazel_rules_go//go/platform:linux_amd64": [
             "//pkg/kubelet/types:go_default_library",
diff --git a/pkg/kubelet/cadvisor/util.go b/pkg/kubelet/cadvisor/util.go
index a95a8fb19e2..dd4d7c4b1fd 100644
--- a/pkg/kubelet/cadvisor/util.go
+++ b/pkg/kubelet/cadvisor/util.go
@@ -21,6 +21,9 @@ import (
 	cadvisorapi2 "github.com/google/cadvisor/info/v2"
 	"k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
+	utilfeature "k8s.io/apiserver/pkg/util/feature"
+	v1helper "k8s.io/kubernetes/pkg/api/v1/helper"
+	"k8s.io/kubernetes/pkg/features"
 )
 
 func CapacityFromMachineInfo(info *cadvisorapi.MachineInfo) v1.ResourceList {
@@ -32,6 +35,17 @@ func CapacityFromMachineInfo(info *cadvisorapi.MachineInfo) v1.ResourceList {
 			int64(info.MemoryCapacity),
 			resource.BinarySI),
 	}
+
+	// if huge pages are enabled, we report them as a schedulable resource on the node
+	if utilfeature.DefaultFeatureGate.Enabled(features.HugePages) {
+		for _, hugepagesInfo := range info.HugePages {
+			pageSizeBytes := int64(hugepagesInfo.PageSize * 1024)
+			hugePagesBytes := pageSizeBytes * int64(hugepagesInfo.NumPages)
+			pageSizeQuantity := resource.NewQuantity(pageSizeBytes, resource.BinarySI)
+			c[v1helper.HugePageResourceName(*pageSizeQuantity)] = *resource.NewQuantity(hugePagesBytes, resource.BinarySI)
+		}
+	}
+
 	return c
 }
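Reviewer note (not part of the patch): cAdvisor reports each huge page pool as a page size in kB plus a page count, and `CapacityFromMachineInfo` turns that into a `hugepages-<size>` node resource. A self-contained sketch of the arithmetic with illustrative values (512 pages of 2 MiB); the `hugepages-%s` format here mirrors what `v1helper.HugePageResourceName` produces:

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// cAdvisor's PageSize is in kB; NumPages is the pre-reserved pool size.
	pageSizeKiB, numPages := uint64(2048), uint64(512) // illustrative values
	pageSizeBytes := int64(pageSizeKiB * 1024)
	hugePagesBytes := pageSizeBytes * int64(numPages)

	// The resource name embeds the page size, e.g. "hugepages-2Mi"; the
	// capacity is the total bytes backed by that pool.
	pageSizeQuantity := resource.NewQuantity(pageSizeBytes, resource.BinarySI)
	capacity := resource.NewQuantity(hugePagesBytes, resource.BinarySI)
	fmt.Printf("hugepages-%s: %s\n", pageSizeQuantity.String(), capacity.String()) // hugepages-2Mi: 1Gi
}
```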
"//vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd:go_default_library", diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go index c210dc2f605..137a2a14620 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux.go +++ b/pkg/kubelet/cm/cgroup_manager_linux.go @@ -24,12 +24,16 @@ import ( "strings" "time" + units "github.com/docker/go-units" "github.com/golang/glog" libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs" cgroupsystemd "github.com/opencontainers/runc/libcontainer/cgroups/systemd" libcontainerconfigs "github.com/opencontainers/runc/libcontainer/configs" + "k8s.io/apimachinery/pkg/util/sets" + utilfeature "k8s.io/apiserver/pkg/util/feature" + kubefeatures "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/kubelet/metrics" ) @@ -43,6 +47,10 @@ const ( libcontainerSystemd libcontainerCgroupManagerType = "systemd" ) +// hugePageSizeList is useful for converting to the hugetlb canonical unit +// which is what is expected when interacting with libcontainer +var hugePageSizeList = []string{"", "kB", "MB", "GB", "TB", "PB"} + // ConvertCgroupNameToSystemd converts the internal cgroup name to a systemd name. // For example, the name /Burstable/pod_123-456 becomes Burstable-pod_123_456.slice // If outputToCgroupFs is true, it expands the systemd name into the cgroupfs form. @@ -299,10 +307,16 @@ type subsystem interface { GetStats(path string, stats *libcontainercgroups.Stats) error } -// Cgroup subsystems we currently support -var supportedSubsystems = []subsystem{ - &cgroupfs.MemoryGroup{}, - &cgroupfs.CpuGroup{}, +// getSupportedSubsystems returns list of subsystems supported +func getSupportedSubsystems() []subsystem { + supportedSubsystems := []subsystem{ + &cgroupfs.MemoryGroup{}, + &cgroupfs.CpuGroup{}, + } + if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) { + supportedSubsystems = append(supportedSubsystems, &cgroupfs.HugetlbGroup{}) + } + return supportedSubsystems } // setSupportedSubsystems sets cgroup resource limits only on the supported @@ -315,7 +329,7 @@ var supportedSubsystems = []subsystem{ // but this is not possible with libcontainers Set() method // See https://github.com/opencontainers/runc/issues/932 func setSupportedSubsystems(cgroupConfig *libcontainerconfigs.Cgroup) error { - for _, sys := range supportedSubsystems { + for _, sys := range getSupportedSubsystems() { if _, ok := cgroupConfig.Paths[sys.Name()]; !ok { return fmt.Errorf("Failed to find subsystem mount for subsystem: %v", sys.Name()) } @@ -343,6 +357,30 @@ func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcont if resourceConfig.CpuPeriod != nil { resources.CpuPeriod = *resourceConfig.CpuPeriod } + + // if huge pages are enabled, we set them in libcontainer + if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) { + // for each page size enumerated, set that value + pageSizes := sets.NewString() + for pageSize, limit := range resourceConfig.HugePageLimit { + sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, hugePageSizeList) + resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{ + Pagesize: sizeString, + Limit: uint64(limit), + }) + pageSizes.Insert(sizeString) + } + // for each page size omitted, limit to 0 + for _, pageSize := range cgroupfs.HugePageSizes { + if pageSizes.Has(pageSize) { + continue + } + resources.HugetlbLimit = 
diff --git a/pkg/kubelet/cm/helpers_linux.go b/pkg/kubelet/cm/helpers_linux.go
index 39f445a9431..d3f91a14def 100644
--- a/pkg/kubelet/cm/helpers_linux.go
+++ b/pkg/kubelet/cm/helpers_linux.go
@@ -26,6 +26,7 @@ import (
 	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
 
 	"k8s.io/api/core/v1"
+	v1helper "k8s.io/kubernetes/pkg/api/v1/helper"
 	v1qos "k8s.io/kubernetes/pkg/api/v1/helper/qos"
 	"k8s.io/kubernetes/pkg/api/v1/resource"
 )
@@ -83,6 +84,23 @@ func MilliCPUToShares(milliCPU int64) int64 {
 	return shares
 }
 
+// HugePageLimits converts the API representation to a map
+// from huge page size (in bytes) to huge page limit (in bytes).
+func HugePageLimits(resourceList v1.ResourceList) map[int64]int64 {
+	hugePageLimits := map[int64]int64{}
+	for k, v := range resourceList {
+		if v1helper.IsHugePageResourceName(k) {
+			pageSize, _ := v1helper.HugePageSizeFromResourceName(k)
+			if value, exists := hugePageLimits[pageSize.Value()]; exists {
+				hugePageLimits[pageSize.Value()] = value + v.Value()
+			} else {
+				hugePageLimits[pageSize.Value()] = v.Value()
+			}
+		}
+	}
+	return hugePageLimits
+}
+
 // ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
 func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
 	// sum requests and limits.
@@ -108,6 +126,8 @@ func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
 	// track if limits were applied for each resource.
 	memoryLimitsDeclared := true
 	cpuLimitsDeclared := true
+	// map hugepage pagesize (bytes) to limits (bytes)
+	hugePageLimits := map[int64]int64{}
 	for _, container := range pod.Spec.Containers {
 		if container.Resources.Limits.Cpu().IsZero() {
 			cpuLimitsDeclared = false
 		}
@@ -115,6 +135,14 @@ func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
 		if container.Resources.Limits.Memory().IsZero() {
 			memoryLimitsDeclared = false
 		}
+		containerHugePageLimits := HugePageLimits(container.Resources.Requests)
+		for k, v := range containerHugePageLimits {
+			if value, exists := hugePageLimits[k]; exists {
+				hugePageLimits[k] = value + v
+			} else {
+				hugePageLimits[k] = v
+			}
+		}
 	}
 
 	// determine the qos class
@@ -140,6 +168,7 @@ func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
 		shares := int64(MinShares)
 		result.CpuShares = &shares
 	}
+	result.HugePageLimit = hugePageLimits
 	return result
 }
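Reviewer note (not part of the patch): `ResourceConfigForPod` sums the per-container maps returned by `HugePageLimits` into one pod-level map keyed by page size. A standalone sketch of that accumulation; `mergeHugePageLimits` is a hypothetical helper that collapses the exists-check into Go's zero-value semantics:

```go
package main

import "fmt"

// mergeHugePageLimits adds src's limits into dst, keyed by page size in bytes.
func mergeHugePageLimits(dst, src map[int64]int64) {
	for pageSize, limit := range src {
		dst[pageSize] += limit // missing keys read as 0, matching the patch's exists-check
	}
}

func main() {
	pod := map[int64]int64{}
	// two containers, each requesting 512Mi of 2Mi pages (illustrative values)
	mergeHugePageLimits(pod, map[int64]int64{2 << 20: 512 << 20})
	mergeHugePageLimits(pod, map[int64]int64{2 << 20: 512 << 20})
	fmt.Println(pod) // map[2097152:1073741824], i.e. one pod-level 1Gi hugetlb limit
}
```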
diff --git a/pkg/kubelet/cm/node_container_manager.go b/pkg/kubelet/cm/node_container_manager.go
index a96f9c54037..c9cb48ce559 100644
--- a/pkg/kubelet/cm/node_container_manager.go
+++ b/pkg/kubelet/cm/node_container_manager.go
@@ -28,7 +28,9 @@ import (
 	"k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/apimachinery/pkg/types"
+	utilfeature "k8s.io/apiserver/pkg/util/feature"
 	"k8s.io/kubernetes/pkg/api"
+	kubefeatures "k8s.io/kubernetes/pkg/features"
 	"k8s.io/kubernetes/pkg/kubelet/events"
 	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
 )
@@ -154,6 +156,10 @@ func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
 		val := MilliCPUToShares(q.MilliValue())
 		rc.CpuShares = &val
 	}
+	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
+		rc.HugePageLimit = HugePageLimits(rl)
+	}
+
 	return &rc
 }
 
diff --git a/pkg/kubelet/cm/qos_container_manager_linux.go b/pkg/kubelet/cm/qos_container_manager_linux.go
index 1e5ae939334..dab2e0bf3ac 100644
--- a/pkg/kubelet/cm/qos_container_manager_linux.go
+++ b/pkg/kubelet/cm/qos_container_manager_linux.go
@@ -27,9 +27,13 @@ import (
 
 	"k8s.io/apimachinery/pkg/util/wait"
 
+	units "github.com/docker/go-units"
+	cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
 	"k8s.io/api/core/v1"
+	utilfeature "k8s.io/apiserver/pkg/util/feature"
 	v1qos "k8s.io/kubernetes/pkg/api/v1/helper/qos"
 	"k8s.io/kubernetes/pkg/api/v1/resource"
+	kubefeatures "k8s.io/kubernetes/pkg/features"
 )
 
 const (
@@ -100,11 +104,18 @@ func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceLis
 			minShares := int64(MinShares)
 			resourceParameters.CpuShares = &minShares
 		}
+		// containerConfig object stores the cgroup specifications
 		containerConfig := &CgroupConfig{
 			Name:               absoluteContainerName,
 			ResourceParameters: resourceParameters,
 		}
+
+		// for each enumerated huge page size, the qos tiers are unbounded
+		if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
+			m.setHugePagesUnbounded(containerConfig)
+		}
+
 		// check if it exists
 		if !cm.Exists(absoluteContainerName) {
 			if err := cm.Create(containerConfig); err != nil {
@@ -138,6 +149,29 @@ func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceLis
 	return nil
 }
 
+// setHugePagesUnbounded ensures hugetlb is effectively unbounded
+func (m *qosContainerManagerImpl) setHugePagesUnbounded(cgroupConfig *CgroupConfig) error {
+	hugePageLimit := map[int64]int64{}
+	for _, pageSize := range cgroupfs.HugePageSizes {
+		pageSizeBytes, err := units.RAMInBytes(pageSize)
+		if err != nil {
+			return err
+		}
+		hugePageLimit[pageSizeBytes] = int64(1 << 62)
+	}
+	cgroupConfig.ResourceParameters.HugePageLimit = hugePageLimit
+	return nil
+}
+
+func (m *qosContainerManagerImpl) setHugePagesConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
+	for _, v := range configs {
+		if err := m.setHugePagesUnbounded(v); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
 func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
 	pods := m.activePods()
 	burstablePodCPURequest := int64(0)
@@ -262,6 +296,13 @@ func (m *qosContainerManagerImpl) UpdateCgroups() error {
 		return err
 	}
 
+	// update the qos level cgroup settings for huge pages (ensure they remain unbounded)
+	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
+		if err := m.setHugePagesConfig(qosConfigs); err != nil {
+			return err
+		}
+	}
+
 	for resource, percentReserve := range m.qosReserved {
 		switch resource {
 		case v1.ResourceMemory:
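Reviewer note (not part of the patch): the QoS-tier cgroups are deliberately left unbounded for hugetlb; `1 << 62` bytes (4 EiB) exceeds any real pre-reserved pool, so enforcement happens only at the pod level. A sketch of the reverse size-string parsing that `setHugePagesUnbounded` performs; the hard-coded slice stands in for `cgroupfs.HugePageSizes`, which reflects the sizes the host actually supports:

```go
package main

import (
	"fmt"

	units "github.com/docker/go-units"
)

func main() {
	hugePageSizes := []string{"2MB", "1GB"} // stand-in for cgroupfs.HugePageSizes
	hugePageLimit := map[int64]int64{}
	for _, pageSize := range hugePageSizes {
		// RAMInBytes parses the canonical hugetlb suffix back to bytes.
		pageSizeBytes, err := units.RAMInBytes(pageSize)
		if err != nil {
			panic(err)
		}
		hugePageLimit[pageSizeBytes] = int64(1) << 62 // effectively unbounded
	}
	fmt.Println(hugePageLimit) // map[2097152:4611686018427387904 1073741824:4611686018427387904]
}
```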
diff --git a/pkg/kubelet/cm/types.go b/pkg/kubelet/cm/types.go
index a2e8594262b..4419b0a3c72 100644
--- a/pkg/kubelet/cm/types.go
+++ b/pkg/kubelet/cm/types.go
@@ -31,6 +31,8 @@ type ResourceConfig struct {
 	CpuQuota *int64
 	// CPU quota period.
 	CpuPeriod *int64
+	// HugePageLimit map from page size (in bytes) to limit (in bytes)
+	HugePageLimit map[int64]int64
 }
 
 // CgroupName is the abstract name of a cgroup prior to any driver specific conversion.
diff --git a/pkg/kubelet/kubelet_node_status.go b/pkg/kubelet/kubelet_node_status.go
index 4cb35ba52a1..4817e6d68e8 100644
--- a/pkg/kubelet/kubelet_node_status.go
+++ b/pkg/kubelet/kubelet_node_status.go
@@ -630,6 +630,19 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
 		}
 		node.Status.Allocatable[k] = value
 	}
+	// for every huge page reservation, we need to remove it from allocatable memory
+	for k, v := range node.Status.Capacity {
+		if v1helper.IsHugePageResourceName(k) {
+			allocatableMemory := node.Status.Allocatable[v1.ResourceMemory]
+			value := *(v.Copy())
+			allocatableMemory.Sub(value)
+			if allocatableMemory.Sign() < 0 {
+				// Negative Allocatable resources don't make sense.
+				allocatableMemory.Set(0)
+			}
+			node.Status.Allocatable[v1.ResourceMemory] = allocatableMemory
+		}
+	}
 }
 
 // Set versioninfo for the node.
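Reviewer note (not part of the patch): pre-reserved huge pages come out of physical RAM but cannot be used as ordinary memory, so the node-status change subtracts each `hugepages-*` capacity from allocatable memory and clamps at zero. The same `resource.Quantity` arithmetic in isolation, with illustrative values:

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	allocatableMemory := resource.MustParse("8Gi")
	hugePagesCapacity := resource.MustParse("1Gi") // e.g. 512 pre-reserved 2Mi pages

	allocatableMemory.Sub(hugePagesCapacity)
	if allocatableMemory.Sign() < 0 {
		// negative allocatable resources don't make sense; clamp to zero
		allocatableMemory.Set(0)
	}
	fmt.Println(allocatableMemory.String()) // 7Gi
}
```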