Kubelet changes to support hugepages

Derek Carr 2017-08-17 14:28:15 -04:00
parent afd8045ed7
commit 1ec2a69d9a
9 changed files with 154 additions and 6 deletions

View File

@@ -23,11 +23,14 @@ go_library(
"//conditions:default": [],
}),
deps = [
"//pkg/api/v1/helper:go_default_library",
"//pkg/features:go_default_library",
"//vendor/github.com/google/cadvisor/events:go_default_library",
"//vendor/github.com/google/cadvisor/info/v1:go_default_library",
"//vendor/github.com/google/cadvisor/info/v2:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library",
] + select({
"@io_bazel_rules_go//go/platform:linux_amd64": [
"//pkg/kubelet/types:go_default_library",

View File

@@ -21,6 +21,9 @@ import (
cadvisorapi2 "github.com/google/cadvisor/info/v2"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
utilfeature "k8s.io/apiserver/pkg/util/feature"
v1helper "k8s.io/kubernetes/pkg/api/v1/helper"
"k8s.io/kubernetes/pkg/features"
)
func CapacityFromMachineInfo(info *cadvisorapi.MachineInfo) v1.ResourceList {
@@ -32,6 +35,17 @@ func CapacityFromMachineInfo(info *cadvisorapi.MachineInfo) v1.ResourceList {
int64(info.MemoryCapacity),
resource.BinarySI),
}
// if huge pages are enabled, we report them as a schedulable resource on the node
if utilfeature.DefaultFeatureGate.Enabled(features.HugePages) {
for _, hugepagesInfo := range info.HugePages {
pageSizeBytes := int64(hugepagesInfo.PageSize * 1024)
hugePagesBytes := pageSizeBytes * int64(hugepagesInfo.NumPages)
pageSizeQuantity := resource.NewQuantity(pageSizeBytes, resource.BinarySI)
c[v1helper.HugePageResourceName(*pageSizeQuantity)] = *resource.NewQuantity(hugePagesBytes, resource.BinarySI)
}
}
return c
}
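To make the arithmetic concrete, here is a minimal stand-alone sketch with hypothetical values (2048 KiB pages, 512 pages pre-allocated); the hugepages-2Mi key mirrors what v1helper.HugePageResourceName derives from the page-size quantity, which is not reproduced here:

package main

import "fmt"

func main() {
	// cAdvisor reports huge page sizes in KiB, so a 2 MiB page has PageSize == 2048.
	pageSizeKiB := uint64(2048)
	numPages := uint64(512) // hypothetical pre-allocated page count

	pageSizeBytes := int64(pageSizeKiB * 1024)        // 2097152 (2Mi)
	hugePagesBytes := pageSizeBytes * int64(numPages) // 1073741824 (1Gi)

	// Node capacity gains one entry per page size, keyed as hugepages-<size>.
	fmt.Printf("hugepages-2Mi: %d bytes\n", hugePagesBytes)
}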

View File

@@ -52,6 +52,7 @@ go_library(
] + select({
"@io_bazel_rules_go//go/platform:linux_amd64": [
"//pkg/api:go_default_library",
"//pkg/api/v1/helper:go_default_library",
"//pkg/api/v1/helper/qos:go_default_library",
"//pkg/api/v1/resource:go_default_library",
"//pkg/kubelet/cm/util:go_default_library",
@@ -63,6 +64,7 @@ go_library(
"//pkg/util/procfs:go_default_library",
"//pkg/util/sysctl:go_default_library",
"//pkg/util/version:go_default_library",
"//vendor/github.com/docker/go-units:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd:go_default_library",

View File

@@ -24,12 +24,16 @@ import (
"strings"
"time"
units "github.com/docker/go-units"
"github.com/golang/glog"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
cgroupsystemd "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
libcontainerconfigs "github.com/opencontainers/runc/libcontainer/configs"
"k8s.io/apimachinery/pkg/util/sets"
utilfeature "k8s.io/apiserver/pkg/util/feature"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/metrics"
)
@@ -43,6 +47,10 @@ const (
libcontainerSystemd libcontainerCgroupManagerType = "systemd"
)
// hugePageSizeList is useful for converting to the hugetlb canonical unit
// which is what is expected when interacting with libcontainer
var hugePageSizeList = []string{"", "kB", "MB", "GB", "TB", "PB"}
// ConvertCgroupNameToSystemd converts the internal cgroup name to a systemd name.
// For example, the name /Burstable/pod_123-456 becomes Burstable-pod_123_456.slice
// If outputToCgroupFs is true, it expands the systemd name into the cgroupfs form.
@@ -299,10 +307,16 @@ type subsystem interface {
GetStats(path string, stats *libcontainercgroups.Stats) error
}
-// Cgroup subsystems we currently support
-var supportedSubsystems = []subsystem{
-&cgroupfs.MemoryGroup{},
-&cgroupfs.CpuGroup{},
+// getSupportedSubsystems returns list of subsystems supported
+func getSupportedSubsystems() []subsystem {
+supportedSubsystems := []subsystem{
+&cgroupfs.MemoryGroup{},
+&cgroupfs.CpuGroup{},
+}
+if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
+supportedSubsystems = append(supportedSubsystems, &cgroupfs.HugetlbGroup{})
+}
+return supportedSubsystems
}
// setSupportedSubsystems sets cgroup resource limits only on the supported
@@ -315,7 +329,7 @@ var supportedSubsystems = []subsystem{
// but this is not possible with libcontainer's Set() method
// See https://github.com/opencontainers/runc/issues/932
func setSupportedSubsystems(cgroupConfig *libcontainerconfigs.Cgroup) error {
-for _, sys := range supportedSubsystems {
+for _, sys := range getSupportedSubsystems() {
if _, ok := cgroupConfig.Paths[sys.Name()]; !ok {
return fmt.Errorf("Failed to find subsystem mount for subsystem: %v", sys.Name())
}
@@ -343,6 +357,30 @@ func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcont
if resourceConfig.CpuPeriod != nil {
resources.CpuPeriod = *resourceConfig.CpuPeriod
}
// if huge pages are enabled, we set them in libcontainer
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
// for each page size enumerated, set that value
pageSizes := sets.NewString()
for pageSize, limit := range resourceConfig.HugePageLimit {
sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, hugePageSizeList)
resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
Pagesize: sizeString,
Limit: uint64(limit),
})
pageSizes.Insert(sizeString)
}
// for each page size omitted, limit to 0
for _, pageSize := range cgroupfs.HugePageSizes {
if pageSizes.Has(pageSize) {
continue
}
resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
Pagesize: pageSize,
Limit: uint64(0),
})
}
}
return resources
}
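The byte-count-to-canonical-unit conversion above can be sanity-checked in isolation. A minimal sketch, assuming only the go-units dependency added in this commit; under cgroup v1 the resulting suffix is the one embedded in control file names such as hugetlb.2MB.limit_in_bytes:

package main

import (
	"fmt"

	units "github.com/docker/go-units"
)

var hugePageSizeList = []string{"", "kB", "MB", "GB", "TB", "PB"}

func main() {
	pageSize := int64(2 * 1024 * 1024) // a key from ResourceConfig.HugePageLimit
	// CustomSize divides by 1024 until the value fits, then appends the matching suffix.
	fmt.Println(units.CustomSize("%g%s", float64(pageSize), 1024.0, hugePageSizeList)) // 2MB
}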
@@ -502,7 +540,7 @@ func (m *cgroupManagerImpl) ReduceCPULimits(cgroupName CgroupName) error {
func getStatsSupportedSubsystems(cgroupPaths map[string]string) (*libcontainercgroups.Stats, error) {
stats := libcontainercgroups.NewStats()
-for _, sys := range supportedSubsystems {
+for _, sys := range getSupportedSubsystems() {
if _, ok := cgroupPaths[sys.Name()]; !ok {
return nil, fmt.Errorf("Failed to find subsystem mount for subsystem: %v", sys.Name())
}

View File

@@ -26,6 +26,7 @@ import (
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
"k8s.io/api/core/v1"
v1helper "k8s.io/kubernetes/pkg/api/v1/helper"
v1qos "k8s.io/kubernetes/pkg/api/v1/helper/qos"
"k8s.io/kubernetes/pkg/api/v1/resource"
)
@@ -83,6 +84,23 @@ func MilliCPUToShares(milliCPU int64) int64 {
return shares
}
// HugePageLimits converts the API representation to a map
// from huge page size (in bytes) to huge page limit (in bytes).
func HugePageLimits(resourceList v1.ResourceList) map[int64]int64 {
hugePageLimits := map[int64]int64{}
for k, v := range resourceList {
if v1helper.IsHugePageResourceName(k) {
pageSize, _ := v1helper.HugePageSizeFromResourceName(k)
if value, exists := hugePageLimits[pageSize.Value()]; exists {
hugePageLimits[pageSize.Value()] = value + v.Value()
} else {
hugePageLimits[pageSize.Value()] = v.Value()
}
}
}
return hugePageLimits
}
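A usage sketch for the helper above, with hypothetical request values; it assumes the code sits in the same package as HugePageLimits and that k8s.io/apimachinery/pkg/api/resource is imported as resource for MustParse:

// exampleHugePageLimits is illustrative only.
func exampleHugePageLimits() {
	requests := v1.ResourceList{
		v1.ResourceName("hugepages-2Mi"): resource.MustParse("100Mi"),
		v1.ResourceName("hugepages-1Gi"): resource.MustParse("2Gi"),
	}
	limits := HugePageLimits(requests)
	// limits == map[int64]int64{2097152: 104857600, 1073741824: 2147483648}
	// i.e. page size in bytes -> summed limit in bytes.
	_ = limits
}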
// ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
// sum requests and limits.
@@ -108,6 +126,8 @@ func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
// track if limits were applied for each resource.
memoryLimitsDeclared := true
cpuLimitsDeclared := true
// map hugepage pagesize (bytes) to limits (bytes)
hugePageLimits := map[int64]int64{}
for _, container := range pod.Spec.Containers {
if container.Resources.Limits.Cpu().IsZero() {
cpuLimitsDeclared = false
@@ -115,6 +135,14 @@ func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
if container.Resources.Limits.Memory().IsZero() {
memoryLimitsDeclared = false
}
containerHugePageLimits := HugePageLimits(container.Resources.Requests)
for k, v := range containerHugePageLimits {
if value, exists := hugePageLimits[k]; exists {
hugePageLimits[k] = value + v
} else {
hugePageLimits[k] = v
}
}
}
// determine the qos class
@@ -140,6 +168,7 @@ func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
shares := int64(MinShares)
result.CpuShares = &shares
}
result.HugePageLimit = hugePageLimits
return result
}

View File

@@ -28,7 +28,9 @@ import (
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/types"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/kubernetes/pkg/api"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/events"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
)
@@ -154,6 +156,10 @@ func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
val := MilliCPUToShares(q.MilliValue())
rc.CpuShares = &val
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
rc.HugePageLimit = HugePageLimits(rl)
}
return &rc
}

View File

@@ -27,9 +27,13 @@ import (
"k8s.io/apimachinery/pkg/util/wait"
units "github.com/docker/go-units"
cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
"k8s.io/api/core/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
v1qos "k8s.io/kubernetes/pkg/api/v1/helper/qos"
"k8s.io/kubernetes/pkg/api/v1/resource"
kubefeatures "k8s.io/kubernetes/pkg/features"
)
const (
@@ -100,11 +104,18 @@ func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceLis
minShares := int64(MinShares)
resourceParameters.CpuShares = &minShares
}
// containerConfig object stores the cgroup specifications
containerConfig := &CgroupConfig{
Name: absoluteContainerName,
ResourceParameters: resourceParameters,
}
// for each enumerated huge page size, the qos tiers are unbounded
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
m.setHugePagesUnbounded(containerConfig)
}
// check if it exists
if !cm.Exists(absoluteContainerName) {
if err := cm.Create(containerConfig); err != nil {
@@ -138,6 +149,29 @@ func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceLis
return nil
}
// setHugePagesUnbounded ensures hugetlb is effectively unbounded
func (m *qosContainerManagerImpl) setHugePagesUnbounded(cgroupConfig *CgroupConfig) error {
hugePageLimit := map[int64]int64{}
for _, pageSize := range cgroupfs.HugePageSizes {
pageSizeBytes, err := units.RAMInBytes(pageSize)
if err != nil {
return err
}
hugePageLimit[pageSizeBytes] = int64(1 << 62)
}
cgroupConfig.ResourceParameters.HugePageLimit = hugePageLimit
return nil
}
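To make the "unbounded" value concrete: runc enumerates the node's supported page sizes as canonical strings, and units.RAMInBytes maps them back to the byte counts that key the limit map. A stand-alone sketch, with hypothetical sizes standing in for cgroupfs.HugePageSizes:

package main

import (
	"fmt"

	units "github.com/docker/go-units"
)

func main() {
	hugePageSizes := []string{"2MB", "1GB"} // typical x86_64 values

	hugePageLimit := map[int64]int64{}
	for _, pageSize := range hugePageSizes {
		// RAMInBytes parses size strings using binary (1024-based) multiples.
		pageSizeBytes, err := units.RAMInBytes(pageSize)
		if err != nil {
			panic(err)
		}
		// 1<<62 is effectively infinite for hugetlb, so the QoS tiers never constrain it.
		hugePageLimit[pageSizeBytes] = int64(1 << 62)
	}
	fmt.Println(hugePageLimit) // map[2097152:4611686018427387904 1073741824:4611686018427387904]
}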
func (m *qosContainerManagerImpl) setHugePagesConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
for _, v := range configs {
if err := m.setHugePagesUnbounded(v); err != nil {
return err
}
}
return nil
}
func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
pods := m.activePods()
burstablePodCPURequest := int64(0)
@@ -262,6 +296,13 @@ func (m *qosContainerManagerImpl) UpdateCgroups() error {
return err
}
// update the qos level cgroup settings for huge pages (ensure they remain unbounded)
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
if err := m.setHugePagesConfig(qosConfigs); err != nil {
return err
}
}
for resource, percentReserve := range m.qosReserved {
switch resource {
case v1.ResourceMemory:

View File

@@ -31,6 +31,8 @@ type ResourceConfig struct {
CpuQuota *int64
// CPU quota period.
CpuPeriod *int64
// HugePageLimit map from page size (in bytes) to limit (in bytes)
HugePageLimit map[int64]int64
}
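For orientation, a hypothetical construction of the struct; the pointer fields distinguish "unset" from an explicit zero, while HugePageLimit is a plain map:

// exampleResourceConfig is illustrative only; it assumes the struct's
// Memory and CpuShares fields, which are not shown in this hunk.
func exampleResourceConfig() *ResourceConfig {
	shares := int64(1024)
	memory := int64(512 * 1024 * 1024)
	return &ResourceConfig{
		CpuShares: &shares,
		Memory:    &memory,
		// 100Mi of 2Mi pages: page size (bytes) -> limit (bytes).
		HugePageLimit: map[int64]int64{2 * 1024 * 1024: 100 * 1024 * 1024},
	}
}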
// CgroupName is the abstract name of a cgroup prior to any driver specific conversion.

View File

@@ -630,6 +630,19 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
}
node.Status.Allocatable[k] = value
}
// for every huge page reservation, we need to remove it from allocatable memory
for k, v := range node.Status.Capacity {
if v1helper.IsHugePageResourceName(k) {
allocatableMemory := node.Status.Allocatable[v1.ResourceMemory]
value := *(v.Copy())
allocatableMemory.Sub(value)
if allocatableMemory.Sign() < 0 {
// Negative Allocatable resources don't make sense.
allocatableMemory.Set(0)
}
node.Status.Allocatable[v1.ResourceMemory] = allocatableMemory
}
}
}
// Set versioninfo for the node.
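A worked example of the subtraction above with hypothetical capacities: 8Gi of memory with 1Gi reserved as huge pages leaves 7Gi allocatable, clamped at zero if reservations ever exceed capacity:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	allocatableMemory := resource.MustParse("8Gi")
	hugePageReservation := resource.MustParse("1Gi") // e.g. capacity reported under hugepages-2Mi

	allocatableMemory.Sub(hugePageReservation)
	if allocatableMemory.Sign() < 0 {
		// Negative allocatable resources don't make sense; clamp at zero.
		allocatableMemory.Set(0)
	}
	fmt.Println(allocatableMemory.String()) // 7Gi
}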