Kubelet changes to support hugepages

This commit is contained in:
parent afd8045ed7
commit 1ec2a69d9a
@@ -23,11 +23,14 @@ go_library(
         "//conditions:default": [],
     }),
     deps = [
+        "//pkg/api/v1/helper:go_default_library",
+        "//pkg/features:go_default_library",
         "//vendor/github.com/google/cadvisor/events:go_default_library",
         "//vendor/github.com/google/cadvisor/info/v1:go_default_library",
         "//vendor/github.com/google/cadvisor/info/v2:go_default_library",
         "//vendor/k8s.io/api/core/v1:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
+        "//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library",
     ] + select({
         "@io_bazel_rules_go//go/platform:linux_amd64": [
             "//pkg/kubelet/types:go_default_library",
@@ -21,6 +21,9 @@ import (
 	cadvisorapi2 "github.com/google/cadvisor/info/v2"
 	"k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
+	utilfeature "k8s.io/apiserver/pkg/util/feature"
+	v1helper "k8s.io/kubernetes/pkg/api/v1/helper"
+	"k8s.io/kubernetes/pkg/features"
 )
 
 func CapacityFromMachineInfo(info *cadvisorapi.MachineInfo) v1.ResourceList {
@@ -32,6 +35,17 @@ func CapacityFromMachineInfo(info *cadvisorapi.MachineInfo) v1.ResourceList {
 			int64(info.MemoryCapacity),
 			resource.BinarySI),
 	}
+
+	// if huge pages are enabled, we report them as a schedulable resource on the node
+	if utilfeature.DefaultFeatureGate.Enabled(features.HugePages) {
+		for _, hugepagesInfo := range info.HugePages {
+			pageSizeBytes := int64(hugepagesInfo.PageSize * 1024)
+			hugePagesBytes := pageSizeBytes * int64(hugepagesInfo.NumPages)
+			pageSizeQuantity := resource.NewQuantity(pageSizeBytes, resource.BinarySI)
+			c[v1helper.HugePageResourceName(*pageSizeQuantity)] = *resource.NewQuantity(hugePagesBytes, resource.BinarySI)
+		}
+	}
+
 	return c
 }
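Aside: cadvisor reports each huge page pool as a page size in kB plus a page count, so the capacity of a pool is PageSize*1024*NumPages bytes, reported under a per-size resource name such as "hugepages-2Mi". A minimal sketch of that arithmetic, using a hypothetical stand-in struct rather than the real cadvisor type:

package main

import "fmt"

// hugePagesInfo is a hypothetical stand-in for the two cadvisor
// MachineInfo fields the kubelet reads: page size (in kB) and the
// number of pre-allocated pages of that size.
type hugePagesInfo struct {
	PageSize uint64 // in kB, e.g. 2048 for 2Mi pages
	NumPages uint64
}

func main() {
	// e.g. a node with 512 x 2Mi huge pages pre-allocated
	info := hugePagesInfo{PageSize: 2048, NumPages: 512}

	// same arithmetic as CapacityFromMachineInfo above
	pageSizeBytes := int64(info.PageSize * 1024)           // 2097152 = 2Mi
	hugePagesBytes := pageSizeBytes * int64(info.NumPages) // 1073741824 = 1Gi

	fmt.Printf("hugepages-2Mi: %d bytes schedulable\n", hugePagesBytes)
}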
@@ -52,6 +52,7 @@ go_library(
     ] + select({
         "@io_bazel_rules_go//go/platform:linux_amd64": [
             "//pkg/api:go_default_library",
+            "//pkg/api/v1/helper:go_default_library",
             "//pkg/api/v1/helper/qos:go_default_library",
             "//pkg/api/v1/resource:go_default_library",
             "//pkg/kubelet/cm/util:go_default_library",
@@ -63,6 +64,7 @@ go_library(
             "//pkg/util/procfs:go_default_library",
             "//pkg/util/sysctl:go_default_library",
             "//pkg/util/version:go_default_library",
+            "//vendor/github.com/docker/go-units:go_default_library",
             "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
             "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs:go_default_library",
             "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd:go_default_library",
@@ -24,12 +24,16 @@ import (
 	"strings"
 	"time"
 
+	units "github.com/docker/go-units"
 	"github.com/golang/glog"
 	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
 	cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
 	cgroupsystemd "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
 	libcontainerconfigs "github.com/opencontainers/runc/libcontainer/configs"
 
+	"k8s.io/apimachinery/pkg/util/sets"
+	utilfeature "k8s.io/apiserver/pkg/util/feature"
+	kubefeatures "k8s.io/kubernetes/pkg/features"
 	"k8s.io/kubernetes/pkg/kubelet/metrics"
 )
 
@@ -43,6 +47,10 @@ const (
 	libcontainerSystemd libcontainerCgroupManagerType = "systemd"
 )
 
+// hugePageSizeList is useful for converting to the hugetlb canonical unit
+// which is what is expected when interacting with libcontainer
+var hugePageSizeList = []string{"", "kB", "MB", "GB", "TB", "PB"}
+
 // ConvertCgroupNameToSystemd converts the internal cgroup name to a systemd name.
 // For example, the name /Burstable/pod_123-456 becomes Burstable-pod_123_456.slice
 // If outputToCgroupFs is true, it expands the systemd name into the cgroupfs form.
@@ -299,10 +307,16 @@ type subsystem interface {
 	GetStats(path string, stats *libcontainercgroups.Stats) error
 }
 
-// Cgroup subsystems we currently support
-var supportedSubsystems = []subsystem{
-	&cgroupfs.MemoryGroup{},
-	&cgroupfs.CpuGroup{},
+// getSupportedSubsystems returns list of subsystems supported
+func getSupportedSubsystems() []subsystem {
+	supportedSubsystems := []subsystem{
+		&cgroupfs.MemoryGroup{},
+		&cgroupfs.CpuGroup{},
+	}
+	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
+		supportedSubsystems = append(supportedSubsystems, &cgroupfs.HugetlbGroup{})
+	}
+	return supportedSubsystems
 }
 
 // setSupportedSubsystems sets cgroup resource limits only on the supported
@@ -315,7 +329,7 @@ var supportedSubsystems = []subsystem{
 // but this is not possible with libcontainers Set() method
 // See https://github.com/opencontainers/runc/issues/932
 func setSupportedSubsystems(cgroupConfig *libcontainerconfigs.Cgroup) error {
-	for _, sys := range supportedSubsystems {
+	for _, sys := range getSupportedSubsystems() {
 		if _, ok := cgroupConfig.Paths[sys.Name()]; !ok {
 			return fmt.Errorf("Failed to find subsystem mount for subsystem: %v", sys.Name())
 		}
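Aside: the refactor above replaces a package-level supportedSubsystems var with a function, so the hugetlb controller is included only when the HugePages feature gate is on at call time. A self-contained sketch of the pattern, with a plain bool standing in for the feature-gate check:

package main

import "fmt"

// subsystem stands in for the libcontainer cgroup subsystem interface.
type subsystem interface {
	Name() string
}

type memoryGroup struct{}

func (memoryGroup) Name() string { return "memory" }

type cpuGroup struct{}

func (cpuGroup) Name() string { return "cpu" }

type hugetlbGroup struct{}

func (hugetlbGroup) Name() string { return "hugetlb" }

// hugePagesEnabled stands in for utilfeature.DefaultFeatureGate.Enabled(...).
var hugePagesEnabled = true

// Building the slice in a function rather than a package-level var means
// the result reflects the feature gate each time it is consulted.
func getSupportedSubsystems() []subsystem {
	supported := []subsystem{memoryGroup{}, cpuGroup{}}
	if hugePagesEnabled {
		supported = append(supported, hugetlbGroup{})
	}
	return supported
}

func main() {
	for _, sys := range getSupportedSubsystems() {
		fmt.Println(sys.Name()) // memory, cpu, hugetlb
	}
}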
@@ -343,6 +357,30 @@ func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcont
 	if resourceConfig.CpuPeriod != nil {
 		resources.CpuPeriod = *resourceConfig.CpuPeriod
 	}
+
+	// if huge pages are enabled, we set them in libcontainer
+	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
+		// for each page size enumerated, set that value
+		pageSizes := sets.NewString()
+		for pageSize, limit := range resourceConfig.HugePageLimit {
+			sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, hugePageSizeList)
+			resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
+				Pagesize: sizeString,
+				Limit:    uint64(limit),
+			})
+			pageSizes.Insert(sizeString)
+		}
+		// for each page size omitted, limit to 0
+		for _, pageSize := range cgroupfs.HugePageSizes {
+			if pageSizes.Has(pageSize) {
+				continue
+			}
+			resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
+				Pagesize: pageSize,
+				Limit:    uint64(0),
+			})
+		}
+	}
 	return resources
 }
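Aside: hugetlb cgroup files are keyed by a human-readable page size (e.g. hugetlb.2MB.limit_in_bytes), which is why toResources renders each byte count through units.CustomSize with the base-1024 hugePageSizeList above. A small sketch of just that conversion, assuming the docker/go-units dependency added in the BUILD hunk:

package main

import (
	"fmt"

	units "github.com/docker/go-units"
)

// same unit list as the diff: base 1024, but spelled "MB" rather than
// "MiB", because that is the hugetlb canonical form.
var hugePageSizeList = []string{"", "kB", "MB", "GB", "TB", "PB"}

func main() {
	for _, pageSizeBytes := range []int64{2 << 20, 1 << 30} {
		s := units.CustomSize("%g%s", float64(pageSizeBytes), 1024.0, hugePageSizeList)
		fmt.Println(s) // "2MB", then "1GB"
	}
}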
@@ -502,7 +540,7 @@ func (m *cgroupManagerImpl) ReduceCPULimits(cgroupName CgroupName) error {
 
 func getStatsSupportedSubsystems(cgroupPaths map[string]string) (*libcontainercgroups.Stats, error) {
 	stats := libcontainercgroups.NewStats()
-	for _, sys := range supportedSubsystems {
+	for _, sys := range getSupportedSubsystems() {
 		if _, ok := cgroupPaths[sys.Name()]; !ok {
 			return nil, fmt.Errorf("Failed to find subsystem mount for subsystem: %v", sys.Name())
 		}
@@ -26,6 +26,7 @@ import (
 	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
 
 	"k8s.io/api/core/v1"
+	v1helper "k8s.io/kubernetes/pkg/api/v1/helper"
 	v1qos "k8s.io/kubernetes/pkg/api/v1/helper/qos"
 	"k8s.io/kubernetes/pkg/api/v1/resource"
 )
@@ -83,6 +84,23 @@ func MilliCPUToShares(milliCPU int64) int64 {
 	return shares
 }
 
+// HugePageLimits converts the API representation to a map
+// from huge page size (in bytes) to huge page limit (in bytes).
+func HugePageLimits(resourceList v1.ResourceList) map[int64]int64 {
+	hugePageLimits := map[int64]int64{}
+	for k, v := range resourceList {
+		if v1helper.IsHugePageResourceName(k) {
+			pageSize, _ := v1helper.HugePageSizeFromResourceName(k)
+			if value, exists := hugePageLimits[pageSize.Value()]; exists {
+				hugePageLimits[pageSize.Value()] = value + v.Value()
+			} else {
+				hugePageLimits[pageSize.Value()] = v.Value()
+			}
+		}
+	}
+	return hugePageLimits
+}
+
 // ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
 func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
 	// sum requests and limits.
@@ -108,6 +126,8 @@ func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
 	// track if limits were applied for each resource.
 	memoryLimitsDeclared := true
 	cpuLimitsDeclared := true
+	// map hugepage pagesize (bytes) to limits (bytes)
+	hugePageLimits := map[int64]int64{}
 	for _, container := range pod.Spec.Containers {
 		if container.Resources.Limits.Cpu().IsZero() {
 			cpuLimitsDeclared = false
@@ -115,6 +135,14 @@ func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
 		if container.Resources.Limits.Memory().IsZero() {
 			memoryLimitsDeclared = false
 		}
+		containerHugePageLimits := HugePageLimits(container.Resources.Requests)
+		for k, v := range containerHugePageLimits {
+			if value, exists := hugePageLimits[k]; exists {
+				hugePageLimits[k] = value + v
+			} else {
+				hugePageLimits[k] = v
+			}
+		}
 	}
 
 	// determine the qos class
@@ -140,6 +168,7 @@ func ResourceConfigForPod(pod *v1.Pod) *ResourceConfig {
 		shares := int64(MinShares)
 		result.CpuShares = &shares
 	}
+	result.HugePageLimit = hugePageLimits
 	return result
 }
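Aside: a worked example of the pod-level aggregation in ResourceConfigForPod — two containers that each request 100Mi of 2Mi huge pages yield a single pod limit of 200Mi for that page size. The sketch starts from per-container maps already in the pageSize(bytes) -> limit(bytes) form that HugePageLimits returns, and uses += as shorthand for the exists-check in the diff:

package main

import "fmt"

func main() {
	// per-container hugepage requests, as HugePageLimits would return them
	containerA := map[int64]int64{2097152: 100 * 1024 * 1024} // hugepages-2Mi: 100Mi
	containerB := map[int64]int64{2097152: 100 * 1024 * 1024} // hugepages-2Mi: 100Mi

	// the pod-level config sums the per-container maps per page size
	podLimits := map[int64]int64{}
	for _, c := range []map[int64]int64{containerA, containerB} {
		for pageSize, limit := range c {
			podLimits[pageSize] += limit
		}
	}
	fmt.Println(podLimits) // map[2097152:209715200], i.e. 200Mi of 2Mi pages
}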
@@ -28,7 +28,9 @@ import (
 	"k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/apimachinery/pkg/types"
+	utilfeature "k8s.io/apiserver/pkg/util/feature"
 	"k8s.io/kubernetes/pkg/api"
+	kubefeatures "k8s.io/kubernetes/pkg/features"
 	"k8s.io/kubernetes/pkg/kubelet/events"
 	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
 )
@@ -154,6 +156,10 @@ func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
 		val := MilliCPUToShares(q.MilliValue())
 		rc.CpuShares = &val
 	}
+	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
+		rc.HugePageLimit = HugePageLimits(rl)
+	}
+
 	return &rc
 }
@@ -27,9 +27,13 @@ import (
 
 	"k8s.io/apimachinery/pkg/util/wait"
 
+	units "github.com/docker/go-units"
+	cgroupfs "github.com/opencontainers/runc/libcontainer/cgroups/fs"
 	"k8s.io/api/core/v1"
+	utilfeature "k8s.io/apiserver/pkg/util/feature"
 	v1qos "k8s.io/kubernetes/pkg/api/v1/helper/qos"
 	"k8s.io/kubernetes/pkg/api/v1/resource"
+	kubefeatures "k8s.io/kubernetes/pkg/features"
 )
 
 const (
@@ -100,11 +104,18 @@ func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceLis
 			minShares := int64(MinShares)
 			resourceParameters.CpuShares = &minShares
 		}
+
 		// containerConfig object stores the cgroup specifications
 		containerConfig := &CgroupConfig{
 			Name:               absoluteContainerName,
 			ResourceParameters: resourceParameters,
 		}
+
+		// for each enumerated huge page size, the qos tiers are unbounded
+		if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
+			m.setHugePagesUnbounded(containerConfig)
+		}
+
 		// check if it exists
 		if !cm.Exists(absoluteContainerName) {
 			if err := cm.Create(containerConfig); err != nil {
@@ -138,6 +149,29 @@ func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceLis
 	return nil
 }
 
+// setHugePagesUnbounded ensures hugetlb is effectively unbounded
+func (m *qosContainerManagerImpl) setHugePagesUnbounded(cgroupConfig *CgroupConfig) error {
+	hugePageLimit := map[int64]int64{}
+	for _, pageSize := range cgroupfs.HugePageSizes {
+		pageSizeBytes, err := units.RAMInBytes(pageSize)
+		if err != nil {
+			return err
+		}
+		hugePageLimit[pageSizeBytes] = int64(1 << 62)
+	}
+	cgroupConfig.ResourceParameters.HugePageLimit = hugePageLimit
+	return nil
+}
+
+func (m *qosContainerManagerImpl) setHugePagesConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
+	for _, v := range configs {
+		if err := m.setHugePagesUnbounded(v); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
 func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
 	pods := m.activePods()
 	burstablePodCPURequest := int64(0)
@@ -262,6 +296,13 @@ func (m *qosContainerManagerImpl) UpdateCgroups() error {
 		return err
 	}
 
+	// update the qos level cgroup settings for huge pages (ensure they remain unbounded)
+	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.HugePages) {
+		if err := m.setHugePagesConfig(qosConfigs); err != nil {
+			return err
+		}
+	}
+
 	for resource, percentReserve := range m.qosReserved {
 		switch resource {
 		case v1.ResourceMemory:
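Aside: setHugePagesUnbounded round-trips the human-readable page-size strings from cgroupfs.HugePageSizes back to bytes with units.RAMInBytes (base 1024), then writes a limit of 1<<62, which is far larger than any real hugetlb reservation yet still fits a signed 64-bit value. A quick sketch of both pieces, assuming the same docker/go-units dependency:

package main

import (
	"fmt"

	units "github.com/docker/go-units"
)

func main() {
	// cgroupfs.HugePageSizes yields strings like "2MB"; RAMInBytes parses
	// them back to bytes using base 1024, matching the kernel page size.
	pageSizeBytes, err := units.RAMInBytes("2MB")
	if err != nil {
		panic(err)
	}
	fmt.Println(pageSizeBytes) // 2097152

	// 1<<62 is roughly 4.6 exbibytes: effectively "unbounded" for hugetlb,
	// while avoiding overflow of a signed int64.
	fmt.Println(int64(1 << 62)) // 4611686018427387904
}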
@@ -31,6 +31,8 @@ type ResourceConfig struct {
 	CpuQuota *int64
 	// CPU quota period.
 	CpuPeriod *int64
+	// HugePageLimit map from page size (in bytes) to limit (in bytes)
+	HugePageLimit map[int64]int64
 }
 
 // CgroupName is the abstract name of a cgroup prior to any driver specific conversion.
@@ -630,6 +630,19 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
 			}
 			node.Status.Allocatable[k] = value
 		}
+		// for every huge page reservation, we need to remove it from allocatable memory
+		for k, v := range node.Status.Capacity {
+			if v1helper.IsHugePageResourceName(k) {
+				allocatableMemory := node.Status.Allocatable[v1.ResourceMemory]
+				value := *(v.Copy())
+				allocatableMemory.Sub(value)
+				if allocatableMemory.Sign() < 0 {
+					// Negative Allocatable resources don't make sense.
+					allocatableMemory.Set(0)
+				}
+				node.Status.Allocatable[v1.ResourceMemory] = allocatableMemory
+			}
+		}
 	}
 
 	// Set versioninfo for the node.
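Aside: pre-allocated huge pages are carved out of physical memory and cannot be used as ordinary memory, so the hunk above deducts each hugepages-* capacity from the node's allocatable memory, clamping at zero. A minimal sketch of that quantity arithmetic with k8s.io/apimachinery, using hypothetical example values:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// memory allocatable before accounting for huge pages
	allocatableMemory := resource.MustParse("8Gi")
	// capacity reserved as huge pages, e.g. hugepages-2Mi: 1Gi
	hugePagesReserved := resource.MustParse("1Gi")

	// deduct the reservation from general-purpose memory
	allocatableMemory.Sub(hugePagesReserved)
	if allocatableMemory.Sign() < 0 {
		// clamp: negative allocatable makes no sense
		allocatableMemory.Set(0)
	}
	fmt.Println(allocatableMemory.String()) // 7Gi
}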