diff --git a/pkg/api/helper/qos/BUILD b/pkg/api/helper/qos/BUILD
index 63b594dc911..1b2f763a17e 100644
--- a/pkg/api/helper/qos/BUILD
+++ b/pkg/api/helper/qos/BUILD
@@ -10,6 +10,7 @@ go_library(
     srcs = ["qos.go"],
    deps = [
        "//pkg/api:go_default_library",
+        "//pkg/api/helper:go_default_library",
        "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
        "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
    ],
diff --git a/pkg/api/helper/qos/qos.go b/pkg/api/helper/qos/qos.go
index 6edd0e1b25a..0ea6286c841 100644
--- a/pkg/api/helper/qos/qos.go
+++ b/pkg/api/helper/qos/qos.go
@@ -22,11 +22,16 @@ import (
 	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/kubernetes/pkg/api"
+	"k8s.io/kubernetes/pkg/api/helper"
 )
 
 // supportedComputeResources is the list of compute resources for with QoS is supported.
 var supportedQoSComputeResources = sets.NewString(string(api.ResourceCPU), string(api.ResourceMemory))
 
+func isSupportedQoSComputeResource(name api.ResourceName) bool {
+	return supportedQoSComputeResources.Has(string(name)) || helper.IsHugePageResourceName(name)
+}
+
 // GetPodQOS returns the QoS class of a pod.
 // A pod is besteffort if none of its containers have specified any requests or limits.
 // A pod is guaranteed only when requests and limits are specified for all the containers and they are equal.
@@ -39,7 +44,7 @@ func GetPodQOS(pod *api.Pod) api.PodQOSClass {
 	for _, container := range pod.Spec.Containers {
 		// process requests
 		for name, quantity := range container.Resources.Requests {
-			if !supportedQoSComputeResources.Has(string(name)) {
+			if !isSupportedQoSComputeResource(name) {
 				continue
 			}
 			if quantity.Cmp(zeroQuantity) == 1 {
@@ -55,7 +60,7 @@ func GetPodQOS(pod *api.Pod) api.PodQOSClass {
 		// process limits
 		qosLimitsFound := sets.NewString()
 		for name, quantity := range container.Resources.Limits {
-			if !supportedQoSComputeResources.Has(string(name)) {
+			if !isSupportedQoSComputeResource(name) {
 				continue
 			}
 			if quantity.Cmp(zeroQuantity) == 1 {
diff --git a/pkg/api/v1/helper/qos/qos_test.go b/pkg/api/v1/helper/qos/qos_test.go
index 59c8bfad926..a48387c3ded 100644
--- a/pkg/api/v1/helper/qos/qos_test.go
+++ b/pkg/api/v1/helper/qos/qos_test.go
@@ -132,7 +132,7 @@ func TestGetPodQOS(t *testing.T) {
 		},
 		{
 			pod: newPod("burstable-hugepages", []v1.Container{
-				newContainer("burstable", getResourceList("0", "0"), addResource("hugepages-2Mi", "1Gi", getResourceList("0", "0"))),
+				newContainer("burstable", addResource("hugepages-2Mi", "1Gi", getResourceList("0", "0")), addResource("hugepages-2Mi", "1Gi", getResourceList("0", "0"))),
 			}),
 			expected: v1.PodQOSBurstable,
 		},
@@ -147,7 +147,7 @@ func TestGetPodQOS(t *testing.T) {
 		k8sv1.Convert_v1_Pod_To_api_Pod(testCase.pod, &pod, nil)
 
 		if actual := qos.GetPodQOS(&pod); api.PodQOSClass(testCase.expected) != actual {
-			t.Errorf("[%d]: invalid qos pod %s, expected: %s, actual: %s", id, testCase.pod.Name, testCase.expected, actual)
+			t.Errorf("[%d]: conversion invalid qos pod %s, expected: %s, actual: %s", id, testCase.pod.Name, testCase.expected, actual)
 		}
 	}
 }
diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates.go b/plugin/pkg/scheduler/algorithm/predicates/predicates.go
index 427700b5300..92b5e4fe81c 100644
--- a/plugin/pkg/scheduler/algorithm/predicates/predicates.go
+++ b/plugin/pkg/scheduler/algorithm/predicates/predicates.go
@@ -509,6 +509,12 @@ func GetResourceRequest(pod *v1.Pod) *schedulercache.Resource {
 						result.SetExtended(rName, value)
 					}
 				}
+				if v1helper.IsHugePageResourceName(rName) {
+					value := rQuantity.Value()
+					if value > result.HugePages[rName] {
+						result.SetHugePages(rName, value)
+					}
+				}
 			}
 		}
 	}
@@ -542,7 +548,12 @@ func PodFitsResources(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.No
 		// We couldn't parse metadata - fallback to computing it.
 		podRequest = GetResourceRequest(pod)
 	}
-	if podRequest.MilliCPU == 0 && podRequest.Memory == 0 && podRequest.NvidiaGPU == 0 && podRequest.EphemeralStorage == 0 && len(podRequest.ExtendedResources) == 0 {
+	if podRequest.MilliCPU == 0 &&
+		podRequest.Memory == 0 &&
+		podRequest.NvidiaGPU == 0 &&
+		podRequest.EphemeralStorage == 0 &&
+		len(podRequest.ExtendedResources) == 0 &&
+		len(podRequest.HugePages) == 0 {
 		return len(predicateFails) == 0, predicateFails, nil
 	}
 
@@ -567,6 +578,12 @@ func PodFitsResources(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.No
 		}
 	}
 
+	for rName, rQuant := range podRequest.HugePages {
+		if allocatable.HugePages[rName] < rQuant+nodeInfo.RequestedResource().HugePages[rName] {
+			predicateFails = append(predicateFails, NewInsufficientResourceError(rName, podRequest.HugePages[rName], nodeInfo.RequestedResource().HugePages[rName], allocatable.HugePages[rName]))
+		}
+	}
+
 	if glog.V(10) {
 		if len(predicateFails) == 0 {
 			// We explicitly don't do glog.V(10).Infof() to avoid computing all the parameters if this is
diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go b/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go
index 4f5c877a866..734ff26a78c 100644
--- a/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go
+++ b/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go
@@ -73,11 +73,12 @@ func (pvs FakePersistentVolumeInfo) GetPersistentVolumeInfo(pvID string) (*v1.Pe
 }
 
 var (
-	opaqueResourceA = v1helper.OpaqueIntResourceName("AAA")
-	opaqueResourceB = v1helper.OpaqueIntResourceName("BBB")
+	opaqueResourceA   = v1helper.OpaqueIntResourceName("AAA")
+	opaqueResourceB   = v1helper.OpaqueIntResourceName("BBB")
+	hugePageResourceA = v1helper.HugePageResourceName(resource.MustParse("2Mi"))
 )
 
-func makeResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA, storage int64) v1.NodeResources {
+func makeResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA, storage, hugePageA int64) v1.NodeResources {
 	return v1.NodeResources{
 		Capacity: v1.ResourceList{
 			v1.ResourceCPU: *resource.NewMilliQuantity(milliCPU, resource.DecimalSI),
@@ -86,11 +87,12 @@ func makeResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA, storage int64) v
 			v1.ResourceNvidiaGPU: *resource.NewQuantity(nvidiaGPUs, resource.DecimalSI),
 			opaqueResourceA: *resource.NewQuantity(opaqueA, resource.DecimalSI),
 			v1.ResourceEphemeralStorage: *resource.NewQuantity(storage, resource.BinarySI),
+			hugePageResourceA: *resource.NewQuantity(hugePageA, resource.BinarySI),
 		},
 	}
 }
 
-func makeAllocatableResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA, storage int64) v1.ResourceList {
+func makeAllocatableResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA, storage, hugePageA int64) v1.ResourceList {
 	return v1.ResourceList{
 		v1.ResourceCPU: *resource.NewMilliQuantity(milliCPU, resource.DecimalSI),
 		v1.ResourceMemory: *resource.NewQuantity(memory, resource.BinarySI),
@@ -98,6 +100,7 @@ func makeAllocatableResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA, stora
 		v1.ResourceNvidiaGPU: *resource.NewQuantity(nvidiaGPUs, resource.DecimalSI),
 		opaqueResourceA: *resource.NewQuantity(opaqueA, resource.DecimalSI),
 		v1.ResourceEphemeralStorage: *resource.NewQuantity(storage, resource.BinarySI),
+		hugePageResourceA: *resource.NewQuantity(hugePageA, resource.BinarySI),
 	}
 }
 
@@ -348,10 +351,38 @@ func TestPodFitsResources(t *testing.T) {
 			test: "opaque resource allocatable enforced for unknown resource for init container",
 			reasons: []algorithm.PredicateFailureReason{NewInsufficientResourceError(opaqueResourceB, 1, 0, 0)},
 		},
+		{
+			pod: newResourcePod(
+				schedulercache.Resource{MilliCPU: 1, Memory: 1, HugePages: map[v1.ResourceName]int64{hugePageResourceA: 10}}),
+			nodeInfo: schedulercache.NewNodeInfo(
+				newResourcePod(schedulercache.Resource{MilliCPU: 0, Memory: 0, HugePages: map[v1.ResourceName]int64{hugePageResourceA: 0}})),
+			fits: false,
+			test: "hugepages resource capacity enforced",
+			reasons: []algorithm.PredicateFailureReason{NewInsufficientResourceError(hugePageResourceA, 10, 0, 5)},
+		},
+		{
+			pod: newResourceInitPod(newResourcePod(schedulercache.Resource{}),
+				schedulercache.Resource{MilliCPU: 1, Memory: 1, HugePages: map[v1.ResourceName]int64{hugePageResourceA: 10}}),
+			nodeInfo: schedulercache.NewNodeInfo(
+				newResourcePod(schedulercache.Resource{MilliCPU: 0, Memory: 0, HugePages: map[v1.ResourceName]int64{hugePageResourceA: 0}})),
+			fits: false,
+			test: "hugepages resource capacity enforced for init container",
+			reasons: []algorithm.PredicateFailureReason{NewInsufficientResourceError(hugePageResourceA, 10, 0, 5)},
+		},
+		{
+			pod: newResourcePod(
+				schedulercache.Resource{MilliCPU: 1, Memory: 1, HugePages: map[v1.ResourceName]int64{hugePageResourceA: 3}},
+				schedulercache.Resource{MilliCPU: 1, Memory: 1, HugePages: map[v1.ResourceName]int64{hugePageResourceA: 3}}),
+			nodeInfo: schedulercache.NewNodeInfo(
+				newResourcePod(schedulercache.Resource{MilliCPU: 0, Memory: 0, HugePages: map[v1.ResourceName]int64{hugePageResourceA: 2}})),
+			fits: false,
+			test: "hugepages resource allocatable enforced for multiple containers",
+			reasons: []algorithm.PredicateFailureReason{NewInsufficientResourceError(hugePageResourceA, 6, 2, 5)},
+		},
 	}
 
 	for _, test := range enoughPodsTests {
-		node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 5, 20).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 5, 20)}}
+		node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 5, 20, 5).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 5, 20, 5)}}
 		test.nodeInfo.SetNode(&node)
 		fits, reasons, err := PodFitsResources(test.pod, PredicateMetadata(test.pod, nil), test.nodeInfo)
 		if err != nil {
@@ -406,7 +437,7 @@ func TestPodFitsResources(t *testing.T) {
 		},
 	}
 	for _, test := range notEnoughPodsTests {
-		node := v1.Node{Status: v1.NodeStatus{Capacity: v1.ResourceList{}, Allocatable: makeAllocatableResources(10, 20, 0, 1, 0, 0)}}
+		node := v1.Node{Status: v1.NodeStatus{Capacity: v1.ResourceList{}, Allocatable: makeAllocatableResources(10, 20, 0, 1, 0, 0, 0)}}
 		test.nodeInfo.SetNode(&node)
 		fits, reasons, err := PodFitsResources(test.pod, PredicateMetadata(test.pod, nil), test.nodeInfo)
 		if err != nil {
@@ -464,7 +495,7 @@ func TestPodFitsResources(t *testing.T) {
 	}
 
 	for _, test := range storagePodsTests {
-		node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 5, 20).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 5, 20)}}
+		node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 5, 20, 5).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 5, 20, 5)}}
 		test.nodeInfo.SetNode(&node)
 		fits, reasons, err := PodFitsResources(test.pod, PredicateMetadata(test.pod, nil), test.nodeInfo)
 		if err != nil {
@@ -1889,7 +1920,7 @@ func TestRunGeneralPredicates(t *testing.T) {
 			newResourcePod(schedulercache.Resource{MilliCPU: 9, Memory: 19})),
 		node: &v1.Node{
 			ObjectMeta: metav1.ObjectMeta{Name: "machine1"},
-			Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0)},
+			Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0, 0)},
 		},
 		fits: true,
 		wErr: nil,
@@ -1901,7 +1932,7 @@ func TestRunGeneralPredicates(t *testing.T) {
 			newResourcePod(schedulercache.Resource{MilliCPU: 5, Memory: 19})),
 		node: &v1.Node{
 			ObjectMeta: metav1.ObjectMeta{Name: "machine1"},
-			Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0)},
+			Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0, 0)},
 		},
 		fits: false,
 		wErr: nil,
@@ -1915,7 +1946,7 @@ func TestRunGeneralPredicates(t *testing.T) {
 		pod: &v1.Pod{},
 		nodeInfo: schedulercache.NewNodeInfo(
 			newResourcePod(schedulercache.Resource{MilliCPU: 9, Memory: 19})),
-		node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0)}},
+		node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0, 0)}},
 		fits: true,
 		wErr: nil,
 		test: "no resources/port/host requested always fits on GPU machine",
@@ -1924,7 +1955,7 @@ func TestRunGeneralPredicates(t *testing.T) {
 		pod: newResourcePod(schedulercache.Resource{MilliCPU: 3, Memory: 1, NvidiaGPU: 1}),
 		nodeInfo: schedulercache.NewNodeInfo(
 			newResourcePod(schedulercache.Resource{MilliCPU: 5, Memory: 10, NvidiaGPU: 1})),
-		node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0)}},
+		node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0, 0)}},
 		fits: false,
 		wErr: nil,
 		reasons: []algorithm.PredicateFailureReason{NewInsufficientResourceError(v1.ResourceNvidiaGPU, 1, 1, 1)},
@@ -1934,7 +1965,7 @@ func TestRunGeneralPredicates(t *testing.T) {
 		pod: newResourcePod(schedulercache.Resource{MilliCPU: 3, Memory: 1, NvidiaGPU: 1}),
 		nodeInfo: schedulercache.NewNodeInfo(
 			newResourcePod(schedulercache.Resource{MilliCPU: 5, Memory: 10, NvidiaGPU: 0})),
-		node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0)}},
+		node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0, 0)}},
 		fits: true,
 		wErr: nil,
 		test: "enough GPU resource",
@@ -1948,7 +1979,7 @@ func TestRunGeneralPredicates(t *testing.T) {
 		nodeInfo: schedulercache.NewNodeInfo(),
 		node: &v1.Node{
 			ObjectMeta: metav1.ObjectMeta{Name: "machine1"},
-			Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0)},
+			Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0, 0)},
 		},
 		fits: false,
 		wErr: nil,
@@ -1960,7 +1991,7 @@ func TestRunGeneralPredicates(t *testing.T) {
 		nodeInfo: schedulercache.NewNodeInfo(newPodWithPort(123)),
 		node: &v1.Node{
 			ObjectMeta: metav1.ObjectMeta{Name: "machine1"},
-			Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0)},
+			Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0, 0)},
 		},
 		fits: false,
 		wErr: nil,
@@ -3252,7 +3283,7 @@ func TestPodSchedulesOnNodeWithMemoryPressureCondition(t *testing.T) {
 					ImagePullPolicy: "Always",
 					// at least one requirement -> burstable pod
 					Resources: v1.ResourceRequirements{
-						Requests: makeAllocatableResources(100, 100, 100, 100, 0, 0),
+						Requests: makeAllocatableResources(100, 100, 100, 100, 0, 0, 0),
 					},
 				},
 			},
diff --git a/plugin/pkg/scheduler/schedulercache/node_info.go b/plugin/pkg/scheduler/schedulercache/node_info.go
index dd3f8206b09..0ec8a94daa9 100644
--- a/plugin/pkg/scheduler/schedulercache/node_info.go
+++ b/plugin/pkg/scheduler/schedulercache/node_info.go
@@ -72,6 +72,7 @@ type Resource struct {
 	// explicitly as int, to avoid conversions and improve performance.
 	AllowedPodNumber  int
 	ExtendedResources map[v1.ResourceName]int64
+	HugePages         map[v1.ResourceName]int64
 }
 
 // New creates a Resource from ResourceList
@@ -103,6 +104,9 @@ func (r *Resource) Add(rl v1.ResourceList) {
 			if v1helper.IsExtendedResourceName(rName) {
 				r.AddExtended(rName, rQuant.Value())
 			}
+			if v1helper.IsHugePageResourceName(rName) {
+				r.AddHugePages(rName, rQuant.Value())
+			}
 		}
 	}
 }
@@ -118,6 +122,9 @@ func (r *Resource) ResourceList() v1.ResourceList {
 	for rName, rQuant := range r.ExtendedResources {
 		result[rName] = *resource.NewQuantity(rQuant, resource.DecimalSI)
 	}
+	for rName, rQuant := range r.HugePages {
+		result[rName] = *resource.NewQuantity(rQuant, resource.BinarySI)
+	}
 	return result
 }
 
@@ -135,6 +142,12 @@ func (r *Resource) Clone() *Resource {
 			res.ExtendedResources[k] = v
 		}
 	}
+	if r.HugePages != nil {
+		res.HugePages = make(map[v1.ResourceName]int64)
+		for k, v := range r.HugePages {
+			res.HugePages[k] = v
+		}
+	}
 	return res
 }
 
@@ -150,6 +163,18 @@ func (r *Resource) SetExtended(name v1.ResourceName, quantity int64) {
 	r.ExtendedResources[name] = quantity
 }
 
+func (r *Resource) AddHugePages(name v1.ResourceName, quantity int64) {
+	r.SetHugePages(name, r.HugePages[name]+quantity)
+}
+
+func (r *Resource) SetHugePages(name v1.ResourceName, quantity int64) {
+	// Lazily allocate hugepages resource map.
+	if r.HugePages == nil {
+		r.HugePages = map[v1.ResourceName]int64{}
+	}
+	r.HugePages[name] = quantity
+}
+
 // NewNodeInfo returns a ready to use empty NodeInfo object.
 // If any pods are given in arguments, their information will be aggregated in
 // the returned object.
@@ -307,6 +332,12 @@ func (n *NodeInfo) addPod(pod *v1.Pod) {
 	for rName, rQuant := range res.ExtendedResources {
 		n.requestedResource.ExtendedResources[rName] += rQuant
 	}
+	if n.requestedResource.HugePages == nil && len(res.HugePages) > 0 {
+		n.requestedResource.HugePages = map[v1.ResourceName]int64{}
+	}
+	for rName, rQuant := range res.HugePages {
+		n.requestedResource.HugePages[rName] += rQuant
+	}
 	n.nonzeroRequest.MilliCPU += non0_cpu
 	n.nonzeroRequest.Memory += non0_mem
 	n.pods = append(n.pods, pod)
@@ -362,6 +393,12 @@ func (n *NodeInfo) removePod(pod *v1.Pod) error {
 	for rName, rQuant := range res.ExtendedResources {
 		n.requestedResource.ExtendedResources[rName] -= rQuant
 	}
+	if len(res.HugePages) > 0 && n.requestedResource.HugePages == nil {
+		n.requestedResource.HugePages = map[v1.ResourceName]int64{}
+	}
+	for rName, rQuant := range res.HugePages {
+		n.requestedResource.HugePages[rName] -= rQuant
+	}
 	n.nonzeroRequest.MilliCPU -= non0_cpu
 	n.nonzeroRequest.Memory -= non0_mem
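
The scheduler-side change above boils down to two pieces: per-size hugepages quantities accumulated into Resource.HugePages (AddHugePages/SetHugePages, and the +=/-= bookkeeping in addPod/removePod), and the new loop in PodFitsResources that rejects a pod when its hugepages request plus what the node has already committed exceeds the node's allocatable amount. A minimal, self-contained sketch of that arithmetic follows; the resourceName/resources types and the fitsHugePages helper are illustrative stand-ins, not the real schedulercache or predicates APIs.

package main

import "fmt"

// resourceName stands in for v1.ResourceName in this sketch.
type resourceName string

// resources mirrors the per-size int64 accounting the patch adds as Resource.HugePages.
type resources struct {
	hugePages map[resourceName]int64
}

// addHugePages mirrors Resource.AddHugePages/SetHugePages: lazily allocate the
// map, then accumulate the quantity for the given hugepages size.
func (r *resources) addHugePages(name resourceName, quantity int64) {
	if r.hugePages == nil {
		r.hugePages = map[resourceName]int64{}
	}
	r.hugePages[name] += quantity
}

// fitsHugePages mirrors the new check in PodFitsResources: for every hugepages
// size the pod requests, the request plus what is already requested on the
// node must not exceed the node's allocatable amount.
func fitsHugePages(podRequest, nodeRequested, allocatable *resources) bool {
	for name, quant := range podRequest.hugePages {
		if allocatable.hugePages[name] < quant+nodeRequested.hugePages[name] {
			return false
		}
	}
	return true
}

func main() {
	const hugepages2Mi = resourceName("hugepages-2Mi")

	// Numbers echo the new test cases: allocatable 5, 2 already requested.
	allocatable := &resources{hugePages: map[resourceName]int64{hugepages2Mi: 5}}
	nodeRequested := &resources{}
	nodeRequested.addHugePages(hugepages2Mi, 2)

	pod := &resources{}
	pod.addHugePages(hugepages2Mi, 3)
	fmt.Println(fitsHugePages(pod, nodeRequested, allocatable)) // true: 3+2 <= 5

	// A second container asking for 3 more pushes the pod total to 6, as in the
	// "hugepages resource allocatable enforced for multiple containers" case.
	pod.addHugePages(hugepages2Mi, 3)
	fmt.Println(fitsHugePages(pod, nodeRequested, allocatable)) // false: 6+2 > 5
}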