From 943fc53bf727ff6e005378a872710071042abf72 Mon Sep 17 00:00:00 2001
From: Jing Xu
Date: Tue, 30 May 2017 12:41:31 -0700
Subject: [PATCH] Add predicates check for local storage request

This PR adds a check for local storage requests when admitting pods. If the
local storage request exceeds the available resource, the pod is rejected.
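Roughly, the new PodFitsResources storage check works like this: a pod's
scratch request is the sum of its emptyDir sizeLimits (memory-medium
emptyDirs are excluded) plus, on nodes that do not report a separate overlay
capacity, its overlay (writable layer) request. The sketch below is only an
illustration of that arithmetic with made-up helper types, not the actual
scheduler code:

    // Illustrative sketch only (hypothetical types, not the real
    // schedulercache.Resource/NodeInfo structures): mirrors the intent of the
    // scratch/overlay check added to PodFitsResources in this patch.
    package main

    import "fmt"

    type storage struct {
        scratch int64 // root filesystem (scratch) space, in bytes
        overlay int64 // container writable-layer (overlay) space, in bytes
    }

    // fitsStorage reports whether a pod's storage request fits on a node.
    // When the node does not expose a separate overlay allocatable, overlay
    // requests are charged against the shared scratch space.
    func fitsStorage(allocatable, inUse, pod storage) bool {
        scratch := pod.scratch
        if allocatable.overlay == 0 {
            scratch += pod.overlay
            return allocatable.scratch >= scratch+inUse.scratch+inUse.overlay
        }
        if allocatable.scratch < scratch+inUse.scratch {
            return false
        }
        return allocatable.overlay >= pod.overlay+inUse.overlay
    }

    func main() {
        alloc := storage{scratch: 20}            // node reports only scratch capacity
        inUse := storage{overlay: 5}             // already requested by running pods
        pod := storage{scratch: 15, overlay: 10} // emptyDir sizeLimit + overlay request
        fmt.Println(fitsStorage(alloc, inUse, pod)) // false: 15+10+5 > 20
    }

With the kube-reserved change below, local storage for the root filesystem can
also be reserved, e.g. --kube-reserved=cpu=200m,memory=500Mi,storage=1Gi.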
---
 cmd/kubelet/app/options/options.go            |   2 +-
 cmd/kubelet/app/server.go                     |   2 +-
 pkg/apis/componentconfig/v1alpha1/types.go    |   2 +-
 pkg/kubelet/cadvisor/util.go                  |   9 ++
 pkg/kubelet/cm/node_container_manager.go      |  12 +-
 pkg/kubelet/eviction/helpers_test.go          |  13 +-
 pkg/kubelet/kubelet_node_status_test.go       |  75 ++++++-----
 .../algorithm/predicates/predicates.go        |  32 +++++
 .../algorithm/predicates/predicates_test.go   | 125 ++++++++++++++++--
 .../pkg/scheduler/schedulercache/node_info.go |  24 +++-
 test/e2e_node/BUILD                           |   1 +
 11 files changed, 240 insertions(+), 57 deletions(-)

diff --git a/cmd/kubelet/app/options/options.go b/cmd/kubelet/app/options/options.go
index b5dc81df4af..d5e412a618f 100644
--- a/cmd/kubelet/app/options/options.go
+++ b/cmd/kubelet/app/options/options.go
@@ -291,7 +291,7 @@ func (c *kubeletConfiguration) addFlags(fs *pflag.FlagSet) {
     // Node Allocatable Flags
     fs.Var(&c.SystemReserved, "system-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi) pairs that describe resources reserved for non-kubernetes components. Currently only cpu and memory are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]")
-    fs.Var(&c.KubeReserved, "kube-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi) pairs that describe resources reserved for kubernetes system components. Currently only cpu and memory are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]")
+    fs.Var(&c.KubeReserved, "kube-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi, storage=1Gi) pairs that describe resources reserved for kubernetes system components. Currently cpu, memory and local storage for root file system are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]")
     fs.StringSliceVar(&c.EnforceNodeAllocatable, "enforce-node-allocatable", c.EnforceNodeAllocatable, "A comma separated list of levels of node allocatable enforcement to be enforced by kubelet. Acceptible options are 'pods', 'system-reserved' & 'kube-reserved'. If the latter two options are specified, '--system-reserved-cgroup' & '--kube-reserved-cgroup' must also be set respectively. See https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md for more details.")
     fs.StringVar(&c.SystemReservedCgroup, "system-reserved-cgroup", c.SystemReservedCgroup, "Absolute name of the top level cgroup that is used to manage non-kubernetes components for which compute resources were reserved via '--system-reserved' flag. Ex. '/system-reserved'. [default='']")
     fs.StringVar(&c.KubeReservedCgroup, "kube-reserved-cgroup", c.KubeReservedCgroup, "Absolute name of the top level cgroup that is used to manage kubernetes components for which compute resources were reserved via '--kube-reserved' flag. Ex. '/kube-reserved'. [default='']")
diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go
index 9470c3cdc5f..1d4d59979fb 100644
--- a/cmd/kubelet/app/server.go
+++ b/cmd/kubelet/app/server.go
@@ -914,7 +914,7 @@ func parseResourceList(m componentconfig.ConfigurationMap) (v1.ResourceList, err
     rl := make(v1.ResourceList)
     for k, v := range m {
         switch v1.ResourceName(k) {
-        // Only CPU and memory resources are supported.
+        // CPU, memory and local storage resources are supported.
         case v1.ResourceCPU, v1.ResourceMemory, v1.ResourceStorage:
             q, err := resource.ParseQuantity(v)
             if err != nil {
diff --git a/pkg/apis/componentconfig/v1alpha1/types.go b/pkg/apis/componentconfig/v1alpha1/types.go
index 2b010a2932b..ff700bbf4dd 100644
--- a/pkg/apis/componentconfig/v1alpha1/types.go
+++ b/pkg/apis/componentconfig/v1alpha1/types.go
@@ -551,7 +551,7 @@ type KubeletConfiguration struct {
     SystemReserved map[string]string `json:"systemReserved"`
     // A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs
     // that describe resources reserved for kubernetes system components.
-    // Currently only cpu and memory are supported. [default=none]
+    // Currently cpu, memory and local storage for root file system are supported. [default=none]
     // See http://kubernetes.io/docs/user-guide/compute-resources for more detail.
     KubeReserved map[string]string `json:"kubeReserved"`
diff --git a/pkg/kubelet/cadvisor/util.go b/pkg/kubelet/cadvisor/util.go
index abab696f660..4b8a67340c7 100644
--- a/pkg/kubelet/cadvisor/util.go
+++ b/pkg/kubelet/cadvisor/util.go
@@ -43,3 +43,12 @@ func StorageScratchCapacityFromFsInfo(info cadvisorapi2.FsInfo) v1.ResourceList
     }
     return c
 }
+
+func StorageOverlayCapacityFromFsInfo(info cadvisorapi2.FsInfo) v1.ResourceList {
+    c := v1.ResourceList{
+        v1.ResourceStorageOverlay: *resource.NewQuantity(
+            int64(info.Capacity),
+            resource.BinarySI),
+    }
+    return c
+}
diff --git a/pkg/kubelet/cm/node_container_manager.go b/pkg/kubelet/cm/node_container_manager.go
index cb91ab550e0..e9391076903 100644
--- a/pkg/kubelet/cm/node_container_manager.go
+++ b/pkg/kubelet/cm/node_container_manager.go
@@ -185,12 +185,14 @@ func (cm *containerManagerImpl) getNodeAllocatableAbsolute() v1.ResourceList {
 func (cm *containerManagerImpl) GetNodeAllocatableReservation() v1.ResourceList {
     evictionReservation := hardEvictionReservation(cm.HardEvictionThresholds, cm.capacity)
     if _, ok := cm.capacity[v1.ResourceStorage]; !ok {
-        if rootfs, err := cm.cadvisorInterface.RootFsInfo(); err == nil {
-            for rName, rCap := range cadvisor.StorageScratchCapacityFromFsInfo(rootfs) {
-                cm.capacity[rName] = rCap
+        if cm.cadvisorInterface != nil {
+            if rootfs, err := cm.cadvisorInterface.RootFsInfo(); err == nil {
+                for rName, rCap := range cadvisor.StorageScratchCapacityFromFsInfo(rootfs) {
+                    cm.capacity[rName] = rCap
+                }
+            } else {
+                glog.Warningf("Error getting rootfs info: %v", err)
             }
-        } else {
-            glog.Warning("Error getting rootfs info: %v", err)
         }
     }
     result := make(v1.ResourceList)
diff --git a/pkg/kubelet/eviction/helpers_test.go b/pkg/kubelet/eviction/helpers_test.go
index 5772a41ab16..fda1d010e1e 100644
--- a/pkg/kubelet/eviction/helpers_test.go
+++ b/pkg/kubelet/eviction/helpers_test.go
@@ -76,6 +76,16 @@ func TestParseThresholdConfig(t *testing.T) {
                     Quantity: quantityMustParse("0"),
                 },
             },
+            {
+                Signal:   evictionapi.SignalAllocatableNodeFsAvailable,
+                Operator: evictionapi.OpLessThan,
+                Value: evictionapi.ThresholdValue{
+                    Quantity: quantityMustParse("0"),
+                },
+                MinReclaim: &evictionapi.ThresholdValue{
+                    Quantity: quantityMustParse("0"),
+                },
+            },
             {
                 Signal:   evictionapi.SignalMemoryAvailable,
                 Operator: evictionapi.OpLessThan,
                 Value: evictionapi.ThresholdValue{
                     Quantity: quantityMustParse("0"),
                 },
             },
@@ -777,8 +787,7 @@ func TestMakeSignalObservations(t *testing.T) {
     if res.CmpInt64(int64(allocatableMemoryCapacity)) != 0 {
         t.Errorf("Expected Threshold %v to be equal to value %v", res.Value(), allocatableMemoryCapacity)
     }
-    actualObservations, statsFunc, err := makeSignalObservations(provider, nodeProvider)
-
+    actualObservations, statsFunc, err := makeSignalObservations(provider, nodeProvider, pods, false)
     if err != nil {
         t.Errorf("Unexpected err: %v", err)
     }
diff --git a/pkg/kubelet/kubelet_node_status_test.go b/pkg/kubelet/kubelet_node_status_test.go
index 96161b5de59..74382a57f7f 100644
--- a/pkg/kubelet/kubelet_node_status_test.go
+++ b/pkg/kubelet/kubelet_node_status_test.go
@@ -208,14 +208,16 @@ func TestUpdateNewNodeStatus(t *testing.T) {
                 KubeProxyVersion: version.Get().String(),
             },
             Capacity: v1.ResourceList{
-                v1.ResourceCPU:    *resource.NewMilliQuantity(2000, resource.DecimalSI),
-                v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI),
-                v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
+                v1.ResourceCPU:     *resource.NewMilliQuantity(2000, resource.DecimalSI),
+                v1.ResourceMemory:  *resource.NewQuantity(10E9, resource.BinarySI),
+                v1.ResourceStorage: *resource.NewQuantity(500*mb, resource.BinarySI),
+                v1.ResourcePods:    *resource.NewQuantity(0, resource.DecimalSI),
             },
             Allocatable: v1.ResourceList{
-                v1.ResourceCPU:    *resource.NewMilliQuantity(1800, resource.DecimalSI),
-                v1.ResourceMemory: *resource.NewQuantity(9900E6, resource.BinarySI),
-                v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
+                v1.ResourceCPU:     *resource.NewMilliQuantity(1800, resource.DecimalSI),
+                v1.ResourceMemory:  *resource.NewQuantity(9900E6, resource.BinarySI),
+                v1.ResourceStorage: *resource.NewQuantity(500*mb, resource.BinarySI),
+                v1.ResourcePods:    *resource.NewQuantity(0, resource.DecimalSI),
             },
             Addresses: []v1.NodeAddress{
                 {Type: v1.NodeInternalIP, Address: "127.0.0.1"},
@@ -361,14 +363,16 @@ func TestUpdateExistingNodeStatus(t *testing.T) {
                 },
             },
             Capacity: v1.ResourceList{
-                v1.ResourceCPU:    *resource.NewMilliQuantity(3000, resource.DecimalSI),
-                v1.ResourceMemory: *resource.NewQuantity(20E9, resource.BinarySI),
-                v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
+                v1.ResourceCPU:     *resource.NewMilliQuantity(3000, resource.DecimalSI),
+                v1.ResourceMemory:  *resource.NewQuantity(20E9, resource.BinarySI),
+                v1.ResourceStorage: *resource.NewQuantity(500*mb, resource.BinarySI),
+                v1.ResourcePods:    *resource.NewQuantity(0, resource.DecimalSI),
             },
             Allocatable: v1.ResourceList{
-                v1.ResourceCPU:    *resource.NewMilliQuantity(2800, resource.DecimalSI),
-                v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI),
-                v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
+                v1.ResourceCPU:     *resource.NewMilliQuantity(2800, resource.DecimalSI),
+                v1.ResourceMemory:  *resource.NewQuantity(19900E6, resource.BinarySI),
+                v1.ResourceStorage: *resource.NewQuantity(500*mb, resource.BinarySI),
+                v1.ResourcePods:    *resource.NewQuantity(0, resource.DecimalSI),
             },
         },
     }
@@ -444,14 +448,16 @@ func TestUpdateExistingNodeStatus(t *testing.T) {
                 KubeProxyVersion: version.Get().String(),
             },
             Capacity: v1.ResourceList{
-                v1.ResourceCPU:    *resource.NewMilliQuantity(2000, resource.DecimalSI),
-                v1.ResourceMemory: *resource.NewQuantity(20E9, resource.BinarySI),
-                v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
+                v1.ResourceCPU:     *resource.NewMilliQuantity(2000, resource.DecimalSI),
+                v1.ResourceMemory:  *resource.NewQuantity(20E9, resource.BinarySI),
+                v1.ResourceStorage: *resource.NewQuantity(500*mb, resource.BinarySI),
+                v1.ResourcePods:    *resource.NewQuantity(0, resource.DecimalSI),
             },
             Allocatable: v1.ResourceList{
-                v1.ResourceCPU:    *resource.NewMilliQuantity(1800, resource.DecimalSI),
-                v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI),
-                v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
+                v1.ResourceCPU:     *resource.NewMilliQuantity(1800, resource.DecimalSI),
+                v1.ResourceMemory:  *resource.NewQuantity(19900E6, resource.BinarySI),
+                v1.ResourceStorage: *resource.NewQuantity(500*mb, resource.BinarySI),
+                v1.ResourcePods:    *resource.NewQuantity(0, resource.DecimalSI),
             },
             Addresses: []v1.NodeAddress{
                 {Type: v1.NodeInternalIP, Address: "127.0.0.1"},
@@ -655,8 +661,9 @@ func TestUpdateNodeStatusWithRuntimeStateError(t *testing.T) {
     kubelet.containerManager = &localCM{
         ContainerManager: cm.NewStubContainerManager(),
         allocatable: v1.ResourceList{
-            v1.ResourceCPU:    *resource.NewMilliQuantity(200, resource.DecimalSI),
-            v1.ResourceMemory: *resource.NewQuantity(100E6, resource.BinarySI),
+            v1.ResourceCPU:     *resource.NewMilliQuantity(200, resource.DecimalSI),
+            v1.ResourceMemory:  *resource.NewQuantity(100E6, resource.BinarySI),
+            v1.ResourceStorage: *resource.NewQuantity(200*mb, resource.BinarySI),
         },
     }
@@ -727,14 +734,16 @@ func TestUpdateNodeStatusWithRuntimeStateError(t *testing.T) {
                 KubeProxyVersion: version.Get().String(),
             },
             Capacity: v1.ResourceList{
-                v1.ResourceCPU:    *resource.NewMilliQuantity(2000, resource.DecimalSI),
-                v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI),
-                v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
+                v1.ResourceCPU:     *resource.NewMilliQuantity(2000, resource.DecimalSI),
+                v1.ResourceMemory:  *resource.NewQuantity(10E9, resource.BinarySI),
+                v1.ResourceStorage: *resource.NewQuantity(500*mb, resource.BinarySI),
+                v1.ResourcePods:    *resource.NewQuantity(0, resource.DecimalSI),
             },
             Allocatable: v1.ResourceList{
-                v1.ResourceCPU:    *resource.NewMilliQuantity(1800, resource.DecimalSI),
-                v1.ResourceMemory: *resource.NewQuantity(9900E6, resource.BinarySI),
-                v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
+                v1.ResourceCPU:     *resource.NewMilliQuantity(1800, resource.DecimalSI),
+                v1.ResourceMemory:  *resource.NewQuantity(9900E6, resource.BinarySI),
+                v1.ResourceStorage: *resource.NewQuantity(300*mb, resource.BinarySI),
+                v1.ResourcePods:    *resource.NewQuantity(0, resource.DecimalSI),
             },
             Addresses: []v1.NodeAddress{
                 {Type: v1.NodeInternalIP, Address: "127.0.0.1"},
@@ -1141,14 +1150,16 @@ func TestUpdateNewNodeStatusTooLargeReservation(t *testing.T) {
         Spec: v1.NodeSpec{},
         Status: v1.NodeStatus{
             Capacity: v1.ResourceList{
-                v1.ResourceCPU:    *resource.NewMilliQuantity(2000, resource.DecimalSI),
-                v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI),
-                v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
+                v1.ResourceCPU:     *resource.NewMilliQuantity(2000, resource.DecimalSI),
+                v1.ResourceMemory:  *resource.NewQuantity(10E9, resource.BinarySI),
+                v1.ResourceStorage: *resource.NewQuantity(500*mb, resource.BinarySI),
+                v1.ResourcePods:    *resource.NewQuantity(0, resource.DecimalSI),
             },
             Allocatable: v1.ResourceList{
-                v1.ResourceCPU:    *resource.NewMilliQuantity(0, resource.DecimalSI),
-                v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI),
-                v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
+                v1.ResourceCPU:     *resource.NewMilliQuantity(0, resource.DecimalSI),
+                v1.ResourceMemory:  *resource.NewQuantity(10E9, resource.BinarySI),
+                v1.ResourceStorage: *resource.NewQuantity(500*mb, resource.BinarySI),
+                v1.ResourcePods:    *resource.NewQuantity(0, resource.DecimalSI),
             },
         },
     }
diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates.go b/plugin/pkg/scheduler/algorithm/predicates/predicates.go
index 10ea3eafea3..dbf0c055211 100644
--- a/plugin/pkg/scheduler/algorithm/predicates/predicates.go
+++ b/plugin/pkg/scheduler/algorithm/predicates/predicates.go
@@ -508,6 +508,8 @@ func GetResourceRequest(pod *v1.Pod) *schedulercache.Resource {
             result.MilliCPU += rQuantity.MilliValue()
         case v1.ResourceNvidiaGPU:
             result.NvidiaGPU += rQuantity.Value()
+        case v1.ResourceStorageOverlay:
+            result.StorageOverlay += rQuantity.Value()
         default:
             if v1helper.IsOpaqueIntResourceName(rName) {
                 result.AddOpaque(rName, rQuantity.Value())
             }
         }
     }
+    // Account for storage requested by emptyDir volumes.
+    // If the storage medium is memory, the size is excluded.
+    for _, vol := range pod.Spec.Volumes {
+        if vol.EmptyDir != nil && vol.EmptyDir.Medium != v1.StorageMediumMemory {
+
+            result.StorageScratch += vol.EmptyDir.SizeLimit.Value()
+        }
+    }
+
     // take max_resource(sum_pod, any_init_container)
     for _, container := range pod.Spec.InitContainers {
         for rName, rQuantity := range container.Resources.Requests {
@@ -531,6 +542,10 @@ func GetResourceRequest(pod *v1.Pod) *schedulercache.Resource {
                 if gpu := rQuantity.Value(); gpu > result.NvidiaGPU {
                     result.NvidiaGPU = gpu
                 }
+            case v1.ResourceStorageOverlay:
+                if overlay := rQuantity.Value(); overlay > result.StorageOverlay {
+                    result.StorageOverlay = overlay
+                }
             default:
                 if v1helper.IsOpaqueIntResourceName(rName) {
                     value := rQuantity.Value()
@@ -581,6 +596,23 @@ func PodFitsResources(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.No
     if allocatable.NvidiaGPU < podRequest.NvidiaGPU+nodeInfo.RequestedResource().NvidiaGPU {
         predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourceNvidiaGPU, podRequest.NvidiaGPU, nodeInfo.RequestedResource().NvidiaGPU, allocatable.NvidiaGPU))
     }
+
+    scratchSpaceRequest := podRequest.StorageScratch
+    if allocatable.StorageOverlay == 0 {
+        scratchSpaceRequest += podRequest.StorageOverlay
+        //scratchSpaceRequest += nodeInfo.RequestedResource().StorageOverlay
+        nodeScratchRequest := nodeInfo.RequestedResource().StorageOverlay + nodeInfo.RequestedResource().StorageScratch
+        if allocatable.StorageScratch < scratchSpaceRequest+nodeScratchRequest {
+            predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourceStorageScratch, scratchSpaceRequest, nodeScratchRequest, allocatable.StorageScratch))
+        }
+
+    } else if allocatable.StorageScratch < scratchSpaceRequest+nodeInfo.RequestedResource().StorageScratch {
+        predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourceStorageScratch, scratchSpaceRequest, nodeInfo.RequestedResource().StorageScratch, allocatable.StorageScratch))
+    }
+    if allocatable.StorageOverlay > 0 && allocatable.StorageOverlay < podRequest.StorageOverlay+nodeInfo.RequestedResource().StorageOverlay {
+        predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourceStorageOverlay, podRequest.StorageOverlay, nodeInfo.RequestedResource().StorageOverlay, allocatable.StorageOverlay))
+    }
+
     for rName, rQuant := range podRequest.OpaqueIntResources {
         if allocatable.OpaqueIntResources[rName] < rQuant+nodeInfo.RequestedResource().OpaqueIntResources[rName] {
             predicateFails = append(predicateFails, NewInsufficientResourceError(rName, podRequest.OpaqueIntResources[rName], nodeInfo.RequestedResource().OpaqueIntResources[rName], allocatable.OpaqueIntResources[rName]))
diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go b/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go
index 1118c4842b4..05987cf80be 100644
--- a/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go
+++ b/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go
@@ -76,7 +76,7 @@ var (
     opaqueResourceB = v1helper.OpaqueIntResourceName("BBB")
 )
-func makeResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA int64) v1.NodeResources {
+func makeResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA, storage int64) v1.NodeResources {
     return v1.NodeResources{
         Capacity: v1.ResourceList{
             v1.ResourceCPU:       *resource.NewMilliQuantity(milliCPU, resource.DecimalSI),
@@ -84,17 +84,19 @@ func makeResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA int64) v1.NodeRes
             v1.ResourcePods:      *resource.NewQuantity(pods, resource.DecimalSI),
             v1.ResourceNvidiaGPU: *resource.NewQuantity(nvidiaGPUs, resource.DecimalSI),
             opaqueResourceA:      *resource.NewQuantity(opaqueA, resource.DecimalSI),
+            v1.ResourceStorage:   *resource.NewQuantity(storage, resource.BinarySI),
         },
     }
 }
-func makeAllocatableResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA int64) v1.ResourceList {
+func makeAllocatableResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA, storage int64) v1.ResourceList {
     return v1.ResourceList{
         v1.ResourceCPU:       *resource.NewMilliQuantity(milliCPU, resource.DecimalSI),
         v1.ResourceMemory:    *resource.NewQuantity(memory, resource.BinarySI),
         v1.ResourcePods:      *resource.NewQuantity(pods, resource.DecimalSI),
         v1.ResourceNvidiaGPU: *resource.NewQuantity(nvidiaGPUs, resource.DecimalSI),
         opaqueResourceA:      *resource.NewQuantity(opaqueA, resource.DecimalSI),
+        v1.ResourceStorage:   *resource.NewQuantity(storage, resource.BinarySI),
     }
 }
@@ -112,6 +114,25 @@ func newResourcePod(usage ...schedulercache.Resource) *v1.Pod {
     }
 }
+func addStorageLimit(pod *v1.Pod, sizeLimit int64, medium v1.StorageMedium) *v1.Pod {
+    return &v1.Pod{
+        Spec: v1.PodSpec{
+            Containers: pod.Spec.Containers,
+            Volumes: []v1.Volume{
+                {
+                    Name: "emptyDirVolumeName",
+                    VolumeSource: v1.VolumeSource{
+                        EmptyDir: &v1.EmptyDirVolumeSource{
+                            SizeLimit: *resource.NewQuantity(sizeLimit, resource.BinarySI),
+                            Medium:    medium,
+                        },
+                    },
+                },
+            },
+        },
+    }
+}
+
 func newResourceInitPod(pod *v1.Pod, usage ...schedulercache.Resource) *v1.Pod {
     pod.Spec.InitContainers = newResourcePod(usage...).Spec.Containers
     return pod
@@ -329,7 +350,7 @@ func TestPodFitsResources(t *testing.T) {
     }
     for _, test := range enoughPodsTests {
-        node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 5).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 5)}}
+        node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 5, 20).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 5, 20)}}
         test.nodeInfo.SetNode(&node)
         fits, reasons, err := PodFitsResources(test.pod, PredicateMetadata(test.pod, nil), test.nodeInfo)
         if err != nil {
@@ -384,7 +405,7 @@ func TestPodFitsResources(t *testing.T) {
         },
     }
     for _, test := range notEnoughPodsTests {
-        node := v1.Node{Status: v1.NodeStatus{Capacity: v1.ResourceList{}, Allocatable: makeAllocatableResources(10, 20, 0, 1, 0)}}
+        node := v1.Node{Status: v1.NodeStatus{Capacity: v1.ResourceList{}, Allocatable: makeAllocatableResources(10, 20, 0, 1, 0, 0)}}
         test.nodeInfo.SetNode(&node)
         fits, reasons, err := PodFitsResources(test.pod, PredicateMetadata(test.pod, nil), test.nodeInfo)
         if err != nil {
@@ -397,6 +418,86 @@ func TestPodFitsResources(t *testing.T) {
             t.Errorf("%s: expected: %v got %v", test.test, test.fits, fits)
         }
     }
+
+    storagePodsTests := []struct {
+        pod           *v1.Pod
+        emptyDirLimit int64
+        storageMedium v1.StorageMedium
+        nodeInfo      *schedulercache.NodeInfo
+        fits          bool
+        test          string
+        reasons       []algorithm.PredicateFailureReason
+    }{
+        {
+            pod: newResourcePod(schedulercache.Resource{MilliCPU: 1, Memory: 1, StorageOverlay: 1}),
+            nodeInfo: schedulercache.NewNodeInfo(
+                newResourcePod(schedulercache.Resource{MilliCPU: 10, Memory: 10, StorageOverlay: 20})),
+            fits: false,
+            test: "due to init container scratch disk",
+            reasons: []algorithm.PredicateFailureReason{
+                NewInsufficientResourceError(v1.ResourceCPU, 1, 10, 10),
+                NewInsufficientResourceError(v1.ResourceStorageScratch, 1, 20, 20),
+            },
+        },
+        {
+            pod: newResourcePod(schedulercache.Resource{MilliCPU: 1, Memory: 1, StorageOverlay: 10}),
+            nodeInfo: schedulercache.NewNodeInfo(
+                newResourcePod(schedulercache.Resource{MilliCPU: 2, Memory: 10})),
+            fits: true,
+            test: "pod fit",
+        },
+        {
+            pod: newResourcePod(schedulercache.Resource{MilliCPU: 1, Memory: 1, StorageOverlay: 18}),
+            nodeInfo: schedulercache.NewNodeInfo(
+                newResourcePod(schedulercache.Resource{MilliCPU: 2, Memory: 2, StorageOverlay: 5})),
+            fits: false,
+            test: "request exceeds allocatable",
+            reasons: []algorithm.PredicateFailureReason{
+                NewInsufficientResourceError(v1.ResourceStorageScratch, 18, 5, 20),
+            },
+        },
+        {
+            pod:           newResourcePod(schedulercache.Resource{MilliCPU: 1, Memory: 1, StorageOverlay: 10}),
+            emptyDirLimit: 15,
+            storageMedium: v1.StorageMediumDefault,
+            nodeInfo: schedulercache.NewNodeInfo(
+                newResourcePod(schedulercache.Resource{MilliCPU: 2, Memory: 2, StorageOverlay: 5})),
+            fits: false,
+            test: "storage scratch request exceeds allocatable",
+            reasons: []algorithm.PredicateFailureReason{
+                NewInsufficientResourceError(v1.ResourceStorageScratch, 25, 5, 20),
+            },
+        },
+        {
+            pod:           newResourcePod(schedulercache.Resource{MilliCPU: 1, Memory: 1, StorageOverlay: 10}),
+            emptyDirLimit: 15,
+            storageMedium: v1.StorageMediumMemory,
+            nodeInfo: schedulercache.NewNodeInfo(
+                newResourcePod(schedulercache.Resource{MilliCPU: 2, Memory: 2, StorageOverlay: 5})),
+            fits: true,
+            test: "pod fits when emptyDir medium is memory",
+            reasons: []algorithm.PredicateFailureReason{
+                NewInsufficientResourceError(v1.ResourceStorageScratch, 25, 5, 20),
+            },
+        },
+    }
+
+    for _, test := range storagePodsTests {
+        node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 5, 20).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 5, 20)}}
+        test.nodeInfo.SetNode(&node)
+        pod := addStorageLimit(test.pod, test.emptyDirLimit, test.storageMedium)
+        fits, reasons, err := PodFitsResources(pod, PredicateMetadata(pod, nil), test.nodeInfo)
+        if err != nil {
+            t.Errorf("%s: unexpected error: %v", test.test, err)
+        }
+        if !fits && !reflect.DeepEqual(reasons, test.reasons) {
+            t.Errorf("%s: unexpected failure reasons: %v, want: %v", test.test, reasons, test.reasons)
+        }
+        if fits != test.fits {
+            t.Errorf("%s: expected: %v got %v", test.test, test.fits, fits)
+        }
+    }
+
 }

 func TestPodFitsHost(t *testing.T) {
@@ -1843,7 +1944,7 @@ func TestRunGeneralPredicates(t *testing.T) {
                 newResourcePod(schedulercache.Resource{MilliCPU: 9, Memory: 19})),
             node: &v1.Node{
                 ObjectMeta: metav1.ObjectMeta{Name: "machine1"},
-                Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0)},
+                Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0)},
             },
             fits: true,
             wErr: nil,
@@ -1855,7 +1956,7 @@ func TestRunGeneralPredicates(t *testing.T) {
                 newResourcePod(schedulercache.Resource{MilliCPU: 5, Memory: 19})),
             node: &v1.Node{
                 ObjectMeta: metav1.ObjectMeta{Name: "machine1"},
-                Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0)},
+                Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0)},
             },
             fits: false,
             wErr: nil,
@@ -1869,7 +1970,7 @@ func TestRunGeneralPredicates(t *testing.T) {
             pod: &v1.Pod{},
             nodeInfo: schedulercache.NewNodeInfo(
                 newResourcePod(schedulercache.Resource{MilliCPU: 9, Memory: 19})),
-            node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0)}},
+            node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0)}},
             fits: true,
             wErr: nil,
             test: "no resources/port/host requested always fits on GPU machine",
         },
         {
             pod: newResourcePod(schedulercache.Resource{MilliCPU: 3, Memory: 1, NvidiaGPU: 1}),
             nodeInfo: schedulercache.NewNodeInfo(
                 newResourcePod(schedulercache.Resource{MilliCPU: 5, Memory: 10, NvidiaGPU: 1})),
-            node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0)}},
+            node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0)}},
             fits: false,
             wErr: nil,
             reasons: []algorithm.PredicateFailureReason{NewInsufficientResourceError(v1.ResourceNvidiaGPU, 1, 1, 1)},
@@ -1888,7 +1989,7 @@ func TestRunGeneralPredicates(t *testing.T) {
             pod: newResourcePod(schedulercache.Resource{MilliCPU: 3, Memory: 1, NvidiaGPU: 1}),
             nodeInfo: schedulercache.NewNodeInfo(
                 newResourcePod(schedulercache.Resource{MilliCPU: 5, Memory: 10, NvidiaGPU: 0})),
-            node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0)}},
+            node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0)}},
             fits: true,
             wErr: nil,
             test: "enough GPU resource",
@@ -1902,7 +2003,7 @@ func TestRunGeneralPredicates(t *testing.T) {
             nodeInfo: schedulercache.NewNodeInfo(),
             node: &v1.Node{
                 ObjectMeta: metav1.ObjectMeta{Name: "machine1"},
-                Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0)},
+                Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0)},
             },
             fits: false,
             wErr: nil,
@@ -1914,7 +2015,7 @@ func TestRunGeneralPredicates(t *testing.T) {
             nodeInfo: schedulercache.NewNodeInfo(newPodWithPort(123)),
             node: &v1.Node{
                 ObjectMeta: metav1.ObjectMeta{Name: "machine1"},
-                Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0)},
+                Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0)},
             },
             fits: false,
             wErr: nil,
@@ -3249,7 +3350,7 @@ func TestPodSchedulesOnNodeWithMemoryPressureCondition(t *testing.T) {
             ImagePullPolicy: "Always",
             // at least one requirement -> burstable pod
             Resources: v1.ResourceRequirements{
-                Requests: makeAllocatableResources(100, 100, 100, 100, 0),
+                Requests: makeAllocatableResources(100, 100, 100, 100, 0, 0),
             },
         },
     },
diff --git a/plugin/pkg/scheduler/schedulercache/node_info.go b/plugin/pkg/scheduler/schedulercache/node_info.go
index 85f75de97b0..951a1621199 100644
--- a/plugin/pkg/scheduler/schedulercache/node_info.go
+++ b/plugin/pkg/scheduler/schedulercache/node_info.go
@@ -76,9 +76,10 @@ type Resource struct {
 func (r *Resource) ResourceList() v1.ResourceList {
     result := v1.ResourceList{
-        v1.ResourceCPU:       *resource.NewMilliQuantity(r.MilliCPU, resource.DecimalSI),
-        v1.ResourceMemory:    *resource.NewQuantity(r.Memory, resource.BinarySI),
-        v1.ResourceNvidiaGPU: *resource.NewQuantity(r.NvidiaGPU, resource.DecimalSI),
+        v1.ResourceCPU:            *resource.NewMilliQuantity(r.MilliCPU, resource.DecimalSI),
+        v1.ResourceMemory:         *resource.NewQuantity(r.Memory, resource.BinarySI),
+        v1.ResourceNvidiaGPU:      *resource.NewQuantity(r.NvidiaGPU, resource.DecimalSI),
+        v1.ResourceStorageOverlay: *resource.NewQuantity(r.StorageOverlay, resource.BinarySI),
     }
     for rName, rQuant := range r.OpaqueIntResources {
         result[rName] = *resource.NewQuantity(rQuant, resource.DecimalSI)
@@ -264,6 +265,8 @@ func (n *NodeInfo) addPod(pod *v1.Pod) {
     n.requestedResource.MilliCPU += res.MilliCPU
     n.requestedResource.Memory += res.Memory
     n.requestedResource.NvidiaGPU += res.NvidiaGPU
+    n.requestedResource.StorageOverlay += res.StorageOverlay
+    n.requestedResource.StorageScratch += res.StorageScratch
     if n.requestedResource.OpaqueIntResources == nil && len(res.OpaqueIntResources) > 0 {
         n.requestedResource.OpaqueIntResources = map[v1.ResourceName]int64{}
     }
@@ -349,6 +352,8 @@ func calculateResource(pod *v1.Pod) (res Resource, non0_cpu int64, non0_mem int6
             res.Memory += rQuant.Value()
         case v1.ResourceNvidiaGPU:
             res.NvidiaGPU += rQuant.Value()
+        case v1.ResourceStorageOverlay:
+            res.StorageOverlay += rQuant.Value()
         default:
             if v1helper.IsOpaqueIntResourceName(rName) {
                 res.AddOpaque(rName, rQuant.Value())
@@ -361,6 +366,15 @@ func calculateResource(pod *v1.Pod) (res Resource, non0_cpu int64, non0_mem int6
         non0_mem += non0_mem_req
         // No non-zero resources for GPUs or opaque resources.
     }
+
+    // Account for storage requested by emptyDir volumes.
+    // If the storage medium is memory, the size is excluded.
+    for _, vol := range pod.Spec.Volumes {
+        if vol.EmptyDir != nil && vol.EmptyDir.Medium != v1.StorageMediumMemory {
+            res.StorageScratch += vol.EmptyDir.SizeLimit.Value()
+        }
+    }
+
     return
 }
@@ -391,6 +405,10 @@ func (n *NodeInfo) SetNode(node *v1.Node) error {
             n.allocatableResource.NvidiaGPU = rQuant.Value()
         case v1.ResourcePods:
             n.allowedPodNumber = int(rQuant.Value())
+        case v1.ResourceStorage:
+            n.allocatableResource.StorageScratch = rQuant.Value()
+        case v1.ResourceStorageOverlay:
+            n.allocatableResource.StorageOverlay = rQuant.Value()
         default:
             if v1helper.IsOpaqueIntResourceName(rName) {
                 n.allocatableResource.SetOpaque(rName, rQuant.Value())
diff --git a/test/e2e_node/BUILD b/test/e2e_node/BUILD
index 100580d54fe..b8956ab1381 100644
--- a/test/e2e_node/BUILD
+++ b/test/e2e_node/BUILD
@@ -76,6 +76,7 @@ go_test(
     "inode_eviction_test.go",
     "kubelet_test.go",
     "lifecycle_hook_test.go",
+    "local_storage_allocatable_eviction_test.go",
     "log_path_test.go",
     "memory_eviction_test.go",
    "mirror_pod_test.go",