diff --git a/cmd/kubelet/app/options/options.go b/cmd/kubelet/app/options/options.go
index b5dc81df4af..d5e412a618f 100644
--- a/cmd/kubelet/app/options/options.go
+++ b/cmd/kubelet/app/options/options.go
@@ -291,7 +291,7 @@ func (c *kubeletConfiguration) addFlags(fs *pflag.FlagSet) {
 
     // Node Allocatable Flags
     fs.Var(&c.SystemReserved, "system-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi) pairs that describe resources reserved for non-kubernetes components. Currently only cpu and memory are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]")
-    fs.Var(&c.KubeReserved, "kube-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi) pairs that describe resources reserved for kubernetes system components. Currently only cpu and memory are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]")
+    fs.Var(&c.KubeReserved, "kube-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi, storage=1Gi) pairs that describe resources reserved for kubernetes system components. Currently cpu, memory and local storage for the root file system are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]")
     fs.StringSliceVar(&c.EnforceNodeAllocatable, "enforce-node-allocatable", c.EnforceNodeAllocatable, "A comma separated list of levels of node allocatable enforcement to be enforced by kubelet. Acceptible options are 'pods', 'system-reserved' & 'kube-reserved'. If the latter two options are specified, '--system-reserved-cgroup' & '--kube-reserved-cgroup' must also be set respectively. See https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md for more details.")
     fs.StringVar(&c.SystemReservedCgroup, "system-reserved-cgroup", c.SystemReservedCgroup, "Absolute name of the top level cgroup that is used to manage non-kubernetes components for which compute resources were reserved via '--system-reserved' flag. Ex. '/system-reserved'. [default='']")
     fs.StringVar(&c.KubeReservedCgroup, "kube-reserved-cgroup", c.KubeReservedCgroup, "Absolute name of the top level cgroup that is used to manage kubernetes components for which compute resources were reserved via '--kube-reserved' flag. Ex. '/kube-reserved'. [default='']")
diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go
index 9470c3cdc5f..1d4d59979fb 100644
--- a/cmd/kubelet/app/server.go
+++ b/cmd/kubelet/app/server.go
@@ -914,7 +914,7 @@ func parseResourceList(m componentconfig.ConfigurationMap) (v1.ResourceList, err
     rl := make(v1.ResourceList)
     for k, v := range m {
         switch v1.ResourceName(k) {
-        // Only CPU and memory resources are supported.
+        // CPU, memory and local storage resources are supported.
         case v1.ResourceCPU, v1.ResourceMemory, v1.ResourceStorage:
             q, err := resource.ParseQuantity(v)
             if err != nil {
diff --git a/pkg/apis/componentconfig/v1alpha1/types.go b/pkg/apis/componentconfig/v1alpha1/types.go
index 2b010a2932b..ff700bbf4dd 100644
--- a/pkg/apis/componentconfig/v1alpha1/types.go
+++ b/pkg/apis/componentconfig/v1alpha1/types.go
@@ -551,7 +551,7 @@ type KubeletConfiguration struct {
     SystemReserved map[string]string `json:"systemReserved"`
     // A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs
     // that describe resources reserved for kubernetes system components.
-    // Currently only cpu and memory are supported. [default=none]
+    // Currently cpu, memory and local storage for the root file system are supported. [default=none]
     // See http://kubernetes.io/docs/user-guide/compute-resources for more detail.
     KubeReserved map[string]string `json:"kubeReserved"`
 
diff --git a/pkg/kubelet/cadvisor/util.go b/pkg/kubelet/cadvisor/util.go
index abab696f660..4b8a67340c7 100644
--- a/pkg/kubelet/cadvisor/util.go
+++ b/pkg/kubelet/cadvisor/util.go
@@ -43,3 +43,12 @@ func StorageScratchCapacityFromFsInfo(info cadvisorapi2.FsInfo) v1.ResourceList
     }
     return c
 }
+
+// StorageOverlayCapacityFromFsInfo returns the overlay filesystem capacity as a ResourceList.
+func StorageOverlayCapacityFromFsInfo(info cadvisorapi2.FsInfo) v1.ResourceList {
+    c := v1.ResourceList{
+        v1.ResourceStorageOverlay: *resource.NewQuantity(
+            int64(info.Capacity),
+            resource.BinarySI),
+    }
+    return c
+}
diff --git a/pkg/kubelet/cm/node_container_manager.go b/pkg/kubelet/cm/node_container_manager.go
index cb91ab550e0..e9391076903 100644
--- a/pkg/kubelet/cm/node_container_manager.go
+++ b/pkg/kubelet/cm/node_container_manager.go
@@ -185,12 +185,14 @@ func (cm *containerManagerImpl) getNodeAllocatableAbsolute() v1.ResourceList {
 func (cm *containerManagerImpl) GetNodeAllocatableReservation() v1.ResourceList {
     evictionReservation := hardEvictionReservation(cm.HardEvictionThresholds, cm.capacity)
     if _, ok := cm.capacity[v1.ResourceStorage]; !ok {
-        if rootfs, err := cm.cadvisorInterface.RootFsInfo(); err == nil {
-            for rName, rCap := range cadvisor.StorageScratchCapacityFromFsInfo(rootfs) {
-                cm.capacity[rName] = rCap
+        if cm.cadvisorInterface != nil {
+            if rootfs, err := cm.cadvisorInterface.RootFsInfo(); err == nil {
+                for rName, rCap := range cadvisor.StorageScratchCapacityFromFsInfo(rootfs) {
+                    cm.capacity[rName] = rCap
+                }
+            } else {
+                glog.Warningf("Error getting rootfs info: %v", err)
             }
-        } else {
-            glog.Warning("Error getting rootfs info: %v", err)
         }
     }
     result := make(v1.ResourceList)
diff --git a/pkg/kubelet/eviction/helpers_test.go b/pkg/kubelet/eviction/helpers_test.go
index 5772a41ab16..fda1d010e1e 100644
--- a/pkg/kubelet/eviction/helpers_test.go
+++ b/pkg/kubelet/eviction/helpers_test.go
@@ -76,6 +76,16 @@ func TestParseThresholdConfig(t *testing.T) {
                     Quantity: quantityMustParse("0"),
                 },
             },
+            {
+                Signal:   evictionapi.SignalAllocatableNodeFsAvailable,
+                Operator: evictionapi.OpLessThan,
+                Value: evictionapi.ThresholdValue{
+                    Quantity: quantityMustParse("0"),
+                },
+                MinReclaim: &evictionapi.ThresholdValue{
+                    Quantity: quantityMustParse("0"),
+                },
+            },
             {
                 Signal:   evictionapi.SignalMemoryAvailable,
                 Operator: evictionapi.OpLessThan,
@@ -777,8 +787,7 @@ func TestMakeSignalObservations(t *testing.T) {
     if res.CmpInt64(int64(allocatableMemoryCapacity)) != 0 {
         t.Errorf("Expected Threshold %v to be equal to value %v", res.Value(), allocatableMemoryCapacity)
     }
-    actualObservations, statsFunc, err := makeSignalObservations(provider, nodeProvider)
-
+    actualObservations, statsFunc, err := makeSignalObservations(provider, nodeProvider, pods, false)
     if err != nil {
         t.Errorf("Unexpected err: %v", err)
     }
diff --git a/pkg/kubelet/kubelet_node_status_test.go b/pkg/kubelet/kubelet_node_status_test.go
index 96161b5de59..74382a57f7f 100644
--- a/pkg/kubelet/kubelet_node_status_test.go
+++ b/pkg/kubelet/kubelet_node_status_test.go
@@ -208,14 +208,16 @@ func TestUpdateNewNodeStatus(t *testing.T) {
                 KubeProxyVersion:        version.Get().String(),
             },
             Capacity: v1.ResourceList{
-                v1.ResourceCPU:    *resource.NewMilliQuantity(2000, resource.DecimalSI),
-                v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI),
-                v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
+                v1.ResourceCPU:     *resource.NewMilliQuantity(2000, resource.DecimalSI),
+                v1.ResourceMemory:  *resource.NewQuantity(10E9, resource.BinarySI),
+                v1.ResourceStorage: *resource.NewQuantity(500*mb, resource.BinarySI),
+                v1.ResourcePods:    *resource.NewQuantity(0, resource.DecimalSI),
             },
             Allocatable: v1.ResourceList{
-                v1.ResourceCPU:    *resource.NewMilliQuantity(1800, resource.DecimalSI),
-                v1.ResourceMemory: *resource.NewQuantity(9900E6, resource.BinarySI),
-                v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
+                v1.ResourceCPU:     *resource.NewMilliQuantity(1800, resource.DecimalSI),
+                v1.ResourceMemory:  *resource.NewQuantity(9900E6, resource.BinarySI),
+                v1.ResourceStorage: *resource.NewQuantity(500*mb, resource.BinarySI),
+                v1.ResourcePods:    *resource.NewQuantity(0, resource.DecimalSI),
             },
             Addresses: []v1.NodeAddress{
                 {Type: v1.NodeInternalIP, Address: "127.0.0.1"},
@@ -361,14 +363,16 @@ func TestUpdateExistingNodeStatus(t *testing.T) {
             },
         },
         Capacity: v1.ResourceList{
-            v1.ResourceCPU:    *resource.NewMilliQuantity(3000, resource.DecimalSI),
-            v1.ResourceMemory: *resource.NewQuantity(20E9, resource.BinarySI),
-            v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
+            v1.ResourceCPU:     *resource.NewMilliQuantity(3000, resource.DecimalSI),
+            v1.ResourceMemory:  *resource.NewQuantity(20E9, resource.BinarySI),
+            v1.ResourceStorage: *resource.NewQuantity(500*mb, resource.BinarySI),
+            v1.ResourcePods:    *resource.NewQuantity(0, resource.DecimalSI),
         },
         Allocatable: v1.ResourceList{
-            v1.ResourceCPU:    *resource.NewMilliQuantity(2800, resource.DecimalSI),
-            v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI),
-            v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
+            v1.ResourceCPU:     *resource.NewMilliQuantity(2800, resource.DecimalSI),
+            v1.ResourceMemory:  *resource.NewQuantity(19900E6, resource.BinarySI),
+            v1.ResourceStorage: *resource.NewQuantity(500*mb, resource.BinarySI),
+            v1.ResourcePods:    *resource.NewQuantity(0, resource.DecimalSI),
         },
     },
 }
@@ -444,14 +448,16 @@ func TestUpdateExistingNodeStatus(t *testing.T) {
                 KubeProxyVersion:        version.Get().String(),
             },
             Capacity: v1.ResourceList{
-                v1.ResourceCPU:    *resource.NewMilliQuantity(2000, resource.DecimalSI),
-                v1.ResourceMemory: *resource.NewQuantity(20E9, resource.BinarySI),
-                v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
+                v1.ResourceCPU:     *resource.NewMilliQuantity(2000, resource.DecimalSI),
+                v1.ResourceMemory:  *resource.NewQuantity(20E9, resource.BinarySI),
+                v1.ResourceStorage: *resource.NewQuantity(500*mb, resource.BinarySI),
+                v1.ResourcePods:    *resource.NewQuantity(0, resource.DecimalSI),
             },
             Allocatable: v1.ResourceList{
-                v1.ResourceCPU:    *resource.NewMilliQuantity(1800, resource.DecimalSI),
-                v1.ResourceMemory: *resource.NewQuantity(19900E6, resource.BinarySI),
-                v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
+                v1.ResourceCPU:     *resource.NewMilliQuantity(1800, resource.DecimalSI),
+                v1.ResourceMemory:  *resource.NewQuantity(19900E6, resource.BinarySI),
+                v1.ResourceStorage: *resource.NewQuantity(500*mb, resource.BinarySI),
+                v1.ResourcePods:    *resource.NewQuantity(0, resource.DecimalSI),
             },
             Addresses: []v1.NodeAddress{
                 {Type: v1.NodeInternalIP, Address: "127.0.0.1"},
@@ -655,8 +661,9 @@ func TestUpdateNodeStatusWithRuntimeStateError(t *testing.T) {
     kubelet.containerManager = &localCM{
         ContainerManager: cm.NewStubContainerManager(),
         allocatable: v1.ResourceList{
-            v1.ResourceCPU:    *resource.NewMilliQuantity(200, resource.DecimalSI),
-            v1.ResourceMemory: *resource.NewQuantity(100E6, resource.BinarySI),
+            v1.ResourceCPU:     *resource.NewMilliQuantity(200, resource.DecimalSI),
+            v1.ResourceMemory:  *resource.NewQuantity(100E6, resource.BinarySI),
+            v1.ResourceStorage: *resource.NewQuantity(200*mb, resource.BinarySI),
         },
     }
 
@@ -727,14 +734,16 @@ func TestUpdateNodeStatusWithRuntimeStateError(t *testing.T) {
                 KubeProxyVersion:        version.Get().String(),
             },
             Capacity: v1.ResourceList{
-                v1.ResourceCPU:    *resource.NewMilliQuantity(2000, resource.DecimalSI),
-                v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI),
-                v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
+                v1.ResourceCPU:     *resource.NewMilliQuantity(2000, resource.DecimalSI),
+                v1.ResourceMemory:  *resource.NewQuantity(10E9, resource.BinarySI),
+                v1.ResourceStorage: *resource.NewQuantity(500*mb, resource.BinarySI),
+                v1.ResourcePods:    *resource.NewQuantity(0, resource.DecimalSI),
             },
             Allocatable: v1.ResourceList{
-                v1.ResourceCPU:    *resource.NewMilliQuantity(1800, resource.DecimalSI),
-                v1.ResourceMemory: *resource.NewQuantity(9900E6, resource.BinarySI),
-                v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
+                v1.ResourceCPU:     *resource.NewMilliQuantity(1800, resource.DecimalSI),
+                v1.ResourceMemory:  *resource.NewQuantity(9900E6, resource.BinarySI),
+                v1.ResourceStorage: *resource.NewQuantity(300*mb, resource.BinarySI),
+                v1.ResourcePods:    *resource.NewQuantity(0, resource.DecimalSI),
             },
             Addresses: []v1.NodeAddress{
                 {Type: v1.NodeInternalIP, Address: "127.0.0.1"},
@@ -1141,14 +1150,16 @@ func TestUpdateNewNodeStatusTooLargeReservation(t *testing.T) {
         Spec: v1.NodeSpec{},
         Status: v1.NodeStatus{
             Capacity: v1.ResourceList{
-                v1.ResourceCPU:    *resource.NewMilliQuantity(2000, resource.DecimalSI),
-                v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI),
-                v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
+                v1.ResourceCPU:     *resource.NewMilliQuantity(2000, resource.DecimalSI),
+                v1.ResourceMemory:  *resource.NewQuantity(10E9, resource.BinarySI),
+                v1.ResourceStorage: *resource.NewQuantity(500*mb, resource.BinarySI),
+                v1.ResourcePods:    *resource.NewQuantity(0, resource.DecimalSI),
             },
             Allocatable: v1.ResourceList{
-                v1.ResourceCPU:    *resource.NewMilliQuantity(0, resource.DecimalSI),
-                v1.ResourceMemory: *resource.NewQuantity(10E9, resource.BinarySI),
-                v1.ResourcePods:   *resource.NewQuantity(0, resource.DecimalSI),
+                v1.ResourceCPU:     *resource.NewMilliQuantity(0, resource.DecimalSI),
+                v1.ResourceMemory:  *resource.NewQuantity(10E9, resource.BinarySI),
+                v1.ResourceStorage: *resource.NewQuantity(500*mb, resource.BinarySI),
+                v1.ResourcePods:    *resource.NewQuantity(0, resource.DecimalSI),
             },
         },
     }
diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates.go b/plugin/pkg/scheduler/algorithm/predicates/predicates.go
index 10ea3eafea3..dbf0c055211 100644
--- a/plugin/pkg/scheduler/algorithm/predicates/predicates.go
+++ b/plugin/pkg/scheduler/algorithm/predicates/predicates.go
@@ -508,6 +508,8 @@ func GetResourceRequest(pod *v1.Pod) *schedulercache.Resource {
                result.MilliCPU += rQuantity.MilliValue()
            case v1.ResourceNvidiaGPU:
                result.NvidiaGPU += rQuantity.Value()
+           case v1.ResourceStorageOverlay:
+               result.StorageOverlay += rQuantity.Value()
            default:
                if v1helper.IsOpaqueIntResourceName(rName) {
                    result.AddOpaque(rName, rQuantity.Value())
@@ -515,6 +517,15 @@ func GetResourceRequest(pod *v1.Pod) *schedulercache.Resource {
                }
            }
        }
+       // Account for storage requested by emptyDir volumes.
+       // If the storage medium is memory, the size is not counted.
+       for _, vol := range pod.Spec.Volumes {
+           if vol.EmptyDir != nil && vol.EmptyDir.Medium != v1.StorageMediumMemory {
+
+               result.StorageScratch += vol.EmptyDir.SizeLimit.Value()
+           }
+       }
+
        // take max_resource(sum_pod, any_init_container)
        for _, container := range pod.Spec.InitContainers {
            for rName, rQuantity := range container.Resources.Requests {
@@ -531,6 +542,10 @@ func GetResourceRequest(pod *v1.Pod) *schedulercache.Resource {
                    if gpu := rQuantity.Value(); gpu > result.NvidiaGPU {
                        result.NvidiaGPU = gpu
                    }
+               case v1.ResourceStorageOverlay:
+                   if overlay := rQuantity.Value(); overlay > result.StorageOverlay {
+                       result.StorageOverlay = overlay
+                   }
                default:
                    if v1helper.IsOpaqueIntResourceName(rName) {
                        value := rQuantity.Value()
@@ -581,6 +596,23 @@ func PodFitsResources(pod *v1.Pod, meta interface{}, nodeInfo *schedulercache.No
    if allocatable.NvidiaGPU < podRequest.NvidiaGPU+nodeInfo.RequestedResource().NvidiaGPU {
        predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourceNvidiaGPU, podRequest.NvidiaGPU, nodeInfo.RequestedResource().NvidiaGPU, allocatable.NvidiaGPU))
    }
+
+   scratchSpaceRequest := podRequest.StorageScratch
+   if allocatable.StorageOverlay == 0 {
+       scratchSpaceRequest += podRequest.StorageOverlay
+       //scratchSpaceRequest += nodeInfo.RequestedResource().StorageOverlay
+       nodeScratchRequest := nodeInfo.RequestedResource().StorageOverlay + nodeInfo.RequestedResource().StorageScratch
+       if allocatable.StorageScratch < scratchSpaceRequest+nodeScratchRequest {
+           predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourceStorageScratch, scratchSpaceRequest, nodeScratchRequest, allocatable.StorageScratch))
+       }
+
+   } else if allocatable.StorageScratch < scratchSpaceRequest+nodeInfo.RequestedResource().StorageScratch {
+       predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourceStorageScratch, scratchSpaceRequest, nodeInfo.RequestedResource().StorageScratch, allocatable.StorageScratch))
+   }
+   if allocatable.StorageOverlay > 0 && allocatable.StorageOverlay < podRequest.StorageOverlay+nodeInfo.RequestedResource().StorageOverlay {
+       predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourceStorageOverlay, podRequest.StorageOverlay, nodeInfo.RequestedResource().StorageOverlay, allocatable.StorageOverlay))
+   }
+
    for rName, rQuant := range podRequest.OpaqueIntResources {
        if allocatable.OpaqueIntResources[rName] < rQuant+nodeInfo.RequestedResource().OpaqueIntResources[rName] {
            predicateFails = append(predicateFails, NewInsufficientResourceError(rName, podRequest.OpaqueIntResources[rName], nodeInfo.RequestedResource().OpaqueIntResources[rName], allocatable.OpaqueIntResources[rName]))
diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go b/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go
index 1118c4842b4..05987cf80be 100644
--- a/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go
+++ b/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go
@@ -76,7 +76,7 @@ var (
    opaqueResourceB = v1helper.OpaqueIntResourceName("BBB")
 )
 
-func makeResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA int64) v1.NodeResources {
+func makeResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA, storage int64) v1.NodeResources {
    return v1.NodeResources{
        Capacity: v1.ResourceList{
            v1.ResourceCPU:       *resource.NewMilliQuantity(milliCPU, resource.DecimalSI),
@@ -84,17 +84,19 @@ func makeResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA, storage int64) v1.NodeRes
            v1.ResourcePods:      *resource.NewQuantity(pods, resource.DecimalSI),
            v1.ResourceNvidiaGPU: *resource.NewQuantity(nvidiaGPUs, resource.DecimalSI),
            opaqueResourceA:      *resource.NewQuantity(opaqueA, resource.DecimalSI),
+           v1.ResourceStorage:   *resource.NewQuantity(storage, resource.BinarySI),
        },
    }
 }
 
-func makeAllocatableResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA int64) v1.ResourceList {
+func makeAllocatableResources(milliCPU, memory, nvidiaGPUs, pods, opaqueA, storage int64) v1.ResourceList {
    return v1.ResourceList{
        v1.ResourceCPU:       *resource.NewMilliQuantity(milliCPU, resource.DecimalSI),
        v1.ResourceMemory:    *resource.NewQuantity(memory, resource.BinarySI),
        v1.ResourcePods:      *resource.NewQuantity(pods, resource.DecimalSI),
        v1.ResourceNvidiaGPU: *resource.NewQuantity(nvidiaGPUs, resource.DecimalSI),
        opaqueResourceA:      *resource.NewQuantity(opaqueA, resource.DecimalSI),
+       v1.ResourceStorage:   *resource.NewQuantity(storage, resource.BinarySI),
    }
 }
 
@@ -112,6 +114,25 @@ func newResourcePod(usage ...schedulercache.Resource) *v1.Pod {
    }
 }
 
+func addStorageLimit(pod *v1.Pod, sizeLimit int64, medium v1.StorageMedium) *v1.Pod {
+   return &v1.Pod{
+       Spec: v1.PodSpec{
+           Containers: pod.Spec.Containers,
+           Volumes: []v1.Volume{
+               {
+                   Name: "emptyDirVolumeName",
+                   VolumeSource: v1.VolumeSource{
+                       EmptyDir: &v1.EmptyDirVolumeSource{
+                           SizeLimit: *resource.NewQuantity(sizeLimit, resource.BinarySI),
+                           Medium:    medium,
+                       },
+                   },
+               },
+           },
+       },
+   }
+}
+
 func newResourceInitPod(pod *v1.Pod, usage ...schedulercache.Resource) *v1.Pod {
    pod.Spec.InitContainers = newResourcePod(usage...).Spec.Containers
    return pod
@@ -329,7 +350,7 @@ func TestPodFitsResources(t *testing.T) {
    }
 
    for _, test := range enoughPodsTests {
-       node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 5).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 5)}}
+       node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 5, 20).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 5, 20)}}
        test.nodeInfo.SetNode(&node)
        fits, reasons, err := PodFitsResources(test.pod, PredicateMetadata(test.pod, nil), test.nodeInfo)
        if err != nil {
@@ -384,7 +405,7 @@ func TestPodFitsResources(t *testing.T) {
        },
    }
    for _, test := range notEnoughPodsTests {
-       node := v1.Node{Status: v1.NodeStatus{Capacity: v1.ResourceList{}, Allocatable: makeAllocatableResources(10, 20, 0, 1, 0)}}
+       node := v1.Node{Status: v1.NodeStatus{Capacity: v1.ResourceList{}, Allocatable: makeAllocatableResources(10, 20, 0, 1, 0, 0)}}
        test.nodeInfo.SetNode(&node)
        fits, reasons, err := PodFitsResources(test.pod, PredicateMetadata(test.pod, nil), test.nodeInfo)
        if err != nil {
@@ -397,6 +418,86 @@ func TestPodFitsResources(t *testing.T) {
            t.Errorf("%s: expected: %v got %v", test.test, test.fits, fits)
        }
    }
+
+   storagePodsTests := []struct {
+       pod           *v1.Pod
+       emptyDirLimit int64
+       storageMedium v1.StorageMedium
+       nodeInfo      *schedulercache.NodeInfo
+       fits          bool
+       test          string
+       reasons       []algorithm.PredicateFailureReason
+   }{
+       {
+           pod: newResourcePod(schedulercache.Resource{MilliCPU: 1, Memory: 1, StorageOverlay: 1}),
+           nodeInfo: schedulercache.NewNodeInfo(
+               newResourcePod(schedulercache.Resource{MilliCPU: 10, Memory: 10, StorageOverlay: 20})),
+           fits: false,
+           test: "due to init container scratch disk",
+           reasons: []algorithm.PredicateFailureReason{
+               NewInsufficientResourceError(v1.ResourceCPU, 1, 10, 10),
+               NewInsufficientResourceError(v1.ResourceStorageScratch, 1, 20, 20),
+           },
+       },
+       {
+           pod: newResourcePod(schedulercache.Resource{MilliCPU: 1, Memory: 1, StorageOverlay: 10}),
+           nodeInfo: schedulercache.NewNodeInfo(
+               newResourcePod(schedulercache.Resource{MilliCPU: 2, Memory: 10})),
+           fits: true,
+           test: "pod fit",
+       },
+       {
+           pod: newResourcePod(schedulercache.Resource{MilliCPU: 1, Memory: 1, StorageOverlay: 18}),
+           nodeInfo: schedulercache.NewNodeInfo(
+               newResourcePod(schedulercache.Resource{MilliCPU: 2, Memory: 2, StorageOverlay: 5})),
+           fits: false,
+           test: "request exceeds allocatable",
+           reasons: []algorithm.PredicateFailureReason{
+               NewInsufficientResourceError(v1.ResourceStorageScratch, 18, 5, 20),
+           },
+       },
+       {
+           pod:           newResourcePod(schedulercache.Resource{MilliCPU: 1, Memory: 1, StorageOverlay: 10}),
+           emptyDirLimit: 15,
+           storageMedium: v1.StorageMediumDefault,
+           nodeInfo: schedulercache.NewNodeInfo(
+               newResourcePod(schedulercache.Resource{MilliCPU: 2, Memory: 2, StorageOverlay: 5})),
+           fits: false,
+           test: "storage scratch request exceeds allocatable",
+           reasons: []algorithm.PredicateFailureReason{
+               NewInsufficientResourceError(v1.ResourceStorageScratch, 25, 5, 20),
+           },
+       },
+       {
+           pod:           newResourcePod(schedulercache.Resource{MilliCPU: 1, Memory: 1, StorageOverlay: 10}),
+           emptyDirLimit: 15,
+           storageMedium: v1.StorageMediumMemory,
+           nodeInfo: schedulercache.NewNodeInfo(
+               newResourcePod(schedulercache.Resource{MilliCPU: 2, Memory: 2, StorageOverlay: 5})),
+           fits: true,
+           test: "storage scratch request does not exceed allocatable",
+           reasons: []algorithm.PredicateFailureReason{
+               NewInsufficientResourceError(v1.ResourceStorageScratch, 25, 5, 20),
+           },
+       },
+   }
+
+   for _, test := range storagePodsTests {
+       node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 5, 20).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 5, 20)}}
+       test.nodeInfo.SetNode(&node)
+       pod := addStorageLimit(test.pod, test.emptyDirLimit, test.storageMedium)
+       fits, reasons, err := PodFitsResources(pod, PredicateMetadata(pod, nil), test.nodeInfo)
+       if err != nil {
+           t.Errorf("%s: unexpected error: %v", test.test, err)
+       }
+       if !fits && !reflect.DeepEqual(reasons, test.reasons) {
+           t.Errorf("%s: unexpected failure reasons: %v, want: %v", test.test, reasons, test.reasons)
+       }
+       if fits != test.fits {
+           t.Errorf("%s: expected: %v got %v", test.test, test.fits, fits)
+       }
+   }
+
 }
 
 func TestPodFitsHost(t *testing.T) {
@@ -1843,7 +1944,7 @@ func TestRunGeneralPredicates(t *testing.T) {
                newResourcePod(schedulercache.Resource{MilliCPU: 9, Memory: 19})),
            node: &v1.Node{
                ObjectMeta: metav1.ObjectMeta{Name: "machine1"},
-               Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0)},
+               Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0)},
            },
            fits: true,
            wErr: nil,
@@ -1855,7 +1956,7 @@ func TestRunGeneralPredicates(t *testing.T) {
                newResourcePod(schedulercache.Resource{MilliCPU: 5, Memory: 19})),
            node: &v1.Node{
                ObjectMeta: metav1.ObjectMeta{Name: "machine1"},
-               Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0)},
+               Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0)},
            },
            fits: false,
            wErr: nil,
@@ -1869,7 +1970,7 @@ func TestRunGeneralPredicates(t *testing.T) {
            pod: &v1.Pod{},
            nodeInfo: schedulercache.NewNodeInfo(
                newResourcePod(schedulercache.Resource{MilliCPU: 9, Memory: 19})),
-           node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0)}},
+           node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0)}},
            fits: true,
            wErr: nil,
            test: "no resources/port/host requested always fits on GPU machine",
@@ -1878,7 +1979,7 @@ func TestRunGeneralPredicates(t *testing.T) {
            pod: newResourcePod(schedulercache.Resource{MilliCPU: 3, Memory: 1, NvidiaGPU: 1}),
            nodeInfo: schedulercache.NewNodeInfo(
                newResourcePod(schedulercache.Resource{MilliCPU: 5, Memory: 10, NvidiaGPU: 1})),
-           node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0)}},
+           node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0)}},
            fits: false,
            wErr: nil,
            reasons: []algorithm.PredicateFailureReason{NewInsufficientResourceError(v1.ResourceNvidiaGPU, 1, 1, 1)},
@@ -1888,7 +1989,7 @@ func TestRunGeneralPredicates(t *testing.T) {
            pod: newResourcePod(schedulercache.Resource{MilliCPU: 3, Memory: 1, NvidiaGPU: 1}),
            nodeInfo: schedulercache.NewNodeInfo(
                newResourcePod(schedulercache.Resource{MilliCPU: 5, Memory: 10, NvidiaGPU: 0})),
-           node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0)}},
+           node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0)}},
            fits: true,
            wErr: nil,
            test: "enough GPU resource",
@@ -1902,7 +2003,7 @@ func TestRunGeneralPredicates(t *testing.T) {
            nodeInfo: schedulercache.NewNodeInfo(),
            node: &v1.Node{
                ObjectMeta: metav1.ObjectMeta{Name: "machine1"},
-               Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0)},
+               Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0)},
            },
            fits: false,
            wErr: nil,
@@ -1914,7 +2015,7 @@ func TestRunGeneralPredicates(t *testing.T) {
            nodeInfo: schedulercache.NewNodeInfo(newPodWithPort(123)),
            node: &v1.Node{
                ObjectMeta: metav1.ObjectMeta{Name: "machine1"},
-               Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0)},
+               Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0)},
            },
            fits: false,
            wErr: nil,
@@ -3249,7 +3350,7 @@ func TestPodSchedulesOnNodeWithMemoryPressureCondition(t *testing.T) {
                    ImagePullPolicy: "Always",
                    // at least one requirement -> burstable pod
                    Resources: v1.ResourceRequirements{
-                       Requests: makeAllocatableResources(100, 100, 100, 100, 0),
+                       Requests: makeAllocatableResources(100, 100, 100, 100, 0, 0),
                    },
                },
            },
diff --git a/plugin/pkg/scheduler/schedulercache/node_info.go b/plugin/pkg/scheduler/schedulercache/node_info.go
index 85f75de97b0..951a1621199 100644
--- a/plugin/pkg/scheduler/schedulercache/node_info.go
+++ b/plugin/pkg/scheduler/schedulercache/node_info.go
@@ -76,9 +76,10 @@ type Resource struct {
 
 func (r *Resource) ResourceList() v1.ResourceList {
    result := v1.ResourceList{
-       v1.ResourceCPU:       *resource.NewMilliQuantity(r.MilliCPU, resource.DecimalSI),
-       v1.ResourceMemory:    *resource.NewQuantity(r.Memory, resource.BinarySI),
-       v1.ResourceNvidiaGPU: *resource.NewQuantity(r.NvidiaGPU, resource.DecimalSI),
+       v1.ResourceCPU:            *resource.NewMilliQuantity(r.MilliCPU, resource.DecimalSI),
+       v1.ResourceMemory:         *resource.NewQuantity(r.Memory, resource.BinarySI),
+       v1.ResourceNvidiaGPU:      *resource.NewQuantity(r.NvidiaGPU, resource.DecimalSI),
+       v1.ResourceStorageOverlay: *resource.NewQuantity(r.StorageOverlay, resource.BinarySI),
    }
    for rName, rQuant := range r.OpaqueIntResources {
        result[rName] = *resource.NewQuantity(rQuant, resource.DecimalSI)
@@ -264,6 +265,8 @@ func (n *NodeInfo) addPod(pod *v1.Pod) {
    n.requestedResource.MilliCPU += res.MilliCPU
    n.requestedResource.Memory += res.Memory
    n.requestedResource.NvidiaGPU += res.NvidiaGPU
+   n.requestedResource.StorageOverlay += res.StorageOverlay
+   n.requestedResource.StorageScratch += res.StorageScratch
    if n.requestedResource.OpaqueIntResources == nil && len(res.OpaqueIntResources) > 0 {
        n.requestedResource.OpaqueIntResources = map[v1.ResourceName]int64{}
    }
@@ -349,6 +352,8 @@ func calculateResource(pod *v1.Pod) (res Resource, non0_cpu int64, non0_mem int6
            res.Memory += rQuant.Value()
        case v1.ResourceNvidiaGPU:
            res.NvidiaGPU += rQuant.Value()
+       case v1.ResourceStorageOverlay:
+           res.StorageOverlay += rQuant.Value()
        default:
            if v1helper.IsOpaqueIntResourceName(rName) {
                res.AddOpaque(rName, rQuant.Value())
@@ -361,6 +366,15 @@ func calculateResource(pod *v1.Pod) (res Resource, non0_cpu int64, non0_mem int6
        non0_mem += non0_mem_req
        // No non-zero resources for GPUs or opaque resources.
    }
+
+   // Account for storage requested by emptyDir volumes.
+   // If the storage medium is memory, the size is not counted.
+   for _, vol := range pod.Spec.Volumes {
+       if vol.EmptyDir != nil && vol.EmptyDir.Medium != v1.StorageMediumMemory {
+           res.StorageScratch += vol.EmptyDir.SizeLimit.Value()
+       }
+   }
+
    return
 }
 
@@ -391,6 +405,10 @@ func (n *NodeInfo) SetNode(node *v1.Node) error {
            n.allocatableResource.NvidiaGPU = rQuant.Value()
        case v1.ResourcePods:
            n.allowedPodNumber = int(rQuant.Value())
+       case v1.ResourceStorage:
+           n.allocatableResource.StorageScratch = rQuant.Value()
+       case v1.ResourceStorageOverlay:
+           n.allocatableResource.StorageOverlay = rQuant.Value()
        default:
            if v1helper.IsOpaqueIntResourceName(rName) {
                n.allocatableResource.SetOpaque(rName, rQuant.Value())
diff --git a/test/e2e_node/BUILD b/test/e2e_node/BUILD
index 100580d54fe..b8956ab1381 100644
--- a/test/e2e_node/BUILD
+++ b/test/e2e_node/BUILD
@@ -76,6 +76,7 @@ go_test(
         "inode_eviction_test.go",
         "kubelet_test.go",
         "lifecycle_hook_test.go",
+        "local_storage_allocatable_eviction_test.go",
         "log_path_test.go",
         "memory_eviction_test.go",
         "mirror_pod_test.go",