diff --git a/pkg/scheduler/algorithm/predicates/error.go b/pkg/scheduler/algorithm/predicates/error.go index 6f7160dfa33..1e1cbcaa9fa 100644 --- a/pkg/scheduler/algorithm/predicates/error.go +++ b/pkg/scheduler/algorithm/predicates/error.go @@ -61,6 +61,8 @@ var ( ErrNodeUnderMemoryPressure = newPredicateFailureError("NodeUnderMemoryPressure", "node(s) had memory pressure") // ErrNodeUnderDiskPressure is used for NodeUnderDiskPressure predicate error. ErrNodeUnderDiskPressure = newPredicateFailureError("NodeUnderDiskPressure", "node(s) had disk pressure") + // ErrNodeUnderPIDPressure is used for NodeUnderPIDPressure predicate error. + ErrNodeUnderPIDPressure = newPredicateFailureError("NodeUnderPIDPressure", "node(s) had pid pressure") // ErrNodeOutOfDisk is used for NodeOutOfDisk predicate error. ErrNodeOutOfDisk = newPredicateFailureError("NodeOutOfDisk", "node(s) were out of disk space") // ErrNodeNotReady is used for NodeNotReady predicate error. diff --git a/pkg/scheduler/algorithm/predicates/predicates.go b/pkg/scheduler/algorithm/predicates/predicates.go index 1055a55528d..abca33adbee 100644 --- a/pkg/scheduler/algorithm/predicates/predicates.go +++ b/pkg/scheduler/algorithm/predicates/predicates.go @@ -88,6 +88,8 @@ const ( CheckNodeMemoryPressurePred = "CheckNodeMemoryPressure" // CheckNodeDiskPressurePred defines the name of predicate CheckNodeDiskPressure. CheckNodeDiskPressurePred = "CheckNodeDiskPressure" + // CheckNodePIDPressurePred defines the name of predicate CheckNodePIDPressure. + CheckNodePIDPressurePred = "CheckNodePIDPressure" // DefaultMaxEBSVolumes is the limit for volumes attached to an instance. // Amazon recommends no more than 40; the system root volume uses at least one. 
@@ -132,7 +134,7 @@ var ( PodToleratesNodeTaintsPred, PodToleratesNodeNoExecuteTaintsPred, CheckNodeLabelPresencePred, CheckServiceAffinityPred, MaxEBSVolumeCountPred, MaxGCEPDVolumeCountPred, MaxAzureDiskVolumeCountPred, CheckVolumeBindingPred, NoVolumeZoneConflictPred, - CheckNodeMemoryPressurePred, CheckNodeDiskPressurePred, MatchInterPodAffinityPred} + CheckNodeMemoryPressurePred, CheckNodePIDPressurePred, CheckNodeDiskPressurePred, MatchInterPodAffinityPred} ) // NodeInfo interface represents anything that can get node object from node ID. @@ -1591,6 +1593,16 @@ func CheckNodeDiskPressurePredicate(pod *v1.Pod, meta algorithm.PredicateMetadat return true, nil, nil } +// CheckNodePIDPressurePredicate checks if a pod can be scheduled on a node +// reporting pid pressure condition. +func CheckNodePIDPressurePredicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { + // check if node is under pid pressure + if nodeInfo.PIDPressureCondition() == v1.ConditionTrue { + return false, []algorithm.PredicateFailureReason{ErrNodeUnderPIDPressure}, nil + } + return true, nil, nil +} + // CheckNodeConditionPredicate checks if a pod can be scheduled on a node reporting out of disk, // network unavailable and not ready condition. Only node conditions are accounted in this predicate. 
func CheckNodeConditionPredicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { diff --git a/pkg/scheduler/algorithmprovider/defaults/compatibility_test.go b/pkg/scheduler/algorithmprovider/defaults/compatibility_test.go index 63e09eea400..06d8a983a5c 100644 --- a/pkg/scheduler/algorithmprovider/defaults/compatibility_test.go +++ b/pkg/scheduler/algorithmprovider/defaults/compatibility_test.go @@ -529,6 +529,81 @@ func TestCompatibility_v1_Scheduler(t *testing.T) { }, }, }, + // Do not change this JSON after the corresponding release has been tagged. + // A failure indicates backwards compatibility with the specified release was broken. 
+ "1.10": { + JSON: `{ + "kind": "Policy", + "apiVersion": "v1", + "predicates": [ + {"name": "MatchNodeSelector"}, + {"name": "PodFitsResources"}, + {"name": "PodFitsHostPorts"}, + {"name": "HostName"}, + {"name": "NoDiskConflict"}, + {"name": "NoVolumeZoneConflict"}, + {"name": "PodToleratesNodeTaints"}, + {"name": "CheckNodeMemoryPressure"}, + {"name": "CheckNodeDiskPressure"}, + {"name": "CheckNodePIDPressure"}, + {"name": "CheckNodeCondition"}, + {"name": "MaxEBSVolumeCount"}, + {"name": "MaxGCEPDVolumeCount"}, + {"name": "MaxAzureDiskVolumeCount"}, + {"name": "MatchInterPodAffinity"}, + {"name": "GeneralPredicates"}, + {"name": "CheckVolumeBinding"}, + {"name": "TestServiceAffinity", "argument": {"serviceAffinity" : {"labels" : ["region"]}}}, + {"name": "TestLabelsPresence", "argument": {"labelsPresence" : {"labels" : ["foo"], "presence":true}}} + ],"priorities": [ + {"name": "EqualPriority", "weight": 2}, + {"name": "ImageLocalityPriority", "weight": 2}, + {"name": "LeastRequestedPriority", "weight": 2}, + {"name": "BalancedResourceAllocation", "weight": 2}, + {"name": "SelectorSpreadPriority", "weight": 2}, + {"name": "NodePreferAvoidPodsPriority", "weight": 2}, + {"name": "NodeAffinityPriority", "weight": 2}, + {"name": "TaintTolerationPriority", "weight": 2}, + {"name": "InterPodAffinityPriority", "weight": 2}, + {"name": "MostRequestedPriority", "weight": 2} + ] + }`, + ExpectedPolicy: schedulerapi.Policy{ + Predicates: []schedulerapi.PredicatePolicy{ + {Name: "MatchNodeSelector"}, + {Name: "PodFitsResources"}, + {Name: "PodFitsHostPorts"}, + {Name: "HostName"}, + {Name: "NoDiskConflict"}, + {Name: "NoVolumeZoneConflict"}, + {Name: "PodToleratesNodeTaints"}, + {Name: "CheckNodeMemoryPressure"}, + {Name: "CheckNodeDiskPressure"}, + {Name: "CheckNodePIDPressure"}, + {Name: "CheckNodeCondition"}, + {Name: "MaxEBSVolumeCount"}, + {Name: "MaxGCEPDVolumeCount"}, + {Name: "MaxAzureDiskVolumeCount"}, + {Name: "MatchInterPodAffinity"}, + {Name: 
"GeneralPredicates"}, + {Name: "CheckVolumeBinding"}, + {Name: "TestServiceAffinity", Argument: &schedulerapi.PredicateArgument{ServiceAffinity: &schedulerapi.ServiceAffinity{Labels: []string{"region"}}}}, + {Name: "TestLabelsPresence", Argument: &schedulerapi.PredicateArgument{LabelsPresence: &schedulerapi.LabelsPresence{Labels: []string{"foo"}, Presence: true}}}, + }, + Priorities: []schedulerapi.PriorityPolicy{ + {Name: "EqualPriority", Weight: 2}, + {Name: "ImageLocalityPriority", Weight: 2}, + {Name: "LeastRequestedPriority", Weight: 2}, + {Name: "BalancedResourceAllocation", Weight: 2}, + {Name: "SelectorSpreadPriority", Weight: 2}, + {Name: "NodePreferAvoidPodsPriority", Weight: 2}, + {Name: "NodeAffinityPriority", Weight: 2}, + {Name: "TaintTolerationPriority", Weight: 2}, + {Name: "InterPodAffinityPriority", Weight: 2}, + {Name: "MostRequestedPriority", Weight: 2}, + }, + }, + }, } registeredPredicates := sets.NewString(factory.ListRegisteredFitPredicates()...) diff --git a/pkg/scheduler/algorithmprovider/defaults/defaults.go b/pkg/scheduler/algorithmprovider/defaults/defaults.go index 327d627ded2..33265cead09 100644 --- a/pkg/scheduler/algorithmprovider/defaults/defaults.go +++ b/pkg/scheduler/algorithmprovider/defaults/defaults.go @@ -17,6 +17,8 @@ limitations under the License. package defaults import ( + "github.com/golang/glog" + "k8s.io/apimachinery/pkg/util/sets" utilfeature "k8s.io/apiserver/pkg/util/feature" @@ -26,8 +28,6 @@ import ( "k8s.io/kubernetes/pkg/scheduler/algorithm/priorities" "k8s.io/kubernetes/pkg/scheduler/core" "k8s.io/kubernetes/pkg/scheduler/factory" - - "github.com/golang/glog" ) const ( @@ -160,6 +160,9 @@ func defaultPredicates() sets.String { // Fit is determined by node disk pressure condition. factory.RegisterFitPredicate(predicates.CheckNodeDiskPressurePred, predicates.CheckNodeDiskPressurePredicate), + // Fit is determined by node pid pressure condition. 
+ factory.RegisterFitPredicate(predicates.CheckNodePIDPressurePred, predicates.CheckNodePIDPressurePredicate), + // Fit is determined by node conditions: not ready, network unavailable or out of disk. factory.RegisterMandatoryFitPredicate(predicates.CheckNodeConditionPred, predicates.CheckNodeConditionPredicate), @@ -179,10 +182,12 @@ func defaultPredicates() sets.String { // ApplyFeatureGates applies algorithm by feature gates. func ApplyFeatureGates() { if utilfeature.DefaultFeatureGate.Enabled(features.TaintNodesByCondition) { - // Remove "CheckNodeCondition", "CheckNodeMemoryPressure" and "CheckNodeDiskPressure" predicates + // Remove "CheckNodeCondition", "CheckNodeMemoryPressure", "CheckNodePIDPressure" + // and "CheckNodeDiskPressure" predicates factory.RemoveFitPredicate(predicates.CheckNodeConditionPred) factory.RemoveFitPredicate(predicates.CheckNodeMemoryPressurePred) factory.RemoveFitPredicate(predicates.CheckNodeDiskPressurePred) + factory.RemoveFitPredicate(predicates.CheckNodePIDPressurePred) // Remove key "CheckNodeCondition", "CheckNodeMemoryPressure" and "CheckNodeDiskPressure" // from ALL algorithm provider // The key will be removed from all providers which in algorithmProviderMap[] @@ -190,6 +195,7 @@ func ApplyFeatureGates() { factory.RemovePredicateKeyFromAlgorithmProviderMap(predicates.CheckNodeConditionPred) factory.RemovePredicateKeyFromAlgorithmProviderMap(predicates.CheckNodeMemoryPressurePred) factory.RemovePredicateKeyFromAlgorithmProviderMap(predicates.CheckNodeDiskPressurePred) + factory.RemovePredicateKeyFromAlgorithmProviderMap(predicates.CheckNodePIDPressurePred) // Fit is determined based on whether a pod can tolerate all of the node's taints factory.RegisterMandatoryFitPredicate(predicates.PodToleratesNodeTaintsPred, predicates.PodToleratesNodeTaints) diff --git a/pkg/scheduler/algorithmprovider/defaults/defaults_test.go b/pkg/scheduler/algorithmprovider/defaults/defaults_test.go index d78bb62835a..8c1a66de2a6 100644 --- 
a/pkg/scheduler/algorithmprovider/defaults/defaults_test.go +++ b/pkg/scheduler/algorithmprovider/defaults/defaults_test.go @@ -76,6 +76,7 @@ func TestDefaultPredicates(t *testing.T) { "GeneralPredicates", "CheckNodeMemoryPressure", "CheckNodeDiskPressure", + "CheckNodePIDPressure", "CheckNodeCondition", "PodToleratesNodeTaints", predicates.CheckVolumeBindingPred, diff --git a/pkg/scheduler/core/generic_scheduler.go b/pkg/scheduler/core/generic_scheduler.go index 5b2f76dae89..c222b80d314 100644 --- a/pkg/scheduler/core/generic_scheduler.go +++ b/pkg/scheduler/core/generic_scheduler.go @@ -25,6 +25,8 @@ import ( "sync/atomic" "time" + "github.com/golang/glog" + "k8s.io/api/core/v1" policy "k8s.io/api/policy/v1beta1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -40,8 +42,6 @@ import ( "k8s.io/kubernetes/pkg/scheduler/schedulercache" "k8s.io/kubernetes/pkg/scheduler/util" "k8s.io/kubernetes/pkg/scheduler/volumebinder" - - "github.com/golang/glog" ) // FailedPredicateMap declares a map[string][]algorithm.PredicateFailureReason type. diff --git a/pkg/scheduler/core/scheduling_queue.go b/pkg/scheduler/core/scheduling_queue.go index 0710a27f856..2b789f338c4 100644 --- a/pkg/scheduler/core/scheduling_queue.go +++ b/pkg/scheduler/core/scheduling_queue.go @@ -29,8 +29,11 @@ package core import ( "container/heap" "fmt" + "reflect" "sync" + "github.com/golang/glog" + "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/tools/cache" @@ -38,10 +41,6 @@ import ( "k8s.io/kubernetes/pkg/scheduler/algorithm/predicates" priorityutil "k8s.io/kubernetes/pkg/scheduler/algorithm/priorities/util" "k8s.io/kubernetes/pkg/scheduler/util" - - "reflect" - - "github.com/golang/glog" ) // SchedulingQueue is an interface for a queue to store pods waiting to be scheduled. 
diff --git a/pkg/scheduler/factory/factory.go b/pkg/scheduler/factory/factory.go index 174c1f384ce..cf309ffd8ed 100644 --- a/pkg/scheduler/factory/factory.go +++ b/pkg/scheduler/factory/factory.go @@ -837,6 +837,9 @@ func (c *configFactory) invalidateCachedPredicatesOnNodeUpdate(newNode *v1.Node, if oldConditions[v1.NodeDiskPressure] != newConditions[v1.NodeDiskPressure] { invalidPredicates.Insert(predicates.CheckNodeDiskPressurePred) } + if oldConditions[v1.NodePIDPressure] != newConditions[v1.NodePIDPressure] { + invalidPredicates.Insert(predicates.CheckNodePIDPressurePred) + } if oldConditions[v1.NodeReady] != newConditions[v1.NodeReady] || oldConditions[v1.NodeOutOfDisk] != newConditions[v1.NodeOutOfDisk] || oldConditions[v1.NodeNetworkUnavailable] != newConditions[v1.NodeNetworkUnavailable] { diff --git a/pkg/scheduler/schedulercache/node_info.go b/pkg/scheduler/schedulercache/node_info.go index a98e9cdcabc..03907b334fe 100644 --- a/pkg/scheduler/schedulercache/node_info.go +++ b/pkg/scheduler/schedulercache/node_info.go @@ -62,6 +62,7 @@ type NodeInfo struct { // Cached conditions of node for faster lookup. memoryPressureCondition v1.ConditionStatus diskPressureCondition v1.ConditionStatus + pidPressureCondition v1.ConditionStatus // Whenever NodeInfo changes, generation is bumped. // This is used to avoid cloning it if the object didn't change. @@ -284,6 +285,14 @@ func (n *NodeInfo) DiskPressureCondition() v1.ConditionStatus { return n.diskPressureCondition } +// PIDPressureCondition returns the pid pressure condition status on this node. +func (n *NodeInfo) PIDPressureCondition() v1.ConditionStatus { + if n == nil { + return v1.ConditionUnknown + } + return n.pidPressureCondition +} + // RequestedResource returns aggregated resource request of pods on this node. 
func (n *NodeInfo) RequestedResource() Resource { if n == nil { @@ -324,6 +333,7 @@ func (n *NodeInfo) Clone() *NodeInfo { TransientInfo: n.TransientInfo, memoryPressureCondition: n.memoryPressureCondition, diskPressureCondition: n.diskPressureCondition, + pidPressureCondition: n.pidPressureCondition, usedPorts: make(util.HostPortInfo), generation: n.generation, } @@ -482,6 +492,8 @@ func (n *NodeInfo) SetNode(node *v1.Node) error { n.memoryPressureCondition = cond.Status case v1.NodeDiskPressure: n.diskPressureCondition = cond.Status + case v1.NodePIDPressure: + n.pidPressureCondition = cond.Status default: // We ignore other conditions. } @@ -502,6 +514,7 @@ func (n *NodeInfo) RemoveNode(node *v1.Node) error { n.taints, n.taintsErr = nil, nil n.memoryPressureCondition = v1.ConditionUnknown n.diskPressureCondition = v1.ConditionUnknown + n.pidPressureCondition = v1.ConditionUnknown n.generation++ return nil } diff --git a/test/integration/scheduler/predicates_test.go b/test/integration/scheduler/predicates_test.go index bf3513211d9..16a9890eed8 100644 --- a/test/integration/scheduler/predicates_test.go +++ b/test/integration/scheduler/predicates_test.go @@ -870,3 +870,53 @@ func TestInterPodAffinity(t *testing.T) { } } } + +// TestNodePIDPressure verifies that scheduler's CheckNodePIDPressurePredicate predicate +// functions correctly. +func TestNodePIDPressure(t *testing.T) { + context := initTest(t, "node-pid-pressure") + defer cleanupTest(t, context) + // Add a node. + node, err := createNode(context.clientSet, "testnode", nil) + if err != nil { + t.Fatalf("Cannot create node: %v", err) + } + + cs := context.clientSet + + // Adds PID pressure condition to the node. + node.Status.Conditions = []v1.NodeCondition{ + { + Type: v1.NodePIDPressure, + Status: v1.ConditionTrue, + }, + } + + // Update node condition. + err = updateNodeStatus(context.clientSet, node) + if err != nil { + t.Fatalf("Cannot update node: %v", err) + } + + // Creates test pod. 
+ testPod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "pidpressure-fake-name"}, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + {Name: "container", Image: imageutils.GetPauseImageName()}, + }, + }, + } + + testPod, err = cs.CoreV1().Pods(context.ns.Name).Create(testPod) + if err != nil { + t.Fatalf("Test Failed: error: %v, while creating pod", err) + } + + err = waitForPodUnschedulable(cs, testPod) + if err != nil { + t.Errorf("Test Failed: error, %v, while waiting for scheduled", err) + } + + cleanupPods(cs, t, []*v1.Pod{testPod}) +} diff --git a/test/integration/scheduler/scheduler_test.go b/test/integration/scheduler/scheduler_test.go index 39852279532..5307845b3ec 100644 --- a/test/integration/scheduler/scheduler_test.go +++ b/test/integration/scheduler/scheduler_test.go @@ -134,6 +134,7 @@ func TestSchedulerCreationFromConfigMap(t *testing.T) { "CheckNodeCondition", // mandatory predicate "CheckNodeDiskPressure", "CheckNodeMemoryPressure", + "CheckNodePIDPressure", "CheckVolumeBinding", "GeneralPredicates", "MatchInterPodAffinity", diff --git a/test/integration/scheduler/taint_test.go b/test/integration/scheduler/taint_test.go index 84181d9ad50..592348c0a87 100644 --- a/test/integration/scheduler/taint_test.go +++ b/test/integration/scheduler/taint_test.go @@ -45,6 +45,12 @@ import ( // 2. NodeController taints nodes by node condition // 3. Scheduler allows pod to tolerate node condition taints, e.g. 
network unavailable func TestTaintNodeByCondition(t *testing.T) { + enabled := utilfeature.DefaultFeatureGate.Enabled("TaintNodesByCondition") + defer func() { + if !enabled { + utilfeature.DefaultFeatureGate.Set("TaintNodesByCondition=False") + } + }() // Enable TaintNodeByCondition utilfeature.DefaultFeatureGate.Set("TaintNodesByCondition=True") diff --git a/test/integration/scheduler/util.go b/test/integration/scheduler/util.go index c55e8d9428f..020d4803e17 100644 --- a/test/integration/scheduler/util.go +++ b/test/integration/scheduler/util.go @@ -19,6 +19,7 @@ package scheduler import ( "fmt" "net/http" + "net/http/httptest" "testing" "time" @@ -48,8 +49,6 @@ import ( "k8s.io/kubernetes/pkg/scheduler/factory" "k8s.io/kubernetes/test/integration/framework" imageutils "k8s.io/kubernetes/test/utils/image" - - "net/http/httptest" ) type TestContext struct { @@ -318,6 +317,12 @@ func createNode(cs clientset.Interface, name string, res *v1.ResourceList) (*v1. return cs.CoreV1().Nodes().Create(n) } +// updateNodeStatus updates the status of node. +func updateNodeStatus(cs clientset.Interface, node *v1.Node) error { + _, err := cs.CoreV1().Nodes().UpdateStatus(node) + return err +} + // createNodes creates `numNodes` nodes. The created node names will be in the // form of "`prefix`-X" where X is an ordinal. func createNodes(cs clientset.Interface, prefix string, res *v1.ResourceList, numNodes int) ([]*v1.Node, error) {