Merge pull request #60007 from k82cn/k8s_54313_1

Automatic merge from submit-queue (batch tested with PRs 59367, 60007). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>.

Do not schedule pod to the node under PID pressure.

Signed-off-by: Da K. Ma <klaus1982.cn@gmail.com>

**Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*:
part of #54313 

**Release note**:
```release-note
Added CheckNodePIDPressurePredicate to checks if a pod can be scheduled on
a node reporting pid pressure condition.
```
This commit is contained in:
Kubernetes Submit Queue 2018-04-26 00:43:11 -07:00 committed by GitHub
commit 30e811c97c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 185 additions and 13 deletions

View File

@ -61,6 +61,8 @@ var (
ErrNodeUnderMemoryPressure = newPredicateFailureError("NodeUnderMemoryPressure", "node(s) had memory pressure")
// ErrNodeUnderDiskPressure is used for NodeUnderDiskPressure predicate error.
ErrNodeUnderDiskPressure = newPredicateFailureError("NodeUnderDiskPressure", "node(s) had disk pressure")
// ErrNodeUnderPIDPressure is used for NodeUnderPIDPressure predicate error.
ErrNodeUnderPIDPressure = newPredicateFailureError("NodeUnderPIDPressure", "node(s) had pid pressure")
// ErrNodeOutOfDisk is used for NodeOutOfDisk predicate error.
ErrNodeOutOfDisk = newPredicateFailureError("NodeOutOfDisk", "node(s) were out of disk space")
// ErrNodeNotReady is used for NodeNotReady predicate error.

View File

@ -88,6 +88,8 @@ const (
CheckNodeMemoryPressurePred = "CheckNodeMemoryPressure"
// CheckNodeDiskPressurePred defines the name of predicate CheckNodeDiskPressure.
CheckNodeDiskPressurePred = "CheckNodeDiskPressure"
// CheckNodePIDPressurePred defines the name of predicate CheckNodePIDPressure.
CheckNodePIDPressurePred = "CheckNodePIDPressure"
// DefaultMaxEBSVolumes is the limit for volumes attached to an instance.
// Amazon recommends no more than 40; the system root volume uses at least one.
@ -132,7 +134,7 @@ var (
PodToleratesNodeTaintsPred, PodToleratesNodeNoExecuteTaintsPred, CheckNodeLabelPresencePred,
CheckServiceAffinityPred, MaxEBSVolumeCountPred, MaxGCEPDVolumeCountPred,
MaxAzureDiskVolumeCountPred, CheckVolumeBindingPred, NoVolumeZoneConflictPred,
CheckNodeMemoryPressurePred, CheckNodeDiskPressurePred, MatchInterPodAffinityPred}
CheckNodeMemoryPressurePred, CheckNodePIDPressurePred, CheckNodeDiskPressurePred, MatchInterPodAffinityPred}
)
// NodeInfo interface represents anything that can get node object from node ID.
@ -1591,6 +1593,16 @@ func CheckNodeDiskPressurePredicate(pod *v1.Pod, meta algorithm.PredicateMetadat
return true, nil, nil
}
// CheckNodePIDPressurePredicate checks if a pod can be scheduled on a node
// reporting pid pressure condition.
func CheckNodePIDPressurePredicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
// check if node is under pid pressure
if nodeInfo.PIDPressureCondition() == v1.ConditionTrue {
return false, []algorithm.PredicateFailureReason{ErrNodeUnderPIDPressure}, nil
}
return true, nil, nil
}
// CheckNodeConditionPredicate checks if a pod can be scheduled on a node reporting out of disk,
// network unavailable and not ready condition. Only node conditions are accounted in this predicate.
func CheckNodeConditionPredicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {

View File

@ -480,7 +480,6 @@ func TestCompatibility_v1_Scheduler(t *testing.T) {
{"name": "CheckVolumeBinding"},
{"name": "TestServiceAffinity", "argument": {"serviceAffinity" : {"labels" : ["region"]}}},
{"name": "TestLabelsPresence", "argument": {"labelsPresence" : {"labels" : ["foo"], "presence":true}}}
],"priorities": [
{"name": "EqualPriority", "weight": 2},
{"name": "ImageLocalityPriority", "weight": 2},
@ -529,6 +528,81 @@ func TestCompatibility_v1_Scheduler(t *testing.T) {
},
},
},
// Do not change this JSON after the corresponding release has been tagged.
// A failure indicates backwards compatibility with the specified release was broken.
"1.10": {
JSON: `{
"kind": "Policy",
"apiVersion": "v1",
"predicates": [
{"name": "MatchNodeSelector"},
{"name": "PodFitsResources"},
{"name": "PodFitsHostPorts"},
{"name": "HostName"},
{"name": "NoDiskConflict"},
{"name": "NoVolumeZoneConflict"},
{"name": "PodToleratesNodeTaints"},
{"name": "CheckNodeMemoryPressure"},
{"name": "CheckNodeDiskPressure"},
{"name": "CheckNodePIDPressure"},
{"name": "CheckNodeCondition"},
{"name": "MaxEBSVolumeCount"},
{"name": "MaxGCEPDVolumeCount"},
{"name": "MaxAzureDiskVolumeCount"},
{"name": "MatchInterPodAffinity"},
{"name": "GeneralPredicates"},
{"name": "CheckVolumeBinding"},
{"name": "TestServiceAffinity", "argument": {"serviceAffinity" : {"labels" : ["region"]}}},
{"name": "TestLabelsPresence", "argument": {"labelsPresence" : {"labels" : ["foo"], "presence":true}}}
],"priorities": [
{"name": "EqualPriority", "weight": 2},
{"name": "ImageLocalityPriority", "weight": 2},
{"name": "LeastRequestedPriority", "weight": 2},
{"name": "BalancedResourceAllocation", "weight": 2},
{"name": "SelectorSpreadPriority", "weight": 2},
{"name": "NodePreferAvoidPodsPriority", "weight": 2},
{"name": "NodeAffinityPriority", "weight": 2},
{"name": "TaintTolerationPriority", "weight": 2},
{"name": "InterPodAffinityPriority", "weight": 2},
{"name": "MostRequestedPriority", "weight": 2}
]
}`,
ExpectedPolicy: schedulerapi.Policy{
Predicates: []schedulerapi.PredicatePolicy{
{Name: "MatchNodeSelector"},
{Name: "PodFitsResources"},
{Name: "PodFitsHostPorts"},
{Name: "HostName"},
{Name: "NoDiskConflict"},
{Name: "NoVolumeZoneConflict"},
{Name: "PodToleratesNodeTaints"},
{Name: "CheckNodeMemoryPressure"},
{Name: "CheckNodeDiskPressure"},
{Name: "CheckNodePIDPressure"},
{Name: "CheckNodeCondition"},
{Name: "MaxEBSVolumeCount"},
{Name: "MaxGCEPDVolumeCount"},
{Name: "MaxAzureDiskVolumeCount"},
{Name: "MatchInterPodAffinity"},
{Name: "GeneralPredicates"},
{Name: "CheckVolumeBinding"},
{Name: "TestServiceAffinity", Argument: &schedulerapi.PredicateArgument{ServiceAffinity: &schedulerapi.ServiceAffinity{Labels: []string{"region"}}}},
{Name: "TestLabelsPresence", Argument: &schedulerapi.PredicateArgument{LabelsPresence: &schedulerapi.LabelsPresence{Labels: []string{"foo"}, Presence: true}}},
},
Priorities: []schedulerapi.PriorityPolicy{
{Name: "EqualPriority", Weight: 2},
{Name: "ImageLocalityPriority", Weight: 2},
{Name: "LeastRequestedPriority", Weight: 2},
{Name: "BalancedResourceAllocation", Weight: 2},
{Name: "SelectorSpreadPriority", Weight: 2},
{Name: "NodePreferAvoidPodsPriority", Weight: 2},
{Name: "NodeAffinityPriority", Weight: 2},
{Name: "TaintTolerationPriority", Weight: 2},
{Name: "InterPodAffinityPriority", Weight: 2},
{Name: "MostRequestedPriority", Weight: 2},
},
},
},
}
registeredPredicates := sets.NewString(factory.ListRegisteredFitPredicates()...)

View File

@ -17,6 +17,8 @@ limitations under the License.
package defaults
import (
"github.com/golang/glog"
"k8s.io/apimachinery/pkg/util/sets"
utilfeature "k8s.io/apiserver/pkg/util/feature"
@ -26,8 +28,6 @@ import (
"k8s.io/kubernetes/pkg/scheduler/algorithm/priorities"
"k8s.io/kubernetes/pkg/scheduler/core"
"k8s.io/kubernetes/pkg/scheduler/factory"
"github.com/golang/glog"
)
const (
@ -160,6 +160,9 @@ func defaultPredicates() sets.String {
// Fit is determined by node disk pressure condition.
factory.RegisterFitPredicate(predicates.CheckNodeDiskPressurePred, predicates.CheckNodeDiskPressurePredicate),
// Fit is determined by node pid pressure condition.
factory.RegisterFitPredicate(predicates.CheckNodePIDPressurePred, predicates.CheckNodePIDPressurePredicate),
// Fit is determined by node conditions: not ready, network unavailable or out of disk.
factory.RegisterMandatoryFitPredicate(predicates.CheckNodeConditionPred, predicates.CheckNodeConditionPredicate),
@ -179,10 +182,12 @@ func defaultPredicates() sets.String {
// ApplyFeatureGates applies algorithm by feature gates.
func ApplyFeatureGates() {
if utilfeature.DefaultFeatureGate.Enabled(features.TaintNodesByCondition) {
// Remove "CheckNodeCondition", "CheckNodeMemoryPressure" and "CheckNodeDiskPressure" predicates
// Remove "CheckNodeCondition", "CheckNodeMemoryPressure", "CheckNodePIDPressurePred"
// and "CheckNodeDiskPressure" predicates
factory.RemoveFitPredicate(predicates.CheckNodeConditionPred)
factory.RemoveFitPredicate(predicates.CheckNodeMemoryPressurePred)
factory.RemoveFitPredicate(predicates.CheckNodeDiskPressurePred)
factory.RemoveFitPredicate(predicates.CheckNodePIDPressurePred)
// Remove key "CheckNodeCondition", "CheckNodeMemoryPressure" and "CheckNodeDiskPressure"
// from ALL algorithm provider
// The key will be removed from all providers which in algorithmProviderMap[]
@ -190,6 +195,7 @@ func ApplyFeatureGates() {
factory.RemovePredicateKeyFromAlgorithmProviderMap(predicates.CheckNodeConditionPred)
factory.RemovePredicateKeyFromAlgorithmProviderMap(predicates.CheckNodeMemoryPressurePred)
factory.RemovePredicateKeyFromAlgorithmProviderMap(predicates.CheckNodeDiskPressurePred)
factory.RemovePredicateKeyFromAlgorithmProviderMap(predicates.CheckNodePIDPressurePred)
// Fit is determined based on whether a pod can tolerate all of the node's taints
factory.RegisterMandatoryFitPredicate(predicates.PodToleratesNodeTaintsPred, predicates.PodToleratesNodeTaints)

View File

@ -76,6 +76,7 @@ func TestDefaultPredicates(t *testing.T) {
"GeneralPredicates",
"CheckNodeMemoryPressure",
"CheckNodeDiskPressure",
"CheckNodePIDPressure",
"CheckNodeCondition",
"PodToleratesNodeTaints",
predicates.CheckVolumeBindingPred,

View File

@ -25,6 +25,8 @@ import (
"sync/atomic"
"time"
"github.com/golang/glog"
"k8s.io/api/core/v1"
policy "k8s.io/api/policy/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@ -40,8 +42,6 @@ import (
"k8s.io/kubernetes/pkg/scheduler/schedulercache"
"k8s.io/kubernetes/pkg/scheduler/util"
"k8s.io/kubernetes/pkg/scheduler/volumebinder"
"github.com/golang/glog"
)
// FailedPredicateMap declares a map[string][]algorithm.PredicateFailureReason type.

View File

@ -29,8 +29,11 @@ package core
import (
"container/heap"
"fmt"
"reflect"
"sync"
"github.com/golang/glog"
"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/tools/cache"
@ -38,10 +41,6 @@ import (
"k8s.io/kubernetes/pkg/scheduler/algorithm/predicates"
priorityutil "k8s.io/kubernetes/pkg/scheduler/algorithm/priorities/util"
"k8s.io/kubernetes/pkg/scheduler/util"
"reflect"
"github.com/golang/glog"
)
// SchedulingQueue is an interface for a queue to store pods waiting to be scheduled.

View File

@ -837,6 +837,9 @@ func (c *configFactory) invalidateCachedPredicatesOnNodeUpdate(newNode *v1.Node,
if oldConditions[v1.NodeDiskPressure] != newConditions[v1.NodeDiskPressure] {
invalidPredicates.Insert(predicates.CheckNodeDiskPressurePred)
}
if oldConditions[v1.NodePIDPressure] != newConditions[v1.NodePIDPressure] {
invalidPredicates.Insert(predicates.CheckNodePIDPressurePred)
}
if oldConditions[v1.NodeReady] != newConditions[v1.NodeReady] ||
oldConditions[v1.NodeOutOfDisk] != newConditions[v1.NodeOutOfDisk] ||
oldConditions[v1.NodeNetworkUnavailable] != newConditions[v1.NodeNetworkUnavailable] {

View File

@ -62,6 +62,7 @@ type NodeInfo struct {
// Cached conditions of node for faster lookup.
memoryPressureCondition v1.ConditionStatus
diskPressureCondition v1.ConditionStatus
pidPressureCondition v1.ConditionStatus
// Whenever NodeInfo changes, generation is bumped.
// This is used to avoid cloning it if the object didn't change.
@ -284,6 +285,14 @@ func (n *NodeInfo) DiskPressureCondition() v1.ConditionStatus {
return n.diskPressureCondition
}
// PIDPressureCondition returns the pid pressure condition status on this node.
func (n *NodeInfo) PIDPressureCondition() v1.ConditionStatus {
if n == nil {
return v1.ConditionUnknown
}
return n.pidPressureCondition
}
// RequestedResource returns aggregated resource request of pods on this node.
func (n *NodeInfo) RequestedResource() Resource {
if n == nil {
@ -324,6 +333,7 @@ func (n *NodeInfo) Clone() *NodeInfo {
TransientInfo: n.TransientInfo,
memoryPressureCondition: n.memoryPressureCondition,
diskPressureCondition: n.diskPressureCondition,
pidPressureCondition: n.pidPressureCondition,
usedPorts: make(util.HostPortInfo),
generation: n.generation,
}
@ -482,6 +492,8 @@ func (n *NodeInfo) SetNode(node *v1.Node) error {
n.memoryPressureCondition = cond.Status
case v1.NodeDiskPressure:
n.diskPressureCondition = cond.Status
case v1.NodePIDPressure:
n.pidPressureCondition = cond.Status
default:
// We ignore other conditions.
}
@ -502,6 +514,7 @@ func (n *NodeInfo) RemoveNode(node *v1.Node) error {
n.taints, n.taintsErr = nil, nil
n.memoryPressureCondition = v1.ConditionUnknown
n.diskPressureCondition = v1.ConditionUnknown
n.pidPressureCondition = v1.ConditionUnknown
n.generation++
return nil
}

View File

@ -870,3 +870,53 @@ func TestInterPodAffinity(t *testing.T) {
}
}
}
// TestNodePIDPressure verifies that scheduler's CheckNodePIDPressurePredicate predicate
// functions works correctly.
func TestNodePIDPressure(t *testing.T) {
context := initTest(t, "node-pid-pressure")
defer cleanupTest(t, context)
// Add a node.
node, err := createNode(context.clientSet, "testnode", nil)
if err != nil {
t.Fatalf("Cannot create node: %v", err)
}
cs := context.clientSet
// Adds PID pressure condition to the node.
node.Status.Conditions = []v1.NodeCondition{
{
Type: v1.NodePIDPressure,
Status: v1.ConditionTrue,
},
}
// Update node condition.
err = updateNodeStatus(context.clientSet, node)
if err != nil {
t.Fatalf("Cannot update node: %v", err)
}
// Creats test pod.
testPod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: "pidpressure-fake-name"},
Spec: v1.PodSpec{
Containers: []v1.Container{
{Name: "container", Image: imageutils.GetPauseImageName()},
},
},
}
testPod, err = cs.CoreV1().Pods(context.ns.Name).Create(testPod)
if err != nil {
t.Fatalf("Test Failed: error: %v, while creating pod", err)
}
err = waitForPodUnschedulable(cs, testPod)
if err != nil {
t.Errorf("Test Failed: error, %v, while waiting for scheduled", err)
}
cleanupPods(cs, t, []*v1.Pod{testPod})
}

View File

@ -134,6 +134,7 @@ func TestSchedulerCreationFromConfigMap(t *testing.T) {
"CheckNodeCondition", // mandatory predicate
"CheckNodeDiskPressure",
"CheckNodeMemoryPressure",
"CheckNodePIDPressure",
"CheckVolumeBinding",
"GeneralPredicates",
"MatchInterPodAffinity",

View File

@ -45,6 +45,12 @@ import (
// 2. NodeController taints nodes by node condition
// 3. Scheduler allows pod to tolerate node condition taints, e.g. network unavailable
func TestTaintNodeByCondition(t *testing.T) {
enabled := utilfeature.DefaultFeatureGate.Enabled("TaintNodesByCondition")
defer func() {
if !enabled {
utilfeature.DefaultFeatureGate.Set("TaintNodesByCondition=False")
}
}()
// Enable TaintNodeByCondition
utilfeature.DefaultFeatureGate.Set("TaintNodesByCondition=True")

View File

@ -19,6 +19,7 @@ package scheduler
import (
"fmt"
"net/http"
"net/http/httptest"
"testing"
"time"
@ -48,8 +49,6 @@ import (
"k8s.io/kubernetes/pkg/scheduler/factory"
"k8s.io/kubernetes/test/integration/framework"
imageutils "k8s.io/kubernetes/test/utils/image"
"net/http/httptest"
)
type TestContext struct {
@ -318,6 +317,12 @@ func createNode(cs clientset.Interface, name string, res *v1.ResourceList) (*v1.
return cs.CoreV1().Nodes().Create(n)
}
// updateNodeStatus updates the status of node.
func updateNodeStatus(cs clientset.Interface, node *v1.Node) error {
_, err := cs.CoreV1().Nodes().UpdateStatus(node)
return err
}
// createNodes creates `numNodes` nodes. The created node names will be in the
// form of "`prefix`-X" where X is an ordinal.
func createNodes(cs clientset.Interface, prefix string, res *v1.ResourceList, numNodes int) ([]*v1.Node, error) {