mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-22 03:11:40 +00:00
Merge pull request #60007 from k82cn/k8s_54313_1
Automatic merge from submit-queue (batch tested with PRs 59367, 60007). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>. Do not schedule pod to the node under PID pressure. Signed-off-by: Da K. Ma <klaus1982.cn@gmail.com> **Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*: part of #54313 **Release note**: ```release-note Added CheckNodePIDPressurePredicate to checks if a pod can be scheduled on a node reporting pid pressure condition. ```
This commit is contained in:
commit
30e811c97c
@ -61,6 +61,8 @@ var (
|
||||
ErrNodeUnderMemoryPressure = newPredicateFailureError("NodeUnderMemoryPressure", "node(s) had memory pressure")
|
||||
// ErrNodeUnderDiskPressure is used for NodeUnderDiskPressure predicate error.
|
||||
ErrNodeUnderDiskPressure = newPredicateFailureError("NodeUnderDiskPressure", "node(s) had disk pressure")
|
||||
// ErrNodeUnderPIDPressure is used for NodeUnderPIDPressure predicate error.
|
||||
ErrNodeUnderPIDPressure = newPredicateFailureError("NodeUnderPIDPressure", "node(s) had pid pressure")
|
||||
// ErrNodeOutOfDisk is used for NodeOutOfDisk predicate error.
|
||||
ErrNodeOutOfDisk = newPredicateFailureError("NodeOutOfDisk", "node(s) were out of disk space")
|
||||
// ErrNodeNotReady is used for NodeNotReady predicate error.
|
||||
|
@ -88,6 +88,8 @@ const (
|
||||
CheckNodeMemoryPressurePred = "CheckNodeMemoryPressure"
|
||||
// CheckNodeDiskPressurePred defines the name of predicate CheckNodeDiskPressure.
|
||||
CheckNodeDiskPressurePred = "CheckNodeDiskPressure"
|
||||
// CheckNodePIDPressurePred defines the name of predicate CheckNodePIDPressure.
|
||||
CheckNodePIDPressurePred = "CheckNodePIDPressure"
|
||||
|
||||
// DefaultMaxEBSVolumes is the limit for volumes attached to an instance.
|
||||
// Amazon recommends no more than 40; the system root volume uses at least one.
|
||||
@ -132,7 +134,7 @@ var (
|
||||
PodToleratesNodeTaintsPred, PodToleratesNodeNoExecuteTaintsPred, CheckNodeLabelPresencePred,
|
||||
CheckServiceAffinityPred, MaxEBSVolumeCountPred, MaxGCEPDVolumeCountPred,
|
||||
MaxAzureDiskVolumeCountPred, CheckVolumeBindingPred, NoVolumeZoneConflictPred,
|
||||
CheckNodeMemoryPressurePred, CheckNodeDiskPressurePred, MatchInterPodAffinityPred}
|
||||
CheckNodeMemoryPressurePred, CheckNodePIDPressurePred, CheckNodeDiskPressurePred, MatchInterPodAffinityPred}
|
||||
)
|
||||
|
||||
// NodeInfo interface represents anything that can get node object from node ID.
|
||||
@ -1591,6 +1593,16 @@ func CheckNodeDiskPressurePredicate(pod *v1.Pod, meta algorithm.PredicateMetadat
|
||||
return true, nil, nil
|
||||
}
|
||||
|
||||
// CheckNodePIDPressurePredicate checks if a pod can be scheduled on a node
|
||||
// reporting pid pressure condition.
|
||||
func CheckNodePIDPressurePredicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
// check if node is under pid pressure
|
||||
if nodeInfo.PIDPressureCondition() == v1.ConditionTrue {
|
||||
return false, []algorithm.PredicateFailureReason{ErrNodeUnderPIDPressure}, nil
|
||||
}
|
||||
return true, nil, nil
|
||||
}
|
||||
|
||||
// CheckNodeConditionPredicate checks if a pod can be scheduled on a node reporting out of disk,
|
||||
// network unavailable and not ready condition. Only node conditions are accounted in this predicate.
|
||||
func CheckNodeConditionPredicate(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) {
|
||||
|
@ -480,7 +480,6 @@ func TestCompatibility_v1_Scheduler(t *testing.T) {
|
||||
{"name": "CheckVolumeBinding"},
|
||||
{"name": "TestServiceAffinity", "argument": {"serviceAffinity" : {"labels" : ["region"]}}},
|
||||
{"name": "TestLabelsPresence", "argument": {"labelsPresence" : {"labels" : ["foo"], "presence":true}}}
|
||||
|
||||
],"priorities": [
|
||||
{"name": "EqualPriority", "weight": 2},
|
||||
{"name": "ImageLocalityPriority", "weight": 2},
|
||||
@ -529,6 +528,81 @@ func TestCompatibility_v1_Scheduler(t *testing.T) {
|
||||
},
|
||||
},
|
||||
},
|
||||
// Do not change this JSON after the corresponding release has been tagged.
|
||||
// A failure indicates backwards compatibility with the specified release was broken.
|
||||
"1.10": {
|
||||
JSON: `{
|
||||
"kind": "Policy",
|
||||
"apiVersion": "v1",
|
||||
"predicates": [
|
||||
{"name": "MatchNodeSelector"},
|
||||
{"name": "PodFitsResources"},
|
||||
{"name": "PodFitsHostPorts"},
|
||||
{"name": "HostName"},
|
||||
{"name": "NoDiskConflict"},
|
||||
{"name": "NoVolumeZoneConflict"},
|
||||
{"name": "PodToleratesNodeTaints"},
|
||||
{"name": "CheckNodeMemoryPressure"},
|
||||
{"name": "CheckNodeDiskPressure"},
|
||||
{"name": "CheckNodePIDPressure"},
|
||||
{"name": "CheckNodeCondition"},
|
||||
{"name": "MaxEBSVolumeCount"},
|
||||
{"name": "MaxGCEPDVolumeCount"},
|
||||
{"name": "MaxAzureDiskVolumeCount"},
|
||||
{"name": "MatchInterPodAffinity"},
|
||||
{"name": "GeneralPredicates"},
|
||||
{"name": "CheckVolumeBinding"},
|
||||
{"name": "TestServiceAffinity", "argument": {"serviceAffinity" : {"labels" : ["region"]}}},
|
||||
{"name": "TestLabelsPresence", "argument": {"labelsPresence" : {"labels" : ["foo"], "presence":true}}}
|
||||
],"priorities": [
|
||||
{"name": "EqualPriority", "weight": 2},
|
||||
{"name": "ImageLocalityPriority", "weight": 2},
|
||||
{"name": "LeastRequestedPriority", "weight": 2},
|
||||
{"name": "BalancedResourceAllocation", "weight": 2},
|
||||
{"name": "SelectorSpreadPriority", "weight": 2},
|
||||
{"name": "NodePreferAvoidPodsPriority", "weight": 2},
|
||||
{"name": "NodeAffinityPriority", "weight": 2},
|
||||
{"name": "TaintTolerationPriority", "weight": 2},
|
||||
{"name": "InterPodAffinityPriority", "weight": 2},
|
||||
{"name": "MostRequestedPriority", "weight": 2}
|
||||
]
|
||||
}`,
|
||||
ExpectedPolicy: schedulerapi.Policy{
|
||||
Predicates: []schedulerapi.PredicatePolicy{
|
||||
{Name: "MatchNodeSelector"},
|
||||
{Name: "PodFitsResources"},
|
||||
{Name: "PodFitsHostPorts"},
|
||||
{Name: "HostName"},
|
||||
{Name: "NoDiskConflict"},
|
||||
{Name: "NoVolumeZoneConflict"},
|
||||
{Name: "PodToleratesNodeTaints"},
|
||||
{Name: "CheckNodeMemoryPressure"},
|
||||
{Name: "CheckNodeDiskPressure"},
|
||||
{Name: "CheckNodePIDPressure"},
|
||||
{Name: "CheckNodeCondition"},
|
||||
{Name: "MaxEBSVolumeCount"},
|
||||
{Name: "MaxGCEPDVolumeCount"},
|
||||
{Name: "MaxAzureDiskVolumeCount"},
|
||||
{Name: "MatchInterPodAffinity"},
|
||||
{Name: "GeneralPredicates"},
|
||||
{Name: "CheckVolumeBinding"},
|
||||
{Name: "TestServiceAffinity", Argument: &schedulerapi.PredicateArgument{ServiceAffinity: &schedulerapi.ServiceAffinity{Labels: []string{"region"}}}},
|
||||
{Name: "TestLabelsPresence", Argument: &schedulerapi.PredicateArgument{LabelsPresence: &schedulerapi.LabelsPresence{Labels: []string{"foo"}, Presence: true}}},
|
||||
},
|
||||
Priorities: []schedulerapi.PriorityPolicy{
|
||||
{Name: "EqualPriority", Weight: 2},
|
||||
{Name: "ImageLocalityPriority", Weight: 2},
|
||||
{Name: "LeastRequestedPriority", Weight: 2},
|
||||
{Name: "BalancedResourceAllocation", Weight: 2},
|
||||
{Name: "SelectorSpreadPriority", Weight: 2},
|
||||
{Name: "NodePreferAvoidPodsPriority", Weight: 2},
|
||||
{Name: "NodeAffinityPriority", Weight: 2},
|
||||
{Name: "TaintTolerationPriority", Weight: 2},
|
||||
{Name: "InterPodAffinityPriority", Weight: 2},
|
||||
{Name: "MostRequestedPriority", Weight: 2},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
registeredPredicates := sets.NewString(factory.ListRegisteredFitPredicates()...)
|
||||
|
@ -17,6 +17,8 @@ limitations under the License.
|
||||
package defaults
|
||||
|
||||
import (
|
||||
"github.com/golang/glog"
|
||||
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
||||
|
||||
@ -26,8 +28,6 @@ import (
|
||||
"k8s.io/kubernetes/pkg/scheduler/algorithm/priorities"
|
||||
"k8s.io/kubernetes/pkg/scheduler/core"
|
||||
"k8s.io/kubernetes/pkg/scheduler/factory"
|
||||
|
||||
"github.com/golang/glog"
|
||||
)
|
||||
|
||||
const (
|
||||
@ -160,6 +160,9 @@ func defaultPredicates() sets.String {
|
||||
// Fit is determined by node disk pressure condition.
|
||||
factory.RegisterFitPredicate(predicates.CheckNodeDiskPressurePred, predicates.CheckNodeDiskPressurePredicate),
|
||||
|
||||
// Fit is determined by node pid pressure condition.
|
||||
factory.RegisterFitPredicate(predicates.CheckNodePIDPressurePred, predicates.CheckNodePIDPressurePredicate),
|
||||
|
||||
// Fit is determined by node conditions: not ready, network unavailable or out of disk.
|
||||
factory.RegisterMandatoryFitPredicate(predicates.CheckNodeConditionPred, predicates.CheckNodeConditionPredicate),
|
||||
|
||||
@ -179,10 +182,12 @@ func defaultPredicates() sets.String {
|
||||
// ApplyFeatureGates applies algorithm by feature gates.
|
||||
func ApplyFeatureGates() {
|
||||
if utilfeature.DefaultFeatureGate.Enabled(features.TaintNodesByCondition) {
|
||||
// Remove "CheckNodeCondition", "CheckNodeMemoryPressure" and "CheckNodeDiskPressure" predicates
|
||||
// Remove "CheckNodeCondition", "CheckNodeMemoryPressure", "CheckNodePIDPressurePred"
|
||||
// and "CheckNodeDiskPressure" predicates
|
||||
factory.RemoveFitPredicate(predicates.CheckNodeConditionPred)
|
||||
factory.RemoveFitPredicate(predicates.CheckNodeMemoryPressurePred)
|
||||
factory.RemoveFitPredicate(predicates.CheckNodeDiskPressurePred)
|
||||
factory.RemoveFitPredicate(predicates.CheckNodePIDPressurePred)
|
||||
// Remove key "CheckNodeCondition", "CheckNodeMemoryPressure" and "CheckNodeDiskPressure"
|
||||
// from ALL algorithm provider
|
||||
// The key will be removed from all providers which in algorithmProviderMap[]
|
||||
@ -190,6 +195,7 @@ func ApplyFeatureGates() {
|
||||
factory.RemovePredicateKeyFromAlgorithmProviderMap(predicates.CheckNodeConditionPred)
|
||||
factory.RemovePredicateKeyFromAlgorithmProviderMap(predicates.CheckNodeMemoryPressurePred)
|
||||
factory.RemovePredicateKeyFromAlgorithmProviderMap(predicates.CheckNodeDiskPressurePred)
|
||||
factory.RemovePredicateKeyFromAlgorithmProviderMap(predicates.CheckNodePIDPressurePred)
|
||||
|
||||
// Fit is determined based on whether a pod can tolerate all of the node's taints
|
||||
factory.RegisterMandatoryFitPredicate(predicates.PodToleratesNodeTaintsPred, predicates.PodToleratesNodeTaints)
|
||||
|
@ -76,6 +76,7 @@ func TestDefaultPredicates(t *testing.T) {
|
||||
"GeneralPredicates",
|
||||
"CheckNodeMemoryPressure",
|
||||
"CheckNodeDiskPressure",
|
||||
"CheckNodePIDPressure",
|
||||
"CheckNodeCondition",
|
||||
"PodToleratesNodeTaints",
|
||||
predicates.CheckVolumeBindingPred,
|
||||
|
@ -25,6 +25,8 @@ import (
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/golang/glog"
|
||||
|
||||
"k8s.io/api/core/v1"
|
||||
policy "k8s.io/api/policy/v1beta1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
@ -40,8 +42,6 @@ import (
|
||||
"k8s.io/kubernetes/pkg/scheduler/schedulercache"
|
||||
"k8s.io/kubernetes/pkg/scheduler/util"
|
||||
"k8s.io/kubernetes/pkg/scheduler/volumebinder"
|
||||
|
||||
"github.com/golang/glog"
|
||||
)
|
||||
|
||||
// FailedPredicateMap declares a map[string][]algorithm.PredicateFailureReason type.
|
||||
|
@ -29,8 +29,11 @@ package core
|
||||
import (
|
||||
"container/heap"
|
||||
"fmt"
|
||||
"reflect"
|
||||
"sync"
|
||||
|
||||
"github.com/golang/glog"
|
||||
|
||||
"k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/client-go/tools/cache"
|
||||
@ -38,10 +41,6 @@ import (
|
||||
"k8s.io/kubernetes/pkg/scheduler/algorithm/predicates"
|
||||
priorityutil "k8s.io/kubernetes/pkg/scheduler/algorithm/priorities/util"
|
||||
"k8s.io/kubernetes/pkg/scheduler/util"
|
||||
|
||||
"reflect"
|
||||
|
||||
"github.com/golang/glog"
|
||||
)
|
||||
|
||||
// SchedulingQueue is an interface for a queue to store pods waiting to be scheduled.
|
||||
|
@ -837,6 +837,9 @@ func (c *configFactory) invalidateCachedPredicatesOnNodeUpdate(newNode *v1.Node,
|
||||
if oldConditions[v1.NodeDiskPressure] != newConditions[v1.NodeDiskPressure] {
|
||||
invalidPredicates.Insert(predicates.CheckNodeDiskPressurePred)
|
||||
}
|
||||
if oldConditions[v1.NodePIDPressure] != newConditions[v1.NodePIDPressure] {
|
||||
invalidPredicates.Insert(predicates.CheckNodePIDPressurePred)
|
||||
}
|
||||
if oldConditions[v1.NodeReady] != newConditions[v1.NodeReady] ||
|
||||
oldConditions[v1.NodeOutOfDisk] != newConditions[v1.NodeOutOfDisk] ||
|
||||
oldConditions[v1.NodeNetworkUnavailable] != newConditions[v1.NodeNetworkUnavailable] {
|
||||
|
@ -62,6 +62,7 @@ type NodeInfo struct {
|
||||
// Cached conditions of node for faster lookup.
|
||||
memoryPressureCondition v1.ConditionStatus
|
||||
diskPressureCondition v1.ConditionStatus
|
||||
pidPressureCondition v1.ConditionStatus
|
||||
|
||||
// Whenever NodeInfo changes, generation is bumped.
|
||||
// This is used to avoid cloning it if the object didn't change.
|
||||
@ -284,6 +285,14 @@ func (n *NodeInfo) DiskPressureCondition() v1.ConditionStatus {
|
||||
return n.diskPressureCondition
|
||||
}
|
||||
|
||||
// PIDPressureCondition returns the pid pressure condition status on this node.
|
||||
func (n *NodeInfo) PIDPressureCondition() v1.ConditionStatus {
|
||||
if n == nil {
|
||||
return v1.ConditionUnknown
|
||||
}
|
||||
return n.pidPressureCondition
|
||||
}
|
||||
|
||||
// RequestedResource returns aggregated resource request of pods on this node.
|
||||
func (n *NodeInfo) RequestedResource() Resource {
|
||||
if n == nil {
|
||||
@ -324,6 +333,7 @@ func (n *NodeInfo) Clone() *NodeInfo {
|
||||
TransientInfo: n.TransientInfo,
|
||||
memoryPressureCondition: n.memoryPressureCondition,
|
||||
diskPressureCondition: n.diskPressureCondition,
|
||||
pidPressureCondition: n.pidPressureCondition,
|
||||
usedPorts: make(util.HostPortInfo),
|
||||
generation: n.generation,
|
||||
}
|
||||
@ -482,6 +492,8 @@ func (n *NodeInfo) SetNode(node *v1.Node) error {
|
||||
n.memoryPressureCondition = cond.Status
|
||||
case v1.NodeDiskPressure:
|
||||
n.diskPressureCondition = cond.Status
|
||||
case v1.NodePIDPressure:
|
||||
n.pidPressureCondition = cond.Status
|
||||
default:
|
||||
// We ignore other conditions.
|
||||
}
|
||||
@ -502,6 +514,7 @@ func (n *NodeInfo) RemoveNode(node *v1.Node) error {
|
||||
n.taints, n.taintsErr = nil, nil
|
||||
n.memoryPressureCondition = v1.ConditionUnknown
|
||||
n.diskPressureCondition = v1.ConditionUnknown
|
||||
n.pidPressureCondition = v1.ConditionUnknown
|
||||
n.generation++
|
||||
return nil
|
||||
}
|
||||
|
@ -870,3 +870,53 @@ func TestInterPodAffinity(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestNodePIDPressure verifies that scheduler's CheckNodePIDPressurePredicate predicate
|
||||
// functions works correctly.
|
||||
func TestNodePIDPressure(t *testing.T) {
|
||||
context := initTest(t, "node-pid-pressure")
|
||||
defer cleanupTest(t, context)
|
||||
// Add a node.
|
||||
node, err := createNode(context.clientSet, "testnode", nil)
|
||||
if err != nil {
|
||||
t.Fatalf("Cannot create node: %v", err)
|
||||
}
|
||||
|
||||
cs := context.clientSet
|
||||
|
||||
// Adds PID pressure condition to the node.
|
||||
node.Status.Conditions = []v1.NodeCondition{
|
||||
{
|
||||
Type: v1.NodePIDPressure,
|
||||
Status: v1.ConditionTrue,
|
||||
},
|
||||
}
|
||||
|
||||
// Update node condition.
|
||||
err = updateNodeStatus(context.clientSet, node)
|
||||
if err != nil {
|
||||
t.Fatalf("Cannot update node: %v", err)
|
||||
}
|
||||
|
||||
// Creats test pod.
|
||||
testPod := &v1.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{Name: "pidpressure-fake-name"},
|
||||
Spec: v1.PodSpec{
|
||||
Containers: []v1.Container{
|
||||
{Name: "container", Image: imageutils.GetPauseImageName()},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
testPod, err = cs.CoreV1().Pods(context.ns.Name).Create(testPod)
|
||||
if err != nil {
|
||||
t.Fatalf("Test Failed: error: %v, while creating pod", err)
|
||||
}
|
||||
|
||||
err = waitForPodUnschedulable(cs, testPod)
|
||||
if err != nil {
|
||||
t.Errorf("Test Failed: error, %v, while waiting for scheduled", err)
|
||||
}
|
||||
|
||||
cleanupPods(cs, t, []*v1.Pod{testPod})
|
||||
}
|
||||
|
@ -134,6 +134,7 @@ func TestSchedulerCreationFromConfigMap(t *testing.T) {
|
||||
"CheckNodeCondition", // mandatory predicate
|
||||
"CheckNodeDiskPressure",
|
||||
"CheckNodeMemoryPressure",
|
||||
"CheckNodePIDPressure",
|
||||
"CheckVolumeBinding",
|
||||
"GeneralPredicates",
|
||||
"MatchInterPodAffinity",
|
||||
|
@ -45,6 +45,12 @@ import (
|
||||
// 2. NodeController taints nodes by node condition
|
||||
// 3. Scheduler allows pod to tolerate node condition taints, e.g. network unavailable
|
||||
func TestTaintNodeByCondition(t *testing.T) {
|
||||
enabled := utilfeature.DefaultFeatureGate.Enabled("TaintNodesByCondition")
|
||||
defer func() {
|
||||
if !enabled {
|
||||
utilfeature.DefaultFeatureGate.Set("TaintNodesByCondition=False")
|
||||
}
|
||||
}()
|
||||
// Enable TaintNodeByCondition
|
||||
utilfeature.DefaultFeatureGate.Set("TaintNodesByCondition=True")
|
||||
|
||||
|
@ -19,6 +19,7 @@ package scheduler
|
||||
import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@ -48,8 +49,6 @@ import (
|
||||
"k8s.io/kubernetes/pkg/scheduler/factory"
|
||||
"k8s.io/kubernetes/test/integration/framework"
|
||||
imageutils "k8s.io/kubernetes/test/utils/image"
|
||||
|
||||
"net/http/httptest"
|
||||
)
|
||||
|
||||
type TestContext struct {
|
||||
@ -318,6 +317,12 @@ func createNode(cs clientset.Interface, name string, res *v1.ResourceList) (*v1.
|
||||
return cs.CoreV1().Nodes().Create(n)
|
||||
}
|
||||
|
||||
// updateNodeStatus updates the status of node.
|
||||
func updateNodeStatus(cs clientset.Interface, node *v1.Node) error {
|
||||
_, err := cs.CoreV1().Nodes().UpdateStatus(node)
|
||||
return err
|
||||
}
|
||||
|
||||
// createNodes creates `numNodes` nodes. The created node names will be in the
|
||||
// form of "`prefix`-X" where X is an ordinal.
|
||||
func createNodes(cs clientset.Interface, prefix string, res *v1.ResourceList, numNodes int) ([]*v1.Node, error) {
|
||||
|
Loading…
Reference in New Issue
Block a user