diff --git a/pkg/api/types.go b/pkg/api/types.go index 3afc49cf0f6..8f08fe8dbc5 100644 --- a/pkg/api/types.go +++ b/pkg/api/types.go @@ -2224,6 +2224,8 @@ const ( NodeDiskPressure NodeConditionType = "DiskPressure" // NodeNetworkUnavailable means that network for the node is not correctly configured. NodeNetworkUnavailable NodeConditionType = "NetworkUnavailable" + // NodeInodePressure means the kubelet is under pressure due to insufficient available inodes. + NodeInodePressure NodeConditionType = "InodePressure" ) type NodeCondition struct { diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index 8395b3c4fa6..e2b7d5742b0 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -136,6 +136,13 @@ func (m *managerImpl) IsUnderDiskPressure() bool { return hasNodeCondition(m.nodeConditions, api.NodeDiskPressure) } +// IsUnderInodePressure returns true if the node is under inode pressure. +func (m *managerImpl) IsUnderInodePressure() bool { + m.RLock() + defer m.RUnlock() + return hasNodeCondition(m.nodeConditions, api.NodeInodePressure) +} + // synchronize is the main control loop that enforces eviction thresholds. func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) { // if we have nothing to do, just return diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go index 0deebaf3595..559d5867e6e 100644 --- a/pkg/kubelet/eviction/eviction_manager_test.go +++ b/pkg/kubelet/eviction/eviction_manager_test.go @@ -916,7 +916,7 @@ func TestNodeReclaimFuncs(t *testing.T) { } func TestDiskPressureNodeFsInodes(t *testing.T) { - // TODO: we need to know inodes used when cadvisor supports per container stats + // TODO(dashpole): we need to know inodes used when cadvisor supports per container stats podMaker := func(name string, requests api.ResourceList, limits api.ResourceList) (*api.Pod, statsapi.PodStats) { pod := newPod(name, []api.Container{ newContainer(name, requests, limits), }) @@ -943,7 +943,7 @@ func TestDiskPressureNodeFsInodes(t *testing.T) { } return result } - // TODO: pass inodes used in future when supported by cadvisor. + // TODO(dashpole): pass inodes used in future when supported by cadvisor. podsToMake := []struct { name string requests api.ResourceList limits api.ResourceList @@ -1013,9 +1013,9 @@ func TestDiskPressureNodeFsInodes(t *testing.T) { // synchronize manager.synchronize(diskInfoProvider, activePodsFunc) - // we should not have disk pressure - if manager.IsUnderDiskPressure() { - t.Errorf("Manager should not report disk pressure") + // we should not have inode pressure + if manager.IsUnderInodePressure() { + t.Errorf("Manager should not report inode pressure") } // try to admit our pod (should succeed) @@ -1028,9 +1028,9 @@ func TestDiskPressureNodeFsInodes(t *testing.T) { summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats) manager.synchronize(diskInfoProvider, activePodsFunc) - // we should have disk pressure - if !manager.IsUnderDiskPressure() { - t.Errorf("Manager should report disk pressure since soft threshold was met") + // we should have inode pressure + if !manager.IsUnderInodePressure() { + t.Errorf("Manager should report inode pressure since soft threshold was met") } // verify no pod was yet killed because there has not yet been enough time passed. 
@@ -1043,9 +1043,9 @@ func TestDiskPressureNodeFsInodes(t *testing.T) { summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats) manager.synchronize(diskInfoProvider, activePodsFunc) - // we should have disk pressure - if !manager.IsUnderDiskPressure() { - t.Errorf("Manager should report disk pressure since soft threshold was met") + // we should have inode pressure + if !manager.IsUnderInodePressure() { + t.Errorf("Manager should report inode pressure since soft threshold was met") } // verify the right pod was killed with the right grace period. @@ -1063,24 +1063,24 @@ func TestDiskPressureNodeFsInodes(t *testing.T) { podKiller.pod = nil podKiller.gracePeriodOverride = nil - // remove disk pressure + // remove inode pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats) manager.synchronize(diskInfoProvider, activePodsFunc) - // we should not have disk pressure - if manager.IsUnderDiskPressure() { - t.Errorf("Manager should not report disk pressure") + // we should not have inode pressure + if manager.IsUnderInodePressure() { + t.Errorf("Manager should not report inode pressure") } - // induce disk pressure! + // induce inode pressure! fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("0.5Mi", "4Mi", podStats) manager.synchronize(diskInfoProvider, activePodsFunc) - // we should have disk pressure - if !manager.IsUnderDiskPressure() { - t.Errorf("Manager should report disk pressure") + // we should have inode pressure + if !manager.IsUnderInodePressure() { + t.Errorf("Manager should report inode pressure") } // check the right pod was killed @@ -1097,15 +1097,15 @@ func TestDiskPressureNodeFsInodes(t *testing.T) { t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit) } - // reduce disk pressure + // reduce inode pressure fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats) podKiller.pod = nil // reset state manager.synchronize(diskInfoProvider, activePodsFunc) - // we should have disk pressure (because transition period not yet met) - if !manager.IsUnderDiskPressure() { - t.Errorf("Manager should report disk pressure") + // we should have inode pressure (because transition period not yet met) + if !manager.IsUnderInodePressure() { + t.Errorf("Manager should report inode pressure") } // no pod should have been killed @@ -1124,9 +1124,9 @@ func TestDiskPressureNodeFsInodes(t *testing.T) { podKiller.pod = nil // reset state manager.synchronize(diskInfoProvider, activePodsFunc) - // we should not have disk pressure (because transition period met) - if manager.IsUnderDiskPressure() { - t.Errorf("Manager should not report disk pressure") + // we should not have inode pressure (because transition period met) + if manager.IsUnderInodePressure() { + t.Errorf("Manager should not report inode pressure") } // no pod should have been killed diff --git a/pkg/kubelet/eviction/helpers.go b/pkg/kubelet/eviction/helpers.go index e70dd2cf8f5..77bf5f3ebe0 100644 --- a/pkg/kubelet/eviction/helpers.go +++ b/pkg/kubelet/eviction/helpers.go @@ -68,8 +68,8 @@ func init() { signalToNodeCondition[SignalMemoryAvailable] = api.NodeMemoryPressure signalToNodeCondition[SignalImageFsAvailable] = api.NodeDiskPressure signalToNodeCondition[SignalNodeFsAvailable] = api.NodeDiskPressure - signalToNodeCondition[SignalImageFsInodesFree] = api.NodeDiskPressure - signalToNodeCondition[SignalNodeFsInodesFree] = api.NodeDiskPressure + 
signalToNodeCondition[SignalImageFsInodesFree] = api.NodeInodePressure + signalToNodeCondition[SignalNodeFsInodesFree] = api.NodeInodePressure // map signals to resources (and vice-versa) signalToResource = map[Signal]api.ResourceName{} diff --git a/pkg/kubelet/eviction/types.go b/pkg/kubelet/eviction/types.go index 3d85f44d2a1..3fd00dedeab 100644 --- a/pkg/kubelet/eviction/types.go +++ b/pkg/kubelet/eviction/types.go @@ -104,6 +104,9 @@ type Manager interface { // IsUnderDiskPressure returns true if the node is under disk pressure. IsUnderDiskPressure() bool + + // IsUnderInodePressure returns true if the node is under inode pressure. + IsUnderInodePressure() bool } // DiskInfoProvider is responsible for informing the manager how disk is configured. diff --git a/pkg/kubelet/kubelet_node_status.go b/pkg/kubelet/kubelet_node_status.go index 1f0e9f175e5..faca94d988d 100644 --- a/pkg/kubelet/kubelet_node_status.go +++ b/pkg/kubelet/kubelet_node_status.go @@ -742,6 +742,65 @@ func (kl *Kubelet) setNodeDiskPressureCondition(node *api.Node) { } } +// setNodeInodePressureCondition for the node. +// TODO: this needs to move somewhere centralized... +func (kl *Kubelet) setNodeInodePressureCondition(node *api.Node) { + currentTime := unversioned.NewTime(kl.clock.Now()) + var condition *api.NodeCondition + + // Check if NodeInodePressure condition already exists and if it does, just pick it up for update. + for i := range node.Status.Conditions { + if node.Status.Conditions[i].Type == api.NodeInodePressure { + condition = &node.Status.Conditions[i] + } + } + + newCondition := false + // If the NodeInodePressure condition doesn't exist, create one + if condition == nil { + condition = &api.NodeCondition{ + Type: api.NodeInodePressure, + Status: api.ConditionUnknown, + } + // cannot be appended to node.Status.Conditions here because it gets + // copied to the slice. So if we append to the slice here none of the + // updates we make below are reflected in the slice. + newCondition = true + } + + // Update the heartbeat time + condition.LastHeartbeatTime = currentTime + + // Note: The conditions below take care of the case when a new NodeInodePressure condition is + // created as well as the case when the condition already exists. When a new condition + // is created its status is set to api.ConditionUnknown which matches either + // condition.Status != api.ConditionTrue or + // condition.Status != api.ConditionFalse in the conditions below depending on whether + // the kubelet is under inode pressure or not. + if kl.evictionManager.IsUnderInodePressure() { + if condition.Status != api.ConditionTrue { + condition.Status = api.ConditionTrue + condition.Reason = "KubeletHasInodePressure" + condition.Message = "kubelet has inode pressure" + condition.LastTransitionTime = currentTime + kl.recordNodeStatusEvent(api.EventTypeNormal, "NodeHasInodePressure") + } + } else { + if condition.Status != api.ConditionFalse { + condition.Status = api.ConditionFalse + condition.Reason = "KubeletHasNoInodePressure" + condition.Message = "kubelet has no inode pressure" + condition.LastTransitionTime = currentTime + kl.recordNodeStatusEvent(api.EventTypeNormal, "NodeHasNoInodePressure") + } + } + + if newCondition { + node.Status.Conditions = append(node.Status.Conditions, *condition) + } + +} + // Set OODcondition for the node. 
func (kl *Kubelet) setNodeOODCondition(node *api.Node) { currentTime := unversioned.NewTime(kl.clock.Now()) @@ -856,6 +915,7 @@ func (kl *Kubelet) defaultNodeStatusFuncs() []func(*api.Node) error { withoutError(kl.setNodeOODCondition), withoutError(kl.setNodeMemoryPressureCondition), withoutError(kl.setNodeDiskPressureCondition), + withoutError(kl.setNodeInodePressureCondition), withoutError(kl.setNodeReadyCondition), withoutError(kl.setNodeVolumesInUseStatus), withoutError(kl.recordNodeSchedulableEvent), diff --git a/pkg/kubelet/kubelet_node_status_test.go b/pkg/kubelet/kubelet_node_status_test.go index d387fbcd6c9..e2f8acef6ea 100644 --- a/pkg/kubelet/kubelet_node_status_test.go +++ b/pkg/kubelet/kubelet_node_status_test.go @@ -149,6 +149,14 @@ func TestUpdateNewNodeStatus(t *testing.T) { LastHeartbeatTime: unversioned.Time{}, LastTransitionTime: unversioned.Time{}, }, + { + Type: api.NodeInodePressure, + Status: api.ConditionFalse, + Reason: "KubeletHasNoInodePressure", + Message: fmt.Sprintf("kubelet has no inode pressure"), + LastHeartbeatTime: unversioned.Time{}, + LastTransitionTime: unversioned.Time{}, + }, { Type: api.NodeReady, Status: api.ConditionTrue, @@ -340,6 +348,14 @@ func TestUpdateExistingNodeStatus(t *testing.T) { LastHeartbeatTime: unversioned.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), LastTransitionTime: unversioned.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), }, + { + Type: api.NodeInodePressure, + Status: api.ConditionFalse, + Reason: "KubeletHasSufficientInode", + Message: fmt.Sprintf("kubelet has sufficient inodes available"), + LastHeartbeatTime: unversioned.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + LastTransitionTime: unversioned.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + }, { Type: api.NodeReady, Status: api.ConditionTrue, @@ -412,6 +428,14 @@ func TestUpdateExistingNodeStatus(t *testing.T) { LastHeartbeatTime: unversioned.Time{}, LastTransitionTime: unversioned.Time{}, }, + { + Type: api.NodeInodePressure, + Status: api.ConditionFalse, + Reason: "KubeletHasSufficientInode", + Message: fmt.Sprintf("kubelet has sufficient inodes available"), + LastHeartbeatTime: unversioned.Time{}, + LastTransitionTime: unversioned.Time{}, + }, { Type: api.NodeReady, Status: api.ConditionTrue, @@ -716,6 +740,14 @@ func TestUpdateNodeStatusWithRuntimeStateError(t *testing.T) { LastHeartbeatTime: unversioned.Time{}, LastTransitionTime: unversioned.Time{}, }, + { + Type: api.NodeInodePressure, + Status: api.ConditionFalse, + Reason: "KubeletHasNoInodePressure", + Message: fmt.Sprintf("kubelet has no inode pressure"), + LastHeartbeatTime: unversioned.Time{}, + LastTransitionTime: unversioned.Time{}, + }, {}, //placeholder }, NodeInfo: api.NodeSystemInfo{ diff --git a/plugin/pkg/scheduler/algorithm/predicates/error.go b/plugin/pkg/scheduler/algorithm/predicates/error.go index a71cdb9aae7..028c2a96d7c 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/error.go +++ b/plugin/pkg/scheduler/algorithm/predicates/error.go @@ -37,6 +37,7 @@ var ( ErrMaxVolumeCountExceeded = newPredicateFailureError("MaxVolumeCount") ErrNodeUnderMemoryPressure = newPredicateFailureError("NodeUnderMemoryPressure") ErrNodeUnderDiskPressure = newPredicateFailureError("NodeUnderDiskPressure") + ErrNodeUnderInodePressure = newPredicateFailureError("NodeUnderInodePressure") // ErrFakePredicate is used for test only. The fake predicates returning false also returns error // as ErrFakePredicate. 
ErrFakePredicate = newPredicateFailureError("FakePredicateError") diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates.go b/plugin/pkg/scheduler/algorithm/predicates/predicates.go index ed7e96ad5e4..8dfb3e86558 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/predicates.go +++ b/plugin/pkg/scheduler/algorithm/predicates/predicates.go @@ -1168,3 +1168,21 @@ func CheckNodeDiskPressurePredicate(pod *api.Pod, meta interface{}, nodeInfo *sc return true, nil, nil } + +// CheckNodeInodePressurePredicate checks if a pod can be scheduled on a node +// reporting inode pressure condition. +func CheckNodeInodePressurePredicate(pod *api.Pod, meta interface{}, nodeInfo *schedulercache.NodeInfo) (bool, []algorithm.PredicateFailureReason, error) { + node := nodeInfo.Node() + if node == nil { + return false, nil, fmt.Errorf("node not found") + } + + // is node under inode pressure? + for _, cond := range node.Status.Conditions { + if cond.Type == api.NodeInodePressure && cond.Status == api.ConditionTrue { + return false, []algorithm.PredicateFailureReason{ErrNodeUnderInodePressure}, nil + } + } + + return true, nil, nil +} diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go b/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go index 40893e73ab8..47a1ea714b0 100755 --- a/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go +++ b/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go @@ -3028,3 +3028,75 @@ func TestPodSchedulesOnNodeWithDiskPressureCondition(t *testing.T) { } } } + +func TestPodSchedulesOnNodeWithInodePressureCondition(t *testing.T) { + pod := &api.Pod{ + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Name: "container", + Image: "image", + ImagePullPolicy: "Always", + }, + }, + }, + } + + // specify a node with no inode pressure condition on + noPressureNode := &api.Node{ + Status: api.NodeStatus{ + Conditions: []api.NodeCondition{ + { + Type: api.NodeReady, + Status: api.ConditionTrue, + }, + }, + }, + } + + // specify a node with pressure condition on + pressureNode := &api.Node{ + Status: api.NodeStatus{ + Conditions: []api.NodeCondition{ + { + Type: api.NodeInodePressure, + Status: api.ConditionTrue, + }, + }, + }, + } + + tests := []struct { + pod *api.Pod + nodeInfo *schedulercache.NodeInfo + fits bool + name string + }{ + { + pod: pod, + nodeInfo: makeEmptyNodeInfo(noPressureNode), + fits: true, + name: "pod schedulable on node without inode pressure condition on", + }, + { + pod: pod, + nodeInfo: makeEmptyNodeInfo(pressureNode), + fits: false, + name: "pod not schedulable on node with inode pressure condition on", + }, + } + expectedFailureReasons := []algorithm.PredicateFailureReason{ErrNodeUnderInodePressure} + + for _, test := range tests { + fits, reasons, err := CheckNodeInodePressurePredicate(test.pod, PredicateMetadata(test.pod, nil), test.nodeInfo) + if err != nil { + t.Errorf("%s: unexpected error: %v", test.name, err) + } + if !fits && !reflect.DeepEqual(reasons, expectedFailureReasons) { + t.Errorf("%s: unexpected failure reasons: %v, want: %v", test.name, reasons, expectedFailureReasons) + } + if fits != test.fits { + t.Errorf("%s: expected %v got %v", test.name, test.fits, fits) + } + } +} diff --git a/plugin/pkg/scheduler/algorithmprovider/defaults/compatibility_test.go b/plugin/pkg/scheduler/algorithmprovider/defaults/compatibility_test.go index 2b2c6cf4fa8..8a28b190e7e 100644 --- a/plugin/pkg/scheduler/algorithmprovider/defaults/compatibility_test.go +++ 
b/plugin/pkg/scheduler/algorithmprovider/defaults/compatibility_test.go @@ -306,6 +306,76 @@ func TestCompatibility_v1_Scheduler(t *testing.T) { }, }, }, + + // Do not change this JSON after the corresponding release has been tagged. + // A failure indicates backwards compatibility with the specified release was broken. + "1.5": { + JSON: `{ + "kind": "Policy", + "apiVersion": "v1", + "predicates": [ + {"name": "MatchNodeSelector"}, + {"name": "PodFitsResources"}, + {"name": "PodFitsHostPorts"}, + {"name": "HostName"}, + {"name": "NoDiskConflict"}, + {"name": "NoVolumeZoneConflict"}, + {"name": "PodToleratesNodeTaints"}, + {"name": "CheckNodeMemoryPressure"}, + {"name": "CheckNodeDiskPressure"}, + {"name": "CheckNodeInodePressure"}, + {"name": "MaxEBSVolumeCount"}, + {"name": "MaxGCEPDVolumeCount"}, + {"name": "MatchInterPodAffinity"}, + {"name": "GeneralPredicates"}, + {"name": "TestServiceAffinity", "argument": {"serviceAffinity" : {"labels" : ["region"]}}}, + {"name": "TestLabelsPresence", "argument": {"labelsPresence" : {"labels" : ["foo"], "presence":true}}} + ],"priorities": [ + {"name": "EqualPriority", "weight": 2}, + {"name": "ImageLocalityPriority", "weight": 2}, + {"name": "LeastRequestedPriority", "weight": 2}, + {"name": "BalancedResourceAllocation", "weight": 2}, + {"name": "SelectorSpreadPriority", "weight": 2}, + {"name": "NodePreferAvoidPodsPriority", "weight": 2}, + {"name": "NodeAffinityPriority", "weight": 2}, + {"name": "TaintTolerationPriority", "weight": 2}, + {"name": "InterPodAffinityPriority", "weight": 2}, + {"name": "MostRequestedPriority", "weight": 2} + ] + }`, + ExpectedPolicy: schedulerapi.Policy{ + Predicates: []schedulerapi.PredicatePolicy{ + {Name: "MatchNodeSelector"}, + {Name: "PodFitsResources"}, + {Name: "PodFitsHostPorts"}, + {Name: "HostName"}, + {Name: "NoDiskConflict"}, + {Name: "NoVolumeZoneConflict"}, + {Name: "PodToleratesNodeTaints"}, + {Name: "CheckNodeMemoryPressure"}, + {Name: "CheckNodeDiskPressure"}, + {Name: "CheckNodeInodePressure"}, + {Name: "MaxEBSVolumeCount"}, + {Name: "MaxGCEPDVolumeCount"}, + {Name: "MatchInterPodAffinity"}, + {Name: "GeneralPredicates"}, + {Name: "TestServiceAffinity", Argument: &schedulerapi.PredicateArgument{ServiceAffinity: &schedulerapi.ServiceAffinity{Labels: []string{"region"}}}}, + {Name: "TestLabelsPresence", Argument: &schedulerapi.PredicateArgument{LabelsPresence: &schedulerapi.LabelsPresence{Labels: []string{"foo"}, Presence: true}}}, + }, + Priorities: []schedulerapi.PriorityPolicy{ + {Name: "EqualPriority", Weight: 2}, + {Name: "ImageLocalityPriority", Weight: 2}, + {Name: "LeastRequestedPriority", Weight: 2}, + {Name: "BalancedResourceAllocation", Weight: 2}, + {Name: "SelectorSpreadPriority", Weight: 2}, + {Name: "NodePreferAvoidPodsPriority", Weight: 2}, + {Name: "NodeAffinityPriority", Weight: 2}, + {Name: "TaintTolerationPriority", Weight: 2}, + {Name: "InterPodAffinityPriority", Weight: 2}, + {Name: "MostRequestedPriority", Weight: 2}, + }, + }, + }, } registeredPredicates := sets.NewString(factory.ListRegisteredFitPredicates()...) diff --git a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go index 487a0f9bb16..373cd498994 100644 --- a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go +++ b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go @@ -158,6 +158,9 @@ func defaultPredicates() sets.String { // Fit is determined by node disk pressure condition. 
factory.RegisterFitPredicate("CheckNodeDiskPressure", predicates.CheckNodeDiskPressurePredicate), + // Fit is determined by node inode pressure condition. + factory.RegisterFitPredicate("CheckNodeInodePressure", predicates.CheckNodeInodePressurePredicate), + // Fit is determined by inter-pod affinity. factory.RegisterFitPredicateFactory( "MatchInterPodAffinity",