Node lifecycle controller should use a label for excluding nodes

The current mechanism for excluding "master" nodes based on node names
is fragile and should be fixed by using a label exclusion similar to
service load balancers. The legacy code path is preserved behind a
defaulted-on gate and will be removed in the future.
This commit is contained in:
Clayton Coleman 2019-07-16 22:24:21 -04:00
parent 0f49d892d5
commit 2888e6e923
No known key found for this signature in database
GPG Key ID: 3D16906B4F1C5CB3
4 changed files with 84 additions and 6 deletions

View File

@ -17,7 +17,6 @@ go_library(
"//pkg/scheduler/api:go_default_library",
"//pkg/util/metrics:go_default_library",
"//pkg/util/node:go_default_library",
"//pkg/util/system:go_default_library",
"//pkg/util/taints:go_default_library",
"//staging/src/k8s.io/api/coordination/v1beta1:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",

View File

@ -23,13 +23,14 @@ package nodelifecycle
import (
"fmt"
"strings"
"sync"
"time"
"k8s.io/klog"
coordv1beta1 "k8s.io/api/coordination/v1beta1"
"k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@ -54,11 +55,11 @@ import (
"k8s.io/kubernetes/pkg/controller/nodelifecycle/scheduler"
nodeutil "k8s.io/kubernetes/pkg/controller/util/node"
"k8s.io/kubernetes/pkg/features"
kubefeatures "k8s.io/kubernetes/pkg/features"
kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis"
schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
"k8s.io/kubernetes/pkg/util/metrics"
utilnode "k8s.io/kubernetes/pkg/util/node"
"k8s.io/kubernetes/pkg/util/system"
taintutils "k8s.io/kubernetes/pkg/util/taints"
)
@ -715,8 +716,8 @@ func (nc *Controller) monitorNodeHealth() error {
continue
}
// We do not treat a master node as a part of the cluster for network disruption checking.
if !system.IsMasterNode(node.Name) {
// Some nodes may be excluded from disruption checking
if !isNodeExcludedFromDisruptionChecks(node) {
zoneToNodeConditions[utilnode.GetZoneKey(node)] = append(zoneToNodeConditions[utilnode.GetZoneKey(node)], currentReadyCondition)
}
@ -806,6 +807,45 @@ func (nc *Controller) monitorNodeHealth() error {
return nil
}
// labelNodeDisruptionExclusion is a label on nodes that controls whether they are
// excluded from being considered for disruption checks by the node controller.
const labelNodeDisruptionExclusion = "node.kubernetes.io/exclude-disruption"
func isNodeExcludedFromDisruptionChecks(node *v1.Node) bool {
// DEPRECATED: will be removed in 1.19
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.LegacyNodeRoleBehavior) {
if legacyIsMasterNode(node.Name) {
return true
}
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeDisruptionExclusion) {
if _, ok := node.Labels[labelNodeDisruptionExclusion]; ok {
return true
}
}
return false
}
// legacyIsMasterNode returns true if given node is a registered master according
// to the logic historically used for this function. This code path is deprecated
// and the node disruption exclusion label should be used in the future.
// This code will not be allowed to update to use the node-role label, since
// node-roles may not be used for feature enablement.
// DEPRECATED: Will be removed in 1.19
func legacyIsMasterNode(nodeName string) bool {
// We are trying to capture "master(-...)?$" regexp.
// However, using regexp.MatchString() results even in more than 35%
// of all space allocations in ControllerManager spent in this function.
// That's why we are trying to be a bit smarter.
if strings.HasSuffix(nodeName, "master") {
return true
}
if len(nodeName) >= 10 {
return strings.HasSuffix(nodeName[:len(nodeName)-3], "master-")
}
return false
}
// tryUpdateNodeHealth checks a given node's conditions and tries to update it. Returns grace period to
// which given node is entitled, state of current and last observed Ready Condition, and an error if it occurred.
func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.NodeCondition, *v1.NodeCondition, error) {

View File

@ -23,7 +23,7 @@ import (
apps "k8s.io/api/apps/v1"
coordv1beta1 "k8s.io/api/coordination/v1beta1"
"k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@ -3270,3 +3270,35 @@ func TestTryUpdateNodeHealth(t *testing.T) {
})
}
}
func Test_isNodeExcludedFromDisruptionChecks(t *testing.T) {
validNodeStatus := v1.NodeStatus{Conditions: []v1.NodeCondition{{Type: "Test"}}}
tests := []struct {
name string
enableExclusion bool
enableLegacy bool
input *v1.Node
want bool
}{
{want: false, input: &v1.Node{Status: validNodeStatus, ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{}}}},
{want: false, input: &v1.Node{Status: validNodeStatus, ObjectMeta: metav1.ObjectMeta{Name: "master-abc"}}},
{want: false, input: &v1.Node{Status: validNodeStatus, ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{labelNodeDisruptionExclusion: ""}}}},
{want: false, enableExclusion: true, input: &v1.Node{Status: validNodeStatus, ObjectMeta: metav1.ObjectMeta{Name: "master-abc"}}},
{want: false, enableLegacy: true, input: &v1.Node{Status: validNodeStatus, ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{labelNodeDisruptionExclusion: ""}}}},
{want: true, enableLegacy: true, input: &v1.Node{Status: validNodeStatus, ObjectMeta: metav1.ObjectMeta{Name: "master-abc"}}},
{want: true, enableExclusion: true, input: &v1.Node{Status: validNodeStatus, ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{labelNodeDisruptionExclusion: ""}}}},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.NodeDisruptionExclusion, tt.enableExclusion)()
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.LegacyNodeRoleBehavior, tt.enableLegacy)()
if result := isNodeExcludedFromDisruptionChecks(tt.input); result != tt.want {
t.Errorf("isNodeExcludedFromDisruptionChecks() = %v, want %v", result, tt.want)
}
})
}
}

View File

@ -184,6 +184,12 @@ const (
// Enable nodes to exclude themselves from service load balancers
ServiceNodeExclusion featuregate.Feature = "ServiceNodeExclusion"
// owner @smarterclayton
// alpha: v1.16
//
// Enable nodes to exclude themselves from network disruption checks
NodeDisruptionExclusion featuregate.Feature = "NodeDisruptionExclusion"
// owner: @jsafrane
// alpha: v1.9
//
@ -508,6 +514,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS
CPUCFSQuotaPeriod: {Default: false, PreRelease: featuregate.Alpha},
TopologyManager: {Default: false, PreRelease: featuregate.Alpha},
ServiceNodeExclusion: {Default: false, PreRelease: featuregate.Alpha},
NodeDisruptionExclusion: {Default: false, PreRelease: featuregate.Alpha},
MountContainers: {Default: false, PreRelease: featuregate.Alpha},
CSIDriverRegistry: {Default: true, PreRelease: featuregate.Beta},
CSINodeInfo: {Default: true, PreRelease: featuregate.Beta},