Remove deprecated pod-eviction-timeout flag from controller-manager

Andrea Tosatto 2023-03-07 14:27:14 +00:00
parent 98bbeae99f
commit cae19f9e85
10 changed files with 4 additions and 639 deletions
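
With the taint manager always enabled, eviction from failed nodes is driven by the NoExecute taints the node lifecycle controller applies, so the grace period is now controlled per pod through tolerationSeconds rather than by a controller-wide --pod-eviction-timeout. A minimal sketch of that per-pod replacement, assuming only the well-known taint keys from k8s.io/api/core/v1; the 2-minute value simply mirrors the old --pod-eviction-timeout=2m used in the tests below and is not a recommendation:

package main

import (
	"fmt"
	"time"

	v1 "k8s.io/api/core/v1"
)

// exampleTolerations builds the per-pod tolerations that bound how long a pod
// stays on a NotReady/Unreachable node before the taint manager evicts it.
func exampleTolerations(evictAfter time.Duration) []v1.Toleration {
	seconds := int64(evictAfter.Seconds())
	return []v1.Toleration{
		{Key: v1.TaintNodeNotReady, Operator: v1.TolerationOpExists, Effect: v1.TaintEffectNoExecute, TolerationSeconds: &seconds},
		{Key: v1.TaintNodeUnreachable, Operator: v1.TolerationOpExists, Effect: v1.TaintEffectNoExecute, TolerationSeconds: &seconds},
	}
}

func main() {
	for _, t := range exampleTolerations(2 * time.Minute) {
		fmt.Printf("tolerate %s (NoExecute) for %ds before eviction\n", t.Key, *t.TolerationSeconds)
	}
}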

View File

@ -186,7 +186,6 @@ func startNodeLifecycleController(ctx context.Context, controllerContext Control
controllerContext.ComponentConfig.KubeCloudShared.NodeMonitorPeriod.Duration,
controllerContext.ComponentConfig.NodeLifecycleController.NodeStartupGracePeriod.Duration,
controllerContext.ComponentConfig.NodeLifecycleController.NodeMonitorGracePeriod.Duration,
controllerContext.ComponentConfig.NodeLifecycleController.PodEvictionTimeout.Duration,
controllerContext.ComponentConfig.NodeLifecycleController.NodeEvictionRate,
controllerContext.ComponentConfig.NodeLifecycleController.SecondaryNodeEvictionRate,
controllerContext.ComponentConfig.NodeLifecycleController.LargeClusterSizeThreshold,

View File

@ -39,12 +39,10 @@ func (o *NodeLifecycleControllerOptions) AddFlags(fs *pflag.FlagSet) {
"Amount of time which we allow running Node to be unresponsive before marking it unhealthy. "+
"Must be N times more than kubelet's nodeStatusUpdateFrequency, "+
"where N means number of retries allowed for kubelet to post node status.")
fs.DurationVar(&o.PodEvictionTimeout.Duration, "pod-eviction-timeout", o.PodEvictionTimeout.Duration, "The grace period for deleting pods on failed nodes.")
fs.Float32Var(&o.NodeEvictionRate, "node-eviction-rate", 0.1, "Number of nodes per second on which pods are deleted in case of node failure when a zone is healthy (see --unhealthy-zone-threshold for definition of healthy/unhealthy). Zone refers to entire cluster in non-multizone clusters.")
fs.Float32Var(&o.SecondaryNodeEvictionRate, "secondary-node-eviction-rate", 0.01, "Number of nodes per second on which pods are deleted in case of node failure when a zone is unhealthy (see --unhealthy-zone-threshold for definition of healthy/unhealthy). Zone refers to entire cluster in non-multizone clusters. This value is implicitly overridden to 0 if the cluster size is smaller than --large-cluster-size-threshold.")
fs.Int32Var(&o.LargeClusterSizeThreshold, "large-cluster-size-threshold", 50, "Number of nodes from which NodeController treats the cluster as large for the eviction logic purposes. --secondary-node-eviction-rate is implicitly overridden to 0 for clusters this size or smaller.")
fs.Float32Var(&o.UnhealthyZoneThreshold, "unhealthy-zone-threshold", 0.55, "Fraction of Nodes in a zone which needs to be not Ready (minimum 3) for zone to be treated as unhealthy. ")
fs.MarkDeprecated("pod-eviction-timeout", "This flag is deprecated and it will be removed in 1.28. Since taint manager is always enabled, this flag will have no effect.")
}
// ApplyTo fills up NodeLifecycleController config with options.
@ -55,7 +53,6 @@ func (o *NodeLifecycleControllerOptions) ApplyTo(cfg *nodelifecycleconfig.NodeLi
cfg.NodeStartupGracePeriod = o.NodeStartupGracePeriod
cfg.NodeMonitorGracePeriod = o.NodeMonitorGracePeriod
cfg.PodEvictionTimeout = o.PodEvictionTimeout
cfg.NodeEvictionRate = o.NodeEvictionRate
cfg.SecondaryNodeEvictionRate = o.SecondaryNodeEvictionRate
cfg.LargeClusterSizeThreshold = o.LargeClusterSizeThreshold
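
Aside: a hedged sketch of how the surviving flags above interact, paraphrasing their help text; the function and parameter names are illustrative, not from the tree. In a healthy zone the controller evicts at node-eviction-rate, in an unhealthy zone it falls back to secondary-node-eviction-rate, and that secondary rate is forced to zero for clusters at or below large-cluster-size-threshold:

package main

import "fmt"

// effectiveEvictionQPS mirrors the flag descriptions: primary rate in healthy
// zones, secondary rate in unhealthy ones, and no secondary eviction at all
// when the cluster is not considered large.
func effectiveEvictionQPS(zoneHealthy bool, clusterSize int32,
	nodeEvictionRate, secondaryNodeEvictionRate float32, largeClusterSizeThreshold int32) float32 {
	if zoneHealthy {
		return nodeEvictionRate // default 0.1: roughly one node every 10s
	}
	if clusterSize <= largeClusterSizeThreshold {
		return 0
	}
	return secondaryNodeEvictionRate // default 0.01: roughly one node every 100s
}

func main() {
	fmt.Println(effectiveEvictionQPS(true, 40, 0.1, 0.01, 50))   // 0.1
	fmt.Println(effectiveEvictionQPS(false, 40, 0.1, 0.01, 50))  // 0
	fmt.Println(effectiveEvictionQPS(false, 200, 0.1, 0.01, 50)) // 0.01
}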

View File

@ -141,7 +141,6 @@ var args = []string{
"--node-monitor-grace-period=30s",
"--node-monitor-period=10s",
"--node-startup-grace-period=30s",
"--pod-eviction-timeout=2m",
"--profiling=false",
"--pv-recycler-increment-timeout-nfs=45",
"--pv-recycler-minimum-timeout-hostpath=45",
@ -347,7 +346,6 @@ func TestAddFlags(t *testing.T) {
SecondaryNodeEvictionRate: 0.05,
NodeMonitorGracePeriod: metav1.Duration{Duration: 30 * time.Second},
NodeStartupGracePeriod: metav1.Duration{Duration: 30 * time.Second},
PodEvictionTimeout: metav1.Duration{Duration: 2 * time.Minute},
LargeClusterSizeThreshold: 100,
UnhealthyZoneThreshold: 0.6,
},
@ -591,7 +589,6 @@ func TestApplyTo(t *testing.T) {
SecondaryNodeEvictionRate: 0.05,
NodeMonitorGracePeriod: metav1.Duration{Duration: 30 * time.Second},
NodeStartupGracePeriod: metav1.Duration{Duration: 30 * time.Second},
PodEvictionTimeout: metav1.Duration{Duration: 2 * time.Minute},
LargeClusterSizeThreshold: 100,
UnhealthyZoneThreshold: 0.6,
},
@ -1165,7 +1162,6 @@ func TestValidateControllersOptions(t *testing.T) {
SecondaryNodeEvictionRate: 0.05,
NodeMonitorGracePeriod: metav1.Duration{Duration: 30 * time.Second},
NodeStartupGracePeriod: metav1.Duration{Duration: 30 * time.Second},
PodEvictionTimeout: metav1.Duration{Duration: 2 * time.Minute},
LargeClusterSizeThreshold: 100,
UnhealthyZoneThreshold: 0.6,
},

View File

@ -34,8 +34,6 @@ type NodeLifecycleControllerConfiguration struct {
// nodeStatusUpdateFrequency, where N means number of retries allowed for kubelet
// to post node status.
NodeMonitorGracePeriod metav1.Duration
// podEvictionTimeout is the grace period for deleting pods on failed nodes.
PodEvictionTimeout metav1.Duration
// secondaryNodeEvictionRate is implicitly overridden to 0 for clusters smaller than or equal to largeClusterSizeThreshold
LargeClusterSizeThreshold int32
// Zone is treated as unhealthy in nodeEvictionRate and secondaryNodeEvictionRate when at least
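
For reference, a sketch (not part of the diff, and assuming the in-tree import path) of filling in the trimmed configuration struct above after this change, using the same values the options tests earlier in this commit expect; PodEvictionTimeout simply no longer exists as a field:

package main

import (
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	nodelifecycleconfig "k8s.io/kubernetes/pkg/controller/nodelifecycle/config"
)

func main() {
	cfg := nodelifecycleconfig.NodeLifecycleControllerConfiguration{
		NodeEvictionRate:          0.1, // flag default shown above
		SecondaryNodeEvictionRate: 0.05,
		NodeStartupGracePeriod:    metav1.Duration{Duration: 30 * time.Second},
		NodeMonitorGracePeriod:    metav1.Duration{Duration: 30 * time.Second},
		LargeClusterSizeThreshold: 100,
		UnhealthyZoneThreshold:    0.6,
	}
	fmt.Printf("%+v\n", cfg)
}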

View File

@ -86,7 +86,7 @@ func autoConvert_v1alpha1_NodeLifecycleControllerConfiguration_To_config_NodeLif
out.SecondaryNodeEvictionRate = in.SecondaryNodeEvictionRate
out.NodeStartupGracePeriod = in.NodeStartupGracePeriod
out.NodeMonitorGracePeriod = in.NodeMonitorGracePeriod
out.PodEvictionTimeout = in.PodEvictionTimeout
// WARNING: in.PodEvictionTimeout requires manual conversion: does not exist in peer-type
out.LargeClusterSizeThreshold = in.LargeClusterSizeThreshold
out.UnhealthyZoneThreshold = in.UnhealthyZoneThreshold
return nil
@ -97,7 +97,6 @@ func autoConvert_config_NodeLifecycleControllerConfiguration_To_v1alpha1_NodeLif
out.SecondaryNodeEvictionRate = in.SecondaryNodeEvictionRate
out.NodeStartupGracePeriod = in.NodeStartupGracePeriod
out.NodeMonitorGracePeriod = in.NodeMonitorGracePeriod
out.PodEvictionTimeout = in.PodEvictionTimeout
out.LargeClusterSizeThreshold = in.LargeClusterSizeThreshold
out.UnhealthyZoneThreshold = in.UnhealthyZoneThreshold
return nil

View File

@ -26,7 +26,6 @@ func (in *NodeLifecycleControllerConfiguration) DeepCopyInto(out *NodeLifecycleC
*out = *in
out.NodeStartupGracePeriod = in.NodeStartupGracePeriod
out.NodeMonitorGracePeriod = in.NodeMonitorGracePeriod
out.PodEvictionTimeout = in.PodEvictionTimeout
return
}

View File

@ -273,7 +273,7 @@ type Controller struct {
// post node status/lease. It is pointless to make nodeMonitorGracePeriod
// be less than the node health signal update frequency, since there will
// only be fresh values from Kubelet at an interval of node health signal
// update frequency. The constant must be less than podEvictionTimeout.
// update frequency.
// 2. nodeMonitorGracePeriod can't be too large for user experience - larger
// value takes longer for user to see up-to-date node health.
nodeMonitorGracePeriod time.Duration
@ -282,7 +282,6 @@ type Controller struct {
// Defaults to scheduler.UpdateWorkerSize.
nodeUpdateWorkerSize int
podEvictionTimeout time.Duration
evictionLimiterQPS float32
secondaryEvictionLimiterQPS float32
largeClusterThreshold int32
@ -303,7 +302,6 @@ func NewNodeLifecycleController(
nodeMonitorPeriod time.Duration,
nodeStartupGracePeriod time.Duration,
nodeMonitorGracePeriod time.Duration,
podEvictionTimeout time.Duration,
evictionLimiterQPS float32,
secondaryEvictionLimiterQPS float32,
largeClusterThreshold int32,
@ -332,7 +330,6 @@ func NewNodeLifecycleController(
zoneNoExecuteTainter: make(map[string]*scheduler.RateLimitedTimedQueue),
nodesToRetry: sync.Map{},
zoneStates: make(map[string]ZoneState),
podEvictionTimeout: podEvictionTimeout,
evictionLimiterQPS: evictionLimiterQPS,
secondaryEvictionLimiterQPS: secondaryEvictionLimiterQPS,
largeClusterThreshold: largeClusterThreshold,
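
The per-zone zoneNoExecuteTainter queues retained above are what pace evictions now that podEvictionTimeout is gone; evictionLimiterQPS and secondaryEvictionLimiterQPS feed their rate limiters. A standalone illustration (not from this commit) of what a QPS of 0.1 means in practice, using the client-go token-bucket primitive those queues wrap:

package main

import (
	"fmt"
	"time"

	"k8s.io/client-go/util/flowcontrol"
)

func main() {
	// 0.1 QPS with a burst of 1: the first Accept returns immediately,
	// subsequent calls block ~10s each, i.e. about one node per 10 seconds.
	limiter := flowcontrol.NewTokenBucketRateLimiter(0.1, 1)
	start := time.Now()
	for i := 0; i < 3; i++ {
		limiter.Accept()
		fmt.Printf("node %d admitted after %v\n", i, time.Since(start).Round(time.Second))
	}
}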

View File

@ -41,7 +41,6 @@ import (
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/fake"
testcore "k8s.io/client-go/testing"
"k8s.io/klog/v2"
"k8s.io/klog/v2/ktesting"
kubeletapis "k8s.io/kubelet/pkg/apis"
"k8s.io/kubernetes/pkg/controller"
@ -89,28 +88,6 @@ type nodeLifecycleController struct {
daemonSetInformer appsinformers.DaemonSetInformer
}
// doEviction does the fake eviction and returns the status of eviction operation.
func (nc *nodeLifecycleController) doEviction(logger klog.Logger, fakeNodeHandler *testutil.FakeNodeHandler) bool {
nc.evictorLock.Lock()
defer nc.evictorLock.Unlock()
zones := testutil.GetZones(fakeNodeHandler)
for _, zone := range zones {
nc.zoneNoExecuteTainter[zone].Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) {
uid, _ := value.UID.(string)
pods, _ := nc.getPodsAssignedToNode(value.Value)
controllerutil.DeletePods(context.TODO(), fakeNodeHandler, pods, nc.recorder, value.Value, uid, nc.daemonSetStore)
return true, 0
})
}
for _, action := range fakeNodeHandler.Actions() {
if action.GetVerb() == "delete" && action.GetResource().Resource == "pods" {
return true
}
}
return false
}
func createNodeLease(nodeName string, renewTime metav1.MicroTime) *coordv1.Lease {
return &coordv1.Lease{
ObjectMeta: metav1.ObjectMeta{
@ -148,7 +125,6 @@ func (nc *nodeLifecycleController) syncNodeStore(fakeNodeHandler *testutil.FakeN
func newNodeLifecycleControllerFromClient(
ctx context.Context,
kubeClient clientset.Interface,
podEvictionTimeout time.Duration,
evictionLimiterQPS float32,
secondaryEvictionLimiterQPS float32,
largeClusterThreshold int32,
@ -174,7 +150,6 @@ func newNodeLifecycleControllerFromClient(
nodeMonitorPeriod,
nodeStartupGracePeriod,
nodeMonitorGracePeriod,
podEvictionTimeout,
evictionLimiterQPS,
secondaryEvictionLimiterQPS,
largeClusterThreshold,
@ -194,7 +169,6 @@ func newNodeLifecycleControllerFromClient(
func TestMonitorNodeHealthEvictPods(t *testing.T) {
fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
evictionTimeout := 10 * time.Minute
labels := map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone1",
@ -362,7 +336,6 @@ func TestMonitorNodeHealthEvictPods(t *testing.T) {
Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
},
daemonSets: nil,
timeToPass: evictionTimeout,
newNodeStatus: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
@ -593,7 +566,7 @@ func TestMonitorNodeHealthEvictPods(t *testing.T) {
Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
},
daemonSets: nil,
timeToPass: evictionTimeout - testNodeMonitorGracePeriod,
timeToPass: testNodeMonitorGracePeriod,
newNodeStatus: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
@ -683,7 +656,6 @@ func TestMonitorNodeHealthEvictPods(t *testing.T) {
nodeController, _ := newNodeLifecycleControllerFromClient(
context.TODO(),
item.fakeNodeHandler,
evictionTimeout,
testRateLimiterQPS,
testRateLimiterQPS,
testLargeClusterThreshold,
@ -754,7 +726,6 @@ func TestMonitorNodeHealthEvictPods(t *testing.T) {
func TestPodStatusChange(t *testing.T) {
fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
evictionTimeout := 10 * time.Minute
// Because of the logic that prevents NC from evicting anything when all Nodes are NotReady
// we need second healthy node in tests. Because of how the tests are written we need to update
@ -853,7 +824,6 @@ func TestPodStatusChange(t *testing.T) {
nodeController, _ := newNodeLifecycleControllerFromClient(
context.TODO(),
item.fakeNodeHandler,
evictionTimeout,
testRateLimiterQPS,
testRateLimiterQPS,
testLargeClusterThreshold,
@ -913,574 +883,6 @@ func TestPodStatusChange(t *testing.T) {
}
}
func TestMonitorNodeHealthEvictPodsWithDisruption(t *testing.T) {
fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
evictionTimeout := 10 * time.Minute
timeToPass := 60 * time.Minute
// Because of the logic that prevents NC from evicting anything when all Nodes are NotReady
// we need second healthy node in tests. Because of how the tests are written we need to update
// the status of this Node.
healthyNodeNewStatus := v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionTrue,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 13, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
}
unhealthyNodeNewStatus := v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionUnknown,
// Node status was updated by nodecontroller 1hr ago
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
}
table := []struct {
nodeList []*v1.Node
podList []v1.Pod
updatedNodeStatuses []v1.NodeStatus
expectedInitialStates map[string]ZoneState
expectedFollowingStates map[string]ZoneState
expectedEvictPods bool
description string
}{
// NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes.
// Only zone is down - eviction shouldn't take place
{
nodeList: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone1",
v1.LabelFailureDomainBetaRegion: "region1",
v1.LabelFailureDomainBetaZone: "zone1",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionUnknown,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: "node1",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone1",
v1.LabelFailureDomainBetaRegion: "region1",
v1.LabelFailureDomainBetaZone: "zone1",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionUnknown,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
},
podList: []v1.Pod{*testutil.NewPod("pod0", "node0")},
updatedNodeStatuses: []v1.NodeStatus{
unhealthyNodeNewStatus,
unhealthyNodeNewStatus,
},
expectedInitialStates: map[string]ZoneState{testutil.CreateZoneID("region1", "zone1"): stateFullDisruption},
expectedFollowingStates: map[string]ZoneState{testutil.CreateZoneID("region1", "zone1"): stateFullDisruption},
expectedEvictPods: false,
description: "Network Disruption: Only zone is down - eviction shouldn't take place.",
},
// NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes.
// Both zones down - eviction shouldn't take place
{
nodeList: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone1",
v1.LabelFailureDomainBetaRegion: "region1",
v1.LabelFailureDomainBetaZone: "zone1",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionUnknown,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: "node1",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region2",
v1.LabelTopologyZone: "zone2",
v1.LabelFailureDomainBetaRegion: "region2",
v1.LabelFailureDomainBetaZone: "zone2",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionUnknown,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
},
podList: []v1.Pod{*testutil.NewPod("pod0", "node0")},
updatedNodeStatuses: []v1.NodeStatus{
unhealthyNodeNewStatus,
unhealthyNodeNewStatus,
},
expectedInitialStates: map[string]ZoneState{
testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
testutil.CreateZoneID("region2", "zone2"): stateFullDisruption,
},
expectedFollowingStates: map[string]ZoneState{
testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
testutil.CreateZoneID("region2", "zone2"): stateFullDisruption,
},
expectedEvictPods: false,
description: "Network Disruption: Both zones down - eviction shouldn't take place.",
},
// NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes.
// One zone is down - eviction should take place
{
nodeList: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone1",
v1.LabelFailureDomainBetaRegion: "region1",
v1.LabelFailureDomainBetaZone: "zone1",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionUnknown,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: "node1",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone2",
v1.LabelFailureDomainBetaRegion: "region1",
v1.LabelFailureDomainBetaZone: "zone2",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionTrue,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
},
podList: []v1.Pod{*testutil.NewPod("pod0", "node0")},
updatedNodeStatuses: []v1.NodeStatus{
unhealthyNodeNewStatus,
healthyNodeNewStatus,
},
expectedInitialStates: map[string]ZoneState{
testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
testutil.CreateZoneID("region1", "zone2"): stateNormal,
},
expectedFollowingStates: map[string]ZoneState{
testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
testutil.CreateZoneID("region1", "zone2"): stateNormal,
},
expectedEvictPods: true,
description: "Network Disruption: One zone is down - eviction should take place.",
},
// NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period
// of on first Node, eviction should stop even though Node with label
// node.kubernetes.io/exclude-disruption is healthy.
{
nodeList: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone1",
v1.LabelFailureDomainBetaRegion: "region1",
v1.LabelFailureDomainBetaZone: "zone1",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionUnknown,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: "node-master",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone1",
v1.LabelFailureDomainBetaRegion: "region1",
v1.LabelFailureDomainBetaZone: "zone1",
labelNodeDisruptionExclusion: "",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionTrue,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
},
podList: []v1.Pod{*testutil.NewPod("pod0", "node0")},
updatedNodeStatuses: []v1.NodeStatus{
unhealthyNodeNewStatus,
healthyNodeNewStatus,
},
expectedInitialStates: map[string]ZoneState{
testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
},
expectedFollowingStates: map[string]ZoneState{
testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
},
expectedEvictPods: false,
description: "NetworkDisruption: eviction should stop, only Node with label node.kubernetes.io/exclude-disruption is healthy",
},
// NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes.
// Initially both zones down, one comes back - eviction should take place
{
nodeList: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone1",
v1.LabelFailureDomainBetaRegion: "region1",
v1.LabelFailureDomainBetaZone: "zone1",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionUnknown,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: "node1",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone2",
v1.LabelFailureDomainBetaRegion: "region1",
v1.LabelFailureDomainBetaZone: "zone2",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionUnknown,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
},
podList: []v1.Pod{*testutil.NewPod("pod0", "node0")},
updatedNodeStatuses: []v1.NodeStatus{
unhealthyNodeNewStatus,
healthyNodeNewStatus,
},
expectedInitialStates: map[string]ZoneState{
testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
testutil.CreateZoneID("region1", "zone2"): stateFullDisruption,
},
expectedFollowingStates: map[string]ZoneState{
testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
testutil.CreateZoneID("region1", "zone2"): stateNormal,
},
expectedEvictPods: true,
description: "Initially both zones down, one comes back - eviction should take place",
},
// NetworkDisruption: Node created long time ago, node controller posted Unknown for a long period of time on both Nodes.
// Zone is partially disrupted - eviction should take place
{
nodeList: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone1",
v1.LabelFailureDomainBetaRegion: "region1",
v1.LabelFailureDomainBetaZone: "zone1",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionUnknown,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: "node1",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone1",
v1.LabelFailureDomainBetaRegion: "region1",
v1.LabelFailureDomainBetaZone: "zone1",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionUnknown,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: "node2",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone1",
v1.LabelFailureDomainBetaRegion: "region1",
v1.LabelFailureDomainBetaZone: "zone1",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionUnknown,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: "node3",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone1",
v1.LabelFailureDomainBetaRegion: "region1",
v1.LabelFailureDomainBetaZone: "zone1",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionTrue,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: "node4",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
Labels: map[string]string{
v1.LabelTopologyRegion: "region1",
v1.LabelTopologyZone: "zone1",
v1.LabelFailureDomainBetaRegion: "region1",
v1.LabelFailureDomainBetaZone: "zone1",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionTrue,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
},
podList: []v1.Pod{*testutil.NewPod("pod0", "node0")},
updatedNodeStatuses: []v1.NodeStatus{
unhealthyNodeNewStatus,
unhealthyNodeNewStatus,
unhealthyNodeNewStatus,
healthyNodeNewStatus,
healthyNodeNewStatus,
},
expectedInitialStates: map[string]ZoneState{
testutil.CreateZoneID("region1", "zone1"): statePartialDisruption,
},
expectedFollowingStates: map[string]ZoneState{
testutil.CreateZoneID("region1", "zone1"): statePartialDisruption,
},
expectedEvictPods: true,
description: "Zone is partially disrupted - eviction should take place.",
},
}
for _, item := range table {
fakeNodeHandler := &testutil.FakeNodeHandler{
Existing: item.nodeList,
Clientset: fake.NewSimpleClientset(&v1.PodList{Items: item.podList}),
}
nodeController, _ := newNodeLifecycleControllerFromClient(
context.TODO(),
fakeNodeHandler,
evictionTimeout,
testRateLimiterQPS,
testRateLimiterQPS,
testLargeClusterThreshold,
testUnhealthyThreshold,
testNodeMonitorGracePeriod,
testNodeStartupGracePeriod,
testNodeMonitorPeriod,
)
nodeController.now = func() metav1.Time { return fakeNow }
nodeController.recorder = testutil.NewFakeRecorder()
nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset)
nodeController.enterPartialDisruptionFunc = func(nodeNum int) float32 {
return testRateLimiterQPS
}
nodeController.enterFullDisruptionFunc = func(nodeNum int) float32 {
return testRateLimiterQPS
}
if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
t.Errorf("unexpected error: %v", err)
}
if err := nodeController.monitorNodeHealth(context.TODO()); err != nil {
t.Errorf("%v: unexpected error: %v", item.description, err)
}
for zone, state := range item.expectedInitialStates {
if state != nodeController.zoneStates[zone] {
t.Errorf("%v: Unexpected zone state: %v: %v instead %v", item.description, zone, nodeController.zoneStates[zone], state)
}
}
nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(timeToPass)} }
for i := range item.updatedNodeStatuses {
fakeNodeHandler.Existing[i].Status = item.updatedNodeStatuses[i]
}
if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil {
t.Errorf("unexpected error: %v", err)
}
if err := nodeController.monitorNodeHealth(context.TODO()); err != nil {
t.Errorf("%v: unexpected error: %v", item.description, err)
}
for zone, state := range item.expectedFollowingStates {
if state != nodeController.zoneStates[zone] {
t.Errorf("%v: Unexpected zone state: %v: %v instead %v", item.description, zone, nodeController.zoneStates[zone], state)
}
}
var podEvicted bool
start := time.Now()
// Infinite loop, used for retrying in case ratelimiter fails to reload for Try function.
// this breaks when we have the status that we need for test case or when we don't see the
// intended result after 1 minute.
logger, _ := ktesting.NewTestContext(t)
for {
podEvicted = nodeController.doEviction(logger, fakeNodeHandler)
if podEvicted == item.expectedEvictPods || time.Since(start) > 1*time.Minute {
break
}
}
if item.expectedEvictPods != podEvicted {
t.Errorf("%v: expected pod eviction: %+v, got %+v", item.description, item.expectedEvictPods, podEvicted)
}
}
}
func TestMonitorNodeHealthUpdateStatus(t *testing.T) {
fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
table := []struct {
@ -1704,7 +1106,6 @@ func TestMonitorNodeHealthUpdateStatus(t *testing.T) {
nodeController, _ := newNodeLifecycleControllerFromClient(
context.TODO(),
item.fakeNodeHandler,
5*time.Minute,
testRateLimiterQPS,
testRateLimiterQPS,
testLargeClusterThreshold,
@ -2248,7 +1649,6 @@ func TestMonitorNodeHealthUpdateNodeAndPodStatusWithLease(t *testing.T) {
nodeController, _ := newNodeLifecycleControllerFromClient(
context.TODO(),
item.fakeNodeHandler,
5*time.Minute,
testRateLimiterQPS,
testRateLimiterQPS,
testLargeClusterThreshold,
@ -2413,7 +1813,6 @@ func TestMonitorNodeHealthMarkPodsNotReady(t *testing.T) {
nodeController, _ := newNodeLifecycleControllerFromClient(
context.TODO(),
item.fakeNodeHandler,
5*time.Minute,
testRateLimiterQPS,
testRateLimiterQPS,
testLargeClusterThreshold,
@ -2515,7 +1914,6 @@ func TestMonitorNodeHealthMarkPodsNotReadyWithWorkerSize(t *testing.T) {
nodeController, _ := newNodeLifecycleControllerFromClient(
context.TODO(),
fakeNodeHandler,
5*time.Minute,
testRateLimiterQPS,
testRateLimiterQPS,
testLargeClusterThreshold,
@ -2718,7 +2116,6 @@ func TestMonitorNodeHealthMarkPodsNotReadyRetry(t *testing.T) {
nodeController, _ := newNodeLifecycleControllerFromClient(
context.TODO(),
item.fakeNodeHandler,
5*time.Minute,
testRateLimiterQPS,
testRateLimiterQPS,
testLargeClusterThreshold,
@ -2766,7 +2163,6 @@ func TestApplyNoExecuteTaints(t *testing.T) {
t.Skip("Skipping test on Windows.")
}
fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
evictionTimeout := 10 * time.Minute
fakeNodeHandler := &testutil.FakeNodeHandler{
Existing: []*v1.Node{
@ -2857,7 +2253,6 @@ func TestApplyNoExecuteTaints(t *testing.T) {
nodeController, _ := newNodeLifecycleControllerFromClient(
context.TODO(),
fakeNodeHandler,
evictionTimeout,
testRateLimiterQPS,
testRateLimiterQPS,
testLargeClusterThreshold,
@ -2922,7 +2317,6 @@ func TestApplyNoExecuteTaints(t *testing.T) {
// TestApplyNoExecuteTaintsToNodesEnqueueTwice ensures we taint every node with NoExecute even if enqueued twice
func TestApplyNoExecuteTaintsToNodesEnqueueTwice(t *testing.T) {
fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
evictionTimeout := 10 * time.Minute
fakeNodeHandler := &testutil.FakeNodeHandler{
Existing: []*v1.Node{
@ -3012,7 +2406,6 @@ func TestApplyNoExecuteTaintsToNodesEnqueueTwice(t *testing.T) {
nodeController, _ := newNodeLifecycleControllerFromClient(
context.TODO(),
fakeNodeHandler,
evictionTimeout,
testRateLimiterQPS,
testRateLimiterQPS,
testLargeClusterThreshold,
@ -3155,7 +2548,6 @@ func TestApplyNoExecuteTaintsToNodesEnqueueTwice(t *testing.T) {
func TestSwapUnreachableNotReadyTaints(t *testing.T) {
fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
evictionTimeout := 10 * time.Minute
fakeNodeHandler := &testutil.FakeNodeHandler{
Existing: []*v1.Node{
@ -3209,7 +2601,6 @@ func TestSwapUnreachableNotReadyTaints(t *testing.T) {
},
Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
}
timeToPass := evictionTimeout
newNodeStatus := v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
@ -3237,7 +2628,6 @@ func TestSwapUnreachableNotReadyTaints(t *testing.T) {
nodeController, _ := newNodeLifecycleControllerFromClient(
context.TODO(),
fakeNodeHandler,
evictionTimeout,
testRateLimiterQPS,
testRateLimiterQPS,
testLargeClusterThreshold,
@ -3272,7 +2662,7 @@ func TestSwapUnreachableNotReadyTaints(t *testing.T) {
t.Errorf("Can't find taint %v in %v", originalTaint, node0.Spec.Taints)
}
nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(timeToPass)} }
nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Time} }
node0.Status = newNodeStatus
node1.Status = healthyNodeNewStatus
@ -3309,7 +2699,6 @@ func TestSwapUnreachableNotReadyTaints(t *testing.T) {
func TestTaintsNodeByCondition(t *testing.T) {
fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
evictionTimeout := 10 * time.Minute
fakeNodeHandler := &testutil.FakeNodeHandler{
Existing: []*v1.Node{
@ -3342,7 +2731,6 @@ func TestTaintsNodeByCondition(t *testing.T) {
nodeController, _ := newNodeLifecycleControllerFromClient(
context.TODO(),
fakeNodeHandler,
evictionTimeout,
testRateLimiterQPS,
testRateLimiterQPS,
testLargeClusterThreshold,
@ -3545,7 +2933,6 @@ func TestNodeEventGeneration(t *testing.T) {
nodeController, _ := newNodeLifecycleControllerFromClient(
context.TODO(),
fakeNodeHandler,
5*time.Minute,
testRateLimiterQPS,
testRateLimiterQPS,
testLargeClusterThreshold,
@ -3586,7 +2973,6 @@ func TestNodeEventGeneration(t *testing.T) {
func TestReconcileNodeLabels(t *testing.T) {
fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
evictionTimeout := 10 * time.Minute
fakeNodeHandler := &testutil.FakeNodeHandler{
Existing: []*v1.Node{
@ -3619,7 +3005,6 @@ func TestReconcileNodeLabels(t *testing.T) {
nodeController, _ := newNodeLifecycleControllerFromClient(
context.TODO(),
fakeNodeHandler,
evictionTimeout,
testRateLimiterQPS,
testRateLimiterQPS,
testLargeClusterThreshold,
@ -3736,7 +3121,6 @@ func TestReconcileNodeLabels(t *testing.T) {
func TestTryUpdateNodeHealth(t *testing.T) {
fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC)
fakeOld := metav1.Date(2016, 1, 1, 12, 0, 0, 0, time.UTC)
evictionTimeout := 10 * time.Minute
fakeNodeHandler := &testutil.FakeNodeHandler{
Existing: []*v1.Node{
@ -3763,7 +3147,6 @@ func TestTryUpdateNodeHealth(t *testing.T) {
nodeController, _ := newNodeLifecycleControllerFromClient(
context.TODO(),
fakeNodeHandler,
evictionTimeout,
testRateLimiterQPS,
testRateLimiterQPS,
testLargeClusterThreshold,

View File

@ -126,7 +126,6 @@ func TestEvictionForNoExecuteTaintAddedByUser(t *testing.T) {
1*time.Second, // Node monitor grace period
time.Minute, // Node startup grace period
time.Millisecond, // Node monitor period
1, // Pod eviction timeout
100, // Eviction limiter QPS
100, // Secondary eviction limiter QPS
50, // Large cluster threshold
@ -278,7 +277,6 @@ func TestTaintBasedEvictions(t *testing.T) {
1*time.Second, // Node monitor grace period
time.Minute, // Node startup grace period
time.Millisecond, // Node monitor period
time.Second, // Pod eviction timeout
100, // Eviction limiter QPS
100, // Secondary eviction limiter QPS
50, // Large cluster threshold

View File

@ -95,7 +95,6 @@ func TestTaintNodeByCondition(t *testing.T) {
time.Hour, // Node monitor grace period
time.Second, // Node startup grace period
time.Second, // Node monitor period
time.Second, // Pod eviction timeout
100, // Eviction limiter QPS
100, // Secondary eviction limiter QPS
100, // Large cluster threshold