diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller.go b/pkg/controller/nodelifecycle/node_lifecycle_controller.go index b0ebcc6df90..f721145b773 100644 --- a/pkg/controller/nodelifecycle/node_lifecycle_controller.go +++ b/pkg/controller/nodelifecycle/node_lifecycle_controller.go @@ -595,13 +595,14 @@ func (nc *Controller) doNoExecuteTaintingPass() { // Because we want to mimic NodeStatus.Condition["Ready"] we make "unreachable" and "not ready" taints mutually exclusive. taintToAdd := v1.Taint{} oppositeTaint := v1.Taint{} - if condition.Status == v1.ConditionFalse { + switch condition.Status { + case v1.ConditionFalse: taintToAdd = *NotReadyTaintTemplate oppositeTaint = *UnreachableTaintTemplate - } else if condition.Status == v1.ConditionUnknown { + case v1.ConditionUnknown: taintToAdd = *UnreachableTaintTemplate oppositeTaint = *NotReadyTaintTemplate - } else { + default: // It seems that the Node is ready again, so there's no need to taint it. klog.V(4).Infof("Node %v was in a taint queue, but it's ready now. Ignoring taint request.", value.Value) return true, 0 @@ -720,7 +721,8 @@ func (nc *Controller) monitorNodeHealth() error { decisionTimestamp := nc.now() if currentReadyCondition != nil { // Check eviction timeout against decisionTimestamp - if observedReadyCondition.Status == v1.ConditionFalse { + switch observedReadyCondition.Status { + case v1.ConditionFalse: if nc.useTaintBasedEvictions { // We want to update the taint straight away if Node is already tainted with the UnreachableTaint if taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) { @@ -746,8 +748,7 @@ func (nc *Controller) monitorNodeHealth() error { } } } - } - if observedReadyCondition.Status == v1.ConditionUnknown { + case v1.ConditionUnknown: if nc.useTaintBasedEvictions { // We want to update the taint straight away if Node is already tainted with the UnreachableTaint if taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) { @@ -773,8 +774,7 @@ func (nc *Controller) monitorNodeHealth() error { } } } - } - if observedReadyCondition.Status == v1.ConditionTrue { + case v1.ConditionTrue: if nc.useTaintBasedEvictions { removed, err := nc.markNodeAsReachable(node) if err != nil { @@ -807,7 +807,6 @@ func (nc *Controller) monitorNodeHealth() error { // tryUpdateNodeHealth checks a given node's conditions and tries to update it. Returns grace period to // which given node is entitled, state of current and last observed Ready Condition, and an error if it occurred. func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.NodeCondition, *v1.NodeCondition, error) { - var err error var gracePeriod time.Duration var observedReadyCondition v1.NodeCondition _, currentReadyCondition := nodeutil.GetNodeCondition(&node.Status, v1.NodeReady) @@ -836,7 +835,6 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node observedReadyCondition = *currentReadyCondition gracePeriod = nc.nodeMonitorGracePeriod } - savedNodeHealth, found := nc.nodeHealthMap[node.Name] // There are following cases to check: // - both saved and new status have no Ready Condition set - we leave everything as it is, @@ -858,7 +856,7 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node _, savedCondition = nodeutil.GetNodeCondition(savedNodeHealth.status, v1.NodeReady) savedLease = savedNodeHealth.lease } - _, observedCondition := nodeutil.GetNodeCondition(&node.Status, v1.NodeReady) + if !found { klog.Warningf("Missing timestamp for Node %s. Assuming now as a timestamp.", node.Name) savedNodeHealth = &nodeHealthData{ @@ -866,14 +864,14 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node probeTimestamp: nc.now(), readyTransitionTimestamp: nc.now(), } - } else if savedCondition == nil && observedCondition != nil { + } else if savedCondition == nil && currentReadyCondition != nil { klog.V(1).Infof("Creating timestamp entry for newly observed Node %s", node.Name) savedNodeHealth = &nodeHealthData{ status: &node.Status, probeTimestamp: nc.now(), readyTransitionTimestamp: nc.now(), } - } else if savedCondition != nil && observedCondition == nil { + } else if savedCondition != nil && currentReadyCondition == nil { klog.Errorf("ReadyCondition was removed from Status of Node %s", node.Name) // TODO: figure out what to do in this case. For now we do the same thing as above. savedNodeHealth = &nodeHealthData{ @@ -881,12 +879,12 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node probeTimestamp: nc.now(), readyTransitionTimestamp: nc.now(), } - } else if savedCondition != nil && observedCondition != nil && savedCondition.LastHeartbeatTime != observedCondition.LastHeartbeatTime { + } else if savedCondition != nil && currentReadyCondition != nil && savedCondition.LastHeartbeatTime != currentReadyCondition.LastHeartbeatTime { var transitionTime metav1.Time // If ReadyCondition changed since the last time we checked, we update the transition timestamp to "now", // otherwise we leave it as it is. - if savedCondition.LastTransitionTime != observedCondition.LastTransitionTime { - klog.V(3).Infof("ReadyCondition for Node %s transitioned from %v to %v", node.Name, savedCondition, observedCondition) + if savedCondition.LastTransitionTime != currentReadyCondition.LastTransitionTime { + klog.V(3).Infof("ReadyCondition for Node %s transitioned from %v to %v", node.Name, savedCondition, currentReadyCondition) transitionTime = nc.now() } else { transitionTime = savedNodeHealth.readyTransitionTimestamp @@ -919,31 +917,9 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node if nc.now().After(savedNodeHealth.probeTimestamp.Add(gracePeriod)) { // NodeReady condition or lease was last set longer ago than gracePeriod, so // update it to Unknown (regardless of its current value) in the master. - if currentReadyCondition == nil { - klog.V(2).Infof("node %v is never updated by kubelet", node.Name) - node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{ - Type: v1.NodeReady, - Status: v1.ConditionUnknown, - Reason: "NodeStatusNeverUpdated", - Message: fmt.Sprintf("Kubelet never posted node status."), - LastHeartbeatTime: node.CreationTimestamp, - LastTransitionTime: nc.now(), - }) - } else { - klog.V(4).Infof("node %v hasn't been updated for %+v. Last ready condition is: %+v", - node.Name, nc.now().Time.Sub(savedNodeHealth.probeTimestamp.Time), observedReadyCondition) - if observedReadyCondition.Status != v1.ConditionUnknown { - currentReadyCondition.Status = v1.ConditionUnknown - currentReadyCondition.Reason = "NodeStatusUnknown" - currentReadyCondition.Message = "Kubelet stopped posting node status." - // LastProbeTime is the last time we heard from kubelet. - currentReadyCondition.LastHeartbeatTime = observedReadyCondition.LastHeartbeatTime - currentReadyCondition.LastTransitionTime = nc.now() - } - } - // remaining node conditions should also be set to Unknown - remainingNodeConditionTypes := []v1.NodeConditionType{ + nodeConditionTypes := []v1.NodeConditionType{ + v1.NodeReady, v1.NodeMemoryPressure, v1.NodeDiskPressure, v1.NodePIDPressure, @@ -952,7 +928,7 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node } nowTimestamp := nc.now() - for _, nodeConditionType := range remainingNodeConditionTypes { + for _, nodeConditionType := range nodeConditionTypes { _, currentCondition := nodeutil.GetNodeCondition(&node.Status, nodeConditionType) if currentCondition == nil { klog.V(2).Infof("Condition %v of node %v was never updated by kubelet", nodeConditionType, node.Name) @@ -975,10 +951,11 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node } } } + // We need to update currentReadyCondition due to its value potentially changed. + _, currentReadyCondition = nodeutil.GetNodeCondition(&node.Status, v1.NodeReady) - _, currentCondition := nodeutil.GetNodeCondition(&node.Status, v1.NodeReady) - if !apiequality.Semantic.DeepEqual(currentCondition, &observedReadyCondition) { - if _, err = nc.kubeClient.CoreV1().Nodes().UpdateStatus(node); err != nil { + if !apiequality.Semantic.DeepEqual(currentReadyCondition, &observedReadyCondition) { + if _, err := nc.kubeClient.CoreV1().Nodes().UpdateStatus(node); err != nil { klog.Errorf("Error updating node %s: %v", node.Name, err) return gracePeriod, observedReadyCondition, currentReadyCondition, err } @@ -992,7 +969,7 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node } } - return gracePeriod, observedReadyCondition, currentReadyCondition, err + return gracePeriod, observedReadyCondition, currentReadyCondition, nil } func (nc *Controller) handleDisruption(zoneToNodeConditions map[string][]*v1.NodeCondition, nodes []*v1.Node) { diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go index c88245fd23e..e5d417ad464 100644 --- a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go +++ b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go @@ -3057,3 +3057,216 @@ func TestReconcileNodeLabels(t *testing.T) { } } } + +func TestTryUpdateNodeHealth(t *testing.T) { + fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) + fakeOld := metav1.Date(2016, 1, 1, 12, 0, 0, 0, time.UTC) + evictionTimeout := 10 * time.Minute + + fakeNodeHandler := &testutil.FakeNodeHandler{ + Existing: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeNow, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: fakeNow, + LastTransitionTime: fakeNow, + }, + }, + }, + }, + }, + Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), + } + + nodeController, _ := newNodeLifecycleControllerFromClient( + fakeNodeHandler, + evictionTimeout, + testRateLimiterQPS, + testRateLimiterQPS, + testLargeClusterThreshold, + testUnhealthyThreshold, + testNodeMonitorGracePeriod, + testNodeStartupGracePeriod, + testNodeMonitorPeriod, + true) + nodeController.now = func() metav1.Time { return fakeNow } + nodeController.recorder = testutil.NewFakeRecorder() + + getStatus := func(cond *v1.NodeCondition) *v1.ConditionStatus { + if cond == nil { + return nil + } + return &cond.Status + } + + tests := []struct { + name string + node *v1.Node + }{ + { + name: "Status true", + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeNow, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: fakeNow, + LastTransitionTime: fakeNow, + }, + }, + }, + }, + }, + { + name: "Status false", + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeNow, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionFalse, + LastHeartbeatTime: fakeNow, + LastTransitionTime: fakeNow, + }, + }, + }, + }, + }, + { + name: "Status unknown", + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeNow, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: fakeNow, + LastTransitionTime: fakeNow, + }, + }, + }, + }, + }, + { + name: "Status nil", + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeNow, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{}, + }, + }, + }, + { + name: "Status true - after grace period", + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeOld, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: fakeOld, + LastTransitionTime: fakeOld, + }, + }, + }, + }, + }, + { + name: "Status false - after grace period", + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeOld, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionFalse, + LastHeartbeatTime: fakeOld, + LastTransitionTime: fakeOld, + }, + }, + }, + }, + }, + { + name: "Status unknown - after grace period", + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeOld, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: fakeOld, + LastTransitionTime: fakeOld, + }, + }, + }, + }, + }, + { + name: "Status nil - after grace period", + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: fakeOld, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{}, + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + nodeController.nodeHealthMap[test.node.Name] = &nodeHealthData{ + status: &test.node.Status, + probeTimestamp: test.node.CreationTimestamp, + readyTransitionTimestamp: test.node.CreationTimestamp, + } + _, _, currentReadyCondition, err := nodeController.tryUpdateNodeHealth(test.node) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + _, savedReadyCondition := nodeutil.GetNodeCondition(nodeController.nodeHealthMap[test.node.Name].status, v1.NodeReady) + savedStatus := getStatus(savedReadyCondition) + currentStatus := getStatus(currentReadyCondition) + if currentStatus != savedStatus { + t.Errorf("expected %v, got %v", savedStatus, currentStatus) + } + }) + } +}