diff --git a/pkg/cloudprovider/providers/openstack/openstack_instances.go b/pkg/cloudprovider/providers/openstack/openstack_instances.go
index 4cd0571ed86..a6ef7f434c1 100644
--- a/pkg/cloudprovider/providers/openstack/openstack_instances.go
+++ b/pkg/cloudprovider/providers/openstack/openstack_instances.go
@@ -160,7 +160,8 @@ func (os *OpenStack) InstanceID() (string, error) {
 // InstanceID returns the cloud provider ID of the specified instance.
 func (i *Instances) InstanceID(ctx context.Context, name types.NodeName) (string, error) {
-	srv, err := getServerByName(i.compute, name, true)
+	// we should fetch the instance ID from servers in all states, not only ACTIVE
+	srv, err := getServerByName(i.compute, name, false)
 	if err != nil {
 		if err == ErrNotFound {
 			return "", cloudprovider.InstanceNotFound
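Note (for context, not part of the patch): the controller changes below rely on the cloud provider's InstanceShutdownByProviderID. A minimal sketch of the semantics a provider has to supply, assuming a hypothetical statusGetter client interface; the SHUTOFF comparison reflects how OpenStack Nova reports powered-off servers, but everything here is illustrative rather than the provider's actual implementation.

package shutdownsketch

import "context"

// statusGetter stands in for a compute client; it is an assumption of this
// sketch, not an interface from the patch.
type statusGetter interface {
	ServerStatus(ctx context.Context, providerID string) (string, error)
}

// instanceShutdownByProviderID shows the contract: return true only for a
// powered-off instance, and an error for an unknown one.
func instanceShutdownByProviderID(ctx context.Context, c statusGetter, providerID string) (bool, error) {
	status, err := c.ServerStatus(ctx, providerID)
	if err != nil {
		return false, err
	}
	// Nova reports powered-off servers as SHUTOFF.
	return status == "SHUTOFF", nil
}
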
diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller.go b/pkg/controller/nodelifecycle/node_lifecycle_controller.go
index 953d9246c33..6de3fd14a65 100644
--- a/pkg/controller/nodelifecycle/node_lifecycle_controller.go
+++ b/pkg/controller/nodelifecycle/node_lifecycle_controller.go
@@ -79,6 +79,11 @@ var (
 		Effect: v1.TaintEffectNoExecute,
 	}
 
+	shutDownTaint = &v1.Taint{
+		Key:    algorithm.TaintNodeShutdown,
+		Effect: v1.TaintEffectNoSchedule,
+	}
+
 	nodeConditionToTaintKeyMap = map[v1.NodeConditionType]string{
 		v1.NodeMemoryPressure: algorithm.TaintNodeMemoryPressure,
 		v1.NodeOutOfDisk:      algorithm.TaintNodeOutOfDisk,
@@ -151,9 +156,10 @@ type Controller struct {
 	daemonSetStore          extensionslisters.DaemonSetLister
 	daemonSetInformerSynced cache.InformerSynced
 
-	nodeLister                corelisters.NodeLister
-	nodeInformerSynced        cache.InformerSynced
-	nodeExistsInCloudProvider func(types.NodeName) (bool, error)
+	nodeLister                  corelisters.NodeLister
+	nodeInformerSynced          cache.InformerSynced
+	nodeExistsInCloudProvider   func(types.NodeName) (bool, error)
+	nodeShutdownInCloudProvider func(types.NodeName) (bool, error)
 
 	recorder record.EventRecorder
 
@@ -239,6 +245,9 @@ func NewNodeLifecycleController(podInformer coreinformers.PodInformer,
 		nodeExistsInCloudProvider: func(nodeName types.NodeName) (bool, error) {
 			return nodeutil.ExistsInCloudProvider(cloud, nodeName)
 		},
+		nodeShutdownInCloudProvider: func(nodeName types.NodeName) (bool, error) {
+			return nodeutil.ShutdownInCloudProvider(cloud, nodeName)
+		},
 		recorder:               recorder,
 		nodeMonitorPeriod:      nodeMonitorPeriod,
 		nodeStartupGracePeriod: nodeStartupGracePeriod,
@@ -653,6 +662,11 @@ func (nc *Controller) monitorNodeStatus() error {
 					glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
 				}
 			}
+			// remove the shutdown taint; this is needed whenever the node is ready again, whether or not taint-based evictions are in use
+			err := nc.markNodeAsNotShutdown(node)
+			if err != nil {
+				glog.Errorf("Failed to remove taints from node %v. Will retry in next iteration.", node.Name)
+			}
 		}
 
 		// Report node event.
@@ -666,7 +680,21 @@
 		// Check with the cloud provider to see if the node still exists. If it
 		// doesn't, delete the node immediately.
 		if currentReadyCondition.Status != v1.ConditionTrue && nc.cloud != nil {
-			exists, err := nc.nodeExistsInCloudProvider(types.NodeName(node.Name))
+			// check whether the node is shut down; if so, do not delete it, add a taint instead
+			shutdown, err := nc.nodeShutdownInCloudProvider(types.NodeName(node.Name))
+			if err != nil && err != cloudprovider.NotImplemented {
+				glog.Errorf("Error determining if node %v is shut down in cloud: %v", node.Name, err)
+				continue
+			}
+			// node is shut down
+			if shutdown {
+				err = controller.AddOrUpdateTaintOnNode(nc.kubeClient, node.Name, shutDownTaint)
+				if err != nil {
+					glog.Errorf("Error patching node taints: %v", err)
+				}
+				continue
+			}
+			exists, err := nc.nodeExistsInCloudProvider(types.NodeName(node.Name))
 			if err != nil {
 				glog.Errorf("Error determining if node %v exists in cloud: %v", node.Name, err)
 				continue
@@ -1102,6 +1130,17 @@ func (nc *Controller) markNodeAsReachable(node *v1.Node) (bool, error) {
 	return nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].Remove(node.Name), nil
 }
 
+func (nc *Controller) markNodeAsNotShutdown(node *v1.Node) error {
+	nc.evictorLock.Lock()
+	defer nc.evictorLock.Unlock()
+	err := controller.RemoveTaintOffNode(nc.kubeClient, node.Name, node, shutDownTaint)
+	if err != nil {
+		glog.Errorf("Failed to remove taint from node %v: %v", node.Name, err)
+		return err
+	}
+	return nil
+}
+
 // ComputeZoneState returns a slice of NodeReadyConditions for all Nodes in a given zone.
 // The zone is considered:
 // - fullyDisrupted if there're no Ready Nodes,
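Note (for context, not part of the patch): the not-ready branch in monitorNodeStatus above encodes three outcomes. Restated as a standalone pure function for readability; all names here are illustrative, not from the patch.

package shutdownsketch

// action mirrors the three outcomes of the not-ready branch in monitorNodeStatus.
type action int

const (
	keepNode      action = iota // cloud still reports the instance; leave the node alone
	taintShutdown               // cloud reports the instance shut down; taint it, do not delete
	deleteNode                  // cloud no longer knows the instance; delete the node
)

// decide restates the control flow: the shutdown check runs first, so a
// shut-down node is tainted and never reaches the delete path.
func decide(shutdown, exists bool) action {
	if shutdown {
		return taintShutdown
	}
	if !exists {
		return deleteNode
	}
	return keepNode
}
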
diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go
index 365f31dc998..6cba4870f5c 100644
--- a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go
+++ b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go
@@ -1360,6 +1360,118 @@ func TestMonitorNodeStatusEvictPodsWithDisruption(t *testing.T) {
 	}
 }
 
+func TestCloudProviderNodeShutdown(t *testing.T) {
+
+	testCases := []struct {
+		testName string
+		node     *v1.Node
+		shutdown bool
+	}{
+		{
+			testName: "node shut down, add taint",
+			shutdown: true,
+			node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
+				},
+				Spec: v1.NodeSpec{
+					ProviderID: "node0",
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionUnknown,
+							LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+							LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+						},
+					},
+				},
+			},
+		},
+		{
+			testName: "node started after shutdown, remove taint",
+			shutdown: false,
+			node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
+				},
+				Spec: v1.NodeSpec{
+					ProviderID: "node0",
+					Taints: []v1.Taint{
+						{
+							Key:    algorithm.TaintNodeShutdown,
+							Effect: v1.TaintEffectNoSchedule,
+						},
+					},
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionTrue,
+							LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+							LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+						},
+					},
+				},
+			},
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.testName, func(t *testing.T) {
+			fnh := &testutil.FakeNodeHandler{
+				Existing:  []*v1.Node{tc.node},
+				Clientset: fake.NewSimpleClientset(),
+			}
+			nodeController, _ := newNodeLifecycleControllerFromClient(
+				nil,
+				fnh,
+				10*time.Minute,
+				testRateLimiterQPS,
+				testRateLimiterQPS,
+				testLargeClusterThreshold,
+				testUnhealthyThreshold,
+				testNodeMonitorGracePeriod,
+				testNodeStartupGracePeriod,
+				testNodeMonitorPeriod,
+				false)
+			nodeController.cloud = &fakecloud.FakeCloud{}
+			nodeController.now = func() metav1.Time { return metav1.Date(2016, 1, 1, 12, 0, 0, 0, time.UTC) }
+			nodeController.recorder = testutil.NewFakeRecorder()
+			nodeController.nodeShutdownInCloudProvider = func(nodeName types.NodeName) (bool, error) {
+				return tc.shutdown, nil
+			}
+
+			if err := nodeController.syncNodeStore(fnh); err != nil {
+				t.Errorf("unexpected error: %v", err)
+			}
+			if err := nodeController.monitorNodeStatus(); err != nil {
+				t.Errorf("unexpected error: %v", err)
+			}
+
+			if len(fnh.UpdatedNodes) != 1 {
+				t.Errorf("Node was not updated")
+			}
+			if tc.shutdown {
+				if len(fnh.UpdatedNodes[0].Spec.Taints) != 1 {
+					t.Errorf("Node Taint was not added")
+				}
+				if fnh.UpdatedNodes[0].Spec.Taints[0].Key != "node.cloudprovider.kubernetes.io/shutdown" {
+					t.Errorf("Node Taint key is not correct")
+				}
+			} else {
+				if len(fnh.UpdatedNodes[0].Spec.Taints) != 0 {
+					t.Errorf("Node Taint was not removed after node is back in ready state")
+				}
+			}
+		})
+	}
+
+}
+
 // TestCloudProviderNoRateLimit tests that monitorNodes() immediately deletes
 // pods and the node when kubelet has not reported, and the cloudprovider says
 // the node is gone.
@@ -1404,6 +1516,9 @@ func TestCloudProviderNoRateLimit(t *testing.T) {
 	nodeController.nodeExistsInCloudProvider = func(nodeName types.NodeName) (bool, error) {
 		return false, nil
 	}
+	nodeController.nodeShutdownInCloudProvider = func(nodeName types.NodeName) (bool, error) {
+		return false, nil
+	}
 	// monitorNodeStatus should allow this node to be immediately deleted
 	if err := nodeController.syncNodeStore(fnh); err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
@@ -2242,6 +2357,9 @@ func TestNodeEventGeneration(t *testing.T) {
 	nodeController.nodeExistsInCloudProvider = func(nodeName types.NodeName) (bool, error) {
 		return false, nil
 	}
+	nodeController.nodeShutdownInCloudProvider = func(nodeName types.NodeName) (bool, error) {
+		return false, nil
+	}
 	nodeController.now = func() metav1.Time { return fakeNow }
 	fakeRecorder := testutil.NewFakeRecorder()
 	nodeController.recorder = fakeRecorder
diff --git a/pkg/controller/util/node/controller_utils.go b/pkg/controller/util/node/controller_utils.go
index b2c0ec23cf5..a286cdc4de3 100644
--- a/pkg/controller/util/node/controller_utils.go
+++ b/pkg/controller/util/node/controller_utils.go
@@ -187,6 +187,21 @@ func ExistsInCloudProvider(cloud cloudprovider.Interface, nodeName types.NodeNam
 	return true, nil
 }
 
+// ShutdownInCloudProvider returns true if the node is shut down in
+// the cloud provider.
+func ShutdownInCloudProvider(cloud cloudprovider.Interface, nodeName types.NodeName) (bool, error) {
+	instances, ok := cloud.Instances()
+	if !ok {
+		return false, fmt.Errorf("%v", ErrCloudInstance)
+	}
+	providerID, err := cloudprovider.GetInstanceProviderID(context.TODO(), cloud, nodeName)
+	if err != nil {
+		return false, err
+	}
+	shutdown, err := instances.InstanceShutdownByProviderID(context.TODO(), providerID)
+	return shutdown, err
+}
+
 // RecordNodeEvent records a event related to a node.
 func RecordNodeEvent(recorder record.EventRecorder, nodeName, nodeUID, eventtype, reason, event string) {
 	ref := &v1.ObjectReference{
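Note (for context, not part of the patch): providers that have not implemented InstanceShutdownByProviderID surface cloudprovider.NotImplemented, and the controller above deliberately tolerates that. A hedged sketch of the calling convention; the wrapper name is invented, and the import paths are inferred from the file paths in this diff.

package shutdownsketch

import (
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/kubernetes/pkg/cloudprovider"
	nodeutil "k8s.io/kubernetes/pkg/controller/util/node"
)

// shutdownOrUnknown wraps the new helper with the convention the controller
// uses: cloudprovider.NotImplemented means "cannot tell", not failure, so the
// caller falls through to the existence check instead of bailing out.
func shutdownOrUnknown(cloud cloudprovider.Interface, name types.NodeName) (bool, error) {
	shutdown, err := nodeutil.ShutdownInCloudProvider(cloud, name)
	if err == cloudprovider.NotImplemented {
		return false, nil
	}
	return shutdown, err
}
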
diff --git a/pkg/scheduler/algorithm/well_known_labels.go b/pkg/scheduler/algorithm/well_known_labels.go
index e916481ce30..3e89296d230 100644
--- a/pkg/scheduler/algorithm/well_known_labels.go
+++ b/pkg/scheduler/algorithm/well_known_labels.go
@@ -62,8 +62,6 @@ const (
 	// the taint
 	TaintExternalCloudProvider = "node.cloudprovider.kubernetes.io/uninitialized"
 
-	// When node is shutdown in external cloud provider
-	// shutdown flag is needed for immediately volume detach
-	// after node comes back, the taint is removed
+	// TaintNodeShutdown is applied when a node is shut down in the external cloud provider
 	TaintNodeShutdown = "node.cloudprovider.kubernetes.io/shutdown"
 )
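Note (for context, not part of the patch): because the taint's effect is NoSchedule, a pod can still opt in to scheduling onto a shut-down node by tolerating it. A minimal sketch of the toleration's shape; whether any workload should carry it is a policy decision, and the function name is invented.

package shutdownsketch

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/kubernetes/pkg/scheduler/algorithm"
)

// shutdownToleration builds a toleration for the new NoSchedule shutdown taint.
func shutdownToleration() v1.Toleration {
	return v1.Toleration{
		Key:      algorithm.TaintNodeShutdown, // "node.cloudprovider.kubernetes.io/shutdown"
		Operator: v1.TolerationOpExists,
		Effect:   v1.TaintEffectNoSchedule,
	}
}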