From 6665fa71441fb136303aa5adcb4b2f2106332604 Mon Sep 17 00:00:00 2001 From: Jesse Haka Date: Fri, 9 Feb 2018 23:05:05 +0200 Subject: [PATCH] taint also node controller fix function fix gofmt fix function return value fix tests skip notimplemented error remove factory unused in openstack we should try to find instanceid from all states instead of ACTIVE, all other cloudproviders do this already fix tests and lint fix gofmt fix nodelifecycletest fix lint errors --- .../openstack/openstack_instances.go | 3 +- .../node_lifecycle_controller.go | 47 ++++++- .../node_lifecycle_controller_test.go | 118 ++++++++++++++++++ pkg/controller/util/node/controller_utils.go | 15 +++ pkg/scheduler/algorithm/well_known_labels.go | 4 +- 5 files changed, 179 insertions(+), 8 deletions(-) diff --git a/pkg/cloudprovider/providers/openstack/openstack_instances.go b/pkg/cloudprovider/providers/openstack/openstack_instances.go index 4cd0571ed86..a6ef7f434c1 100644 --- a/pkg/cloudprovider/providers/openstack/openstack_instances.go +++ b/pkg/cloudprovider/providers/openstack/openstack_instances.go @@ -160,7 +160,8 @@ func (os *OpenStack) InstanceID() (string, error) { // InstanceID returns the cloud provider ID of the specified instance. 
func (i *Instances) InstanceID(ctx context.Context, name types.NodeName) (string, error) { - srv, err := getServerByName(i.compute, name, true) + // we should fetch instanceid from all states instead of ACTIVE + srv, err := getServerByName(i.compute, name, false) if err != nil { if err == ErrNotFound { return "", cloudprovider.InstanceNotFound diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller.go b/pkg/controller/nodelifecycle/node_lifecycle_controller.go index 953d9246c33..6de3fd14a65 100644 --- a/pkg/controller/nodelifecycle/node_lifecycle_controller.go +++ b/pkg/controller/nodelifecycle/node_lifecycle_controller.go @@ -79,6 +79,11 @@ var ( Effect: v1.TaintEffectNoExecute, } + shutDownTaint = &v1.Taint{ + Key: algorithm.TaintNodeShutdown, + Effect: v1.TaintEffectNoSchedule, + } + nodeConditionToTaintKeyMap = map[v1.NodeConditionType]string{ v1.NodeMemoryPressure: algorithm.TaintNodeMemoryPressure, v1.NodeOutOfDisk: algorithm.TaintNodeOutOfDisk, @@ -151,9 +156,10 @@ type Controller struct { daemonSetStore extensionslisters.DaemonSetLister daemonSetInformerSynced cache.InformerSynced - nodeLister corelisters.NodeLister - nodeInformerSynced cache.InformerSynced - nodeExistsInCloudProvider func(types.NodeName) (bool, error) + nodeLister corelisters.NodeLister + nodeInformerSynced cache.InformerSynced + nodeExistsInCloudProvider func(types.NodeName) (bool, error) + nodeShutdownInCloudProvider func(types.NodeName) (bool, error) recorder record.EventRecorder @@ -239,6 +245,9 @@ func NewNodeLifecycleController(podInformer coreinformers.PodInformer, nodeExistsInCloudProvider: func(nodeName types.NodeName) (bool, error) { return nodeutil.ExistsInCloudProvider(cloud, nodeName) }, + nodeShutdownInCloudProvider: func(nodeName types.NodeName) (bool, error) { + return nodeutil.ShutdownInCloudProvider(cloud, nodeName) + }, recorder: recorder, nodeMonitorPeriod: nodeMonitorPeriod, nodeStartupGracePeriod: nodeStartupGracePeriod, @@ -653,6 +662,11 @@ func (nc 
*Controller) monitorNodeStatus() error { glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name) } } + // always clear the shutdown taint here; it must be removed whether or not taint-based eviction is in use + err := nc.markNodeAsNotShutdown(node) + if err != nil { + glog.Errorf("Failed to remove taints from node %v. Will retry in next iteration.", node.Name) + } } // Report node event. @@ -666,7 +680,21 @@ func (nc *Controller) monitorNodeStatus() error { // Check with the cloud provider to see if the node still exists. If it // doesn't, delete the node immediately. if currentReadyCondition.Status != v1.ConditionTrue && nc.cloud != nil { - exists, err := nc.nodeExistsInCloudProvider(types.NodeName(node.Name)) + // check whether the node is shut down; if so, do not delete it but add a taint instead + exists, err := nc.nodeShutdownInCloudProvider(types.NodeName(node.Name)) + if err != nil && err != cloudprovider.NotImplemented { + glog.Errorf("Error determining if node %v shutdown in cloud: %v", node.Name, err) + continue + } + // node is shut down + if exists { + err = controller.AddOrUpdateTaintOnNode(nc.kubeClient, node.Name, shutDownTaint) + if err != nil { + glog.Errorf("Error patching node taints: %v", err) + } + continue + } + exists, err = nc.nodeExistsInCloudProvider(types.NodeName(node.Name)) if err != nil { glog.Errorf("Error determining if node %v exists in cloud: %v", node.Name, err) continue @@ -1102,6 +1130,17 @@ func (nc *Controller) markNodeAsReachable(node *v1.Node) (bool, error) { return nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].Remove(node.Name), nil } +func (nc *Controller) markNodeAsNotShutdown(node *v1.Node) error { + nc.evictorLock.Lock() + defer nc.evictorLock.Unlock() + err := controller.RemoveTaintOffNode(nc.kubeClient, node.Name, node, shutDownTaint) + if err != nil { + glog.Errorf("Failed to remove taint from node %v: %v", node.Name, err) + return err + } + return nil +} + // ComputeZoneState returns a slice of NodeReadyConditions for all Nodes
in a given zone. // The zone is considered: // - fullyDisrupted if there're no Ready Nodes, diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go index 365f31dc998..6cba4870f5c 100644 --- a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go +++ b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go @@ -1360,6 +1360,118 @@ func TestMonitorNodeStatusEvictPodsWithDisruption(t *testing.T) { } } +func TestCloudProviderNodeShutdown(t *testing.T) { + + testCases := []struct { + testName string + node *v1.Node + shutdown bool + }{ + { + testName: "node shutdowned add taint", + shutdown: true, + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + }, + Spec: v1.NodeSpec{ + ProviderID: "node0", + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + }, + { + testName: "node started after shutdown remove taint", + shutdown: false, + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + }, + Spec: v1.NodeSpec{ + ProviderID: "node0", + Taints: []v1.Taint{ + { + Key: algorithm.TaintNodeShutdown, + Effect: v1.TaintEffectNoSchedule, + }, + }, + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionTrue, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + }, + }, + } + for _, tc := range testCases { + t.Run(tc.testName, func(t *testing.T) { + fnh := &testutil.FakeNodeHandler{ + Existing: []*v1.Node{tc.node}, + Clientset: 
fake.NewSimpleClientset(), + } + nodeController, _ := newNodeLifecycleControllerFromClient( + nil, + fnh, + 10*time.Minute, + testRateLimiterQPS, + testRateLimiterQPS, + testLargeClusterThreshold, + testUnhealthyThreshold, + testNodeMonitorGracePeriod, + testNodeStartupGracePeriod, + testNodeMonitorPeriod, + false) + nodeController.cloud = &fakecloud.FakeCloud{} + nodeController.now = func() metav1.Time { return metav1.Date(2016, 1, 1, 12, 0, 0, 0, time.UTC) } + nodeController.recorder = testutil.NewFakeRecorder() + nodeController.nodeShutdownInCloudProvider = func(nodeName types.NodeName) (bool, error) { + return tc.shutdown, nil + } + + if err := nodeController.syncNodeStore(fnh); err != nil { + t.Errorf("unexpected error: %v", err) + } + if err := nodeController.monitorNodeStatus(); err != nil { + t.Errorf("unexpected error: %v", err) + } + + if len(fnh.UpdatedNodes) != 1 { + t.Errorf("Node was not updated") + } + if tc.shutdown { + if len(fnh.UpdatedNodes[0].Spec.Taints) != 1 { + t.Errorf("Node Taint was not added") + } + if fnh.UpdatedNodes[0].Spec.Taints[0].Key != "node.cloudprovider.kubernetes.io/shutdown" { + t.Errorf("Node Taint key is not correct") + } + } else { + if len(fnh.UpdatedNodes[0].Spec.Taints) != 0 { + t.Errorf("Node Taint was not removed after node is back in ready state") + } + } + }) + } + +} + // TestCloudProviderNoRateLimit tests that monitorNodes() immediately deletes // pods and the node when kubelet has not reported, and the cloudprovider says // the node is gone. 
@@ -1404,6 +1516,9 @@ func TestCloudProviderNoRateLimit(t *testing.T) { nodeController.nodeExistsInCloudProvider = func(nodeName types.NodeName) (bool, error) { return false, nil } + nodeController.nodeShutdownInCloudProvider = func(nodeName types.NodeName) (bool, error) { + return false, nil + } // monitorNodeStatus should allow this node to be immediately deleted if err := nodeController.syncNodeStore(fnh); err != nil { t.Errorf("unexpected error: %v", err) } @@ -2242,6 +2357,9 @@ func TestNodeEventGeneration(t *testing.T) { nodeController.nodeExistsInCloudProvider = func(nodeName types.NodeName) (bool, error) { return false, nil } + nodeController.nodeShutdownInCloudProvider = func(nodeName types.NodeName) (bool, error) { + return false, nil + } nodeController.now = func() metav1.Time { return fakeNow } fakeRecorder := testutil.NewFakeRecorder() nodeController.recorder = fakeRecorder diff --git a/pkg/controller/util/node/controller_utils.go b/pkg/controller/util/node/controller_utils.go index b2c0ec23cf5..a286cdc4de3 100644 --- a/pkg/controller/util/node/controller_utils.go +++ b/pkg/controller/util/node/controller_utils.go @@ -187,6 +187,21 @@ func ExistsInCloudProvider(cloud cloudprovider.Interface, nodeName types.NodeNam return true, nil } +// ShutdownInCloudProvider returns true if the node is shut down in the +// cloud provider. +func ShutdownInCloudProvider(cloud cloudprovider.Interface, nodeName types.NodeName) (bool, error) { + instances, ok := cloud.Instances() + if !ok { + return false, fmt.Errorf("%v", ErrCloudInstance) + } + providerID, err := cloudprovider.GetInstanceProviderID(context.TODO(), cloud, nodeName) + if err != nil { + return false, err + } + shutdown, err := instances.InstanceShutdownByProviderID(context.TODO(), providerID) + return shutdown, err +} + // RecordNodeEvent records a event related to a node.
func RecordNodeEvent(recorder record.EventRecorder, nodeName, nodeUID, eventtype, reason, event string) { ref := &v1.ObjectReference{ diff --git a/pkg/scheduler/algorithm/well_known_labels.go b/pkg/scheduler/algorithm/well_known_labels.go index e916481ce30..3e89296d230 100644 --- a/pkg/scheduler/algorithm/well_known_labels.go +++ b/pkg/scheduler/algorithm/well_known_labels.go @@ -62,8 +62,6 @@ const ( // the taint TaintExternalCloudProvider = "node.cloudprovider.kubernetes.io/uninitialized" - // When node is shutdown in external cloud provider - shutdown flag is needed for immediately volume detach - after node comes back, the taint is removed + // TaintNodeShutdown is applied when a node is shut down in the external cloud provider. TaintNodeShutdown = "node.cloudprovider.kubernetes.io/shutdown" )