diff --git a/pkg/cloudprovider/cloud.go b/pkg/cloudprovider/cloud.go
index 027a6262f85..530475799df 100644
--- a/pkg/cloudprovider/cloud.go
+++ b/pkg/cloudprovider/cloud.go
@@ -145,6 +145,8 @@ type Instances interface {
 	// InstanceExistsByProviderID returns true if the instance for the given provider id still is running.
 	// If false is returned with no error, the instance will be immediately deleted by the cloud controller manager.
 	InstanceExistsByProviderID(ctx context.Context, providerID string) (bool, error)
+	// InstanceShutdownByProviderID returns true if the instance is shut down in the cloud provider
+	InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error)
 }
 
 // Route is a representation of an advanced routing rule.
diff --git a/pkg/cloudprovider/providers/aws/aws.go b/pkg/cloudprovider/providers/aws/aws.go
index 55928ef109d..a41f7eb68ad 100644
--- a/pkg/cloudprovider/providers/aws/aws.go
+++ b/pkg/cloudprovider/providers/aws/aws.go
@@ -1350,6 +1350,11 @@ func (c *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID strin
 	return true, nil
 }
 
+// InstanceShutdownByProviderID returns true if the instance is in a safe state to detach volumes
+func (c *Cloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
+	return false, cloudprovider.NotImplemented
+}
+
 // InstanceID returns the cloud provider ID of the node with the specified nodeName.
 func (c *Cloud) InstanceID(ctx context.Context, nodeName types.NodeName) (string, error) {
 	// In the future it is possible to also return an endpoint as:
diff --git a/pkg/cloudprovider/providers/azure/azure_instances.go b/pkg/cloudprovider/providers/azure/azure_instances.go
index 33cf8338ef6..c31003c3792 100644
--- a/pkg/cloudprovider/providers/azure/azure_instances.go
+++ b/pkg/cloudprovider/providers/azure/azure_instances.go
@@ -115,6 +115,11 @@ func (az *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID stri
 	return true, nil
 }
 
+// InstanceShutdownByProviderID returns true if the instance is in a safe state to detach volumes
+func (az *Cloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
+	return false, cloudprovider.NotImplemented
+}
+
 func (az *Cloud) isCurrentInstance(name types.NodeName) (bool, error) {
 	nodeName := mapNodeNameToVMName(name)
 	metadataName, err := az.metadata.Text("instance/compute/name")
diff --git a/pkg/cloudprovider/providers/cloudstack/cloudstack_instances.go b/pkg/cloudprovider/providers/cloudstack/cloudstack_instances.go
index db8faa80718..7e51811328f 100644
--- a/pkg/cloudprovider/providers/cloudstack/cloudstack_instances.go
+++ b/pkg/cloudprovider/providers/cloudstack/cloudstack_instances.go
@@ -153,3 +153,8 @@ func (cs *CSCloud) InstanceExistsByProviderID(ctx context.Context, providerID st
 	return true, nil
 }
+
+// InstanceShutdownByProviderID returns true if the instance is in a safe state to detach volumes
+func (cs *CSCloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
+	return false, cloudprovider.NotImplemented
+}
diff --git a/pkg/cloudprovider/providers/cloudstack/metadata.go b/pkg/cloudprovider/providers/cloudstack/metadata.go
index 70b289d8ebc..26eb9040702 100644
--- a/pkg/cloudprovider/providers/cloudstack/metadata.go
+++ b/pkg/cloudprovider/providers/cloudstack/metadata.go
@@ -114,6 +114,11 @@ func (m *metadata) InstanceExistsByProviderID(ctx context.Context, providerID st
 	return false, errors.New("InstanceExistsByProviderID not implemented")
 }
 
+// InstanceShutdownByProviderID returns true if the instance is shut down.
+func (m *metadata) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
+	return false, cloudprovider.NotImplemented
+}
+
 // GetZone returns the Zone containing the region that the program is running in.
 func (m *metadata) GetZone(ctx context.Context) (cloudprovider.Zone, error) {
 	zone := cloudprovider.Zone{}
diff --git a/pkg/cloudprovider/providers/fake/fake.go b/pkg/cloudprovider/providers/fake/fake.go
index ed05b523211..b4e065a5943 100644
--- a/pkg/cloudprovider/providers/fake/fake.go
+++ b/pkg/cloudprovider/providers/fake/fake.go
@@ -50,8 +50,10 @@ type FakeCloud struct {
 	Exists bool
 	Err    error
 
-	ExistsByProviderID bool
-	ErrByProviderID    error
+	ExistsByProviderID      bool
+	ErrByProviderID         error
+	NodeShutdown            bool
+	ErrShutdownByProviderID error
 
 	Calls     []string
 	Addresses []v1.NodeAddress
@@ -233,6 +235,12 @@ func (f *FakeCloud) InstanceExistsByProviderID(ctx context.Context, providerID s
 	return f.ExistsByProviderID, f.ErrByProviderID
 }
 
+// InstanceShutdownByProviderID returns true if the instance is in a safe state to detach volumes
+func (f *FakeCloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
+	f.addCall("instance-shutdown-by-provider-id")
+	return f.NodeShutdown, f.ErrShutdownByProviderID
+}
+
 // List is a test-spy implementation of Instances.List.
 // It adds an entry "list" into the internal method call record.
 func (f *FakeCloud) List(filter string) ([]types.NodeName, error) {
diff --git a/pkg/cloudprovider/providers/gce/gce_instances.go b/pkg/cloudprovider/providers/gce/gce_instances.go
index c3a41ab274a..69c487ecfc4 100644
--- a/pkg/cloudprovider/providers/gce/gce_instances.go
+++ b/pkg/cloudprovider/providers/gce/gce_instances.go
@@ -141,6 +141,11 @@ func (gce *GCECloud) instanceByProviderID(providerID string) (*gceInstance, erro
 	return instance, nil
 }
 
+// InstanceShutdownByProviderID returns true if the instance is in a safe state to detach volumes
+func (gce *GCECloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
+	return false, cloudprovider.NotImplemented
+}
+
 // InstanceTypeByProviderID returns the cloudprovider instance type of the node
 // with the specified unique providerID This method will not be called from the
 // node that is requesting this ID. i.e. metadata service and other local
diff --git a/pkg/cloudprovider/providers/openstack/openstack_instances.go b/pkg/cloudprovider/providers/openstack/openstack_instances.go
index 43c60febded..de1f915f738 100644
--- a/pkg/cloudprovider/providers/openstack/openstack_instances.go
+++ b/pkg/cloudprovider/providers/openstack/openstack_instances.go
@@ -134,6 +134,11 @@ func (i *Instances) InstanceExistsByProviderID(ctx context.Context, providerID s
 	return true, nil
 }
 
+// InstanceShutdownByProviderID returns true if the instance is in a safe state to detach volumes
+func (i *Instances) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
+	return false, cloudprovider.NotImplemented
+}
+
 // InstanceID returns the kubelet's cloud provider ID.
 func (os *OpenStack) InstanceID() (string, error) {
 	if len(os.localInstanceID) == 0 {
diff --git a/pkg/cloudprovider/providers/ovirt/ovirt.go b/pkg/cloudprovider/providers/ovirt/ovirt.go
index e18f008a73d..686ba8e25ee 100644
--- a/pkg/cloudprovider/providers/ovirt/ovirt.go
+++ b/pkg/cloudprovider/providers/ovirt/ovirt.go
@@ -202,6 +202,11 @@ func (v *OVirtCloud) InstanceExistsByProviderID(ctx context.Context, providerID
 	return false, cloudprovider.NotImplemented
 }
 
+// InstanceShutdownByProviderID returns true if the instance is in a safe state to detach volumes
+func (v *OVirtCloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
+	return false, cloudprovider.NotImplemented
+}
+
 // InstanceID returns the cloud provider ID of the node with the specified NodeName.
 func (v *OVirtCloud) InstanceID(ctx context.Context, nodeName types.NodeName) (string, error) {
 	name := mapNodeNameToInstanceName(nodeName)
diff --git a/pkg/cloudprovider/providers/photon/photon.go b/pkg/cloudprovider/providers/photon/photon.go
index 2bfdd639abf..9411c15f2d5 100644
--- a/pkg/cloudprovider/providers/photon/photon.go
+++ b/pkg/cloudprovider/providers/photon/photon.go
@@ -460,6 +460,11 @@ func (pc *PCCloud) InstanceExistsByProviderID(ctx context.Context, providerID st
 	return false, cloudprovider.NotImplemented
 }
 
+// InstanceShutdownByProviderID returns true if the instance is in a safe state to detach volumes
+func (pc *PCCloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
+	return false, cloudprovider.NotImplemented
+}
+
 // InstanceID returns the cloud provider ID of the specified instance.
 func (pc *PCCloud) InstanceID(ctx context.Context, nodeName k8stypes.NodeName) (string, error) {
 	name := string(nodeName)
diff --git a/pkg/cloudprovider/providers/vsphere/vsphere.go b/pkg/cloudprovider/providers/vsphere/vsphere.go
index 47f18701409..cac821eb556 100644
--- a/pkg/cloudprovider/providers/vsphere/vsphere.go
+++ b/pkg/cloudprovider/providers/vsphere/vsphere.go
@@ -605,6 +605,11 @@ func (vs *VSphere) InstanceExistsByProviderID(ctx context.Context, providerID st
 	return false, err
 }
 
+// InstanceShutdownByProviderID returns true if the instance is in a safe state to detach volumes
+func (vs *VSphere) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
+	return false, cloudprovider.NotImplemented
+}
+
 // InstanceID returns the cloud provider ID of the node with the specified Name.
 func (vs *VSphere) InstanceID(ctx context.Context, nodeName k8stypes.NodeName) (string, error) {
diff --git a/pkg/controller/BUILD b/pkg/controller/BUILD
index a63a1312310..d7e732e4cd4 100644
--- a/pkg/controller/BUILD
+++ b/pkg/controller/BUILD
@@ -56,6 +56,7 @@ go_library(
         "//pkg/apis/core:go_default_library",
         "//pkg/apis/core/install:go_default_library",
         "//pkg/apis/core/validation:go_default_library",
+        "//pkg/scheduler/algorithm:go_default_library",
         "//pkg/serviceaccount:go_default_library",
        "//pkg/util/hash:go_default_library",
         "//pkg/util/taints:go_default_library",
diff --git a/pkg/controller/cloud/BUILD b/pkg/controller/cloud/BUILD
index cb35dda1476..d6ba3f1e5b6 100644
--- a/pkg/controller/cloud/BUILD
+++ b/pkg/controller/cloud/BUILD
@@ -17,6 +17,7 @@ go_library(
         "//pkg/api/v1/node:go_default_library",
         "//pkg/cloudprovider:go_default_library",
         "//pkg/controller:go_default_library",
+        "//pkg/controller/util/node:go_default_library",
         "//pkg/kubelet/apis:go_default_library",
         "//pkg/scheduler/algorithm:go_default_library",
         "//pkg/util/node:go_default_library",
diff --git a/pkg/controller/cloud/node_controller.go b/pkg/controller/cloud/node_controller.go
index fa837993425..c10f69bb5d6 100644
--- a/pkg/controller/cloud/node_controller.go
+++ b/pkg/controller/cloud/node_controller.go
@@ -37,6 +37,8 @@ import (
 	clientretry "k8s.io/client-go/util/retry"
 	nodeutilv1 "k8s.io/kubernetes/pkg/api/v1/node"
 	"k8s.io/kubernetes/pkg/cloudprovider"
+	"k8s.io/kubernetes/pkg/controller"
+	nodectrlutil "k8s.io/kubernetes/pkg/controller/util/node"
 	kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis"
 	"k8s.io/kubernetes/pkg/scheduler/algorithm"
 	nodeutil "k8s.io/kubernetes/pkg/util/node"
@@ -243,6 +245,24 @@ func (cnc *CloudNodeController) MonitorNode() {
 		// from the cloud provider. If node cannot be found in cloudprovider, then delete the node immediately
 		if currentReadyCondition != nil {
 			if currentReadyCondition.Status != v1.ConditionTrue {
+				// Check the shutdown state first so that the shutdown taint behaves the same way across cloud providers.
+				// Today shutdown nodes are handled inconsistently, i.e. not every cloud provider deletes the node
+				// from the Kubernetes cluster when its instance is shut down; see issue #46442.
+				shutdown, err := nodectrlutil.ShutdownInCloudProvider(context.TODO(), cnc.cloud, node)
+				if err != nil {
+					glog.Errorf("Error getting data for node %s from cloud: %v", node.Name, err)
+				}
+
+				if shutdown && err == nil {
+					// if the node is shut down, add the shutdown taint
+					err = controller.AddOrUpdateTaintOnNode(cnc.kubeClient, node.Name, controller.ShutdownTaint)
+					if err != nil {
+						glog.Errorf("Error patching node taints: %v", err)
+					}
+					// Continue checking the remaining nodes since the current one is shut down.
+					continue
+				}
+
 				// Check with the cloud provider to see if the node still exists. If it
 				// doesn't, delete the node immediately.
 				exists, err := ensureNodeExistsByProviderIDOrInstanceID(instances, node)
@@ -275,6 +295,12 @@
 					}
 				}(node.Name)
 
+			} else {
+				// if the shutdown taint exists, remove it
+				err = controller.RemoveTaintOffNode(cnc.kubeClient, node.Name, node, controller.ShutdownTaint)
+				if err != nil {
+					glog.Errorf("Error patching node taints: %v", err)
+				}
 			}
 		}
 	}
diff --git a/pkg/controller/cloud/node_controller_test.go b/pkg/controller/cloud/node_controller_test.go
index 57add649617..019b68d6a1b 100644
--- a/pkg/controller/cloud/node_controller_test.go
+++ b/pkg/controller/cloud/node_controller_test.go
@@ -148,6 +148,115 @@ func TestEnsureNodeExistsByProviderIDOrNodeName(t *testing.T) {
 
 }
 
+func TestNodeShutdown(t *testing.T) {
+
+	testCases := []struct {
+		testName           string
+		node               *v1.Node
+		existsByProviderID bool
+		shutdown           bool
+	}{
+		{
+			testName:           "node shutdowned add taint",
+			existsByProviderID: true,
+			shutdown:           true,
+			node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
+				},
+				Spec: v1.NodeSpec{
+					ProviderID: "node0",
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionUnknown,
+							LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+							LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+						},
+					},
+				},
+			},
+		},
+		{
+			testName:           "node started after shutdown remove taint",
+			existsByProviderID: true,
+			shutdown:           false,
+			node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
+				},
+				Spec: v1.NodeSpec{
+					ProviderID: "node0",
+					Taints: []v1.Taint{
+						{
+							Key:    algorithm.TaintNodeShutdown,
+							Effect: v1.TaintEffectNoSchedule,
+						},
+					},
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionTrue,
+							LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+							LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+						},
+					},
+				},
+			},
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.testName, func(t *testing.T) {
+			fc := &fakecloud.FakeCloud{
+				ExistsByProviderID: tc.existsByProviderID,
+				NodeShutdown:       tc.shutdown,
+			}
+			fnh := &testutil.FakeNodeHandler{
+				Existing:      []*v1.Node{tc.node},
+				Clientset:     fake.NewSimpleClientset(),
+				PatchWaitChan: make(chan struct{}),
+			}
+
+			factory := informers.NewSharedInformerFactory(fnh, controller.NoResyncPeriodFunc())
+
+			eventBroadcaster := record.NewBroadcaster()
+			cloudNodeController := &CloudNodeController{
+				kubeClient:                fnh,
+				nodeInformer:              factory.Core().V1().Nodes(),
+				cloud:                     fc,
+				nodeMonitorPeriod:         1 * time.Second,
+				recorder:                  eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "cloud-node-controller"}),
+				nodeStatusUpdateFrequency: 1 * time.Second,
+			}
+			eventBroadcaster.StartLogging(glog.Infof)
+
+			cloudNodeController.Run()
+
+			select {
+			case <-fnh.PatchWaitChan:
+			case <-time.After(1 * time.Second):
+				t.Errorf("Timed out waiting %v for node to be updated", wait.ForeverTestTimeout)
+			}
+
+			assert.Equal(t, 1, len(fnh.UpdatedNodes), "Node was not updated")
+			if tc.shutdown {
+				assert.Equal(t, 1, len(fnh.UpdatedNodes[0].Spec.Taints), "Node Taint was not added")
+				assert.Equal(t, "node.cloudprovider.kubernetes.io/shutdown", fnh.UpdatedNodes[0].Spec.Taints[0].Key, "Node Taint key is not correct")
+			} else {
+				assert.Equal(t, 0, len(fnh.UpdatedNodes[0].Spec.Taints), "Node Taint was not removed after node is back in ready state")
+			}
+
+		})
+	}
+
+}
+
 // This test checks that the node is deleted when kubelet stops reporting
 // and cloud provider says node is gone
 func TestNodeDeleted(t *testing.T) {
diff --git a/pkg/controller/controller_utils.go b/pkg/controller/controller_utils.go
index d7d755ebfd1..cc3f5828008 100644
--- a/pkg/controller/controller_utils.go
+++ b/pkg/controller/controller_utils.go
@@ -45,6 +45,7 @@ import (
 	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
 	_ "k8s.io/kubernetes/pkg/apis/core/install"
 	"k8s.io/kubernetes/pkg/apis/core/validation"
+	"k8s.io/kubernetes/pkg/scheduler/algorithm"
 	hashutil "k8s.io/kubernetes/pkg/util/hash"
 	taintutils "k8s.io/kubernetes/pkg/util/taints"
 
@@ -86,6 +87,11 @@ var UpdateTaintBackoff = wait.Backoff{
 	Jitter:   1.0,
 }
 
+var ShutdownTaint = &v1.Taint{
+	Key:    algorithm.TaintNodeShutdown,
+	Effect: v1.TaintEffectNoSchedule,
+}
+
 var (
 	KeyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc
 )
diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller.go b/pkg/controller/nodelifecycle/node_lifecycle_controller.go
index 5c80c77c8a5..81774d19654 100644
--- a/pkg/controller/nodelifecycle/node_lifecycle_controller.go
+++ b/pkg/controller/nodelifecycle/node_lifecycle_controller.go
@@ -22,6 +22,7 @@ limitations under the License.
 package nodelifecycle
 
 import (
+	"context"
 	"fmt"
 	"sync"
 	"time"
@@ -151,9 +152,10 @@ type Controller struct {
 	daemonSetStore          extensionslisters.DaemonSetLister
 	daemonSetInformerSynced cache.InformerSynced
 
-	nodeLister                corelisters.NodeLister
-	nodeInformerSynced        cache.InformerSynced
-	nodeExistsInCloudProvider func(types.NodeName) (bool, error)
+	nodeLister                  corelisters.NodeLister
+	nodeInformerSynced          cache.InformerSynced
+	nodeExistsInCloudProvider   func(types.NodeName) (bool, error)
+	nodeShutdownInCloudProvider func(context.Context, *v1.Node) (bool, error)
 
 	recorder record.EventRecorder
 
@@ -239,6 +241,9 @@ func NewNodeLifecycleController(podInformer coreinformers.PodInformer,
 		nodeExistsInCloudProvider: func(nodeName types.NodeName) (bool, error) {
 			return nodeutil.ExistsInCloudProvider(cloud, nodeName)
 		},
+		nodeShutdownInCloudProvider: func(ctx context.Context, node *v1.Node) (bool, error) {
+			return nodeutil.ShutdownInCloudProvider(ctx, cloud, node)
+		},
 		recorder:              recorder,
 		nodeMonitorPeriod:     nodeMonitorPeriod,
 		nodeStartupGracePeriod: nodeStartupGracePeriod,
@@ -667,6 +672,11 @@ func (nc *Controller) monitorNodeStatus() error {
 					glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
 				}
 			}
+			// Always remove the shutdown taint here; this is needed regardless of whether taint-based eviction is used.
+			err := nc.markNodeAsNotShutdown(node)
+			if err != nil {
+				glog.Errorf("Failed to remove taints from node %v. Will retry in next iteration.", node.Name)
+			}
 		}
 
 		// Report node event.
@@ -680,6 +690,19 @@ func (nc *Controller) monitorNodeStatus() error {
 		// Check with the cloud provider to see if the node still exists. If it
 		// doesn't, delete the node immediately.
 		if currentReadyCondition.Status != v1.ConditionTrue && nc.cloud != nil {
+			// Check whether the node is shut down; if so, do not delete it and add a taint instead.
+			shutdown, err := nc.nodeShutdownInCloudProvider(context.TODO(), node)
+			if err != nil {
+				glog.Errorf("Error determining if node %v shutdown in cloud: %v", node.Name, err)
+			}
+			// node shutdown
+			if shutdown && err == nil {
+				err = controller.AddOrUpdateTaintOnNode(nc.kubeClient, node.Name, controller.ShutdownTaint)
+				if err != nil {
+					glog.Errorf("Error patching node taints: %v", err)
+				}
+				continue
+			}
 			exists, err := nc.nodeExistsInCloudProvider(types.NodeName(node.Name))
 			if err != nil {
 				glog.Errorf("Error determining if node %v exists in cloud: %v", node.Name, err)
@@ -1118,6 +1141,17 @@ func (nc *Controller) markNodeAsReachable(node *v1.Node) (bool, error) {
 	return nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].Remove(node.Name), nil
 }
 
+func (nc *Controller) markNodeAsNotShutdown(node *v1.Node) error {
+	nc.evictorLock.Lock()
+	defer nc.evictorLock.Unlock()
+	err := controller.RemoveTaintOffNode(nc.kubeClient, node.Name, node, controller.ShutdownTaint)
+	if err != nil {
+		glog.Errorf("Failed to remove taint from node %v: %v", node.Name, err)
+		return err
+	}
+	return nil
+}
+
 // ComputeZoneState returns a slice of NodeReadyConditions for all Nodes in a given zone.
 // The zone is considered:
 // - fullyDisrupted if there're no Ready Nodes,
diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go
index 741a08d04dd..35f37e2ba37 100644
--- a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go
+++ b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go
@@ -17,6 +17,7 @@ limitations under the License.
 package nodelifecycle
 
 import (
+	"context"
 	"strings"
 	"testing"
 	"time"
@@ -1360,6 +1361,118 @@ func TestMonitorNodeStatusEvictPodsWithDisruption(t *testing.T) {
 	}
 }
 
+func TestCloudProviderNodeShutdown(t *testing.T) {
+
+	testCases := []struct {
+		testName string
+		node     *v1.Node
+		shutdown bool
+	}{
+		{
+			testName: "node shutdowned add taint",
+			shutdown: true,
+			node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
+				},
+				Spec: v1.NodeSpec{
+					ProviderID: "node0",
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionUnknown,
+							LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+							LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+						},
+					},
+				},
+			},
+		},
+		{
+			testName: "node started after shutdown remove taint",
+			shutdown: false,
+			node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
+				},
+				Spec: v1.NodeSpec{
+					ProviderID: "node0",
+					Taints: []v1.Taint{
+						{
+							Key:    algorithm.TaintNodeShutdown,
+							Effect: v1.TaintEffectNoSchedule,
+						},
+					},
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionTrue,
+							LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+							LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+						},
+					},
+				},
+			},
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.testName, func(t *testing.T) {
+			fnh := &testutil.FakeNodeHandler{
+				Existing:  []*v1.Node{tc.node},
+				Clientset: fake.NewSimpleClientset(),
+			}
+			nodeController, _ := newNodeLifecycleControllerFromClient(
+				nil,
+				fnh,
+				10*time.Minute,
+				testRateLimiterQPS,
+				testRateLimiterQPS,
+				testLargeClusterThreshold,
+				testUnhealthyThreshold,
+				testNodeMonitorGracePeriod,
+				testNodeStartupGracePeriod,
+				testNodeMonitorPeriod,
+				false)
+			nodeController.cloud = &fakecloud.FakeCloud{}
+			nodeController.now = func() metav1.Time { return metav1.Date(2016, 1, 1, 12, 0, 0, 0, time.UTC) }
+			nodeController.recorder = testutil.NewFakeRecorder()
+			nodeController.nodeShutdownInCloudProvider = func(ctx context.Context, node *v1.Node) (bool, error) {
+				return tc.shutdown, nil
+			}
+
+			if err := nodeController.syncNodeStore(fnh); err != nil {
+				t.Errorf("unexpected error: %v", err)
+			}
+			if err := nodeController.monitorNodeStatus(); err != nil {
+				t.Errorf("unexpected error: %v", err)
+			}
+
+			if len(fnh.UpdatedNodes) != 1 {
+				t.Errorf("Node was not updated")
+			}
+			if tc.shutdown {
+				if len(fnh.UpdatedNodes[0].Spec.Taints) != 1 {
+					t.Errorf("Node Taint was not added")
+				}
+				if fnh.UpdatedNodes[0].Spec.Taints[0].Key != "node.cloudprovider.kubernetes.io/shutdown" {
+					t.Errorf("Node Taint key is not correct")
+				}
+			} else {
+				if len(fnh.UpdatedNodes[0].Spec.Taints) != 0 {
+					t.Errorf("Node Taint was not removed after node is back in ready state")
+				}
+			}
+		})
+	}
+
+}
+
 // TestCloudProviderNoRateLimit tests that monitorNodes() immediately deletes
 // pods and the node when kubelet has not reported, and the cloudprovider says
 // the node is gone.
@@ -1404,6 +1517,9 @@ func TestCloudProviderNoRateLimit(t *testing.T) {
 	nodeController.nodeExistsInCloudProvider = func(nodeName types.NodeName) (bool, error) {
 		return false, nil
 	}
+	nodeController.nodeShutdownInCloudProvider = func(ctx context.Context, node *v1.Node) (bool, error) {
+		return false, nil
+	}
 	// monitorNodeStatus should allow this node to be immediately deleted
 	if err := nodeController.syncNodeStore(fnh); err != nil {
 		t.Errorf("unexpected error: %v", err)
@@ -2224,6 +2340,9 @@ func TestNodeEventGeneration(t *testing.T) {
 	nodeController.nodeExistsInCloudProvider = func(nodeName types.NodeName) (bool, error) {
 		return false, nil
 	}
+	nodeController.nodeShutdownInCloudProvider = func(ctx context.Context, node *v1.Node) (bool, error) {
+		return false, nil
+	}
 	nodeController.now = func() metav1.Time { return fakeNow }
 	fakeRecorder := testutil.NewFakeRecorder()
 	nodeController.recorder = fakeRecorder
diff --git a/pkg/controller/testutil/test_utils.go b/pkg/controller/testutil/test_utils.go
index 7fb71e9f7fd..770df607c41 100644
--- a/pkg/controller/testutil/test_utils.go
+++ b/pkg/controller/testutil/test_utils.go
@@ -67,6 +67,7 @@ type FakeNodeHandler struct {
 	// Synchronization
 	lock           sync.Mutex
 	DeleteWaitChan chan struct{}
+	PatchWaitChan  chan struct{}
 }
 
 // FakeLegacyHandler is a fake implemtation of CoreV1Interface.
@@ -270,6 +271,9 @@ func (m *FakeNodeHandler) Patch(name string, pt types.PatchType, data []byte, su
 	m.lock.Lock()
 	defer func() {
 		m.RequestCount++
+		if m.PatchWaitChan != nil {
+			m.PatchWaitChan <- struct{}{}
+		}
 		m.lock.Unlock()
 	}()
 	var nodeCopy v1.Node
diff --git a/pkg/controller/util/node/controller_utils.go b/pkg/controller/util/node/controller_utils.go
index 9a719e0b268..36fbddda037 100644
--- a/pkg/controller/util/node/controller_utils.go
+++ b/pkg/controller/util/node/controller_utils.go
@@ -187,6 +187,20 @@ func ExistsInCloudProvider(cloud cloudprovider.Interface, nodeName types.NodeNam
 	return true, nil
 }
 
+// ShutdownInCloudProvider returns true if the node is shut down in
+// the cloud provider.
+func ShutdownInCloudProvider(ctx context.Context, cloud cloudprovider.Interface, node *v1.Node) (bool, error) {
+	instances, ok := cloud.Instances()
+	if !ok {
+		return false, fmt.Errorf("%v", ErrCloudInstance)
+	}
+	shutdown, err := instances.InstanceShutdownByProviderID(ctx, node.Spec.ProviderID)
+	if err == cloudprovider.NotImplemented {
+		return false, nil
+	}
+	return shutdown, err
+}
+
 // RecordNodeEvent records a event related to a node.
 func RecordNodeEvent(recorder record.EventRecorder, nodeName, nodeUID, eventtype, reason, event string) {
 	ref := &v1.ObjectReference{
diff --git a/pkg/scheduler/algorithm/well_known_labels.go b/pkg/scheduler/algorithm/well_known_labels.go
index 4b3ba39d1af..887b5b862fc 100644
--- a/pkg/scheduler/algorithm/well_known_labels.go
+++ b/pkg/scheduler/algorithm/well_known_labels.go
@@ -71,4 +71,7 @@ const (
 	// from the cloud-controller-manager intitializes this node, and then removes
 	// the taint
 	TaintExternalCloudProvider = "node.cloudprovider.kubernetes.io/uninitialized"
+
+	// TaintNodeShutdown is applied when a node is shut down in the external cloud provider
+	TaintNodeShutdown = "node.cloudprovider.kubernetes.io/shutdown"
 )
diff --git a/pkg/volume/cinder/attacher_test.go b/pkg/volume/cinder/attacher_test.go
index ee0887a8182..409d018843b 100644
--- a/pkg/volume/cinder/attacher_test.go
+++ b/pkg/volume/cinder/attacher_test.go
@@ -728,6 +728,10 @@ func (instances *instances) InstanceExistsByProviderID(ctx context.Context, prov
 	return false, errors.New("unimplemented")
 }
 
+func (instances *instances) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
+	return false, errors.New("unimplemented")
+}
+
 func (instances *instances) List(filter string) ([]types.NodeName, error) {
 	return []types.NodeName{}, errors.New("Not implemented")
 }
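Reviewer note (not part of the patch): every provider in this change stubs InstanceShutdownByProviderID with cloudprovider.NotImplemented, and ShutdownInCloudProvider deliberately maps that error to "not shut down" so nothing changes until a provider opts in. The sketch below shows how a provider might eventually implement the method; the instanceAPI interface, the "stopped" state string, and all other names here are hypothetical stand-ins for a real provider SDK, not code from this change.

package main

import (
	"context"
	"errors"
	"fmt"
)

// errNotImplemented mirrors cloudprovider.NotImplemented: callers such as
// ShutdownInCloudProvider treat it as "provider cannot answer", not as a failure.
var errNotImplemented = errors.New("unimplemented")

// instanceAPI is a hypothetical stand-in for a provider SDK call that reports
// the current state of an instance identified by its providerID.
type instanceAPI interface {
	InstanceState(ctx context.Context, providerID string) (string, error)
}

type cloud struct {
	api instanceAPI
}

// InstanceShutdownByProviderID reports whether the instance is cleanly stopped and
// therefore a candidate for the node.cloudprovider.kubernetes.io/shutdown taint.
func (c *cloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
	if c.api == nil {
		// No backing API configured: behave like the stubs in this patch.
		return false, errNotImplemented
	}
	state, err := c.api.InstanceState(ctx, providerID)
	if err != nil {
		return false, err
	}
	// Only a fully stopped instance counts as shut down; transitional states do not.
	return state == "stopped", nil
}

// fakeAPI is a trivial in-memory implementation used to exercise the sketch.
type fakeAPI struct{ state string }

func (f fakeAPI) InstanceState(ctx context.Context, providerID string) (string, error) {
	return f.state, nil
}

func main() {
	c := &cloud{api: fakeAPI{state: "stopped"}}
	shutdown, err := c.InstanceShutdownByProviderID(context.Background(), "provider://node0")
	fmt.Println(shutdown, err) // prints: true <nil>
}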