mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-23 11:50:44 +00:00
Merge pull request #59968 from kubernetes/revert-59323-nodetaint
Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md). Revert "add node shutdown taint". Reverts kubernetes/kubernetes#59323. The node becomes unready, but is never removed. I've found the following in [kube-controller-manager.log](https://storage.googleapis.com/kubernetes-jenkins/logs/ci-kubernetes-e2e-gci-gce-autoscaling/6055/artifacts/bootstrap-e2e-master/cluster-autoscaler.log) from the test run for one such node: `E0216 01:14:27.084923 1 node_lifecycle_controller.go:686] Error determining if node bootstrap-e2e-minion-group-01b1 shutdown in cloud: failed to get instance ID from cloud provider: instance not found` This goes on for the rest of the run (~6h). It looks like the node is stuck in the Unready state because of this check: https://github.com/kubernetes/kubernetes/blob/master/pkg/controller/nodelifecycle/node_lifecycle_controller.go#L684. Previously, there was no such check and the node was removed. Reverting, as this would affect all users attempting to resize their node groups on GCE. ```release-note NONE ```
This commit is contained in:
commit
6d0b71740f
@ -148,8 +148,6 @@ type Instances interface {
|
|||||||
// InstanceExistsByProviderID returns true if the instance for the given provider id still is running.
|
// InstanceExistsByProviderID returns true if the instance for the given provider id still is running.
|
||||||
// If false is returned with no error, the instance will be immediately deleted by the cloud controller manager.
|
// If false is returned with no error, the instance will be immediately deleted by the cloud controller manager.
|
||||||
InstanceExistsByProviderID(ctx context.Context, providerID string) (bool, error)
|
InstanceExistsByProviderID(ctx context.Context, providerID string) (bool, error)
|
||||||
// InstanceShutdownByProviderID returns true if the instance is shutdown in cloudprovider
|
|
||||||
InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Route is a representation of an advanced routing rule.
|
// Route is a representation of an advanced routing rule.
|
||||||
|
@ -1368,11 +1368,6 @@ func (c *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID strin
|
|||||||
return true, nil
|
return true, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// InstanceShutdownByProviderID returns true if the instance is in safe state to detach volumes
|
|
||||||
func (c *Cloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
|
|
||||||
return false, cloudprovider.NotImplemented
|
|
||||||
}
|
|
||||||
|
|
||||||
// InstanceID returns the cloud provider ID of the node with the specified nodeName.
|
// InstanceID returns the cloud provider ID of the node with the specified nodeName.
|
||||||
func (c *Cloud) InstanceID(ctx context.Context, nodeName types.NodeName) (string, error) {
|
func (c *Cloud) InstanceID(ctx context.Context, nodeName types.NodeName) (string, error) {
|
||||||
// In the future it is possible to also return an endpoint as:
|
// In the future it is possible to also return an endpoint as:
|
||||||
|
@ -103,11 +103,6 @@ func (az *Cloud) isCurrentInstance(name types.NodeName) (bool, error) {
|
|||||||
return (metadataName == nodeName), err
|
return (metadataName == nodeName), err
|
||||||
}
|
}
|
||||||
|
|
||||||
// InstanceShutdownByProviderID returns true if the instance is in safe state to detach volumes
|
|
||||||
func (az *Cloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
|
|
||||||
return false, cloudprovider.NotImplemented
|
|
||||||
}
|
|
||||||
|
|
||||||
// InstanceID returns the cloud provider ID of the specified instance.
|
// InstanceID returns the cloud provider ID of the specified instance.
|
||||||
// Note that if the instance does not exist or is no longer running, we must return ("", cloudprovider.InstanceNotFound)
|
// Note that if the instance does not exist or is no longer running, we must return ("", cloudprovider.InstanceNotFound)
|
||||||
func (az *Cloud) InstanceID(ctx context.Context, name types.NodeName) (string, error) {
|
func (az *Cloud) InstanceID(ctx context.Context, name types.NodeName) (string, error) {
|
||||||
|
@ -158,8 +158,3 @@ func (cs *CSCloud) InstanceExistsByProviderID(ctx context.Context, providerID st
|
|||||||
|
|
||||||
return true, nil
|
return true, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// InstanceShutdownByProviderID returns true if the instance is in safe state to detach volumes
|
|
||||||
func (cs *CSCloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
|
|
||||||
return false, cloudprovider.NotImplemented
|
|
||||||
}
|
|
||||||
|
@ -119,11 +119,6 @@ func (m *metadata) InstanceExistsByProviderID(ctx context.Context, providerID st
|
|||||||
return false, errors.New("InstanceExistsByProviderID not implemented")
|
return false, errors.New("InstanceExistsByProviderID not implemented")
|
||||||
}
|
}
|
||||||
|
|
||||||
// InstanceShutdownByProviderID returns if the instance is shutdown.
|
|
||||||
func (m *metadata) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
|
|
||||||
return false, cloudprovider.NotImplemented
|
|
||||||
}
|
|
||||||
|
|
||||||
// GetZone returns the Zone containing the region that the program is running in.
|
// GetZone returns the Zone containing the region that the program is running in.
|
||||||
func (m *metadata) GetZone(ctx context.Context) (cloudprovider.Zone, error) {
|
func (m *metadata) GetZone(ctx context.Context) (cloudprovider.Zone, error) {
|
||||||
zone := cloudprovider.Zone{}
|
zone := cloudprovider.Zone{}
|
||||||
|
@ -50,10 +50,8 @@ type FakeCloud struct {
|
|||||||
Exists bool
|
Exists bool
|
||||||
Err error
|
Err error
|
||||||
|
|
||||||
ExistsByProviderID bool
|
ExistsByProviderID bool
|
||||||
ErrByProviderID error
|
ErrByProviderID error
|
||||||
NodeShutdown bool
|
|
||||||
ErrShutdownByProviderID error
|
|
||||||
|
|
||||||
Calls []string
|
Calls []string
|
||||||
Addresses []v1.NodeAddress
|
Addresses []v1.NodeAddress
|
||||||
@ -243,12 +241,6 @@ func (f *FakeCloud) InstanceExistsByProviderID(ctx context.Context, providerID s
|
|||||||
return f.ExistsByProviderID, f.ErrByProviderID
|
return f.ExistsByProviderID, f.ErrByProviderID
|
||||||
}
|
}
|
||||||
|
|
||||||
// InstanceShutdownByProviderID returns true if the instances is in safe state to detach volumes
|
|
||||||
func (f *FakeCloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
|
|
||||||
f.addCall("instance-shutdown-by-provider-id")
|
|
||||||
return f.NodeShutdown, f.ErrShutdownByProviderID
|
|
||||||
}
|
|
||||||
|
|
||||||
// List is a test-spy implementation of Instances.List.
|
// List is a test-spy implementation of Instances.List.
|
||||||
// It adds an entry "list" into the internal method call record.
|
// It adds an entry "list" into the internal method call record.
|
||||||
func (f *FakeCloud) List(filter string) ([]types.NodeName, error) {
|
func (f *FakeCloud) List(filter string) ([]types.NodeName, error) {
|
||||||
|
@ -190,11 +190,6 @@ func (gce *GCECloud) InstanceExistsByProviderID(ctx context.Context, providerID
|
|||||||
return true, nil
|
return true, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// InstanceShutdownByProviderID returns true if the instance is in safe state to detach volumes
|
|
||||||
func (gce *GCECloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
|
|
||||||
return false, cloudprovider.NotImplemented
|
|
||||||
}
|
|
||||||
|
|
||||||
// InstanceID returns the cloud provider ID of the node with the specified NodeName.
|
// InstanceID returns the cloud provider ID of the node with the specified NodeName.
|
||||||
func (gce *GCECloud) InstanceID(ctx context.Context, nodeName types.NodeName) (string, error) {
|
func (gce *GCECloud) InstanceID(ctx context.Context, nodeName types.NodeName) (string, error) {
|
||||||
instanceName := mapNodeNameToInstanceName(nodeName)
|
instanceName := mapNodeNameToInstanceName(nodeName)
|
||||||
|
@ -141,11 +141,6 @@ func (i *Instances) InstanceExistsByProviderID(ctx context.Context, providerID s
|
|||||||
return true, nil
|
return true, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// InstanceShutdownByProviderID returns true if the instances is in safe state to detach volumes
|
|
||||||
func (i *Instances) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
|
|
||||||
return false, cloudprovider.NotImplemented
|
|
||||||
}
|
|
||||||
|
|
||||||
// InstanceID returns the kubelet's cloud provider ID.
|
// InstanceID returns the kubelet's cloud provider ID.
|
||||||
func (os *OpenStack) InstanceID() (string, error) {
|
func (os *OpenStack) InstanceID() (string, error) {
|
||||||
if len(os.localInstanceID) == 0 {
|
if len(os.localInstanceID) == 0 {
|
||||||
@ -160,8 +155,7 @@ func (os *OpenStack) InstanceID() (string, error) {
|
|||||||
|
|
||||||
// InstanceID returns the cloud provider ID of the specified instance.
|
// InstanceID returns the cloud provider ID of the specified instance.
|
||||||
func (i *Instances) InstanceID(ctx context.Context, name types.NodeName) (string, error) {
|
func (i *Instances) InstanceID(ctx context.Context, name types.NodeName) (string, error) {
|
||||||
// we should fetch instanceid from all states instead of ACTIVE
|
srv, err := getServerByName(i.compute, name, true)
|
||||||
srv, err := getServerByName(i.compute, name, false)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if err == ErrNotFound {
|
if err == ErrNotFound {
|
||||||
return "", cloudprovider.InstanceNotFound
|
return "", cloudprovider.InstanceNotFound
|
||||||
|
@ -212,11 +212,6 @@ func (v *OVirtCloud) InstanceExistsByProviderID(ctx context.Context, providerID
|
|||||||
return false, cloudprovider.NotImplemented
|
return false, cloudprovider.NotImplemented
|
||||||
}
|
}
|
||||||
|
|
||||||
// InstanceShutdownByProviderID returns true if the instance is in safe state to detach volumes
|
|
||||||
func (v *OVirtCloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
|
|
||||||
return false, cloudprovider.NotImplemented
|
|
||||||
}
|
|
||||||
|
|
||||||
// InstanceID returns the cloud provider ID of the node with the specified NodeName.
|
// InstanceID returns the cloud provider ID of the node with the specified NodeName.
|
||||||
func (v *OVirtCloud) InstanceID(ctx context.Context, nodeName types.NodeName) (string, error) {
|
func (v *OVirtCloud) InstanceID(ctx context.Context, nodeName types.NodeName) (string, error) {
|
||||||
name := mapNodeNameToInstanceName(nodeName)
|
name := mapNodeNameToInstanceName(nodeName)
|
||||||
|
@ -477,11 +477,6 @@ func (pc *PCCloud) InstanceExistsByProviderID(ctx context.Context, providerID st
|
|||||||
return false, cloudprovider.NotImplemented
|
return false, cloudprovider.NotImplemented
|
||||||
}
|
}
|
||||||
|
|
||||||
// InstanceShutdownByProviderID returns true if the instance is in safe state to detach volumes
|
|
||||||
func (pc *PCCloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
|
|
||||||
return false, cloudprovider.NotImplemented
|
|
||||||
}
|
|
||||||
|
|
||||||
// InstanceID returns the cloud provider ID of the specified instance.
|
// InstanceID returns the cloud provider ID of the specified instance.
|
||||||
func (pc *PCCloud) InstanceID(ctx context.Context, nodeName k8stypes.NodeName) (string, error) {
|
func (pc *PCCloud) InstanceID(ctx context.Context, nodeName k8stypes.NodeName) (string, error) {
|
||||||
name := string(nodeName)
|
name := string(nodeName)
|
||||||
|
@ -609,11 +609,6 @@ func (vs *VSphere) InstanceExistsByProviderID(ctx context.Context, providerID st
|
|||||||
return false, err
|
return false, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// InstanceShutdownByProviderID returns true if the instance is in safe state to detach volumes
|
|
||||||
func (vs *VSphere) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
|
|
||||||
return false, cloudprovider.NotImplemented
|
|
||||||
}
|
|
||||||
|
|
||||||
// InstanceID returns the cloud provider ID of the node with the specified Name.
|
// InstanceID returns the cloud provider ID of the node with the specified Name.
|
||||||
func (vs *VSphere) InstanceID(ctx context.Context, nodeName k8stypes.NodeName) (string, error) {
|
func (vs *VSphere) InstanceID(ctx context.Context, nodeName k8stypes.NodeName) (string, error) {
|
||||||
|
|
||||||
|
@ -37,23 +37,16 @@ import (
|
|||||||
clientretry "k8s.io/client-go/util/retry"
|
clientretry "k8s.io/client-go/util/retry"
|
||||||
nodeutilv1 "k8s.io/kubernetes/pkg/api/v1/node"
|
nodeutilv1 "k8s.io/kubernetes/pkg/api/v1/node"
|
||||||
"k8s.io/kubernetes/pkg/cloudprovider"
|
"k8s.io/kubernetes/pkg/cloudprovider"
|
||||||
"k8s.io/kubernetes/pkg/controller"
|
|
||||||
kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis"
|
kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis"
|
||||||
"k8s.io/kubernetes/pkg/scheduler/algorithm"
|
"k8s.io/kubernetes/pkg/scheduler/algorithm"
|
||||||
nodeutil "k8s.io/kubernetes/pkg/util/node"
|
nodeutil "k8s.io/kubernetes/pkg/util/node"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var UpdateNodeSpecBackoff = wait.Backoff{
|
||||||
UpdateNodeSpecBackoff = wait.Backoff{
|
Steps: 20,
|
||||||
Steps: 20,
|
Duration: 50 * time.Millisecond,
|
||||||
Duration: 50 * time.Millisecond,
|
Jitter: 1.0,
|
||||||
Jitter: 1.0}
|
}
|
||||||
|
|
||||||
ShutDownTaint = &v1.Taint{
|
|
||||||
Key: algorithm.TaintNodeShutdown,
|
|
||||||
Effect: v1.TaintEffectNoSchedule,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
type CloudNodeController struct {
|
type CloudNodeController struct {
|
||||||
nodeInformer coreinformers.NodeInformer
|
nodeInformer coreinformers.NodeInformer
|
||||||
@ -250,28 +243,9 @@ func (cnc *CloudNodeController) MonitorNode() {
|
|||||||
// from the cloud provider. If node cannot be found in cloudprovider, then delete the node immediately
|
// from the cloud provider. If node cannot be found in cloudprovider, then delete the node immediately
|
||||||
if currentReadyCondition != nil {
|
if currentReadyCondition != nil {
|
||||||
if currentReadyCondition.Status != v1.ConditionTrue {
|
if currentReadyCondition.Status != v1.ConditionTrue {
|
||||||
// we need to check this first to get taint working in similar in all cloudproviders
|
|
||||||
// current problem is that shutdown nodes are not working in similar way ie. all cloudproviders
|
|
||||||
// does not delete node from kubernetes cluster when instance it is shutdown see issue #46442
|
|
||||||
exists, err := instances.InstanceShutdownByProviderID(context.TODO(), node.Spec.ProviderID)
|
|
||||||
if err != nil && err != cloudprovider.NotImplemented {
|
|
||||||
glog.Errorf("Error getting data for node %s from cloud: %v", node.Name, err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
if exists {
|
|
||||||
// if node is shutdown add shutdown taint
|
|
||||||
err = controller.AddOrUpdateTaintOnNode(cnc.kubeClient, node.Name, ShutDownTaint)
|
|
||||||
if err != nil {
|
|
||||||
glog.Errorf("Error patching node taints: %v", err)
|
|
||||||
}
|
|
||||||
// Continue checking the remaining nodes since the current one is fine.
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check with the cloud provider to see if the node still exists. If it
|
// Check with the cloud provider to see if the node still exists. If it
|
||||||
// doesn't, delete the node immediately.
|
// doesn't, delete the node immediately.
|
||||||
exists, err = ensureNodeExistsByProviderIDOrExternalID(instances, node)
|
exists, err := ensureNodeExistsByProviderIDOrExternalID(instances, node)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
glog.Errorf("Error getting data for node %s from cloud: %v", node.Name, err)
|
glog.Errorf("Error getting data for node %s from cloud: %v", node.Name, err)
|
||||||
continue
|
continue
|
||||||
@ -301,12 +275,6 @@ func (cnc *CloudNodeController) MonitorNode() {
|
|||||||
}
|
}
|
||||||
}(node.Name)
|
}(node.Name)
|
||||||
|
|
||||||
} else {
|
|
||||||
// if taint exist remove taint
|
|
||||||
err = controller.RemoveTaintOffNode(cnc.kubeClient, node.Name, node, ShutDownTaint)
|
|
||||||
if err != nil {
|
|
||||||
glog.Errorf("Error patching node taints: %v", err)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -148,115 +148,6 @@ func TestEnsureNodeExistsByProviderIDOrNodeName(t *testing.T) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestNodeShutdown(t *testing.T) {
|
|
||||||
|
|
||||||
testCases := []struct {
|
|
||||||
testName string
|
|
||||||
node *v1.Node
|
|
||||||
existsByProviderID bool
|
|
||||||
shutdown bool
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
testName: "node shutdowned add taint",
|
|
||||||
existsByProviderID: true,
|
|
||||||
shutdown: true,
|
|
||||||
node: &v1.Node{
|
|
||||||
ObjectMeta: metav1.ObjectMeta{
|
|
||||||
Name: "node0",
|
|
||||||
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
|
|
||||||
},
|
|
||||||
Spec: v1.NodeSpec{
|
|
||||||
ProviderID: "node0",
|
|
||||||
},
|
|
||||||
Status: v1.NodeStatus{
|
|
||||||
Conditions: []v1.NodeCondition{
|
|
||||||
{
|
|
||||||
Type: v1.NodeReady,
|
|
||||||
Status: v1.ConditionUnknown,
|
|
||||||
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
|
|
||||||
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
testName: "node started after shutdown remove taint",
|
|
||||||
existsByProviderID: true,
|
|
||||||
shutdown: false,
|
|
||||||
node: &v1.Node{
|
|
||||||
ObjectMeta: metav1.ObjectMeta{
|
|
||||||
Name: "node0",
|
|
||||||
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
|
|
||||||
},
|
|
||||||
Spec: v1.NodeSpec{
|
|
||||||
ProviderID: "node0",
|
|
||||||
Taints: []v1.Taint{
|
|
||||||
{
|
|
||||||
Key: algorithm.TaintNodeShutdown,
|
|
||||||
Effect: v1.TaintEffectNoSchedule,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
Status: v1.NodeStatus{
|
|
||||||
Conditions: []v1.NodeCondition{
|
|
||||||
{
|
|
||||||
Type: v1.NodeReady,
|
|
||||||
Status: v1.ConditionTrue,
|
|
||||||
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
|
|
||||||
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}
|
|
||||||
for _, tc := range testCases {
|
|
||||||
t.Run(tc.testName, func(t *testing.T) {
|
|
||||||
fc := &fakecloud.FakeCloud{
|
|
||||||
ExistsByProviderID: tc.existsByProviderID,
|
|
||||||
NodeShutdown: tc.shutdown,
|
|
||||||
}
|
|
||||||
fnh := &testutil.FakeNodeHandler{
|
|
||||||
Existing: []*v1.Node{tc.node},
|
|
||||||
Clientset: fake.NewSimpleClientset(),
|
|
||||||
PatchWaitChan: make(chan struct{}),
|
|
||||||
}
|
|
||||||
|
|
||||||
factory := informers.NewSharedInformerFactory(fnh, controller.NoResyncPeriodFunc())
|
|
||||||
|
|
||||||
eventBroadcaster := record.NewBroadcaster()
|
|
||||||
cloudNodeController := &CloudNodeController{
|
|
||||||
kubeClient: fnh,
|
|
||||||
nodeInformer: factory.Core().V1().Nodes(),
|
|
||||||
cloud: fc,
|
|
||||||
nodeMonitorPeriod: 1 * time.Second,
|
|
||||||
recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "cloud-node-controller"}),
|
|
||||||
nodeStatusUpdateFrequency: 1 * time.Second,
|
|
||||||
}
|
|
||||||
eventBroadcaster.StartLogging(glog.Infof)
|
|
||||||
|
|
||||||
cloudNodeController.Run()
|
|
||||||
|
|
||||||
select {
|
|
||||||
case <-fnh.PatchWaitChan:
|
|
||||||
case <-time.After(1 * time.Second):
|
|
||||||
t.Errorf("Timed out waiting %v for node to be updated", wait.ForeverTestTimeout)
|
|
||||||
}
|
|
||||||
|
|
||||||
assert.Equal(t, 1, len(fnh.UpdatedNodes), "Node was not updated")
|
|
||||||
if tc.shutdown {
|
|
||||||
assert.Equal(t, 1, len(fnh.UpdatedNodes[0].Spec.Taints), "Node Taint was not added")
|
|
||||||
assert.Equal(t, "node.cloudprovider.kubernetes.io/shutdown", fnh.UpdatedNodes[0].Spec.Taints[0].Key, "Node Taint key is not correct")
|
|
||||||
} else {
|
|
||||||
assert.Equal(t, 0, len(fnh.UpdatedNodes[0].Spec.Taints), "Node Taint was not removed after node is back in ready state")
|
|
||||||
}
|
|
||||||
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
// This test checks that the node is deleted when kubelet stops reporting
|
// This test checks that the node is deleted when kubelet stops reporting
|
||||||
// and cloud provider says node is gone
|
// and cloud provider says node is gone
|
||||||
func TestNodeDeleted(t *testing.T) {
|
func TestNodeDeleted(t *testing.T) {
|
||||||
|
@ -79,11 +79,6 @@ var (
|
|||||||
Effect: v1.TaintEffectNoExecute,
|
Effect: v1.TaintEffectNoExecute,
|
||||||
}
|
}
|
||||||
|
|
||||||
shutDownTaint = &v1.Taint{
|
|
||||||
Key: algorithm.TaintNodeShutdown,
|
|
||||||
Effect: v1.TaintEffectNoSchedule,
|
|
||||||
}
|
|
||||||
|
|
||||||
nodeConditionToTaintKeyMap = map[v1.NodeConditionType]string{
|
nodeConditionToTaintKeyMap = map[v1.NodeConditionType]string{
|
||||||
v1.NodeMemoryPressure: algorithm.TaintNodeMemoryPressure,
|
v1.NodeMemoryPressure: algorithm.TaintNodeMemoryPressure,
|
||||||
v1.NodeOutOfDisk: algorithm.TaintNodeOutOfDisk,
|
v1.NodeOutOfDisk: algorithm.TaintNodeOutOfDisk,
|
||||||
@ -156,10 +151,9 @@ type Controller struct {
|
|||||||
daemonSetStore extensionslisters.DaemonSetLister
|
daemonSetStore extensionslisters.DaemonSetLister
|
||||||
daemonSetInformerSynced cache.InformerSynced
|
daemonSetInformerSynced cache.InformerSynced
|
||||||
|
|
||||||
nodeLister corelisters.NodeLister
|
nodeLister corelisters.NodeLister
|
||||||
nodeInformerSynced cache.InformerSynced
|
nodeInformerSynced cache.InformerSynced
|
||||||
nodeExistsInCloudProvider func(types.NodeName) (bool, error)
|
nodeExistsInCloudProvider func(types.NodeName) (bool, error)
|
||||||
nodeShutdownInCloudProvider func(types.NodeName) (bool, error)
|
|
||||||
|
|
||||||
recorder record.EventRecorder
|
recorder record.EventRecorder
|
||||||
|
|
||||||
@ -245,9 +239,6 @@ func NewNodeLifecycleController(podInformer coreinformers.PodInformer,
|
|||||||
nodeExistsInCloudProvider: func(nodeName types.NodeName) (bool, error) {
|
nodeExistsInCloudProvider: func(nodeName types.NodeName) (bool, error) {
|
||||||
return nodeutil.ExistsInCloudProvider(cloud, nodeName)
|
return nodeutil.ExistsInCloudProvider(cloud, nodeName)
|
||||||
},
|
},
|
||||||
nodeShutdownInCloudProvider: func(nodeName types.NodeName) (bool, error) {
|
|
||||||
return nodeutil.ShutdownInCloudProvider(cloud, nodeName)
|
|
||||||
},
|
|
||||||
recorder: recorder,
|
recorder: recorder,
|
||||||
nodeMonitorPeriod: nodeMonitorPeriod,
|
nodeMonitorPeriod: nodeMonitorPeriod,
|
||||||
nodeStartupGracePeriod: nodeStartupGracePeriod,
|
nodeStartupGracePeriod: nodeStartupGracePeriod,
|
||||||
@ -662,11 +653,6 @@ func (nc *Controller) monitorNodeStatus() error {
|
|||||||
glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
|
glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// remove shutdown taint this is needed always depending do we use taintbased or not
|
|
||||||
err := nc.markNodeAsNotShutdown(node)
|
|
||||||
if err != nil {
|
|
||||||
glog.Errorf("Failed to remove taints from node %v. Will retry in next iteration.", node.Name)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Report node event.
|
// Report node event.
|
||||||
@ -680,21 +666,7 @@ func (nc *Controller) monitorNodeStatus() error {
|
|||||||
// Check with the cloud provider to see if the node still exists. If it
|
// Check with the cloud provider to see if the node still exists. If it
|
||||||
// doesn't, delete the node immediately.
|
// doesn't, delete the node immediately.
|
||||||
if currentReadyCondition.Status != v1.ConditionTrue && nc.cloud != nil {
|
if currentReadyCondition.Status != v1.ConditionTrue && nc.cloud != nil {
|
||||||
// check is node shutdowned, if yes do not deleted it. Instead add taint
|
exists, err := nc.nodeExistsInCloudProvider(types.NodeName(node.Name))
|
||||||
exists, err := nc.nodeShutdownInCloudProvider(types.NodeName(node.Name))
|
|
||||||
if err != nil && err != cloudprovider.NotImplemented {
|
|
||||||
glog.Errorf("Error determining if node %v shutdown in cloud: %v", node.Name, err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
// node shutdown
|
|
||||||
if exists {
|
|
||||||
err = controller.AddOrUpdateTaintOnNode(nc.kubeClient, node.Name, shutDownTaint)
|
|
||||||
if err != nil {
|
|
||||||
glog.Errorf("Error patching node taints: %v", err)
|
|
||||||
}
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
exists, err = nc.nodeExistsInCloudProvider(types.NodeName(node.Name))
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
glog.Errorf("Error determining if node %v exists in cloud: %v", node.Name, err)
|
glog.Errorf("Error determining if node %v exists in cloud: %v", node.Name, err)
|
||||||
continue
|
continue
|
||||||
@ -1130,17 +1102,6 @@ func (nc *Controller) markNodeAsReachable(node *v1.Node) (bool, error) {
|
|||||||
return nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].Remove(node.Name), nil
|
return nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].Remove(node.Name), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (nc *Controller) markNodeAsNotShutdown(node *v1.Node) error {
|
|
||||||
nc.evictorLock.Lock()
|
|
||||||
defer nc.evictorLock.Unlock()
|
|
||||||
err := controller.RemoveTaintOffNode(nc.kubeClient, node.Name, node, shutDownTaint)
|
|
||||||
if err != nil {
|
|
||||||
glog.Errorf("Failed to remove taint from node %v: %v", node.Name, err)
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// ComputeZoneState returns a slice of NodeReadyConditions for all Nodes in a given zone.
|
// ComputeZoneState returns a slice of NodeReadyConditions for all Nodes in a given zone.
|
||||||
// The zone is considered:
|
// The zone is considered:
|
||||||
// - fullyDisrupted if there're no Ready Nodes,
|
// - fullyDisrupted if there're no Ready Nodes,
|
||||||
|
@ -1360,118 +1360,6 @@ func TestMonitorNodeStatusEvictPodsWithDisruption(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestCloudProviderNodeShutdown(t *testing.T) {
|
|
||||||
|
|
||||||
testCases := []struct {
|
|
||||||
testName string
|
|
||||||
node *v1.Node
|
|
||||||
shutdown bool
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
testName: "node shutdowned add taint",
|
|
||||||
shutdown: true,
|
|
||||||
node: &v1.Node{
|
|
||||||
ObjectMeta: metav1.ObjectMeta{
|
|
||||||
Name: "node0",
|
|
||||||
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
|
|
||||||
},
|
|
||||||
Spec: v1.NodeSpec{
|
|
||||||
ProviderID: "node0",
|
|
||||||
},
|
|
||||||
Status: v1.NodeStatus{
|
|
||||||
Conditions: []v1.NodeCondition{
|
|
||||||
{
|
|
||||||
Type: v1.NodeReady,
|
|
||||||
Status: v1.ConditionUnknown,
|
|
||||||
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
|
|
||||||
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
testName: "node started after shutdown remove taint",
|
|
||||||
shutdown: false,
|
|
||||||
node: &v1.Node{
|
|
||||||
ObjectMeta: metav1.ObjectMeta{
|
|
||||||
Name: "node0",
|
|
||||||
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
|
|
||||||
},
|
|
||||||
Spec: v1.NodeSpec{
|
|
||||||
ProviderID: "node0",
|
|
||||||
Taints: []v1.Taint{
|
|
||||||
{
|
|
||||||
Key: algorithm.TaintNodeShutdown,
|
|
||||||
Effect: v1.TaintEffectNoSchedule,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
Status: v1.NodeStatus{
|
|
||||||
Conditions: []v1.NodeCondition{
|
|
||||||
{
|
|
||||||
Type: v1.NodeReady,
|
|
||||||
Status: v1.ConditionTrue,
|
|
||||||
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
|
|
||||||
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}
|
|
||||||
for _, tc := range testCases {
|
|
||||||
t.Run(tc.testName, func(t *testing.T) {
|
|
||||||
fnh := &testutil.FakeNodeHandler{
|
|
||||||
Existing: []*v1.Node{tc.node},
|
|
||||||
Clientset: fake.NewSimpleClientset(),
|
|
||||||
}
|
|
||||||
nodeController, _ := newNodeLifecycleControllerFromClient(
|
|
||||||
nil,
|
|
||||||
fnh,
|
|
||||||
10*time.Minute,
|
|
||||||
testRateLimiterQPS,
|
|
||||||
testRateLimiterQPS,
|
|
||||||
testLargeClusterThreshold,
|
|
||||||
testUnhealthyThreshold,
|
|
||||||
testNodeMonitorGracePeriod,
|
|
||||||
testNodeStartupGracePeriod,
|
|
||||||
testNodeMonitorPeriod,
|
|
||||||
false)
|
|
||||||
nodeController.cloud = &fakecloud.FakeCloud{}
|
|
||||||
nodeController.now = func() metav1.Time { return metav1.Date(2016, 1, 1, 12, 0, 0, 0, time.UTC) }
|
|
||||||
nodeController.recorder = testutil.NewFakeRecorder()
|
|
||||||
nodeController.nodeShutdownInCloudProvider = func(nodeName types.NodeName) (bool, error) {
|
|
||||||
return tc.shutdown, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := nodeController.syncNodeStore(fnh); err != nil {
|
|
||||||
t.Errorf("unexpected error: %v", err)
|
|
||||||
}
|
|
||||||
if err := nodeController.monitorNodeStatus(); err != nil {
|
|
||||||
t.Errorf("unexpected error: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(fnh.UpdatedNodes) != 1 {
|
|
||||||
t.Errorf("Node was not updated")
|
|
||||||
}
|
|
||||||
if tc.shutdown {
|
|
||||||
if len(fnh.UpdatedNodes[0].Spec.Taints) != 1 {
|
|
||||||
t.Errorf("Node Taint was not added")
|
|
||||||
}
|
|
||||||
if fnh.UpdatedNodes[0].Spec.Taints[0].Key != "node.cloudprovider.kubernetes.io/shutdown" {
|
|
||||||
t.Errorf("Node Taint key is not correct")
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if len(fnh.UpdatedNodes[0].Spec.Taints) != 0 {
|
|
||||||
t.Errorf("Node Taint was not removed after node is back in ready state")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestCloudProviderNoRateLimit tests that monitorNodes() immediately deletes
|
// TestCloudProviderNoRateLimit tests that monitorNodes() immediately deletes
|
||||||
// pods and the node when kubelet has not reported, and the cloudprovider says
|
// pods and the node when kubelet has not reported, and the cloudprovider says
|
||||||
// the node is gone.
|
// the node is gone.
|
||||||
@ -1516,9 +1404,6 @@ func TestCloudProviderNoRateLimit(t *testing.T) {
|
|||||||
nodeController.nodeExistsInCloudProvider = func(nodeName types.NodeName) (bool, error) {
|
nodeController.nodeExistsInCloudProvider = func(nodeName types.NodeName) (bool, error) {
|
||||||
return false, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
nodeController.nodeShutdownInCloudProvider = func(nodeName types.NodeName) (bool, error) {
|
|
||||||
return false, nil
|
|
||||||
}
|
|
||||||
// monitorNodeStatus should allow this node to be immediately deleted
|
// monitorNodeStatus should allow this node to be immediately deleted
|
||||||
if err := nodeController.syncNodeStore(fnh); err != nil {
|
if err := nodeController.syncNodeStore(fnh); err != nil {
|
||||||
t.Errorf("unexpected error: %v", err)
|
t.Errorf("unexpected error: %v", err)
|
||||||
@ -2357,9 +2242,6 @@ func TestNodeEventGeneration(t *testing.T) {
|
|||||||
nodeController.nodeExistsInCloudProvider = func(nodeName types.NodeName) (bool, error) {
|
nodeController.nodeExistsInCloudProvider = func(nodeName types.NodeName) (bool, error) {
|
||||||
return false, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
nodeController.nodeShutdownInCloudProvider = func(nodeName types.NodeName) (bool, error) {
|
|
||||||
return false, nil
|
|
||||||
}
|
|
||||||
nodeController.now = func() metav1.Time { return fakeNow }
|
nodeController.now = func() metav1.Time { return fakeNow }
|
||||||
fakeRecorder := testutil.NewFakeRecorder()
|
fakeRecorder := testutil.NewFakeRecorder()
|
||||||
nodeController.recorder = fakeRecorder
|
nodeController.recorder = fakeRecorder
|
||||||
|
@ -67,7 +67,6 @@ type FakeNodeHandler struct {
|
|||||||
// Synchronization
|
// Synchronization
|
||||||
lock sync.Mutex
|
lock sync.Mutex
|
||||||
DeleteWaitChan chan struct{}
|
DeleteWaitChan chan struct{}
|
||||||
PatchWaitChan chan struct{}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// FakeLegacyHandler is a fake implementation of CoreV1Interface.
|
// FakeLegacyHandler is a fake implementation of CoreV1Interface.
|
||||||
@ -271,9 +270,6 @@ func (m *FakeNodeHandler) Patch(name string, pt types.PatchType, data []byte, su
|
|||||||
m.lock.Lock()
|
m.lock.Lock()
|
||||||
defer func() {
|
defer func() {
|
||||||
m.RequestCount++
|
m.RequestCount++
|
||||||
if m.PatchWaitChan != nil {
|
|
||||||
m.PatchWaitChan <- struct{}{}
|
|
||||||
}
|
|
||||||
m.lock.Unlock()
|
m.lock.Unlock()
|
||||||
}()
|
}()
|
||||||
var nodeCopy v1.Node
|
var nodeCopy v1.Node
|
||||||
|
@ -187,21 +187,6 @@ func ExistsInCloudProvider(cloud cloudprovider.Interface, nodeName types.NodeNam
|
|||||||
return true, nil
|
return true, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// ShutdownInCloudProvider returns true if the node is shut down in
|
|
||||||
// cloud provider.
|
|
||||||
func ShutdownInCloudProvider(cloud cloudprovider.Interface, nodeName types.NodeName) (bool, error) {
|
|
||||||
instances, ok := cloud.Instances()
|
|
||||||
if !ok {
|
|
||||||
return false, fmt.Errorf("%v", ErrCloudInstance)
|
|
||||||
}
|
|
||||||
providerID, err := cloudprovider.GetInstanceProviderID(context.TODO(), cloud, nodeName)
|
|
||||||
if err != nil {
|
|
||||||
return false, err
|
|
||||||
}
|
|
||||||
shutdown, err := instances.InstanceShutdownByProviderID(context.TODO(), providerID)
|
|
||||||
return shutdown, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// RecordNodeEvent records an event related to a node.
|
// RecordNodeEvent records an event related to a node.
|
||||||
func RecordNodeEvent(recorder record.EventRecorder, nodeName, nodeUID, eventtype, reason, event string) {
|
func RecordNodeEvent(recorder record.EventRecorder, nodeName, nodeUID, eventtype, reason, event string) {
|
||||||
ref := &v1.ObjectReference{
|
ref := &v1.ObjectReference{
|
||||||
|
@ -61,7 +61,4 @@ const (
|
|||||||
// from the cloud-controller-manager initializes this node, and then removes
|
// from the cloud-controller-manager initializes this node, and then removes
|
||||||
// the taint
|
// the taint
|
||||||
TaintExternalCloudProvider = "node.cloudprovider.kubernetes.io/uninitialized"
|
TaintExternalCloudProvider = "node.cloudprovider.kubernetes.io/uninitialized"
|
||||||
|
|
||||||
// TaintNodeShutdown is the taint key used when a node is shut down in an external cloud provider.
|
|
||||||
TaintNodeShutdown = "node.cloudprovider.kubernetes.io/shutdown"
|
|
||||||
)
|
)
|
||||||
|
@ -732,10 +732,6 @@ func (instances *instances) InstanceExistsByProviderID(ctx context.Context, prov
|
|||||||
return false, errors.New("unimplemented")
|
return false, errors.New("unimplemented")
|
||||||
}
|
}
|
||||||
|
|
||||||
func (instances *instances) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
|
|
||||||
return false, errors.New("unimplemented")
|
|
||||||
}
|
|
||||||
|
|
||||||
func (instances *instances) List(filter string) ([]types.NodeName, error) {
|
func (instances *instances) List(filter string) ([]types.NodeName, error) {
|
||||||
return []types.NodeName{}, errors.New("Not implemented")
|
return []types.NodeName{}, errors.New("Not implemented")
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user