consolidate node deletion logic between node lifecycle and cloud node controller

andrewsykim
2018-10-28 21:57:23 -04:00
parent 6be4f1bbf3
commit 5329f09663
18 changed files with 750 additions and 702 deletions
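With this change the node lifecycle controller stops talking to the cloud provider entirely: the cloud field, the nodeExistsInCloudProvider and nodeShutdownInCloudProvider hooks, and the shutdown-taint / force-delete handling in monitorNodeHealth are all deleted below, and that responsibility is consolidated in the cloud node controller. The shape of the consolidated check is roughly the sketch that follows; it is illustrative only, and the receiver type and the shutdownInCloud / existsInCloud helpers are placeholders rather than identifiers introduced by this commit (only controller.ShutdownTaint, controller.AddOrUpdateTaintOnNode and nodeutil.ForcefullyDeleteNode appear in the diff itself).

package cloudsketch

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/kubernetes/pkg/controller"
	nodeutil "k8s.io/kubernetes/pkg/controller/util/node"
)

// cloudNodeLifecycleSketch stands in for the cloud node controller; the
// field and helper names are placeholders for illustration.
type cloudNodeLifecycleSketch struct {
	kubeClient clientset.Interface
	// Placeholders for the cloud-provider queries that back this logic
	// (instance shut down / instance still exists).
	shutdownInCloud func(context.Context, *v1.Node) (bool, error)
	existsInCloud   func(context.Context, *v1.Node) (bool, error)
}

// reconcileNotReadyNode handles a node whose Ready condition is no longer True.
func (c *cloudNodeLifecycleSketch) reconcileNotReadyNode(ctx context.Context, node *v1.Node) error {
	// A shut-down instance keeps its Node object; it only gets the shutdown
	// taint so workloads can react without the node disappearing.
	shutdown, err := c.shutdownInCloud(ctx, node)
	if err != nil {
		return fmt.Errorf("checking shutdown state of node %s: %v", node.Name, err)
	}
	if shutdown {
		return controller.AddOrUpdateTaintOnNode(c.kubeClient, node.Name, controller.ShutdownTaint)
	}

	// An instance the cloud provider no longer knows about has its Node
	// object deleted outright, without waiting for any grace period.
	exists, err := c.existsInCloud(ctx, node)
	if err != nil {
		return fmt.Errorf("checking existence of node %s: %v", node.Name, err)
	}
	if !exists {
		return nodeutil.ForcefullyDeleteNode(c.kubeClient, node.Name)
	}
	return nil
}

As in the code being removed from the node lifecycle controller, the shutdown case and the not-found case stay separate: a shut-down instance is tainted and revisited later, while an instance the provider no longer reports has its Node object deleted immediately.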


@@ -22,7 +22,6 @@ limitations under the License.
 package nodelifecycle
 import (
-"context"
 "fmt"
 "hash/fnv"
 "io"
@@ -37,7 +36,6 @@ import (
 apierrors "k8s.io/apimachinery/pkg/api/errors"
 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 "k8s.io/apimachinery/pkg/labels"
-"k8s.io/apimachinery/pkg/types"
 utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 "k8s.io/apimachinery/pkg/util/wait"
 utilfeature "k8s.io/apiserver/pkg/util/feature"
@@ -54,7 +52,6 @@ import (
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/flowcontrol"
"k8s.io/client-go/util/workqueue"
cloudprovider "k8s.io/cloud-provider"
v1node "k8s.io/kubernetes/pkg/api/v1/node"
"k8s.io/kubernetes/pkg/controller"
"k8s.io/kubernetes/pkg/controller/nodelifecycle/scheduler"
@@ -150,7 +147,6 @@ type Controller struct {
 taintManager *scheduler.NoExecuteTaintManager
 podInformerSynced cache.InformerSynced
-cloud cloudprovider.Interface
 kubeClient clientset.Interface
 // This timestamp is to be used instead of LastProbeTime stored in Condition. We do this
@@ -179,12 +175,10 @@ type Controller struct {
 daemonSetStore extensionslisters.DaemonSetLister
 daemonSetInformerSynced cache.InformerSynced
-leaseLister coordlisters.LeaseLister
-leaseInformerSynced cache.InformerSynced
-nodeLister corelisters.NodeLister
-nodeInformerSynced cache.InformerSynced
-nodeExistsInCloudProvider func(types.NodeName) (bool, error)
-nodeShutdownInCloudProvider func(context.Context, *v1.Node) (bool, error)
+leaseLister coordlisters.LeaseLister
+leaseInformerSynced cache.InformerSynced
+nodeLister corelisters.NodeLister
+nodeInformerSynced cache.InformerSynced
 recorder record.EventRecorder
@@ -247,7 +241,6 @@ func NewNodeLifecycleController(
 podInformer coreinformers.PodInformer,
 nodeInformer coreinformers.NodeInformer,
 daemonSetInformer extensionsinformers.DaemonSetInformer,
-cloud cloudprovider.Interface,
 kubeClient clientset.Interface,
 nodeMonitorPeriod time.Duration,
 nodeStartupGracePeriod time.Duration,
@@ -280,17 +273,10 @@ func NewNodeLifecycleController(
 }
 nc := &Controller{
-cloud: cloud,
-kubeClient: kubeClient,
-now: metav1.Now,
-knownNodeSet: make(map[string]*v1.Node),
-nodeHealthMap: make(map[string]*nodeHealthData),
-nodeExistsInCloudProvider: func(nodeName types.NodeName) (bool, error) {
-return nodeutil.ExistsInCloudProvider(cloud, nodeName)
-},
-nodeShutdownInCloudProvider: func(ctx context.Context, node *v1.Node) (bool, error) {
-return nodeutil.ShutdownInCloudProvider(ctx, cloud, node)
-},
+kubeClient: kubeClient,
+now: metav1.Now,
+knownNodeSet: make(map[string]*v1.Node),
+nodeHealthMap: make(map[string]*nodeHealthData),
 recorder: recorder,
 nodeMonitorPeriod: nodeMonitorPeriod,
 nodeStartupGracePeriod: nodeStartupGracePeriod,
@@ -779,11 +765,6 @@ func (nc *Controller) monitorNodeHealth() error {
 klog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
 }
 }
-// remove shutdown taint this is needed always depending do we use taintbased or not
-err := nc.markNodeAsNotShutdown(node)
-if err != nil {
-klog.Errorf("Failed to remove taints from node %v. Will retry in next iteration.", node.Name)
-}
 }
 // Report node event.
@@ -793,42 +774,6 @@ func (nc *Controller) monitorNodeHealth() error {
 utilruntime.HandleError(fmt.Errorf("Unable to mark all pods NotReady on node %v: %v", node.Name, err))
 }
 }
-// Check with the cloud provider to see if the node still exists. If it
-// doesn't, delete the node immediately.
-if currentReadyCondition.Status != v1.ConditionTrue && nc.cloud != nil {
-// check is node shutdowned, if yes do not deleted it. Instead add taint
-shutdown, err := nc.nodeShutdownInCloudProvider(context.TODO(), node)
-if err != nil {
-klog.Errorf("Error determining if node %v shutdown in cloud: %v", node.Name, err)
-}
-// node shutdown
-if shutdown && err == nil {
-err = controller.AddOrUpdateTaintOnNode(nc.kubeClient, node.Name, controller.ShutdownTaint)
-if err != nil {
-klog.Errorf("Error patching node taints: %v", err)
-}
-continue
-}
-exists, err := nc.nodeExistsInCloudProvider(types.NodeName(node.Name))
-if err != nil {
-klog.Errorf("Error determining if node %v exists in cloud: %v", node.Name, err)
-continue
-}
-if !exists {
-klog.V(2).Infof("Deleting node (no longer present in cloud provider): %s", node.Name)
-nodeutil.RecordNodeEvent(nc.recorder, node.Name, string(node.UID), v1.EventTypeNormal, "DeletingNode", fmt.Sprintf("Deleting Node %v because it's not present according to cloud provider", node.Name))
-go func(nodeName string) {
-defer utilruntime.HandleCrash()
-// Kubelet is not reporting and Cloud Provider says node
-// is gone. Delete it without worrying about grace
-// periods.
-if err := nodeutil.ForcefullyDeleteNode(nc.kubeClient, nodeName); err != nil {
-klog.Errorf("Unable to forcefully delete node %q: %v", nodeName, err)
-}
-}(node.Name)
-}
-}
 }
 }
 nc.handleDisruption(zoneToNodeConditions, nodes)
@@ -1268,17 +1213,6 @@ func (nc *Controller) markNodeAsReachable(node *v1.Node) (bool, error) {
 return nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].Remove(node.Name), nil
 }
-func (nc *Controller) markNodeAsNotShutdown(node *v1.Node) error {
-nc.evictorLock.Lock()
-defer nc.evictorLock.Unlock()
-err := controller.RemoveTaintOffNode(nc.kubeClient, node.Name, node, controller.ShutdownTaint)
-if err != nil {
-klog.Errorf("Failed to remove taint from node %v: %v", node.Name, err)
-return err
-}
-return nil
-}
 // ComputeZoneState returns a slice of NodeReadyConditions for all Nodes in a given zone.
 // The zone is considered:
 // - fullyDisrupted if there're no Ready Nodes,