From d88af7806ca1fff280edfe8241580c06fece3c05 Mon Sep 17 00:00:00 2001 From: gmarek Date: Mon, 6 Feb 2017 13:58:48 +0100 Subject: [PATCH 1/3] NodeController sets NodeTaints instead of deleting Pods --- .../app/controllermanager.go | 1 + .../app/options/options.go | 2 + hack/verify-flags/known-flags.txt | 2 + pkg/apis/componentconfig/types.go | 2 + pkg/controller/controller_utils.go | 17 +- pkg/controller/node/nodecontroller.go | 345 +++++++++++++----- pkg/controller/node/nodecontroller_test.go | 15 +- .../rbac/bootstrappolicy/controller_policy.go | 2 +- test/e2e/framework/util.go | 62 +++- test/e2e/network_partition.go | 180 +++++++++ 10 files changed, 534 insertions(+), 94 deletions(-) diff --git a/cmd/kube-controller-manager/app/controllermanager.go b/cmd/kube-controller-manager/app/controllermanager.go index 26b8efdca0c..b32ea0e4605 100644 --- a/cmd/kube-controller-manager/app/controllermanager.go +++ b/cmd/kube-controller-manager/app/controllermanager.go @@ -444,6 +444,7 @@ func StartControllers(controllers map[string]InitFunc, s *options.CMServer, root int(s.NodeCIDRMaskSize), s.AllocateNodeCIDRs, s.EnableTaintManager, + s.UseTaintBasedEvictions, ) if err != nil { return fmt.Errorf("failed to initialize nodecontroller: %v", err) diff --git a/cmd/kube-controller-manager/app/options/options.go b/cmd/kube-controller-manager/app/options/options.go index e9fd82f4aa2..49bf1d057f6 100644 --- a/cmd/kube-controller-manager/app/options/options.go +++ b/cmd/kube-controller-manager/app/options/options.go @@ -105,6 +105,7 @@ func NewCMServer() *CMServer { ClusterSigningKeyFile: "/etc/kubernetes/ca/ca.key", ReconcilerSyncLoopPeriod: metav1.Duration{Duration: 60 * time.Second}, EnableTaintManager: true, + UseTaintBasedEvictions: false, }, } s.LeaderElection.LeaderElect = true @@ -198,6 +199,7 @@ func (s *CMServer) AddFlags(fs *pflag.FlagSet, allControllers []string, disabled fs.BoolVar(&s.DisableAttachDetachReconcilerSync, "disable-attach-detach-reconcile-sync", false, "Disable volume attach detach reconciler sync. Disabling this may cause volumes to be mismatched with pods. Use wisely.") fs.DurationVar(&s.ReconcilerSyncLoopPeriod.Duration, "attach-detach-reconcile-sync-period", s.ReconcilerSyncLoopPeriod.Duration, "The reconciler sync wait time between volume attach detach. This duration must be larger than one second, and increasing this value from the default may allow for volumes to be mismatched with pods.") fs.BoolVar(&s.EnableTaintManager, "enable-taint-manager", s.EnableTaintManager, "WARNING: Beta feature. If set to true enables NoExecute Taints and will evict all not-tolerating Pod running on Nodes tainted with this kind of Taints.") + fs.BoolVar(&s.UseTaintBasedEvictions, "use-taint-based-evictions", s.UseTaintBasedEvictions, "WARNING: Alpha feature. 
If set to true NodeController will use taints to evict Pods from notReady and unreachable Nodes.") leaderelection.BindFlags(&s.LeaderElection, fs) diff --git a/hack/verify-flags/known-flags.txt b/hack/verify-flags/known-flags.txt index 8b06ca66d01..b0449a7bd2f 100644 --- a/hack/verify-flags/known-flags.txt +++ b/hack/verify-flags/known-flags.txt @@ -639,6 +639,8 @@ upgrade-image upgrade-target use-kubernetes-cluster-service use-service-account-credentials +use-kubernetes-version +use-taint-based-evictions user-whitelist verb verify-only diff --git a/pkg/apis/componentconfig/types.go b/pkg/apis/componentconfig/types.go index 602e928803c..53b0ff3cc7f 100644 --- a/pkg/apis/componentconfig/types.go +++ b/pkg/apis/componentconfig/types.go @@ -794,6 +794,8 @@ type KubeControllerManagerConfiguration struct { // If set to true enables NoExecute Taints and will evict all not-tolerating // Pod running on Nodes tainted with this kind of Taints. EnableTaintManager bool + // If set to true NodeController will use taints to evict Pods from notReady and unreachable Nodes. + UseTaintBasedEvictions bool } // VolumeConfiguration contains *all* enumerated flags meant to configure all volume diff --git a/pkg/controller/controller_utils.go b/pkg/controller/controller_utils.go index 709d377c545..6da788c5f2b 100644 --- a/pkg/controller/controller_utils.go +++ b/pkg/controller/controller_utils.go @@ -873,7 +873,22 @@ func AddOrUpdateTaintOnNode(c clientset.Interface, nodeName string, taint *v1.Ta // RemoveTaintOffNode is for cleaning up taints temporarily added to node, // won't fail if target taint doesn't exist or has been removed. -func RemoveTaintOffNode(c clientset.Interface, nodeName string, taint *v1.Taint) error { +// If passed a node it'll check if there's anything to be done, if taint is not present it won't issue +// any API calls. +func RemoveTaintOffNode(c clientset.Interface, nodeName string, taint *v1.Taint, node *v1.Node) error { + // Short circuit for limiting amout of API calls. + if node != nil { + match := false + for i := range node.Spec.Taints { + if node.Spec.Taints[i].MatchTaint(taint) { + match = true + break + } + } + if !match { + return nil + } + } firstTry := true return clientretry.RetryOnConflict(UpdateTaintBackoff, func() error { var err error diff --git a/pkg/controller/node/nodecontroller.go b/pkg/controller/node/nodecontroller.go index 9ed022bcf87..324015871f3 100644 --- a/pkg/controller/node/nodecontroller.go +++ b/pkg/controller/node/nodecontroller.go @@ -45,6 +45,7 @@ import ( corelisters "k8s.io/kubernetes/pkg/client/listers/core/v1" extensionslisters "k8s.io/kubernetes/pkg/client/listers/extensions/v1beta1" "k8s.io/kubernetes/pkg/cloudprovider" + "k8s.io/kubernetes/pkg/controller" "k8s.io/kubernetes/pkg/util/metrics" utilnode "k8s.io/kubernetes/pkg/util/node" "k8s.io/kubernetes/pkg/util/system" @@ -63,6 +64,16 @@ var ( // The minimum kubelet version for which the nodecontroller // can safely flip pod.Status to NotReady. podStatusReconciliationVersion = utilversion.MustParseSemantic("v1.2.0") + + UnreachableTaintTemplate = &v1.Taint{ + Key: metav1.TaintNodeUnreachable, + Effect: v1.TaintEffectNoExecute, + } + + NotReadyTaintTemplate = &v1.Taint{ + Key: metav1.TaintNodeNotReady, + Effect: v1.TaintEffectNoExecute, + } ) const ( @@ -132,8 +143,10 @@ type NodeController struct { // Lock to access evictor workers evictorLock sync.Mutex // workers that evicts pods from unresponsive nodes. 
- zonePodEvictor map[string]*RateLimitedTimedQueue - podEvictionTimeout time.Duration + zonePodEvictor map[string]*RateLimitedTimedQueue + // workers that are responsible for tainting nodes. + zoneNotReadyOrUnreachableTainer map[string]*RateLimitedTimedQueue + podEvictionTimeout time.Duration // The maximum duration before a pod evicted from a node can be forcefully terminated. maximumGracePeriod time.Duration recorder record.EventRecorder @@ -166,6 +179,10 @@ type NodeController struct { // if set to true NodeController will start TaintManager that will evict Pods from // tainted nodes, if they're not tolerated. runTaintManager bool + + // if set to true NodeController will taint Nodes with 'TaintNodeNotReady' and 'TaintNodeUnreachable' + // taints instead of evicting Pods itself. + useTaintBasedEvictions bool } // NewNodeController returns a new node controller to sync instances from cloudprovider. @@ -190,7 +207,8 @@ func NewNodeController( serviceCIDR *net.IPNet, nodeCIDRMaskSize int, allocateNodeCIDRs bool, - runTaintManager bool) (*NodeController, error) { + runTaintManager bool, + useTaintBasedEvictions bool) (*NodeController, error) { eventBroadcaster := record.NewBroadcaster() recorder := eventBroadcaster.NewRecorder(api.Scheme, clientv1.EventSource{Component: "controllermanager"}) eventBroadcaster.StartLogging(glog.Infof) @@ -216,30 +234,32 @@ func NewNodeController( } nc := &NodeController{ - cloud: cloud, - knownNodeSet: make(map[string]*v1.Node), - kubeClient: kubeClient, - recorder: recorder, - podEvictionTimeout: podEvictionTimeout, - maximumGracePeriod: 5 * time.Minute, - zonePodEvictor: make(map[string]*RateLimitedTimedQueue), - nodeStatusMap: make(map[string]nodeStatusData), - nodeMonitorGracePeriod: nodeMonitorGracePeriod, - nodeMonitorPeriod: nodeMonitorPeriod, - nodeStartupGracePeriod: nodeStartupGracePeriod, - lookupIP: net.LookupIP, - now: metav1.Now, - clusterCIDR: clusterCIDR, - serviceCIDR: serviceCIDR, - allocateNodeCIDRs: allocateNodeCIDRs, - forcefullyDeletePod: func(p *v1.Pod) error { return forcefullyDeletePod(kubeClient, p) }, - nodeExistsInCloudProvider: func(nodeName types.NodeName) (bool, error) { return nodeExistsInCloudProvider(cloud, nodeName) }, - evictionLimiterQPS: evictionLimiterQPS, - secondaryEvictionLimiterQPS: secondaryEvictionLimiterQPS, - largeClusterThreshold: largeClusterThreshold, - unhealthyZoneThreshold: unhealthyZoneThreshold, - zoneStates: make(map[string]zoneState), - runTaintManager: runTaintManager, + cloud: cloud, + knownNodeSet: make(map[string]*v1.Node), + kubeClient: kubeClient, + recorder: recorder, + podEvictionTimeout: podEvictionTimeout, + maximumGracePeriod: 5 * time.Minute, + zonePodEvictor: make(map[string]*RateLimitedTimedQueue), + zoneNotReadyOrUnreachableTainer: make(map[string]*RateLimitedTimedQueue), + nodeStatusMap: make(map[string]nodeStatusData), + nodeMonitorGracePeriod: nodeMonitorGracePeriod, + nodeMonitorPeriod: nodeMonitorPeriod, + nodeStartupGracePeriod: nodeStartupGracePeriod, + lookupIP: net.LookupIP, + now: metav1.Now, + clusterCIDR: clusterCIDR, + serviceCIDR: serviceCIDR, + allocateNodeCIDRs: allocateNodeCIDRs, + forcefullyDeletePod: func(p *v1.Pod) error { return forcefullyDeletePod(kubeClient, p) }, + nodeExistsInCloudProvider: func(nodeName types.NodeName) (bool, error) { return nodeExistsInCloudProvider(cloud, nodeName) }, + evictionLimiterQPS: evictionLimiterQPS, + secondaryEvictionLimiterQPS: secondaryEvictionLimiterQPS, + largeClusterThreshold: largeClusterThreshold, + unhealthyZoneThreshold: 
unhealthyZoneThreshold, + zoneStates: make(map[string]zoneState), + runTaintManager: runTaintManager, + useTaintBasedEvictions: useTaintBasedEvictions && runTaintManager, } nc.enterPartialDisruptionFunc = nc.ReducedQPSFunc nc.enterFullDisruptionFunc = nc.HealthyQPSFunc @@ -426,38 +446,100 @@ func (nc *NodeController) Run() { go nc.taintManager.Run(wait.NeverStop) } - // Managing eviction of nodes: - // When we delete pods off a node, if the node was not empty at the time we then - // queue an eviction watcher. If we hit an error, retry deletion. - go wait.Until(func() { - nc.evictorLock.Lock() - defer nc.evictorLock.Unlock() - for k := range nc.zonePodEvictor { - nc.zonePodEvictor[k].Try(func(value TimedValue) (bool, time.Duration) { - node, err := nc.nodeLister.Get(value.Value) - if apierrors.IsNotFound(err) { - glog.Warningf("Node %v no longer present in nodeLister!", value.Value) - } else if err != nil { - glog.Warningf("Failed to get Node %v from the nodeLister: %v", value.Value, err) - } else { - zone := utilnode.GetZoneKey(node) - EvictionsNumber.WithLabelValues(zone).Inc() - } + if nc.useTaintBasedEvictions { + // Handling taint based evictions. Because we don't want a dedicated logic in TaintManager for NC-originated + // taints and we normally don't rate limit evictions caused by taints, we need to rate limit adding taints. + go wait.Until(func() { + nc.evictorLock.Lock() + defer nc.evictorLock.Unlock() + for k := range nc.zoneNotReadyOrUnreachableTainer { + // Function should return 'false' and a time after which it should be retried, or 'true' if it shouldn't (it succeeded). + nc.zoneNotReadyOrUnreachableTainer[k].Try(func(value TimedValue) (bool, time.Duration) { + node, err := nc.nodeLister.Get(value.Value) + if apierrors.IsNotFound(err) { + glog.Warningf("Node %v no longer present in nodeLister!", value.Value) + return true, 0 + } else if err != nil { + glog.Warningf("Failed to get Node %v from the nodeLister: %v", value.Value, err) + // retry in 50 millisecond + return false, 50 * time.Millisecond + } else { + zone := utilnode.GetZoneKey(node) + EvictionsNumber.WithLabelValues(zone).Inc() + } + _, condition := v1.GetNodeCondition(&node.Status, v1.NodeReady) + // Because we want to mimic NodeStatus.Condition["Ready"] we make "unreachable" and "not ready" taints mutually exclusive. + taintToAdd := v1.Taint{} + oppositeTaint := v1.Taint{} + if condition.Status == v1.ConditionFalse { + taintToAdd = *NotReadyTaintTemplate + oppositeTaint = *UnreachableTaintTemplate + } else if condition.Status == v1.ConditionUnknown { + taintToAdd = *UnreachableTaintTemplate + oppositeTaint = *NotReadyTaintTemplate + } else { + // It seems that the Node is ready again, so there's no need to taint it. 
+ return true, 0 + } - nodeUid, _ := value.UID.(string) - remaining, err := deletePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, nc.daemonSetStore) - if err != nil { - utilruntime.HandleError(fmt.Errorf("unable to evict node %q: %v", value.Value, err)) - return false, 0 - } - - if remaining { - glog.Infof("Pods awaiting deletion due to NodeController eviction") - } - return true, 0 - }) - } - }, nodeEvictionPeriod, wait.NeverStop) + taintToAdd.TimeAdded = metav1.Now() + err = controller.AddOrUpdateTaintOnNode(nc.kubeClient, value.Value, &taintToAdd) + if err != nil { + utilruntime.HandleError( + fmt.Errorf( + "unable to taint %v unresponsive Node %q: %v", + taintToAdd.Key, + value.Value, + err)) + return false, 0 + } + err = controller.RemoveTaintOffNode(nc.kubeClient, value.Value, &oppositeTaint, node) + if err != nil { + utilruntime.HandleError( + fmt.Errorf( + "unable to remove %v unneeded taint from unresponsive Node %q: %v", + oppositeTaint.Key, + value.Value, + err)) + return false, 0 + } + return true, 0 + }) + } + }, nodeEvictionPeriod, wait.NeverStop) + } else { + // Managing eviction of nodes: + // When we delete pods off a node, if the node was not empty at the time we then + // queue an eviction watcher. If we hit an error, retry deletion. + go wait.Until(func() { + nc.evictorLock.Lock() + defer nc.evictorLock.Unlock() + for k := range nc.zonePodEvictor { + // Function should return 'false' and a time after which it should be retried, or 'true' if it shouldn't (it succeeded). + nc.zonePodEvictor[k].Try(func(value TimedValue) (bool, time.Duration) { + node, err := nc.nodeLister.Get(value.Value) + if apierrors.IsNotFound(err) { + glog.Warningf("Node %v no longer present in nodeLister!", value.Value) + } else if err != nil { + glog.Warningf("Failed to get Node %v from the nodeLister: %v", value.Value, err) + } else { + zone := utilnode.GetZoneKey(node) + EvictionsNumber.WithLabelValues(zone).Inc() + } + nodeUid, _ := value.UID.(string) + remaining, err := deletePods(nc.kubeClient, nc.recorder, value.Value, nodeUid, nc.daemonSetStore) + if err != nil { + utilruntime.HandleError(fmt.Errorf("unable to evict node %q: %v", value.Value, err)) + return false, 0 + } + if remaining { + glog.Infof("Pods awaiting deletion due to NodeController eviction") + } + return true, 0 + }) + } + }, nodeEvictionPeriod, wait.NeverStop) + } }() } @@ -478,15 +560,26 @@ func (nc *NodeController) monitorNodeStatus() error { nc.knownNodeSet[added[i].Name] = added[i] // When adding new Nodes we need to check if new zone appeared, and if so add new evictor. zone := utilnode.GetZoneKey(added[i]) - if _, found := nc.zonePodEvictor[zone]; !found { - nc.zonePodEvictor[zone] = - NewRateLimitedTimedQueue( - flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, evictionRateLimiterBurst)) + if _, found := nc.zoneStates[zone]; !found { + nc.zoneStates[zone] = stateInitial + if !nc.useTaintBasedEvictions { + nc.zonePodEvictor[zone] = + NewRateLimitedTimedQueue( + flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, evictionRateLimiterBurst)) + } else { + nc.zoneNotReadyOrUnreachableTainer[zone] = + NewRateLimitedTimedQueue( + flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, evictionRateLimiterBurst)) + } // Init the metric for the new zone. 
glog.Infof("Initializing eviction metric for zone: %v", zone) EvictionsNumber.WithLabelValues(zone).Add(0) } - nc.cancelPodEviction(added[i]) + if nc.useTaintBasedEvictions { + nc.markNodeAsHealthy(added[i]) + } else { + nc.cancelPodEviction(added[i]) + } } for i := range deleted { @@ -532,21 +625,61 @@ func (nc *NodeController) monitorNodeStatus() error { decisionTimestamp := nc.now() if currentReadyCondition != nil { // Check eviction timeout against decisionTimestamp - if observedReadyCondition.Status == v1.ConditionFalse && - decisionTimestamp.After(nc.nodeStatusMap[node.Name].readyTransitionTimestamp.Add(nc.podEvictionTimeout)) { - if nc.evictPods(node) { - glog.V(2).Infof("Evicting pods on node %s: %v is later than %v + %v", node.Name, decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout) + if observedReadyCondition.Status == v1.ConditionFalse { + if nc.useTaintBasedEvictions { + if nc.markNodeForTainting(node) { + glog.V(2).Infof("Tainting Node %v with NotReady taint on %v", + node.Name, + decisionTimestamp, + ) + } + } else { + if decisionTimestamp.After(nc.nodeStatusMap[node.Name].readyTransitionTimestamp.Add(nc.podEvictionTimeout)) { + if nc.evictPods(node) { + glog.V(2).Infof("Evicting pods on node %s: %v is later than %v + %v", + node.Name, + decisionTimestamp, + nc.nodeStatusMap[node.Name].readyTransitionTimestamp, + nc.podEvictionTimeout, + ) + } + } } } - if observedReadyCondition.Status == v1.ConditionUnknown && - decisionTimestamp.After(nc.nodeStatusMap[node.Name].probeTimestamp.Add(nc.podEvictionTimeout)) { - if nc.evictPods(node) { - glog.V(2).Infof("Evicting pods on node %s: %v is later than %v + %v", node.Name, decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout-gracePeriod) + if observedReadyCondition.Status == v1.ConditionUnknown { + if nc.useTaintBasedEvictions { + if nc.markNodeForTainting(node) { + glog.V(2).Infof("Tainting Node %v with NotReady taint on %v", + node.Name, + decisionTimestamp, + ) + } + } else { + if decisionTimestamp.After(nc.nodeStatusMap[node.Name].probeTimestamp.Add(nc.podEvictionTimeout)) { + if nc.evictPods(node) { + glog.V(2).Infof("Evicting pods on node %s: %v is later than %v + %v", + node.Name, + decisionTimestamp, + nc.nodeStatusMap[node.Name].readyTransitionTimestamp, + nc.podEvictionTimeout-gracePeriod, + ) + } + } } } if observedReadyCondition.Status == v1.ConditionTrue { - if nc.cancelPodEviction(node) { - glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name) + if nc.useTaintBasedEvictions { + removed, err := nc.markNodeAsHealthy(node) + if err != nil { + glog.Errorf("Failed to remove taints from node %v. Will retry in next iteration.", node.Name) + } + if removed { + glog.V(2).Infof("Node %s is healthy again, removing all taints", node.Name) + } + } else { + if nc.cancelPodEviction(node) { + glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name) + } } } @@ -600,6 +733,7 @@ func (nc *NodeController) handleDisruption(zoneToNodeConditions map[string][]*v1 } newZoneStates[k] = newState if _, had := nc.zoneStates[k]; !had { + glog.Errorf("Setting initial state for unseen zone: %v", k) nc.zoneStates[k] = stateInitial } } @@ -629,11 +763,22 @@ func (nc *NodeController) handleDisruption(zoneToNodeConditions map[string][]*v1 if allAreFullyDisrupted { glog.V(0).Info("NodeController detected that all Nodes are not-Ready. 
Entering master disruption mode.") for i := range nodes { - nc.cancelPodEviction(nodes[i]) + if nc.useTaintBasedEvictions { + _, err := nc.markNodeAsHealthy(nodes[i]) + if err != nil { + glog.Errorf("Failed to remove taints from Node %v", nodes[i].Name) + } + } else { + nc.cancelPodEviction(nodes[i]) + } } // We stop all evictions. - for k := range nc.zonePodEvictor { - nc.zonePodEvictor[k].SwapLimiter(0) + for k := range nc.zoneStates { + if nc.useTaintBasedEvictions { + nc.zoneNotReadyOrUnreachableTainer[k].SwapLimiter(0) + } else { + nc.zonePodEvictor[k].SwapLimiter(0) + } } for k := range nc.zoneStates { nc.zoneStates[k] = stateFullDisruption @@ -653,7 +798,7 @@ func (nc *NodeController) handleDisruption(zoneToNodeConditions map[string][]*v1 nc.nodeStatusMap[nodes[i].Name] = v } // We reset all rate limiters to settings appropriate for the given state. - for k := range nc.zonePodEvictor { + for k := range nc.zoneStates { nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newZoneStates[k]) nc.zoneStates[k] = newZoneStates[k] } @@ -676,13 +821,27 @@ func (nc *NodeController) handleDisruption(zoneToNodeConditions map[string][]*v1 func (nc *NodeController) setLimiterInZone(zone string, zoneSize int, state zoneState) { switch state { case stateNormal: - nc.zonePodEvictor[zone].SwapLimiter(nc.evictionLimiterQPS) + if nc.useTaintBasedEvictions { + nc.zoneNotReadyOrUnreachableTainer[zone].SwapLimiter(nc.evictionLimiterQPS) + } else { + nc.zonePodEvictor[zone].SwapLimiter(nc.evictionLimiterQPS) + } case statePartialDisruption: - nc.zonePodEvictor[zone].SwapLimiter( - nc.enterPartialDisruptionFunc(zoneSize)) + if nc.useTaintBasedEvictions { + nc.zoneNotReadyOrUnreachableTainer[zone].SwapLimiter( + nc.enterPartialDisruptionFunc(zoneSize)) + } else { + nc.zonePodEvictor[zone].SwapLimiter( + nc.enterPartialDisruptionFunc(zoneSize)) + } case stateFullDisruption: - nc.zonePodEvictor[zone].SwapLimiter( - nc.enterFullDisruptionFunc(zoneSize)) + if nc.useTaintBasedEvictions { + nc.zoneNotReadyOrUnreachableTainer[zone].SwapLimiter( + nc.enterFullDisruptionFunc(zoneSize)) + } else { + nc.zonePodEvictor[zone].SwapLimiter( + nc.enterFullDisruptionFunc(zoneSize)) + } } } @@ -898,6 +1057,28 @@ func (nc *NodeController) evictPods(node *v1.Node) bool { return nc.zonePodEvictor[utilnode.GetZoneKey(node)].Add(node.Name, string(node.UID)) } +func (nc *NodeController) markNodeForTainting(node *v1.Node) bool { + nc.evictorLock.Lock() + defer nc.evictorLock.Unlock() + return nc.zoneNotReadyOrUnreachableTainer[utilnode.GetZoneKey(node)].Add(node.Name, string(node.UID)) +} + +func (nc *NodeController) markNodeAsHealthy(node *v1.Node) (bool, error) { + nc.evictorLock.Lock() + defer nc.evictorLock.Unlock() + err := controller.RemoveTaintOffNode(nc.kubeClient, node.Name, UnreachableTaintTemplate, node) + if err != nil { + glog.Errorf("Failed to remove taint from node %v: %v", node.Name, err) + return false, err + } + err = controller.RemoveTaintOffNode(nc.kubeClient, node.Name, NotReadyTaintTemplate, node) + if err != nil { + glog.Errorf("Failed to remove taint from node %v: %v", node.Name, err) + return false, err + } + return nc.zoneNotReadyOrUnreachableTainer[utilnode.GetZoneKey(node)].Remove(node.Name), nil +} + // Default value for cluster eviction rate - we take nodeNum for consistency with ReducedQPSFunc. 
func (nc *NodeController) HealthyQPSFunc(nodeNum int) float32 { return nc.evictionLimiterQPS diff --git a/pkg/controller/node/nodecontroller_test.go b/pkg/controller/node/nodecontroller_test.go index 4c183fa603b..45f3960b802 100644 --- a/pkg/controller/node/nodecontroller_test.go +++ b/pkg/controller/node/nodecontroller_test.go @@ -100,6 +100,7 @@ func NewNodeControllerFromClient( nodeCIDRMaskSize, allocateNodeCIDRs, false, + false, ) if err != nil { return nil, err @@ -573,11 +574,15 @@ func TestMonitorNodeStatusEvictPods(t *testing.T) { } zones := testutil.GetZones(item.fakeNodeHandler) for _, zone := range zones { - nodeController.zonePodEvictor[zone].Try(func(value TimedValue) (bool, time.Duration) { - nodeUid, _ := value.UID.(string) - deletePods(item.fakeNodeHandler, nodeController.recorder, value.Value, nodeUid, nodeController.daemonSetInformer.Lister()) - return true, 0 - }) + if _, ok := nodeController.zonePodEvictor[zone]; ok { + nodeController.zonePodEvictor[zone].Try(func(value TimedValue) (bool, time.Duration) { + nodeUid, _ := value.UID.(string) + deletePods(item.fakeNodeHandler, nodeController.recorder, value.Value, nodeUid, nodeController.daemonSetInformer.Lister()) + return true, 0 + }) + } else { + t.Fatalf("Zone %v was unitialized!", zone) + } } podEvicted := false diff --git a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/controller_policy.go b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/controller_policy.go index f214bf48e5c..771b6a18311 100644 --- a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/controller_policy.go +++ b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/controller_policy.go @@ -167,7 +167,7 @@ func init() { addControllerRole(rbac.ClusterRole{ ObjectMeta: metav1.ObjectMeta{Name: saRolePrefix + "node-controller"}, Rules: []rbac.PolicyRule{ - rbac.NewRule("get", "list", "update", "delete").Groups(legacyGroup).Resources("nodes").RuleOrDie(), + rbac.NewRule("get", "list", "update", "delete", "patch").Groups(legacyGroup).Resources("nodes").RuleOrDie(), rbac.NewRule("update").Groups(legacyGroup).Resources("nodes/status").RuleOrDie(), // used for pod eviction rbac.NewRule("update").Groups(legacyGroup).Resources("pods/status").RuleOrDie(), diff --git a/test/e2e/framework/util.go b/test/e2e/framework/util.go index cbaaceb795a..42384bee790 100644 --- a/test/e2e/framework/util.go +++ b/test/e2e/framework/util.go @@ -82,6 +82,7 @@ import ( gcecloud "k8s.io/kubernetes/pkg/cloudprovider/providers/gce" "k8s.io/kubernetes/pkg/controller" deploymentutil "k8s.io/kubernetes/pkg/controller/deployment/util" + nodectlr "k8s.io/kubernetes/pkg/controller/node" "k8s.io/kubernetes/pkg/kubectl" "k8s.io/kubernetes/pkg/kubelet/util/format" "k8s.io/kubernetes/pkg/master/ports" @@ -774,7 +775,7 @@ func WaitForMatchPodsCondition(c clientset.Interface, opts metav1.ListOptions, d if len(conditionNotMatch) <= 0 { return err } - Logf("%d pods are not %s", len(conditionNotMatch), desc) + Logf("%d pods are not %s: %v", len(conditionNotMatch), desc, conditionNotMatch) } return fmt.Errorf("gave up waiting for matching pods to be '%s' after %v", desc, timeout) } @@ -2496,7 +2497,7 @@ func ExpectNodeHasLabel(c clientset.Interface, nodeName string, labelKey string, } func RemoveTaintOffNode(c clientset.Interface, nodeName string, taint v1.Taint) { - ExpectNoError(controller.RemoveTaintOffNode(c, nodeName, &taint)) + ExpectNoError(controller.RemoveTaintOffNode(c, nodeName, &taint, nil)) VerifyThatTaintIsGone(c, nodeName, &taint) } @@ -2525,14 +2526,24 @@ func VerifyThatTaintIsGone(c 
clientset.Interface, nodeName string, taint *v1.Tai func ExpectNodeHasTaint(c clientset.Interface, nodeName string, taint *v1.Taint) { By("verifying the node has the taint " + taint.ToString()) + if has, err := NodeHasTaint(c, nodeName, taint); !has { + ExpectNoError(err) + Failf("Failed to find taint %s on node %s", taint.ToString(), nodeName) + } +} + +func NodeHasTaint(c clientset.Interface, nodeName string, taint *v1.Taint) (bool, error) { node, err := c.Core().Nodes().Get(nodeName, metav1.GetOptions{}) - ExpectNoError(err) + if err != nil { + return false, err + } nodeTaints := node.Spec.Taints if len(nodeTaints) == 0 || !v1.TaintExists(nodeTaints, taint) { - Failf("Failed to find taint %s on node %s", taint.ToString(), nodeName) + return false, nil } + return true, nil } func getScalerForKind(internalClientset internalclientset.Interface, kind schema.GroupKind) (kubectl.Scaler, error) { @@ -3961,7 +3972,47 @@ func isNodeConditionSetAsExpected(node *v1.Node, conditionType v1.NodeConditionT for _, cond := range node.Status.Conditions { // Ensure that the condition type and the status matches as desired. if cond.Type == conditionType { - if (cond.Status == v1.ConditionTrue) == wantTrue { + // For NodeReady condition we need to check Taints as well + if cond.Type == v1.NodeReady { + hasNodeControllerTaints := false + // For NodeReady we need to check if Taints are gone as well + taints := node.Spec.Taints + for _, taint := range taints { + if taint.MatchTaint(nodectlr.UnreachableTaintTemplate) || taint.MatchTaint(nodectlr.NotReadyTaintTemplate) { + hasNodeControllerTaints = true + break + } + } + if wantTrue { + if (cond.Status == v1.ConditionTrue) && !hasNodeControllerTaints { + return true + } else { + msg := "" + if !hasNodeControllerTaints { + msg = fmt.Sprintf("Condition %s of node %s is %v instead of %t. Reason: %v, message: %v", + conditionType, node.Name, cond.Status == v1.ConditionTrue, wantTrue, cond.Reason, cond.Message) + } else { + msg = fmt.Sprintf("Condition %s of node %s is %v, but Node is tainted by NodeController with %v. Failure", + conditionType, node.Name, cond.Status == v1.ConditionTrue, taints) + } + if !silent { + Logf(msg) + } + return false + } + } else { + // TODO: check if the Node is tainted once we enable NC notReady/unreachable taints by default + if cond.Status != v1.ConditionTrue { + return true + } + if !silent { + Logf("Condition %s of node %s is %v instead of %t. 
Reason: %v, message: %v", + conditionType, node.Name, cond.Status == v1.ConditionTrue, wantTrue, cond.Reason, cond.Message) + } + return false + } + } + if (wantTrue && (cond.Status == v1.ConditionTrue)) || (!wantTrue && (cond.Status != v1.ConditionTrue)) { return true } else { if !silent { @@ -3971,6 +4022,7 @@ func isNodeConditionSetAsExpected(node *v1.Node, conditionType v1.NodeConditionT return false } } + } if !silent { Logf("Couldn't find condition %v on node %v", conditionType, node.Name) diff --git a/test/e2e/network_partition.go b/test/e2e/network_partition.go index 421fe431862..480d6551f9d 100644 --- a/test/e2e/network_partition.go +++ b/test/e2e/network_partition.go @@ -27,10 +27,13 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/apimachinery/pkg/watch" + "k8s.io/client-go/tools/cache" + "k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api/v1" "k8s.io/kubernetes/pkg/client/clientset_generated/clientset" + nodepkg "k8s.io/kubernetes/pkg/controller/node" "k8s.io/kubernetes/test/e2e/framework" testutils "k8s.io/kubernetes/test/utils" @@ -476,4 +479,181 @@ var _ = framework.KubeDescribe("Network Partition [Disruptive] [Slow]", func() { } }) }) + + framework.KubeDescribe("Pods", func() { + Context("should be evicted from unready Node", func() { + BeforeEach(func() { + framework.SkipUnlessProviderIs("gce", "gke", "aws") + framework.SkipUnlessNodeCountIsAtLeast(2) + }) + + // What happens in this test: + // Network traffic from a node to master is cut off to simulate network partition + // Expect to observe: + // 1. Node is marked NotReady after timeout by nodecontroller (40seconds) + // 2. All pods on node are marked NotReady shortly after #1 + // 3. After enough time passess all Pods are evicted from the given Node + It("[Feature:TaintEviction] All pods on the unreachable node should be marked as NotReady upon the node turn NotReady "+ + "AND all pods should be evicted after eviction timeout passes", func() { + By("choose a node - we will block all network traffic on this node") + var podOpts metav1.ListOptions + nodes := framework.GetReadySchedulableNodesOrDie(c) + framework.FilterNodes(nodes, func(node v1.Node) bool { + if !framework.IsNodeConditionSetAsExpected(&node, v1.NodeReady, true) { + return false + } + podOpts = metav1.ListOptions{FieldSelector: fields.OneTermEqualSelector(api.PodHostField, node.Name).String()} + pods, err := c.Core().Pods(metav1.NamespaceAll).List(podOpts) + if err != nil || len(pods.Items) <= 0 { + return false + } + return true + }) + if len(nodes.Items) <= 0 { + framework.Failf("No eligible node were found: %d", len(nodes.Items)) + } + node := nodes.Items[0] + podOpts = metav1.ListOptions{FieldSelector: fields.OneTermEqualSelector(api.PodHostField, node.Name).String()} + if err := framework.WaitForMatchPodsCondition(c, podOpts, "Running and Ready", podReadyTimeout, testutils.PodRunningReadyOrSucceeded); err != nil { + framework.Failf("Pods on node %s are not ready and running within %v: %v", node.Name, podReadyTimeout, err) + } + pods, err := c.Core().Pods(metav1.NamespaceAll).List(podOpts) + framework.ExpectNoError(err) + podTolerationTimes := map[string]time.Duration{} + // This test doesn't add tolerations by itself, but because they may be present in the cluster + // it needs to account for that. 
+ for _, pod := range pods.Items { + namespacedName := fmt.Sprintf("%v/%v", pod.Namespace, pod.Name) + tolerations := pod.Spec.Tolerations + framework.ExpectNoError(err) + for _, toleration := range tolerations { + if toleration.ToleratesTaint(nodepkg.UnreachableTaintTemplate) { + if toleration.TolerationSeconds != nil { + podTolerationTimes[namespacedName] = time.Duration(*toleration.TolerationSeconds) * time.Second + break + } else { + podTolerationTimes[namespacedName] = -1 + } + } + } + if _, ok := podTolerationTimes[namespacedName]; !ok { + podTolerationTimes[namespacedName] = 0 + } + } + neverEvictedPods := []string{} + maxTolerationTime := time.Duration(0) + for podName, tolerationTime := range podTolerationTimes { + if tolerationTime < 0 { + neverEvictedPods = append(neverEvictedPods, podName) + } else { + if tolerationTime > maxTolerationTime { + maxTolerationTime = tolerationTime + } + } + } + framework.Logf( + "Only %v should be running after partition. Maximum TolerationSeconds among other Pods is %v", + neverEvictedPods, + maxTolerationTime, + ) + + By("Set up watch on node status") + nodeSelector := fields.OneTermEqualSelector("metadata.name", node.Name) + stopCh := make(chan struct{}) + newNode := make(chan *v1.Node) + var controller cache.Controller + _, controller = cache.NewInformer( + &cache.ListWatch{ + ListFunc: func(options metav1.ListOptions) (runtime.Object, error) { + options.FieldSelector = nodeSelector.String() + obj, err := f.ClientSet.Core().Nodes().List(options) + return runtime.Object(obj), err + }, + WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) { + options.FieldSelector = nodeSelector.String() + return f.ClientSet.Core().Nodes().Watch(options) + }, + }, + &v1.Node{}, + 0, + cache.ResourceEventHandlerFuncs{ + UpdateFunc: func(oldObj, newObj interface{}) { + n, ok := newObj.(*v1.Node) + Expect(ok).To(Equal(true)) + newNode <- n + + }, + }, + ) + + defer func() { + // Will not explicitly close newNode channel here due to + // race condition where stopCh and newNode are closed but informer onUpdate still executes. 
+ close(stopCh) + }() + go controller.Run(stopCh) + + By(fmt.Sprintf("Block traffic from node %s to the master", node.Name)) + host := framework.GetNodeExternalIP(&node) + master := framework.GetMasterAddress(c) + defer func() { + By(fmt.Sprintf("Unblock traffic from node %s to the master", node.Name)) + framework.UnblockNetwork(host, master) + + if CurrentGinkgoTestDescription().Failed { + return + } + + By("Expect to observe node status change from NotReady to Ready after network connectivity recovers") + expectNodeReadiness(true, newNode) + }() + + framework.BlockNetwork(host, master) + + By("Expect to observe node and pod status change from Ready to NotReady after network partition") + expectNodeReadiness(false, newNode) + framework.ExpectNoError(wait.Poll(1*time.Second, timeout, func() (bool, error) { + return framework.NodeHasTaint(c, node.Name, nodepkg.UnreachableTaintTemplate) + })) + if err = framework.WaitForMatchPodsCondition(c, podOpts, "NotReady", podNotReadyTimeout, testutils.PodNotReady); err != nil { + framework.Failf("Pods on node %s did not become NotReady within %v: %v", node.Name, podNotReadyTimeout, err) + } + + sleepTime := maxTolerationTime + 20*time.Second + By(fmt.Sprintf("Sleeping for %v and checking if all Pods were evicted", sleepTime)) + time.Sleep(sleepTime) + pods, err = c.Core().Pods(v1.NamespaceAll).List(podOpts) + framework.ExpectNoError(err) + seenRunning := []string{} + for _, pod := range pods.Items { + namespacedName := fmt.Sprintf("%v/%v", pod.Namespace, pod.Name) + shouldBeTerminating := true + for _, neverEvictedPod := range neverEvictedPods { + if neverEvictedPod == namespacedName { + shouldBeTerminating = false + } + } + if pod.DeletionTimestamp == nil { + seenRunning = append(seenRunning, namespacedName) + if shouldBeTerminating { + framework.Failf("Pod %v should have been deleted but was seen running", namespacedName) + } + } + } + + for _, neverEvictedPod := range neverEvictedPods { + running := false + for _, runningPod := range seenRunning { + if runningPod == neverEvictedPod { + running = true + break + } + } + if !running { + framework.Failf("Pod %v was evicted even though it shouldn't", neverEvictedPod) + } + } + }) + }) + }) }) From 6637592b1de6522b197f65202b6830888626ab28 Mon Sep 17 00:00:00 2001 From: gmarek Date: Wed, 8 Feb 2017 14:05:00 +0100 Subject: [PATCH 2/3] generated --- pkg/controller/node/BUILD | 1 + .../rbac/bootstrappolicy/testdata/controller-roles.yaml | 1 + test/e2e/BUILD | 1 + test/e2e/framework/BUILD | 1 + 4 files changed, 4 insertions(+) diff --git a/pkg/controller/node/BUILD b/pkg/controller/node/BUILD index 7e9bfab94c8..a236942359a 100644 --- a/pkg/controller/node/BUILD +++ b/pkg/controller/node/BUILD @@ -31,6 +31,7 @@ go_library( "//pkg/client/listers/core/v1:go_default_library", "//pkg/client/listers/extensions/v1beta1:go_default_library", "//pkg/cloudprovider:go_default_library", + "//pkg/controller:go_default_library", "//pkg/kubelet/util/format:go_default_library", "//pkg/util/metrics:go_default_library", "//pkg/util/node:go_default_library", diff --git a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/controller-roles.yaml b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/controller-roles.yaml index 4f486535d0f..63b37380bbd 100644 --- a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/controller-roles.yaml +++ b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/controller-roles.yaml @@ -549,6 +549,7 @@ items: - delete - get - list + - patch - update - apiGroups: - "" diff --git 
a/test/e2e/BUILD b/test/e2e/BUILD index b44e288dd9d..d6a4c3c4682 100644 --- a/test/e2e/BUILD +++ b/test/e2e/BUILD @@ -133,6 +133,7 @@ go_library( "//pkg/controller/deployment/util:go_default_library", "//pkg/controller/endpoint:go_default_library", "//pkg/controller/job:go_default_library", + "//pkg/controller/node:go_default_library", "//pkg/controller/replicaset:go_default_library", "//pkg/controller/replication:go_default_library", "//pkg/kubectl:go_default_library", diff --git a/test/e2e/framework/BUILD b/test/e2e/framework/BUILD index 4bc7f8011fd..1ab6ea32bf2 100644 --- a/test/e2e/framework/BUILD +++ b/test/e2e/framework/BUILD @@ -57,6 +57,7 @@ go_library( "//pkg/cloudprovider/providers/gce:go_default_library", "//pkg/controller:go_default_library", "//pkg/controller/deployment/util:go_default_library", + "//pkg/controller/node:go_default_library", "//pkg/kubectl:go_default_library", "//pkg/kubelet/api/v1alpha1/stats:go_default_library", "//pkg/kubelet/metrics:go_default_library", From f9d6086217f36bcea6024115f81642d6dfdcaf8b Mon Sep 17 00:00:00 2001 From: gmarek Date: Thu, 23 Feb 2017 14:58:52 +0100 Subject: [PATCH 3/3] Fix leftover Taint-related helper function --- pkg/controller/controller_utils.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/controller/controller_utils.go b/pkg/controller/controller_utils.go index 6da788c5f2b..1bf2848ba8c 100644 --- a/pkg/controller/controller_utils.go +++ b/pkg/controller/controller_utils.go @@ -922,7 +922,7 @@ func PatchNodeTaints(c clientset.Interface, nodeName string, oldNode *v1.Node, n return fmt.Errorf("failed to marshal old node %#v for node %q: %v", oldNode, nodeName, err) } - newAnnotations := newNode.Annotations + newTaints := newNode.Spec.Taints objCopy, err := api.Scheme.DeepCopy(oldNode) if err != nil { return fmt.Errorf("failed to copy node object %#v: %v", oldNode, err) @@ -931,7 +931,7 @@ func PatchNodeTaints(c clientset.Interface, nodeName string, oldNode *v1.Node, n if !ok { return fmt.Errorf("failed to cast copy onto node object %#v: %v", newNode, err) } - newNode.Annotations = newAnnotations + newNode.Spec.Taints = newTaints newData, err := json.Marshal(newNode) if err != nil { return fmt.Errorf("failed to marshal new node %#v for node %q: %v", newNode, nodeName, err)
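
A note for reviewers trying the feature out (not part of the patch): with --enable-taint-manager and --use-taint-based-evictions both set, NodeController adds the NoExecute taints defined by UnreachableTaintTemplate / NotReadyTaintTemplate instead of deleting Pods directly, and the TaintManager then evicts any Pod that does not tolerate them. The sketch below mirrors the accounting the new network_partition.go test performs: it builds a toleration for the "unreachable" taint and derives the effective eviction delay. The package main wrapper and the TolerationOpExists operator are illustrative assumptions; the import paths and the ToleratesTaint / UnreachableTaintTemplate helpers are the ones already used in this patch.

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/api/v1"
	nodepkg "k8s.io/kubernetes/pkg/controller/node"
)

func main() {
	// A Pod tolerating the unreachable NoExecute taint for 60s is evicted by the
	// TaintManager only after that window passes; with a nil TolerationSeconds it
	// would never be evicted for this taint (the e2e test tracks such Pods as
	// "neverEvictedPods").
	sixty := int64(60)
	tolerations := []v1.Toleration{{
		Key:               nodepkg.UnreachableTaintTemplate.Key,
		Operator:          v1.TolerationOpExists,
		Effect:            v1.TaintEffectNoExecute,
		TolerationSeconds: &sixty,
	}}

	for _, t := range tolerations {
		if !t.ToleratesTaint(nodepkg.UnreachableTaintTemplate) {
			continue
		}
		if t.TolerationSeconds != nil {
			fmt.Printf("pod is evicted ~%ds after the node is tainted unreachable\n", *t.TolerationSeconds)
		} else {
			fmt.Println("pod is never evicted for the unreachable taint")
		}
	}
}

A Pod with no matching toleration is evicted as soon as the taint lands (subject only to the rate limiting applied when adding taints), which is why the default stays --use-taint-based-evictions=false while the feature is alpha.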