From fd600ab65c19080104ab5db0511b002397c0ae39 Mon Sep 17 00:00:00 2001
From: gmarek
Date: Tue, 12 Jul 2016 09:38:57 +0200
Subject: [PATCH] Add hooks for cluster health detection

---
 ...{deletion_utils.go => controller_utils.go} |  25 +--
 pkg/controller/node/nodecontroller.go         | 158 +++++++++++-------
 pkg/util/node/node.go                         |  22 +++
 .../priorities/selector_spreading.go          |  26 +--
 4 files changed, 132 insertions(+), 99 deletions(-)
 rename pkg/controller/node/{deletion_utils.go => controller_utils.go} (96%)

diff --git a/pkg/controller/node/deletion_utils.go b/pkg/controller/node/controller_utils.go
similarity index 96%
rename from pkg/controller/node/deletion_utils.go
rename to pkg/controller/node/controller_utils.go
index aec2c9ace61..3d4eb5aa840 100644
--- a/pkg/controller/node/deletion_utils.go
+++ b/pkg/controller/node/controller_utils.go
@@ -22,7 +22,6 @@ import (
     "time"
 
     "k8s.io/kubernetes/pkg/api"
-    "k8s.io/kubernetes/pkg/api/unversioned"
     "k8s.io/kubernetes/pkg/client/cache"
     clientset "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset"
     "k8s.io/kubernetes/pkg/client/record"
@@ -36,6 +35,20 @@ import (
     "github.com/golang/glog"
 )
 
+// This function is expected to get a slice of NodeReadyConditions for all Nodes in a given zone.
+func ComputeZoneState(nodeReadyConditions []*api.NodeCondition) zoneState {
+    seenReady := false
+    for i := range nodeReadyConditions {
+        if nodeReadyConditions[i] != nil && nodeReadyConditions[i].Status == api.ConditionTrue {
+            seenReady = true
+        }
+    }
+    if seenReady {
+        return stateNormal
+    }
+    return stateFullSegmentation
+}
+
 // cleanupOrphanedPods deletes pods that are bound to nodes that don't
 // exist.
 func cleanupOrphanedPods(pods []*api.Pod, nodeStore cache.Store, forcefulDeletePodFunc func(*api.Pod) error) {
@@ -124,16 +137,6 @@ func forcefullyDeleteNode(kubeClient clientset.Interface, nodeName string, force
     return nil
 }
 
-// forceUpdateAllProbeTimes bumps all observed timestamps in saved nodeStatuses to now. This makes
-// all eviction timer to reset.
-func forceUpdateAllProbeTimes(now unversioned.Time, statusData map[string]nodeStatusData) {
-    for k, v := range statusData {
-        v.probeTimestamp = now
-        v.readyTransitionTimestamp = now
-        statusData[k] = v
-    }
-}
-
 // maybeDeleteTerminatingPod non-gracefully deletes pods that are terminating
 // that should not be gracefully terminated.
 func (nc *NodeController) maybeDeleteTerminatingPod(obj interface{}, nodeStore cache.Store, forcefulDeletePodFunc func(*api.Pod) error) {
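[Editorial note, not part of the patch] ComputeZoneState above treats a zone as Normal as soon as any Node in it still reports a Ready condition of True, and as FullSegmentation only when none does. The standalone Go sketch below mirrors that logic with simplified stand-in types (ConditionStatus, NodeCondition and zoneState are illustrative re-declarations here, not the k8s.io/kubernetes/pkg/api definitions):

package main

import "fmt"

// Simplified stand-ins for the api types used by ComputeZoneState.
type ConditionStatus string

const (
    ConditionTrue    ConditionStatus = "True"
    ConditionFalse   ConditionStatus = "False"
    ConditionUnknown ConditionStatus = "Unknown"
)

type NodeCondition struct{ Status ConditionStatus }

type zoneState string

const (
    stateNormal           = zoneState("Normal")
    stateFullSegmentation = zoneState("FullSegmentation")
)

// computeZoneState mirrors the patch: one Ready node is enough to keep the zone Normal.
func computeZoneState(readyConditions []*NodeCondition) zoneState {
    for _, c := range readyConditions {
        if c != nil && c.Status == ConditionTrue {
            return stateNormal
        }
    }
    return stateFullSegmentation
}

func main() {
    healthyZone := []*NodeCondition{{Status: ConditionFalse}, {Status: ConditionTrue}}
    deadZone := []*NodeCondition{{Status: ConditionUnknown}, {Status: ConditionFalse}}
    fmt.Println(computeZoneState(healthyZone)) // Normal
    fmt.Println(computeZoneState(deadZone))    // FullSegmentation
}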
diff --git a/pkg/controller/node/nodecontroller.go b/pkg/controller/node/nodecontroller.go
index 2ac2d1e4c38..5532e97e576 100644
--- a/pkg/controller/node/nodecontroller.go
+++ b/pkg/controller/node/nodecontroller.go
@@ -38,8 +38,8 @@ import (
     "k8s.io/kubernetes/pkg/runtime"
     "k8s.io/kubernetes/pkg/util/flowcontrol"
     "k8s.io/kubernetes/pkg/util/metrics"
+    utilnode "k8s.io/kubernetes/pkg/util/node"
     utilruntime "k8s.io/kubernetes/pkg/util/runtime"
-    "k8s.io/kubernetes/pkg/util/sets"
     "k8s.io/kubernetes/pkg/util/system"
     "k8s.io/kubernetes/pkg/util/wait"
     "k8s.io/kubernetes/pkg/version"
@@ -58,6 +58,14 @@ const (
     nodeEvictionPeriod = 100 * time.Millisecond
 )
 
+type zoneState string
+
+const (
+    stateNormal              = zoneState("Normal")
+    stateFullSegmentation    = zoneState("FullSegmentation")
+    statePartialSegmentation = zoneState("PartialSegmentation")
+)
+
 type nodeStatusData struct {
     probeTimestamp           unversioned.Time
     readyTransitionTimestamp unversioned.Time
@@ -70,7 +78,7 @@ type NodeController struct {
     clusterCIDR             *net.IPNet
     serviceCIDR             *net.IPNet
     deletingPodsRateLimiter flowcontrol.RateLimiter
-    knownNodeSet            sets.String
+    knownNodeSet            map[string]*api.Node
     kubeClient              clientset.Interface
     // Method for easy mocking in unittest.
     lookupIP func(host string) ([]net.IP, error)
@@ -124,11 +132,9 @@ type NodeController struct {
     forcefullyDeletePod       func(*api.Pod) error
     nodeExistsInCloudProvider func(string) (bool, error)
+    computeZoneStateFunc      func(nodeConditions []*api.NodeCondition) zoneState
 
-    // If in network segmentation mode NodeController won't evict Pods from unhealthy Nodes.
-    // It is enabled when all Nodes observed by the NodeController are NotReady and disabled
-    // when NC sees any healthy Node. This is a temporary fix for v1.3.
-    networkSegmentationMode bool
+    zoneStates map[string]zoneState
 }
 
 // NewNodeController returns a new node controller to sync instances from cloudprovider.
@@ -172,7 +178,7 @@ func NewNodeController(
 
     nc := &NodeController{
         cloud:              cloud,
-        knownNodeSet:       make(sets.String),
+        knownNodeSet:       make(map[string]*api.Node),
         kubeClient:         kubeClient,
         recorder:           recorder,
         podEvictionTimeout: podEvictionTimeout,
@@ -191,6 +197,8 @@ func NewNodeController(
         allocateNodeCIDRs:         allocateNodeCIDRs,
         forcefullyDeletePod:       func(p *api.Pod) error { return forcefullyDeletePod(kubeClient, p) },
         nodeExistsInCloudProvider: func(nodeName string) (bool, error) { return nodeExistsInCloudProvider(cloud, nodeName) },
+        computeZoneStateFunc:      ComputeZoneState,
+        zoneStates:                make(map[string]zoneState),
     }
 
     nc.podStore.Indexer, nc.podController = framework.NewIndexerInformer(
@@ -360,31 +368,22 @@ func (nc *NodeController) monitorNodeStatus() error {
     if err != nil {
         return err
     }
-    for _, node := range nodes.Items {
-        if !nc.knownNodeSet.Has(node.Name) {
-            glog.V(1).Infof("NodeController observed a new Node: %#v", node)
-            recordNodeEvent(nc.recorder, node.Name, api.EventTypeNormal, "RegisteredNode", fmt.Sprintf("Registered Node %v in NodeController", node.Name))
-            nc.cancelPodEviction(node.Name)
-            nc.knownNodeSet.Insert(node.Name)
-        }
-    }
-    // If there's a difference between lengths of known Nodes and observed nodes
-    // we must have removed some Node.
-    if len(nc.knownNodeSet) != len(nodes.Items) {
-        observedSet := make(sets.String)
-        for _, node := range nodes.Items {
-            observedSet.Insert(node.Name)
-        }
-        deleted := nc.knownNodeSet.Difference(observedSet)
-        for nodeName := range deleted {
-            glog.V(1).Infof("NodeController observed a Node deletion: %v", nodeName)
-            recordNodeEvent(nc.recorder, nodeName, api.EventTypeNormal, "RemovingNode", fmt.Sprintf("Removing Node %v from NodeController", nodeName))
-            nc.evictPods(nodeName)
-            nc.knownNodeSet.Delete(nodeName)
-        }
+    added, deleted := nc.checkForNodeAddedDeleted(nodes)
+    for i := range added {
+        glog.V(1).Infof("NodeController observed a new Node: %#v", added[i].Name)
+        recordNodeEvent(nc.recorder, added[i].Name, api.EventTypeNormal, "RegisteredNode", fmt.Sprintf("Registered Node %v in NodeController", added[i].Name))
+        nc.cancelPodEviction(added[i])
+        nc.knownNodeSet[added[i].Name] = added[i]
     }
-    seenReady := false
+    for i := range deleted {
+        glog.V(1).Infof("NodeController observed a Node deletion: %v", deleted[i].Name)
+        recordNodeEvent(nc.recorder, deleted[i].Name, api.EventTypeNormal, "RemovingNode", fmt.Sprintf("Removing Node %v from NodeController", deleted[i].Name))
+        nc.evictPods(deleted[i])
+        delete(nc.knownNodeSet, deleted[i].Name)
+    }
+
+    zoneToNodeConditions := map[string][]*api.NodeCondition{}
 
     for i := range nodes.Items {
         var gracePeriod time.Duration
         var observedReadyCondition api.NodeCondition
@@ -407,29 +406,28 @@ func (nc *NodeController) monitorNodeStatus() error {
                 "Skipping - no pods will be evicted.", node.Name)
             continue
         }
+        // We do not treat a master node as a part of the cluster for network segmentation checking.
+        if !system.IsMasterNode(node) {
+            zoneToNodeConditions[utilnode.GetZoneKey(node)] = append(zoneToNodeConditions[utilnode.GetZoneKey(node)], currentReadyCondition)
+        }
 
         decisionTimestamp := nc.now()
-
         if currentReadyCondition != nil {
             // Check eviction timeout against decisionTimestamp
             if observedReadyCondition.Status == api.ConditionFalse &&
                 decisionTimestamp.After(nc.nodeStatusMap[node.Name].readyTransitionTimestamp.Add(nc.podEvictionTimeout)) {
-                if nc.evictPods(node.Name) {
+                if nc.evictPods(node) {
                     glog.V(4).Infof("Evicting pods on node %s: %v is later than %v + %v", node.Name, decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout)
                 }
             }
             if observedReadyCondition.Status == api.ConditionUnknown &&
                 decisionTimestamp.After(nc.nodeStatusMap[node.Name].probeTimestamp.Add(nc.podEvictionTimeout)) {
-                if nc.evictPods(node.Name) {
+                if nc.evictPods(node) {
                     glog.V(4).Infof("Evicting pods on node %s: %v is later than %v + %v", node.Name, decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout-gracePeriod)
                 }
             }
             if observedReadyCondition.Status == api.ConditionTrue {
-                // We do not treat a master node as a part of the cluster for network segmentation checking.
-                if !system.IsMasterNode(node) {
-                    seenReady = true
-                }
-                if nc.cancelPodEviction(node.Name) {
+                if nc.cancelPodEviction(node) {
                     glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
                 }
             }
@@ -468,19 +466,35 @@ func (nc *NodeController) monitorNodeStatus() error {
         }
     }
 
-    // NC don't see any Ready Node. We assume that the network is segmented and Nodes cannot connect to API server and
-    // update their statuses. NC enteres network segmentation mode and cancels all evictions in progress.
-    if !seenReady {
-        nc.networkSegmentationMode = true
-        nc.stopAllPodEvictions()
-        glog.V(2).Info("NodeController is entering network segmentation mode.")
-    } else {
-        if nc.networkSegmentationMode {
-            forceUpdateAllProbeTimes(nc.now(), nc.nodeStatusMap)
-            nc.networkSegmentationMode = false
-            glog.V(2).Info("NodeController exited network segmentation mode.")
+    for k, v := range zoneToNodeConditions {
+        newState := nc.computeZoneStateFunc(v)
+        if newState == nc.zoneStates[k] {
+            continue
         }
+        if newState == stateFullSegmentation {
+            glog.V(2).Infof("NodeController is entering network segmentation mode in zone %v.", k)
+        } else if newState == stateNormal {
+            glog.V(2).Infof("NodeController exited network segmentation mode in zone %v.", k)
+        }
+        for i := range nodes.Items {
+            if utilnode.GetZoneKey(&nodes.Items[i]) == k {
+                if newState == stateFullSegmentation {
+                    // When zone is fully segmented we stop the eviction all together.
+                    nc.cancelPodEviction(&nodes.Items[i])
+                }
+                if newState == stateNormal && nc.zoneStates[k] == stateFullSegmentation {
+                    // When exiting segmentation mode update probe timestamps on all Nodes.
+                    now := nc.now()
+                    v := nc.nodeStatusMap[nodes.Items[i].Name]
+                    v.probeTimestamp = now
+                    v.readyTransitionTimestamp = now
+                    nc.nodeStatusMap[nodes.Items[i].Name] = v
+                }
+            }
+        }
+        nc.zoneStates[k] = newState
     }
+
     return nil
 }
 
@@ -649,15 +663,38 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap
     return gracePeriod, observedReadyCondition, currentReadyCondition, err
 }
 
+func (nc *NodeController) checkForNodeAddedDeleted(nodes *api.NodeList) (added, deleted []*api.Node) {
+    for i := range nodes.Items {
+        if _, has := nc.knownNodeSet[nodes.Items[i].Name]; !has {
+            added = append(added, &nodes.Items[i])
+        }
+    }
+    // If there's a difference between lengths of known Nodes and observed nodes
+    // we must have removed some Node.
+    if len(nc.knownNodeSet)+len(added) != len(nodes.Items) {
+        knowSetCopy := map[string]*api.Node{}
+        for k, v := range nc.knownNodeSet {
+            knowSetCopy[k] = v
+        }
+        for i := range nodes.Items {
+            delete(knowSetCopy, nodes.Items[i].Name)
+        }
+        for i := range knowSetCopy {
+            deleted = append(deleted, knowSetCopy[i])
+        }
+    }
+    return
+}
+
 // cancelPodEviction removes any queued evictions, typically because the node is available again. It
 // returns true if an eviction was queued.
-func (nc *NodeController) cancelPodEviction(nodeName string) bool {
+func (nc *NodeController) cancelPodEviction(node *api.Node) bool {
     nc.evictorLock.Lock()
     defer nc.evictorLock.Unlock()
-    wasDeleting := nc.podEvictor.Remove(nodeName)
-    wasTerminating := nc.terminationEvictor.Remove(nodeName)
+    wasDeleting := nc.podEvictor.Remove(node.Name)
+    wasTerminating := nc.terminationEvictor.Remove(node.Name)
     if wasDeleting || wasTerminating {
-        glog.V(2).Infof("Cancelling pod Eviction on Node: %v", nodeName)
+        glog.V(2).Infof("Cancelling pod Eviction on Node: %v", node.Name)
         return true
     }
     return false
@@ -665,20 +702,11 @@ func (nc *NodeController) cancelPodEviction(nodeName string) bool {
 
 // evictPods queues an eviction for the provided node name, and returns false if the node is already
 // queued for eviction.
-func (nc *NodeController) evictPods(nodeName string) bool {
-    if nc.networkSegmentationMode {
+func (nc *NodeController) evictPods(node *api.Node) bool {
+    if nc.zoneStates[utilnode.GetZoneKey(node)] == stateFullSegmentation {
         return false
     }
     nc.evictorLock.Lock()
     defer nc.evictorLock.Unlock()
-    return nc.podEvictor.Add(nodeName)
-}
-
-// stopAllPodEvictions removes any queued evictions for all Nodes.
-func (nc *NodeController) stopAllPodEvictions() {
-    nc.evictorLock.Lock()
-    defer nc.evictorLock.Unlock()
-    glog.V(3).Infof("Cancelling all pod evictions.")
-    nc.podEvictor.Clear()
-    nc.terminationEvictor.Clear()
+    return nc.podEvictor.Add(node.Name)
 }
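[Editorial note, not part of the patch] The per-zone logic above replaces the old global networkSegmentationMode flag: evictions are suppressed while a Node's zone is in FullSegmentation, and when a zone returns to Normal the probe timestamps of its Nodes are reset so eviction timers start counting from scratch. A minimal, self-contained sketch of that gating, using plain strings for zone keys and simplified stand-ins for the evictor and status map (these names are illustrative only and do not exist in the Kubernetes tree):

package main

import (
    "fmt"
    "time"
)

type zoneState string

const (
    stateNormal           = zoneState("Normal")
    stateFullSegmentation = zoneState("FullSegmentation")
)

// controller is a toy stand-in for NodeController holding only the pieces
// needed to illustrate per-zone eviction gating.
type controller struct {
    zoneStates     map[string]zoneState
    evictionQueue  map[string]bool      // node name -> queued for eviction
    probeTimestamp map[string]time.Time // node name -> last observed probe time
}

// evictNode queues an eviction unless the node's zone is fully segmented,
// mirroring the check added to evictPods in the patch.
func (c *controller) evictNode(nodeName, zone string) bool {
    if c.zoneStates[zone] == stateFullSegmentation {
        return false
    }
    c.evictionQueue[nodeName] = true
    return true
}

// setZoneState applies the transition handling from monitorNodeStatus:
// entering FullSegmentation cancels queued evictions for the zone's nodes,
// and returning to Normal resets their probe timestamps.
func (c *controller) setZoneState(zone string, nodesInZone []string, newState zoneState) {
    old := c.zoneStates[zone]
    if newState == old {
        return
    }
    for _, n := range nodesInZone {
        if newState == stateFullSegmentation {
            delete(c.evictionQueue, n)
        }
        if newState == stateNormal && old == stateFullSegmentation {
            c.probeTimestamp[n] = time.Now()
        }
    }
    c.zoneStates[zone] = newState
}

func main() {
    c := &controller{
        zoneStates:     map[string]zoneState{},
        evictionQueue:  map[string]bool{},
        probeTimestamp: map[string]time.Time{},
    }
    zone := "us-east1:\x00:us-east1-b"
    c.setZoneState(zone, []string{"node-a"}, stateFullSegmentation)
    fmt.Println(c.evictNode("node-a", zone)) // false: zone is fully segmented
    c.setZoneState(zone, []string{"node-a"}, stateNormal)
    fmt.Println(c.evictNode("node-a", zone)) // true: timers restarted, eviction allowed
}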
diff --git a/pkg/util/node/node.go b/pkg/util/node/node.go
index b6b6ab188fc..66f916c94fe 100644
--- a/pkg/util/node/node.go
+++ b/pkg/util/node/node.go
@@ -24,6 +24,7 @@ import (
     "github.com/golang/glog"
 
     "k8s.io/kubernetes/pkg/api"
+    "k8s.io/kubernetes/pkg/api/unversioned"
 )
 
 func GetHostname(hostnameOverride string) string {
@@ -59,3 +60,24 @@ func GetNodeHostIP(node *api.Node) (net.IP, error) {
     }
     return nil, fmt.Errorf("host IP unknown; known addresses: %v", addresses)
 }
+
+// Helper function that builds a string identifier that is unique per failure-zone
+// Returns empty-string for no zone
+func GetZoneKey(node *api.Node) string {
+    labels := node.Labels
+    if labels == nil {
+        return ""
+    }
+
+    region, _ := labels[unversioned.LabelZoneRegion]
+    failureDomain, _ := labels[unversioned.LabelZoneFailureDomain]
+
+    if region == "" && failureDomain == "" {
+        return ""
+    }
+
+    // We include the null character just in case region or failureDomain has a colon
+    // (We do assume there's no null characters in a region or failureDomain)
+    // As a nice side-benefit, the null character is not printed by fmt.Print or glog
+    return region + ":\x00:" + failureDomain
+}
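[Editorial note, not part of the patch] GetZoneKey is moved here essentially verbatim from the scheduler's selector_spreading.go (see the removal in the next file) so the NodeController can group Nodes by failure zone. Below is a self-contained sketch of the same keying scheme; the literal label keys are assumptions standing in for the unversioned.LabelZoneRegion and unversioned.LabelZoneFailureDomain constants:

package main

import "fmt"

// Label keys assumed to match unversioned.LabelZoneRegion and
// unversioned.LabelZoneFailureDomain in the Kubernetes tree of this era.
const (
    labelZoneRegion        = "failure-domain.beta.kubernetes.io/region"
    labelZoneFailureDomain = "failure-domain.beta.kubernetes.io/zone"
)

// zoneKey mirrors GetZoneKey: region and failure domain are joined with a
// ":\x00:" separator so a colon inside either value cannot create collisions.
func zoneKey(labels map[string]string) string {
    if labels == nil {
        return ""
    }
    region := labels[labelZoneRegion]
    failureDomain := labels[labelZoneFailureDomain]
    if region == "" && failureDomain == "" {
        return ""
    }
    return region + ":\x00:" + failureDomain
}

func main() {
    fmt.Printf("%q\n", zoneKey(map[string]string{
        labelZoneRegion:        "us-central1",
        labelZoneFailureDomain: "us-central1-a",
    })) // "us-central1:\x00:us-central1-a"
    fmt.Printf("%q\n", zoneKey(nil)) // ""
}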
diff --git a/plugin/pkg/scheduler/algorithm/priorities/selector_spreading.go b/plugin/pkg/scheduler/algorithm/priorities/selector_spreading.go
index ae80eaa1348..c9494233fac 100644
--- a/plugin/pkg/scheduler/algorithm/priorities/selector_spreading.go
+++ b/plugin/pkg/scheduler/algorithm/priorities/selector_spreading.go
@@ -23,6 +23,7 @@ import (
     "k8s.io/kubernetes/pkg/api"
     "k8s.io/kubernetes/pkg/api/unversioned"
     "k8s.io/kubernetes/pkg/labels"
+    utilnode "k8s.io/kubernetes/pkg/util/node"
     utilruntime "k8s.io/kubernetes/pkg/util/runtime"
     "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
     schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
@@ -54,27 +55,6 @@ func NewSelectorSpreadPriority(podLister algorithm.PodLister, serviceLister algo
     return selectorSpread.CalculateSpreadPriority
 }
 
-// Helper function that builds a string identifier that is unique per failure-zone
-// Returns empty-string for no zone
-func getZoneKey(node *api.Node) string {
-    labels := node.Labels
-    if labels == nil {
-        return ""
-    }
-
-    region, _ := labels[unversioned.LabelZoneRegion]
-    failureDomain, _ := labels[unversioned.LabelZoneFailureDomain]
-
-    if region == "" && failureDomain == "" {
-        return ""
-    }
-
-    // We include the null character just in case region or failureDomain has a colon
-    // (We do assume there's no null characters in a region or failureDomain)
-    // As a nice side-benefit, the null character is not printed by fmt.Print or glog
-    return region + ":\x00:" + failureDomain
-}
-
 // CalculateSpreadPriority spreads pods across hosts and zones, considering pods belonging to the same service or replication controller.
 // When a pod is scheduled, it looks for services or RCs that match the pod, then finds existing pods that match those selectors.
 // It favors nodes that have fewer existing matching pods.
@@ -189,7 +169,7 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, nodeNameToInfo ma
             continue
         }
 
-        zoneId := getZoneKey(node)
+        zoneId := utilnode.GetZoneKey(node)
         if zoneId == "" {
             continue
         }
@@ -220,7 +200,7 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, nodeNameToInfo ma
 
         // If there is zone information present, incorporate it
         if haveZones {
-            zoneId := getZoneKey(node)
+            zoneId := utilnode.GetZoneKey(node)
             if zoneId != "" {
                 zoneScore := maxPriority * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone))
                 fScore = (fScore * (1.0 - zoneWeighting)) + (zoneWeighting * zoneScore)
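[Editorial note, not part of the patch] The zone-aware scoring at the end of CalculateSpreadPriority is unchanged by this patch apart from calling the relocated utilnode.GetZoneKey: zones with fewer matching pods get a higher zone score, which is then blended into the per-node score. For reference, the arithmetic works as in the sketch below; maxPriority = 10 and zoneWeighting = 2/3 are assumed values for the scheduler defaults of this era, not taken from the diff:

package main

import "fmt"

// blendZoneScore mirrors the arithmetic at the end of CalculateSpreadPriority:
// the zone score rewards zones with few matching pods, and is then mixed into
// the per-node score with a fixed weighting.
func blendZoneScore(nodeScore float32, maxCountByZone, countInZone int, maxPriority, zoneWeighting float32) float32 {
    zoneScore := maxPriority * float32(maxCountByZone-countInZone) / float32(maxCountByZone)
    return nodeScore*(1.0-zoneWeighting) + zoneWeighting*zoneScore
}

func main() {
    const (
        maxPriority   = 10.0
        zoneWeighting = 2.0 / 3.0
    )
    // A node scoring 6 on host spreading, in a zone holding 1 of at most 4 matching pods:
    // zoneScore = 10 * (4-1)/4 = 7.5, blended = 6*(1/3) + 7.5*(2/3) = 7.0
    fmt.Printf("%.2f\n", blendZoneScore(6, 4, 1, maxPriority, zoneWeighting)) // 7.00
}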