NodeLifecycleController treats node lease renewal as a heartbeat signal

This commit is contained in:
Zhen Wang
2018-10-01 11:32:56 -07:00
parent cda667e217
commit e35d808aa2
5 changed files with 715 additions and 32 deletions

View File

@@ -31,6 +31,7 @@ import (
"github.com/golang/glog"
coordv1beta1 "k8s.io/api/coordination/v1beta1"
"k8s.io/api/core/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -39,10 +40,13 @@ import (
"k8s.io/apimachinery/pkg/types"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
utilfeature "k8s.io/apiserver/pkg/util/feature"
coordinformers "k8s.io/client-go/informers/coordination/v1beta1"
coreinformers "k8s.io/client-go/informers/core/v1"
extensionsinformers "k8s.io/client-go/informers/extensions/v1beta1"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
coordlisters "k8s.io/client-go/listers/coordination/v1beta1"
corelisters "k8s.io/client-go/listers/core/v1"
extensionslisters "k8s.io/client-go/listers/extensions/v1beta1"
"k8s.io/client-go/tools/cache"
@@ -54,6 +58,7 @@ import (
"k8s.io/kubernetes/pkg/controller"
"k8s.io/kubernetes/pkg/controller/nodelifecycle/scheduler"
nodeutil "k8s.io/kubernetes/pkg/controller/util/node"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/scheduler/algorithm"
"k8s.io/kubernetes/pkg/util/metrics"
utilnode "k8s.io/kubernetes/pkg/util/node"
@@ -136,6 +141,7 @@ type nodeHealthData struct {
probeTimestamp metav1.Time
readyTransitionTimestamp metav1.Time
status *v1.NodeStatus
lease *coordv1beta1.Lease
}
// Controller is the controller that manages node's life cycle.
@@ -172,6 +178,8 @@ type Controller struct {
daemonSetStore extensionslisters.DaemonSetLister
daemonSetInformerSynced cache.InformerSynced
leaseLister coordlisters.LeaseLister
leaseInformerSynced cache.InformerSynced
nodeLister corelisters.NodeLister
nodeInformerSynced cache.InformerSynced
nodeExistsInCloudProvider func(types.NodeName) (bool, error)
@@ -190,19 +198,23 @@ type Controller struct {
nodeStartupGracePeriod time.Duration
// Controller will not proactively sync node health, but will monitor node
// health signal updated from kubelet. If it doesn't receive update for this
// amount of time, it will start posting "NodeReady==ConditionUnknown". The
// amount of time before which Controller start evicting pods is controlled
// via flag 'pod-eviction-timeout'.
// health signal updated from kubelet. There are 2 kinds of node healthiness
// signals: NodeStatus and NodeLease. NodeLease signal is generated only when
// NodeLease feature is enabled. If it doesn't receive an update for this
// amount of time, it will start posting "NodeReady==ConditionUnknown". The
// amount of time before which Controller starts evicting pods is controlled
// via flag 'pod-eviction-timeout'.
// Note: be cautious when changing the constant, it must work with
// nodeStatusUpdateFrequency in kubelet. There are several constraints:
// 1. nodeMonitorGracePeriod must be N times more than
// nodeStatusUpdateFrequency, where N means number of retries allowed for
// kubelet to post node health signal. It is pointless to make
// nodeMonitorGracePeriod be less than nodeStatusUpdateFrequency, since
// there will only be fresh values from Kubelet at an interval of
// nodeStatusUpdateFrequency. The constant must be less than
// podEvictionTimeout.
// nodeStatusUpdateFrequency in kubelet and renewInterval in NodeLease
// controller. The node health signal update frequency is the minimum of the
// two.
// There are several constraints:
// 1. nodeMonitorGracePeriod must be N times more than the node health signal
// update frequency, where N means number of retries allowed for kubelet to
// post node status/lease. It is pointless to make nodeMonitorGracePeriod
// be less than the node health signal update frequency, since there will
// only be fresh values from Kubelet at an interval of node health signal
// update frequency. The constant must be less than podEvictionTimeout.
// 2. nodeMonitorGracePeriod can't be too large for user experience - larger
// value takes longer for user to see up-to-date node health.
nodeMonitorGracePeriod time.Duration
@@ -229,7 +241,9 @@ type Controller struct {
}
// NewNodeLifecycleController returns a new node lifecycle controller.
func NewNodeLifecycleController(podInformer coreinformers.PodInformer,
func NewNodeLifecycleController(
leaseInformer coordinformers.LeaseInformer,
podInformer coreinformers.PodInformer,
nodeInformer coreinformers.NodeInformer,
daemonSetInformer extensionsinformers.DaemonSetInformer,
cloud cloudprovider.Interface,
@@ -373,6 +387,9 @@ func NewNodeLifecycleController(podInformer coreinformers.PodInformer,
}),
})
nc.leaseLister = leaseInformer.Lister()
nc.leaseInformerSynced = leaseInformer.Informer().HasSynced
nc.nodeLister = nodeInformer.Lister()
nc.nodeInformerSynced = nodeInformer.Informer().HasSynced
@@ -389,7 +406,7 @@ func (nc *Controller) Run(stopCh <-chan struct{}) {
glog.Infof("Starting node controller")
defer glog.Infof("Shutting down node controller")
if !controller.WaitForCacheSync("taint", stopCh, nc.nodeInformerSynced, nc.podInformerSynced, nc.daemonSetInformerSynced) {
if !controller.WaitForCacheSync("taint", stopCh, nc.leaseInformerSynced, nc.nodeInformerSynced, nc.podInformerSynced, nc.daemonSetInformerSynced) {
return
}
@@ -811,7 +828,7 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node
_, currentReadyCondition := v1node.GetNodeCondition(&node.Status, v1.NodeReady)
if currentReadyCondition == nil {
// If ready condition is nil, then kubelet (or nodecontroller) never posted node status.
// A fake ready condition is created, where LastProbeTime and LastTransitionTime is set
// A fake ready condition is created, where LastHeartbeatTime and LastTransitionTime are set
// to node.CreationTimestamp to avoid handling the corner case.
observedReadyCondition = v1.NodeCondition{
Type: v1.NodeReady,
@@ -820,10 +837,14 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node
LastTransitionTime: node.CreationTimestamp,
}
gracePeriod = nc.nodeStartupGracePeriod
nc.nodeHealthMap[node.Name] = &nodeHealthData{
status: &node.Status,
probeTimestamp: node.CreationTimestamp,
readyTransitionTimestamp: node.CreationTimestamp,
if _, found := nc.nodeHealthMap[node.Name]; found {
nc.nodeHealthMap[node.Name].status = &node.Status
} else {
nc.nodeHealthMap[node.Name] = &nodeHealthData{
status: &node.Status,
probeTimestamp: node.CreationTimestamp,
readyTransitionTimestamp: node.CreationTimestamp,
}
}
} else {
// If ready condition is not nil, make a copy of it, since we may modify it in place later.
@@ -847,8 +868,10 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node
// - currently only correct Ready State transition outside of Node Controller is marking it ready by Kubelet, we don't check
// if that's the case, but it does not seem necessary.
var savedCondition *v1.NodeCondition
var savedLease *coordv1beta1.Lease
if found {
_, savedCondition = v1node.GetNodeCondition(savedNodeHealth.status, v1.NodeReady)
savedLease = savedNodeHealth.lease
}
_, observedCondition := v1node.GetNodeCondition(&node.Status, v1.NodeReady)
if !found {
@@ -894,11 +917,23 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node
readyTransitionTimestamp: transitionTime,
}
}
var observedLease *coordv1beta1.Lease
if utilfeature.DefaultFeatureGate.Enabled(features.NodeLease) {
// Always update the probe time if node lease is renewed.
// Note: If kubelet never posted the node status, but continues renewing the
// heartbeat leases, the node controller will assume the node is healthy and
// take no action.
observedLease, _ = nc.leaseLister.Leases(v1.NamespaceNodeLease).Get(node.Name)
if observedLease != nil && (savedLease == nil || savedLease.Spec.RenewTime.Before(observedLease.Spec.RenewTime)) {
savedNodeHealth.lease = observedLease
savedNodeHealth.probeTimestamp = nc.now()
}
}
nc.nodeHealthMap[node.Name] = savedNodeHealth
if nc.now().After(savedNodeHealth.probeTimestamp.Add(gracePeriod)) {
// NodeReady condition was last set longer ago than gracePeriod, so update it to Unknown
// (regardless of its current value) in the master.
// NodeReady condition or lease was last set longer ago than gracePeriod, so
// update it to Unknown (regardless of its current value) in the master.
if currentReadyCondition == nil {
glog.V(2).Infof("node %v is never updated by kubelet", node.Name)
node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{
@@ -967,6 +1002,7 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node
status: &node.Status,
probeTimestamp: nc.nodeHealthMap[node.Name].probeTimestamp,
readyTransitionTimestamp: nc.now(),
lease: observedLease,
}
return gracePeriod, observedReadyCondition, currentReadyCondition, nil
}