Merge pull request #26415 from wojtek-t/network_not_ready

Automatic merge from submit-queue

Add a NodeCondition "NetworkUnavailable" to prevent scheduling onto a node until the routes have been created

This is a new version of #26267 (built on top of that one).

The new workflow is:
- we add a "NetworkUnavailable" node condition
- when the Kubelet creates a node, it sets the condition to "true"
- the RouteController sets it to "false" once the route has been created
- the scheduler schedules only onto nodes that do not have a "NetworkUnavailable == true" condition (see the sketch below)
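
For illustration, here is a minimal, runnable sketch of the resulting end-to-end behavior. This is not code from the PR: the types are simplified stand-ins for the ones in `pkg/api`, and `schedulable` condenses the scheduler predicate changed below.

```go
package main

import "fmt"

// Simplified stand-ins for the k8s.io/kubernetes/pkg/api types touched by this PR.
type ConditionStatus string
type NodeConditionType string

const (
	ConditionTrue          ConditionStatus   = "True"
	ConditionFalse         ConditionStatus   = "False"
	NodeReady              NodeConditionType = "Ready"
	NodeNetworkUnavailable NodeConditionType = "NetworkUnavailable"
)

type NodeCondition struct {
	Type   NodeConditionType
	Status ConditionStatus
}

type Node struct {
	Name       string
	Conditions []NodeCondition
}

// schedulable condenses the predicate this PR adds to the scheduler's factory:
// a node is skipped while NetworkUnavailable is anything other than False.
func schedulable(node Node) bool {
	for _, cond := range node.Conditions {
		if cond.Type == NodeReady && cond.Status != ConditionTrue {
			return false
		}
		if cond.Type == NodeNetworkUnavailable && cond.Status != ConditionFalse {
			return false
		}
	}
	return true
}

func main() {
	// As registered by the kubelet: network not yet configured.
	fresh := Node{Name: "node-1", Conditions: []NodeCondition{
		{Type: NodeReady, Status: ConditionTrue},
		{Type: NodeNetworkUnavailable, Status: ConditionTrue},
	}}
	// After the RouteController has created the route and flipped the condition.
	routed := Node{Name: "node-1", Conditions: []NodeCondition{
		{Type: NodeReady, Status: ConditionTrue},
		{Type: NodeNetworkUnavailable, Status: ConditionFalse},
	}}
	fmt.Println(schedulable(fresh))  // false
	fmt.Println(schedulable(routed)) // true
}
```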

@gmarek @bgrant0607 @zmerlynn @cjcullen @derekwaynecarr @danwinship @dcbw @lavalamp @vishh
k8s-merge-robot committed on 2016-05-29 03:06:59 -07:00 · commit 577cdf937d
8 changed files with 183 additions and 39 deletions

View File

@ -95,8 +95,25 @@ func GetPodReadyCondition(status PodStatus) *PodCondition {
// GetPodCondition extracts the provided condition from the given status and returns that.
// Returns nil and -1 if the condition is not present, and the index of the located condition.
func GetPodCondition(status *PodStatus, conditionType PodConditionType) (int, *PodCondition) {
	if status == nil {
		return -1, nil
	}
	for i := range status.Conditions {
		if status.Conditions[i].Type == conditionType {
			return i, &status.Conditions[i]
		}
	}
	return -1, nil
}

// GetNodeCondition extracts the provided condition from the given status and returns that.
// Returns nil and -1 if the condition is not present, and the index of the located condition.
func GetNodeCondition(status *NodeStatus, conditionType NodeConditionType) (int, *NodeCondition) {
	if status == nil {
		return -1, nil
	}
	for i := range status.Conditions {
		if status.Conditions[i].Type == conditionType {
			return i, &status.Conditions[i]
		}
	}
	return -1, nil
}
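
Not part of the diff, but a self-contained sketch of how the new helper is meant to be used: the returned pointer aliases the slice element, which is what lets callers like the RouteController below mutate the condition in place. The types here are simplified stand-ins for the `pkg/api` ones.

```go
package main

import "fmt"

type NodeConditionType string

type NodeCondition struct {
	Type   NodeConditionType
	Status string
}

type NodeStatus struct {
	Conditions []NodeCondition
}

// Same shape as the new pkg/api helper: index and a pointer into the slice.
func GetNodeCondition(status *NodeStatus, conditionType NodeConditionType) (int, *NodeCondition) {
	if status == nil {
		return -1, nil
	}
	for i := range status.Conditions {
		if status.Conditions[i].Type == conditionType {
			return i, &status.Conditions[i]
		}
	}
	return -1, nil
}

func main() {
	status := &NodeStatus{Conditions: []NodeCondition{{Type: "NetworkUnavailable", Status: "True"}}}
	if i, cond := GetNodeCondition(status, "NetworkUnavailable"); cond != nil {
		cond.Status = "False" // the pointer aliases status.Conditions[i], so this mutates in place
		fmt.Println(i, status.Conditions[0].Status) // 0 False
	}
}
```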

View File

@ -2014,6 +2014,8 @@ const (
	NodeOutOfDisk NodeConditionType = "OutOfDisk"
	// NodeMemoryPressure means the kubelet is under pressure due to insufficient available memory.
	NodeMemoryPressure NodeConditionType = "MemoryPressure"
	// NodeNetworkUnavailable means that the network for the node is not correctly configured.
	NodeNetworkUnavailable NodeConditionType = "NetworkUnavailable"
)

type NodeCondition struct {

View File

@ -2418,6 +2418,8 @@ const (
	NodeOutOfDisk NodeConditionType = "OutOfDisk"
	// NodeMemoryPressure means the kubelet is under pressure due to insufficient available memory.
	NodeMemoryPressure NodeConditionType = "MemoryPressure"
	// NodeNetworkUnavailable means that the network for the node is not correctly configured.
	NodeNetworkUnavailable NodeConditionType = "NetworkUnavailable"
)

// NodeCondition contains condition information for a node.

View File

@ -417,20 +417,6 @@ func (nc *NodeController) recycleCIDR(obj interface{}) {
}
}
// getCondition returns a condition object for the specific condition
// type, nil if the condition is not set.
func (nc *NodeController) getCondition(status *api.NodeStatus, conditionType api.NodeConditionType) *api.NodeCondition {
	if status == nil {
		return nil
	}
	for i := range status.Conditions {
		if status.Conditions[i].Type == conditionType {
			return &status.Conditions[i]
		}
	}
	return nil
}
var gracefulDeletionVersion = version.MustParse("v1.1.0")
// maybeDeleteTerminatingPod non-gracefully deletes pods that are terminating
@ -716,7 +702,7 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap
	var err error
	var gracePeriod time.Duration
	var observedReadyCondition api.NodeCondition
	_, currentReadyCondition := api.GetNodeCondition(&node.Status, api.NodeReady)
	if currentReadyCondition == nil {
		// If ready condition is nil, then kubelet (or nodecontroller) never posted node status.
		// A fake ready condition is created, where LastProbeTime and LastTransitionTime are set
@ -756,9 +742,9 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap
	// if that's the case, but it does not seem necessary.
	var savedCondition *api.NodeCondition
	if found {
		_, savedCondition = api.GetNodeCondition(&savedNodeStatus.status, api.NodeReady)
	}
	_, observedCondition := api.GetNodeCondition(&node.Status, api.NodeReady)
	if !found {
		glog.Warningf("Missing timestamp for Node %s. Assuming now as a timestamp.", node.Name)
		savedNodeStatus = nodeStatusData{
@ -834,7 +820,7 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap
	// Like the NodeReady condition, NodeOutOfDisk was last set longer ago than gracePeriod, so update
	// it to Unknown (regardless of its current value) in the master.
	// TODO(madhusudancs): Refactor this with readyCondition to remove duplicated code.
	_, oodCondition := api.GetNodeCondition(&node.Status, api.NodeOutOfDisk)
	if oodCondition == nil {
		glog.V(2).Infof("Out of disk condition of node %v is never updated by kubelet", node.Name)
		node.Status.Conditions = append(node.Status.Conditions, api.NodeCondition{
@ -856,7 +842,8 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap
		}
	}

	_, currentCondition := api.GetNodeCondition(&node.Status, api.NodeReady)
	if !api.Semantic.DeepEqual(currentCondition, &observedReadyCondition) {
		if _, err = nc.kubeClient.Core().Nodes().UpdateStatus(node); err != nil {
			glog.Errorf("Error updating node %s: %v", node.Name, err)
			return gracePeriod, observedReadyCondition, currentReadyCondition, err

View File

@ -24,6 +24,7 @@ import (
"github.com/golang/glog"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/unversioned"
clientset "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset"
"k8s.io/kubernetes/pkg/cloudprovider"
"k8s.io/kubernetes/pkg/util/metrics"
@ -36,6 +37,8 @@ const (
	maxConcurrentRouteCreations int = 200
	// Maximum number of retries of route creations.
	maxRetries int = 5
	// Maximum number of retries of node status update.
	updateNodeStatusMaxRetries int = 3
)

type RouteController struct {
@ -121,6 +124,8 @@ func (rc *RouteController) reconcile(nodes []api.Node, routes []*cloudprovider.R
					rateLimiter <- struct{}{}
					err := rc.routes.CreateRoute(rc.clusterName, nameHint, route)
					<-rateLimiter
					rc.updateNetworkingCondition(nodeName, err == nil)
					if err != nil {
						glog.Errorf("Could not create route %s %s for node %s after %v: %v", nameHint, route.DestinationCIDR, nodeName, time.Now().Sub(startTime), err)
					} else {
@ -129,6 +134,8 @@ func (rc *RouteController) reconcile(nodes []api.Node, routes []*cloudprovider.R
					}
				}
			}(node.Name, nameHint, route)
		} else {
			rc.updateNetworkingCondition(node.Name, true)
		}
		nodeCIDRs[node.Name] = node.Spec.PodCIDR
	}
@ -155,6 +162,65 @@ func (rc *RouteController) reconcile(nodes []api.Node, routes []*cloudprovider.R
	return nil
}

func updateNetworkingCondition(node *api.Node, routeCreated bool) {
	_, networkingCondition := api.GetNodeCondition(&node.Status, api.NodeNetworkUnavailable)
	currentTime := unversioned.Now()
	if routeCreated {
		if networkingCondition != nil && networkingCondition.Status != api.ConditionFalse {
			networkingCondition.Status = api.ConditionFalse
			networkingCondition.Reason = "RouteCreated"
			networkingCondition.Message = "RouteController created a route"
			networkingCondition.LastTransitionTime = currentTime
		} else if networkingCondition == nil {
			node.Status.Conditions = append(node.Status.Conditions, api.NodeCondition{
				Type:               api.NodeNetworkUnavailable,
				Status:             api.ConditionFalse,
				Reason:             "RouteCreated",
				Message:            "RouteController created a route",
				LastTransitionTime: currentTime,
			})
		}
	} else {
		if networkingCondition != nil && networkingCondition.Status != api.ConditionTrue {
			networkingCondition.Status = api.ConditionTrue
			networkingCondition.Reason = "NoRouteCreated"
			networkingCondition.Message = "RouteController failed to create a route"
			networkingCondition.LastTransitionTime = currentTime
		} else if networkingCondition == nil {
			node.Status.Conditions = append(node.Status.Conditions, api.NodeCondition{
				Type:               api.NodeNetworkUnavailable,
				Status:             api.ConditionTrue,
				Reason:             "NoRouteCreated",
				Message:            "RouteController failed to create a route",
				LastTransitionTime: currentTime,
			})
		}
	}
}
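
As an aside, the transition semantics above can be exercised in isolation. A self-contained sketch (`setNetworkUnavailable` is an illustrative stand-in, not a function from this PR) showing that the condition is appended on first sight and that LastTransitionTime only moves when the status actually flips:

```go
package main

import (
	"fmt"
	"time"
)

type ConditionStatus string

const (
	ConditionTrue  ConditionStatus = "True"
	ConditionFalse ConditionStatus = "False"
)

type NodeCondition struct {
	Type               string
	Status             ConditionStatus
	Reason             string
	LastTransitionTime time.Time
}

// setNetworkUnavailable mirrors updateNetworkingCondition above: append the
// condition if it is missing, and bump LastTransitionTime only on a real flip.
func setNetworkUnavailable(conds []NodeCondition, routeCreated bool, now time.Time) []NodeCondition {
	want, reason := ConditionTrue, "NoRouteCreated"
	if routeCreated {
		want, reason = ConditionFalse, "RouteCreated"
	}
	for i := range conds {
		if conds[i].Type == "NetworkUnavailable" {
			if conds[i].Status != want {
				conds[i].Status, conds[i].Reason, conds[i].LastTransitionTime = want, reason, now
			}
			return conds
		}
	}
	return append(conds, NodeCondition{Type: "NetworkUnavailable", Status: want, Reason: reason, LastTransitionTime: now})
}

func main() {
	t0 := time.Now()
	conds := setNetworkUnavailable(nil, false, t0)                    // appended: True/NoRouteCreated
	conds = setNetworkUnavailable(conds, true, t0.Add(time.Minute))   // flips: False/RouteCreated
	conds = setNetworkUnavailable(conds, true, t0.Add(2*time.Minute)) // no-op: timestamp unchanged
	fmt.Println(conds[0].Status, conds[0].LastTransitionTime.Equal(t0.Add(time.Minute))) // False true
}
```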
func (rc *RouteController) updateNetworkingCondition(nodeName string, routeCreated bool) error {
	var err error
	for i := 0; i < updateNodeStatusMaxRetries; i++ {
		// Assign with "=" rather than ":=" so the error returned below is the
		// one from the last failed attempt rather than a shadowed variable.
		var node *api.Node
		node, err = rc.kubeClient.Core().Nodes().Get(nodeName)
		if err != nil {
			glog.Errorf("Error getting node: %v", err)
			continue
		}
		updateNetworkingCondition(node, routeCreated)
		// TODO: Use Patch instead once #26381 is merged.
		// See kubernetes/node-problem-detector#9 for details.
		if _, err = rc.kubeClient.Core().Nodes().UpdateStatus(node); err == nil {
			return nil
		}
		if i+1 < updateNodeStatusMaxRetries {
			glog.Errorf("Error updating node %s, retrying: %v", node.Name, err)
		} else {
			glog.Errorf("Error updating node %s: %v", node.Name, err)
		}
	}
	return err
}

func (rc *RouteController) isResponsibleForRoute(route *cloudprovider.Route) bool {
	_, cidr, err := net.ParseCIDR(route.DestinationCIDR)
	if err != nil {

View File

@ -22,6 +22,8 @@ import (
"time"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset/fake"
"k8s.io/kubernetes/pkg/client/testing/core"
"k8s.io/kubernetes/pkg/cloudprovider"
fakecloud "k8s.io/kubernetes/pkg/cloudprovider/providers/fake"
)
@ -67,16 +69,22 @@ func TestIsResponsibleForRoute(t *testing.T) {
func TestReconcile(t *testing.T) {
	cluster := "my-k8s"
	node1 := api.Node{ObjectMeta: api.ObjectMeta{Name: "node-1", UID: "01"}, Spec: api.NodeSpec{PodCIDR: "10.120.0.0/24"}}
	node2 := api.Node{ObjectMeta: api.ObjectMeta{Name: "node-2", UID: "02"}, Spec: api.NodeSpec{PodCIDR: "10.120.1.0/24"}}
	nodeNoCidr := api.Node{ObjectMeta: api.ObjectMeta{Name: "node-2", UID: "02"}, Spec: api.NodeSpec{PodCIDR: ""}}

	testCases := []struct {
		nodes                      []api.Node
		initialRoutes              []*cloudprovider.Route
		expectedRoutes             []*cloudprovider.Route
		expectedNetworkUnavailable []bool
		clientset                  *fake.Clientset
	}{
		// 2 nodes, routes already there
		{
			nodes: []api.Node{
				node1,
				node2,
			},
			initialRoutes: []*cloudprovider.Route{
				{cluster + "-01", "node-1", "10.120.0.0/24"},
@ -86,12 +94,14 @@ func TestReconcile(t *testing.T) {
{cluster + "-01", "node-1", "10.120.0.0/24"},
{cluster + "-02", "node-2", "10.120.1.0/24"},
},
expectedNetworkUnavailable: []bool{true, true},
clientset: fake.NewSimpleClientset(&api.NodeList{Items: []api.Node{node1, node2}}),
},
// 2 nodes, one route already there
{
nodes: []api.Node{
{ObjectMeta: api.ObjectMeta{Name: "node-1", UID: "01"}, Spec: api.NodeSpec{PodCIDR: "10.120.0.0/24"}},
{ObjectMeta: api.ObjectMeta{Name: "node-2", UID: "02"}, Spec: api.NodeSpec{PodCIDR: "10.120.1.0/24"}},
node1,
node2,
},
initialRoutes: []*cloudprovider.Route{
{cluster + "-01", "node-1", "10.120.0.0/24"},
@ -100,24 +110,28 @@ func TestReconcile(t *testing.T) {
{cluster + "-01", "node-1", "10.120.0.0/24"},
{cluster + "-02", "node-2", "10.120.1.0/24"},
},
expectedNetworkUnavailable: []bool{true, true},
clientset: fake.NewSimpleClientset(&api.NodeList{Items: []api.Node{node1, node2}}),
},
// 2 nodes, no routes yet
{
nodes: []api.Node{
{ObjectMeta: api.ObjectMeta{Name: "node-1", UID: "01"}, Spec: api.NodeSpec{PodCIDR: "10.120.0.0/24"}},
{ObjectMeta: api.ObjectMeta{Name: "node-2", UID: "02"}, Spec: api.NodeSpec{PodCIDR: "10.120.1.0/24"}},
node1,
node2,
},
initialRoutes: []*cloudprovider.Route{},
expectedRoutes: []*cloudprovider.Route{
{cluster + "-01", "node-1", "10.120.0.0/24"},
{cluster + "-02", "node-2", "10.120.1.0/24"},
},
expectedNetworkUnavailable: []bool{true, true},
clientset: fake.NewSimpleClientset(&api.NodeList{Items: []api.Node{node1, node2}}),
},
// 2 nodes, a few too many routes
{
nodes: []api.Node{
{ObjectMeta: api.ObjectMeta{Name: "node-1", UID: "01"}, Spec: api.NodeSpec{PodCIDR: "10.120.0.0/24"}},
{ObjectMeta: api.ObjectMeta{Name: "node-2", UID: "02"}, Spec: api.NodeSpec{PodCIDR: "10.120.1.0/24"}},
node1,
node2,
},
initialRoutes: []*cloudprovider.Route{
{cluster + "-01", "node-1", "10.120.0.0/24"},
@ -129,12 +143,14 @@ func TestReconcile(t *testing.T) {
{cluster + "-01", "node-1", "10.120.0.0/24"},
{cluster + "-02", "node-2", "10.120.1.0/24"},
},
expectedNetworkUnavailable: []bool{true, true},
clientset: fake.NewSimpleClientset(&api.NodeList{Items: []api.Node{node1, node2}}),
},
// 2 nodes, 2 routes, but only 1 is right
{
nodes: []api.Node{
{ObjectMeta: api.ObjectMeta{Name: "node-1", UID: "01"}, Spec: api.NodeSpec{PodCIDR: "10.120.0.0/24"}},
{ObjectMeta: api.ObjectMeta{Name: "node-2", UID: "02"}, Spec: api.NodeSpec{PodCIDR: "10.120.1.0/24"}},
node1,
node2,
},
initialRoutes: []*cloudprovider.Route{
{cluster + "-01", "node-1", "10.120.0.0/24"},
@ -144,17 +160,21 @@ func TestReconcile(t *testing.T) {
{cluster + "-01", "node-1", "10.120.0.0/24"},
{cluster + "-02", "node-2", "10.120.1.0/24"},
},
expectedNetworkUnavailable: []bool{true, true},
clientset: fake.NewSimpleClientset(&api.NodeList{Items: []api.Node{node1, node2}}),
},
// 2 nodes, one node without CIDR assigned.
{
nodes: []api.Node{
{ObjectMeta: api.ObjectMeta{Name: "node-1", UID: "01"}, Spec: api.NodeSpec{PodCIDR: "10.120.0.0/24"}},
{ObjectMeta: api.ObjectMeta{Name: "node-2", UID: "02"}, Spec: api.NodeSpec{PodCIDR: ""}},
node1,
nodeNoCidr,
},
initialRoutes: []*cloudprovider.Route{},
expectedRoutes: []*cloudprovider.Route{
{cluster + "-01", "node-1", "10.120.0.0/24"},
},
expectedNetworkUnavailable: []bool{true, false},
clientset: fake.NewSimpleClientset(&api.NodeList{Items: []api.Node{node1, nodeNoCidr}}),
},
}
for i, testCase := range testCases {
@ -170,10 +190,37 @@ func TestReconcile(t *testing.T) {
t.Error("Error in test: fakecloud doesn't support Routes()")
}
_, cidr, _ := net.ParseCIDR("10.120.0.0/16")
rc := New(routes, nil, cluster, cidr)
rc := New(routes, testCase.clientset, cluster, cidr)
if err := rc.reconcile(testCase.nodes, testCase.initialRoutes); err != nil {
t.Errorf("%d. Error from rc.reconcile(): %v", i, err)
}
for _, action := range testCase.clientset.Actions() {
if action.GetVerb() == "update" && action.GetResource().Resource == "nodes" {
node := action.(core.UpdateAction).GetObject().(*api.Node)
_, condition := api.GetNodeCondition(&node.Status, api.NodeNetworkUnavailable)
if condition == nil {
t.Errorf("%d. Missing NodeNetworkUnavailable condition for Node %v", i, node.Name)
} else {
check := func(index int) bool {
return (condition.Status == api.ConditionFalse) == testCase.expectedNetworkUnavailable[index]
}
index := -1
for j := range testCase.nodes {
if testCase.nodes[j].Name == node.Name {
index = j
}
}
if index == -1 {
// Something's wrong
continue
}
if !check(index) {
t.Errorf("%d. Invalid NodeNetworkUnavailable condition for Node %v, expected %v, got %v",
i, node.Name, testCase.expectedNetworkUnavailable[index], (condition.Status == api.ConditionFalse))
}
}
}
}
var finalRoutes []*cloudprovider.Route
var err error
timeoutChan := time.After(200 * time.Millisecond)

View File

@ -1018,6 +1018,16 @@ func (kl *Kubelet) initialNodeStatus() (*api.Node, error) {
			Unschedulable: !kl.registerSchedulable,
		},
	}

	// Initially, set NodeNetworkUnavailable to true.
	if kl.providerRequiresNetworkingConfiguration() {
		node.Status.Conditions = append(node.Status.Conditions, api.NodeCondition{
			Type:               api.NodeNetworkUnavailable,
			Status:             api.ConditionTrue,
			Reason:             "NoRouteCreated",
			Message:            "Node created without a route",
			LastTransitionTime: unversioned.NewTime(kl.clock.Now()),
		})
	}
	// @question: should this be placed after the call to the cloud provider, which also applies labels?
	for k, v := range kl.nodeLabels {
@ -1088,6 +1098,14 @@ func (kl *Kubelet) initialNodeStatus() (*api.Node, error) {
	return node, nil
}

func (kl *Kubelet) providerRequiresNetworkingConfiguration() bool {
	if kl.cloud == nil || kl.flannelExperimentalOverlay {
		return false
	}
	_, supported := kl.cloud.Routes()
	return supported
}
// registerWithApiserver registers the node with the cluster master. It is safe
// to call multiple times, but not concurrently (kl.registrationCompleted is
// not locked).

View File

@ -437,14 +437,19 @@ func (f *ConfigFactory) responsibleForPod(pod *api.Pod) bool {
func getNodeConditionPredicate() cache.NodeConditionPredicate {
	return func(node api.Node) bool {
		for _, cond := range node.Status.Conditions {
			// We consider the node for scheduling only when its:
			// - NodeReady condition status is ConditionTrue,
			// - NodeOutOfDisk condition status is ConditionFalse,
			// - NodeNetworkUnavailable condition status is ConditionFalse.
			if cond.Type == api.NodeReady && cond.Status != api.ConditionTrue {
				glog.V(4).Infof("Ignoring node %v with %v condition status %v", node.Name, cond.Type, cond.Status)
				return false
			} else if cond.Type == api.NodeOutOfDisk && cond.Status != api.ConditionFalse {
				glog.V(4).Infof("Ignoring node %v with %v condition status %v", node.Name, cond.Type, cond.Status)
				return false
			} else if cond.Type == api.NodeNetworkUnavailable && cond.Status != api.ConditionFalse {
				glog.V(4).Infof("Ignoring node %v with %v condition status %v", node.Name, cond.Type, cond.Status)
				return false
			}
		}
		return true