From 2b554079af6f88a4ca0bca0eebcee4e04b71b5ff Mon Sep 17 00:00:00 2001 From: Zach Shepherd Date: Thu, 9 Jan 2020 15:00:37 -0800 Subject: [PATCH] Ensure a provider ID is set on a node if expected A transient issue might occur that causes an error to be returned by InstanceID(). When this is ignored, the external cloud provider taint will be removed and neither AddCloudNode() nor UpdateCloudNode() will try to set a provider ID in the future. By returning the error we can ensure that the external cloud provider taint is not removed prematurely, allowing the operation to be retried (until the provider ID can be set). Preserve support for external cloud providers that do not use IDs by continuing if a NotImplemented error is returned, making a distinction between lack of support for provider IDs and an actual error. Introduce pair of unit tests that show a provider ID will eventually be set if an error is returned, unless that error is a NotImplemented, in which case the external cloud provider taint will be removed. --- pkg/controller/cloud/node_controller.go | 13 +- pkg/controller/cloud/node_controller_test.go | 166 ++++++++++++++++++ staging/src/k8s.io/cloud-provider/cloud.go | 4 + .../src/k8s.io/cloud-provider/fake/fake.go | 11 +- 4 files changed, 189 insertions(+), 5 deletions(-) diff --git a/pkg/controller/cloud/node_controller.go b/pkg/controller/cloud/node_controller.go index 22a2cc5cf81..99950b8e555 100644 --- a/pkg/controller/cloud/node_controller.go +++ b/pkg/controller/cloud/node_controller.go @@ -418,11 +418,16 @@ func (cnc *CloudNodeController) getNodeModifiersFromCloudProvider(ctx context.Co n.Spec.ProviderID = providerID } }) + } else if err == cloudprovider.NotImplemented { + // if the cloud provider being used does not support provider IDs, + // we can safely continue since we will attempt to set node + // addresses given the node name in getNodeAddressesByProviderIDOrName + klog.Warningf("cloud provider does not set node provider ID, using node name to discover node %s", node.Name) } else { - // we should attempt to set providerID on node, but - // we can continue if we fail since we will attempt to set - // node addresses given the node name in getNodeAddressesByProviderIDOrName - klog.Errorf("failed to set node provider id: %v", err) + // if the cloud provider being used supports provider IDs, we want + // to propagate the error so that we re-try in the future; if we + // do not, the taint will be removed, and this will not be retried + return nil, err } } diff --git a/pkg/controller/cloud/node_controller_test.go b/pkg/controller/cloud/node_controller_test.go index 37470060f78..f762affe8a3 100644 --- a/pkg/controller/cloud/node_controller_test.go +++ b/pkg/controller/cloud/node_controller_test.go @@ -19,6 +19,7 @@ package cloud import ( "context" "errors" + "fmt" "reflect" "testing" "time" @@ -1152,3 +1153,168 @@ func TestNodeProviderIDAlreadySet(t *testing.T) { // CCM node controller should not overwrite provider if it's already set assert.Equal(t, "test-provider-id", fnh.UpdatedNodes[0].Spec.ProviderID, "Node ProviderID not set correctly") } + +// This test checks that a node's provider ID will subsequently be set after an error has occurred +func TestNodeProviderIDError(t *testing.T) { + fnh := &testutil.FakeNodeHandler{ + Existing: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + Spec: v1.NodeSpec{ + Taints: []v1.Taint{ + { + Key: schedulerapi.TaintExternalCloudProvider, + Value: "true", + Effect: v1.TaintEffectNoSchedule, + }, + }, + }, + }, + }, + Clientset: fake.NewSimpleClientset(&v1.PodList{}), + DeleteWaitChan: make(chan struct{}), + } + + factory := informers.NewSharedInformerFactory(fnh, 0) + + fakeCloud := &fakecloud.Cloud{ + InstanceTypes: map[types.NodeName]string{}, + Addresses: []v1.NodeAddress{ + { + Type: v1.NodeHostName, + Address: "node0.cloud.internal", + }, + { + Type: v1.NodeInternalIP, + Address: "10.0.0.1", + }, + { + Type: v1.NodeExternalIP, + Address: "132.143.154.163", + }, + }, + Provider: "test", + ExtID: map[types.NodeName]string{}, + ExtIDErr: map[types.NodeName]error{ + types.NodeName("node0"): fmt.Errorf("fake error"), + }, + Err: nil, + } + + eventBroadcaster := record.NewBroadcaster() + cloudNodeController := &CloudNodeController{ + kubeClient: fnh, + nodeInformer: factory.Core().V1().Nodes(), + cloud: fakeCloud, + nodeStatusUpdateFrequency: 1 * time.Second, + recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "cloud-node-controller"}), + } + eventBroadcaster.StartLogging(klog.Infof) + + cloudNodeController.AddCloudNode(context.TODO(), fnh.Existing[0]) + + assert.Equal(t, 0, len(fnh.UpdatedNodes), "Node was unexpectedly updated") + + cloudNodeController.UpdateCloudNode(context.TODO(), nil, fnh.Existing[0]) + + assert.Equal(t, 0, len(fnh.UpdatedNodes), "Node was unexpectedly updated") + + fakeCloud.ExtID[types.NodeName("node0")] = "test-provider-id" + delete(fakeCloud.ExtIDErr, types.NodeName("node0")) + + cloudNodeController.UpdateCloudNode(context.TODO(), nil, fnh.Existing[0]) + + assert.Equal(t, 1, len(fnh.UpdatedNodes), "Node was not updated") + assert.Equal(t, "node0", fnh.UpdatedNodes[0].Name, "Node was not updated") + assert.Equal(t, "test://test-provider-id", fnh.UpdatedNodes[0].Spec.ProviderID, "Node ProviderID not set correctly") +} + +// This test checks that a NotImplemented error when getting a node's provider ID will not prevent removal of the taint +func TestNodeProviderIDNotImplemented(t *testing.T) { + fnh := &testutil.FakeNodeHandler{ + Existing: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + }, + Status: v1.NodeStatus{ + Conditions: []v1.NodeCondition{ + { + Type: v1.NodeReady, + Status: v1.ConditionUnknown, + LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), + }, + }, + }, + Spec: v1.NodeSpec{ + Taints: []v1.Taint{ + { + Key: schedulerapi.TaintExternalCloudProvider, + Value: "true", + Effect: v1.TaintEffectNoSchedule, + }, + }, + }, + }, + }, + Clientset: fake.NewSimpleClientset(&v1.PodList{}), + DeleteWaitChan: make(chan struct{}), + } + + factory := informers.NewSharedInformerFactory(fnh, 0) + + fakeCloud := &fakecloud.Cloud{ + InstanceTypes: map[types.NodeName]string{}, + Addresses: []v1.NodeAddress{ + { + Type: v1.NodeHostName, + Address: "node0.cloud.internal", + }, + { + Type: v1.NodeInternalIP, + Address: "10.0.0.1", + }, + { + Type: v1.NodeExternalIP, + Address: "132.143.154.163", + }, + }, + Provider: "test", + ExtID: map[types.NodeName]string{}, + ExtIDErr: map[types.NodeName]error{ + types.NodeName("node0"): cloudprovider.NotImplemented, + }, + Err: nil, + } + + eventBroadcaster := record.NewBroadcaster() + cloudNodeController := &CloudNodeController{ + kubeClient: fnh, + nodeInformer: factory.Core().V1().Nodes(), + cloud: fakeCloud, + nodeStatusUpdateFrequency: 1 * time.Second, + recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "cloud-node-controller"}), + } + eventBroadcaster.StartLogging(klog.Infof) + + cloudNodeController.AddCloudNode(context.TODO(), fnh.Existing[0]) + + assert.Equal(t, 1, len(fnh.UpdatedNodes), "Node was not updated") + assert.Equal(t, "node0", fnh.UpdatedNodes[0].Name, "Node was not updated") + assert.Equal(t, "", fnh.UpdatedNodes[0].Spec.ProviderID, "Node ProviderID set to unexpected value") +} diff --git a/staging/src/k8s.io/cloud-provider/cloud.go b/staging/src/k8s.io/cloud-provider/cloud.go index 38703f10302..b006c726cd6 100644 --- a/staging/src/k8s.io/cloud-provider/cloud.go +++ b/staging/src/k8s.io/cloud-provider/cloud.go @@ -98,6 +98,10 @@ func GetInstanceProviderID(ctx context.Context, cloud Interface, nodeName types. } instanceID, err := instances.InstanceID(ctx, nodeName) if err != nil { + if err == NotImplemented { + return "", err + } + return "", fmt.Errorf("failed to get instance ID from cloud provider: %v", err) } return cloud.ProviderName() + "://" + instanceID, nil diff --git a/staging/src/k8s.io/cloud-provider/fake/fake.go b/staging/src/k8s.io/cloud-provider/fake/fake.go index 10e97285a69..b473ceca6b3 100644 --- a/staging/src/k8s.io/cloud-provider/fake/fake.go +++ b/staging/src/k8s.io/cloud-provider/fake/fake.go @@ -68,6 +68,7 @@ type Cloud struct { Addresses []v1.NodeAddress addressesMux sync.Mutex ExtID map[types.NodeName]string + ExtIDErr map[types.NodeName]error InstanceTypes map[types.NodeName]string Machines []types.NodeName NodeResources *v1.NodeResources @@ -252,9 +253,17 @@ func (f *Cloud) NodeAddressesByProviderID(ctx context.Context, providerID string return f.Addresses, f.Err } -// InstanceID returns the cloud provider ID of the node with the specified Name. +// InstanceID returns the cloud provider ID of the node with the specified Name, unless an entry +// for the node exists in ExtIDError, in which case it returns the desired error (to facilitate +// testing of error handling). func (f *Cloud) InstanceID(ctx context.Context, nodeName types.NodeName) (string, error) { f.addCall("instance-id") + + err, ok := f.ExtIDErr[nodeName] + if ok { + return "", err + } + return f.ExtID[nodeName], nil }