Ensure a provider ID is set on a node if expected

A transient issue might occur that causes an error to be returned by
InstanceID(). When this is ignored, the external cloud provider taint
will be removed and neither AddCloudNode() nor UpdateCloudNode() will
try to set a provider ID in the future.

By returning the error we can ensure that the external cloud provider
taint is not removed prematurely, allowing the operation to be retried
(until the provider ID can be set).

Preserve support for external cloud providers that do not use IDs by
continuing if a NotImplemented error is returned, making a distinction
between lack of support for provider IDs and an actual error.

Introduce pair of unit tests that show a provider ID will eventually
be set if an error is returned, unless that error is a NotImplemented,
in which case the external cloud provider taint will be removed.
This commit is contained in:
Zach Shepherd 2020-01-09 15:00:37 -08:00
parent eedf1945b5
commit 2b554079af
No known key found for this signature in database
GPG Key ID: 55B3F5594BAC7F92
4 changed files with 189 additions and 5 deletions

View File

@ -418,11 +418,16 @@ func (cnc *CloudNodeController) getNodeModifiersFromCloudProvider(ctx context.Co
n.Spec.ProviderID = providerID
}
})
} else if err == cloudprovider.NotImplemented {
// if the cloud provider being used does not support provider IDs,
// we can safely continue since we will attempt to set node
// addresses given the node name in getNodeAddressesByProviderIDOrName
klog.Warningf("cloud provider does not set node provider ID, using node name to discover node %s", node.Name)
} else {
// we should attempt to set providerID on node, but
// we can continue if we fail since we will attempt to set
// node addresses given the node name in getNodeAddressesByProviderIDOrName
klog.Errorf("failed to set node provider id: %v", err)
// if the cloud provider being used supports provider IDs, we want
// to propagate the error so that we re-try in the future; if we
// do not, the taint will be removed, and this will not be retried
return nil, err
}
}

View File

@ -19,6 +19,7 @@ package cloud
import (
"context"
"errors"
"fmt"
"reflect"
"testing"
"time"
@ -1152,3 +1153,168 @@ func TestNodeProviderIDAlreadySet(t *testing.T) {
// CCM node controller should not overwrite provider if it's already set
assert.Equal(t, "test-provider-id", fnh.UpdatedNodes[0].Spec.ProviderID, "Node ProviderID not set correctly")
}
// This test checks that a node's provider ID will subsequently be set after an error has occurred
func TestNodeProviderIDError(t *testing.T) {
fnh := &testutil.FakeNodeHandler{
Existing: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionUnknown,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
Spec: v1.NodeSpec{
Taints: []v1.Taint{
{
Key: schedulerapi.TaintExternalCloudProvider,
Value: "true",
Effect: v1.TaintEffectNoSchedule,
},
},
},
},
},
Clientset: fake.NewSimpleClientset(&v1.PodList{}),
DeleteWaitChan: make(chan struct{}),
}
factory := informers.NewSharedInformerFactory(fnh, 0)
fakeCloud := &fakecloud.Cloud{
InstanceTypes: map[types.NodeName]string{},
Addresses: []v1.NodeAddress{
{
Type: v1.NodeHostName,
Address: "node0.cloud.internal",
},
{
Type: v1.NodeInternalIP,
Address: "10.0.0.1",
},
{
Type: v1.NodeExternalIP,
Address: "132.143.154.163",
},
},
Provider: "test",
ExtID: map[types.NodeName]string{},
ExtIDErr: map[types.NodeName]error{
types.NodeName("node0"): fmt.Errorf("fake error"),
},
Err: nil,
}
eventBroadcaster := record.NewBroadcaster()
cloudNodeController := &CloudNodeController{
kubeClient: fnh,
nodeInformer: factory.Core().V1().Nodes(),
cloud: fakeCloud,
nodeStatusUpdateFrequency: 1 * time.Second,
recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "cloud-node-controller"}),
}
eventBroadcaster.StartLogging(klog.Infof)
cloudNodeController.AddCloudNode(context.TODO(), fnh.Existing[0])
assert.Equal(t, 0, len(fnh.UpdatedNodes), "Node was unexpectedly updated")
cloudNodeController.UpdateCloudNode(context.TODO(), nil, fnh.Existing[0])
assert.Equal(t, 0, len(fnh.UpdatedNodes), "Node was unexpectedly updated")
fakeCloud.ExtID[types.NodeName("node0")] = "test-provider-id"
delete(fakeCloud.ExtIDErr, types.NodeName("node0"))
cloudNodeController.UpdateCloudNode(context.TODO(), nil, fnh.Existing[0])
assert.Equal(t, 1, len(fnh.UpdatedNodes), "Node was not updated")
assert.Equal(t, "node0", fnh.UpdatedNodes[0].Name, "Node was not updated")
assert.Equal(t, "test://test-provider-id", fnh.UpdatedNodes[0].Spec.ProviderID, "Node ProviderID not set correctly")
}
// This test checks that a NotImplemented error when getting a node's provider ID will not prevent removal of the taint
func TestNodeProviderIDNotImplemented(t *testing.T) {
fnh := &testutil.FakeNodeHandler{
Existing: []*v1.Node{
{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionUnknown,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
Spec: v1.NodeSpec{
Taints: []v1.Taint{
{
Key: schedulerapi.TaintExternalCloudProvider,
Value: "true",
Effect: v1.TaintEffectNoSchedule,
},
},
},
},
},
Clientset: fake.NewSimpleClientset(&v1.PodList{}),
DeleteWaitChan: make(chan struct{}),
}
factory := informers.NewSharedInformerFactory(fnh, 0)
fakeCloud := &fakecloud.Cloud{
InstanceTypes: map[types.NodeName]string{},
Addresses: []v1.NodeAddress{
{
Type: v1.NodeHostName,
Address: "node0.cloud.internal",
},
{
Type: v1.NodeInternalIP,
Address: "10.0.0.1",
},
{
Type: v1.NodeExternalIP,
Address: "132.143.154.163",
},
},
Provider: "test",
ExtID: map[types.NodeName]string{},
ExtIDErr: map[types.NodeName]error{
types.NodeName("node0"): cloudprovider.NotImplemented,
},
Err: nil,
}
eventBroadcaster := record.NewBroadcaster()
cloudNodeController := &CloudNodeController{
kubeClient: fnh,
nodeInformer: factory.Core().V1().Nodes(),
cloud: fakeCloud,
nodeStatusUpdateFrequency: 1 * time.Second,
recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "cloud-node-controller"}),
}
eventBroadcaster.StartLogging(klog.Infof)
cloudNodeController.AddCloudNode(context.TODO(), fnh.Existing[0])
assert.Equal(t, 1, len(fnh.UpdatedNodes), "Node was not updated")
assert.Equal(t, "node0", fnh.UpdatedNodes[0].Name, "Node was not updated")
assert.Equal(t, "", fnh.UpdatedNodes[0].Spec.ProviderID, "Node ProviderID set to unexpected value")
}

View File

@ -98,6 +98,10 @@ func GetInstanceProviderID(ctx context.Context, cloud Interface, nodeName types.
}
instanceID, err := instances.InstanceID(ctx, nodeName)
if err != nil {
if err == NotImplemented {
return "", err
}
return "", fmt.Errorf("failed to get instance ID from cloud provider: %v", err)
}
return cloud.ProviderName() + "://" + instanceID, nil

View File

@ -68,6 +68,7 @@ type Cloud struct {
Addresses []v1.NodeAddress
addressesMux sync.Mutex
ExtID map[types.NodeName]string
ExtIDErr map[types.NodeName]error
InstanceTypes map[types.NodeName]string
Machines []types.NodeName
NodeResources *v1.NodeResources
@ -252,9 +253,17 @@ func (f *Cloud) NodeAddressesByProviderID(ctx context.Context, providerID string
return f.Addresses, f.Err
}
// InstanceID returns the cloud provider ID of the node with the specified Name.
// InstanceID returns the cloud provider ID of the node with the specified Name, unless an entry
// for the node exists in ExtIDError, in which case it returns the desired error (to facilitate
// testing of error handling).
func (f *Cloud) InstanceID(ctx context.Context, nodeName types.NodeName) (string, error) {
f.addCall("instance-id")
err, ok := f.ExtIDErr[nodeName]
if ok {
return "", err
}
return f.ExtID[nodeName], nil
}