mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-23 03:41:45 +00:00
Ensure a provider ID is set on a node if expected
A transient issue might occur that causes an error to be returned by InstanceID(). When this is ignored, the external cloud provider taint will be removed and neither AddCloudNode() nor UpdateCloudNode() will try to set a provider ID in the future. By returning the error we can ensure that the external cloud provider taint is not removed prematurely, allowing the operation to be retried (until the provider ID can be set). Preserve support for external cloud providers that do not use IDs by continuing if a NotImplemented error is returned, making a distinction between lack of support for provider IDs and an actual error. Introduce pair of unit tests that show a provider ID will eventually be set if an error is returned, unless that error is a NotImplemented, in which case the external cloud provider taint will be removed.
This commit is contained in:
parent
eedf1945b5
commit
2b554079af
@ -418,11 +418,16 @@ func (cnc *CloudNodeController) getNodeModifiersFromCloudProvider(ctx context.Co
|
||||
n.Spec.ProviderID = providerID
|
||||
}
|
||||
})
|
||||
} else if err == cloudprovider.NotImplemented {
|
||||
// if the cloud provider being used does not support provider IDs,
|
||||
// we can safely continue since we will attempt to set node
|
||||
// addresses given the node name in getNodeAddressesByProviderIDOrName
|
||||
klog.Warningf("cloud provider does not set node provider ID, using node name to discover node %s", node.Name)
|
||||
} else {
|
||||
// we should attempt to set providerID on node, but
|
||||
// we can continue if we fail since we will attempt to set
|
||||
// node addresses given the node name in getNodeAddressesByProviderIDOrName
|
||||
klog.Errorf("failed to set node provider id: %v", err)
|
||||
// if the cloud provider being used supports provider IDs, we want
|
||||
// to propagate the error so that we re-try in the future; if we
|
||||
// do not, the taint will be removed, and this will not be retried
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -19,6 +19,7 @@ package cloud
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"reflect"
|
||||
"testing"
|
||||
"time"
|
||||
@ -1152,3 +1153,168 @@ func TestNodeProviderIDAlreadySet(t *testing.T) {
|
||||
// CCM node controller should not overwrite provider if it's already set
|
||||
assert.Equal(t, "test-provider-id", fnh.UpdatedNodes[0].Spec.ProviderID, "Node ProviderID not set correctly")
|
||||
}
|
||||
|
||||
// This test checks that a node's provider ID will subsequently be set after an error has occurred
|
||||
func TestNodeProviderIDError(t *testing.T) {
|
||||
fnh := &testutil.FakeNodeHandler{
|
||||
Existing: []*v1.Node{
|
||||
{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
|
||||
},
|
||||
Status: v1.NodeStatus{
|
||||
Conditions: []v1.NodeCondition{
|
||||
{
|
||||
Type: v1.NodeReady,
|
||||
Status: v1.ConditionUnknown,
|
||||
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
|
||||
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
|
||||
},
|
||||
},
|
||||
},
|
||||
Spec: v1.NodeSpec{
|
||||
Taints: []v1.Taint{
|
||||
{
|
||||
Key: schedulerapi.TaintExternalCloudProvider,
|
||||
Value: "true",
|
||||
Effect: v1.TaintEffectNoSchedule,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
Clientset: fake.NewSimpleClientset(&v1.PodList{}),
|
||||
DeleteWaitChan: make(chan struct{}),
|
||||
}
|
||||
|
||||
factory := informers.NewSharedInformerFactory(fnh, 0)
|
||||
|
||||
fakeCloud := &fakecloud.Cloud{
|
||||
InstanceTypes: map[types.NodeName]string{},
|
||||
Addresses: []v1.NodeAddress{
|
||||
{
|
||||
Type: v1.NodeHostName,
|
||||
Address: "node0.cloud.internal",
|
||||
},
|
||||
{
|
||||
Type: v1.NodeInternalIP,
|
||||
Address: "10.0.0.1",
|
||||
},
|
||||
{
|
||||
Type: v1.NodeExternalIP,
|
||||
Address: "132.143.154.163",
|
||||
},
|
||||
},
|
||||
Provider: "test",
|
||||
ExtID: map[types.NodeName]string{},
|
||||
ExtIDErr: map[types.NodeName]error{
|
||||
types.NodeName("node0"): fmt.Errorf("fake error"),
|
||||
},
|
||||
Err: nil,
|
||||
}
|
||||
|
||||
eventBroadcaster := record.NewBroadcaster()
|
||||
cloudNodeController := &CloudNodeController{
|
||||
kubeClient: fnh,
|
||||
nodeInformer: factory.Core().V1().Nodes(),
|
||||
cloud: fakeCloud,
|
||||
nodeStatusUpdateFrequency: 1 * time.Second,
|
||||
recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "cloud-node-controller"}),
|
||||
}
|
||||
eventBroadcaster.StartLogging(klog.Infof)
|
||||
|
||||
cloudNodeController.AddCloudNode(context.TODO(), fnh.Existing[0])
|
||||
|
||||
assert.Equal(t, 0, len(fnh.UpdatedNodes), "Node was unexpectedly updated")
|
||||
|
||||
cloudNodeController.UpdateCloudNode(context.TODO(), nil, fnh.Existing[0])
|
||||
|
||||
assert.Equal(t, 0, len(fnh.UpdatedNodes), "Node was unexpectedly updated")
|
||||
|
||||
fakeCloud.ExtID[types.NodeName("node0")] = "test-provider-id"
|
||||
delete(fakeCloud.ExtIDErr, types.NodeName("node0"))
|
||||
|
||||
cloudNodeController.UpdateCloudNode(context.TODO(), nil, fnh.Existing[0])
|
||||
|
||||
assert.Equal(t, 1, len(fnh.UpdatedNodes), "Node was not updated")
|
||||
assert.Equal(t, "node0", fnh.UpdatedNodes[0].Name, "Node was not updated")
|
||||
assert.Equal(t, "test://test-provider-id", fnh.UpdatedNodes[0].Spec.ProviderID, "Node ProviderID not set correctly")
|
||||
}
|
||||
|
||||
// This test checks that a NotImplemented error when getting a node's provider ID will not prevent removal of the taint
|
||||
func TestNodeProviderIDNotImplemented(t *testing.T) {
|
||||
fnh := &testutil.FakeNodeHandler{
|
||||
Existing: []*v1.Node{
|
||||
{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
|
||||
},
|
||||
Status: v1.NodeStatus{
|
||||
Conditions: []v1.NodeCondition{
|
||||
{
|
||||
Type: v1.NodeReady,
|
||||
Status: v1.ConditionUnknown,
|
||||
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
|
||||
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
|
||||
},
|
||||
},
|
||||
},
|
||||
Spec: v1.NodeSpec{
|
||||
Taints: []v1.Taint{
|
||||
{
|
||||
Key: schedulerapi.TaintExternalCloudProvider,
|
||||
Value: "true",
|
||||
Effect: v1.TaintEffectNoSchedule,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
Clientset: fake.NewSimpleClientset(&v1.PodList{}),
|
||||
DeleteWaitChan: make(chan struct{}),
|
||||
}
|
||||
|
||||
factory := informers.NewSharedInformerFactory(fnh, 0)
|
||||
|
||||
fakeCloud := &fakecloud.Cloud{
|
||||
InstanceTypes: map[types.NodeName]string{},
|
||||
Addresses: []v1.NodeAddress{
|
||||
{
|
||||
Type: v1.NodeHostName,
|
||||
Address: "node0.cloud.internal",
|
||||
},
|
||||
{
|
||||
Type: v1.NodeInternalIP,
|
||||
Address: "10.0.0.1",
|
||||
},
|
||||
{
|
||||
Type: v1.NodeExternalIP,
|
||||
Address: "132.143.154.163",
|
||||
},
|
||||
},
|
||||
Provider: "test",
|
||||
ExtID: map[types.NodeName]string{},
|
||||
ExtIDErr: map[types.NodeName]error{
|
||||
types.NodeName("node0"): cloudprovider.NotImplemented,
|
||||
},
|
||||
Err: nil,
|
||||
}
|
||||
|
||||
eventBroadcaster := record.NewBroadcaster()
|
||||
cloudNodeController := &CloudNodeController{
|
||||
kubeClient: fnh,
|
||||
nodeInformer: factory.Core().V1().Nodes(),
|
||||
cloud: fakeCloud,
|
||||
nodeStatusUpdateFrequency: 1 * time.Second,
|
||||
recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "cloud-node-controller"}),
|
||||
}
|
||||
eventBroadcaster.StartLogging(klog.Infof)
|
||||
|
||||
cloudNodeController.AddCloudNode(context.TODO(), fnh.Existing[0])
|
||||
|
||||
assert.Equal(t, 1, len(fnh.UpdatedNodes), "Node was not updated")
|
||||
assert.Equal(t, "node0", fnh.UpdatedNodes[0].Name, "Node was not updated")
|
||||
assert.Equal(t, "", fnh.UpdatedNodes[0].Spec.ProviderID, "Node ProviderID set to unexpected value")
|
||||
}
|
||||
|
@ -98,6 +98,10 @@ func GetInstanceProviderID(ctx context.Context, cloud Interface, nodeName types.
|
||||
}
|
||||
instanceID, err := instances.InstanceID(ctx, nodeName)
|
||||
if err != nil {
|
||||
if err == NotImplemented {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return "", fmt.Errorf("failed to get instance ID from cloud provider: %v", err)
|
||||
}
|
||||
return cloud.ProviderName() + "://" + instanceID, nil
|
||||
|
@ -68,6 +68,7 @@ type Cloud struct {
|
||||
Addresses []v1.NodeAddress
|
||||
addressesMux sync.Mutex
|
||||
ExtID map[types.NodeName]string
|
||||
ExtIDErr map[types.NodeName]error
|
||||
InstanceTypes map[types.NodeName]string
|
||||
Machines []types.NodeName
|
||||
NodeResources *v1.NodeResources
|
||||
@ -252,9 +253,17 @@ func (f *Cloud) NodeAddressesByProviderID(ctx context.Context, providerID string
|
||||
return f.Addresses, f.Err
|
||||
}
|
||||
|
||||
// InstanceID returns the cloud provider ID of the node with the specified Name.
|
||||
// InstanceID returns the cloud provider ID of the node with the specified Name, unless an entry
|
||||
// for the node exists in ExtIDError, in which case it returns the desired error (to facilitate
|
||||
// testing of error handling).
|
||||
func (f *Cloud) InstanceID(ctx context.Context, nodeName types.NodeName) (string, error) {
|
||||
f.addCall("instance-id")
|
||||
|
||||
err, ok := f.ExtIDErr[nodeName]
|
||||
if ok {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return f.ExtID[nodeName], nil
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user