[cloud-provider] require providerID to initialize node

The node controller has two reconcilations loops:

1. workqueue receiving events from watchers, to implement
the node initialization

2. periodic loop to reconcile cloud-provider addresses and
node objects, since there is no watch for the cloud-provider
addresses. However, this loop can take O(xx) mins on large
clusters.

Before the external cloud providers were enabled by default,
the kubelet was in charge of setting the corresponding
providerID and zone and region labels during the node object
creation.

Once this logic was moved to the external cloud providers,
there are cases that the node controller may fail to add the
providerID value on the node object and this is never reconciled.
The problem is that there are many controllers and projects that
depend on this field to be set.

Checking at the code it is not possible to not have a ProviderID
in any cloud-provider, since it is always built from the provider name
and the instance. ProviderID is also inmutable once set, so we make
ProviderID a requirement for node initialization.

To avoid any possible problems, we rollout this change under a feature
gate in deprecated state, so cloud providers can opt-out to the new
behavior.

Change-Id: Ic5d9c23b6a286b12c9721d4a378485a8b81212d1

Change-Id: Iac8c7e7e47a3247553806ed7128b273bbef0a30b
This commit is contained in:
Antonio Ojea 2024-02-17 18:47:01 +00:00
parent 099f26296b
commit 03bd3e25b1
3 changed files with 112 additions and 10 deletions

View File

@ -29,6 +29,7 @@ import (
"k8s.io/apimachinery/pkg/types"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
utilfeature "k8s.io/apiserver/pkg/util/feature"
coreinformers "k8s.io/client-go/informers/core/v1"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
@ -43,6 +44,7 @@ import (
cloudnodeutil "k8s.io/cloud-provider/node/helpers"
controllersmetrics "k8s.io/component-base/metrics/prometheus/controllers"
nodeutil "k8s.io/component-helpers/node/util"
"k8s.io/controller-manager/pkg/features"
"k8s.io/klog/v2"
)
@ -486,6 +488,19 @@ func (cnc *CloudNodeController) syncNode(ctx context.Context, nodeName string) e
modify(newNode)
}
// spec.ProviderID is required for multiple controllers, like loadbalancers, so we should not
// untaint the node until is set. Once it is set, the field is immutable, so no need to reconcile.
// We only set this value during initialization and is never reconciled, so if for some reason
// we are not able to set it, the instance will never be able to acquire it.
// Before external cloud providers were enabled by default, the field was set by the kubelet, and the
// node was created with the value.
// xref: https://issues.k8s.io/123024
if !utilfeature.DefaultFeatureGate.Enabled(features.OptionalProviderID) {
if newNode.Spec.ProviderID == "" {
return fmt.Errorf("failed to get provider ID for node %s at cloudprovider", nodeName)
}
}
_, err = cnc.kubeClient.CoreV1().Nodes().Update(context.TODO(), newNode, metav1.UpdateOptions{})
if err != nil {
return err

View File

@ -32,12 +32,15 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/client-go/informers"
clienttesting "k8s.io/client-go/testing"
"k8s.io/client-go/tools/record"
cloudprovider "k8s.io/cloud-provider"
cloudproviderapi "k8s.io/cloud-provider/api"
fakecloud "k8s.io/cloud-provider/fake"
featuregatetesting "k8s.io/component-base/featuregate/testing"
"k8s.io/controller-manager/pkg/features"
_ "k8s.io/controller-manager/pkg/features/register"
"github.com/google/go-cmp/cmp"
@ -50,6 +53,8 @@ func Test_syncNode(t *testing.T) {
fakeCloud *fakecloud.Cloud
existingNode *v1.Node
updatedNode *v1.Node
optionalProviderID bool
expectedErr bool
}{
{
name: "node initialized with provider ID",
@ -546,6 +551,7 @@ func Test_syncNode(t *testing.T) {
},
{
name: "provided node IP address is not valid",
expectedErr: true,
fakeCloud: &fakecloud.Cloud{
EnableInstancesV2: false,
Addresses: []v1.NodeAddress{
@ -644,6 +650,7 @@ func Test_syncNode(t *testing.T) {
},
{
name: "provided node IP address is not present",
expectedErr: true,
fakeCloud: &fakecloud.Cloud{
EnableInstancesV2: false,
Addresses: []v1.NodeAddress{
@ -836,7 +843,8 @@ func Test_syncNode(t *testing.T) {
},
},
{
name: "provider ID not implemented",
name: "provider ID not implemented and optional",
optionalProviderID: true,
fakeCloud: &fakecloud.Cloud{
EnableInstancesV2: false,
InstanceTypes: map[types.NodeName]string{},
@ -892,6 +900,71 @@ func Test_syncNode(t *testing.T) {
},
},
},
{
name: "provider ID not implemented and required",
optionalProviderID: false,
expectedErr: true,
fakeCloud: &fakecloud.Cloud{
EnableInstancesV2: false,
InstanceTypes: map[types.NodeName]string{},
Provider: "test",
ExtID: map[types.NodeName]string{},
ExtIDErr: map[types.NodeName]error{
types.NodeName("node0"): cloudprovider.NotImplemented,
},
Err: nil,
},
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionUnknown,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
Spec: v1.NodeSpec{
Taints: []v1.Taint{
{
Key: cloudproviderapi.TaintExternalCloudProvider,
Value: "true",
Effect: v1.TaintEffectNoSchedule,
},
},
},
},
updatedNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionUnknown,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
Spec: v1.NodeSpec{
Taints: []v1.Taint{
{
Key: cloudproviderapi.TaintExternalCloudProvider,
Value: "true",
Effect: v1.TaintEffectNoSchedule,
},
},
},
},
},
{
name: "[instanceV2] node initialized with provider ID",
fakeCloud: &fakecloud.Cloud{
@ -1399,6 +1472,7 @@ func Test_syncNode(t *testing.T) {
},
{
name: "[instanceV2] provider ID not implemented",
optionalProviderID: true,
fakeCloud: &fakecloud.Cloud{
EnableInstancesV2: true,
InstanceTypes: map[types.NodeName]string{},
@ -1456,6 +1530,7 @@ func Test_syncNode(t *testing.T) {
},
{
name: "[instanceV2] error getting InstanceMetadata",
expectedErr: true,
fakeCloud: &fakecloud.Cloud{
EnableInstancesV2: true,
InstanceTypes: map[types.NodeName]string{},
@ -1521,6 +1596,7 @@ func Test_syncNode(t *testing.T) {
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.OptionalProviderID, test.optionalProviderID)()
clientset := fake.NewSimpleClientset(test.existingNode)
factory := informers.NewSharedInformerFactory(clientset, 0)
@ -1543,7 +1619,10 @@ func Test_syncNode(t *testing.T) {
w := eventBroadcaster.StartLogging(klog.Infof)
defer w.Stop()
cloudNodeController.syncNode(context.TODO(), test.existingNode.Name)
err := cloudNodeController.syncNode(context.TODO(), test.existingNode.Name)
if (err != nil) != test.expectedErr {
t.Fatalf("error got: %v expected: %v", err, test.expectedErr)
}
updatedNode, err := clientset.CoreV1().Nodes().Get(context.TODO(), test.existingNode.Name, metav1.GetOptions{})
if err != nil {

View File

@ -47,6 +47,13 @@ const (
// `alpha.kubernetes.io/provided-node-ip` annotation
CloudDualStackNodeIPs featuregate.Feature = "CloudDualStackNodeIPs"
// owner: @aojea
// Deprecated: v1.30
// issue: https://issues.k8s.io/123024
//
// If enabled, the ProviderID field is not required for the node initialization.
OptionalProviderID featuregate.Feature = "OptionalProviderID"
// owner: @alexanderConstantinescu
// kep: http://kep.k8s.io/3458
// beta: v1.27
@ -66,5 +73,6 @@ func SetupCurrentKubernetesSpecificFeatureGates(featuregates featuregate.Mutable
var cloudPublicFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{
CloudControllerManagerWebhook: {Default: false, PreRelease: featuregate.Alpha},
CloudDualStackNodeIPs: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // remove in 1.32
OptionalProviderID: {Default: false, PreRelease: featuregate.Deprecated}, // remove after 1.31
StableLoadBalancerNodeSet: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // GA in 1.30, remove in 1.31
}