mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-08-14 06:15:45 +00:00
[cloud-provider] require providerID to initialize node
The node controller has two reconcilations loops: 1. workqueue receiving events from watchers, to implement the node initialization 2. periodic loop to reconcile cloud-provider addresses and node objects, since there is no watch for the cloud-provider addresses. However, this loop can take O(xx) mins on large clusters. Before the external cloud providers were enabled by default, the kubelet was in charge of setting the corresponding providerID and zone and region labels during the node object creation. Once this logic was moved to the external cloud providers, there are cases that the node controller may fail to add the providerID value on the node object and this is never reconciled. The problem is that there are many controllers and projects that depend on this field to be set. Checking at the code it is not possible to not have a ProviderID in any cloud-provider, since it is always built from the provider name and the instance. ProviderID is also inmutable once set, so we make ProviderID a requirement for node initialization. To avoid any possible problems, we rollout this change under a feature gate in deprecated state, so cloud providers can opt-out to the new behavior. Change-Id: Ic5d9c23b6a286b12c9721d4a378485a8b81212d1 Change-Id: Iac8c7e7e47a3247553806ed7128b273bbef0a30b
This commit is contained in:
parent
099f26296b
commit
03bd3e25b1
@ -29,6 +29,7 @@ import (
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
|
||||
"k8s.io/apimachinery/pkg/util/wait"
|
||||
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
||||
coreinformers "k8s.io/client-go/informers/core/v1"
|
||||
clientset "k8s.io/client-go/kubernetes"
|
||||
"k8s.io/client-go/kubernetes/scheme"
|
||||
@ -43,6 +44,7 @@ import (
|
||||
cloudnodeutil "k8s.io/cloud-provider/node/helpers"
|
||||
controllersmetrics "k8s.io/component-base/metrics/prometheus/controllers"
|
||||
nodeutil "k8s.io/component-helpers/node/util"
|
||||
"k8s.io/controller-manager/pkg/features"
|
||||
"k8s.io/klog/v2"
|
||||
)
|
||||
|
||||
@ -486,6 +488,19 @@ func (cnc *CloudNodeController) syncNode(ctx context.Context, nodeName string) e
|
||||
modify(newNode)
|
||||
}
|
||||
|
||||
// spec.ProviderID is required for multiple controllers, like loadbalancers, so we should not
|
||||
// untaint the node until is set. Once it is set, the field is immutable, so no need to reconcile.
|
||||
// We only set this value during initialization and is never reconciled, so if for some reason
|
||||
// we are not able to set it, the instance will never be able to acquire it.
|
||||
// Before external cloud providers were enabled by default, the field was set by the kubelet, and the
|
||||
// node was created with the value.
|
||||
// xref: https://issues.k8s.io/123024
|
||||
if !utilfeature.DefaultFeatureGate.Enabled(features.OptionalProviderID) {
|
||||
if newNode.Spec.ProviderID == "" {
|
||||
return fmt.Errorf("failed to get provider ID for node %s at cloudprovider", nodeName)
|
||||
}
|
||||
}
|
||||
|
||||
_, err = cnc.kubeClient.CoreV1().Nodes().Update(context.TODO(), newNode, metav1.UpdateOptions{})
|
||||
if err != nil {
|
||||
return err
|
||||
|
@ -32,12 +32,15 @@ import (
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
||||
"k8s.io/client-go/informers"
|
||||
clienttesting "k8s.io/client-go/testing"
|
||||
"k8s.io/client-go/tools/record"
|
||||
cloudprovider "k8s.io/cloud-provider"
|
||||
cloudproviderapi "k8s.io/cloud-provider/api"
|
||||
fakecloud "k8s.io/cloud-provider/fake"
|
||||
featuregatetesting "k8s.io/component-base/featuregate/testing"
|
||||
"k8s.io/controller-manager/pkg/features"
|
||||
_ "k8s.io/controller-manager/pkg/features/register"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
@ -50,6 +53,8 @@ func Test_syncNode(t *testing.T) {
|
||||
fakeCloud *fakecloud.Cloud
|
||||
existingNode *v1.Node
|
||||
updatedNode *v1.Node
|
||||
optionalProviderID bool
|
||||
expectedErr bool
|
||||
}{
|
||||
{
|
||||
name: "node initialized with provider ID",
|
||||
@ -546,6 +551,7 @@ func Test_syncNode(t *testing.T) {
|
||||
},
|
||||
{
|
||||
name: "provided node IP address is not valid",
|
||||
expectedErr: true,
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
EnableInstancesV2: false,
|
||||
Addresses: []v1.NodeAddress{
|
||||
@ -644,6 +650,7 @@ func Test_syncNode(t *testing.T) {
|
||||
},
|
||||
{
|
||||
name: "provided node IP address is not present",
|
||||
expectedErr: true,
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
EnableInstancesV2: false,
|
||||
Addresses: []v1.NodeAddress{
|
||||
@ -836,7 +843,8 @@ func Test_syncNode(t *testing.T) {
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "provider ID not implemented",
|
||||
name: "provider ID not implemented and optional",
|
||||
optionalProviderID: true,
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
EnableInstancesV2: false,
|
||||
InstanceTypes: map[types.NodeName]string{},
|
||||
@ -892,6 +900,71 @@ func Test_syncNode(t *testing.T) {
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "provider ID not implemented and required",
|
||||
optionalProviderID: false,
|
||||
expectedErr: true,
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
EnableInstancesV2: false,
|
||||
InstanceTypes: map[types.NodeName]string{},
|
||||
Provider: "test",
|
||||
ExtID: map[types.NodeName]string{},
|
||||
ExtIDErr: map[types.NodeName]error{
|
||||
types.NodeName("node0"): cloudprovider.NotImplemented,
|
||||
},
|
||||
Err: nil,
|
||||
},
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
|
||||
},
|
||||
Status: v1.NodeStatus{
|
||||
Conditions: []v1.NodeCondition{
|
||||
{
|
||||
Type: v1.NodeReady,
|
||||
Status: v1.ConditionUnknown,
|
||||
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
|
||||
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
|
||||
},
|
||||
},
|
||||
},
|
||||
Spec: v1.NodeSpec{
|
||||
Taints: []v1.Taint{
|
||||
{
|
||||
Key: cloudproviderapi.TaintExternalCloudProvider,
|
||||
Value: "true",
|
||||
Effect: v1.TaintEffectNoSchedule,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
updatedNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
|
||||
},
|
||||
Status: v1.NodeStatus{
|
||||
Conditions: []v1.NodeCondition{
|
||||
{
|
||||
Type: v1.NodeReady,
|
||||
Status: v1.ConditionUnknown,
|
||||
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
|
||||
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
|
||||
},
|
||||
},
|
||||
},
|
||||
Spec: v1.NodeSpec{
|
||||
Taints: []v1.Taint{
|
||||
{
|
||||
Key: cloudproviderapi.TaintExternalCloudProvider,
|
||||
Value: "true",
|
||||
Effect: v1.TaintEffectNoSchedule,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "[instanceV2] node initialized with provider ID",
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
@ -1399,6 +1472,7 @@ func Test_syncNode(t *testing.T) {
|
||||
},
|
||||
{
|
||||
name: "[instanceV2] provider ID not implemented",
|
||||
optionalProviderID: true,
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
EnableInstancesV2: true,
|
||||
InstanceTypes: map[types.NodeName]string{},
|
||||
@ -1456,6 +1530,7 @@ func Test_syncNode(t *testing.T) {
|
||||
},
|
||||
{
|
||||
name: "[instanceV2] error getting InstanceMetadata",
|
||||
expectedErr: true,
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
EnableInstancesV2: true,
|
||||
InstanceTypes: map[types.NodeName]string{},
|
||||
@ -1521,6 +1596,7 @@ func Test_syncNode(t *testing.T) {
|
||||
|
||||
for _, test := range tests {
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.OptionalProviderID, test.optionalProviderID)()
|
||||
clientset := fake.NewSimpleClientset(test.existingNode)
|
||||
factory := informers.NewSharedInformerFactory(clientset, 0)
|
||||
|
||||
@ -1543,7 +1619,10 @@ func Test_syncNode(t *testing.T) {
|
||||
w := eventBroadcaster.StartLogging(klog.Infof)
|
||||
defer w.Stop()
|
||||
|
||||
cloudNodeController.syncNode(context.TODO(), test.existingNode.Name)
|
||||
err := cloudNodeController.syncNode(context.TODO(), test.existingNode.Name)
|
||||
if (err != nil) != test.expectedErr {
|
||||
t.Fatalf("error got: %v expected: %v", err, test.expectedErr)
|
||||
}
|
||||
|
||||
updatedNode, err := clientset.CoreV1().Nodes().Get(context.TODO(), test.existingNode.Name, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
|
@ -47,6 +47,13 @@ const (
|
||||
// `alpha.kubernetes.io/provided-node-ip` annotation
|
||||
CloudDualStackNodeIPs featuregate.Feature = "CloudDualStackNodeIPs"
|
||||
|
||||
// owner: @aojea
|
||||
// Deprecated: v1.30
|
||||
// issue: https://issues.k8s.io/123024
|
||||
//
|
||||
// If enabled, the ProviderID field is not required for the node initialization.
|
||||
OptionalProviderID featuregate.Feature = "OptionalProviderID"
|
||||
|
||||
// owner: @alexanderConstantinescu
|
||||
// kep: http://kep.k8s.io/3458
|
||||
// beta: v1.27
|
||||
@ -66,5 +73,6 @@ func SetupCurrentKubernetesSpecificFeatureGates(featuregates featuregate.Mutable
|
||||
var cloudPublicFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{
|
||||
CloudControllerManagerWebhook: {Default: false, PreRelease: featuregate.Alpha},
|
||||
CloudDualStackNodeIPs: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // remove in 1.32
|
||||
OptionalProviderID: {Default: false, PreRelease: featuregate.Deprecated}, // remove after 1.31
|
||||
StableLoadBalancerNodeSet: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // GA in 1.30, remove in 1.31
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user