From 4a640ea384ad308e42bfce982c429cd37ac2da0c Mon Sep 17 00:00:00 2001
From: Mat Kowalski
Date: Tue, 12 Sep 2023 20:50:40 +0200
Subject: [PATCH 1/2] cloud-node-lifecycle controller: add fallback for empty providerID in shutdown

Similarly to `ensureNodeExistsByProviderID`, `shutdownInCloudProvider`
should fall back to obtaining the provider ID from the name of the node
when the providerID is empty. This supports scenarios where the function
is called with a Node object that has a name but no provider ID.
Currently such a scenario results in an error, because
`InstanceShutdownByProviderID` cannot be called with an empty value.
With this change we first obtain the correct provider ID and only
afterwards check the shutdown status.

Signed-off-by: Mat Kowalski
---
 staging/src/k8s.io/cloud-provider/cloud.go |   7 +-
 .../node_lifecycle_controller.go           |  67 ++--
 .../node_lifecycle_controller_test.go      | 312 ++++++++++++++++++
 .../src/k8s.io/cloud-provider/fake/fake.go |   5 +
 4 files changed, 367 insertions(+), 24 deletions(-)

diff --git a/staging/src/k8s.io/cloud-provider/cloud.go b/staging/src/k8s.io/cloud-provider/cloud.go
index c9a04085f48..482656b3703 100644
--- a/staging/src/k8s.io/cloud-provider/cloud.go
+++ b/staging/src/k8s.io/cloud-provider/cloud.go
@@ -98,6 +98,8 @@ func DefaultLoadBalancerName(service *v1.Service) string {
 }
 
 // GetInstanceProviderID builds a ProviderID for a node in a cloud.
+// Note that if the instance does not exist, we must return ("", cloudprovider.InstanceNotFound)
+// cloudprovider.InstanceNotFound should NOT be returned for instances that exist but are stopped/sleeping
 func GetInstanceProviderID(ctx context.Context, cloud Interface, nodeName types.NodeName) (string, error) {
     instances, ok := cloud.Instances()
     if !ok {
@@ -108,8 +110,11 @@ func GetInstanceProviderID(ctx context.Context, cloud Interface, nodeName types.
         if err == NotImplemented {
             return "", err
         }
+        if err == InstanceNotFound {
+            return "", err
+        }
 
-        return "", fmt.Errorf("failed to get instance ID from cloud provider: %v", err)
+        return "", fmt.Errorf("failed to get instance ID from cloud provider: %w", err)
     }
     return cloud.ProviderName() + "://" + instanceID, nil
 }
diff --git a/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller.go b/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller.go
index aa6601facd7..b8a50e42cdd 100644
--- a/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller.go
+++ b/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller.go
@@ -152,7 +152,7 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
 
         // At this point the node has NotReady status, we need to check if the node has been removed
         // from the cloud provider. If node cannot be found in cloudprovider, then delete the node
-        exists, err := ensureNodeExistsByProviderID(ctx, c.cloud, node)
+        exists, err := c.ensureNodeExistsByProviderID(ctx, node)
         if err != nil {
             klog.Errorf("error checking if node %s exists: %v", node.Name, err)
             continue
@@ -180,7 +180,7 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
         // Node exists. We need to check this to get taint working in similar in all cloudproviders
         // current problem is that shutdown nodes are not working in similar way ie. all cloudproviders
        // does not delete node from kubernetes cluster when instance it is shutdown see issue #46442
-        shutdown, err := shutdownInCloudProvider(ctx, c.cloud, node)
+        shutdown, err := c.shutdownInCloudProvider(ctx, node)
         if err != nil {
             klog.Errorf("error checking if node %s is shutdown: %v", node.Name, err)
         }
@@ -196,18 +196,49 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
     }
 }
 
+// getProviderID returns the provider ID for the node. If Node CR has no provider ID,
+// it will be the one from the cloud provider.
+func (c *CloudNodeLifecycleController) getProviderID(ctx context.Context, node *v1.Node) (string, error) {
+    if node.Spec.ProviderID != "" {
+        return node.Spec.ProviderID, nil
+    }
+
+    if instanceV2, ok := c.cloud.InstancesV2(); ok {
+        metadata, err := instanceV2.InstanceMetadata(ctx, node)
+        if err != nil {
+            return "", err
+        }
+        return metadata.ProviderID, nil
+    }
+
+    providerID, err := cloudprovider.GetInstanceProviderID(ctx, c.cloud, types.NodeName(node.Name))
+    if err != nil {
+        return "", err
+    }
+
+    return providerID, nil
+}
+
 // shutdownInCloudProvider returns true if the node is shutdown on the cloud provider
-func shutdownInCloudProvider(ctx context.Context, cloud cloudprovider.Interface, node *v1.Node) (bool, error) {
-    if instanceV2, ok := cloud.InstancesV2(); ok {
+func (c *CloudNodeLifecycleController) shutdownInCloudProvider(ctx context.Context, node *v1.Node) (bool, error) {
+    if instanceV2, ok := c.cloud.InstancesV2(); ok {
         return instanceV2.InstanceShutdown(ctx, node)
     }
 
-    instances, ok := cloud.Instances()
+    instances, ok := c.cloud.Instances()
     if !ok {
         return false, errors.New("cloud provider does not support instances")
     }
 
-    shutdown, err := instances.InstanceShutdownByProviderID(ctx, node.Spec.ProviderID)
+    providerID, err := c.getProviderID(ctx, node)
+    if err != nil {
+        if err == cloudprovider.InstanceNotFound {
+            return false, nil
+        }
+        return false, err
+    }
+
+    shutdown, err := instances.InstanceShutdownByProviderID(ctx, providerID)
     if err == cloudprovider.NotImplemented {
         return false, nil
     }
@@ -216,32 +247,22 @@ func shutdownInCloudProvider(ctx context.Context, cloud cloudprovider.Interface,
 }
 
 // ensureNodeExistsByProviderID checks if the instance exists by the provider id,
-// If provider id in spec is empty it calls instanceId with node name to get provider id
-func ensureNodeExistsByProviderID(ctx context.Context, cloud cloudprovider.Interface, node *v1.Node) (bool, error) {
-    if instanceV2, ok := cloud.InstancesV2(); ok {
+func (c *CloudNodeLifecycleController) ensureNodeExistsByProviderID(ctx context.Context, node *v1.Node) (bool, error) {
+    if instanceV2, ok := c.cloud.InstancesV2(); ok {
         return instanceV2.InstanceExists(ctx, node)
     }
 
-    instances, ok := cloud.Instances()
+    instances, ok := c.cloud.Instances()
     if !ok {
         return false, errors.New("instances interface not supported in the cloud provider")
     }
 
-    providerID := node.Spec.ProviderID
-    if providerID == "" {
-        var err error
-        providerID, err = instances.InstanceID(ctx, types.NodeName(node.Name))
-        if err != nil {
-            if err == cloudprovider.InstanceNotFound {
-                return false, nil
-            }
-            return false, err
-        }
-
-        if providerID == "" {
-            klog.Warningf("Cannot find valid providerID for node name %q, assuming non existence", node.Name)
+    providerID, err := c.getProviderID(ctx, node)
+    if err != nil {
+        if err == cloudprovider.InstanceNotFound {
             return false, nil
         }
+        return false, err
     }
 
     return instances.InstanceExistsByProviderID(ctx, providerID)
diff --git a/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller_test.go b/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller_test.go
index 2f71eb2b07b..2132831d44a 100644
--- a/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller_test.go
+++ b/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller_test.go
@@ -19,6 +19,7 @@ package cloud
 import (
     "context"
     "errors"
+    "github.com/google/go-cmp/cmp"
     "reflect"
     "testing"
     "time"
@@ -32,6 +33,7 @@ import (
     "k8s.io/client-go/kubernetes/fake"
     "k8s.io/client-go/kubernetes/scheme"
     "k8s.io/client-go/tools/record"
+    cloudprovider "k8s.io/cloud-provider"
     fakecloud "k8s.io/cloud-provider/fake"
     "k8s.io/klog/v2"
 )
@@ -598,6 +600,165 @@ func Test_NodesShutdown(t *testing.T) {
                 ErrShutdownByProviderID: nil,
             },
         },
+        {
+            name: "node with empty spec providerID is not ready and was shutdown, but exists",
+            existingNode: &v1.Node{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name:              "node0",
+                    CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
+                },
+                Spec: v1.NodeSpec{
+                    ProviderID: "",
+                },
+                Status: v1.NodeStatus{
+                    Conditions: []v1.NodeCondition{
+                        {
+                            Type:               v1.NodeReady,
+                            Status:             v1.ConditionFalse,
+                            LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+                            LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+                        },
+                    },
+                },
+            },
+            expectedNode: &v1.Node{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name:              "node0",
+                    CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
+                },
+                Spec: v1.NodeSpec{
+                    ProviderID: "",
+                    Taints: []v1.Taint{
+                        *ShutdownTaint,
+                    },
+                },
+                Status: v1.NodeStatus{
+                    Conditions: []v1.NodeCondition{
+                        {
+                            Type:               v1.NodeReady,
+                            Status:             v1.ConditionFalse,
+                            LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+                            LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+                        },
+                    },
+                },
+            },
+            expectedDeleted: false,
+            fakeCloud: &fakecloud.Cloud{
+                NodeShutdown:            true,
+                ExistsByProviderID:      true,
+                ErrShutdownByProviderID: nil,
+                ExtID: map[types.NodeName]string{
+                    types.NodeName("node0"): "foo://12345",
+                },
+            },
+        },
+        {
+            name: "node with non-existing providerID (missing in cloud provider) gets deleted",
+            existingNode: &v1.Node{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name:              "node0",
+                    CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
+                },
+                Spec: v1.NodeSpec{
+                    ProviderID: "",
+                },
+                Status: v1.NodeStatus{
+                    Conditions: []v1.NodeCondition{
+                        {
+                            Type:               v1.NodeReady,
+                            Status:             v1.ConditionFalse,
+                            LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+                            LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+                        },
+                    },
+                },
+            },
+            expectedDeleted: true,
+            fakeCloud: &fakecloud.Cloud{
+                ErrShutdownByProviderID: nil,
+                ExtID: map[types.NodeName]string{
+                    types.NodeName("node0"): "",
+                },
+            },
+        },
+        {
+            name: "node with error when getting providerID does not have shutdown taint",
+            existingNode: &v1.Node{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name:              "node0",
+                    CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
+                },
+                Spec: v1.NodeSpec{
+                    ProviderID: "",
+                },
+                Status: v1.NodeStatus{
+                    Conditions: []v1.NodeCondition{
+                        {
+                            Type:               v1.NodeReady,
+                            Status:             v1.ConditionFalse,
+                            LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+                            LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+                        },
+                    },
+                },
+            },
+            expectedNode: &v1.Node{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name:              "node0",
+                    CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
+                },
+                Spec: v1.NodeSpec{
+                    ProviderID: "",
+                },
+                Status: v1.NodeStatus{
+                    Conditions: []v1.NodeCondition{
+                        {
+                            Type:               v1.NodeReady,
+                            Status:             v1.ConditionFalse,
+                            LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+                            LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+                        },
+                    },
+                },
+            },
+            expectedDeleted: false,
+            fakeCloud: &fakecloud.Cloud{
+                ErrShutdownByProviderID: nil,
+                ExtIDErr: map[types.NodeName]error{
+                    types.NodeName("node0"): errors.New("err!"),
+                },
+            },
+        },
+        {
+            name: "node with InstanceID returning InstanceNotFound gets deleted",
+            existingNode: &v1.Node{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name:              "node0",
+                    CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
+                },
+                Spec: v1.NodeSpec{
+                    ProviderID: "",
+                },
+                Status: v1.NodeStatus{
+                    Conditions: []v1.NodeCondition{
+                        {
+                            Type:               v1.NodeReady,
+                            Status:             v1.ConditionFalse,
+                            LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+                            LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+                        },
+                    },
+                },
+            },
+            expectedDeleted: true,
+            fakeCloud: &fakecloud.Cloud{
+                ErrShutdownByProviderID: nil,
+                ExtIDErr: map[types.NodeName]error{
+                    types.NodeName("node0"): cloudprovider.InstanceNotFound,
+                },
+            },
+        },
         {
             name: "node is not ready, but there is error checking if node is shutdown",
             existingNode: &v1.Node{
@@ -749,6 +910,157 @@ func Test_NodesShutdown(t *testing.T) {
     }
 }
 
+func Test_GetProviderID(t *testing.T) {
+    testcases := []struct {
+        name               string
+        fakeCloud          *fakecloud.Cloud
+        existingNode       *v1.Node
+        expectedProviderID string
+        expectedErr        error
+    }{
+        {
+            name: "node initialized with provider ID",
+            fakeCloud: &fakecloud.Cloud{
+                EnableInstancesV2: false,
+                Err:               nil,
+            },
+            existingNode: &v1.Node{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name: "node0",
+                },
+                Spec: v1.NodeSpec{
+                    ProviderID: "fake://12345",
+                },
+            },
+            expectedProviderID: "fake://12345",
+            expectedErr:        nil,
+        },
+        {
+            name: "node initialized with provider ID with InstancesV2",
+            fakeCloud: &fakecloud.Cloud{
+                EnableInstancesV2: true,
+                Err:               nil,
+            },
+            existingNode: &v1.Node{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name: "node0",
+                },
+                Spec: v1.NodeSpec{
+                    ProviderID: "fake://12345",
+                },
+            },
+            expectedProviderID: "fake://12345",
+            expectedErr:        nil,
+        },
+        {
+            name: "cloud implemented with Instances",
+            fakeCloud: &fakecloud.Cloud{
+                EnableInstancesV2: false,
+                ExtID: map[types.NodeName]string{
+                    types.NodeName("node0"): "12345",
+                },
+                Err: nil,
+            },
+            existingNode: &v1.Node{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name: "node0",
+                },
+            },
+            expectedProviderID: "fake://12345",
+            expectedErr:        nil,
+        },
+        {
+            name: "cloud implemented with InstancesV2",
+            fakeCloud: &fakecloud.Cloud{
+                EnableInstancesV2: true,
+                ProviderID: map[types.NodeName]string{
+                    types.NodeName("node0"): "fake://12345",
+                },
+                Err: nil,
+            },
+            existingNode: &v1.Node{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name: "node0",
+                },
+            },
+            expectedProviderID: "fake://12345",
+            expectedErr:        nil,
+        },
+        {
+            name: "cloud implemented with InstancesV2 (without providerID)",
+            fakeCloud: &fakecloud.Cloud{
+                EnableInstancesV2: true,
+                Err:               nil,
+            },
+            existingNode: &v1.Node{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name: "node0",
+                },
+            },
+            expectedProviderID: "",
+            expectedErr:        nil,
+        },
+        {
+            name: "cloud implemented with Instances with instance missing",
+            fakeCloud: &fakecloud.Cloud{
+                EnableInstancesV2: false,
+                ExtIDErr: map[types.NodeName]error{
+                    types.NodeName("node0"): cloudprovider.InstanceNotFound,
+                },
+                Err: nil,
+            },
+            existingNode: &v1.Node{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name: "node0",
+                },
+            },
+            expectedProviderID: "",
+            expectedErr:        cloudprovider.InstanceNotFound,
+        },
+        {
+            name: "cloud implemented with Instances with unknown error",
+            fakeCloud: &fakecloud.Cloud{
+                EnableInstancesV2: false,
+                ExtIDErr: map[types.NodeName]error{
+                    types.NodeName("node0"): errors.New("unknown error"),
+                },
+                Err: nil,
+            },
+            existingNode: &v1.Node{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name: "node0",
+                },
+            },
+            expectedProviderID: "",
+            expectedErr:        errors.New("failed to get instance ID from cloud provider: unknown error"),
+        },
+    }
+
+    for _, testcase := range testcases {
+        t.Run(testcase.name, func(t *testing.T) {
+            cloudNodeLifecycleController := &CloudNodeLifecycleController{
+                cloud: testcase.fakeCloud,
+            }
+
+            providerID, err := cloudNodeLifecycleController.getProviderID(context.TODO(), testcase.existingNode)
+
+            if err != nil && testcase.expectedErr == nil {
+                t.Fatalf("unexpected error: %v", err)
+            }
+            if err == nil && testcase.expectedErr != nil {
+                t.Fatalf("did not get expected error %q", testcase.expectedErr)
+            }
+            if err != nil && err.Error() != testcase.expectedErr.Error() {
+                t.Fatalf("expected error %q, got %q", testcase.expectedErr.Error(), err.Error())
+            }
+
+            if !cmp.Equal(providerID, testcase.expectedProviderID) {
+                t.Errorf("unexpected providerID %s", cmp.Diff(providerID, testcase.expectedProviderID))
+            }
+        })
+    }
+}
+
 func syncNodeStore(nodeinformer coreinformers.NodeInformer, f *fake.Clientset) error {
     nodes, err := f.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
     if err != nil {
diff --git a/staging/src/k8s.io/cloud-provider/fake/fake.go b/staging/src/k8s.io/cloud-provider/fake/fake.go
index a0ec6399729..7fcda129569 100644
--- a/staging/src/k8s.io/cloud-provider/fake/fake.go
+++ b/staging/src/k8s.io/cloud-provider/fake/fake.go
@@ -312,6 +312,11 @@ func (f *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID strin
 // InstanceShutdownByProviderID returns true if the instances is in safe state to detach volumes
 func (f *Cloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
     f.addCall("instance-shutdown-by-provider-id")
+
+    if providerID == "" {
+        return false, fmt.Errorf("cannot shutdown instance with empty providerID")
+    }
+
     return f.NodeShutdown, f.ErrShutdownByProviderID
 }
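A note on the `%v` -> `%w` change in `GetInstanceProviderID` above: `%w` keeps the
wrapped error in the chain, so callers can still detect the `InstanceNotFound`
sentinel via `errors.Is` after wrapping, whereas the `==` comparisons used in the
controller only match the sentinel when it is returned as-is (hence the new early
return). A minimal standalone sketch, using a stand-in sentinel rather than the
real `cloudprovider.InstanceNotFound`:

```
package main

import (
	"errors"
	"fmt"
)

// errInstanceNotFound stands in for cloudprovider.InstanceNotFound.
var errInstanceNotFound = errors.New("instance not found")

func main() {
	// %w keeps the sentinel in the error chain; %v would flatten it to a string.
	wrapped := fmt.Errorf("failed to get instance ID from cloud provider: %w", errInstanceNotFound)

	fmt.Println(wrapped == errInstanceNotFound)          // false: == matches only the exact error value
	fmt.Println(errors.Is(wrapped, errInstanceNotFound)) // true: errors.Is unwraps the %w chain
}
```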
From 60a602f17014cf52c0556668858a4c8222a1ebd5 Mon Sep 17 00:00:00 2001
From: Mat Kowalski
Date: Thu, 26 Oct 2023 11:33:33 +0200
Subject: [PATCH 2/2] gce: fix test for non-existing instance in cloud

This commit fixes a GCE instance test for the scenario where the
instance does not exist in the cloud. `gce_instances.go` contains code
that handles this scenario and swallows the error, i.e.

```
if providerID, err = cloudprovider.GetInstanceProviderID(ctx, g, types.NodeName(node.Name)); err != nil {
	if err == cloudprovider.InstanceNotFound {
		return false, nil
	}
	return false, err
}
```

but the test was nevertheless expecting a non-empty error string. The
issue was exposed when the implementation of `GetInstanceProviderID` in
`cloud-provider/cloud.go` changed to return `InstanceNotFound` as-is
instead of wrapping it in a custom string.

Signed-off-by: Mat Kowalski
---
 .../k8s.io/legacy-cloud-providers/gce/gce_instances_test.go | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/staging/src/k8s.io/legacy-cloud-providers/gce/gce_instances_test.go b/staging/src/k8s.io/legacy-cloud-providers/gce/gce_instances_test.go
index 91a1c3ad024..4496f232800 100644
--- a/staging/src/k8s.io/legacy-cloud-providers/gce/gce_instances_test.go
+++ b/staging/src/k8s.io/legacy-cloud-providers/gce/gce_instances_test.go
@@ -21,7 +21,6 @@ package gce
 
 import (
     "context"
-    "fmt"
     "testing"
 
     "github.com/stretchr/testify/assert"
@@ -55,7 +54,7 @@ func TestInstanceExists(t *testing.T) {
             name:     "node not exist",
             nodeName: "test-node-2",
             exist:    false,
-            expectedErr: fmt.Errorf("failed to get instance ID from cloud provider: instance not found"),
+            expectedErr: nil,
         },
     }
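The doc comment added to `GetInstanceProviderID` in the first patch spells out
the contract the new fallback relies on: `cloudprovider.InstanceNotFound` means
the instance is gone, never that it is merely stopped. A hedged sketch of an
`InstanceShutdownByProviderID` implementation honoring that contract (the
in-memory instance table and state values are illustrative, not any real
provider's API):

```
package main

import (
	"fmt"

	cloudprovider "k8s.io/cloud-provider"
)

// instanceState is an illustrative state enum, not a real cloud provider type.
type instanceState int

const (
	running instanceState = iota
	stopped
)

// instances is a stand-in for a real cloud API: providerID -> state.
var instances = map[string]instanceState{
	"fake://12345": stopped,
}

// instanceShutdownByProviderID reports whether an instance is shut down.
// Per the contract, InstanceNotFound is returned only for missing instances,
// never for instances that exist but are stopped/sleeping.
func instanceShutdownByProviderID(providerID string) (bool, error) {
	state, ok := instances[providerID]
	if !ok {
		return false, cloudprovider.InstanceNotFound
	}
	return state == stopped, nil
}

func main() {
	shutdown, err := instanceShutdownByProviderID("fake://12345")
	fmt.Println(shutdown, err) // true <nil>: the instance exists and is stopped

	_, err = instanceShutdownByProviderID("fake://99999")
	fmt.Println(err == cloudprovider.InstanceNotFound) // true: truly missing
}
```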