diff --git a/staging/src/k8s.io/cloud-provider/cloud.go b/staging/src/k8s.io/cloud-provider/cloud.go
index c9a04085f48..482656b3703 100644
--- a/staging/src/k8s.io/cloud-provider/cloud.go
+++ b/staging/src/k8s.io/cloud-provider/cloud.go
@@ -98,6 +98,8 @@ func DefaultLoadBalancerName(service *v1.Service) string {
 }
 
 // GetInstanceProviderID builds a ProviderID for a node in a cloud.
+// Note that if the instance does not exist, we must return ("", cloudprovider.InstanceNotFound)
+// cloudprovider.InstanceNotFound should NOT be returned for instances that exist but are stopped/sleeping
 func GetInstanceProviderID(ctx context.Context, cloud Interface, nodeName types.NodeName) (string, error) {
 	instances, ok := cloud.Instances()
 	if !ok {
@@ -108,8 +110,11 @@ func GetInstanceProviderID(ctx context.Context, cloud Interface, nodeName types.
 		if err == NotImplemented {
 			return "", err
 		}
+		if err == InstanceNotFound {
+			return "", err
+		}
 
-		return "", fmt.Errorf("failed to get instance ID from cloud provider: %v", err)
+		return "", fmt.Errorf("failed to get instance ID from cloud provider: %w", err)
 	}
 	return cloud.ProviderName() + "://" + instanceID, nil
 }
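The two additions above tighten the error contract of GetInstanceProviderID: the InstanceNotFound sentinel now passes through untouched, and every other failure is wrapped with %w instead of being flattened into a new string. A minimal caller-side sketch of what that enables (editorial, not part of the patch; the helper name classifyLookup is invented):

import (
	"context"
	"errors"

	"k8s.io/apimachinery/pkg/types"
	cloudprovider "k8s.io/cloud-provider"
)

// classifyLookup branches on the sentinel errors; errors.Is matches them
// whether they are returned bare or wrapped by %w further up the stack.
func classifyLookup(ctx context.Context, cloud cloudprovider.Interface, name types.NodeName) (id string, gone bool, err error) {
	id, err = cloudprovider.GetInstanceProviderID(ctx, cloud, name)
	switch {
	case err == nil:
		return id, false, nil
	case errors.Is(err, cloudprovider.InstanceNotFound):
		// The instance is genuinely absent, not merely stopped or sleeping.
		return "", true, nil
	case errors.Is(err, cloudprovider.NotImplemented):
		return "", false, err // provider cannot resolve an ID by node name
	default:
		return "", false, err // transient or unknown failure: retry, never treat as "gone"
	}
}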
diff --git a/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller.go b/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller.go
index aa6601facd7..b8a50e42cdd 100644
--- a/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller.go
+++ b/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller.go
@@ -152,7 +152,7 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
 
 		// At this point the node has NotReady status, we need to check if the node has been removed
 		// from the cloud provider. If node cannot be found in cloudprovider, then delete the node
-		exists, err := ensureNodeExistsByProviderID(ctx, c.cloud, node)
+		exists, err := c.ensureNodeExistsByProviderID(ctx, node)
 		if err != nil {
 			klog.Errorf("error checking if node %s exists: %v", node.Name, err)
 			continue
@@ -180,7 +180,7 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
 		// Node exists. We need to check this to get taint working in similar in all cloudproviders
 		// current problem is that shutdown nodes are not working in similar way ie. all cloudproviders
 		// does not delete node from kubernetes cluster when instance it is shutdown see issue #46442
-		shutdown, err := shutdownInCloudProvider(ctx, c.cloud, node)
+		shutdown, err := c.shutdownInCloudProvider(ctx, node)
 		if err != nil {
 			klog.Errorf("error checking if node %s is shutdown: %v", node.Name, err)
 		}
@@ -196,18 +196,49 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
 	}
 }
 
+// getProviderID returns the provider ID for the node. If Node CR has no provider ID,
+// it will be the one from the cloud provider.
+func (c *CloudNodeLifecycleController) getProviderID(ctx context.Context, node *v1.Node) (string, error) {
+	if node.Spec.ProviderID != "" {
+		return node.Spec.ProviderID, nil
+	}
+
+	if instanceV2, ok := c.cloud.InstancesV2(); ok {
+		metadata, err := instanceV2.InstanceMetadata(ctx, node)
+		if err != nil {
+			return "", err
+		}
+		return metadata.ProviderID, nil
+	}
+
+	providerID, err := cloudprovider.GetInstanceProviderID(ctx, c.cloud, types.NodeName(node.Name))
+	if err != nil {
+		return "", err
+	}
+
+	return providerID, nil
+}
+
 // shutdownInCloudProvider returns true if the node is shutdown on the cloud provider
-func shutdownInCloudProvider(ctx context.Context, cloud cloudprovider.Interface, node *v1.Node) (bool, error) {
-	if instanceV2, ok := cloud.InstancesV2(); ok {
+func (c *CloudNodeLifecycleController) shutdownInCloudProvider(ctx context.Context, node *v1.Node) (bool, error) {
+	if instanceV2, ok := c.cloud.InstancesV2(); ok {
 		return instanceV2.InstanceShutdown(ctx, node)
 	}
 
-	instances, ok := cloud.Instances()
+	instances, ok := c.cloud.Instances()
 	if !ok {
 		return false, errors.New("cloud provider does not support instances")
 	}
 
-	shutdown, err := instances.InstanceShutdownByProviderID(ctx, node.Spec.ProviderID)
+	providerID, err := c.getProviderID(ctx, node)
+	if err != nil {
+		if err == cloudprovider.InstanceNotFound {
+			return false, nil
+		}
+		return false, err
+	}
+
+	shutdown, err := instances.InstanceShutdownByProviderID(ctx, providerID)
 	if err == cloudprovider.NotImplemented {
 		return false, nil
 	}
@@ -216,32 +247,22 @@ func shutdownInCloudProvider(ctx context.Context, cloud cloudprovider.Interface,
 }
 
 // ensureNodeExistsByProviderID checks if the instance exists by the provider id,
-// If provider id in spec is empty it calls instanceId with node name to get provider id
-func ensureNodeExistsByProviderID(ctx context.Context, cloud cloudprovider.Interface, node *v1.Node) (bool, error) {
-	if instanceV2, ok := cloud.InstancesV2(); ok {
+func (c *CloudNodeLifecycleController) ensureNodeExistsByProviderID(ctx context.Context, node *v1.Node) (bool, error) {
+	if instanceV2, ok := c.cloud.InstancesV2(); ok {
 		return instanceV2.InstanceExists(ctx, node)
 	}
 
-	instances, ok := cloud.Instances()
+	instances, ok := c.cloud.Instances()
 	if !ok {
 		return false, errors.New("instances interface not supported in the cloud provider")
 	}
 
-	providerID := node.Spec.ProviderID
-	if providerID == "" {
-		var err error
-		providerID, err = instances.InstanceID(ctx, types.NodeName(node.Name))
-		if err != nil {
-			if err == cloudprovider.InstanceNotFound {
-				return false, nil
-			}
-			return false, err
-		}
-
-		if providerID == "" {
-			klog.Warningf("Cannot find valid providerID for node name %q, assuming non existence", node.Name)
+	providerID, err := c.getProviderID(ctx, node)
+	if err != nil {
+		if err == cloudprovider.InstanceNotFound {
 			return false, nil
 		}
+		return false, err
 	}
 
 	return instances.InstanceExistsByProviderID(ctx, providerID)
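The controller changes above funnel both checks through a single lookup: Spec.ProviderID when it is already set, then InstancesV2.InstanceMetadata, then the legacy cloudprovider.GetInstanceProviderID helper, with an InstanceNotFound from that lookup treated as "instance gone" rather than as an error. Below is a condensed paraphrase of how MonitorNodes consumes the two helpers (a sketch only; the wrapper name reconcileNotReadyNode is invented and it assumes package-internal access to the unexported methods):

package cloud // same package as the controller; sketch, not part of the patch

import (
	"context"

	v1 "k8s.io/api/core/v1"
)

// reconcileNotReadyNode mirrors the MonitorNodes decision flow for a NotReady node.
func reconcileNotReadyNode(ctx context.Context, c *CloudNodeLifecycleController, node *v1.Node) (string, error) {
	exists, err := c.ensureNodeExistsByProviderID(ctx, node)
	if err != nil {
		return "retry", err // unknown lookup failure: log and re-check later, never delete
	}
	if !exists {
		return "delete", nil // instance is gone from the cloud: delete the Node object
	}

	shutdown, err := c.shutdownInCloudProvider(ctx, node)
	if err != nil {
		return "retry", err
	}
	if shutdown {
		return "taint", nil // instance exists but is stopped: apply the shutdown taint
	}
	return "none", nil
}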
diff --git a/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller_test.go b/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller_test.go
index 2f71eb2b07b..2132831d44a 100644
--- a/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller_test.go
+++ b/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller_test.go
@@ -19,6 +19,7 @@ package cloud
 import (
 	"context"
 	"errors"
+	"github.com/google/go-cmp/cmp"
 	"reflect"
 	"testing"
 	"time"
@@ -32,6 +33,7 @@ import (
 	"k8s.io/client-go/kubernetes/fake"
 	"k8s.io/client-go/kubernetes/scheme"
 	"k8s.io/client-go/tools/record"
+	cloudprovider "k8s.io/cloud-provider"
 	fakecloud "k8s.io/cloud-provider/fake"
 	"k8s.io/klog/v2"
 )
@@ -598,6 +600,165 @@ func Test_NodesShutdown(t *testing.T) {
 				ErrShutdownByProviderID: nil,
 			},
 		},
+		{
+			name: "node with empty spec providerID is not ready and was shutdown, but exists",
+			existingNode: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
+				},
+				Spec: v1.NodeSpec{
+					ProviderID: "",
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionFalse,
+							LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+							LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+						},
+					},
+				},
+			},
+			expectedNode: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
+				},
+				Spec: v1.NodeSpec{
+					ProviderID: "",
+					Taints: []v1.Taint{
+						*ShutdownTaint,
+					},
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionFalse,
+							LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+							LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+						},
+					},
+				},
+			},
+			expectedDeleted: false,
+			fakeCloud: &fakecloud.Cloud{
+				NodeShutdown:            true,
+				ExistsByProviderID:      true,
+				ErrShutdownByProviderID: nil,
+				ExtID: map[types.NodeName]string{
+					types.NodeName("node0"): "foo://12345",
+				},
+			},
+		},
+		{
+			name: "node with non-existing providerID (missing in cloud provider) gets deleted",
+			existingNode: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
+				},
+				Spec: v1.NodeSpec{
+					ProviderID: "",
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionFalse,
+							LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+							LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+						},
+					},
+				},
+			},
+			expectedDeleted: true,
+			fakeCloud: &fakecloud.Cloud{
+				ErrShutdownByProviderID: nil,
+				ExtID: map[types.NodeName]string{
+					types.NodeName("node0"): "",
+				},
+			},
+		},
+		{
+			name: "node with error when getting providerID does not have shutdown taint",
+			existingNode: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
+				},
+				Spec: v1.NodeSpec{
+					ProviderID: "",
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionFalse,
+							LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+							LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+						},
+					},
+				},
+			},
+			expectedNode: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
+				},
+				Spec: v1.NodeSpec{
+					ProviderID: "",
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionFalse,
+							LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+							LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+						},
+					},
+				},
+			},
+			expectedDeleted: false,
+			fakeCloud: &fakecloud.Cloud{
+				ErrShutdownByProviderID: nil,
+				ExtIDErr: map[types.NodeName]error{
+					types.NodeName("node0"): errors.New("err!"),
+				},
+			},
+		},
+		{
+			name: "node with InstanceID returning InstanceNotFound gets deleted",
+			existingNode: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
+				},
+				Spec: v1.NodeSpec{
+					ProviderID: "",
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionFalse,
+							LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+							LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
+						},
+					},
+				},
+			},
+			expectedDeleted: true,
+			fakeCloud: &fakecloud.Cloud{
+				ErrShutdownByProviderID: nil,
+				ExtIDErr: map[types.NodeName]error{
+					types.NodeName("node0"): cloudprovider.InstanceNotFound,
+				},
+			},
+		},
 		{
 			name: "node is not ready, but there is error checking if node is shutdown",
 			existingNode: &v1.Node{
@@ -749,6 +910,157 @@ func Test_NodesShutdown(t *testing.T) {
 	}
 }
 
+func Test_GetProviderID(t *testing.T) {
+	testcases := []struct {
+		name               string
+		fakeCloud          *fakecloud.Cloud
+		existingNode       *v1.Node
+		expectedProviderID string
+		expectedErr        error
+	}{
+		{
+			name: "node initialized with provider ID",
+			fakeCloud: &fakecloud.Cloud{
+				EnableInstancesV2: false,
+				Err:               nil,
+			},
+			existingNode: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node0",
+				},
+				Spec: v1.NodeSpec{
+					ProviderID: "fake://12345",
+				},
+			},
+			expectedProviderID: "fake://12345",
+			expectedErr:        nil,
+		},
+		{
+			name: "node initialized with provider ID with InstancesV2",
+			fakeCloud: &fakecloud.Cloud{
+				EnableInstancesV2: true,
+				Err:               nil,
+			},
+			existingNode: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node0",
+				},
+				Spec: v1.NodeSpec{
+					ProviderID: "fake://12345",
+				},
+			},
+			expectedProviderID: "fake://12345",
+			expectedErr:        nil,
+		},
+		{
+			name: "cloud implemented with Instances",
+			fakeCloud: &fakecloud.Cloud{
+				EnableInstancesV2: false,
+				ExtID: map[types.NodeName]string{
+					types.NodeName("node0"): "12345",
+				},
+				Err: nil,
+			},
+			existingNode: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node0",
+				},
+			},
+			expectedProviderID: "fake://12345",
+			expectedErr:        nil,
+		},
+		{
+			name: "cloud implemented with InstancesV2",
+			fakeCloud: &fakecloud.Cloud{
+				EnableInstancesV2: true,
+				ProviderID: map[types.NodeName]string{
+					types.NodeName("node0"): "fake://12345",
+				},
+				Err: nil,
+			},
+			existingNode: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node0",
+				},
+			},
+			expectedProviderID: "fake://12345",
+			expectedErr:        nil,
+		},
+		{
+			name: "cloud implemented with InstancesV2 (without providerID)",
+			fakeCloud: &fakecloud.Cloud{
+				EnableInstancesV2: true,
+				Err:               nil,
+			},
+			existingNode: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node0",
+				},
+			},
+			expectedProviderID: "",
+			expectedErr:        nil,
+		},
+		{
+			name: "cloud implemented with Instances with instance missing",
+			fakeCloud: &fakecloud.Cloud{
+				EnableInstancesV2: false,
+				ExtIDErr: map[types.NodeName]error{
+					types.NodeName("node0"): cloudprovider.InstanceNotFound,
+				},
+				Err: nil,
+			},
+			existingNode: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node0",
+				},
+			},
+			expectedProviderID: "",
+			expectedErr:        cloudprovider.InstanceNotFound,
+		},
+		{
+			name: "cloud implemented with Instances with unknown error",
+			fakeCloud: &fakecloud.Cloud{
+				EnableInstancesV2: false,
+				ExtIDErr: map[types.NodeName]error{
+					types.NodeName("node0"): errors.New("unknown error"),
+				},
+				Err: nil,
+			},
+			existingNode: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node0",
+				},
+			},
+			expectedProviderID: "",
+			expectedErr:        errors.New("failed to get instance ID from cloud provider: unknown error"),
+		},
+	}
+
+	for _, testcase := range testcases {
+		t.Run(testcase.name, func(t *testing.T) {
+			cloudNodeLifecycleController := &CloudNodeLifecycleController{
+				cloud: testcase.fakeCloud,
+			}
+
+			providerID, err := cloudNodeLifecycleController.getProviderID(context.TODO(), testcase.existingNode)
+
+			if err != nil && testcase.expectedErr == nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+			if err == nil && testcase.expectedErr != nil {
+				t.Fatalf("did not get expected error %q", testcase.expectedErr)
+			}
+			if err != nil && err.Error() != testcase.expectedErr.Error() {
+				t.Fatalf("expected error %q, got %q", testcase.expectedErr.Error(), err.Error())
+			}
+
+			if !cmp.Equal(providerID, testcase.expectedProviderID) {
+				t.Errorf("unexpected providerID %s", cmp.Diff(providerID, testcase.expectedProviderID))
+			}
+		})
+	}
+}
+
 func syncNodeStore(nodeinformer coreinformers.NodeInformer, f *fake.Clientset) error {
 	nodes, err := f.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
 	if err != nil {
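The new cases above exercise the legacy Instances path through the fake cloud's ExtID and ExtIDErr maps. A simplified sketch of the lookup they rely on (a paraphrase of the fake's behaviour, not the verbatim fake.go code):

import (
	"k8s.io/apimachinery/pkg/types"
	fakecloud "k8s.io/cloud-provider/fake"
)

// With EnableInstancesV2 false, getProviderID ends up in the fake's InstanceID,
// which roughly behaves like this:
func fakeInstanceID(f *fakecloud.Cloud, nodeName types.NodeName) (string, error) {
	if err, ok := f.ExtIDErr[nodeName]; ok {
		return "", err // e.g. cloudprovider.InstanceNotFound in the deletion cases
	}
	return f.ExtID[nodeName], nil // e.g. "foo://12345" in the shutdown-taint case
}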
diff --git a/staging/src/k8s.io/cloud-provider/fake/fake.go b/staging/src/k8s.io/cloud-provider/fake/fake.go
index a0ec6399729..7fcda129569 100644
--- a/staging/src/k8s.io/cloud-provider/fake/fake.go
+++ b/staging/src/k8s.io/cloud-provider/fake/fake.go
@@ -312,6 +312,11 @@ func (f *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID strin
 // InstanceShutdownByProviderID returns true if the instances is in safe state to detach volumes
 func (f *Cloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
 	f.addCall("instance-shutdown-by-provider-id")
+
+	if providerID == "" {
+		return false, fmt.Errorf("cannot shutdown instance with empty providerID")
+	}
+
 	return f.NodeShutdown, f.ErrShutdownByProviderID
 }
 
diff --git a/staging/src/k8s.io/legacy-cloud-providers/gce/gce_instances_test.go b/staging/src/k8s.io/legacy-cloud-providers/gce/gce_instances_test.go
index 91a1c3ad024..4496f232800 100644
--- a/staging/src/k8s.io/legacy-cloud-providers/gce/gce_instances_test.go
+++ b/staging/src/k8s.io/legacy-cloud-providers/gce/gce_instances_test.go
@@ -21,7 +21,6 @@ package gce
 
 import (
 	"context"
-	"fmt"
 	"testing"
 
 	"github.com/stretchr/testify/assert"
@@ -55,7 +54,7 @@ func TestInstanceExists(t *testing.T) {
 			name:        "node not exist",
 			nodeName:    "test-node-2",
 			exist:       false,
-			expectedErr: fmt.Errorf("failed to get instance ID from cloud provider: instance not found"),
+			expectedErr: nil,
 		},
 	}
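The GCE test change follows from the cloud.go change at the top of this diff: GetInstanceProviderID now surfaces the bare cloudprovider.InstanceNotFound sentinel, so an existence check built on it can map a missing instance to (false, nil) instead of a wrapped error, which is why the "node not exist" case now expects a nil error. An illustrative pattern (a sketch of the idea, not a claim about the exact gce_instances.go code):

import (
	"context"

	"k8s.io/apimachinery/pkg/types"
	cloudprovider "k8s.io/cloud-provider"
)

// instanceExistsByName treats a missing instance as a normal answer, not a failure.
func instanceExistsByName(ctx context.Context, cloud cloudprovider.Interface, name types.NodeName) (bool, error) {
	_, err := cloudprovider.GetInstanceProviderID(ctx, cloud, name)
	if err == cloudprovider.InstanceNotFound {
		return false, nil // "test-node-2" lands here: exist=false, err=nil
	}
	if err != nil {
		return false, err
	}
	return true, nil
}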