cloud-node-lifecycle controller: add fallback for empty providerID in shutdown

Simiarly to the function `ensureNodeExistsByProviderID`,
`shutdownInCloudProvider` should have a logic where in case of an empty
providerID we get it using the name of the node. This is to support
scenarios when the function is called with Node object that has a name
but does not have any provider ID.

Currently in such a scenario we have an error as it is not possible to
call `InstanceShutdownByProviderID` with empty value. With this change
in such a scenario we will first obtain a correct provider ID and only
afterwards check the shutdown status.

Signed-off-by: Mat Kowalski <mko@redhat.com>
This commit is contained in:
Mat Kowalski 2023-09-12 20:50:40 +02:00
parent 9eb53ec78a
commit 4a640ea384
No known key found for this signature in database
GPG Key ID: 7E2A225088D24529
4 changed files with 367 additions and 24 deletions

View File

@ -98,6 +98,8 @@ func DefaultLoadBalancerName(service *v1.Service) string {
}
// GetInstanceProviderID builds a ProviderID for a node in a cloud.
// Note that if the instance does not exist, we must return ("", cloudprovider.InstanceNotFound)
// cloudprovider.InstanceNotFound should NOT be returned for instances that exist but are stopped/sleeping
func GetInstanceProviderID(ctx context.Context, cloud Interface, nodeName types.NodeName) (string, error) {
instances, ok := cloud.Instances()
if !ok {
@ -108,8 +110,11 @@ func GetInstanceProviderID(ctx context.Context, cloud Interface, nodeName types.
if err == NotImplemented {
return "", err
}
if err == InstanceNotFound {
return "", err
}
return "", fmt.Errorf("failed to get instance ID from cloud provider: %v", err)
return "", fmt.Errorf("failed to get instance ID from cloud provider: %w", err)
}
return cloud.ProviderName() + "://" + instanceID, nil
}

View File

@ -152,7 +152,7 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
// At this point the node has NotReady status, we need to check if the node has been removed
// from the cloud provider. If node cannot be found in cloudprovider, then delete the node
exists, err := ensureNodeExistsByProviderID(ctx, c.cloud, node)
exists, err := c.ensureNodeExistsByProviderID(ctx, node)
if err != nil {
klog.Errorf("error checking if node %s exists: %v", node.Name, err)
continue
@ -180,7 +180,7 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
// Node exists. We need to check this to get taint working in similar in all cloudproviders
// current problem is that shutdown nodes are not working in similar way ie. all cloudproviders
// does not delete node from kubernetes cluster when instance it is shutdown see issue #46442
shutdown, err := shutdownInCloudProvider(ctx, c.cloud, node)
shutdown, err := c.shutdownInCloudProvider(ctx, node)
if err != nil {
klog.Errorf("error checking if node %s is shutdown: %v", node.Name, err)
}
@ -196,18 +196,49 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
}
}
// getProviderID returns the provider ID for the node. If Node CR has no provider ID,
// it will be the one from the cloud provider.
func (c *CloudNodeLifecycleController) getProviderID(ctx context.Context, node *v1.Node) (string, error) {
if node.Spec.ProviderID != "" {
return node.Spec.ProviderID, nil
}
if instanceV2, ok := c.cloud.InstancesV2(); ok {
metadata, err := instanceV2.InstanceMetadata(ctx, node)
if err != nil {
return "", err
}
return metadata.ProviderID, nil
}
providerID, err := cloudprovider.GetInstanceProviderID(ctx, c.cloud, types.NodeName(node.Name))
if err != nil {
return "", err
}
return providerID, nil
}
// shutdownInCloudProvider returns true if the node is shutdown on the cloud provider
func shutdownInCloudProvider(ctx context.Context, cloud cloudprovider.Interface, node *v1.Node) (bool, error) {
if instanceV2, ok := cloud.InstancesV2(); ok {
func (c *CloudNodeLifecycleController) shutdownInCloudProvider(ctx context.Context, node *v1.Node) (bool, error) {
if instanceV2, ok := c.cloud.InstancesV2(); ok {
return instanceV2.InstanceShutdown(ctx, node)
}
instances, ok := cloud.Instances()
instances, ok := c.cloud.Instances()
if !ok {
return false, errors.New("cloud provider does not support instances")
}
shutdown, err := instances.InstanceShutdownByProviderID(ctx, node.Spec.ProviderID)
providerID, err := c.getProviderID(ctx, node)
if err != nil {
if err == cloudprovider.InstanceNotFound {
return false, nil
}
return false, err
}
shutdown, err := instances.InstanceShutdownByProviderID(ctx, providerID)
if err == cloudprovider.NotImplemented {
return false, nil
}
@ -216,32 +247,22 @@ func shutdownInCloudProvider(ctx context.Context, cloud cloudprovider.Interface,
}
// ensureNodeExistsByProviderID checks if the instance exists by the provider id,
// If provider id in spec is empty it calls instanceId with node name to get provider id
func ensureNodeExistsByProviderID(ctx context.Context, cloud cloudprovider.Interface, node *v1.Node) (bool, error) {
if instanceV2, ok := cloud.InstancesV2(); ok {
func (c *CloudNodeLifecycleController) ensureNodeExistsByProviderID(ctx context.Context, node *v1.Node) (bool, error) {
if instanceV2, ok := c.cloud.InstancesV2(); ok {
return instanceV2.InstanceExists(ctx, node)
}
instances, ok := cloud.Instances()
instances, ok := c.cloud.Instances()
if !ok {
return false, errors.New("instances interface not supported in the cloud provider")
}
providerID := node.Spec.ProviderID
if providerID == "" {
var err error
providerID, err = instances.InstanceID(ctx, types.NodeName(node.Name))
if err != nil {
if err == cloudprovider.InstanceNotFound {
return false, nil
}
return false, err
}
if providerID == "" {
klog.Warningf("Cannot find valid providerID for node name %q, assuming non existence", node.Name)
providerID, err := c.getProviderID(ctx, node)
if err != nil {
if err == cloudprovider.InstanceNotFound {
return false, nil
}
return false, err
}
return instances.InstanceExistsByProviderID(ctx, providerID)

View File

@ -19,6 +19,7 @@ package cloud
import (
"context"
"errors"
"github.com/google/go-cmp/cmp"
"reflect"
"testing"
"time"
@ -32,6 +33,7 @@ import (
"k8s.io/client-go/kubernetes/fake"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/tools/record"
cloudprovider "k8s.io/cloud-provider"
fakecloud "k8s.io/cloud-provider/fake"
"k8s.io/klog/v2"
)
@ -598,6 +600,165 @@ func Test_NodesShutdown(t *testing.T) {
ErrShutdownByProviderID: nil,
},
},
{
name: "node with empty spec providerID is not ready and was shutdown, but exists",
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
},
Spec: v1.NodeSpec{
ProviderID: "",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionFalse,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
},
},
},
},
expectedNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
},
Spec: v1.NodeSpec{
ProviderID: "",
Taints: []v1.Taint{
*ShutdownTaint,
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionFalse,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
},
},
},
},
expectedDeleted: false,
fakeCloud: &fakecloud.Cloud{
NodeShutdown: true,
ExistsByProviderID: true,
ErrShutdownByProviderID: nil,
ExtID: map[types.NodeName]string{
types.NodeName("node0"): "foo://12345",
},
},
},
{
name: "node with non-existing providerID (missing in cloud provider) gets deleted",
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
},
Spec: v1.NodeSpec{
ProviderID: "",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionFalse,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
},
},
},
},
expectedDeleted: true,
fakeCloud: &fakecloud.Cloud{
ErrShutdownByProviderID: nil,
ExtID: map[types.NodeName]string{
types.NodeName("node0"): "",
},
},
},
{
name: "node with error when getting providerID does not have shutdown taint",
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
},
Spec: v1.NodeSpec{
ProviderID: "",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionFalse,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
},
},
},
},
expectedNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
},
Spec: v1.NodeSpec{
ProviderID: "",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionFalse,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
},
},
},
},
expectedDeleted: false,
fakeCloud: &fakecloud.Cloud{
ErrShutdownByProviderID: nil,
ExtIDErr: map[types.NodeName]error{
types.NodeName("node0"): errors.New("err!"),
},
},
},
{
name: "node with InstanceID returning InstanceNotFound gets deleted",
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
},
Spec: v1.NodeSpec{
ProviderID: "",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionFalse,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
},
},
},
},
expectedDeleted: true,
fakeCloud: &fakecloud.Cloud{
ErrShutdownByProviderID: nil,
ExtIDErr: map[types.NodeName]error{
types.NodeName("node0"): cloudprovider.InstanceNotFound,
},
},
},
{
name: "node is not ready, but there is error checking if node is shutdown",
existingNode: &v1.Node{
@ -749,6 +910,157 @@ func Test_NodesShutdown(t *testing.T) {
}
}
func Test_GetProviderID(t *testing.T) {
testcases := []struct {
name string
fakeCloud *fakecloud.Cloud
existingNode *v1.Node
expectedProviderID string
expectedErr error
}{
{
name: "node initialized with provider ID",
fakeCloud: &fakecloud.Cloud{
EnableInstancesV2: false,
Err: nil,
},
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
},
Spec: v1.NodeSpec{
ProviderID: "fake://12345",
},
},
expectedProviderID: "fake://12345",
expectedErr: nil,
},
{
name: "node initialized with provider ID with InstancesV2",
fakeCloud: &fakecloud.Cloud{
EnableInstancesV2: true,
Err: nil,
},
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
},
Spec: v1.NodeSpec{
ProviderID: "fake://12345",
},
},
expectedProviderID: "fake://12345",
expectedErr: nil,
},
{
name: "cloud implemented with Instances",
fakeCloud: &fakecloud.Cloud{
EnableInstancesV2: false,
ExtID: map[types.NodeName]string{
types.NodeName("node0"): "12345",
},
Err: nil,
},
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
},
},
expectedProviderID: "fake://12345",
expectedErr: nil,
},
{
name: "cloud implemented with InstancesV2",
fakeCloud: &fakecloud.Cloud{
EnableInstancesV2: true,
ProviderID: map[types.NodeName]string{
types.NodeName("node0"): "fake://12345",
},
Err: nil,
},
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
},
},
expectedProviderID: "fake://12345",
expectedErr: nil,
},
{
name: "cloud implemented with InstancesV2 (without providerID)",
fakeCloud: &fakecloud.Cloud{
EnableInstancesV2: true,
Err: nil,
},
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
},
},
expectedProviderID: "",
expectedErr: nil,
},
{
name: "cloud implemented with Instances with instance missing",
fakeCloud: &fakecloud.Cloud{
EnableInstancesV2: false,
ExtIDErr: map[types.NodeName]error{
types.NodeName("node0"): cloudprovider.InstanceNotFound,
},
Err: nil,
},
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
},
},
expectedProviderID: "",
expectedErr: cloudprovider.InstanceNotFound,
},
{
name: "cloud implemented with Instances with unknown error",
fakeCloud: &fakecloud.Cloud{
EnableInstancesV2: false,
ExtIDErr: map[types.NodeName]error{
types.NodeName("node0"): errors.New("unknown error"),
},
Err: nil,
},
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
},
},
expectedProviderID: "",
expectedErr: errors.New("failed to get instance ID from cloud provider: unknown error"),
},
}
for _, testcase := range testcases {
t.Run(testcase.name, func(t *testing.T) {
cloudNodeLifecycleController := &CloudNodeLifecycleController{
cloud: testcase.fakeCloud,
}
providerID, err := cloudNodeLifecycleController.getProviderID(context.TODO(), testcase.existingNode)
if err != nil && testcase.expectedErr == nil {
t.Fatalf("unexpected error: %v", err)
}
if err == nil && testcase.expectedErr != nil {
t.Fatalf("did not get expected error %q", testcase.expectedErr)
}
if err != nil && err.Error() != testcase.expectedErr.Error() {
t.Fatalf("expected error %q, got %q", testcase.expectedErr.Error(), err.Error())
}
if !cmp.Equal(providerID, testcase.expectedProviderID) {
t.Errorf("unexpected providerID %s", cmp.Diff(providerID, testcase.expectedProviderID))
}
})
}
}
func syncNodeStore(nodeinformer coreinformers.NodeInformer, f *fake.Clientset) error {
nodes, err := f.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
if err != nil {

View File

@ -312,6 +312,11 @@ func (f *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID strin
// InstanceShutdownByProviderID returns true if the instances is in safe state to detach volumes
func (f *Cloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
f.addCall("instance-shutdown-by-provider-id")
if providerID == "" {
return false, fmt.Errorf("cannot shutdown instance with empty providerID")
}
return f.NodeShutdown, f.ErrShutdownByProviderID
}