mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-08-09 03:57:41 +00:00
cloud-node-lifecycle controller: add fallback for empty providerID in shutdown
Simiarly to the function `ensureNodeExistsByProviderID`, `shutdownInCloudProvider` should have a logic where in case of an empty providerID we get it using the name of the node. This is to support scenarios when the function is called with Node object that has a name but does not have any provider ID. Currently in such a scenario we have an error as it is not possible to call `InstanceShutdownByProviderID` with empty value. With this change in such a scenario we will first obtain a correct provider ID and only afterwards check the shutdown status. Signed-off-by: Mat Kowalski <mko@redhat.com>
This commit is contained in:
parent
9eb53ec78a
commit
4a640ea384
@ -98,6 +98,8 @@ func DefaultLoadBalancerName(service *v1.Service) string {
|
||||
}
|
||||
|
||||
// GetInstanceProviderID builds a ProviderID for a node in a cloud.
|
||||
// Note that if the instance does not exist, we must return ("", cloudprovider.InstanceNotFound)
|
||||
// cloudprovider.InstanceNotFound should NOT be returned for instances that exist but are stopped/sleeping
|
||||
func GetInstanceProviderID(ctx context.Context, cloud Interface, nodeName types.NodeName) (string, error) {
|
||||
instances, ok := cloud.Instances()
|
||||
if !ok {
|
||||
@ -108,8 +110,11 @@ func GetInstanceProviderID(ctx context.Context, cloud Interface, nodeName types.
|
||||
if err == NotImplemented {
|
||||
return "", err
|
||||
}
|
||||
if err == InstanceNotFound {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return "", fmt.Errorf("failed to get instance ID from cloud provider: %v", err)
|
||||
return "", fmt.Errorf("failed to get instance ID from cloud provider: %w", err)
|
||||
}
|
||||
return cloud.ProviderName() + "://" + instanceID, nil
|
||||
}
|
||||
|
@ -152,7 +152,7 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
|
||||
|
||||
// At this point the node has NotReady status, we need to check if the node has been removed
|
||||
// from the cloud provider. If node cannot be found in cloudprovider, then delete the node
|
||||
exists, err := ensureNodeExistsByProviderID(ctx, c.cloud, node)
|
||||
exists, err := c.ensureNodeExistsByProviderID(ctx, node)
|
||||
if err != nil {
|
||||
klog.Errorf("error checking if node %s exists: %v", node.Name, err)
|
||||
continue
|
||||
@ -180,7 +180,7 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
|
||||
// Node exists. We need to check this to get taint working in similar in all cloudproviders
|
||||
// current problem is that shutdown nodes are not working in similar way ie. all cloudproviders
|
||||
// does not delete node from kubernetes cluster when instance it is shutdown see issue #46442
|
||||
shutdown, err := shutdownInCloudProvider(ctx, c.cloud, node)
|
||||
shutdown, err := c.shutdownInCloudProvider(ctx, node)
|
||||
if err != nil {
|
||||
klog.Errorf("error checking if node %s is shutdown: %v", node.Name, err)
|
||||
}
|
||||
@ -196,18 +196,49 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
// getProviderID returns the provider ID for the node. If Node CR has no provider ID,
|
||||
// it will be the one from the cloud provider.
|
||||
func (c *CloudNodeLifecycleController) getProviderID(ctx context.Context, node *v1.Node) (string, error) {
|
||||
if node.Spec.ProviderID != "" {
|
||||
return node.Spec.ProviderID, nil
|
||||
}
|
||||
|
||||
if instanceV2, ok := c.cloud.InstancesV2(); ok {
|
||||
metadata, err := instanceV2.InstanceMetadata(ctx, node)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return metadata.ProviderID, nil
|
||||
}
|
||||
|
||||
providerID, err := cloudprovider.GetInstanceProviderID(ctx, c.cloud, types.NodeName(node.Name))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return providerID, nil
|
||||
}
|
||||
|
||||
// shutdownInCloudProvider returns true if the node is shutdown on the cloud provider
|
||||
func shutdownInCloudProvider(ctx context.Context, cloud cloudprovider.Interface, node *v1.Node) (bool, error) {
|
||||
if instanceV2, ok := cloud.InstancesV2(); ok {
|
||||
func (c *CloudNodeLifecycleController) shutdownInCloudProvider(ctx context.Context, node *v1.Node) (bool, error) {
|
||||
if instanceV2, ok := c.cloud.InstancesV2(); ok {
|
||||
return instanceV2.InstanceShutdown(ctx, node)
|
||||
}
|
||||
|
||||
instances, ok := cloud.Instances()
|
||||
instances, ok := c.cloud.Instances()
|
||||
if !ok {
|
||||
return false, errors.New("cloud provider does not support instances")
|
||||
}
|
||||
|
||||
shutdown, err := instances.InstanceShutdownByProviderID(ctx, node.Spec.ProviderID)
|
||||
providerID, err := c.getProviderID(ctx, node)
|
||||
if err != nil {
|
||||
if err == cloudprovider.InstanceNotFound {
|
||||
return false, nil
|
||||
}
|
||||
return false, err
|
||||
}
|
||||
|
||||
shutdown, err := instances.InstanceShutdownByProviderID(ctx, providerID)
|
||||
if err == cloudprovider.NotImplemented {
|
||||
return false, nil
|
||||
}
|
||||
@ -216,32 +247,22 @@ func shutdownInCloudProvider(ctx context.Context, cloud cloudprovider.Interface,
|
||||
}
|
||||
|
||||
// ensureNodeExistsByProviderID checks if the instance exists by the provider id,
|
||||
// If provider id in spec is empty it calls instanceId with node name to get provider id
|
||||
func ensureNodeExistsByProviderID(ctx context.Context, cloud cloudprovider.Interface, node *v1.Node) (bool, error) {
|
||||
if instanceV2, ok := cloud.InstancesV2(); ok {
|
||||
func (c *CloudNodeLifecycleController) ensureNodeExistsByProviderID(ctx context.Context, node *v1.Node) (bool, error) {
|
||||
if instanceV2, ok := c.cloud.InstancesV2(); ok {
|
||||
return instanceV2.InstanceExists(ctx, node)
|
||||
}
|
||||
|
||||
instances, ok := cloud.Instances()
|
||||
instances, ok := c.cloud.Instances()
|
||||
if !ok {
|
||||
return false, errors.New("instances interface not supported in the cloud provider")
|
||||
}
|
||||
|
||||
providerID := node.Spec.ProviderID
|
||||
if providerID == "" {
|
||||
var err error
|
||||
providerID, err = instances.InstanceID(ctx, types.NodeName(node.Name))
|
||||
if err != nil {
|
||||
if err == cloudprovider.InstanceNotFound {
|
||||
return false, nil
|
||||
}
|
||||
return false, err
|
||||
}
|
||||
|
||||
if providerID == "" {
|
||||
klog.Warningf("Cannot find valid providerID for node name %q, assuming non existence", node.Name)
|
||||
providerID, err := c.getProviderID(ctx, node)
|
||||
if err != nil {
|
||||
if err == cloudprovider.InstanceNotFound {
|
||||
return false, nil
|
||||
}
|
||||
return false, err
|
||||
}
|
||||
|
||||
return instances.InstanceExistsByProviderID(ctx, providerID)
|
||||
|
@ -19,6 +19,7 @@ package cloud
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"github.com/google/go-cmp/cmp"
|
||||
"reflect"
|
||||
"testing"
|
||||
"time"
|
||||
@ -32,6 +33,7 @@ import (
|
||||
"k8s.io/client-go/kubernetes/fake"
|
||||
"k8s.io/client-go/kubernetes/scheme"
|
||||
"k8s.io/client-go/tools/record"
|
||||
cloudprovider "k8s.io/cloud-provider"
|
||||
fakecloud "k8s.io/cloud-provider/fake"
|
||||
"k8s.io/klog/v2"
|
||||
)
|
||||
@ -598,6 +600,165 @@ func Test_NodesShutdown(t *testing.T) {
|
||||
ErrShutdownByProviderID: nil,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "node with empty spec providerID is not ready and was shutdown, but exists",
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
|
||||
},
|
||||
Spec: v1.NodeSpec{
|
||||
ProviderID: "",
|
||||
},
|
||||
Status: v1.NodeStatus{
|
||||
Conditions: []v1.NodeCondition{
|
||||
{
|
||||
Type: v1.NodeReady,
|
||||
Status: v1.ConditionFalse,
|
||||
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
|
||||
},
|
||||
Spec: v1.NodeSpec{
|
||||
ProviderID: "",
|
||||
Taints: []v1.Taint{
|
||||
*ShutdownTaint,
|
||||
},
|
||||
},
|
||||
Status: v1.NodeStatus{
|
||||
Conditions: []v1.NodeCondition{
|
||||
{
|
||||
Type: v1.NodeReady,
|
||||
Status: v1.ConditionFalse,
|
||||
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedDeleted: false,
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
NodeShutdown: true,
|
||||
ExistsByProviderID: true,
|
||||
ErrShutdownByProviderID: nil,
|
||||
ExtID: map[types.NodeName]string{
|
||||
types.NodeName("node0"): "foo://12345",
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "node with non-existing providerID (missing in cloud provider) gets deleted",
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
|
||||
},
|
||||
Spec: v1.NodeSpec{
|
||||
ProviderID: "",
|
||||
},
|
||||
Status: v1.NodeStatus{
|
||||
Conditions: []v1.NodeCondition{
|
||||
{
|
||||
Type: v1.NodeReady,
|
||||
Status: v1.ConditionFalse,
|
||||
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedDeleted: true,
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
ErrShutdownByProviderID: nil,
|
||||
ExtID: map[types.NodeName]string{
|
||||
types.NodeName("node0"): "",
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "node with error when getting providerID does not have shutdown taint",
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
|
||||
},
|
||||
Spec: v1.NodeSpec{
|
||||
ProviderID: "",
|
||||
},
|
||||
Status: v1.NodeStatus{
|
||||
Conditions: []v1.NodeCondition{
|
||||
{
|
||||
Type: v1.NodeReady,
|
||||
Status: v1.ConditionFalse,
|
||||
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
|
||||
},
|
||||
Spec: v1.NodeSpec{
|
||||
ProviderID: "",
|
||||
},
|
||||
Status: v1.NodeStatus{
|
||||
Conditions: []v1.NodeCondition{
|
||||
{
|
||||
Type: v1.NodeReady,
|
||||
Status: v1.ConditionFalse,
|
||||
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedDeleted: false,
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
ErrShutdownByProviderID: nil,
|
||||
ExtIDErr: map[types.NodeName]error{
|
||||
types.NodeName("node0"): errors.New("err!"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "node with InstanceID returning InstanceNotFound gets deleted",
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
|
||||
},
|
||||
Spec: v1.NodeSpec{
|
||||
ProviderID: "",
|
||||
},
|
||||
Status: v1.NodeStatus{
|
||||
Conditions: []v1.NodeCondition{
|
||||
{
|
||||
Type: v1.NodeReady,
|
||||
Status: v1.ConditionFalse,
|
||||
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedDeleted: true,
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
ErrShutdownByProviderID: nil,
|
||||
ExtIDErr: map[types.NodeName]error{
|
||||
types.NodeName("node0"): cloudprovider.InstanceNotFound,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "node is not ready, but there is error checking if node is shutdown",
|
||||
existingNode: &v1.Node{
|
||||
@ -749,6 +910,157 @@ func Test_NodesShutdown(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func Test_GetProviderID(t *testing.T) {
|
||||
testcases := []struct {
|
||||
name string
|
||||
fakeCloud *fakecloud.Cloud
|
||||
existingNode *v1.Node
|
||||
expectedProviderID string
|
||||
expectedErr error
|
||||
}{
|
||||
{
|
||||
name: "node initialized with provider ID",
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
EnableInstancesV2: false,
|
||||
Err: nil,
|
||||
},
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
},
|
||||
Spec: v1.NodeSpec{
|
||||
ProviderID: "fake://12345",
|
||||
},
|
||||
},
|
||||
expectedProviderID: "fake://12345",
|
||||
expectedErr: nil,
|
||||
},
|
||||
{
|
||||
name: "node initialized with provider ID with InstancesV2",
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
EnableInstancesV2: true,
|
||||
Err: nil,
|
||||
},
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
},
|
||||
Spec: v1.NodeSpec{
|
||||
ProviderID: "fake://12345",
|
||||
},
|
||||
},
|
||||
expectedProviderID: "fake://12345",
|
||||
expectedErr: nil,
|
||||
},
|
||||
{
|
||||
name: "cloud implemented with Instances",
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
EnableInstancesV2: false,
|
||||
ExtID: map[types.NodeName]string{
|
||||
types.NodeName("node0"): "12345",
|
||||
},
|
||||
Err: nil,
|
||||
},
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
},
|
||||
},
|
||||
expectedProviderID: "fake://12345",
|
||||
expectedErr: nil,
|
||||
},
|
||||
{
|
||||
name: "cloud implemented with InstancesV2",
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
EnableInstancesV2: true,
|
||||
ProviderID: map[types.NodeName]string{
|
||||
types.NodeName("node0"): "fake://12345",
|
||||
},
|
||||
Err: nil,
|
||||
},
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
},
|
||||
},
|
||||
expectedProviderID: "fake://12345",
|
||||
expectedErr: nil,
|
||||
},
|
||||
{
|
||||
name: "cloud implemented with InstancesV2 (without providerID)",
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
EnableInstancesV2: true,
|
||||
Err: nil,
|
||||
},
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
},
|
||||
},
|
||||
expectedProviderID: "",
|
||||
expectedErr: nil,
|
||||
},
|
||||
{
|
||||
name: "cloud implemented with Instances with instance missing",
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
EnableInstancesV2: false,
|
||||
ExtIDErr: map[types.NodeName]error{
|
||||
types.NodeName("node0"): cloudprovider.InstanceNotFound,
|
||||
},
|
||||
Err: nil,
|
||||
},
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
},
|
||||
},
|
||||
expectedProviderID: "",
|
||||
expectedErr: cloudprovider.InstanceNotFound,
|
||||
},
|
||||
{
|
||||
name: "cloud implemented with Instances with unknown error",
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
EnableInstancesV2: false,
|
||||
ExtIDErr: map[types.NodeName]error{
|
||||
types.NodeName("node0"): errors.New("unknown error"),
|
||||
},
|
||||
Err: nil,
|
||||
},
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
},
|
||||
},
|
||||
expectedProviderID: "",
|
||||
expectedErr: errors.New("failed to get instance ID from cloud provider: unknown error"),
|
||||
},
|
||||
}
|
||||
|
||||
for _, testcase := range testcases {
|
||||
t.Run(testcase.name, func(t *testing.T) {
|
||||
cloudNodeLifecycleController := &CloudNodeLifecycleController{
|
||||
cloud: testcase.fakeCloud,
|
||||
}
|
||||
|
||||
providerID, err := cloudNodeLifecycleController.getProviderID(context.TODO(), testcase.existingNode)
|
||||
|
||||
if err != nil && testcase.expectedErr == nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if err == nil && testcase.expectedErr != nil {
|
||||
t.Fatalf("did not get expected error %q", testcase.expectedErr)
|
||||
}
|
||||
if err != nil && err.Error() != testcase.expectedErr.Error() {
|
||||
t.Fatalf("expected error %q, got %q", testcase.expectedErr.Error(), err.Error())
|
||||
}
|
||||
|
||||
if !cmp.Equal(providerID, testcase.expectedProviderID) {
|
||||
t.Errorf("unexpected providerID %s", cmp.Diff(providerID, testcase.expectedProviderID))
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func syncNodeStore(nodeinformer coreinformers.NodeInformer, f *fake.Clientset) error {
|
||||
nodes, err := f.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
|
||||
if err != nil {
|
||||
|
@ -312,6 +312,11 @@ func (f *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID strin
|
||||
// InstanceShutdownByProviderID returns true if the instances is in safe state to detach volumes
|
||||
func (f *Cloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
|
||||
f.addCall("instance-shutdown-by-provider-id")
|
||||
|
||||
if providerID == "" {
|
||||
return false, fmt.Errorf("cannot shutdown instance with empty providerID")
|
||||
}
|
||||
|
||||
return f.NodeShutdown, f.ErrShutdownByProviderID
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user