mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-08-01 15:58:37 +00:00
Merge pull request #120615 from mkowalski/OCPBUGS-18641
cloud-node-lifecycle controller: add fallback for empty providerID in shutdown
This commit is contained in:
commit
036091645d
@ -98,6 +98,8 @@ func DefaultLoadBalancerName(service *v1.Service) string {
|
||||
}
|
||||
|
||||
// GetInstanceProviderID builds a ProviderID for a node in a cloud.
|
||||
// Note that if the instance does not exist, we must return ("", cloudprovider.InstanceNotFound)
|
||||
// cloudprovider.InstanceNotFound should NOT be returned for instances that exist but are stopped/sleeping
|
||||
func GetInstanceProviderID(ctx context.Context, cloud Interface, nodeName types.NodeName) (string, error) {
|
||||
instances, ok := cloud.Instances()
|
||||
if !ok {
|
||||
@ -108,8 +110,11 @@ func GetInstanceProviderID(ctx context.Context, cloud Interface, nodeName types.
|
||||
if err == NotImplemented {
|
||||
return "", err
|
||||
}
|
||||
if err == InstanceNotFound {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return "", fmt.Errorf("failed to get instance ID from cloud provider: %v", err)
|
||||
return "", fmt.Errorf("failed to get instance ID from cloud provider: %w", err)
|
||||
}
|
||||
return cloud.ProviderName() + "://" + instanceID, nil
|
||||
}
|
||||
|
@ -152,7 +152,7 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
|
||||
|
||||
// At this point the node has NotReady status, we need to check if the node has been removed
|
||||
// from the cloud provider. If node cannot be found in cloudprovider, then delete the node
|
||||
exists, err := ensureNodeExistsByProviderID(ctx, c.cloud, node)
|
||||
exists, err := c.ensureNodeExistsByProviderID(ctx, node)
|
||||
if err != nil {
|
||||
klog.Errorf("error checking if node %s exists: %v", node.Name, err)
|
||||
continue
|
||||
@ -180,7 +180,7 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
|
||||
// Node exists. We need to check this to get taint working in similar in all cloudproviders
|
||||
// current problem is that shutdown nodes are not working in similar way ie. all cloudproviders
|
||||
// does not delete node from kubernetes cluster when instance it is shutdown see issue #46442
|
||||
shutdown, err := shutdownInCloudProvider(ctx, c.cloud, node)
|
||||
shutdown, err := c.shutdownInCloudProvider(ctx, node)
|
||||
if err != nil {
|
||||
klog.Errorf("error checking if node %s is shutdown: %v", node.Name, err)
|
||||
}
|
||||
@ -196,18 +196,49 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
// getProviderID returns the provider ID for the node. If Node CR has no provider ID,
|
||||
// it will be the one from the cloud provider.
|
||||
func (c *CloudNodeLifecycleController) getProviderID(ctx context.Context, node *v1.Node) (string, error) {
|
||||
if node.Spec.ProviderID != "" {
|
||||
return node.Spec.ProviderID, nil
|
||||
}
|
||||
|
||||
if instanceV2, ok := c.cloud.InstancesV2(); ok {
|
||||
metadata, err := instanceV2.InstanceMetadata(ctx, node)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return metadata.ProviderID, nil
|
||||
}
|
||||
|
||||
providerID, err := cloudprovider.GetInstanceProviderID(ctx, c.cloud, types.NodeName(node.Name))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return providerID, nil
|
||||
}
|
||||
|
||||
// shutdownInCloudProvider returns true if the node is shutdown on the cloud provider
|
||||
func shutdownInCloudProvider(ctx context.Context, cloud cloudprovider.Interface, node *v1.Node) (bool, error) {
|
||||
if instanceV2, ok := cloud.InstancesV2(); ok {
|
||||
func (c *CloudNodeLifecycleController) shutdownInCloudProvider(ctx context.Context, node *v1.Node) (bool, error) {
|
||||
if instanceV2, ok := c.cloud.InstancesV2(); ok {
|
||||
return instanceV2.InstanceShutdown(ctx, node)
|
||||
}
|
||||
|
||||
instances, ok := cloud.Instances()
|
||||
instances, ok := c.cloud.Instances()
|
||||
if !ok {
|
||||
return false, errors.New("cloud provider does not support instances")
|
||||
}
|
||||
|
||||
shutdown, err := instances.InstanceShutdownByProviderID(ctx, node.Spec.ProviderID)
|
||||
providerID, err := c.getProviderID(ctx, node)
|
||||
if err != nil {
|
||||
if err == cloudprovider.InstanceNotFound {
|
||||
return false, nil
|
||||
}
|
||||
return false, err
|
||||
}
|
||||
|
||||
shutdown, err := instances.InstanceShutdownByProviderID(ctx, providerID)
|
||||
if err == cloudprovider.NotImplemented {
|
||||
return false, nil
|
||||
}
|
||||
@ -216,32 +247,22 @@ func shutdownInCloudProvider(ctx context.Context, cloud cloudprovider.Interface,
|
||||
}
|
||||
|
||||
// ensureNodeExistsByProviderID checks if the instance exists by the provider id,
|
||||
// If provider id in spec is empty it calls instanceId with node name to get provider id
|
||||
func ensureNodeExistsByProviderID(ctx context.Context, cloud cloudprovider.Interface, node *v1.Node) (bool, error) {
|
||||
if instanceV2, ok := cloud.InstancesV2(); ok {
|
||||
func (c *CloudNodeLifecycleController) ensureNodeExistsByProviderID(ctx context.Context, node *v1.Node) (bool, error) {
|
||||
if instanceV2, ok := c.cloud.InstancesV2(); ok {
|
||||
return instanceV2.InstanceExists(ctx, node)
|
||||
}
|
||||
|
||||
instances, ok := cloud.Instances()
|
||||
instances, ok := c.cloud.Instances()
|
||||
if !ok {
|
||||
return false, errors.New("instances interface not supported in the cloud provider")
|
||||
}
|
||||
|
||||
providerID := node.Spec.ProviderID
|
||||
if providerID == "" {
|
||||
var err error
|
||||
providerID, err = instances.InstanceID(ctx, types.NodeName(node.Name))
|
||||
if err != nil {
|
||||
if err == cloudprovider.InstanceNotFound {
|
||||
return false, nil
|
||||
}
|
||||
return false, err
|
||||
}
|
||||
|
||||
if providerID == "" {
|
||||
klog.Warningf("Cannot find valid providerID for node name %q, assuming non existence", node.Name)
|
||||
providerID, err := c.getProviderID(ctx, node)
|
||||
if err != nil {
|
||||
if err == cloudprovider.InstanceNotFound {
|
||||
return false, nil
|
||||
}
|
||||
return false, err
|
||||
}
|
||||
|
||||
return instances.InstanceExistsByProviderID(ctx, providerID)
|
||||
|
@ -19,6 +19,7 @@ package cloud
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"github.com/google/go-cmp/cmp"
|
||||
"reflect"
|
||||
"testing"
|
||||
"time"
|
||||
@ -32,6 +33,7 @@ import (
|
||||
"k8s.io/client-go/kubernetes/fake"
|
||||
"k8s.io/client-go/kubernetes/scheme"
|
||||
"k8s.io/client-go/tools/record"
|
||||
cloudprovider "k8s.io/cloud-provider"
|
||||
fakecloud "k8s.io/cloud-provider/fake"
|
||||
"k8s.io/klog/v2"
|
||||
)
|
||||
@ -598,6 +600,165 @@ func Test_NodesShutdown(t *testing.T) {
|
||||
ErrShutdownByProviderID: nil,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "node with empty spec providerID is not ready and was shutdown, but exists",
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
|
||||
},
|
||||
Spec: v1.NodeSpec{
|
||||
ProviderID: "",
|
||||
},
|
||||
Status: v1.NodeStatus{
|
||||
Conditions: []v1.NodeCondition{
|
||||
{
|
||||
Type: v1.NodeReady,
|
||||
Status: v1.ConditionFalse,
|
||||
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
|
||||
},
|
||||
Spec: v1.NodeSpec{
|
||||
ProviderID: "",
|
||||
Taints: []v1.Taint{
|
||||
*ShutdownTaint,
|
||||
},
|
||||
},
|
||||
Status: v1.NodeStatus{
|
||||
Conditions: []v1.NodeCondition{
|
||||
{
|
||||
Type: v1.NodeReady,
|
||||
Status: v1.ConditionFalse,
|
||||
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedDeleted: false,
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
NodeShutdown: true,
|
||||
ExistsByProviderID: true,
|
||||
ErrShutdownByProviderID: nil,
|
||||
ExtID: map[types.NodeName]string{
|
||||
types.NodeName("node0"): "foo://12345",
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "node with non-existing providerID (missing in cloud provider) gets deleted",
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
|
||||
},
|
||||
Spec: v1.NodeSpec{
|
||||
ProviderID: "",
|
||||
},
|
||||
Status: v1.NodeStatus{
|
||||
Conditions: []v1.NodeCondition{
|
||||
{
|
||||
Type: v1.NodeReady,
|
||||
Status: v1.ConditionFalse,
|
||||
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedDeleted: true,
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
ErrShutdownByProviderID: nil,
|
||||
ExtID: map[types.NodeName]string{
|
||||
types.NodeName("node0"): "",
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "node with error when getting providerID does not have shutdown taint",
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
|
||||
},
|
||||
Spec: v1.NodeSpec{
|
||||
ProviderID: "",
|
||||
},
|
||||
Status: v1.NodeStatus{
|
||||
Conditions: []v1.NodeCondition{
|
||||
{
|
||||
Type: v1.NodeReady,
|
||||
Status: v1.ConditionFalse,
|
||||
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
|
||||
},
|
||||
Spec: v1.NodeSpec{
|
||||
ProviderID: "",
|
||||
},
|
||||
Status: v1.NodeStatus{
|
||||
Conditions: []v1.NodeCondition{
|
||||
{
|
||||
Type: v1.NodeReady,
|
||||
Status: v1.ConditionFalse,
|
||||
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedDeleted: false,
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
ErrShutdownByProviderID: nil,
|
||||
ExtIDErr: map[types.NodeName]error{
|
||||
types.NodeName("node0"): errors.New("err!"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "node with InstanceID returning InstanceNotFound gets deleted",
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
|
||||
},
|
||||
Spec: v1.NodeSpec{
|
||||
ProviderID: "",
|
||||
},
|
||||
Status: v1.NodeStatus{
|
||||
Conditions: []v1.NodeCondition{
|
||||
{
|
||||
Type: v1.NodeReady,
|
||||
Status: v1.ConditionFalse,
|
||||
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedDeleted: true,
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
ErrShutdownByProviderID: nil,
|
||||
ExtIDErr: map[types.NodeName]error{
|
||||
types.NodeName("node0"): cloudprovider.InstanceNotFound,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "node is not ready, but there is error checking if node is shutdown",
|
||||
existingNode: &v1.Node{
|
||||
@ -749,6 +910,157 @@ func Test_NodesShutdown(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func Test_GetProviderID(t *testing.T) {
|
||||
testcases := []struct {
|
||||
name string
|
||||
fakeCloud *fakecloud.Cloud
|
||||
existingNode *v1.Node
|
||||
expectedProviderID string
|
||||
expectedErr error
|
||||
}{
|
||||
{
|
||||
name: "node initialized with provider ID",
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
EnableInstancesV2: false,
|
||||
Err: nil,
|
||||
},
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
},
|
||||
Spec: v1.NodeSpec{
|
||||
ProviderID: "fake://12345",
|
||||
},
|
||||
},
|
||||
expectedProviderID: "fake://12345",
|
||||
expectedErr: nil,
|
||||
},
|
||||
{
|
||||
name: "node initialized with provider ID with InstancesV2",
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
EnableInstancesV2: true,
|
||||
Err: nil,
|
||||
},
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
},
|
||||
Spec: v1.NodeSpec{
|
||||
ProviderID: "fake://12345",
|
||||
},
|
||||
},
|
||||
expectedProviderID: "fake://12345",
|
||||
expectedErr: nil,
|
||||
},
|
||||
{
|
||||
name: "cloud implemented with Instances",
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
EnableInstancesV2: false,
|
||||
ExtID: map[types.NodeName]string{
|
||||
types.NodeName("node0"): "12345",
|
||||
},
|
||||
Err: nil,
|
||||
},
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
},
|
||||
},
|
||||
expectedProviderID: "fake://12345",
|
||||
expectedErr: nil,
|
||||
},
|
||||
{
|
||||
name: "cloud implemented with InstancesV2",
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
EnableInstancesV2: true,
|
||||
ProviderID: map[types.NodeName]string{
|
||||
types.NodeName("node0"): "fake://12345",
|
||||
},
|
||||
Err: nil,
|
||||
},
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
},
|
||||
},
|
||||
expectedProviderID: "fake://12345",
|
||||
expectedErr: nil,
|
||||
},
|
||||
{
|
||||
name: "cloud implemented with InstancesV2 (without providerID)",
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
EnableInstancesV2: true,
|
||||
Err: nil,
|
||||
},
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
},
|
||||
},
|
||||
expectedProviderID: "",
|
||||
expectedErr: nil,
|
||||
},
|
||||
{
|
||||
name: "cloud implemented with Instances with instance missing",
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
EnableInstancesV2: false,
|
||||
ExtIDErr: map[types.NodeName]error{
|
||||
types.NodeName("node0"): cloudprovider.InstanceNotFound,
|
||||
},
|
||||
Err: nil,
|
||||
},
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
},
|
||||
},
|
||||
expectedProviderID: "",
|
||||
expectedErr: cloudprovider.InstanceNotFound,
|
||||
},
|
||||
{
|
||||
name: "cloud implemented with Instances with unknown error",
|
||||
fakeCloud: &fakecloud.Cloud{
|
||||
EnableInstancesV2: false,
|
||||
ExtIDErr: map[types.NodeName]error{
|
||||
types.NodeName("node0"): errors.New("unknown error"),
|
||||
},
|
||||
Err: nil,
|
||||
},
|
||||
existingNode: &v1.Node{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "node0",
|
||||
},
|
||||
},
|
||||
expectedProviderID: "",
|
||||
expectedErr: errors.New("failed to get instance ID from cloud provider: unknown error"),
|
||||
},
|
||||
}
|
||||
|
||||
for _, testcase := range testcases {
|
||||
t.Run(testcase.name, func(t *testing.T) {
|
||||
cloudNodeLifecycleController := &CloudNodeLifecycleController{
|
||||
cloud: testcase.fakeCloud,
|
||||
}
|
||||
|
||||
providerID, err := cloudNodeLifecycleController.getProviderID(context.TODO(), testcase.existingNode)
|
||||
|
||||
if err != nil && testcase.expectedErr == nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if err == nil && testcase.expectedErr != nil {
|
||||
t.Fatalf("did not get expected error %q", testcase.expectedErr)
|
||||
}
|
||||
if err != nil && err.Error() != testcase.expectedErr.Error() {
|
||||
t.Fatalf("expected error %q, got %q", testcase.expectedErr.Error(), err.Error())
|
||||
}
|
||||
|
||||
if !cmp.Equal(providerID, testcase.expectedProviderID) {
|
||||
t.Errorf("unexpected providerID %s", cmp.Diff(providerID, testcase.expectedProviderID))
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func syncNodeStore(nodeinformer coreinformers.NodeInformer, f *fake.Clientset) error {
|
||||
nodes, err := f.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
|
||||
if err != nil {
|
||||
|
@ -312,6 +312,11 @@ func (f *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID strin
|
||||
// InstanceShutdownByProviderID returns true if the instances is in safe state to detach volumes
|
||||
func (f *Cloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
|
||||
f.addCall("instance-shutdown-by-provider-id")
|
||||
|
||||
if providerID == "" {
|
||||
return false, fmt.Errorf("cannot shutdown instance with empty providerID")
|
||||
}
|
||||
|
||||
return f.NodeShutdown, f.ErrShutdownByProviderID
|
||||
}
|
||||
|
||||
|
@ -21,7 +21,6 @@ package gce
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
@ -55,7 +54,7 @@ func TestInstanceExists(t *testing.T) {
|
||||
name: "node not exist",
|
||||
nodeName: "test-node-2",
|
||||
exist: false,
|
||||
expectedErr: fmt.Errorf("failed to get instance ID from cloud provider: instance not found"),
|
||||
expectedErr: nil,
|
||||
},
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user