Merge pull request #120615 from mkowalski/OCPBUGS-18641

cloud-node-lifecycle controller: add fallback for empty providerID in shutdown
Kubernetes Prow Robot 2023-10-27 08:51:17 +02:00 committed by GitHub
commit 036091645d
5 changed files with 368 additions and 26 deletions
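
In short: before this change the lifecycle checks used node.Spec.ProviderID directly, so a node whose spec carried an empty providerID could not be checked for shutdown or existence. The hunks below add a fallback that resolves the provider ID from the cloud provider itself. A condensed sketch of the resolution order, assuming the usual k8s.io/cloud-provider and k8s.io/apimachinery imports (the real implementation is CloudNodeLifecycleController.getProviderID in the second file; the standalone name here is illustrative):

// Condensed, illustrative sketch of the fallback order introduced by this PR.
func resolveProviderID(ctx context.Context, cloud cloudprovider.Interface, node *v1.Node) (string, error) {
	// 1. Prefer the provider ID already recorded on the Node object.
	if node.Spec.ProviderID != "" {
		return node.Spec.ProviderID, nil
	}
	// 2. Otherwise ask the InstancesV2 API for the node's metadata, if implemented.
	if instancesV2, ok := cloud.InstancesV2(); ok {
		metadata, err := instancesV2.InstanceMetadata(ctx, node)
		if err != nil {
			return "", err
		}
		return metadata.ProviderID, nil
	}
	// 3. Fall back to the legacy Instances API, building "<provider>://<instance-id>" from the node name.
	return cloudprovider.GetInstanceProviderID(ctx, cloud, types.NodeName(node.Name))
}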


@@ -98,6 +98,8 @@ func DefaultLoadBalancerName(service *v1.Service) string {
}
// GetInstanceProviderID builds a ProviderID for a node in a cloud.
// Note that if the instance does not exist, we must return ("", cloudprovider.InstanceNotFound)
// cloudprovider.InstanceNotFound should NOT be returned for instances that exist but are stopped/sleeping
func GetInstanceProviderID(ctx context.Context, cloud Interface, nodeName types.NodeName) (string, error) {
instances, ok := cloud.Instances()
if !ok {
@@ -108,8 +110,11 @@ func GetInstanceProviderID(ctx context.Context, cloud Interface, nodeName types.
if err == NotImplemented {
return "", err
}
if err == InstanceNotFound {
return "", err
}
return "", fmt.Errorf("failed to get instance ID from cloud provider: %v", err)
return "", fmt.Errorf("failed to get instance ID from cloud provider: %w", err)
}
return cloud.ProviderName() + "://" + instanceID, nil
}
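
The comment added above makes the error contract explicit: a missing instance must surface as cloudprovider.InstanceNotFound, which the new check now returns unwrapped instead of folding it into the generic error, and the switch from %v to %w keeps other causes inspectable with errors.Is. A minimal caller-side sketch (the helper name is hypothetical; the lifecycle controller below applies the same handling):

// Hypothetical helper illustrating the intended caller-side handling.
func nodeStillExists(ctx context.Context, cloud cloudprovider.Interface, name types.NodeName) (bool, error) {
	_, err := cloudprovider.GetInstanceProviderID(ctx, cloud, name)
	if errors.Is(err, cloudprovider.InstanceNotFound) {
		return false, nil // instance is gone: safe to treat the node as non-existent
	}
	if err != nil {
		return false, err // transient or unknown cloud error: do not act on the node
	}
	return true, nil
}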


@@ -152,7 +152,7 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
// At this point the node has NotReady status; we need to check whether it has been removed
// from the cloud provider. If the node cannot be found there, delete the Node object.
exists, err := ensureNodeExistsByProviderID(ctx, c.cloud, node)
exists, err := c.ensureNodeExistsByProviderID(ctx, node)
if err != nil {
klog.Errorf("error checking if node %s exists: %v", node.Name, err)
continue
@@ -180,7 +180,7 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
// Node exists. We still need to check its shutdown state so the shutdown taint behaves
// consistently across cloud providers: the current problem is that shutdown handling differs,
// i.e. not all cloud providers delete the Node from the Kubernetes cluster when its instance is shut down; see issue #46442
shutdown, err := shutdownInCloudProvider(ctx, c.cloud, node)
shutdown, err := c.shutdownInCloudProvider(ctx, node)
if err != nil {
klog.Errorf("error checking if node %s is shutdown: %v", node.Name, err)
}
@@ -196,18 +196,49 @@ func (c *CloudNodeLifecycleController) MonitorNodes(ctx context.Context) {
}
}
// getProviderID returns the provider ID for the node. If the Node object has no provider ID,
// it is obtained from the cloud provider.
func (c *CloudNodeLifecycleController) getProviderID(ctx context.Context, node *v1.Node) (string, error) {
if node.Spec.ProviderID != "" {
return node.Spec.ProviderID, nil
}
if instanceV2, ok := c.cloud.InstancesV2(); ok {
metadata, err := instanceV2.InstanceMetadata(ctx, node)
if err != nil {
return "", err
}
return metadata.ProviderID, nil
}
providerID, err := cloudprovider.GetInstanceProviderID(ctx, c.cloud, types.NodeName(node.Name))
if err != nil {
return "", err
}
return providerID, nil
}
// shutdownInCloudProvider returns true if the node is shut down on the cloud provider
func shutdownInCloudProvider(ctx context.Context, cloud cloudprovider.Interface, node *v1.Node) (bool, error) {
if instanceV2, ok := cloud.InstancesV2(); ok {
func (c *CloudNodeLifecycleController) shutdownInCloudProvider(ctx context.Context, node *v1.Node) (bool, error) {
if instanceV2, ok := c.cloud.InstancesV2(); ok {
return instanceV2.InstanceShutdown(ctx, node)
}
instances, ok := cloud.Instances()
instances, ok := c.cloud.Instances()
if !ok {
return false, errors.New("cloud provider does not support instances")
}
shutdown, err := instances.InstanceShutdownByProviderID(ctx, node.Spec.ProviderID)
providerID, err := c.getProviderID(ctx, node)
if err != nil {
if err == cloudprovider.InstanceNotFound {
return false, nil
}
return false, err
}
shutdown, err := instances.InstanceShutdownByProviderID(ctx, providerID)
if err == cloudprovider.NotImplemented {
return false, nil
}
@@ -216,32 +247,22 @@ func shutdownInCloudProvider(ctx context.Context, cloud cloudprovider.Interface,
}
// ensureNodeExistsByProviderID checks whether the instance exists by its provider ID.
// If the provider ID in the spec is empty, it is resolved from the cloud provider.
func ensureNodeExistsByProviderID(ctx context.Context, cloud cloudprovider.Interface, node *v1.Node) (bool, error) {
if instanceV2, ok := cloud.InstancesV2(); ok {
func (c *CloudNodeLifecycleController) ensureNodeExistsByProviderID(ctx context.Context, node *v1.Node) (bool, error) {
if instanceV2, ok := c.cloud.InstancesV2(); ok {
return instanceV2.InstanceExists(ctx, node)
}
instances, ok := cloud.Instances()
instances, ok := c.cloud.Instances()
if !ok {
return false, errors.New("instances interface not supported in the cloud provider")
}
providerID := node.Spec.ProviderID
if providerID == "" {
var err error
providerID, err = instances.InstanceID(ctx, types.NodeName(node.Name))
if err != nil {
if err == cloudprovider.InstanceNotFound {
return false, nil
}
return false, err
}
if providerID == "" {
klog.Warningf("Cannot find valid providerID for node name %q, assuming non existence", node.Name)
providerID, err := c.getProviderID(ctx, node)
if err != nil {
if err == cloudprovider.InstanceNotFound {
return false, nil
}
return false, err
}
return instances.InstanceExistsByProviderID(ctx, providerID)
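
Taken together, the hunks in this file reduce the handling of a NotReady node to the flow sketched below (a sketch only; MonitorNodes above is authoritative, and the delete/taint helpers are placeholders for the controller's existing paths):

// Sketch of the decision flow for a NotReady node (placeholder helpers).
func handleNotReadyNode(ctx context.Context, c *CloudNodeLifecycleController, node *v1.Node) {
	exists, err := c.ensureNodeExistsByProviderID(ctx, node)
	if err != nil {
		return // could not check existence; retry on the next monitor tick
	}
	if !exists {
		deleteNodeObject(ctx, node) // placeholder: instance is gone, delete the Node
		return
	}
	// The instance still exists; if it is merely shut down, taint instead of deleting.
	if shutdown, err := c.shutdownInCloudProvider(ctx, node); err == nil && shutdown {
		applyShutdownTaint(ctx, node) // placeholder for the shutdown-taint handling
	}
}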


@@ -19,6 +19,7 @@ package cloud
import (
"context"
"errors"
"github.com/google/go-cmp/cmp"
"reflect"
"testing"
"time"
@@ -32,6 +33,7 @@ import (
"k8s.io/client-go/kubernetes/fake"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/tools/record"
cloudprovider "k8s.io/cloud-provider"
fakecloud "k8s.io/cloud-provider/fake"
"k8s.io/klog/v2"
)
@@ -598,6 +600,165 @@ func Test_NodesShutdown(t *testing.T) {
ErrShutdownByProviderID: nil,
},
},
{
name: "node with empty spec providerID is not ready and was shutdown, but exists",
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
},
Spec: v1.NodeSpec{
ProviderID: "",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionFalse,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
},
},
},
},
expectedNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
},
Spec: v1.NodeSpec{
ProviderID: "",
Taints: []v1.Taint{
*ShutdownTaint,
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionFalse,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
},
},
},
},
expectedDeleted: false,
fakeCloud: &fakecloud.Cloud{
NodeShutdown: true,
ExistsByProviderID: true,
ErrShutdownByProviderID: nil,
ExtID: map[types.NodeName]string{
types.NodeName("node0"): "foo://12345",
},
},
},
{
name: "node with non-existing providerID (missing in cloud provider) gets deleted",
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
},
Spec: v1.NodeSpec{
ProviderID: "",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionFalse,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
},
},
},
},
expectedDeleted: true,
fakeCloud: &fakecloud.Cloud{
ErrShutdownByProviderID: nil,
ExtID: map[types.NodeName]string{
types.NodeName("node0"): "",
},
},
},
{
name: "node with error when getting providerID does not have shutdown taint",
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
},
Spec: v1.NodeSpec{
ProviderID: "",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionFalse,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
},
},
},
},
expectedNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
},
Spec: v1.NodeSpec{
ProviderID: "",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionFalse,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
},
},
},
},
expectedDeleted: false,
fakeCloud: &fakecloud.Cloud{
ErrShutdownByProviderID: nil,
ExtIDErr: map[types.NodeName]error{
types.NodeName("node0"): errors.New("err!"),
},
},
},
{
name: "node with InstanceID returning InstanceNotFound gets deleted",
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.Local),
},
Spec: v1.NodeSpec{
ProviderID: "",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionFalse,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.Local),
},
},
},
},
expectedDeleted: true,
fakeCloud: &fakecloud.Cloud{
ErrShutdownByProviderID: nil,
ExtIDErr: map[types.NodeName]error{
types.NodeName("node0"): cloudprovider.InstanceNotFound,
},
},
},
{
name: "node is not ready, but there is error checking if node is shutdown",
existingNode: &v1.Node{
@@ -749,6 +910,157 @@ func Test_NodesShutdown(t *testing.T) {
}
}
func Test_GetProviderID(t *testing.T) {
testcases := []struct {
name string
fakeCloud *fakecloud.Cloud
existingNode *v1.Node
expectedProviderID string
expectedErr error
}{
{
name: "node initialized with provider ID",
fakeCloud: &fakecloud.Cloud{
EnableInstancesV2: false,
Err: nil,
},
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
},
Spec: v1.NodeSpec{
ProviderID: "fake://12345",
},
},
expectedProviderID: "fake://12345",
expectedErr: nil,
},
{
name: "node initialized with provider ID with InstancesV2",
fakeCloud: &fakecloud.Cloud{
EnableInstancesV2: true,
Err: nil,
},
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
},
Spec: v1.NodeSpec{
ProviderID: "fake://12345",
},
},
expectedProviderID: "fake://12345",
expectedErr: nil,
},
{
name: "cloud implemented with Instances",
fakeCloud: &fakecloud.Cloud{
EnableInstancesV2: false,
ExtID: map[types.NodeName]string{
types.NodeName("node0"): "12345",
},
Err: nil,
},
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
},
},
expectedProviderID: "fake://12345",
expectedErr: nil,
},
{
name: "cloud implemented with InstancesV2",
fakeCloud: &fakecloud.Cloud{
EnableInstancesV2: true,
ProviderID: map[types.NodeName]string{
types.NodeName("node0"): "fake://12345",
},
Err: nil,
},
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
},
},
expectedProviderID: "fake://12345",
expectedErr: nil,
},
{
name: "cloud implemented with InstancesV2 (without providerID)",
fakeCloud: &fakecloud.Cloud{
EnableInstancesV2: true,
Err: nil,
},
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
},
},
expectedProviderID: "",
expectedErr: nil,
},
{
name: "cloud implemented with Instances with instance missing",
fakeCloud: &fakecloud.Cloud{
EnableInstancesV2: false,
ExtIDErr: map[types.NodeName]error{
types.NodeName("node0"): cloudprovider.InstanceNotFound,
},
Err: nil,
},
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
},
},
expectedProviderID: "",
expectedErr: cloudprovider.InstanceNotFound,
},
{
name: "cloud implemented with Instances with unknown error",
fakeCloud: &fakecloud.Cloud{
EnableInstancesV2: false,
ExtIDErr: map[types.NodeName]error{
types.NodeName("node0"): errors.New("unknown error"),
},
Err: nil,
},
existingNode: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
},
},
expectedProviderID: "",
expectedErr: errors.New("failed to get instance ID from cloud provider: unknown error"),
},
}
for _, testcase := range testcases {
t.Run(testcase.name, func(t *testing.T) {
cloudNodeLifecycleController := &CloudNodeLifecycleController{
cloud: testcase.fakeCloud,
}
providerID, err := cloudNodeLifecycleController.getProviderID(context.TODO(), testcase.existingNode)
if err != nil && testcase.expectedErr == nil {
t.Fatalf("unexpected error: %v", err)
}
if err == nil && testcase.expectedErr != nil {
t.Fatalf("did not get expected error %q", testcase.expectedErr)
}
if err != nil && err.Error() != testcase.expectedErr.Error() {
t.Fatalf("expected error %q, got %q", testcase.expectedErr.Error(), err.Error())
}
if !cmp.Equal(providerID, testcase.expectedProviderID) {
t.Errorf("unexpected providerID %s", cmp.Diff(providerID, testcase.expectedProviderID))
}
})
}
}
func syncNodeStore(nodeinformer coreinformers.NodeInformer, f *fake.Clientset) error {
nodes, err := f.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
if err != nil {


@@ -312,6 +312,11 @@ func (f *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID strin
// InstanceShutdownByProviderID returns true if the instances is in safe state to detach volumes
func (f *Cloud) InstanceShutdownByProviderID(ctx context.Context, providerID string) (bool, error) {
f.addCall("instance-shutdown-by-provider-id")
if providerID == "" {
return false, fmt.Errorf("cannot shutdown instance with empty providerID")
}
return f.NodeShutdown, f.ErrShutdownByProviderID
}
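
With this guard, a regression back to passing an empty providerID now fails loudly in unit tests instead of silently returning a shutdown state. A rough sketch of how that surfaces (illustrative; not taken from the fake package's test file):

// Illustrative sketch: the fake cloud now rejects an empty providerID outright.
func TestFakeShutdownRejectsEmptyProviderID(t *testing.T) {
	cloud := &fakecloud.Cloud{NodeShutdown: true}
	if _, err := cloud.InstanceShutdownByProviderID(context.TODO(), ""); err == nil {
		t.Fatal("expected an error when providerID is empty")
	}
}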


@@ -21,7 +21,6 @@ package gce
import (
"context"
"fmt"
"testing"
"github.com/stretchr/testify/assert"
@@ -55,7 +54,7 @@ func TestInstanceExists(t *testing.T) {
name: "node not exist",
nodeName: "test-node-2",
exist: false,
expectedErr: fmt.Errorf("failed to get instance ID from cloud provider: instance not found"),
expectedErr: nil,
},
}