diff --git a/pkg/cloudprovider/cloud.go b/pkg/cloudprovider/cloud.go index 5c4651f7d3a..47e088a09e6 100644 --- a/pkg/cloudprovider/cloud.go +++ b/pkg/cloudprovider/cloud.go @@ -124,7 +124,7 @@ type Instances interface { // ProviderID is a unique identifier of the node. This will not be called // from the node whose nodeaddresses are being queried. i.e. local metadata // services cannot be used in this method to obtain nodeaddresses - NodeAddressesByProviderID(providerId string) ([]v1.NodeAddress, error) + NodeAddressesByProviderID(providerID string) ([]v1.NodeAddress, error) // ExternalID returns the cloud provider ID of the node with the specified NodeName. // Note that if the instance does not exist or is no longer running, we must return ("", cloudprovider.InstanceNotFound) ExternalID(nodeName types.NodeName) (string, error) @@ -140,6 +140,9 @@ type Instances interface { // CurrentNodeName returns the name of the node we are currently running on // On most clouds (e.g. GCE) this is the hostname, so we provide the hostname CurrentNodeName(hostname string) (types.NodeName, error) + // InstanceExistsByProviderID returns true if the instance for the given provider id still is running. + // If false is returned with no error, the instance will be immediately deleted. + InstanceExistsByProviderID(providerID string) (bool, error) } // Route is a representation of an advanced routing rule. diff --git a/pkg/cloudprovider/providers/aws/aws.go b/pkg/cloudprovider/providers/aws/aws.go index 217fbcad45f..18fbbc3b453 100644 --- a/pkg/cloudprovider/providers/aws/aws.go +++ b/pkg/cloudprovider/providers/aws/aws.go @@ -1101,6 +1101,11 @@ func (c *Cloud) ExternalID(nodeName types.NodeName) (string, error) { return orEmpty(instance.InstanceId), nil } +// InstanceExistsByProviderID returns true if the instance with the given provider id still exists and is running. +func (c *Cloud) InstanceExistsByProviderID(providerID string) (bool, error) { + return false, errors.New("unimplemented") +} + // InstanceID returns the cloud provider ID of the node with the specified nodeName. func (c *Cloud) InstanceID(nodeName types.NodeName) (string, error) { // In the future it is possible to also return an endpoint as: diff --git a/pkg/cloudprovider/providers/azure/azure_instances.go b/pkg/cloudprovider/providers/azure/azure_instances.go index b94b8225d64..e0fb6d0753f 100644 --- a/pkg/cloudprovider/providers/azure/azure_instances.go +++ b/pkg/cloudprovider/providers/azure/azure_instances.go @@ -17,6 +17,7 @@ limitations under the License. package azure import ( + "errors" "fmt" "k8s.io/api/core/v1" @@ -86,6 +87,11 @@ func (az *Cloud) ExternalID(name types.NodeName) (string, error) { return az.InstanceID(name) } +// InstanceExistsByProviderID returns true if the instance with the given provider id still exists and is running. +func (az *Cloud) InstanceExistsByProviderID(providerID string) (bool, error) { + return false, errors.New("unimplemented") +} + func (az *Cloud) isCurrentInstance(name types.NodeName) (bool, error) { nodeName := mapNodeNameToVMName(name) metadataName, err := az.metadata.Text("instance/compute/name") diff --git a/pkg/cloudprovider/providers/fake/fake.go b/pkg/cloudprovider/providers/fake/fake.go index 05c10c146cc..633dbba8ae7 100644 --- a/pkg/cloudprovider/providers/fake/fake.go +++ b/pkg/cloudprovider/providers/fake/fake.go @@ -47,8 +47,12 @@ type FakeUpdateBalancerCall struct { // FakeCloud is a test-double implementation of Interface, LoadBalancer, Instances, and Routes. It is useful for testing. type FakeCloud struct { - Exists bool - Err error + Exists bool + Err error + + ExistsByProviderID bool + ErrByProviderID error + Calls []string Addresses []v1.NodeAddress ExtID map[types.NodeName]string @@ -234,6 +238,12 @@ func (f *FakeCloud) InstanceTypeByProviderID(providerID string) (string, error) return f.InstanceTypes[types.NodeName(providerID)], nil } +// InstanceExistsByProviderID returns true if the instance with the given provider id still exists and is running. +func (f *FakeCloud) InstanceExistsByProviderID(providerID string) (bool, error) { + f.addCall("instance-exists-by-provider-id") + return f.ExistsByProviderID, f.ErrByProviderID +} + // List is a test-spy implementation of Instances.List. // It adds an entry "list" into the internal method call record. func (f *FakeCloud) List(filter string) ([]types.NodeName, error) { diff --git a/pkg/cloudprovider/providers/gce/gce_instances.go b/pkg/cloudprovider/providers/gce/gce_instances.go index 4c1c4af6c96..25f3728bef2 100644 --- a/pkg/cloudprovider/providers/gce/gce_instances.go +++ b/pkg/cloudprovider/providers/gce/gce_instances.go @@ -17,6 +17,7 @@ limitations under the License. package gce import ( + "errors" "fmt" "net/http" "strconv" @@ -151,6 +152,11 @@ func (gce *GCECloud) ExternalID(nodeName types.NodeName) (string, error) { return strconv.FormatUint(inst.ID, 10), nil } +// InstanceExistsByProviderID returns true if the instance with the given provider id still exists and is running. +func (gce *GCECloud) InstanceExistsByProviderID(providerID string) (bool, error) { + return false, errors.New("unimplemented") +} + // InstanceID returns the cloud provider ID of the node with the specified NodeName. func (gce *GCECloud) InstanceID(nodeName types.NodeName) (string, error) { instanceName := mapNodeNameToInstanceName(nodeName) diff --git a/pkg/cloudprovider/providers/openstack/openstack_instances.go b/pkg/cloudprovider/providers/openstack/openstack_instances.go index 2ae872cc999..d8f96061f75 100644 --- a/pkg/cloudprovider/providers/openstack/openstack_instances.go +++ b/pkg/cloudprovider/providers/openstack/openstack_instances.go @@ -110,6 +110,11 @@ func (i *Instances) ExternalID(name types.NodeName) (string, error) { return srv.ID, nil } +// InstanceExistsByProviderID returns true if the instance with the given provider id still exists and is running. +func (i *Instances) InstanceExistsByProviderID(providerID string) (bool, error) { + return false, errors.New("unimplemented") +} + // InstanceID returns the kubelet's cloud provider ID. func (os *OpenStack) InstanceID() (string, error) { if len(os.localInstanceID) == 0 { diff --git a/pkg/cloudprovider/providers/ovirt/ovirt.go b/pkg/cloudprovider/providers/ovirt/ovirt.go index fe71113ed0a..38c22e0f1b0 100644 --- a/pkg/cloudprovider/providers/ovirt/ovirt.go +++ b/pkg/cloudprovider/providers/ovirt/ovirt.go @@ -211,6 +211,11 @@ func (v *OVirtCloud) ExternalID(nodeName types.NodeName) (string, error) { return instance.UUID, nil } +// InstanceExistsByProviderID returns true if the instance with the given provider id still exists and is running. +func (v *OVirtCloud) InstanceExistsByProviderID(providerID string) (bool, error) { + return false, errors.New("unimplemented") +} + // InstanceID returns the cloud provider ID of the node with the specified NodeName. func (v *OVirtCloud) InstanceID(nodeName types.NodeName) (string, error) { name := mapNodeNameToInstanceName(nodeName) diff --git a/pkg/cloudprovider/providers/photon/photon.go b/pkg/cloudprovider/providers/photon/photon.go index e9c9fbc87a7..37466bd3468 100644 --- a/pkg/cloudprovider/providers/photon/photon.go +++ b/pkg/cloudprovider/providers/photon/photon.go @@ -470,6 +470,11 @@ func (pc *PCCloud) ExternalID(nodeName k8stypes.NodeName) (string, error) { } } +// InstanceExistsByProviderID returns true if the instance with the given provider id still exists and is running. +func (pc *PCCloud) InstanceExistsByProviderID(providerID string) (bool, error) { + return false, errors.New("unimplemented") +} + // InstanceID returns the cloud provider ID of the specified instance. func (pc *PCCloud) InstanceID(nodeName k8stypes.NodeName) (string, error) { name := string(nodeName) diff --git a/pkg/cloudprovider/providers/rackspace/rackspace.go b/pkg/cloudprovider/providers/rackspace/rackspace.go index b087bfa4cea..61127aea0e2 100644 --- a/pkg/cloudprovider/providers/rackspace/rackspace.go +++ b/pkg/cloudprovider/providers/rackspace/rackspace.go @@ -436,6 +436,11 @@ func (i *Instances) ExternalID(nodeName types.NodeName) (string, error) { return probeInstanceID(i.compute, serverName) } +// InstanceExistsByProviderID returns true if the instance with the given provider id still exists and is running. +func (i *Instances) InstanceExistsByProviderID(providerID string) (bool, error) { + return false, errors.New("unimplemented") +} + // InstanceID returns the cloud provider ID of the kubelet's instance. func (rs *Rackspace) InstanceID() (string, error) { return readInstanceID() diff --git a/pkg/cloudprovider/providers/vsphere/vsphere.go b/pkg/cloudprovider/providers/vsphere/vsphere.go index 2105edcd6e9..58e1a3020ec 100644 --- a/pkg/cloudprovider/providers/vsphere/vsphere.go +++ b/pkg/cloudprovider/providers/vsphere/vsphere.go @@ -376,6 +376,11 @@ func (vs *VSphere) ExternalID(nodeName k8stypes.NodeName) (string, error) { return vs.InstanceID(nodeName) } +// InstanceExistsByProviderID returns true if the instance with the given provider id still exists and is running. +func (vs *VSphere) InstanceExistsByProviderID(providerID string) (bool, error) { + return false, errors.New("unimplemented") +} + // InstanceID returns the cloud provider ID of the node with the specified Name. func (vs *VSphere) InstanceID(nodeName k8stypes.NodeName) (string, error) { if vs.localInstanceID == nodeNameToVMName(nodeName) { diff --git a/pkg/controller/cloud/node_controller.go b/pkg/controller/cloud/node_controller.go index bb470d41bc4..c5557832d0f 100644 --- a/pkg/controller/cloud/node_controller.go +++ b/pkg/controller/cloud/node_controller.go @@ -237,26 +237,36 @@ func (cnc *CloudNodeController) MonitorNode() { if currentReadyCondition.Status != v1.ConditionTrue { // Check with the cloud provider to see if the node still exists. If it // doesn't, delete the node immediately. - if _, err := instances.ExternalID(types.NodeName(node.Name)); err != nil { - if err == cloudprovider.InstanceNotFound { - glog.V(2).Infof("Deleting node no longer present in cloud provider: %s", node.Name) - ref := &v1.ObjectReference{ - Kind: "Node", - Name: node.Name, - UID: types.UID(node.UID), - Namespace: "", - } - glog.V(2).Infof("Recording %s event message for node %s", "DeletingNode", node.Name) - cnc.recorder.Eventf(ref, v1.EventTypeNormal, fmt.Sprintf("Deleting Node %v because it's not present according to cloud provider", node.Name), "Node %s event: %s", node.Name, "DeletingNode") - go func(nodeName string) { - defer utilruntime.HandleCrash() - if err := cnc.kubeClient.CoreV1().Nodes().Delete(node.Name, nil); err != nil { - glog.Errorf("unable to delete node %q: %v", node.Name, err) - } - }(node.Name) - } - glog.Errorf("Error getting node data from cloud: %v", err) + exists, err := ensureNodeExistsByProviderIDOrName(instances, node) + if err != nil { + glog.Errorf("Error getting data for node %s from cloud: %v", node.Name, err) + continue } + + if exists { + // Continue checking the remaining nodes since the current one is fine. + continue + } + + glog.V(2).Infof("Deleting node since it is no longer present in cloud provider: %s", node.Name) + + ref := &v1.ObjectReference{ + Kind: "Node", + Name: node.Name, + UID: types.UID(node.UID), + Namespace: "", + } + glog.V(2).Infof("Recording %s event message for node %s", "DeletingNode", node.Name) + + cnc.recorder.Eventf(ref, v1.EventTypeNormal, fmt.Sprintf("Deleting Node %v because it's not present according to cloud provider", node.Name), "Node %s event: %s", node.Name, "DeletingNode") + + go func(nodeName string) { + defer utilruntime.HandleCrash() + if err := cnc.kubeClient.CoreV1().Nodes().Delete(nodeName, nil); err != nil { + glog.Errorf("unable to delete node %q: %v", nodeName, err) + } + }(node.Name) + } } } @@ -372,6 +382,25 @@ func excludeTaintFromList(taints []v1.Taint, toExclude v1.Taint) []v1.Taint { return newTaints } +// ensureNodeExistsByProviderIDOrName first checks if the instance exists by the provider id and then by node name +func ensureNodeExistsByProviderIDOrName(instances cloudprovider.Instances, node *v1.Node) (bool, error) { + exists, err := instances.InstanceExistsByProviderID(node.Spec.ProviderID) + if err != nil { + providerIDErr := err + _, err = instances.ExternalID(types.NodeName(node.Name)) + if err == nil { + return true, nil + } + if err == cloudprovider.InstanceNotFound { + return false, nil + } + + return false, fmt.Errorf("InstanceExistsByProviderID: Error fetching by providerID: %v Error fetching by NodeName: %v", providerIDErr, err) + } + + return exists, nil +} + func getNodeAddressesByProviderIDOrName(instances cloudprovider.Instances, node *v1.Node) ([]v1.NodeAddress, error) { nodeAddresses, err := instances.NodeAddressesByProviderID(node.Spec.ProviderID) if err != nil { diff --git a/pkg/controller/cloud/node_controller_test.go b/pkg/controller/cloud/node_controller_test.go index 033dbe90889..49e956e93fd 100644 --- a/pkg/controller/cloud/node_controller_test.go +++ b/pkg/controller/cloud/node_controller_test.go @@ -17,6 +17,8 @@ limitations under the License. package cloud import ( + "errors" + "reflect" "testing" "time" @@ -39,6 +41,119 @@ import ( "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm" ) +func TestEnsureNodeExistsByProviderIDOrNodeName(t *testing.T) { + + testCases := []struct { + testName string + node *v1.Node + expectedCalls []string + existsByNodeName bool + existsByProviderID bool + nodeNameErr error + providerIDErr error + }{ + { + testName: "node exists by provider id", + existsByProviderID: true, + providerIDErr: nil, + existsByNodeName: false, + nodeNameErr: errors.New("unimplemented"), + expectedCalls: []string{"instance-exists-by-provider-id"}, + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + }, + Spec: v1.NodeSpec{ + ProviderID: "node0", + }, + }, + }, + { + testName: "does not exist by provider id", + existsByProviderID: false, + providerIDErr: nil, + existsByNodeName: false, + nodeNameErr: errors.New("unimplemented"), + expectedCalls: []string{"instance-exists-by-provider-id"}, + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + }, + Spec: v1.NodeSpec{ + ProviderID: "node0", + }, + }, + }, + { + testName: "node exists by node name", + existsByProviderID: false, + providerIDErr: errors.New("unimplemented"), + existsByNodeName: true, + nodeNameErr: nil, + expectedCalls: []string{"instance-exists-by-provider-id", "external-id"}, + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + }, + Spec: v1.NodeSpec{ + ProviderID: "node0", + }, + }, + }, + { + testName: "does not exist by node name", + existsByProviderID: false, + providerIDErr: errors.New("unimplemented"), + existsByNodeName: false, + nodeNameErr: cloudprovider.InstanceNotFound, + expectedCalls: []string{"instance-exists-by-provider-id", "external-id"}, + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + }, + Spec: v1.NodeSpec{ + ProviderID: "node0", + }, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.testName, func(t *testing.T) { + fc := &fakecloud.FakeCloud{ + Exists: tc.existsByNodeName, + ExistsByProviderID: tc.existsByProviderID, + Err: tc.nodeNameErr, + ErrByProviderID: tc.providerIDErr, + } + + instances, _ := fc.Instances() + exists, err := ensureNodeExistsByProviderIDOrName(instances, tc.node) + if err != nil { + t.Fatal(err) + } + + if !reflect.DeepEqual(fc.Calls, tc.expectedCalls) { + t.Fatalf("expected cloud provider methods `%v` to be called but `%v` was called ", tc.expectedCalls, fc.Calls) + } + + if tc.existsByProviderID && tc.existsByProviderID != exists { + t.Fatalf("expected exist by provider id to be `%t` but got `%t`", tc.existsByProviderID, exists) + } + + if tc.existsByNodeName && tc.existsByNodeName != exists { + t.Fatalf("expected exist by node name to be `%t` but got `%t`", tc.existsByNodeName, exists) + } + + if !tc.existsByNodeName && !tc.existsByProviderID && exists { + t.Fatal("node is not supposed to exist") + } + + }) + } + +} + // This test checks that the node is deleted when kubelet stops reporting // and cloud provider says node is gone func TestNodeDeleted(t *testing.T) { @@ -105,9 +220,12 @@ func TestNodeDeleted(t *testing.T) { eventBroadcaster := record.NewBroadcaster() cloudNodeController := &CloudNodeController{ - kubeClient: fnh, - nodeInformer: factory.Core().V1().Nodes(), - cloud: &fakecloud.FakeCloud{Err: cloudprovider.InstanceNotFound}, + kubeClient: fnh, + nodeInformer: factory.Core().V1().Nodes(), + cloud: &fakecloud.FakeCloud{ + ExistsByProviderID: false, + Err: nil, + }, nodeMonitorPeriod: 1 * time.Second, recorder: eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "cloud-controller-manager"}), nodeStatusUpdateFrequency: 1 * time.Second, @@ -520,7 +638,8 @@ func TestNodeAddresses(t *testing.T) { FailureDomain: "us-west-1a", Region: "us-west", }, - Err: nil, + ExistsByProviderID: true, + Err: nil, } eventBroadcaster := record.NewBroadcaster() @@ -639,7 +758,8 @@ func TestNodeProvidedIPAddresses(t *testing.T) { FailureDomain: "us-west-1a", Region: "us-west", }, - Err: nil, + ExistsByProviderID: true, + Err: nil, } eventBroadcaster := record.NewBroadcaster() diff --git a/pkg/volume/cinder/attacher_test.go b/pkg/volume/cinder/attacher_test.go index e1818d58447..d6a4b5b06e6 100644 --- a/pkg/volume/cinder/attacher_test.go +++ b/pkg/volume/cinder/attacher_test.go @@ -650,6 +650,10 @@ func (instances *instances) InstanceTypeByProviderID(providerID string) (string, return "", errors.New("Not implemented") } +func (instances *instances) InstanceExistsByProviderID(providerID string) (bool, error) { + return false, errors.New("unimplemented") +} + func (instances *instances) List(filter string) ([]types.NodeName, error) { return []types.NodeName{}, errors.New("Not implemented") }