diff --git a/cluster/addons.go b/cluster/addons.go
index 7e3457c0..c6bcf5ce 100644
--- a/cluster/addons.go
+++ b/cluster/addons.go
@@ -492,7 +492,7 @@ func (c *Cluster) doAddonDeploy(ctx context.Context, addonYaml, resourceName str
 	if err != nil {
 		return &addonError{fmt.Sprintf("%v", err), isCritical}
 	}
-	node, err := k8s.GetNode(k8sClient, c.ControlPlaneHosts[0].HostnameOverride, c.CloudProvider.Name)
+	node, err := k8s.GetNode(k8sClient, c.ControlPlaneHosts[0].HostnameOverride, c.ControlPlaneHosts[0].InternalAddress, c.CloudProvider.Name)
 	if err != nil {
 		return &addonError{fmt.Sprintf("Failed to get Node [%s]: %v", c.ControlPlaneHosts[0].HostnameOverride, err), isCritical}
 	}
@@ -513,7 +513,7 @@ func (c *Cluster) doAddonDelete(ctx context.Context, resourceName string, isCrit
 	if err != nil {
 		return &addonError{fmt.Sprintf("%v", err), isCritical}
 	}
-	node, err := k8s.GetNode(k8sClient, c.ControlPlaneHosts[0].HostnameOverride, c.CloudProvider.Name)
+	node, err := k8s.GetNode(k8sClient, c.ControlPlaneHosts[0].HostnameOverride, c.ControlPlaneHosts[0].InternalAddress, c.CloudProvider.Name)
 	if err != nil {
 		return &addonError{fmt.Sprintf("Failed to get Node [%s]: %v", c.ControlPlaneHosts[0].HostnameOverride, err), isCritical}
 	}
diff --git a/cluster/cluster.go b/cluster/cluster.go
index 3b3f0efd..a0a683a9 100644
--- a/cluster/cluster.go
+++ b/cluster/cluster.go
@@ -1018,13 +1018,12 @@ func setNodeAnnotationsLabelsTaints(k8sClient *kubernetes.Clientset, host *hosts
 	node := &v1.Node{}
 	var err error
 	for retries := 0; retries <= 5; retries++ {
-		node, err = k8s.GetNode(k8sClient, host.HostnameOverride, cloudProviderName)
+		node, err = k8s.GetNode(k8sClient, host.HostnameOverride, host.InternalAddress, cloudProviderName)
 		if err != nil {
 			logrus.Debugf("[hosts] Can't find node by name [%s], error: %v", host.HostnameOverride, err)
 			time.Sleep(2 * time.Second)
 			continue
 		}
-
 		oldNode := node.DeepCopy()
 		k8s.SetNodeAddressesAnnotations(node, host.InternalAddress, host.Address)
 		k8s.SyncNodeLabels(node, host.ToAddLabels, host.ToDelLabels)
diff --git a/cluster/remove.go b/cluster/remove.go
index b8053839..a2e1dcaa 100644
--- a/cluster/remove.go
+++ b/cluster/remove.go
@@ -11,6 +11,7 @@ import (
 	v3 "github.com/rancher/rke/types"
 	"github.com/rancher/rke/util"
 	"golang.org/x/sync/errgroup"
+	v1 "k8s.io/api/core/v1"
 )
 
 func (c *Cluster) ClusterRemove(ctx context.Context) error {
@@ -92,7 +93,13 @@ func (c *Cluster) RemoveOldNodes(ctx context.Context) error {
 		host := &hosts.Host{}
 		host.HostnameOverride = node.Name
 		if !hosts.IsNodeInList(host, uniqueHosts) {
-			if err := k8s.DeleteNode(kubeClient, node.Name, c.CloudProvider.Name); err != nil {
+			nodeAddress := ""
+			for _, addr := range node.Status.Addresses {
+				if addr.Type == v1.NodeInternalIP {
+					nodeAddress = addr.Address
+				}
+			}
+			if err := k8s.DeleteNode(kubeClient, node.Name, nodeAddress, c.CloudProvider.Name); err != nil {
 				log.Warnf(ctx, "Failed to delete old node [%s] from kubernetes")
 			}
 		}
diff --git a/hosts/hosts.go b/hosts/hosts.go
index 721aae8c..682a34d6 100644
--- a/hosts/hosts.go
+++ b/hosts/hosts.go
@@ -197,7 +197,7 @@ func DeleteNode(ctx context.Context, toDeleteHost *Host, kubeClient *kubernetes.
 		return nil
 	}
 	log.Infof(ctx, "[hosts] Cordoning host [%s]", toDeleteHost.Address)
-	if _, err := k8s.GetNode(kubeClient, toDeleteHost.HostnameOverride, cloudProviderName); err != nil {
+	if _, err := k8s.GetNode(kubeClient, toDeleteHost.HostnameOverride, toDeleteHost.InternalAddress, cloudProviderName); err != nil {
 		if apierrors.IsNotFound(err) {
 			log.Warnf(ctx, "[hosts] Can't find node by name [%s]", toDeleteHost.Address)
 			return nil
@@ -205,11 +205,11 @@ func DeleteNode(ctx context.Context, toDeleteHost *Host, kubeClient *kubernetes.
 		return err
 	}
 
-	if err := k8s.CordonUncordon(kubeClient, toDeleteHost.HostnameOverride, cloudProviderName, true); err != nil {
+	if err := k8s.CordonUncordon(kubeClient, toDeleteHost.HostnameOverride, toDeleteHost.InternalAddress, cloudProviderName, true); err != nil {
 		return err
 	}
 	log.Infof(ctx, "[hosts] Deleting host [%s] from the cluster", toDeleteHost.Address)
-	if err := k8s.DeleteNode(kubeClient, toDeleteHost.HostnameOverride, cloudProviderName); err != nil {
+	if err := k8s.DeleteNode(kubeClient, toDeleteHost.HostnameOverride, toDeleteHost.InternalAddress, cloudProviderName); err != nil {
 		return err
 	}
 	log.Infof(ctx, "[hosts] Successfully deleted host [%s] from the cluster", toDeleteHost.Address)
diff --git a/k8s/node.go b/k8s/node.go
index c48fc24c..8f3dabb8 100644
--- a/k8s/node.go
+++ b/k8s/node.go
@@ -24,10 +24,10 @@ const (
 	RetryInterval = 5
 )
 
-func DeleteNode(k8sClient *kubernetes.Clientset, nodeName, cloudProviderName string) error {
+func DeleteNode(k8sClient *kubernetes.Clientset, nodeName string, nodeAddress string, cloudProviderName string) error {
 	// If cloud provider is configured, the node name can be set by the cloud provider, which can be different from the original node name
 	if cloudProviderName != "" {
-		node, err := GetNode(k8sClient, nodeName, cloudProviderName)
+		node, err := GetNode(k8sClient, nodeName, nodeAddress, cloudProviderName)
 		if err != nil {
 			return err
 		}
@@ -40,7 +40,7 @@ func GetNodeList(k8sClient *kubernetes.Clientset) (*v1.NodeList, error) {
 	return k8sClient.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
 }
 
-func GetNode(k8sClient *kubernetes.Clientset, nodeName, cloudProviderName string) (*v1.Node, error) {
+func GetNode(k8sClient *kubernetes.Clientset, nodeName, nodeAddress, cloudProviderName string) (*v1.Node, error) {
 	var listErr error
 	for retries := 0; retries < MaxRetries; retries++ {
 		logrus.Debugf("Checking node list for node [%v], try #%v", nodeName, retries+1)
@@ -57,9 +57,13 @@ func GetNode(k8sClient *kubernetes.Clientset, nodeName, cloudProviderName string
 				return &node, nil
 			}
 			if cloudProviderName == ExternalAWSCloudProviderName {
-				logrus.Debugf("Checking hostname address for node [%v], cloud provider: %v", nodeName, cloudProviderName)
+				if nodeAddress == "" {
+					return nil, fmt.Errorf("failed to find node [%v] with empty nodeAddress, cloud provider: %v", nodeName, cloudProviderName)
+				}
+				logrus.Debugf("Checking internal address for node [%v], cloud provider: %v", nodeName, cloudProviderName)
 				for _, addr := range node.Status.Addresses {
-					if addr.Type == v1.NodeHostName && strings.ToLower(node.Labels[HostnameLabel]) == addr.Address {
+					if addr.Type == v1.NodeInternalIP && nodeAddress == addr.Address {
+						logrus.Debugf("Found node [%s]: %v", nodeName, nodeAddress)
 						return &node, nil
 					}
 				}
@@ -73,10 +77,10 @@ func GetNode(k8sClient *kubernetes.Clientset, nodeName, cloudProviderName string
 	return nil, apierrors.NewNotFound(schema.GroupResource{}, nodeName)
 }
 
-func CordonUncordon(k8sClient *kubernetes.Clientset, nodeName string, cloudProviderName string, cordoned bool) error {
+func CordonUncordon(k8sClient *kubernetes.Clientset, nodeName string, nodeAddress string, cloudProviderName string, cordoned bool) error {
 	updated := false
 	for retries := 0; retries < MaxRetries; retries++ {
-		node, err := GetNode(k8sClient, nodeName, cloudProviderName)
+		node, err := GetNode(k8sClient, nodeName, nodeAddress, cloudProviderName)
 		if err != nil {
 			logrus.Debugf("Error getting node %s: %v", nodeName, err)
 			// no need to retry here since GetNode already retries
@@ -127,6 +131,7 @@ func SyncNodeLabels(node *v1.Node, toAddLabels, toDelLabels map[string]string) {
 			delete(node.Labels, key)
 		}
 	}
 
+	// ADD Labels
 	for key, value := range toAddLabels {
 		node.Labels[key] = value
diff --git a/services/controlplane.go b/services/controlplane.go
index 2bcca06d..711cfbbf 100644
--- a/services/controlplane.go
+++ b/services/controlplane.go
@@ -165,7 +165,7 @@ func processControlPlaneForUpgrade(ctx context.Context, kubeClient *kubernetes.C
 		}
 		if !controlPlaneUpgradable && !workerPlaneUpgradable {
 			log.Infof(ctx, "Upgrade not required for controlplane and worker components of host %v", runHost.HostnameOverride)
-			if err := k8s.CordonUncordon(kubeClient, runHost.HostnameOverride, cloudProviderName, false); err != nil {
+			if err := k8s.CordonUncordon(kubeClient, runHost.HostnameOverride, runHost.InternalAddress, cloudProviderName, false); err != nil {
 				// This node didn't undergo an upgrade, so RKE will only log any error after uncordoning it and won't count this in maxUnavailable
 				logrus.Errorf("[controlplane] Failed to uncordon node %v, error: %v", runHost.HostnameOverride, err)
 			}
@@ -237,7 +237,7 @@ func upgradeControlHost(ctx context.Context, kubeClient *kubernetes.Clientset, h
 	if err := CheckNodeReady(kubeClient, host, ControlRole, cloudProviderName); err != nil {
 		return err
 	}
-	return k8s.CordonUncordon(kubeClient, host.HostnameOverride, cloudProviderName, false)
+	return k8s.CordonUncordon(kubeClient, host.HostnameOverride, host.InternalAddress, cloudProviderName, false)
 }
 
 func RemoveControlPlane(ctx context.Context, controlHosts []*hosts.Host, force bool) error {
diff --git a/services/node_util.go b/services/node_util.go
index df26f45a..eb1ef6b1 100644
--- a/services/node_util.go
+++ b/services/node_util.go
@@ -20,7 +20,7 @@ import (
 func CheckNodeReady(kubeClient *kubernetes.Clientset, runHost *hosts.Host, component, cloudProviderName string) error {
 	for retries := 0; retries < k8s.MaxRetries; retries++ {
 		logrus.Infof("[%s] Now checking status of node %v, try #%v", component, runHost.HostnameOverride, retries+1)
-		k8sNode, err := k8s.GetNode(kubeClient, runHost.HostnameOverride, cloudProviderName)
+		k8sNode, err := k8s.GetNode(kubeClient, runHost.HostnameOverride, runHost.InternalAddress, cloudProviderName)
 		if err != nil {
 			return fmt.Errorf("[%s] Error getting node %v: %v", component, runHost.HostnameOverride, err)
 		}
@@ -35,7 +35,7 @@ func CheckNodeReady(kubeClient *kubernetes.Clientset, runHost *hosts.Host, compo
 
 func cordonAndDrainNode(kubeClient *kubernetes.Clientset, host *hosts.Host, drainNode bool, drainHelper drain.Helper, component, cloudProviderName string) error {
 	logrus.Debugf("[%s] Cordoning node %v", component, host.HostnameOverride)
-	if err := k8s.CordonUncordon(kubeClient, host.HostnameOverride, cloudProviderName, true); err != nil {
+	if err := k8s.CordonUncordon(kubeClient, host.HostnameOverride, host.InternalAddress, cloudProviderName, true); err != nil {
 		return err
 	}
 	if !drainNode {
diff --git a/services/workerplane.go b/services/workerplane.go
index a9e8e6b7..b4130e73 100644
--- a/services/workerplane.go
+++ b/services/workerplane.go
@@ -87,7 +87,7 @@ func UpgradeWorkerPlaneForWorkerAndEtcdNodes(ctx context.Context, kubeClient *ku
 
 func updateNewHostsList(kubeClient *kubernetes.Clientset, allHosts []*hosts.Host, newHosts map[string]bool, cloudProviderName string) {
 	for _, h := range allHosts {
-		_, err := k8s.GetNode(kubeClient, h.HostnameOverride, cloudProviderName)
+		_, err := k8s.GetNode(kubeClient, h.HostnameOverride, h.InternalAddress, cloudProviderName)
 		if err != nil && apierrors.IsNotFound(err) {
 			// this host could have been added to cluster state upon successful controlplane upgrade but isn't a node yet.
 			newHosts[h.HostnameOverride] = true
@@ -167,7 +167,7 @@ func processWorkerPlaneForUpgrade(ctx context.Context, kubeClient *kubernetes.Cl
 		}
 		if !upgradable {
 			logrus.Infof("[workerplane] Upgrade not required for worker components of host %v", runHost.HostnameOverride)
-			if err := k8s.CordonUncordon(kubeClient, runHost.HostnameOverride, cloudProviderName, false); err != nil {
+			if err := k8s.CordonUncordon(kubeClient, runHost.HostnameOverride, runHost.InternalAddress, cloudProviderName, false); err != nil {
 				// This node didn't undergo an upgrade, so RKE will only log any error after uncordoning it and won't count this in maxUnavailable
 				logrus.Errorf("[workerplane] Failed to uncordon node %v, error: %v", runHost.HostnameOverride, err)
 			}
@@ -211,7 +211,7 @@ func upgradeWorkerHost(ctx context.Context, kubeClient *kubernetes.Clientset, ru
 		return err
 	}
 	// uncordon node
-	return k8s.CordonUncordon(kubeClient, runHost.HostnameOverride, cloudProviderName, false)
+	return k8s.CordonUncordon(kubeClient, runHost.HostnameOverride, runHost.InternalAddress, cloudProviderName, false)
 }
 
 func doDeployWorkerPlaneHost(ctx context.Context, host *hosts.Host, localConnDialerFactory hosts.DialerFactory, prsMap map[string]v3.PrivateRegistry, processMap map[string]v3.Process, certMap map[string]pki.CertificatePKI, updateWorkersOnly bool, alpineImage, k8sVersion string) error {
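
Not part of the patch: a minimal standalone sketch of the lookup fallback this change threads through the call sites above. When the external AWS cloud provider renames a node, it can no longer be found by RKE's hostname override, so GetNode now also compares the host's internal address against each node's InternalIP address. The helper name findNodeByInternalIP and the plain node-slice input are hypothetical illustration only; the real GetNode also matches on the hostname label and retries against the API server.

package k8sutil

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// findNodeByInternalIP is a hypothetical helper mirroring the fallback added to
// k8s.GetNode: try a plain name match first, then, for cloud providers that
// rename nodes (external AWS), match the host's internal address against each
// node's InternalIP address.
func findNodeByInternalIP(nodes []v1.Node, nodeName, nodeAddress string) (*v1.Node, error) {
	for i := range nodes {
		node := &nodes[i]
		if node.Name == nodeName {
			return node, nil
		}
		if nodeAddress == "" {
			continue // no internal address to fall back on
		}
		for _, addr := range node.Status.Addresses {
			if addr.Type == v1.NodeInternalIP && addr.Address == nodeAddress {
				return node, nil
			}
		}
	}
	return nil, fmt.Errorf("node [%s] not found by name or internal address [%s]", nodeName, nodeAddress)
}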