[kubeadm] Modify the kubeadm upgrade DAG for the TLS Upgrade

- Calculate `beforePodHashMap` before the etcd upgrade in anticipation of KubeAPIServer downtime
- Detect whether the pre-upgrade etcd static pod cluster reports `HasTLS()==false` to switch on the etcd TLS upgrade (sketched below)
- If this is a TLS upgrade:
  - Skip the L7 etcd health check (a waiter could be implemented for this)
  - Skip the data rollback on etcd upgrade failure, since there is no L7 check and the APIServer is already down, unable to serve new requests
  - On APIServer upgrade failure, also roll back the etcd manifest to maintain protocol compatibility

- Add logging
leigh schrandt 2018-04-16 08:32:41 -06:00
parent 4a37e05665
commit 8129480d44
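
For orientation, here is a minimal, hypothetical Go sketch of the flag derivation and per-component behavior described in the bullets above. It is not the committed code: `HasTLS()`, `constants.Etcd`, and `constants.KubeAPIServer` are taken from the diff below, while the `etcdClusterInfo` interface, `decideFor`, and the fake cluster values are illustrative stand-ins only.

```go
package main

import "fmt"

// Stand-in for the etcd cluster interface used in the diff; only HasTLS() is modeled here (assumption).
type etcdClusterInfo interface {
	HasTLS() (bool, error)
}

// fakeEtcd is a toy implementation used only to exercise the sketch.
type fakeEtcd struct{ tls bool }

func (f fakeEtcd) HasTLS() (bool, error) { return f.tls, nil }

// Hypothetical stand-ins for the constants package values referenced in the diff.
const (
	etcd          = "etcd"
	kubeAPIServer = "kube-apiserver"
)

// decision captures the two switches the TLS upgrade toggles per component:
// whether to wait for the kubelet to restart it, and whether a later failure
// should also restore the old etcd manifest.
type decision struct {
	waitForComponentRestart bool
	recoverEtcd             bool
}

// decideFor mirrors the branching described in the commit message.
func decideFor(component string, isTLSUpgrade bool) decision {
	d := decision{waitForComponentRestart: true}
	if !isTLSUpgrade {
		return d
	}
	switch component {
	case etcd:
		// The apiserver goes down during the etcd protocol switch, so the etcd
		// mirror-pod hash (served by the apiserver) cannot be observed; don't wait.
		d.waitForComponentRestart = false
	case kubeAPIServer:
		// The old apiserver config cannot talk to TLS-only etcd, so a failed
		// apiserver upgrade must also roll the etcd manifest back.
		d.recoverEtcd = true
	}
	return d
}

func main() {
	// The TLS upgrade is switched on only when the pre-upgrade etcd is not serving TLS.
	oldEtcd := fakeEtcd{tls: false}
	hasTLS, err := oldEtcd.HasTLS()
	if err != nil {
		panic(err)
	}
	isTLSUpgrade := !hasTLS

	for _, c := range []string{etcd, kubeAPIServer, "kube-scheduler"} {
		fmt.Printf("%-15s -> %+v\n", c, decideFor(c, isTLSUpgrade))
	}
}
```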


@@ -181,22 +181,45 @@ func upgradeComponent(component string, waiter apiclient.Waiter, pathMgr StaticP
     }
     fmt.Printf("[upgrade/staticpods] Moved new manifest to %q and backed up old manifest to %q\n", currentManifestPath, backupManifestPath)
-    fmt.Println("[upgrade/staticpods] Waiting for the kubelet to restart the component")
 
-    // Wait for the mirror Pod hash to change; otherwise we'll run into race conditions here when the kubelet hasn't had time to
-    // notice the removal of the Static Pod, leading to a false positive below where we check that the API endpoint is healthy
-    // If we don't do this, there is a case where we remove the Static Pod manifest, kubelet is slow to react, kubeadm checks the
-    // API endpoint below of the OLD Static Pod component and proceeds quickly enough, which might lead to unexpected results.
-    if err := waiter.WaitForStaticPodHashChange(cfg.NodeName, component, beforePodHash); err != nil {
-        return rollbackOldManifests(recoverManifests, err, pathMgr, recoverEtcd)
+    waitForComponentRestart := true
+    if isTLSUpgrade {
+        // We currently depend on getting the Etcd mirror Pod hash from the KubeAPIServer;
+        // Upgrading the Etcd protocol takes down the apiserver, so we can't verify component restarts if we restart Etcd independently.
+        // Skip waiting for Etcd to restart and immediately move on to updating the apiserver.
+        if component == constants.Etcd {
+            waitForComponentRestart = false
+        }
+        // Normally, if an Etcd upgrade is successful, but the apiserver upgrade fails, Etcd is not rolled back.
+        // In the case of a TLS upgrade, the old KubeAPIServer config is incompatible with the new Etcd confg, so we rollback Etcd
+        // if the APIServer upgrade fails.
+        if component == constants.KubeAPIServer {
+            recoverEtcd = true
+            fmt.Printf("[upgrade/staticpods] The %s manifest will be restored if component %q fails to upgrade\n", constants.Etcd, component)
+        }
     }
 
-    // Wait for the static pod component to come up and register itself as a mirror pod
-    if err := waiter.WaitForPodsWithLabel("component=" + component); err != nil {
-        return rollbackOldManifests(recoverManifests, err, pathMgr, recoverEtcd)
-    }
+    if waitForComponentRestart {
+        fmt.Println("[upgrade/staticpods] Waiting for the kubelet to restart the component")
 
-    fmt.Printf("[upgrade/staticpods] Component %q upgraded successfully!\n", component)
+        // Wait for the mirror Pod hash to change; otherwise we'll run into race conditions here when the kubelet hasn't had time to
+        // notice the removal of the Static Pod, leading to a false positive below where we check that the API endpoint is healthy
+        // If we don't do this, there is a case where we remove the Static Pod manifest, kubelet is slow to react, kubeadm checks the
+        // API endpoint below of the OLD Static Pod component and proceeds quickly enough, which might lead to unexpected results.
+        if err := waiter.WaitForStaticPodHashChange(cfg.NodeName, component, beforePodHash); err != nil {
+            return rollbackOldManifests(recoverManifests, err, pathMgr, recoverEtcd)
+        }
+
+        // Wait for the static pod component to come up and register itself as a mirror pod
+        if err := waiter.WaitForPodsWithLabel("component=" + component); err != nil {
+            return rollbackOldManifests(recoverManifests, err, pathMgr, recoverEtcd)
+        }
+
+        fmt.Printf("[upgrade/staticpods] Component %q upgraded successfully!\n", component)
+    } else {
+        fmt.Printf("[upgrade/staticpods] Not waiting for pod-hash change for component %q\n", component)
+    }
 
     return nil
 }
@@ -276,26 +299,30 @@ func performEtcdStaticPodUpgrade(waiter apiclient.Waiter, pathMgr StaticPodPathM
         return true, fmt.Errorf("fatal error when trying to upgrade the etcd cluster: %v, rolled the state back to pre-upgrade state", err)
     }
 
-    // Checking health state of etcd after the upgrade
-    if _, err = etcdCluster.GetEtcdClusterStatus(); err != nil {
-        // Despite the fact that upgradeComponent was successful, there is something wrong with etcd cluster
-        // First step is to restore back up of datastore
-        if err := rollbackEtcdData(cfg, fmt.Errorf("etcd cluster is not healthy after upgrade: %v rolling back", err), pathMgr); err != nil {
-            // Even copying back datastore failed, no options for recovery left, bailing out
-            return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
-        }
-        // Old datastore has been copied, rolling back old manifests
-        if err := rollbackOldManifests(recoverManifests, err, pathMgr, true); err != nil {
-            // Rolling back to old manifests failed, no options for recovery left, bailing out
-            return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
-        }
-        // Since rollback of the old etcd manifest was successful, checking again the status of etcd cluster
-        if _, err := etcdCluster.GetEtcdClusterStatus(); err != nil {
-            // Nothing else left to try to recover etcd cluster
-            return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
-        }
+    if isTLSUpgrade {
+        fmt.Printf("[upgrade/etcd] Skipping L7 health-check for %s (as well as data rollback on failure)\n", constants.Etcd)
+    } else {
+        // Checking health state of etcd after the upgrade
+        if _, err = newEtcdCluster.GetStatus(); err != nil {
+            // Despite the fact that upgradeComponent was successful, there is something wrong with etcd cluster
+            // First step is to restore back up of datastore
+            if err := rollbackEtcdData(cfg, fmt.Errorf("etcd cluster is not healthy after upgrade: %v rolling back", err), pathMgr); err != nil {
+                // Even copying back datastore failed, no options for recovery left, bailing out
+                return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
+            }
+            // Old datastore has been copied, rolling back old manifests
+            if err := rollbackOldManifests(recoverManifests, err, pathMgr, true); err != nil {
+                // Rolling back to old manifests failed, no options for recovery left, bailing out
+                return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
+            }
+            // Since rollback of the old etcd manifest was successful, checking again the status of etcd cluster
+            if _, err := oldEtcdCluster.GetStatus(); err != nil {
+                // Nothing else left to try to recover etcd cluster
+                return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
+            }
 
-        return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, rolled the state back to pre-upgrade state", err)
+            return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, rolled the state back to pre-upgrade state", err)
+        }
     }
 
     return false, nil
@@ -306,8 +333,24 @@ func StaticPodControlPlane(waiter apiclient.Waiter, pathMgr StaticPodPathManager
     recoverManifests := map[string]string{}
     var isTLSUpgrade bool
 
+    beforePodHashMap, err := waiter.WaitForStaticPodControlPlaneHashes(cfg.NodeName)
+    if err != nil {
+        return err
+    }
+
     // etcd upgrade is done prior to other control plane components
     if etcdUpgrade {
+        previousEtcdHasTLS, err := oldEtcdCluster.HasTLS()
+        if err != nil {
+            return fmt.Errorf("failed to determine if previous etcd was using TLS: %v", err)
+        }
+
+        // set the TLS upgrade flag for all components
+        isTLSUpgrade = !previousEtcdHasTLS
+        if isTLSUpgrade {
+            fmt.Printf("[upgrade/etcd] Upgrading to TLS for %s\n", constants.Etcd)
+        }
+
         // Perform etcd upgrade using common to all control plane components function
         fatal, err := performEtcdStaticPodUpgrade(waiter, pathMgr, cfg, recoverManifests, isTLSUpgrade, oldEtcdCluster, newEtcdCluster)
         if err != nil {
@@ -318,11 +361,6 @@ func StaticPodControlPlane(waiter apiclient.Waiter, pathMgr StaticPodPathManager
         }
     }
 
-    beforePodHashMap, err := waiter.WaitForStaticPodControlPlaneHashes(cfg.NodeName)
-    if err != nil {
-        return err
-    }
-
     // Write the updated static Pod manifests into the temporary directory
     fmt.Printf("[upgrade/staticpods] Writing new Static Pod manifests to %q\n", pathMgr.TempManifestDir())
     err = controlplanephase.CreateInitStaticPodManifestFiles(pathMgr.TempManifestDir(), cfg)