[kubeadm] Modify the kubeadm upgrade DAG for the TLS Upgrade

- Calculate `beforePodHashMap` before the etcd upgrade in anticipation of KubeAPIServer downtime
- Detect whether the pre-upgrade etcd static pod cluster reports `HasTLS()==false` to switch on the etcd TLS upgrade (sketched below)
- If this is a TLS upgrade:
  - Skip the L7 etcd health check (a waiter could be implemented for this)
  - Skip the data rollback on etcd upgrade failure, since there is no L7 check and the APIServer is already down, unable to serve new requests
  - On APIServer upgrade failure, also roll back the etcd manifest to maintain protocol compatibility

- Add logging
leigh schrandt 2018-04-16 08:32:41 -06:00
parent 4a37e05665
commit 8129480d44
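
For orientation, here is a minimal, hypothetical Go sketch of the flag derivation and per-component behavior described in the bullets above. It is not the committed code: `HasTLS()`, `constants.Etcd`, and `constants.KubeAPIServer` are taken from the diff below, while the `etcdClusterInfo` interface, `decideFor`, and the fake cluster values are illustrative stand-ins only.

```go
package main

import "fmt"

// Stand-in for the etcd cluster interface used in the diff; only HasTLS() is modeled here (assumption).
type etcdClusterInfo interface {
	HasTLS() (bool, error)
}

// fakeEtcd is a toy implementation used only to exercise the sketch.
type fakeEtcd struct{ tls bool }

func (f fakeEtcd) HasTLS() (bool, error) { return f.tls, nil }

// Hypothetical stand-ins for the constants package values referenced in the diff.
const (
	etcd          = "etcd"
	kubeAPIServer = "kube-apiserver"
)

// decision captures the two switches the TLS upgrade toggles per component:
// whether to wait for the kubelet to restart it, and whether a later failure
// should also restore the old etcd manifest.
type decision struct {
	waitForComponentRestart bool
	recoverEtcd             bool
}

// decideFor mirrors the branching described in the commit message.
func decideFor(component string, isTLSUpgrade bool) decision {
	d := decision{waitForComponentRestart: true}
	if !isTLSUpgrade {
		return d
	}
	switch component {
	case etcd:
		// The apiserver goes down during the etcd protocol switch, so the etcd
		// mirror-pod hash (served by the apiserver) cannot be observed; don't wait.
		d.waitForComponentRestart = false
	case kubeAPIServer:
		// The old apiserver config cannot talk to TLS-only etcd, so a failed
		// apiserver upgrade must also roll the etcd manifest back.
		d.recoverEtcd = true
	}
	return d
}

func main() {
	// The TLS upgrade is switched on only when the pre-upgrade etcd is not serving TLS.
	oldEtcd := fakeEtcd{tls: false}
	hasTLS, err := oldEtcd.HasTLS()
	if err != nil {
		panic(err)
	}
	isTLSUpgrade := !hasTLS

	for _, c := range []string{etcd, kubeAPIServer, "kube-scheduler"} {
		fmt.Printf("%-15s -> %+v\n", c, decideFor(c, isTLSUpgrade))
	}
}
```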


@@ -181,22 +181,45 @@ func upgradeComponent(component string, waiter apiclient.Waiter, pathMgr StaticP
     }
     fmt.Printf("[upgrade/staticpods] Moved new manifest to %q and backed up old manifest to %q\n", currentManifestPath, backupManifestPath)
-    fmt.Println("[upgrade/staticpods] Waiting for the kubelet to restart the component")
 
-    // Wait for the mirror Pod hash to change; otherwise we'll run into race conditions here when the kubelet hasn't had time to
-    // notice the removal of the Static Pod, leading to a false positive below where we check that the API endpoint is healthy
-    // If we don't do this, there is a case where we remove the Static Pod manifest, kubelet is slow to react, kubeadm checks the
-    // API endpoint below of the OLD Static Pod component and proceeds quickly enough, which might lead to unexpected results.
-    if err := waiter.WaitForStaticPodHashChange(cfg.NodeName, component, beforePodHash); err != nil {
-        return rollbackOldManifests(recoverManifests, err, pathMgr, recoverEtcd)
+    waitForComponentRestart := true
+    if isTLSUpgrade {
+        // We currently depend on getting the Etcd mirror Pod hash from the KubeAPIServer;
+        // Upgrading the Etcd protocol takes down the apiserver, so we can't verify component restarts if we restart Etcd independently.
+        // Skip waiting for Etcd to restart and immediately move on to updating the apiserver.
+        if component == constants.Etcd {
+            waitForComponentRestart = false
+        }
+        // Normally, if an Etcd upgrade is successful, but the apiserver upgrade fails, Etcd is not rolled back.
+        // In the case of a TLS upgrade, the old KubeAPIServer config is incompatible with the new Etcd confg, so we rollback Etcd
+        // if the APIServer upgrade fails.
+        if component == constants.KubeAPIServer {
+            recoverEtcd = true
+            fmt.Printf("[upgrade/staticpods] The %s manifest will be restored if component %q fails to upgrade\n", constants.Etcd, component)
+        }
     }
 
-    // Wait for the static pod component to come up and register itself as a mirror pod
-    if err := waiter.WaitForPodsWithLabel("component=" + component); err != nil {
-        return rollbackOldManifests(recoverManifests, err, pathMgr, recoverEtcd)
-    }
+    if waitForComponentRestart {
+        fmt.Println("[upgrade/staticpods] Waiting for the kubelet to restart the component")
 
-    fmt.Printf("[upgrade/staticpods] Component %q upgraded successfully!\n", component)
+        // Wait for the mirror Pod hash to change; otherwise we'll run into race conditions here when the kubelet hasn't had time to
+        // notice the removal of the Static Pod, leading to a false positive below where we check that the API endpoint is healthy
+        // If we don't do this, there is a case where we remove the Static Pod manifest, kubelet is slow to react, kubeadm checks the
+        // API endpoint below of the OLD Static Pod component and proceeds quickly enough, which might lead to unexpected results.
+        if err := waiter.WaitForStaticPodHashChange(cfg.NodeName, component, beforePodHash); err != nil {
+            return rollbackOldManifests(recoverManifests, err, pathMgr, recoverEtcd)
+        }
+
+        // Wait for the static pod component to come up and register itself as a mirror pod
+        if err := waiter.WaitForPodsWithLabel("component=" + component); err != nil {
+            return rollbackOldManifests(recoverManifests, err, pathMgr, recoverEtcd)
+        }
+
+        fmt.Printf("[upgrade/staticpods] Component %q upgraded successfully!\n", component)
+    } else {
+        fmt.Printf("[upgrade/staticpods] Not waiting for pod-hash change for component %q\n", component)
+    }
 
     return nil
 }
@@ -276,26 +299,30 @@ func performEtcdStaticPodUpgrade(waiter apiclient.Waiter, pathMgr StaticPodPathM
         return true, fmt.Errorf("fatal error when trying to upgrade the etcd cluster: %v, rolled the state back to pre-upgrade state", err)
     }
 
-    // Checking health state of etcd after the upgrade
-    if _, err = etcdCluster.GetEtcdClusterStatus(); err != nil {
-        // Despite the fact that upgradeComponent was successful, there is something wrong with etcd cluster
-        // First step is to restore back up of datastore
-        if err := rollbackEtcdData(cfg, fmt.Errorf("etcd cluster is not healthy after upgrade: %v rolling back", err), pathMgr); err != nil {
-            // Even copying back datastore failed, no options for recovery left, bailing out
-            return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
-        }
-        // Old datastore has been copied, rolling back old manifests
-        if err := rollbackOldManifests(recoverManifests, err, pathMgr, true); err != nil {
-            // Rolling back to old manifests failed, no options for recovery left, bailing out
-            return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
-        }
-        // Since rollback of the old etcd manifest was successful, checking again the status of etcd cluster
-        if _, err := etcdCluster.GetEtcdClusterStatus(); err != nil {
-            // Nothing else left to try to recover etcd cluster
-            return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
-        }
+    if isTLSUpgrade {
+        fmt.Printf("[upgrade/etcd] Skipping L7 health-check for %s (as well as data rollback on failure)\n", constants.Etcd)
+    } else {
+        // Checking health state of etcd after the upgrade
+        if _, err = newEtcdCluster.GetStatus(); err != nil {
+            // Despite the fact that upgradeComponent was successful, there is something wrong with etcd cluster
+            // First step is to restore back up of datastore
+            if err := rollbackEtcdData(cfg, fmt.Errorf("etcd cluster is not healthy after upgrade: %v rolling back", err), pathMgr); err != nil {
+                // Even copying back datastore failed, no options for recovery left, bailing out
+                return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
+            }
+            // Old datastore has been copied, rolling back old manifests
+            if err := rollbackOldManifests(recoverManifests, err, pathMgr, true); err != nil {
+                // Rolling back to old manifests failed, no options for recovery left, bailing out
+                return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
+            }
+            // Since rollback of the old etcd manifest was successful, checking again the status of etcd cluster
+            if _, err := oldEtcdCluster.GetStatus(); err != nil {
+                // Nothing else left to try to recover etcd cluster
+                return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
+            }
 
-        return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, rolled the state back to pre-upgrade state", err)
+            return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, rolled the state back to pre-upgrade state", err)
+        }
     }
 
     return false, nil
@@ -306,8 +333,24 @@ func StaticPodControlPlane(waiter apiclient.Waiter, pathMgr StaticPodPathManager
     recoverManifests := map[string]string{}
     var isTLSUpgrade bool
 
+    beforePodHashMap, err := waiter.WaitForStaticPodControlPlaneHashes(cfg.NodeName)
+    if err != nil {
+        return err
+    }
+
     // etcd upgrade is done prior to other control plane components
     if etcdUpgrade {
+        previousEtcdHasTLS, err := oldEtcdCluster.HasTLS()
+        if err != nil {
+            return fmt.Errorf("failed to determine if previous etcd was using TLS: %v", err)
+        }
+
+        // set the TLS upgrade flag for all components
+        isTLSUpgrade = !previousEtcdHasTLS
+        if isTLSUpgrade {
+            fmt.Printf("[upgrade/etcd] Upgrading to TLS for %s\n", constants.Etcd)
+        }
+
         // Perform etcd upgrade using common to all control plane components function
         fatal, err := performEtcdStaticPodUpgrade(waiter, pathMgr, cfg, recoverManifests, isTLSUpgrade, oldEtcdCluster, newEtcdCluster)
         if err != nil {
@@ -318,11 +361,6 @@ func StaticPodControlPlane(waiter apiclient.Waiter, pathMgr StaticPodPathManager
         }
     }
 
-    beforePodHashMap, err := waiter.WaitForStaticPodControlPlaneHashes(cfg.NodeName)
-    if err != nil {
-        return err
-    }
-
     // Write the updated static Pod manifests into the temporary directory
     fmt.Printf("[upgrade/staticpods] Writing new Static Pod manifests to %q\n", pathMgr.TempManifestDir())
     err = controlplanephase.CreateInitStaticPodManifestFiles(pathMgr.TempManifestDir(), cfg)