Merge pull request #56500 from sbezverk/kubeadm_etcd_fix_1

Automatic merge from submit-queue (batch tested with PRs 56497, 56500, 55018, 56544, 56425). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>.

kubeadm etcd modifying recovery steps

Closes #56499
```release-note
Modifying etcd recovery steps for the case of failed upgrade
```
This commit is contained in:
Kubernetes Submit Queue 2017-11-29 15:26:02 -08:00 committed by GitHub
commit b86569fe10
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 61 additions and 12 deletions

View File

@ -64,7 +64,7 @@ func NewCmdApply(parentFlags *cmdUpgradeFlags) *cobra.Command {
flags := &applyFlags{ flags := &applyFlags{
parent: parentFlags, parent: parentFlags,
imagePullTimeout: 15 * time.Minute, imagePullTimeout: 15 * time.Minute,
etcdUpgrade: false, etcdUpgrade: true,
} }
cmd := &cobra.Command{ cmd := &cobra.Command{

View File

@ -127,6 +127,12 @@ func (spm *KubeStaticPodPathManager) BackupEtcdDir() string {
} }
func upgradeComponent(component string, waiter apiclient.Waiter, pathMgr StaticPodPathManager, cfg *kubeadmapi.MasterConfiguration, beforePodHash string, recoverManifests map[string]string) error { func upgradeComponent(component string, waiter apiclient.Waiter, pathMgr StaticPodPathManager, cfg *kubeadmapi.MasterConfiguration, beforePodHash string, recoverManifests map[string]string) error {
// Special treatment is required for etcd case, when rollbackOldManifests should roll back etcd
// manifests only for the case when component is Etcd
recoverEtcd := false
if component == constants.Etcd {
recoverEtcd = true
}
// The old manifest is here; in the /etc/kubernetes/manifests/ // The old manifest is here; in the /etc/kubernetes/manifests/
currentManifestPath := pathMgr.RealManifestPath(component) currentManifestPath := pathMgr.RealManifestPath(component)
// The new, upgraded manifest will be written here // The new, upgraded manifest will be written here
@ -140,12 +146,12 @@ func upgradeComponent(component string, waiter apiclient.Waiter, pathMgr StaticP
// Move the old manifest into the old-manifests directory // Move the old manifest into the old-manifests directory
if err := pathMgr.MoveFile(currentManifestPath, backupManifestPath); err != nil { if err := pathMgr.MoveFile(currentManifestPath, backupManifestPath); err != nil {
return rollbackOldManifests(recoverManifests, err, pathMgr) return rollbackOldManifests(recoverManifests, err, pathMgr, recoverEtcd)
} }
// Move the new manifest into the manifests directory // Move the new manifest into the manifests directory
if err := pathMgr.MoveFile(newManifestPath, currentManifestPath); err != nil { if err := pathMgr.MoveFile(newManifestPath, currentManifestPath); err != nil {
return rollbackOldManifests(recoverManifests, err, pathMgr) return rollbackOldManifests(recoverManifests, err, pathMgr, recoverEtcd)
} }
fmt.Printf("[upgrade/staticpods] Moved upgraded manifest to %q and backed up old manifest to %q\n", currentManifestPath, backupManifestPath) fmt.Printf("[upgrade/staticpods] Moved upgraded manifest to %q and backed up old manifest to %q\n", currentManifestPath, backupManifestPath)
@ -156,12 +162,12 @@ func upgradeComponent(component string, waiter apiclient.Waiter, pathMgr StaticP
// If we don't do this, there is a case where we remove the Static Pod manifest, kubelet is slow to react, kubeadm checks the // If we don't do this, there is a case where we remove the Static Pod manifest, kubelet is slow to react, kubeadm checks the
// API endpoint below of the OLD Static Pod component and proceeds quickly enough, which might lead to unexpected results. // API endpoint below of the OLD Static Pod component and proceeds quickly enough, which might lead to unexpected results.
if err := waiter.WaitForStaticPodControlPlaneHashChange(cfg.NodeName, component, beforePodHash); err != nil { if err := waiter.WaitForStaticPodControlPlaneHashChange(cfg.NodeName, component, beforePodHash); err != nil {
return rollbackOldManifests(recoverManifests, err, pathMgr) return rollbackOldManifests(recoverManifests, err, pathMgr, recoverEtcd)
} }
// Wait for the static pod component to come up and register itself as a mirror pod // Wait for the static pod component to come up and register itself as a mirror pod
if err := waiter.WaitForPodsWithLabel("component=" + component); err != nil { if err := waiter.WaitForPodsWithLabel("component=" + component); err != nil {
return rollbackOldManifests(recoverManifests, err, pathMgr) return rollbackOldManifests(recoverManifests, err, pathMgr, recoverEtcd)
} }
fmt.Printf("[upgrade/staticpods] Component %q upgraded successfully!\n", component) fmt.Printf("[upgrade/staticpods] Component %q upgraded successfully!\n", component)
@ -212,20 +218,59 @@ func performEtcdStaticPodUpgrade(waiter apiclient.Waiter, pathMgr StaticPodPathM
return true, fmt.Errorf("fail to get etcd pod's hash: %v", err) return true, fmt.Errorf("fail to get etcd pod's hash: %v", err)
} }
// Write the updated etcd static Pod manifest into the temporary directory // Write the updated etcd static Pod manifest into the temporary directory, at this point no etcd change
// has occurred in any aspect.
if err := etcdphase.CreateLocalEtcdStaticPodManifestFile(pathMgr.TempManifestDir(), cfg); err != nil { if err := etcdphase.CreateLocalEtcdStaticPodManifestFile(pathMgr.TempManifestDir(), cfg); err != nil {
return true, rollbackEtcdData(cfg, fmt.Errorf("error creating local etcd static pod manifest file: %v", err), pathMgr) return true, fmt.Errorf("error creating local etcd static pod manifest file: %v", err)
} }
// Perform etcd upgrade using common to all control plane components function // Perform etcd upgrade using common to all control plane components function
if err := upgradeComponent(constants.Etcd, waiter, pathMgr, cfg, beforeEtcdPodHash, recoverManifests); err != nil { if err := upgradeComponent(constants.Etcd, waiter, pathMgr, cfg, beforeEtcdPodHash, recoverManifests); err != nil {
return true, rollbackEtcdData(cfg, err, pathMgr) // Since etcd upgrade component failed, the old manifest has been restored
// now we need to check the health of the etcd cluster to see if it came back up with the old manifest
if _, err := etcdCluster.GetEtcdClusterStatus(); err != nil {
// At this point we know that etcd cluster is dead and it is safe to copy backup datastore and to rollback old etcd manifest
if err := rollbackEtcdData(cfg, fmt.Errorf("etcd cluster is not healthy after upgrade: %v rolling back", err), pathMgr); err != nil {
// Even copying back datastore failed, no options for recovery left, bailing out
return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
}
// Old datastore has been copied, rolling back old manifests
if err := rollbackOldManifests(recoverManifests, err, pathMgr, true); err != nil {
// Rolling back to old manifests failed, no options for recovery left, bailing out
return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
}
// Since rollback of the old etcd manifest was successful, checking again the status of etcd cluster
if _, err := etcdCluster.GetEtcdClusterStatus(); err != nil {
// Nothing else left to try to recover etcd cluster
return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
}
return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, rolled the state back to pre-upgrade state", err)
}
// Since etcd cluster came back up with the old manifest
return true, fmt.Errorf("fatal error when trying to upgrade the etcd cluster: %v, rolled the state back to pre-upgrade state", err)
} }
// Checking health state of etcd after the upgrade // Checking health state of etcd after the upgrade
etcdStatus, err = etcdCluster.GetEtcdClusterStatus() if _, err = etcdCluster.GetEtcdClusterStatus(); err != nil {
if err != nil { // Despite the fact that upgradeComponent was successful, there is something wrong with etcd cluster
return true, rollbackEtcdData(cfg, fmt.Errorf("etcd cluster is not healthy after upgrade: %v rolling back", err), pathMgr) // First step is to restore back up of datastore
if err := rollbackEtcdData(cfg, fmt.Errorf("etcd cluster is not healthy after upgrade: %v rolling back", err), pathMgr); err != nil {
// Even copying back datastore failed, no options for recovery left, bailing out
return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
}
// Old datastore has been copied, rolling back old manifests
if err := rollbackOldManifests(recoverManifests, err, pathMgr, true); err != nil {
// Rolling back to old manifests failed, no options for recovery left, bailing out
return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
}
// Since rollback of the old etcd manifest was successful, checking again the status of etcd cluster
if _, err := etcdCluster.GetEtcdClusterStatus(); err != nil {
// Nothing else left to try to recover etcd cluster
return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
}
return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, rolled the state back to pre-upgrade state", err)
} }
return false, nil return false, nil
@ -276,9 +321,13 @@ func StaticPodControlPlane(waiter apiclient.Waiter, pathMgr StaticPodPathManager
} }
// rollbackOldManifests rolls back the backuped manifests if something went wrong // rollbackOldManifests rolls back the backuped manifests if something went wrong
func rollbackOldManifests(oldManifests map[string]string, origErr error, pathMgr StaticPodPathManager) error { func rollbackOldManifests(oldManifests map[string]string, origErr error, pathMgr StaticPodPathManager, restoreEtcd bool) error {
errs := []error{origErr} errs := []error{origErr}
for component, backupPath := range oldManifests { for component, backupPath := range oldManifests {
// Will restore etcd manifest only if it was explicitly requested by setting restoreEtcd to true
if component == constants.Etcd && !restoreEtcd {
continue
}
// Where we should put back the backed up manifest // Where we should put back the backed up manifest
realManifestPath := pathMgr.RealManifestPath(component) realManifestPath := pathMgr.RealManifestPath(component)