diff --git a/cluster/addons.go b/cluster/addons.go index 0a1015a2..e1e9ab3b 100644 --- a/cluster/addons.go +++ b/cluster/addons.go @@ -557,7 +557,6 @@ func (c *Cluster) StoreAddonConfigMap(ctx context.Context, addonYaml string, add timeout := make(chan bool, 1) go func() { for { - updated, err = k8s.UpdateConfigMap(kubeClient, []byte(addonYaml), addonName) if err != nil { time.Sleep(time.Second * 5) diff --git a/cluster/etcd.go b/cluster/etcd.go index 3c0a4bff..907bf56f 100644 --- a/cluster/etcd.go +++ b/cluster/etcd.go @@ -3,6 +3,7 @@ package cluster import ( "context" "fmt" + "strings" "github.com/sirupsen/logrus" @@ -17,16 +18,52 @@ import ( func (c *Cluster) SnapshotEtcd(ctx context.Context, snapshotName string) error { backupImage := c.getBackupImage() + containerTimeout := DefaultEtcdBackupConfigTimeout + if c.Services.Etcd.BackupConfig != nil && c.Services.Etcd.BackupConfig.Timeout > 0 { + containerTimeout = c.Services.Etcd.BackupConfig.Timeout + } + + // store first error message + var snapshotErr error + snapshotFailures := 0 + s3UploadFailures := 0 + for _, host := range c.EtcdHosts { - containerTimeout := DefaultEtcdBackupConfigTimeout - if c.Services.Etcd.BackupConfig != nil && c.Services.Etcd.BackupConfig.Timeout > 0 { - containerTimeout = c.Services.Etcd.BackupConfig.Timeout - } newCtx := context.WithValue(ctx, docker.WaitTimeoutContextKey, containerTimeout) if err := services.RunEtcdSnapshotSave(newCtx, host, c.PrivateRegistriesMap, backupImage, snapshotName, true, c.Services.Etcd, c.Version); err != nil { - return err + if strings.Contains(err.Error(), "failed to upload etcd snapshot file to s3 on host") { + s3UploadFailures++ + } else { + if snapshotErr == nil { + snapshotErr = err + } + snapshotFailures++ + } } } + + if snapshotFailures == len(c.EtcdHosts) { + log.Warnf(ctx, "[etcd] Failed to take snapshot on all etcd hosts: %s", snapshotErr) + return fmt.Errorf("[etcd] Failed to take snapshot on all etcd hosts: %s", snapshotErr) + } else if snapshotFailures > 0 { + log.Warnf(ctx, "[etcd] Failed to take snapshot on %s etcd hosts", snapshotFailures) + } else { + log.Infof(ctx, "[etcd] Finished saving snapshot [%s] on all etcd hosts", snapshotName) + } + + if c.Services.Etcd.BackupConfig.S3BackupConfig == nil { + return nil + } + + if s3UploadFailures >= len(c.EtcdHosts)-snapshotFailures { + log.Warnf(ctx, "[etcd] Failed to upload etcd snapshot file to s3 on all etcd hosts") + return fmt.Errorf("[etcd] Failed to upload etcd snapshot file to s3 on all etcd hosts") + } else if s3UploadFailures > 0 { + log.Warnf(ctx, "[etcd] Failed to upload etcd snapshot file to s3 on %s etcd hosts", s3UploadFailures) + } else { + log.Infof(ctx, "[etcd] Finished uploading etcd snapshot file to s3 on all etcd hosts") + } + return nil } @@ -112,13 +149,28 @@ func (c *Cluster) PrepareBackup(ctx context.Context, snapshotPath string) error var backupServer *hosts.Host backupImage := c.getBackupImage() var errors []error - if c.Services.Etcd.BackupConfig == nil || // legacy rke local backup - (c.Services.Etcd.BackupConfig != nil && c.Services.Etcd.BackupConfig.S3BackupConfig == nil) { // rancher local backup + // s3 backup case + if c.Services.Etcd.BackupConfig != nil && + c.Services.Etcd.BackupConfig.S3BackupConfig != nil { + log.Infof(ctx, "[etcd] etcd s3 backup configuration found, will use s3 as source") + downloadFailed := false + for _, host := range c.EtcdHosts { + if err := services.DownloadEtcdSnapshotFromS3(ctx, host, c.PrivateRegistriesMap, backupImage, snapshotPath, c.Services.Etcd, c.Version); err != nil { + log.Warnf(ctx, "failed to download snapshot [%s] from s3 on host [%s]: %v", snapshotPath, host.Address, err) + downloadFailed = true + break + } + } + backupReady = !downloadFailed + } + // legacy rke local backup or rancher local backup + if !backupReady { if c.Services.Etcd.BackupConfig == nil { log.Infof(ctx, "[etcd] No etcd snapshot configuration found, will use local as source") - } - if c.Services.Etcd.BackupConfig != nil && c.Services.Etcd.BackupConfig.S3BackupConfig == nil { + } else if c.Services.Etcd.BackupConfig.S3BackupConfig == nil { log.Infof(ctx, "[etcd] etcd snapshot configuration found and no s3 backup configuration found, will use local as source") + } else { + log.Warnf(ctx, "[etcd] etcd snapshot configuration found and s3 backup configuration failed, falling back to use local as source") } // stop etcd on all etcd nodes, we need this because we start the backup server on the same port for _, host := range c.EtcdHosts { @@ -159,17 +211,6 @@ func (c *Cluster) PrepareBackup(ctx context.Context, snapshotPath string) error backupReady = true } - // s3 backup case - if c.Services.Etcd.BackupConfig != nil && - c.Services.Etcd.BackupConfig.S3BackupConfig != nil { - log.Infof(ctx, "[etcd] etcd s3 backup configuration found, will use s3 as source") - for _, host := range c.EtcdHosts { - if err := services.DownloadEtcdSnapshotFromS3(ctx, host, c.PrivateRegistriesMap, backupImage, snapshotPath, c.Services.Etcd, c.Version); err != nil { - return err - } - } - backupReady = true - } if !backupReady { return fmt.Errorf("failed to prepare backup for restore") } diff --git a/docker/docker.go b/docker/docker.go index fa0489ad..1a4a11e9 100644 --- a/docker/docker.go +++ b/docker/docker.go @@ -124,7 +124,7 @@ func DoRunOnetimeContainer(ctx context.Context, dClient *client.Client, imageCfg } log.Infof(ctx, "Successfully started [%s] container on host [%s]", containerName, hostname) log.Infof(ctx, "Waiting for [%s] container to exit on host [%s]", containerName, hostname) - exitCode, err := WaitForContainer(ctx, dClient, hostname, containerName) + exitCode, err := WaitForContainer(ctx, dClient, hostname, containerName, true) if err != nil { return fmt.Errorf("Container [%s] did not complete in time on host [%s]", containerName, hostname) } @@ -164,11 +164,11 @@ func DoRollingUpdateContainer(ctx context.Context, dClient *client.Client, image return fmt.Errorf("[%s] Failed rolling update of container: docker client is nil for container [%s] on host [%s]", plane, containerName, hostname) } logrus.Debugf("[%s] Checking for deployed [%s]", plane, containerName) - isRunning, err := IsContainerRunning(ctx, dClient, hostname, containerName, false) + exists, err := DoesContainerExist(ctx, dClient, hostname, containerName, false) if err != nil { return err } - if !isRunning { + if !exists { logrus.Debugf("[%s] Container %s is not running on host [%s]", plane, containerName, hostname) return nil } @@ -217,30 +217,52 @@ func DoRemoveContainer(ctx context.Context, dClient *client.Client, containerNam return nil } -func IsContainerRunning(ctx context.Context, dClient *client.Client, hostname string, containerName string, all bool) (bool, error) { +func FindContainer(ctx context.Context, dClient *client.Client, hostname string, containerName string, all bool) (*types.Container, error) { if dClient == nil { - return false, fmt.Errorf("Failed to check if container is running: docker client is nil for container [%s] on host [%s]", containerName, hostname) + return nil, fmt.Errorf("Failed to find container: docker client is nil for container [%s] on host [%s]", containerName, hostname) } var containers []types.Container var err error for i := 1; i <= RetryCount; i++ { - logrus.Infof("Checking if container [%s] is running on host [%s], try #%d", containerName, hostname, i) + logrus.Infof("Finding container [%s] on host [%s], try #%d", containerName, hostname, i) containers, err = dClient.ContainerList(ctx, types.ContainerListOptions{All: all}) if err != nil { - logrus.Warnf("Error checking if container [%s] is running on host [%s]: %v", containerName, hostname, err) + logrus.Warnf("Error finding container [%s] exists on host [%s]: %v", containerName, hostname, err) continue } break } if err != nil { - return false, fmt.Errorf("Error checking if container [%s] is running on host [%s]: %v", containerName, hostname, err) + return nil, fmt.Errorf("Error checking if container [%s] exists on host [%s]: %v", containerName, hostname, err) } for _, container := range containers { if len(container.Names) != 0 && container.Names[0] == "/"+containerName { - return true, nil + return &container, nil } } - return false, nil + return nil, nil +} + +func DoesContainerExist(ctx context.Context, dClient *client.Client, hostname string, containerName string, all bool) (bool, error) { + if dClient == nil { + return false, fmt.Errorf("Failed to check if container exists: docker client is nil for container [%s] on host [%s]", containerName, hostname) + } + container, err := FindContainer(ctx, dClient, hostname, containerName, all) + if err != nil { + return false, fmt.Errorf("Error checking if container [%s] is running on host [%s]: %v", containerName, hostname, err) + } + return container != nil, nil +} + +func IsContainerRunning(ctx context.Context, dClient *client.Client, hostname string, containerName string, all bool) (bool, error) { + if dClient == nil { + return false, fmt.Errorf("Failed to check if container is running: docker client is nil for container [%s] on host [%s]", containerName, hostname) + } + container, err := FindContainer(ctx, dClient, hostname, containerName, all) + if err != nil { + return false, fmt.Errorf("Error checking if container [%s] is running on host [%s]: %v", containerName, hostname, err) + } + return container != nil && container.State == "running", nil } func localImageExists(ctx context.Context, dClient *client.Client, hostname string, containerImage string) error { @@ -476,7 +498,7 @@ func StopRenameContainer(ctx context.Context, dClient *client.Client, hostname s return fmt.Errorf("Failed to stop and rename container: docker client is nil for container [%s] on host [%s]", oldContainerName, hostname) } // make sure we don't have an old old-container from a previous broken update - exists, err := IsContainerRunning(ctx, dClient, hostname, newContainerName, true) + exists, err := DoesContainerExist(ctx, dClient, hostname, newContainerName, true) if err != nil { return err } @@ -488,14 +510,14 @@ func StopRenameContainer(ctx context.Context, dClient *client.Client, hostname s if err := StopContainer(ctx, dClient, hostname, oldContainerName); err != nil { return err } - if _, err := WaitForContainer(ctx, dClient, hostname, oldContainerName); err != nil { + if _, err := WaitForContainer(ctx, dClient, hostname, oldContainerName, true); err != nil { return err } return RenameContainer(ctx, dClient, hostname, oldContainerName, newContainerName) } -func WaitForContainer(ctx context.Context, dClient *client.Client, hostname string, containerName string) (int64, error) { +func WaitForContainer(ctx context.Context, dClient *client.Client, hostname string, containerName string, noisy bool) (int64, error) { if dClient == nil { return 1, fmt.Errorf("Failed waiting for container: docker client is nil for container [%s] on host [%s]", containerName, hostname) } @@ -504,8 +526,9 @@ func WaitForContainer(ctx context.Context, dClient *client.Client, hostname stri if v, ok := ctx.Value(WaitTimeoutContextKey).(int); ok && v > 0 { containerTimeout = v } + log.Infof(ctx, "Waiting for [%s] container to exit on host [%s]", containerName, hostname) + var lastStdout, lastStderr string for retries := 0; retries < containerTimeout; retries++ { - log.Infof(ctx, "Waiting for [%s] container to exit on host [%s]", containerName, hostname) container, err := InspectContainer(ctx, dClient, hostname, containerName) if err != nil { return 1, fmt.Errorf("Could not inspect container [%s] on host [%s]: %s", containerName, hostname, err) @@ -515,8 +538,12 @@ func WaitForContainer(ctx context.Context, dClient *client.Client, hostname stri if err != nil { logrus.Warnf("Failed to get container logs from container [%s] on host [%s]: %v", containerName, hostname, err) } + if noisy || lastStdout != stdout || lastStderr != stderr { + log.Infof(ctx, "Container [%s] is still running on host [%s]: stderr: [%s], stdout: [%s]", containerName, hostname, stderr, stdout) + lastStdout = stdout + lastStderr = stderr + } - log.Infof(ctx, "Container [%s] is still running on host [%s]: stderr: [%s], stdout: [%s]", containerName, hostname, stderr, stdout) time.Sleep(1 * time.Second) continue } @@ -789,11 +816,11 @@ func DoRestartContainer(ctx context.Context, dClient *client.Client, containerNa return nil } -func GetContainerOutput(ctx context.Context, dClient *client.Client, containerName, hostname string) (int64, string, string, error) { +func GetContainerOutput(ctx context.Context, dClient *client.Client, containerName, hostname string, noisy bool) (int64, string, string, error) { if dClient == nil { return 1, "", "", fmt.Errorf("Failed to get container output: docker client is nil for container [%s] on host [%s]", containerName, hostname) } - status, err := WaitForContainer(ctx, dClient, hostname, containerName) + status, err := WaitForContainer(ctx, dClient, hostname, containerName, noisy) if err != nil { return 1, "", "", err } diff --git a/hosts/hosts.go b/hosts/hosts.go index 8401cd94..86dc1361 100644 --- a/hosts/hosts.go +++ b/hosts/hosts.go @@ -135,7 +135,7 @@ func (h *Host) CleanUp(ctx context.Context, toCleanPaths []string, cleanerImage return err } - if _, err := docker.WaitForContainer(ctx, h.DClient, h.Address, CleanerContainerName); err != nil { + if _, err := docker.WaitForContainer(ctx, h.DClient, h.Address, CleanerContainerName, true); err != nil { return err } diff --git a/pki/deploy.go b/pki/deploy.go index 2c07fb8b..c4879575 100644 --- a/pki/deploy.go +++ b/pki/deploy.go @@ -116,7 +116,7 @@ func DeployStateOnPlaneHost(ctx context.Context, host *hosts.Host, stateDownload logrus.Warnf("[state] Error during copying state file [%s] to node [%s]: %v", stateFilePath, host.Address, err) } - if _, err := docker.WaitForContainer(ctx, host.DClient, host.Address, StateDeployerContainerName); err != nil { + if _, err := docker.WaitForContainer(ctx, host.DClient, host.Address, StateDeployerContainerName, true); err != nil { return err } @@ -125,12 +125,11 @@ func DeployStateOnPlaneHost(ctx context.Context, host *hosts.Host, stateDownload func doRunDeployer(ctx context.Context, host *hosts.Host, containerEnv []string, certDownloaderImage string, prsMap map[string]v3.PrivateRegistry, k8sVersion string) error { // remove existing container. Only way it's still here is if previous deployment failed - isRunning := false - isRunning, err := docker.IsContainerRunning(ctx, host.DClient, host.Address, CrtDownloaderContainer, true) + exists, err := docker.DoesContainerExist(ctx, host.DClient, host.Address, CrtDownloaderContainer, true) if err != nil { return err } - if isRunning { + if exists { if err := docker.RemoveContainer(ctx, host.DClient, host.Address, CrtDownloaderContainer); err != nil { return err } @@ -188,7 +187,7 @@ func doRunDeployer(ctx context.Context, host *hosts.Host, containerEnv []string, } logrus.Debugf("[certificates] Successfully started Certificate deployer container: %s", CrtDownloaderContainer) for { - isDeployerRunning, err := docker.IsContainerRunning(ctx, host.DClient, host.Address, CrtDownloaderContainer, false) + isDeployerRunning, err := docker.DoesContainerExist(ctx, host.DClient, host.Address, CrtDownloaderContainer, false) if err != nil { return err } @@ -331,11 +330,11 @@ func FetchFileFromHost(ctx context.Context, filePath, image string, host *hosts. Binds: Binds, Privileged: true, } - isRunning, err := docker.IsContainerRunning(ctx, host.DClient, host.Address, containerName, true) + exists, err := docker.DoesContainerExist(ctx, host.DClient, host.Address, containerName, true) if err != nil { return "", err } - if !isRunning { + if !exists { if err := docker.DoRunContainer(ctx, host.DClient, imageCfg, hostCfg, containerName, host.Address, state, prsMap); err != nil { return "", err } diff --git a/pki/pki.go b/pki/pki.go index 4030c68e..e42d5ac1 100644 --- a/pki/pki.go +++ b/pki/pki.go @@ -131,7 +131,7 @@ func SaveBackupBundleOnHost(ctx context.Context, host *hosts.Host, alpineSystemI if err := docker.DoRunContainer(ctx, host.DClient, imageCfg, hostCfg, BundleCertContainer, host.Address, "certificates", prsMap); err != nil { return err } - status, err := docker.WaitForContainer(ctx, host.DClient, host.Address, BundleCertContainer) + status, err := docker.WaitForContainer(ctx, host.DClient, host.Address, BundleCertContainer, true) if err != nil { return err } diff --git a/services/etcd.go b/services/etcd.go index 8694329f..15224378 100644 --- a/services/etcd.go +++ b/services/etcd.go @@ -439,7 +439,6 @@ func RunEtcdSnapshotSave(ctx context.Context, etcdHost *hosts.Host, prsMap map[s if hosts.IsDockerSELinuxEnabled(etcdHost) { hostCfg.SecurityOpt = append(hostCfg.SecurityOpt, SELinuxLabel) } - } hostCfg.Binds = binds @@ -449,18 +448,30 @@ func RunEtcdSnapshotSave(ctx context.Context, etcdHost *hosts.Host, prsMap map[s if err := docker.DoRemoveContainer(ctx, etcdHost.DClient, EtcdSnapshotOnceContainerName, etcdHost.Address); err != nil { return err } + + // If the etcd container is not running the snapshot will never succeed + log.Debugf(ctx, "[etcd] Checking if etcd is running on host [%s]", etcdHost.Address) + if running, err := docker.IsContainerRunning(ctx, etcdHost.DClient, etcdHost.Address, "etcd", true); err != nil { + return err + } else if !running { + return fmt.Errorf("etcd is not running on host [%s]", etcdHost.Address) + } + if err := docker.DoRunContainer(ctx, etcdHost.DClient, imageCfg, hostCfg, EtcdSnapshotOnceContainerName, etcdHost.Address, ETCDRole, prsMap); err != nil { return err } - status, _, stderr, err := docker.GetContainerOutput(ctx, etcdHost.DClient, EtcdSnapshotOnceContainerName, etcdHost.Address) + status, _, stderr, err := docker.GetContainerOutput(ctx, etcdHost.DClient, EtcdSnapshotOnceContainerName, etcdHost.Address, false) if status != 0 || err != nil { if removeErr := docker.RemoveContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdSnapshotOnceContainerName); removeErr != nil { - log.Warnf(ctx, "[etcd] Failed to remove container [%s] on host [%s]: %v", removeErr, etcdHost.Address) + log.Warnf(ctx, "[etcd] Failed to remove container [%s] on host [%s]: %v", EtcdSnapshotOnceContainerName, removeErr, etcdHost.Address) } if err != nil { return err } - return fmt.Errorf("[etcd] Failed to take one-time snapshot on host [%s], exit code [%d]: %v", etcdHost.Address, status, stderr) + if strings.Contains(stderr, "failed to upload etcd snapshot file") { + return fmt.Errorf("failed to upload etcd snapshot file to s3 on host [%s], exit code [%d]: %v", etcdHost.Address, status, stderr) + } + return fmt.Errorf("failed to take one-time snapshot on host [%s], exit code [%d]: %v", etcdHost.Address, status, stderr) } return docker.RemoveContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdSnapshotOnceContainerName) @@ -618,7 +629,7 @@ func DownloadEtcdSnapshotFromS3(ctx context.Context, etcdHost *hosts.Host, prsMa return err } - status, _, stderr, err := docker.GetContainerOutput(ctx, etcdHost.DClient, EtcdDownloadBackupContainerName, etcdHost.Address) + status, _, stderr, err := docker.GetContainerOutput(ctx, etcdHost.DClient, EtcdDownloadBackupContainerName, etcdHost.Address, true) if status != 0 || err != nil { if removeErr := docker.RemoveContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdDownloadBackupContainerName); removeErr != nil { log.Warnf(ctx, "Failed to remove container [%s]: %v", removeErr) @@ -689,7 +700,7 @@ func RestoreEtcdSnapshot(ctx context.Context, etcdHost *hosts.Host, prsMap map[s if err := docker.DoRunContainer(ctx, etcdHost.DClient, imageCfg, hostCfg, EtcdRestoreContainerName, etcdHost.Address, ETCDRole, prsMap); err != nil { return err } - status, err := docker.WaitForContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdRestoreContainerName) + status, err := docker.WaitForContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdRestoreContainerName, false) if err != nil { return err } @@ -779,7 +790,7 @@ func RunEtcdSnapshotRemove(ctx context.Context, etcdHost *hosts.Host, prsMap map if err := docker.DoRunContainer(ctx, etcdHost.DClient, imageCfg, hostCfg, EtcdSnapshotRemoveContainerName, etcdHost.Address, ETCDRole, prsMap); err != nil { return err } - status, _, stderr, err := docker.GetContainerOutput(ctx, etcdHost.DClient, EtcdSnapshotRemoveContainerName, etcdHost.Address) + status, _, stderr, err := docker.GetContainerOutput(ctx, etcdHost.DClient, EtcdSnapshotRemoveContainerName, etcdHost.Address, true) if status != 0 || err != nil { if removeErr := docker.RemoveContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdSnapshotRemoveContainerName); removeErr != nil { log.Warnf(ctx, "Failed to remove container [%s]: %v", removeErr) @@ -831,7 +842,7 @@ func GetEtcdSnapshotChecksum(ctx context.Context, etcdHost *hosts.Host, prsMap m if err := docker.DoRunContainer(ctx, etcdHost.DClient, imageCfg, hostCfg, EtcdChecksumContainerName, etcdHost.Address, ETCDRole, prsMap); err != nil { return checksum, err } - if _, err := docker.WaitForContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdChecksumContainerName); err != nil { + if _, err := docker.WaitForContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdChecksumContainerName, true); err != nil { return checksum, err } stderr, checksum, err = docker.GetContainerLogsStdoutStderr(ctx, etcdHost.DClient, EtcdChecksumContainerName, "1", false) @@ -999,7 +1010,7 @@ func DownloadEtcdSnapshotFromBackupServer(ctx context.Context, etcdHost *hosts.H return err } - status, _, stderr, err := docker.GetContainerOutput(ctx, etcdHost.DClient, EtcdDownloadBackupContainerName, etcdHost.Address) + status, _, stderr, err := docker.GetContainerOutput(ctx, etcdHost.DClient, EtcdDownloadBackupContainerName, etcdHost.Address, true) if status != 0 || err != nil { if removeErr := docker.RemoveContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdDownloadBackupContainerName); removeErr != nil { log.Warnf(ctx, "Failed to remove container [%s]: %v", removeErr) diff --git a/services/services.go b/services/services.go index 493eaa97..491a0bb2 100644 --- a/services/services.go +++ b/services/services.go @@ -60,13 +60,13 @@ const ( type RestartFunc func(context.Context, *hosts.Host) error func runSidekick(ctx context.Context, host *hosts.Host, prsMap map[string]v3.PrivateRegistry, sidecarProcess v3.Process, k8sVersion string) error { - isRunning, err := docker.IsContainerRunning(ctx, host.DClient, host.Address, SidekickContainerName, true) + exists, err := docker.DoesContainerExist(ctx, host.DClient, host.Address, SidekickContainerName, true) if err != nil { return err } imageCfg, hostCfg, _ := GetProcessConfig(sidecarProcess, host, k8sVersion) isUpgradable := false - if isRunning { + if exists { isUpgradable, err = docker.IsContainerUpgradable(ctx, host.DClient, imageCfg, hostCfg, SidekickContainerName, host.Address, SidekickServiceName) if err != nil { return err