1
0
mirror of https://github.com/rancher/rke.git synced 2025-06-23 22:17:11 +00:00
rke/cluster/etcd.go

319 lines
11 KiB
Go

package cluster
import (
"context"
"fmt"
"strings"
"github.com/sirupsen/logrus"
"github.com/rancher/rke/docker"
"github.com/rancher/rke/hosts"
"github.com/rancher/rke/log"
"github.com/rancher/rke/pki"
"github.com/rancher/rke/services"
"github.com/rancher/rke/util"
"golang.org/x/sync/errgroup"
)
// MinEtcdVersionWithDistrolessImage is the etcd version threshold used by
// getRestoreImage: when the etcd system image is a
// "rancher/mirrored-coreos-etcd" image whose tag is at or above this version,
// the backup (rke-tools) image is used for restore instead of the etcd image.
// NOTE(review): presumably etcd images from this version on are distroless and
// cannot run the restore commands themselves - confirm.
const MinEtcdVersionWithDistrolessImage = "v3.5.7"
// SnapshotEtcd saves an etcd snapshot named snapshotName on every etcd host,
// one host after the other, and reports the aggregated outcome.
//
// The container wait timeout is taken from the etcd backup configuration when
// it is set to a positive value, otherwise DefaultEtcdBackupConfigTimeout is
// used; it is handed to the docker layer through the context.
//
// Per-host errors are split into two groups: errors whose message identifies
// a failed S3 upload, and all other errors, which count as a failed snapshot.
// An error is returned when the snapshot failed on every etcd host, or when
// S3 backup is configured and the upload failed on every host that did take a
// snapshot. Partial failures are only logged as warnings.
func (c *Cluster) SnapshotEtcd(ctx context.Context, snapshotName string) error {
	backupImage := c.getBackupImage()
	containerTimeout := DefaultEtcdBackupConfigTimeout
	if c.Services.Etcd.BackupConfig != nil && c.Services.Etcd.BackupConfig.Timeout > 0 {
		containerTimeout = c.Services.Etcd.BackupConfig.Timeout
	}
	// The timeout is the same for every host, so the derived context is built once.
	snapshotCtx := context.WithValue(ctx, docker.WaitTimeoutContextKey, containerTimeout)

	// snapshotErr keeps the first non-S3 error so it can be reported if every host fails.
	var snapshotErr error
	snapshotFailures := 0
	s3UploadFailures := 0
	for _, host := range c.EtcdHosts {
		err := services.RunEtcdSnapshotSave(snapshotCtx, host, c.PrivateRegistriesMap, backupImage, snapshotName, true, c.Services.Etcd, c.Version)
		if err == nil {
			continue
		}
		// S3 upload failures are recognised by their message and tracked separately:
		// the local snapshot on that host was still taken.
		if strings.Contains(err.Error(), "failed to upload etcd snapshot file to s3 on host") {
			s3UploadFailures++
			continue
		}
		if snapshotErr == nil {
			snapshotErr = err
		}
		snapshotFailures++
	}

	switch {
	case snapshotFailures == len(c.EtcdHosts):
		log.Warnf(ctx, "[etcd] Failed to take snapshot on all etcd hosts: %s", snapshotErr)
		return fmt.Errorf("[etcd] Failed to take snapshot on all etcd hosts: %s", snapshotErr)
	case snapshotFailures > 0:
		// The counter is an int, so it must be formatted with %d.
		log.Warnf(ctx, "[etcd] Failed to take snapshot on %d etcd hosts", snapshotFailures)
	default:
		log.Infof(ctx, "[etcd] Finished saving snapshot [%s] on all etcd hosts", snapshotName)
	}

	// Without an S3 configuration (including no backup configuration at all)
	// there is no upload result to report.
	if c.Services.Etcd.BackupConfig == nil || c.Services.Etcd.BackupConfig.S3BackupConfig == nil {
		return nil
	}

	switch {
	case s3UploadFailures >= len(c.EtcdHosts)-snapshotFailures:
		// Every host that produced a snapshot failed to upload it.
		log.Warnf(ctx, "[etcd] Failed to upload etcd snapshot file to s3 on all etcd hosts")
		return fmt.Errorf("[etcd] Failed to upload etcd snapshot file to s3 on all etcd hosts")
	case s3UploadFailures > 0:
		log.Warnf(ctx, "[etcd] Failed to upload etcd snapshot file to s3 on %d etcd hosts", s3UploadFailures)
	default:
		log.Infof(ctx, "[etcd] Finished uploading etcd snapshot file to s3 on all etcd hosts")
	}
	return nil
}
// DeployRestoreCerts deploys the certificates used during a restore to all
// etcd hosts. Only the CA and kube-node entries of clusterCerts are deployed.
//
// The hosts are placed on a queue that is drained by WorkerThreads goroutines;
// each goroutine collects the errors of the hosts it handled and returns them
// through util.ErrList. The result of waiting on the goroutine group is
// returned.
func (c *Cluster) DeployRestoreCerts(ctx context.Context, clusterCerts map[string]pki.CertificatePKI) error {
	// NOTE(review): this function previously built the map from a name list in
	// which pki.KubeNodeCertName appeared twice; the duplicate had no effect.
	// Confirm whether a third certificate was meant to be included.
	restoreCerts := make(map[string]pki.CertificatePKI, 2)
	restoreCerts[pki.CACertName] = clusterCerts[pki.CACertName]
	restoreCerts[pki.KubeNodeCertName] = clusterCerts[pki.KubeNodeCertName]

	hostsQueue := util.GetObjectQueue(c.EtcdHosts)

	// deployWorker drains the shared queue until it is exhausted.
	deployWorker := func() error {
		var failures []error
		for item := range hostsQueue {
			etcdHost := item.(*hosts.Host)
			// Extra environment is only computed for Windows hosts.
			var env []string
			if etcdHost.IsWindows() {
				env = c.getWindowsEnv(etcdHost)
			}
			deployErr := pki.DeployCertificatesOnPlaneHost(
				ctx,
				etcdHost,
				c.RancherKubernetesEngineConfig,
				restoreCerts,
				c.SystemImages.CertDownloader,
				c.PrivateRegistriesMap,
				false,
				env,
				c.Version)
			if deployErr != nil {
				failures = append(failures, deployErr)
			}
		}
		return util.ErrList(failures)
	}

	var workers errgroup.Group
	for started := 0; started < WorkerThreads; started++ {
		workers.Go(deployWorker)
	}
	return workers.Wait()
}
// DeployStateFile deploys the cluster state file at stateFilePath to all etcd
// hosts for the snapshot named snapshotName.
//
// A state file that cannot be checked or does not exist is not an error: a
// warning is logged and nil is returned, so the snapshot proceeds without it.
// Otherwise the hosts are placed on a queue drained by WorkerThreads
// goroutines, each returning the errors it collected through util.ErrList.
func (c *Cluster) DeployStateFile(ctx context.Context, stateFilePath, snapshotName string) error {
	found, statErr := util.IsFileExists(stateFilePath)
	switch {
	case statErr != nil:
		logrus.Warnf("Could not read cluster state file from [%s], error: [%v]. Snapshot will be created without cluster state file. You can retrieve the cluster state file using 'rke util get-state-file'", stateFilePath, statErr)
		return nil
	case !found:
		logrus.Warnf("Could not read cluster state file from [%s], file does not exist. Snapshot will be created without cluster state file. You can retrieve the cluster state file using 'rke util get-state-file'", stateFilePath)
		return nil
	}

	hostsQueue := util.GetObjectQueue(c.EtcdHosts)

	// deployWorker drains the shared queue until it is exhausted.
	deployWorker := func() error {
		var failures []error
		for item := range hostsQueue {
			etcdHost := item.(*hosts.Host)
			if deployErr := pki.DeployStateOnPlaneHost(ctx, etcdHost, c.SystemImages.CertDownloader, c.PrivateRegistriesMap, stateFilePath, snapshotName, c.Version); deployErr != nil {
				failures = append(failures, deployErr)
			}
		}
		return util.ErrList(failures)
	}

	var workers errgroup.Group
	for started := 0; started < WorkerThreads; started++ {
		workers.Go(deployWorker)
	}
	return workers.Wait()
}
// GetStateFileFromSnapshot asks the etcd hosts, in order, for the state file
// stored with the snapshot named snapshotName and returns the first non-empty
// result.
//
// A host that returns an error or an empty state file is logged and skipped.
// An error is returned only when no host yields a state file.
func (c *Cluster) GetStateFileFromSnapshot(ctx context.Context, snapshotName string) (string, error) {
	image := c.getBackupImage()
	for _, etcdHost := range c.EtcdHosts {
		contents, err := services.RunGetStateFileFromSnapshot(ctx, etcdHost, c.PrivateRegistriesMap, image, snapshotName, c.Services.Etcd, c.Version)
		if err == nil && contents != "" {
			return contents, nil
		}
		logrus.Infof("Could not extract state file from snapshot [%s] on host [%s]", snapshotName, etcdHost.Address)
	}
	return "", fmt.Errorf("Unable to find statefile in snapshot [%s]", snapshotName)
}
// PrepareBackup makes the snapshot at snapshotPath available on every etcd
// host ahead of a restore and then checks that all copies are identical.
//
// When an S3 backup configuration is present, each host downloads the
// snapshot from S3; the first failure abandons the S3 source and falls back
// to the local source.
//
// For the local source, etcd is stopped on every etcd host and a backup
// server is started on the first host where it can be started. All other
// hosts download the snapshot from that server, after which the server
// container is removed. If the backup server cannot be started on any host,
// etcd is started again on all hosts and an error is returned.
//
// An error is also returned when a download from the backup server fails,
// when the backup server container cannot be removed, or when the snapshot
// checksums differ between hosts.
func (c *Cluster) PrepareBackup(ctx context.Context, snapshotPath string) error {
	var backupReady bool
	backupImage := c.getBackupImage()

	// s3 backup case
	if c.Services.Etcd.BackupConfig != nil &&
		c.Services.Etcd.BackupConfig.S3BackupConfig != nil {
		log.Infof(ctx, "[etcd] etcd s3 backup configuration found, will use s3 as source")
		downloadFailed := false
		for _, host := range c.EtcdHosts {
			if err := services.DownloadEtcdSnapshotFromS3(ctx, host, c.PrivateRegistriesMap, backupImage, snapshotPath, c.Services.Etcd, c.Version); err != nil {
				log.Warnf(ctx, "failed to download snapshot [%s] from s3 on host [%s]: %v", snapshotPath, host.Address, err)
				downloadFailed = true
				break
			}
		}
		backupReady = !downloadFailed
	}

	// legacy rke local backup or rancher local backup
	if !backupReady {
		if c.Services.Etcd.BackupConfig == nil {
			log.Infof(ctx, "[etcd] No etcd snapshot configuration found, will use local as source")
		} else if c.Services.Etcd.BackupConfig.S3BackupConfig == nil {
			log.Infof(ctx, "[etcd] etcd snapshot configuration found and no s3 backup configuration found, will use local as source")
		} else {
			log.Warnf(ctx, "[etcd] etcd snapshot configuration found and s3 backup configuration failed, falling back to use local as source")
		}

		var backupServer *hosts.Host
		var startErrs []error
		// stop etcd on all etcd nodes, we need this because we start the backup server on the same port
		for _, host := range c.EtcdHosts {
			// A failed stop is only warned about; the loop keeps going so that
			// etcd is stopped on every remaining host as well.
			if err := docker.StopContainer(ctx, host.DClient, host.Address, services.EtcdContainerName); err != nil {
				log.Warnf(ctx, "failed to stop etcd container on host [%s]: %v", host.Address, err)
			}
			// start the download server, only one node should have it!
			if backupServer != nil {
				continue
			}
			if err := services.StartBackupServer(ctx, host, c.PrivateRegistriesMap, backupImage, snapshotPath, c.Version); err != nil {
				log.Warnf(ctx, "failed to start backup server on host [%s]: %v", host.Address, err)
				startErrs = append(startErrs, err)
				continue
			}
			backupServer = host
		}

		if backupServer == nil {
			// No host could run the backup server: bring etcd back up everywhere and give up.
			for _, host := range c.EtcdHosts {
				if err := docker.StartContainer(ctx, host.DClient, host.Address, services.EtcdContainerName); err != nil {
					log.Warnf(ctx, "failed to start etcd container on host [%s]: %v", host.Address, err)
				}
			}
			return fmt.Errorf("failed to start backup server on all etcd nodes: %v", startErrs)
		}

		// start downloading the snapshot
		for _, host := range c.EtcdHosts {
			if host.Address == backupServer.Address { // the backup server already has the snapshot
				continue
			}
			if err := services.DownloadEtcdSnapshotFromBackupServer(ctx, host, c.PrivateRegistriesMap, backupImage, snapshotPath, backupServer, c.Version); err != nil {
				return err
			}
		}

		// all good, let's remove the backup server container
		if err := docker.DoRemoveContainer(ctx, backupServer.DClient, services.EtcdServeBackupContainerName, backupServer.Address); err != nil {
			return err
		}
	}

	// this applies to all cases!
	if isEqual := c.etcdSnapshotChecksum(ctx, snapshotPath); !isEqual {
		return fmt.Errorf("etcd snapshots are not consistent")
	}
	return nil
}
// RestoreEtcdSnapshot restores the snapshot at snapshotPath on every etcd
// host, one after the other, and stops at the first host that fails.
//
// The container wait timeout comes from the etcd backup configuration when it
// is set to a positive value, otherwise DefaultEtcdBackupConfigTimeout is
// used; it is handed to the docker layer through the context.
func (c *Cluster) RestoreEtcdSnapshot(ctx context.Context, snapshotPath string) error {
	// Start restore process on all etcd hosts
	initialCluster := services.GetEtcdInitialCluster(c.EtcdHosts)
	backupImage := c.getBackupImage()
	restoreImage := c.getRestoreImage()

	waitTimeout := DefaultEtcdBackupConfigTimeout
	if cfg := c.Services.Etcd.BackupConfig; cfg != nil && cfg.Timeout > 0 {
		waitTimeout = cfg.Timeout
	}
	restoreCtx := context.WithValue(ctx, docker.WaitTimeoutContextKey, waitTimeout)

	for _, etcdHost := range c.EtcdHosts {
		err := services.RestoreEtcdSnapshot(restoreCtx, etcdHost, c.PrivateRegistriesMap, restoreImage, backupImage,
			snapshotPath, initialCluster, c.Services.Etcd, c.Version)
		if err != nil {
			return fmt.Errorf("[etcd] Failed to restore etcd snapshot: %v", err)
		}
	}
	return nil
}
// RemoveEtcdSnapshot removes the snapshot named snapshotName from the etcd
// hosts, in order. The first host that reports an error aborts the loop and
// that error is returned unchanged.
func (c *Cluster) RemoveEtcdSnapshot(ctx context.Context, snapshotName string) error {
	image := c.getBackupImage()
	for _, etcdHost := range c.EtcdHosts {
		removeErr := services.RunEtcdSnapshotRemove(ctx, etcdHost, c.PrivateRegistriesMap, image, snapshotName,
			false, c.Services.Etcd, c.Version)
		if removeErr != nil {
			return removeErr
		}
	}
	return nil
}
// etcdSnapshotChecksum reports whether the snapshot at snapshotPath has the
// same checksum on every etcd host.
//
// It returns false when there are no etcd hosts to check, when the checksum
// cannot be obtained from any one host, or when at least two hosts report
// different checksums. Each host's checksum is logged.
func (c *Cluster) etcdSnapshotChecksum(ctx context.Context, snapshotPath string) bool {
	log.Infof(ctx, "[etcd] Checking if all snapshots are identical")
	// Nothing can be verified without hosts; guarding here also prevents
	// indexing into an empty checksum list below.
	if len(c.EtcdHosts) == 0 {
		log.Warnf(ctx, "[etcd] No etcd hosts found, unable to verify snapshot checksums")
		return false
	}

	backupImage := c.getBackupImage()
	etcdChecksums := make([]string, 0, len(c.EtcdHosts))
	for _, etcdHost := range c.EtcdHosts {
		checksum, err := services.GetEtcdSnapshotChecksum(ctx, etcdHost, c.PrivateRegistriesMap, backupImage, snapshotPath, c.Version)
		if err != nil {
			// Surface the cause: the caller only sees a boolean.
			log.Warnf(ctx, "[etcd] Failed to get checksum of etcd snapshot on host [%s]: %v", etcdHost.Address, err)
			return false
		}
		etcdChecksums = append(etcdChecksums, checksum)
		log.Infof(ctx, "[etcd] Checksum of etcd snapshot on host [%s] is [%s]", etcdHost.Address, checksum)
	}

	// Every checksum must match the first one.
	reference := etcdChecksums[0]
	for _, checksum := range etcdChecksums[1:] {
		if checksum != reference {
			return false
		}
	}
	return true
}
// getBackupImage returns the image used for etcd snapshot operations, as
// resolved by util.GetDefaultRKETools from the configured Alpine system
// image. When resolution fails the error is logged and an empty string is
// returned.
func (c *Cluster) getBackupImage() string {
	image, err := util.GetDefaultRKETools(c.SystemImages.Alpine)
	if err == nil {
		logrus.Debugf("[etcd] Image used for etcd snapshot is: [%s]", image)
		return image
	}
	logrus.Errorf("[etcd] error getting backup image %v", err)
	return ""
}
// getRestoreImage selects the image used to restore an etcd snapshot.
//
// The configured etcd image is returned when it is not a
// "rancher/mirrored-coreos-etcd" image, or when its tag is older than
// MinEtcdVersionWithDistrolessImage. For tags at or above that version the
// backup image from getBackupImage is returned instead. If the tag cannot be
// extracted or either version cannot be parsed, the error is logged and an
// empty string is returned.
func (c *Cluster) getRestoreImage() string {
	etcdImage := c.SystemImages.Etcd

	// use etcd image for restore in case of custom system image
	if !strings.Contains(etcdImage, "rancher/mirrored-coreos-etcd") {
		return etcdImage
	}

	tag, tagErr := util.GetImageTagFromImage(etcdImage)
	if tagErr != nil {
		logrus.Errorf("[etcd] getRestoreImage: error extracting tag from etcd image: %v", tagErr)
		return ""
	}

	current, currentErr := util.StrToSemVer(tag)
	if currentErr != nil {
		logrus.Errorf("[etcd] getRestoreImage: error converting etcd image tag to semver: %v", currentErr)
		return ""
	}

	threshold, thresholdErr := util.StrToSemVer(MinEtcdVersionWithDistrolessImage)
	if thresholdErr != nil {
		logrus.Errorf("[etcd] getRestoreImage: error converting min distroless etcd image version to semver: %v", thresholdErr)
		return ""
	}

	if !current.LessThan(*threshold) {
		return c.getBackupImage()
	}
	return etcdImage
}