diff --git a/cluster/etcd.go b/cluster/etcd.go
index a92ac3f9..e75cc059 100644
--- a/cluster/etcd.go
+++ b/cluster/etcd.go
@@ -3,9 +3,19 @@ package cluster
 import (
 	"context"
 	"fmt"
+	"strings"
 
+	"github.com/rancher/rke/docker"
+	"github.com/rancher/rke/hosts"
 	"github.com/rancher/rke/log"
 	"github.com/rancher/rke/services"
+	"github.com/rancher/rke/util"
+)
+
+const (
+	// SupportedSyncToolsVersion is the minimum rke-tools image version that
+	// supports automatic syncing of local snapshots between etcd hosts.
+	SupportedSyncToolsVersion = "0.1.22"
 )
 
 func (c *Cluster) SnapshotEtcd(ctx context.Context, snapshotName string) error {
@@ -17,20 +27,71 @@ func (c *Cluster) SnapshotEtcd(ctx context.Context, snapshotName string) error {
 	return nil
 }
 
-func (c *Cluster) RestoreEtcdSnapshot(ctx context.Context, snapshotPath string) error {
-	// get etcd snapshots from s3 if backup backend server is set
+// PrepareBackup makes the snapshot snapshotPath available on the etcd hosts
+// and verifies it before a restore.
+//
+// Local backups (no S3 config) with an rke-tools image that supports auto sync:
+// etcd is stopped on every etcd host, a backup server is started on the first
+// host where that succeeds, and the other etcd hosts download the snapshot from
+// it. S3 backups: every etcd host downloads the snapshot from S3. In all cases
+// an error is returned if etcdSnapshotChecksum reports a mismatch.
+func (c *Cluster) PrepareBackup(ctx context.Context, snapshotPath string) error {
+	// local backup case
+	var backupServer *hosts.Host
+	// stop etcd on all etcd nodes, we need this because we start the backup server on the same port
+	if !isAutoSyncSupported(c.SystemImages.Alpine) {
+		log.Warnf(ctx, "Auto local backup sync is not supported. Use `rancher/rke-tools:%s` or up", SupportedSyncToolsVersion)
+	} else if c.Services.Etcd.BackupConfig == nil || // legacy rke local backup
+		(c.Services.Etcd.BackupConfig != nil && c.Services.Etcd.BackupConfig.S3BackupConfig == nil) { // rancher local backup, no s3
+		for _, host := range c.EtcdHosts {
+			if err := docker.StopContainer(ctx, host.DClient, host.Address, services.EtcdContainerName); err != nil {
+				log.Warnf(ctx, "failed to stop etcd container on host [%s]: %v", host.Address, err)
+			}
+			if backupServer == nil { // start the download server, only one node should have it!
+				if err := services.StartBackupServer(ctx, host, c.PrivateRegistriesMap, c.SystemImages.Alpine, snapshotPath); err != nil {
+					log.Warnf(ctx, "failed to start backup server on host [%s]: %v", host.Address, err)
+					continue
+				}
+				backupServer = host
+			}
+		}
+		// Without a backup server there is nothing to download from, and the
+		// code below would dereference a nil host.
+		if backupServer == nil {
+			return fmt.Errorf("failed to start backup server on any etcd host")
+		}
+		// start downloading the snapshot
+		for _, host := range c.EtcdHosts {
+			if host.Address == backupServer.Address { // we skip the backup server, it already has the snapshot
+				continue
+			}
+			if err := services.DownloadEtcdSnapshotFromBackupServer(ctx, host, c.PrivateRegistriesMap, c.SystemImages.Alpine, snapshotPath, backupServer); err != nil {
+				return err
+			}
+		}
+		// all good, let's remove the backup server container
+		if err := docker.DoRemoveContainer(ctx, backupServer.DClient, services.EtcdServeBackupContainerName, backupServer.Address); err != nil {
+			return err
+		}
+	}
+
+	// s3 backup case
 	if c.Services.Etcd.BackupConfig != nil && c.Services.Etcd.BackupConfig.S3BackupConfig != nil {
 		for _, host := range c.EtcdHosts {
-			if err := services.DownloadEtcdSnapshot(ctx, host, c.PrivateRegistriesMap, c.SystemImages.Alpine, snapshotPath, c.Services.Etcd); err != nil {
+			if err := services.DownloadEtcdSnapshotFromS3(ctx, host, c.PrivateRegistriesMap, c.SystemImages.Alpine, snapshotPath, c.Services.Etcd); err != nil {
 				return err
 			}
 		}
 	}
 
+	// this applies to all cases!
 	if isEqual := c.etcdSnapshotChecksum(ctx, snapshotPath); !isEqual {
 		return fmt.Errorf("etcd snapshots are not consistent")
 	}
-
+	return nil
+}
+
+func (c *Cluster) RestoreEtcdSnapshot(ctx context.Context, snapshotPath string) error {
 	// Start restore process on all etcd hosts
 	initCluster := services.GetEtcdInitialCluster(c.EtcdHosts)
 	for _, host := range c.EtcdHosts {
@@ -60,3 +121,23 @@ func (c *Cluster) etcdSnapshotChecksum(ctx context.Context, snapshotPath string)
 	}
 	return true
 }
+
+// isAutoSyncSupported reports whether the tag of image (the text after the
+// last ":") parses as a semantic version that is not lower than
+// SupportedSyncToolsVersion. An image whose tag cannot be parsed is treated as
+// unsupported.
+func isAutoSyncSupported(image string) bool {
+	v := strings.Split(image, ":")
+	last := v[len(v)-1]
+
+	sv, err := util.StrToSemVer(last)
+	if err != nil {
+		return false
+	}
+
+	supported, err := util.StrToSemVer(SupportedSyncToolsVersion)
+	if err != nil {
+		return false
+	}
+	return !sv.LessThan(*supported)
+}
diff --git a/cmd/etcd.go b/cmd/etcd.go
index 5201bda9..40284689 100644
--- a/cmd/etcd.go
+++ b/cmd/etcd.go
@@ -130,7 +130,10 @@ func RestoreEtcdSnapshot(
 	if err := kubeCluster.TunnelHosts(ctx, flags); err != nil {
 		return err
 	}
-
+	// first download and check
+	if err := kubeCluster.PrepareBackup(ctx, snapshotName); err != nil {
+		return err
+	}
 	log.Infof(ctx, "Cleaning old kubernetes cluster")
 	if err := kubeCluster.CleanupNodes(ctx); err != nil {
 		return err
diff --git a/services/etcd.go b/services/etcd.go
index aefcb49b..099f12df 100644
--- a/services/etcd.go
+++ b/services/etcd.go
@@ -335,7 +335,8 @@ func RunEtcdSnapshotSave(ctx context.Context, etcdHost *hosts.Host, prsMap map[s
 	return nil
 }
 
-func DownloadEtcdSnapshot(ctx context.Context, etcdHost *hosts.Host, prsMap map[string]v3.PrivateRegistry, etcdSnapshotImage string, name string, es v3.ETCDService) error {
+func DownloadEtcdSnapshotFromS3(ctx context.Context, etcdHost *hosts.Host, prsMap map[string]v3.PrivateRegistry, etcdSnapshotImage string, name string, es v3.ETCDService) error {
+
 	log.Infof(ctx, "[etcd] Get snapshot [%s] on host [%s]", name, etcdHost.Address)
 	s3Backend := es.BackupConfig.S3BackupConfig
 	if len(s3Backend.Endpoint) == 0 || len(s3Backend.BucketName) == 0 {
@@ -347,6 +348,7 @@ func DownloadEtcdSnapshot(ctx context.Context, etcdHost *hosts.Host, prsMap map[
 			"etcd-backup",
 			"download",
 			"--name", name,
+			"--s3-backup=true",
 			"--s3-endpoint=" + s3Backend.Endpoint,
 			"--s3-accessKey=" + s3Backend.AccessKey,
 			"--s3-secretKey=" + s3Backend.SecretKey,
@@ -490,3 +492,76 @@ func configS3BackupImgCmd(ctx context.Context, imageCfg *container.Config, bc *v
 	imageCfg.Cmd = append(imageCfg.Cmd, cmd...)
 	return imageCfg
 }
+
+// StartBackupServer runs the rke-etcd-backup "serve" command for snapshot name
+// in the EtcdServeBackupContainerName container on etcdHost, with host
+// networking and the host's /etc/kubernetes directory mounted for the TLS files.
+func StartBackupServer(ctx context.Context, etcdHost *hosts.Host, prsMap map[string]v3.PrivateRegistry, etcdSnapshotImage string, name string) error {
+	log.Infof(ctx, "[etcd] starting backup server on host [%s]", etcdHost.Address)
+
+	imageCfg := &container.Config{
+		Cmd: []string{
+			"/opt/rke-tools/rke-etcd-backup",
+			"etcd-backup",
+			"serve",
+			"--name", name,
+			"--cacert", pki.GetCertPath(pki.CACertName),
+			"--cert", pki.GetCertPath(pki.KubeNodeCertName),
+			"--key", pki.GetKeyPath(pki.KubeNodeCertName),
+		},
+		Image: etcdSnapshotImage,
+	}
+	hostCfg := &container.HostConfig{
+		Binds: []string{
+			fmt.Sprintf("%s:/backup", EtcdSnapshotPath),
+			fmt.Sprintf("%s:/etc/kubernetes:z", path.Join(etcdHost.PrefixPath, "/etc/kubernetes"))},
+		NetworkMode:   container.NetworkMode("host"),
+		RestartPolicy: container.RestartPolicy{Name: "on-failure"},
+	}
+	return docker.DoRunContainer(ctx, etcdHost.DClient, imageCfg, hostCfg, EtcdServeBackupContainerName, etcdHost.Address, ETCDRole, prsMap)
+}
+
+// DownloadEtcdSnapshotFromBackupServer runs the rke-etcd-backup "download"
+// command on etcdHost against backupServer's address and then removes the
+// download container. It returns an error if the container cannot be run, if
+// collecting its result fails, or if it exits with a non-zero status.
+func DownloadEtcdSnapshotFromBackupServer(ctx context.Context, etcdHost *hosts.Host, prsMap map[string]v3.PrivateRegistry, etcdSnapshotImage, name string, backupServer *hosts.Host) error {
+	log.Infof(ctx, "[etcd] Get snapshot [%s] on host [%s]", name, etcdHost.Address)
+	imageCfg := &container.Config{
+		Cmd: []string{
+			"/opt/rke-tools/rke-etcd-backup",
+			"etcd-backup",
+			"download",
+			"--name", name,
+			"--local-endpoint", backupServer.Address,
+			"--cacert", pki.GetCertPath(pki.CACertName),
+			"--cert", pki.GetCertPath(pki.KubeNodeCertName),
+			"--key", pki.GetKeyPath(pki.KubeNodeCertName),
+		},
+		Image: etcdSnapshotImage,
+	}
+
+	hostCfg := &container.HostConfig{
+		Binds: []string{
+			fmt.Sprintf("%s:/backup", EtcdSnapshotPath),
+			fmt.Sprintf("%s:/etc/kubernetes:z", path.Join(etcdHost.PrefixPath, "/etc/kubernetes"))},
+		NetworkMode:   container.NetworkMode("host"),
+		RestartPolicy: container.RestartPolicy{Name: "on-failure"},
+	}
+
+	if err := docker.DoRunContainer(ctx, etcdHost.DClient, imageCfg, hostCfg, EtcdDownloadBackupContainerName, etcdHost.Address, ETCDRole, prsMap); err != nil {
+		return err
+	}
+
+	status, _, stderr, err := docker.GetContainerOutput(ctx, etcdHost.DClient, EtcdDownloadBackupContainerName, etcdHost.Address)
+	if status != 0 || err != nil {
+		if removeErr := docker.RemoveContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdDownloadBackupContainerName); removeErr != nil {
+			log.Warnf(ctx, "Failed to remove container [%s]: %v", EtcdDownloadBackupContainerName, removeErr)
+		}
+		if err != nil {
+			return err
+		}
+		return fmt.Errorf("Failed to download etcd snapshot from backup server [%s], exit code [%d]: %v", backupServer.Address, status, stderr)
+	}
+	return docker.RemoveContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdDownloadBackupContainerName)
+}
diff --git a/services/services.go b/services/services.go
index ff3e7fd9..fa2ccb6e 100644
--- a/services/services.go
+++ b/services/services.go
@@ -32,6 +32,7 @@ const (
 	EtcdSnapshotOnceContainerName   = "etcd-snapshot-once"
 	EtcdRestoreContainerName        = "etcd-restore"
 	EtcdDownloadBackupContainerName = "etcd-download-backup"
+	EtcdServeBackupContainerName    = "etcd-Serve-backup"
 	EtcdChecksumContainerName       = "etcd-checksum-checker"
 	NginxProxyContainerName         = "nginx-proxy"
 	SidekickContainerName           = "service-sidekick"