2018-05-09 17:39:19 +00:00
package cluster
import (
"context"
"fmt"
2022-06-23 22:30:58 +00:00
"strings"
2018-05-09 17:39:19 +00:00
2019-08-17 00:15:58 +00:00
"github.com/sirupsen/logrus"
2019-01-28 17:17:44 +00:00
"github.com/rancher/rke/docker"
"github.com/rancher/rke/hosts"
2018-11-24 10:18:24 +00:00
"github.com/rancher/rke/log"
2019-03-02 02:28:40 +00:00
"github.com/rancher/rke/pki"
2018-05-09 17:39:19 +00:00
"github.com/rancher/rke/services"
2019-01-28 17:17:44 +00:00
"github.com/rancher/rke/util"
2019-03-02 02:28:40 +00:00
"golang.org/x/sync/errgroup"
2019-01-28 17:17:44 +00:00
)
2023-09-25 15:32:14 +00:00
// MinEtcdVersionWithDistrolessImage is the etcd version threshold that
// getRestoreImage compares the configured etcd image tag against: tags below
// it restore with the etcd image itself, tags at or above it restore with the
// backup (rke-tools) image.
// NOTE(review): presumably etcd images from this version on are distroless and
// cannot run the restore tooling themselves; confirm against the image docs.
const MinEtcdVersionWithDistrolessImage = "v3.5.7"
2018-05-17 22:27:35 +00:00
func ( c * Cluster ) SnapshotEtcd ( ctx context . Context , snapshotName string ) error {
2019-02-15 20:40:36 +00:00
backupImage := c . getBackupImage ( )
2022-06-23 22:30:58 +00:00
containerTimeout := DefaultEtcdBackupConfigTimeout
if c . Services . Etcd . BackupConfig != nil && c . Services . Etcd . BackupConfig . Timeout > 0 {
containerTimeout = c . Services . Etcd . BackupConfig . Timeout
}
// store first error message
var snapshotErr error
snapshotFailures := 0
s3UploadFailures := 0
2018-05-09 17:39:19 +00:00
for _ , host := range c . EtcdHosts {
2021-01-22 17:35:13 +00:00
newCtx := context . WithValue ( ctx , docker . WaitTimeoutContextKey , containerTimeout )
2021-03-16 09:54:01 +00:00
if err := services . RunEtcdSnapshotSave ( newCtx , host , c . PrivateRegistriesMap , backupImage , snapshotName , true , c . Services . Etcd , c . Version ) ; err != nil {
2022-06-23 22:30:58 +00:00
if strings . Contains ( err . Error ( ) , "failed to upload etcd snapshot file to s3 on host" ) {
s3UploadFailures ++
} else {
if snapshotErr == nil {
snapshotErr = err
}
snapshotFailures ++
}
2018-05-09 17:39:19 +00:00
}
}
2022-06-23 22:30:58 +00:00
if snapshotFailures == len ( c . EtcdHosts ) {
log . Warnf ( ctx , "[etcd] Failed to take snapshot on all etcd hosts: %s" , snapshotErr )
return fmt . Errorf ( "[etcd] Failed to take snapshot on all etcd hosts: %s" , snapshotErr )
} else if snapshotFailures > 0 {
log . Warnf ( ctx , "[etcd] Failed to take snapshot on %s etcd hosts" , snapshotFailures )
} else {
log . Infof ( ctx , "[etcd] Finished saving snapshot [%s] on all etcd hosts" , snapshotName )
}
2022-11-19 15:28:16 +00:00
if c . Services . Etcd . BackupConfig != nil && c . Services . Etcd . BackupConfig . S3BackupConfig == nil {
2022-06-23 22:30:58 +00:00
return nil
}
if s3UploadFailures >= len ( c . EtcdHosts ) - snapshotFailures {
log . Warnf ( ctx , "[etcd] Failed to upload etcd snapshot file to s3 on all etcd hosts" )
return fmt . Errorf ( "[etcd] Failed to upload etcd snapshot file to s3 on all etcd hosts" )
} else if s3UploadFailures > 0 {
log . Warnf ( ctx , "[etcd] Failed to upload etcd snapshot file to s3 on %s etcd hosts" , s3UploadFailures )
} else {
log . Infof ( ctx , "[etcd] Finished uploading etcd snapshot file to s3 on all etcd hosts" )
}
2018-05-09 17:39:19 +00:00
return nil
}
2019-03-02 02:28:40 +00:00
func ( c * Cluster ) DeployRestoreCerts ( ctx context . Context , clusterCerts map [ string ] pki . CertificatePKI ) error {
var errgrp errgroup . Group
hostsQueue := util . GetObjectQueue ( c . EtcdHosts )
restoreCerts := map [ string ] pki . CertificatePKI { }
for _ , n := range [ ] string { pki . CACertName , pki . KubeNodeCertName , pki . KubeNodeCertName } {
restoreCerts [ n ] = clusterCerts [ n ]
}
for w := 0 ; w < WorkerThreads ; w ++ {
errgrp . Go ( func ( ) error {
var errList [ ] error
for host := range hostsQueue {
2020-08-05 22:39:25 +00:00
h := host . ( * hosts . Host )
var env [ ] string
if h . IsWindows ( ) {
env = c . getWindowsEnv ( h )
}
if err := pki . DeployCertificatesOnPlaneHost (
ctx ,
h ,
c . RancherKubernetesEngineConfig ,
restoreCerts ,
c . SystemImages . CertDownloader ,
c . PrivateRegistriesMap ,
false ,
2021-03-16 09:54:01 +00:00
env ,
c . Version ) ; err != nil {
2019-03-02 02:28:40 +00:00
errList = append ( errList , err )
}
}
return util . ErrList ( errList )
} )
}
2021-06-06 07:16:44 +00:00
return errgrp . Wait ( )
2019-03-02 02:28:40 +00:00
}
2020-08-14 16:42:37 +00:00
func ( c * Cluster ) DeployStateFile ( ctx context . Context , stateFilePath , snapshotName string ) error {
2021-05-15 16:55:04 +00:00
stateFileExists , err := util . IsFileExists ( stateFilePath )
if err != nil {
logrus . Warnf ( "Could not read cluster state file from [%s], error: [%v]. Snapshot will be created without cluster state file. You can retrieve the cluster state file using 'rke util get-state-file'" , stateFilePath , err )
return nil
}
if ! stateFileExists {
logrus . Warnf ( "Could not read cluster state file from [%s], file does not exist. Snapshot will be created without cluster state file. You can retrieve the cluster state file using 'rke util get-state-file'" , stateFilePath )
return nil
}
2020-03-30 19:16:47 +00:00
var errgrp errgroup . Group
hostsQueue := util . GetObjectQueue ( c . EtcdHosts )
for w := 0 ; w < WorkerThreads ; w ++ {
errgrp . Go ( func ( ) error {
var errList [ ] error
for host := range hostsQueue {
2021-03-16 09:54:01 +00:00
err := pki . DeployStateOnPlaneHost ( ctx , host . ( * hosts . Host ) , c . SystemImages . CertDownloader , c . PrivateRegistriesMap , stateFilePath , snapshotName , c . Version )
2020-03-30 19:16:47 +00:00
if err != nil {
errList = append ( errList , err )
}
}
return util . ErrList ( errList )
} )
}
2021-06-06 07:16:44 +00:00
return errgrp . Wait ( )
2020-03-30 19:16:47 +00:00
}
func ( c * Cluster ) GetStateFileFromSnapshot ( ctx context . Context , snapshotName string ) ( string , error ) {
backupImage := c . getBackupImage ( )
for _ , host := range c . EtcdHosts {
2021-03-16 09:54:01 +00:00
stateFile , err := services . RunGetStateFileFromSnapshot ( ctx , host , c . PrivateRegistriesMap , backupImage , snapshotName , c . Services . Etcd , c . Version )
2020-03-30 19:16:47 +00:00
if err != nil || stateFile == "" {
logrus . Infof ( "Could not extract state file from snapshot [%s] on host [%s]" , snapshotName , host . Address )
continue
}
return stateFile , nil
}
return "" , fmt . Errorf ( "Unable to find statefile in snapshot [%s]" , snapshotName )
}
2019-01-28 17:17:44 +00:00
func ( c * Cluster ) PrepareBackup ( ctx context . Context , snapshotPath string ) error {
// local backup case
2019-03-13 23:18:57 +00:00
var backupReady bool
2019-01-28 17:17:44 +00:00
var backupServer * hosts . Host
2019-02-15 20:40:36 +00:00
backupImage := c . getBackupImage ( )
2019-02-27 05:34:54 +00:00
var errors [ ] error
2022-06-23 22:30:58 +00:00
// s3 backup case
if c . Services . Etcd . BackupConfig != nil &&
c . Services . Etcd . BackupConfig . S3BackupConfig != nil {
log . Infof ( ctx , "[etcd] etcd s3 backup configuration found, will use s3 as source" )
downloadFailed := false
for _ , host := range c . EtcdHosts {
if err := services . DownloadEtcdSnapshotFromS3 ( ctx , host , c . PrivateRegistriesMap , backupImage , snapshotPath , c . Services . Etcd , c . Version ) ; err != nil {
log . Warnf ( ctx , "failed to download snapshot [%s] from s3 on host [%s]: %v" , snapshotPath , host . Address , err )
downloadFailed = true
break
}
}
backupReady = ! downloadFailed
}
// legacy rke local backup or rancher local backup
if ! backupReady {
2020-01-20 17:43:35 +00:00
if c . Services . Etcd . BackupConfig == nil {
log . Infof ( ctx , "[etcd] No etcd snapshot configuration found, will use local as source" )
2022-06-23 22:30:58 +00:00
} else if c . Services . Etcd . BackupConfig . S3BackupConfig == nil {
2020-01-20 17:43:35 +00:00
log . Infof ( ctx , "[etcd] etcd snapshot configuration found and no s3 backup configuration found, will use local as source" )
2022-06-23 22:30:58 +00:00
} else {
log . Warnf ( ctx , "[etcd] etcd snapshot configuration found and s3 backup configuration failed, falling back to use local as source" )
2020-01-20 17:43:35 +00:00
}
2019-02-15 20:40:36 +00:00
// stop etcd on all etcd nodes, we need this because we start the backup server on the same port
2019-01-28 17:17:44 +00:00
for _ , host := range c . EtcdHosts {
if err := docker . StopContainer ( ctx , host . DClient , host . Address , services . EtcdContainerName ) ; err != nil {
log . Warnf ( ctx , "failed to stop etcd container on host [%s]: %v" , host . Address , err )
}
2019-09-20 23:54:44 +00:00
// start the download server, only one node should have it!
2021-03-16 09:54:01 +00:00
if err := services . StartBackupServer ( ctx , host , c . PrivateRegistriesMap , backupImage , snapshotPath , c . Version ) ; err != nil {
2019-09-20 23:54:44 +00:00
log . Warnf ( ctx , "failed to start backup server on host [%s]: %v" , host . Address , err )
errors = append ( errors , err )
continue
2019-01-28 17:17:44 +00:00
}
2019-09-20 23:54:44 +00:00
backupServer = host
break
2019-01-28 17:17:44 +00:00
}
2019-02-27 05:34:54 +00:00
if backupServer == nil { //failed to start the backupServer, I will cleanup and exit
for _ , host := range c . EtcdHosts {
if err := docker . StartContainer ( ctx , host . DClient , host . Address , services . EtcdContainerName ) ; err != nil {
log . Warnf ( ctx , "failed to start etcd container on host [%s]: %v" , host . Address , err )
}
}
return fmt . Errorf ( "failed to start backup server on all etcd nodes: %v" , errors )
}
2019-01-28 17:17:44 +00:00
// start downloading the snapshot
for _ , host := range c . EtcdHosts {
2019-09-20 23:54:44 +00:00
if host . Address == backupServer . Address { // we skip the backup server if it's there
2019-01-28 17:17:44 +00:00
continue
}
2021-03-16 09:54:01 +00:00
if err := services . DownloadEtcdSnapshotFromBackupServer ( ctx , host , c . PrivateRegistriesMap , backupImage , snapshotPath , backupServer , c . Version ) ; err != nil {
2019-01-28 17:17:44 +00:00
return err
}
}
// all good, let's remove the backup server container
if err := docker . DoRemoveContainer ( ctx , backupServer . DClient , services . EtcdServeBackupContainerName , backupServer . Address ) ; err != nil {
return err
}
2019-03-13 23:18:57 +00:00
backupReady = true
2019-01-28 17:17:44 +00:00
}
2019-03-13 23:18:57 +00:00
if ! backupReady {
return fmt . Errorf ( "failed to prepare backup for restore" )
2018-12-13 08:46:47 +00:00
}
2019-01-28 17:17:44 +00:00
// this applies to all cases!
2018-11-24 10:18:24 +00:00
if isEqual := c . etcdSnapshotChecksum ( ctx , snapshotPath ) ; ! isEqual {
return fmt . Errorf ( "etcd snapshots are not consistent" )
2018-05-09 17:39:19 +00:00
}
2019-01-28 17:17:44 +00:00
return nil
}
2019-02-15 20:40:36 +00:00
2019-01-28 17:17:44 +00:00
func ( c * Cluster ) RestoreEtcdSnapshot ( ctx context . Context , snapshotPath string ) error {
2018-05-09 17:39:19 +00:00
// Start restore process on all etcd hosts
initCluster := services . GetEtcdInitialCluster ( c . EtcdHosts )
2019-12-03 23:45:26 +00:00
backupImage := c . getBackupImage ( )
2023-09-25 15:32:14 +00:00
restoreImage := c . getRestoreImage ( )
2018-05-09 17:39:19 +00:00
for _ , host := range c . EtcdHosts {
2021-01-22 17:35:13 +00:00
containerTimeout := DefaultEtcdBackupConfigTimeout
if c . Services . Etcd . BackupConfig != nil && c . Services . Etcd . BackupConfig . Timeout > 0 {
containerTimeout = c . Services . Etcd . BackupConfig . Timeout
}
newCtx := context . WithValue ( ctx , docker . WaitTimeoutContextKey , containerTimeout )
2023-09-25 15:32:14 +00:00
if err := services . RestoreEtcdSnapshot ( newCtx , host , c . PrivateRegistriesMap , restoreImage , backupImage ,
2021-03-16 09:54:01 +00:00
snapshotPath , initCluster , c . Services . Etcd , c . Version ) ; err != nil {
2018-05-17 22:27:35 +00:00
return fmt . Errorf ( "[etcd] Failed to restore etcd snapshot: %v" , err )
2018-05-09 17:39:19 +00:00
}
}
return nil
}
2019-04-02 10:54:02 +00:00
func ( c * Cluster ) RemoveEtcdSnapshot ( ctx context . Context , snapshotName string ) error {
backupImage := c . getBackupImage ( )
for _ , host := range c . EtcdHosts {
2019-12-03 23:45:26 +00:00
if err := services . RunEtcdSnapshotRemove ( ctx , host , c . PrivateRegistriesMap , backupImage , snapshotName ,
2021-03-16 09:54:01 +00:00
false , c . Services . Etcd , c . Version ) ; err != nil {
2019-04-02 10:54:02 +00:00
return err
}
}
return nil
}
2018-11-24 10:18:24 +00:00
func ( c * Cluster ) etcdSnapshotChecksum ( ctx context . Context , snapshotPath string ) bool {
log . Infof ( ctx , "[etcd] Checking if all snapshots are identical" )
etcdChecksums := [ ] string { }
2019-02-15 20:40:36 +00:00
backupImage := c . getBackupImage ( )
2018-11-24 10:18:24 +00:00
for _ , etcdHost := range c . EtcdHosts {
2021-03-16 09:54:01 +00:00
checksum , err := services . GetEtcdSnapshotChecksum ( ctx , etcdHost , c . PrivateRegistriesMap , backupImage , snapshotPath , c . Version )
2018-11-24 10:18:24 +00:00
if err != nil {
return false
}
etcdChecksums = append ( etcdChecksums , checksum )
log . Infof ( ctx , "[etcd] Checksum of etcd snapshot on host [%s] is [%s]" , etcdHost . Address , checksum )
2018-05-09 17:39:19 +00:00
}
2018-11-24 10:18:24 +00:00
hostChecksum := etcdChecksums [ 0 ]
for _ , checksum := range etcdChecksums {
if checksum != hostChecksum {
return false
}
2018-05-09 17:39:19 +00:00
}
2018-11-24 10:18:24 +00:00
return true
2018-05-09 17:39:19 +00:00
}
2019-01-28 17:17:44 +00:00
2019-02-15 20:40:36 +00:00
func ( c * Cluster ) getBackupImage ( ) string {
2019-07-18 20:26:56 +00:00
rkeToolsImage , err := util . GetDefaultRKETools ( c . SystemImages . Alpine )
if err != nil {
logrus . Errorf ( "[etcd] error getting backup image %v" , err )
return ""
}
2020-01-20 17:43:35 +00:00
logrus . Debugf ( "[etcd] Image used for etcd snapshot is: [%s]" , rkeToolsImage )
2019-07-18 20:26:56 +00:00
return rkeToolsImage
2019-01-28 17:17:44 +00:00
}
2023-09-25 15:32:14 +00:00
// getRestoreImage picks the image used to run an etcd restore.
//
//   - A custom etcd system image (one whose name does not contain
//     "rancher/mirrored-coreos-etcd") is returned as is.
//   - Otherwise the image tag is parsed as a semantic version. Tags below
//     MinEtcdVersionWithDistrolessImage keep using the etcd image; all others
//     use the image returned by getBackupImage.
//
// Any failure to extract or parse a version is logged and results in an empty
// string.
func (c *Cluster) getRestoreImage() string {
	etcdImage := c.SystemImages.Etcd

	// use etcd image for restore in case of custom system image
	if !strings.Contains(etcdImage, "rancher/mirrored-coreos-etcd") {
		return etcdImage
	}

	tag, err := util.GetImageTagFromImage(etcdImage)
	if err != nil {
		logrus.Errorf("[etcd] getRestoreImage: error extracting tag from etcd image: %v", err)
		return ""
	}

	current, err := util.StrToSemVer(tag)
	if err != nil {
		logrus.Errorf("[etcd] getRestoreImage: error converting etcd image tag to semver: %v", err)
		return ""
	}

	distrolessFloor, err := util.StrToSemVer(MinEtcdVersionWithDistrolessImage)
	if err != nil {
		logrus.Errorf("[etcd] getRestoreImage: error converting min distroless etcd image version to semver: %v", err)
		return ""
	}

	if !current.LessThan(*distrolessFloor) {
		return c.getBackupImage()
	}
	return etcdImage
}