2018-05-09 17:39:19 +00:00
package cluster
import (
"context"
"fmt"
2019-08-17 00:15:58 +00:00
"github.com/sirupsen/logrus"
2019-01-28 17:17:44 +00:00
"github.com/rancher/rke/docker"
"github.com/rancher/rke/hosts"
2018-11-24 10:18:24 +00:00
"github.com/rancher/rke/log"
2019-03-02 02:28:40 +00:00
"github.com/rancher/rke/pki"
2018-05-09 17:39:19 +00:00
"github.com/rancher/rke/services"
2019-01-28 17:17:44 +00:00
"github.com/rancher/rke/util"
2019-03-02 02:28:40 +00:00
"golang.org/x/sync/errgroup"
2019-01-28 17:17:44 +00:00
)
2018-05-17 22:27:35 +00:00
func ( c * Cluster ) SnapshotEtcd ( ctx context . Context , snapshotName string ) error {
2019-02-15 20:40:36 +00:00
backupImage := c . getBackupImage ( )
2018-05-09 17:39:19 +00:00
for _ , host := range c . EtcdHosts {
2021-01-22 17:35:13 +00:00
containerTimeout := DefaultEtcdBackupConfigTimeout
if c . Services . Etcd . BackupConfig != nil && c . Services . Etcd . BackupConfig . Timeout > 0 {
containerTimeout = c . Services . Etcd . BackupConfig . Timeout
}
newCtx := context . WithValue ( ctx , docker . WaitTimeoutContextKey , containerTimeout )
if err := services . RunEtcdSnapshotSave ( newCtx , host , c . PrivateRegistriesMap , backupImage , snapshotName , true , c . Services . Etcd ) ; err != nil {
2018-05-09 17:39:19 +00:00
return err
}
}
return nil
}
2019-03-02 02:28:40 +00:00
func ( c * Cluster ) DeployRestoreCerts ( ctx context . Context , clusterCerts map [ string ] pki . CertificatePKI ) error {
var errgrp errgroup . Group
hostsQueue := util . GetObjectQueue ( c . EtcdHosts )
restoreCerts := map [ string ] pki . CertificatePKI { }
for _ , n := range [ ] string { pki . CACertName , pki . KubeNodeCertName , pki . KubeNodeCertName } {
restoreCerts [ n ] = clusterCerts [ n ]
}
for w := 0 ; w < WorkerThreads ; w ++ {
errgrp . Go ( func ( ) error {
var errList [ ] error
for host := range hostsQueue {
2020-08-05 22:39:25 +00:00
h := host . ( * hosts . Host )
var env [ ] string
if h . IsWindows ( ) {
env = c . getWindowsEnv ( h )
}
if err := pki . DeployCertificatesOnPlaneHost (
ctx ,
h ,
c . RancherKubernetesEngineConfig ,
restoreCerts ,
c . SystemImages . CertDownloader ,
c . PrivateRegistriesMap ,
false ,
env ) ; err != nil {
2019-03-02 02:28:40 +00:00
errList = append ( errList , err )
}
}
return util . ErrList ( errList )
} )
}
if err := errgrp . Wait ( ) ; err != nil {
return err
}
return nil
}
2020-08-14 16:42:37 +00:00
func ( c * Cluster ) DeployStateFile ( ctx context . Context , stateFilePath , snapshotName string ) error {
2021-05-15 16:55:04 +00:00
stateFileExists , err := util . IsFileExists ( stateFilePath )
if err != nil {
logrus . Warnf ( "Could not read cluster state file from [%s], error: [%v]. Snapshot will be created without cluster state file. You can retrieve the cluster state file using 'rke util get-state-file'" , stateFilePath , err )
return nil
}
if ! stateFileExists {
logrus . Warnf ( "Could not read cluster state file from [%s], file does not exist. Snapshot will be created without cluster state file. You can retrieve the cluster state file using 'rke util get-state-file'" , stateFilePath )
return nil
}
2020-03-30 19:16:47 +00:00
var errgrp errgroup . Group
hostsQueue := util . GetObjectQueue ( c . EtcdHosts )
for w := 0 ; w < WorkerThreads ; w ++ {
errgrp . Go ( func ( ) error {
var errList [ ] error
for host := range hostsQueue {
2020-08-14 16:42:37 +00:00
err := pki . DeployStateOnPlaneHost ( ctx , host . ( * hosts . Host ) , c . SystemImages . CertDownloader , c . PrivateRegistriesMap , stateFilePath , snapshotName )
2020-03-30 19:16:47 +00:00
if err != nil {
errList = append ( errList , err )
}
}
return util . ErrList ( errList )
} )
}
if err := errgrp . Wait ( ) ; err != nil {
return err
}
return nil
}
// GetStateFileFromSnapshot asks each etcd host in turn for the state file
// stored in snapshot snapshotName and returns the first non-empty result.
//
// A host that returns an error or an empty state file is logged and skipped.
// An error is returned only when no host yields a state file.
func (c *Cluster) GetStateFileFromSnapshot(ctx context.Context, snapshotName string) (string, error) {
	image := c.getBackupImage()
	for _, etcdHost := range c.EtcdHosts {
		contents, err := services.RunGetStateFileFromSnapshot(ctx, etcdHost, c.PrivateRegistriesMap, image, snapshotName, c.Services.Etcd)
		if err == nil && contents != "" {
			return contents, nil
		}
		logrus.Infof("Could not extract state file from snapshot [%s] on host [%s]", snapshotName, etcdHost.Address)
	}
	return "", fmt.Errorf("Unable to find statefile in snapshot [%s]", snapshotName)
}
2019-01-28 17:17:44 +00:00
func ( c * Cluster ) PrepareBackup ( ctx context . Context , snapshotPath string ) error {
// local backup case
2019-03-13 23:18:57 +00:00
var backupReady bool
2019-01-28 17:17:44 +00:00
var backupServer * hosts . Host
2019-02-15 20:40:36 +00:00
backupImage := c . getBackupImage ( )
2019-02-27 05:34:54 +00:00
var errors [ ] error
2019-03-11 23:04:08 +00:00
if c . Services . Etcd . BackupConfig == nil || // legacy rke local backup
2020-01-20 17:43:35 +00:00
( c . Services . Etcd . BackupConfig != nil && c . Services . Etcd . BackupConfig . S3BackupConfig == nil ) { // rancher local backup
if c . Services . Etcd . BackupConfig == nil {
log . Infof ( ctx , "[etcd] No etcd snapshot configuration found, will use local as source" )
}
if c . Services . Etcd . BackupConfig != nil && c . Services . Etcd . BackupConfig . S3BackupConfig == nil {
log . Infof ( ctx , "[etcd] etcd snapshot configuration found and no s3 backup configuration found, will use local as source" )
}
2019-02-15 20:40:36 +00:00
// stop etcd on all etcd nodes, we need this because we start the backup server on the same port
2019-01-28 17:17:44 +00:00
for _ , host := range c . EtcdHosts {
if err := docker . StopContainer ( ctx , host . DClient , host . Address , services . EtcdContainerName ) ; err != nil {
log . Warnf ( ctx , "failed to stop etcd container on host [%s]: %v" , host . Address , err )
}
2019-09-20 23:54:44 +00:00
// start the download server, only one node should have it!
if err := services . StartBackupServer ( ctx , host , c . PrivateRegistriesMap , backupImage , snapshotPath ) ; err != nil {
log . Warnf ( ctx , "failed to start backup server on host [%s]: %v" , host . Address , err )
errors = append ( errors , err )
continue
2019-01-28 17:17:44 +00:00
}
2019-09-20 23:54:44 +00:00
backupServer = host
break
2019-01-28 17:17:44 +00:00
}
2019-02-27 05:34:54 +00:00
if backupServer == nil { //failed to start the backupServer, I will cleanup and exit
for _ , host := range c . EtcdHosts {
if err := docker . StartContainer ( ctx , host . DClient , host . Address , services . EtcdContainerName ) ; err != nil {
log . Warnf ( ctx , "failed to start etcd container on host [%s]: %v" , host . Address , err )
}
}
return fmt . Errorf ( "failed to start backup server on all etcd nodes: %v" , errors )
}
2019-01-28 17:17:44 +00:00
// start downloading the snapshot
for _ , host := range c . EtcdHosts {
2019-09-20 23:54:44 +00:00
if host . Address == backupServer . Address { // we skip the backup server if it's there
2019-01-28 17:17:44 +00:00
continue
}
2019-02-15 20:40:36 +00:00
if err := services . DownloadEtcdSnapshotFromBackupServer ( ctx , host , c . PrivateRegistriesMap , backupImage , snapshotPath , backupServer ) ; err != nil {
2019-01-28 17:17:44 +00:00
return err
}
}
// all good, let's remove the backup server container
if err := docker . DoRemoveContainer ( ctx , backupServer . DClient , services . EtcdServeBackupContainerName , backupServer . Address ) ; err != nil {
return err
}
2019-03-13 23:18:57 +00:00
backupReady = true
2019-01-28 17:17:44 +00:00
}
// s3 backup case
2019-03-08 04:05:38 +00:00
if c . Services . Etcd . BackupConfig != nil &&
2020-01-20 17:43:35 +00:00
c . Services . Etcd . BackupConfig . S3BackupConfig != nil {
log . Infof ( ctx , "[etcd] etcd s3 backup configuration found, will use s3 as source" )
2018-12-13 08:46:47 +00:00
for _ , host := range c . EtcdHosts {
2019-02-15 20:40:36 +00:00
if err := services . DownloadEtcdSnapshotFromS3 ( ctx , host , c . PrivateRegistriesMap , backupImage , snapshotPath , c . Services . Etcd ) ; err != nil {
2018-12-13 08:46:47 +00:00
return err
}
}
2019-03-13 23:18:57 +00:00
backupReady = true
}
if ! backupReady {
return fmt . Errorf ( "failed to prepare backup for restore" )
2018-12-13 08:46:47 +00:00
}
2019-01-28 17:17:44 +00:00
// this applies to all cases!
2018-11-24 10:18:24 +00:00
if isEqual := c . etcdSnapshotChecksum ( ctx , snapshotPath ) ; ! isEqual {
return fmt . Errorf ( "etcd snapshots are not consistent" )
2018-05-09 17:39:19 +00:00
}
2019-01-28 17:17:44 +00:00
return nil
}
2019-02-15 20:40:36 +00:00
2019-01-28 17:17:44 +00:00
func ( c * Cluster ) RestoreEtcdSnapshot ( ctx context . Context , snapshotPath string ) error {
2018-05-09 17:39:19 +00:00
// Start restore process on all etcd hosts
initCluster := services . GetEtcdInitialCluster ( c . EtcdHosts )
2019-12-03 23:45:26 +00:00
backupImage := c . getBackupImage ( )
2018-05-09 17:39:19 +00:00
for _ , host := range c . EtcdHosts {
2021-01-22 17:35:13 +00:00
containerTimeout := DefaultEtcdBackupConfigTimeout
if c . Services . Etcd . BackupConfig != nil && c . Services . Etcd . BackupConfig . Timeout > 0 {
containerTimeout = c . Services . Etcd . BackupConfig . Timeout
}
newCtx := context . WithValue ( ctx , docker . WaitTimeoutContextKey , containerTimeout )
if err := services . RestoreEtcdSnapshot ( newCtx , host , c . PrivateRegistriesMap , c . SystemImages . Etcd , backupImage ,
2019-12-03 23:45:26 +00:00
snapshotPath , initCluster , c . Services . Etcd ) ; err != nil {
2018-05-17 22:27:35 +00:00
return fmt . Errorf ( "[etcd] Failed to restore etcd snapshot: %v" , err )
2018-05-09 17:39:19 +00:00
}
}
return nil
}
2019-04-02 10:54:02 +00:00
func ( c * Cluster ) RemoveEtcdSnapshot ( ctx context . Context , snapshotName string ) error {
backupImage := c . getBackupImage ( )
for _ , host := range c . EtcdHosts {
2019-12-03 23:45:26 +00:00
if err := services . RunEtcdSnapshotRemove ( ctx , host , c . PrivateRegistriesMap , backupImage , snapshotName ,
false , c . Services . Etcd ) ; err != nil {
2019-04-02 10:54:02 +00:00
return err
}
}
return nil
}
2018-11-24 10:18:24 +00:00
func ( c * Cluster ) etcdSnapshotChecksum ( ctx context . Context , snapshotPath string ) bool {
log . Infof ( ctx , "[etcd] Checking if all snapshots are identical" )
etcdChecksums := [ ] string { }
2019-02-15 20:40:36 +00:00
backupImage := c . getBackupImage ( )
2018-11-24 10:18:24 +00:00
for _ , etcdHost := range c . EtcdHosts {
2019-02-15 20:40:36 +00:00
checksum , err := services . GetEtcdSnapshotChecksum ( ctx , etcdHost , c . PrivateRegistriesMap , backupImage , snapshotPath )
2018-11-24 10:18:24 +00:00
if err != nil {
return false
}
etcdChecksums = append ( etcdChecksums , checksum )
log . Infof ( ctx , "[etcd] Checksum of etcd snapshot on host [%s] is [%s]" , etcdHost . Address , checksum )
2018-05-09 17:39:19 +00:00
}
2018-11-24 10:18:24 +00:00
hostChecksum := etcdChecksums [ 0 ]
for _ , checksum := range etcdChecksums {
if checksum != hostChecksum {
return false
}
2018-05-09 17:39:19 +00:00
}
2018-11-24 10:18:24 +00:00
return true
2018-05-09 17:39:19 +00:00
}
2019-01-28 17:17:44 +00:00
2019-02-15 20:40:36 +00:00
func ( c * Cluster ) getBackupImage ( ) string {
2019-07-18 20:26:56 +00:00
rkeToolsImage , err := util . GetDefaultRKETools ( c . SystemImages . Alpine )
if err != nil {
logrus . Errorf ( "[etcd] error getting backup image %v" , err )
return ""
}
2020-01-20 17:43:35 +00:00
logrus . Debugf ( "[etcd] Image used for etcd snapshot is: [%s]" , rkeToolsImage )
2019-07-18 20:26:56 +00:00
return rkeToolsImage
2019-01-28 17:17:44 +00:00
}