2018-05-09 17:39:19 +00:00
package cmd
import (
"context"
2020-03-11 01:18:17 +00:00
"crypto/x509"
2018-05-09 17:39:19 +00:00
"fmt"
2019-03-01 00:06:45 +00:00
"strings"
2018-05-17 22:27:35 +00:00
"time"
2018-05-09 17:39:19 +00:00
2020-03-11 01:18:17 +00:00
"github.com/pkg/errors"
2018-05-09 17:39:19 +00:00
"github.com/rancher/rke/cluster"
"github.com/rancher/rke/hosts"
"github.com/rancher/rke/log"
"github.com/rancher/rke/pki"
2020-07-11 16:24:19 +00:00
v3 "github.com/rancher/rke/types"
2018-05-17 22:27:35 +00:00
"github.com/sirupsen/logrus"
2018-05-09 17:39:19 +00:00
"github.com/urfave/cli"
)
2018-12-13 08:46:47 +00:00
// s3Endpoint is the default value of the "s3-endpoint" CLI flag.
const s3Endpoint = "s3.amazonaws.com"
2018-05-09 17:39:19 +00:00
func EtcdCommand ( ) cli . Command {
2018-05-17 22:27:35 +00:00
snapshotFlags := [ ] cli . Flag {
2018-05-09 17:39:19 +00:00
cli . StringFlag {
Name : "name" ,
2018-12-13 08:46:47 +00:00
Usage : "Specify snapshot name" ,
2018-05-09 17:39:19 +00:00
} ,
cli . StringFlag {
Name : "config" ,
Usage : "Specify an alternate cluster YAML file" ,
Value : pki . ClusterConfig ,
EnvVar : "RKE_CONFIG" ,
} ,
2018-12-13 08:46:47 +00:00
cli . BoolFlag {
Name : "s3" ,
2019-02-21 18:38:25 +00:00
Usage : "Enabled backup to s3" ,
2018-12-13 08:46:47 +00:00
} ,
cli . StringFlag {
Name : "s3-endpoint" ,
Usage : "Specify s3 endpoint url" ,
Value : s3Endpoint ,
} ,
2019-06-11 22:31:01 +00:00
cli . StringFlag {
Name : "s3-endpoint-ca" ,
Usage : "Specify a custom CA cert to connect to S3 endpoint" ,
} ,
2018-12-13 08:46:47 +00:00
cli . StringFlag {
Name : "access-key" ,
Usage : "Specify s3 accessKey" ,
} ,
cli . StringFlag {
Name : "secret-key" ,
Usage : "Specify s3 secretKey" ,
} ,
cli . StringFlag {
Name : "bucket-name" ,
Usage : "Specify s3 bucket name" ,
} ,
cli . StringFlag {
Name : "region" ,
Usage : "Specify the s3 bucket location (optional)" ,
} ,
2019-07-18 16:43:07 +00:00
cli . StringFlag {
Name : "folder" ,
Usage : "Specify s3 folder name" ,
} ,
2018-05-09 17:39:19 +00:00
}
2020-03-12 16:02:44 +00:00
snapshotSaveFlags := append ( snapshotFlags , commonFlags ... )
snapshotRestoreFlags := [ ] cli . Flag {
cli . StringFlag {
Name : "cert-dir" ,
Usage : "Specify a certificate dir path" ,
} ,
cli . BoolFlag {
Name : "custom-certs" ,
Usage : "Use custom certificates from a cert dir" ,
} ,
2020-08-03 13:35:38 +00:00
cli . BoolFlag {
Name : "use-local-state" ,
Usage : "Use local state file (do not check or use snapshot archive for state file)" ,
} ,
2020-03-12 16:02:44 +00:00
}
snapshotRestoreFlags = append ( append ( snapshotFlags , snapshotRestoreFlags ... ) , commonFlags ... )
2018-05-09 17:39:19 +00:00
return cli . Command {
Name : "etcd" ,
2018-05-17 22:27:35 +00:00
Usage : "etcd snapshot save/restore operations in k8s cluster" ,
2018-05-09 17:39:19 +00:00
Subcommands : [ ] cli . Command {
{
2018-05-17 15:59:54 +00:00
Name : "snapshot-save" ,
Usage : "Take snapshot on all etcd hosts" ,
2020-03-12 16:02:44 +00:00
Flags : snapshotSaveFlags ,
2018-05-17 22:27:35 +00:00
Action : SnapshotSaveEtcdHostsFromCli ,
2018-05-09 17:39:19 +00:00
} ,
{
2018-05-17 15:59:54 +00:00
Name : "snapshot-restore" ,
Usage : "Restore existing snapshot" ,
2020-03-12 16:02:44 +00:00
Flags : snapshotRestoreFlags ,
2018-05-17 22:27:35 +00:00
Action : RestoreEtcdSnapshotFromCli ,
2018-05-09 17:39:19 +00:00
} ,
} ,
}
}
2018-05-17 22:27:35 +00:00
func SnapshotSaveEtcdHosts (
2018-05-09 17:39:19 +00:00
ctx context . Context ,
rkeConfig * v3 . RancherKubernetesEngineConfig ,
2018-11-07 23:54:08 +00:00
dialersOptions hosts . DialersOptions ,
flags cluster . ExternalFlags , snapshotName string ) error {
2018-05-09 17:39:19 +00:00
2018-05-17 15:59:54 +00:00
log . Infof ( ctx , "Starting saving snapshot on etcd hosts" )
2020-03-30 19:16:47 +00:00
stateFilePath := cluster . GetStateFilePath ( flags . ClusterFilePath , flags . ConfigDir )
2019-10-03 01:56:39 +00:00
kubeCluster , err := cluster . InitClusterObject ( ctx , rkeConfig , flags , "" )
2018-05-09 17:39:19 +00:00
if err != nil {
return err
}
2018-11-07 23:54:08 +00:00
if err := kubeCluster . SetupDialers ( ctx , dialersOptions ) ; err != nil {
2018-11-03 01:45:23 +00:00
return err
}
2018-05-09 17:39:19 +00:00
2018-11-07 23:54:08 +00:00
if err := kubeCluster . TunnelHosts ( ctx , flags ) ; err != nil {
2018-05-09 17:39:19 +00:00
return err
}
2020-08-14 16:42:37 +00:00
if err := kubeCluster . DeployStateFile ( ctx , stateFilePath , snapshotName ) ; err != nil {
2020-03-30 19:16:47 +00:00
return err
}
2018-11-22 00:20:24 +00:00
if err := kubeCluster . SnapshotEtcd ( ctx , snapshotName ) ; err != nil {
2018-05-29 15:21:24 +00:00
return err
}
2019-08-21 21:08:30 +00:00
log . Infof ( ctx , "Finished saving/uploading snapshot [%s] on all etcd hosts" , snapshotName )
2018-05-09 17:39:19 +00:00
return nil
}
2018-05-17 22:27:35 +00:00
func RestoreEtcdSnapshot (
2018-05-09 17:39:19 +00:00
ctx context . Context ,
rkeConfig * v3 . RancherKubernetesEngineConfig ,
2018-11-07 23:54:08 +00:00
dialersOptions hosts . DialersOptions ,
2019-06-17 20:52:15 +00:00
flags cluster . ExternalFlags ,
data map [ string ] interface { } ,
snapshotName string ) ( string , string , string , string , map [ string ] pki . CertificatePKI , error ) {
2019-03-09 01:51:41 +00:00
var APIURL , caCrt , clientCert , clientKey string
2019-10-03 01:56:39 +00:00
2020-03-30 19:16:47 +00:00
rkeFullState := & cluster . FullState { }
stateFileRetrieved := false
// Local state file
2019-10-03 01:56:39 +00:00
stateFilePath := cluster . GetStateFilePath ( flags . ClusterFilePath , flags . ConfigDir )
2020-08-03 13:35:38 +00:00
if ! flags . UseLocalState {
log . Infof ( ctx , "Checking if state file is included in snapshot file for [%s]" , snapshotName )
// Creating temp cluster to check if snapshot archive contains state file and retrieve it
tempCluster , err := cluster . InitClusterObject ( ctx , rkeConfig , flags , "" )
if err != nil {
return APIURL , caCrt , clientCert , clientKey , nil , err
}
if err := tempCluster . SetupDialers ( ctx , dialersOptions ) ; err != nil {
return APIURL , caCrt , clientCert , clientKey , nil , err
}
if err := tempCluster . TunnelHosts ( ctx , flags ) ; err != nil {
return APIURL , caCrt , clientCert , clientKey , nil , err
}
// Extract state file from snapshot
stateFile , err := tempCluster . GetStateFileFromSnapshot ( ctx , snapshotName )
// If state file is not in snapshot (or can't be retrieved), fallback to local state file
if err != nil {
logrus . Infof ( "Could not extract state file from snapshot [%s] on any host, falling back to local state file: %v" , snapshotName , err )
rkeFullState , _ = cluster . ReadStateFile ( ctx , stateFilePath )
} else {
// Parse extracted state file to FullState struct
rkeFullState , err = cluster . StringToFullState ( ctx , stateFile )
if err != nil {
logrus . Errorf ( "Error when converting state file contents to rkeFullState: %v" , err )
return APIURL , caCrt , clientCert , clientKey , nil , err
}
logrus . Infof ( "State file is successfully extracted from snapshot [%s]" , snapshotName )
stateFileRetrieved = true
}
2020-03-30 19:16:47 +00:00
} else {
2020-08-03 13:35:38 +00:00
var err error
log . Infof ( ctx , "Not checking if state file is included in snapshot file for [%s], using local state file [%s]" , snapshotName , stateFilePath )
rkeFullState , err = cluster . ReadStateFile ( ctx , stateFilePath )
2020-03-30 19:16:47 +00:00
if err != nil {
return APIURL , caCrt , clientCert , clientKey , nil , err
}
}
log . Infof ( ctx , "Restoring etcd snapshot %s" , snapshotName )
2019-10-03 01:56:39 +00:00
kubeCluster , err := cluster . InitClusterObject ( ctx , rkeConfig , flags , rkeFullState . DesiredState . EncryptionConfig )
2018-11-24 10:18:24 +00:00
if err != nil {
2019-03-09 01:51:41 +00:00
return APIURL , caCrt , clientCert , clientKey , nil , err
2018-11-24 10:18:24 +00:00
}
2020-03-11 01:18:17 +00:00
if err := validateCerts ( rkeFullState . DesiredState ) ; err != nil {
return APIURL , caCrt , clientCert , clientKey , nil , err
}
2020-08-03 13:35:38 +00:00
// If we can't retrieve state file from snapshot, and we don't have local, we need to check for legacy cluster
if ! stateFileRetrieved || flags . UseLocalState {
2020-03-30 19:16:47 +00:00
if err := checkLegacyCluster ( ctx , kubeCluster , rkeFullState , flags ) ; err != nil {
return APIURL , caCrt , clientCert , clientKey , nil , err
}
2019-02-14 21:00:37 +00:00
}
2018-11-24 10:18:24 +00:00
rkeFullState . CurrentState = cluster . State { }
if err := rkeFullState . WriteStateFile ( ctx , stateFilePath ) ; err != nil {
2019-03-09 01:51:41 +00:00
return APIURL , caCrt , clientCert , clientKey , nil , err
2018-11-24 10:18:24 +00:00
}
2018-11-07 23:54:08 +00:00
if err := kubeCluster . SetupDialers ( ctx , dialersOptions ) ; err != nil {
2019-03-09 01:51:41 +00:00
return APIURL , caCrt , clientCert , clientKey , nil , err
2018-11-03 01:45:23 +00:00
}
2018-11-07 23:54:08 +00:00
if err := kubeCluster . TunnelHosts ( ctx , flags ) ; err != nil {
2019-03-09 01:51:41 +00:00
return APIURL , caCrt , clientCert , clientKey , nil , err
2018-05-09 17:39:19 +00:00
}
2019-03-02 02:28:40 +00:00
// if we fail after cleanup, we can't find the certs to do the download, we need to redeploy them
if err := kubeCluster . DeployRestoreCerts ( ctx , rkeFullState . DesiredState . CertificatesBundle ) ; err != nil {
2019-03-09 01:51:41 +00:00
return APIURL , caCrt , clientCert , clientKey , nil , err
2019-03-02 02:28:40 +00:00
}
2019-01-28 17:17:44 +00:00
// first download and check
if err := kubeCluster . PrepareBackup ( ctx , snapshotName ) ; err != nil {
2019-03-09 01:51:41 +00:00
return APIURL , caCrt , clientCert , clientKey , nil , err
2019-01-28 17:17:44 +00:00
}
2018-11-24 10:18:24 +00:00
log . Infof ( ctx , "Cleaning old kubernetes cluster" )
if err := kubeCluster . CleanupNodes ( ctx ) ; err != nil {
2019-03-09 01:51:41 +00:00
return APIURL , caCrt , clientCert , clientKey , nil , err
2018-11-24 10:18:24 +00:00
}
2018-05-17 22:27:35 +00:00
if err := kubeCluster . RestoreEtcdSnapshot ( ctx , snapshotName ) ; err != nil {
2019-03-09 01:51:41 +00:00
return APIURL , caCrt , clientCert , clientKey , nil , err
2018-05-09 17:39:19 +00:00
}
2018-11-22 00:20:24 +00:00
2018-11-24 10:18:24 +00:00
if err := ClusterInit ( ctx , rkeConfig , dialersOptions , flags ) ; err != nil {
2019-03-09 01:51:41 +00:00
return APIURL , caCrt , clientCert , clientKey , nil , err
2018-11-24 10:18:24 +00:00
}
2019-06-17 20:52:15 +00:00
APIURL , caCrt , clientCert , clientKey , certs , err := ClusterUp ( ctx , dialersOptions , flags , data )
2019-03-09 01:51:41 +00:00
if err != nil {
2019-03-01 00:06:45 +00:00
if ! strings . Contains ( err . Error ( ) , "Provisioning incomplete" ) {
2019-03-09 01:51:41 +00:00
return APIURL , caCrt , clientCert , clientKey , nil , err
2019-03-01 00:06:45 +00:00
}
log . Warnf ( ctx , err . Error ( ) )
2018-11-24 10:18:24 +00:00
}
2019-03-01 00:06:45 +00:00
2018-11-24 10:18:24 +00:00
if err := cluster . RestartClusterPods ( ctx , kubeCluster ) ; err != nil {
2019-03-09 01:51:41 +00:00
return APIURL , caCrt , clientCert , clientKey , nil , err
2018-11-24 10:18:24 +00:00
}
if err := kubeCluster . RemoveOldNodes ( ctx ) ; err != nil {
2019-03-09 01:51:41 +00:00
return APIURL , caCrt , clientCert , clientKey , nil , err
2018-11-24 10:18:24 +00:00
}
2018-05-17 22:27:35 +00:00
log . Infof ( ctx , "Finished restoring snapshot [%s] on all etcd hosts" , snapshotName )
2019-03-09 01:51:41 +00:00
return APIURL , caCrt , clientCert , clientKey , certs , err
2018-05-09 17:39:19 +00:00
}
2020-03-11 01:18:17 +00:00
func validateCerts ( state cluster . State ) error {
var failedErrs error
if state . RancherKubernetesEngineConfig == nil {
// possibly already started a restore
return nil
}
for name , certPKI := range state . CertificatesBundle {
if name == pki . ServiceAccountTokenKeyName || name == pki . RequestHeaderCACertName || name == pki . KubeAdminCertName {
continue
}
cert := certPKI . Certificate
if cert == nil {
if failedErrs == nil {
failedErrs = fmt . Errorf ( "Certificate [%s] is nil" , certPKI . Name )
} else {
failedErrs = errors . Wrap ( failedErrs , fmt . Sprintf ( "Certificate [%s] is nil" , certPKI . Name ) )
}
2020-03-18 20:12:53 +00:00
continue
2020-03-11 01:18:17 +00:00
}
certPool := x509 . NewCertPool ( )
certPool . AddCert ( cert )
if _ , err := cert . Verify ( x509 . VerifyOptions { Roots : certPool , KeyUsages : [ ] x509 . ExtKeyUsage { x509 . ExtKeyUsageClientAuth } } ) ; err != nil {
if failedErrs == nil {
failedErrs = fmt . Errorf ( "Certificate [%s] failed verification: %v" , certPKI . Name , err )
2020-03-18 23:52:19 +00:00
} else {
failedErrs = errors . Wrap ( failedErrs , fmt . Sprintf ( "Certificate [%s] failed verification: %v" , certPKI . Name , err ) )
2020-03-11 01:18:17 +00:00
}
}
}
if failedErrs != nil {
return errors . Wrap ( failedErrs , "[etcd] Failed to restore etcd snapshot: invalid certs" )
}
return nil
}
2018-05-17 22:27:35 +00:00
func SnapshotSaveEtcdHostsFromCli ( ctx * cli . Context ) error {
2019-09-03 20:01:22 +00:00
logrus . Infof ( "Running RKE version: %v" , ctx . App . Version )
2018-05-09 17:39:19 +00:00
clusterFile , filePath , err := resolveClusterFile ( ctx )
if err != nil {
2018-12-13 08:46:47 +00:00
return fmt . Errorf ( "failed to resolve cluster file: %v" , err )
2018-05-09 17:39:19 +00:00
}
rkeConfig , err := cluster . ParseConfig ( clusterFile )
if err != nil {
2018-12-13 08:46:47 +00:00
return fmt . Errorf ( "failed to parse cluster file: %v" , err )
2018-05-09 17:39:19 +00:00
}
rkeConfig , err = setOptionsFromCLI ( ctx , rkeConfig )
if err != nil {
return err
}
2018-05-17 22:27:35 +00:00
// Check snapshot name
etcdSnapshotName := ctx . String ( "name" )
if etcdSnapshotName == "" {
etcdSnapshotName = fmt . Sprintf ( "rke_etcd_snapshot_%s" , time . Now ( ) . Format ( time . RFC3339 ) )
2021-05-15 16:55:04 +00:00
logrus . Warnf ( "Name of the snapshot is not specified, using [%s]" , etcdSnapshotName )
2018-05-17 22:27:35 +00:00
}
2018-11-07 23:54:08 +00:00
// setting up the flags
2020-08-03 13:35:38 +00:00
flags := cluster . GetExternalFlags ( false , false , false , false , "" , filePath )
2018-11-07 23:54:08 +00:00
return SnapshotSaveEtcdHosts ( context . Background ( ) , rkeConfig , hosts . DialersOptions { } , flags , etcdSnapshotName )
2018-05-09 17:39:19 +00:00
}
2018-05-17 22:27:35 +00:00
func RestoreEtcdSnapshotFromCli ( ctx * cli . Context ) error {
2019-09-03 20:01:22 +00:00
logrus . Infof ( "Running RKE version: %v" , ctx . App . Version )
2018-05-09 17:39:19 +00:00
clusterFile , filePath , err := resolveClusterFile ( ctx )
if err != nil {
2018-12-13 08:46:47 +00:00
return fmt . Errorf ( "failed to resolve cluster file: %v" , err )
2018-05-09 17:39:19 +00:00
}
rkeConfig , err := cluster . ParseConfig ( clusterFile )
if err != nil {
2018-12-13 08:46:47 +00:00
return fmt . Errorf ( "failed to parse cluster file: %v" , err )
2018-05-09 17:39:19 +00:00
}
rkeConfig , err = setOptionsFromCLI ( ctx , rkeConfig )
if err != nil {
return err
}
2018-05-17 22:27:35 +00:00
etcdSnapshotName := ctx . String ( "name" )
if etcdSnapshotName == "" {
2018-12-13 08:46:47 +00:00
return fmt . Errorf ( "you must specify the snapshot name to restore" )
2018-05-17 22:27:35 +00:00
}
2021-01-17 15:09:12 +00:00
// Warn user if etcdSnapshotName contains extension (should just be snapshotname, not the filename)
if strings . HasSuffix ( etcdSnapshotName , ".zip" ) {
logrus . Warnf ( "The snapshot name [%s] ends with the file extension (.zip) which is not needed, the snapshot name should be provided without the extension" , etcdSnapshotName )
}
2018-11-07 23:54:08 +00:00
// setting up the flags
2020-08-03 13:35:38 +00:00
// flag to use local state file
useLocalState := ctx . Bool ( "use-local-state" )
flags := cluster . GetExternalFlags ( false , false , false , useLocalState , "" , filePath )
2020-03-12 16:02:44 +00:00
// Custom certificates and certificate dir flags
flags . CertificateDir = ctx . String ( "cert-dir" )
flags . CustomCerts = ctx . Bool ( "custom-certs" )
2018-11-07 23:54:08 +00:00
2019-06-17 20:52:15 +00:00
_ , _ , _ , _ , _ , err = RestoreEtcdSnapshot ( context . Background ( ) , rkeConfig , hosts . DialersOptions { } , flags , map [ string ] interface { } { } , etcdSnapshotName )
2019-03-09 01:51:41 +00:00
return err
2018-05-09 17:39:19 +00:00
}
2019-04-02 10:54:02 +00:00
func SnapshotRemoveFromEtcdHosts (
ctx context . Context ,
rkeConfig * v3 . RancherKubernetesEngineConfig ,
dialersOptions hosts . DialersOptions ,
flags cluster . ExternalFlags , snapshotName string ) error {
log . Infof ( ctx , "Starting snapshot remove on etcd hosts" )
2019-10-03 01:56:39 +00:00
kubeCluster , err := cluster . InitClusterObject ( ctx , rkeConfig , flags , "" )
2019-04-02 10:54:02 +00:00
if err != nil {
return err
}
if err := kubeCluster . SetupDialers ( ctx , dialersOptions ) ; err != nil {
return err
}
if err := kubeCluster . TunnelHosts ( ctx , flags ) ; err != nil {
return err
}
if err := kubeCluster . RemoveEtcdSnapshot ( ctx , snapshotName ) ; err != nil {
return err
}
log . Infof ( ctx , "Finished removing snapshot [%s] from all etcd hosts" , snapshotName )
return nil
}