2017-10-29 09:45:21 +00:00
package services
import (
2019-08-19 17:53:15 +00:00
"context"
2019-06-11 22:31:01 +00:00
"encoding/base64"
2017-11-14 18:11:21 +00:00
"fmt"
2018-05-09 17:39:19 +00:00
"path"
2018-02-21 01:53:32 +00:00
"strings"
2018-01-11 01:00:14 +00:00
"time"
2017-11-14 18:11:21 +00:00
2018-01-11 01:00:14 +00:00
etcdclient "github.com/coreos/etcd/client"
2018-05-09 17:39:19 +00:00
"github.com/docker/docker/api/types/container"
2018-02-21 01:53:32 +00:00
"github.com/pkg/errors"
2017-10-31 13:55:35 +00:00
"github.com/rancher/rke/docker"
2017-10-29 09:45:21 +00:00
"github.com/rancher/rke/hosts"
2018-01-09 22:10:56 +00:00
"github.com/rancher/rke/log"
2018-05-09 17:39:19 +00:00
"github.com/rancher/rke/pki"
2019-08-19 17:53:15 +00:00
"github.com/rancher/rke/pki/cert"
2020-07-11 16:24:19 +00:00
v3 "github.com/rancher/rke/types"
2018-10-23 23:38:00 +00:00
"github.com/rancher/rke/util"
2018-01-11 01:00:14 +00:00
"github.com/sirupsen/logrus"
2018-10-23 23:38:00 +00:00
"golang.org/x/sync/errgroup"
2017-10-29 09:45:21 +00:00
)
2018-05-15 21:06:05 +00:00
// Filesystem locations used by the etcd snapshot and restore containers.
const (
	// EtcdSnapshotPath is the snapshot directory; it is bind-mounted as
	// /backup into the backup containers and prefixes the snapshot file
	// name during a restore.
	EtcdSnapshotPath = "/opt/rke/etcd-snapshots/"
	// EtcdRestorePath is the scratch data directory a snapshot is restored
	// into before its contents are moved to EtcdDataDir.
	EtcdRestorePath = "/opt/rke/etcd-snapshots-restore/"
	// EtcdDataDir is the etcd data directory as seen inside the restore
	// container.
	EtcdDataDir = "/var/lib/rancher/etcd/"
)

// Wait times, expressed in seconds (they are multiplied by time.Second at
// the call sites).
const (
	// EtcdInitWaitTime is how long ReloadEtcdCluster sleeps after starting
	// etcd on a new host before probing cluster health.
	EtcdInitWaitTime = 10
	// EtcdSnapshotWaitTime is how long RunEtcdSnapshotSave waits after
	// starting the rolling snapshot container before judging its state.
	EtcdSnapshotWaitTime = 5
)

// Container names.
const (
	// EtcdPermFixContainerName is presumably the name of the helper
	// container that fixes etcd data dir permissions — its use is not
	// visible in this part of the file; confirm against setEtcdPermissions.
	EtcdPermFixContainerName = "etcd-fix-perm"
)
2018-05-09 17:39:19 +00:00
func RunEtcdPlane (
ctx context . Context ,
etcdHosts [ ] * hosts . Host ,
etcdNodePlanMap map [ string ] v3 . RKEConfigNodePlan ,
localConnDialerFactory hosts . DialerFactory ,
prsMap map [ string ] v3 . PrivateRegistry ,
updateWorkersOnly bool ,
alpineImage string ,
2019-01-25 19:26:29 +00:00
es v3 . ETCDService ,
certMap map [ string ] pki . CertificatePKI ) error {
2018-03-31 10:53:59 +00:00
log . Infof ( ctx , "[%s] Building up etcd plane.." , ETCDRole )
2017-10-29 09:45:21 +00:00
for _ , host := range etcdHosts {
2018-03-20 12:56:49 +00:00
if updateWorkersOnly {
continue
}
2019-07-24 19:58:50 +00:00
2018-04-11 22:54:47 +00:00
etcdProcess := etcdNodePlanMap [ host . Address ] . Processes [ EtcdContainerName ]
2019-07-24 19:58:50 +00:00
// need to run this first to set proper ownership and permissions on etcd data dir
if err := setEtcdPermissions ( ctx , host , prsMap , alpineImage , etcdProcess ) ; err != nil {
return err
}
2019-08-14 10:53:32 +00:00
imageCfg , hostCfg , _ := GetProcessConfig ( etcdProcess , host )
2018-03-21 17:20:58 +00:00
if err := docker . DoRunContainer ( ctx , host . DClient , imageCfg , hostCfg , EtcdContainerName , host . Address , ETCDRole , prsMap ) ; err != nil {
return err
}
2018-12-13 08:46:47 +00:00
if * es . Snapshot == true {
2019-07-18 20:26:56 +00:00
rkeToolsImage , err := util . GetDefaultRKETools ( alpineImage )
if err != nil {
2018-05-09 17:39:19 +00:00
return err
}
2019-07-18 20:26:56 +00:00
if err := RunEtcdSnapshotSave ( ctx , host , prsMap , rkeToolsImage , EtcdSnapshotContainerName , false , es ) ; err != nil {
return err
}
if err := pki . SaveBackupBundleOnHost ( ctx , host , rkeToolsImage , EtcdSnapshotPath , prsMap ) ; err != nil {
2018-05-29 15:21:24 +00:00
return err
}
2020-07-31 12:23:25 +00:00
if err := createLogLink ( ctx , host , EtcdSnapshotContainerName , ETCDRole , alpineImage , prsMap ) ; err != nil {
return err
}
2018-11-09 23:34:25 +00:00
} else {
if err := docker . DoRemoveContainer ( ctx , host . DClient , EtcdSnapshotContainerName , host . Address ) ; err != nil {
return err
}
2018-05-09 17:39:19 +00:00
}
2018-03-21 17:20:58 +00:00
if err := createLogLink ( ctx , host , EtcdContainerName , ETCDRole , alpineImage , prsMap ) ; err != nil {
2017-10-29 09:45:21 +00:00
return err
}
}
2019-01-25 19:26:29 +00:00
log . Infof ( ctx , "[%s] Successfully started etcd plane.. Checking etcd cluster health" , ETCDRole )
clientCert := cert . EncodeCertPEM ( certMap [ pki . KubeNodeCertName ] . Certificate )
2019-10-30 22:11:26 +00:00
clientKey := cert . EncodePrivateKeyPEM ( certMap [ pki . KubeNodeCertName ] . Key )
var healthError error
var hosts [ ] string
2019-01-25 19:26:29 +00:00
for _ , host := range etcdHosts {
2019-08-14 10:53:32 +00:00
_ , _ , healthCheckURL := GetProcessConfig ( etcdNodePlanMap [ host . Address ] . Processes [ EtcdContainerName ] , host )
2019-10-30 22:11:26 +00:00
healthError = isEtcdHealthy ( localConnDialerFactory , host , clientCert , clientKey , healthCheckURL )
if healthError == nil {
2019-01-25 19:26:29 +00:00
break
}
2019-10-30 22:11:26 +00:00
logrus . Warn ( healthError )
hosts = append ( hosts , host . Address )
2019-01-25 19:26:29 +00:00
}
2019-10-30 22:11:26 +00:00
if healthError != nil {
return fmt . Errorf ( "etcd cluster is unhealthy: hosts [%s] failed to report healthy." +
" Check etcd container logs on each host for more information" , strings . Join ( hosts , "," ) )
2019-01-25 19:26:29 +00:00
}
2017-10-29 09:45:21 +00:00
return nil
}
2018-08-20 04:37:04 +00:00
// RestartEtcdPlane restarts the etcd container on every host in etcdHosts.
//
// The hosts are fed through a shared queue that is drained by WorkerThreads
// goroutines. Each worker keeps going after a failed restart and reports all
// of its failures together; the first non-nil worker result is returned.
func RestartEtcdPlane(ctx context.Context, etcdHosts []*hosts.Host) error {
	log.Infof(ctx, "[%s] Restarting up etcd plane..", ETCDRole)

	queue := util.GetObjectQueue(etcdHosts)
	restartWorker := func() error {
		var failures []error
		for item := range queue {
			etcdHost := item.(*hosts.Host)
			err := docker.DoRestartContainer(ctx, etcdHost.DClient, EtcdContainerName, etcdHost.Address)
			if err != nil {
				failures = append(failures, err)
			}
		}
		return util.ErrList(failures)
	}

	var workers errgroup.Group
	for i := 0; i < WorkerThreads; i++ {
		workers.Go(restartWorker)
	}
	err := workers.Wait()
	if err != nil {
		return err
	}

	log.Infof(ctx, "[%s] Successfully restarted etcd plane..", ETCDRole)
	return nil
}
2018-01-19 01:48:51 +00:00
func RemoveEtcdPlane ( ctx context . Context , etcdHosts [ ] * hosts . Host , force bool ) error {
2018-03-31 10:53:59 +00:00
log . Infof ( ctx , "[%s] Tearing down etcd plane.." , ETCDRole )
2018-01-19 01:48:51 +00:00
2018-10-23 23:38:00 +00:00
var errgrp errgroup . Group
hostsQueue := util . GetObjectQueue ( etcdHosts )
for w := 0 ; w < WorkerThreads ; w ++ {
errgrp . Go ( func ( ) error {
var errList [ ] error
for host := range hostsQueue {
runHost := host . ( * hosts . Host )
if err := docker . DoRemoveContainer ( ctx , runHost . DClient , EtcdContainerName , runHost . Address ) ; err != nil {
errList = append ( errList , err )
}
2019-07-08 22:16:25 +00:00
if err := docker . DoRemoveContainer ( ctx , runHost . DClient , EtcdSnapshotContainerName , runHost . Address ) ; err != nil {
errList = append ( errList , err )
}
2018-10-23 23:38:00 +00:00
if ! runHost . IsWorker || ! runHost . IsControl || force {
// remove unschedulable kubelet on etcd host
if err := removeKubelet ( ctx , runHost ) ; err != nil {
errList = append ( errList , err )
}
if err := removeKubeproxy ( ctx , runHost ) ; err != nil {
errList = append ( errList , err )
}
if err := removeNginxProxy ( ctx , runHost ) ; err != nil {
errList = append ( errList , err )
}
if err := removeSidekick ( ctx , runHost ) ; err != nil {
errList = append ( errList , err )
}
}
}
return util . ErrList ( errList )
} )
}
if err := errgrp . Wait ( ) ; err != nil {
return err
2017-11-20 18:08:50 +00:00
}
2018-03-31 10:53:59 +00:00
log . Infof ( ctx , "[%s] Successfully tore down etcd plane.." , ETCDRole )
2017-11-20 18:08:50 +00:00
return nil
}
2018-02-21 01:53:32 +00:00
func AddEtcdMember ( ctx context . Context , toAddEtcdHost * hosts . Host , etcdHosts [ ] * hosts . Host , localConnDialerFactory hosts . DialerFactory , cert , key [ ] byte ) error {
log . Infof ( ctx , "[add/%s] Adding member [etcd-%s] to etcd cluster" , ETCDRole , toAddEtcdHost . HostnameOverride )
peerURL := fmt . Sprintf ( "https://%s:2380" , toAddEtcdHost . InternalAddress )
2018-01-11 01:00:14 +00:00
added := false
for _ , host := range etcdHosts {
2018-02-21 01:53:32 +00:00
if host . Address == toAddEtcdHost . Address {
continue
}
2018-01-16 23:10:14 +00:00
etcdClient , err := getEtcdClient ( ctx , host , localConnDialerFactory , cert , key )
2018-01-11 01:00:14 +00:00
if err != nil {
logrus . Debugf ( "Failed to create etcd client for host [%s]: %v" , host . Address , err )
continue
}
memAPI := etcdclient . NewMembersAPI ( etcdClient )
if _ , err := memAPI . Add ( ctx , peerURL ) ; err != nil {
2018-02-21 01:53:32 +00:00
logrus . Debugf ( "Failed to Add etcd member [%s] from host: %v" , host . Address , err )
2018-01-11 01:00:14 +00:00
continue
}
added = true
break
}
if ! added {
2018-02-21 01:53:32 +00:00
return fmt . Errorf ( "Failed to add etcd member [etcd-%s] to etcd cluster" , toAddEtcdHost . HostnameOverride )
2018-01-11 01:00:14 +00:00
}
2018-02-21 01:53:32 +00:00
log . Infof ( ctx , "[add/%s] Successfully Added member [etcd-%s] to etcd cluster" , ETCDRole , toAddEtcdHost . HostnameOverride )
2018-01-11 01:00:14 +00:00
return nil
}
2020-09-29 11:53:45 +00:00
func RemoveEtcdMember ( ctx context . Context , toDeleteEtcdHost * hosts . Host , etcdHosts [ ] * hosts . Host , localConnDialerFactory hosts . DialerFactory , cert , key [ ] byte , etcdNodePlanMap map [ string ] v3 . RKEConfigNodePlan ) error {
log . Infof ( ctx , "[remove/%s] Removing member [etcd-%s] from etcd cluster" , ETCDRole , toDeleteEtcdHost . HostnameOverride )
2018-01-11 01:00:14 +00:00
var mID string
removed := false
for _ , host := range etcdHosts {
2018-01-16 23:10:14 +00:00
etcdClient , err := getEtcdClient ( ctx , host , localConnDialerFactory , cert , key )
2018-01-11 01:00:14 +00:00
if err != nil {
logrus . Debugf ( "Failed to create etcd client for host [%s]: %v" , host . Address , err )
continue
}
memAPI := etcdclient . NewMembersAPI ( etcdClient )
members , err := memAPI . List ( ctx )
if err != nil {
logrus . Debugf ( "Failed to list etcd members from host [%s]: %v" , host . Address , err )
continue
}
for _ , member := range members {
2020-09-29 11:53:45 +00:00
if member . Name == fmt . Sprintf ( "etcd-%s" , toDeleteEtcdHost . HostnameOverride ) {
2018-01-11 01:00:14 +00:00
mID = member . ID
break
}
2017-10-29 09:45:21 +00:00
}
2018-01-11 01:00:14 +00:00
if err := memAPI . Remove ( ctx , mID ) ; err != nil {
logrus . Debugf ( "Failed to list etcd members from host [%s]: %v" , host . Address , err )
continue
}
2020-09-29 11:53:45 +00:00
etcdMemberDeletedTime := time . Now ( )
// Need to health check after successful member remove (especially for leader re-election)
// We will check all hosts to see if the cluster becomes healthy
var healthError error
_ , _ , healthCheckURL := GetProcessConfig ( etcdNodePlanMap [ host . Address ] . Processes [ EtcdContainerName ] , host )
logrus . Infof ( "[remove/%s] Checking etcd cluster health on [etcd-%s] after removing [etcd-%s]" , ETCDRole , host . HostnameOverride , toDeleteEtcdHost . HostnameOverride )
logrus . Debugf ( "[remove/%s] healthCheckURL for checking etcd cluster health on [etcd-%s] after removing [%s]: [%s]" , ETCDRole , host . HostnameOverride , toDeleteEtcdHost . HostnameOverride , healthCheckURL )
healthError = isEtcdHealthy ( localConnDialerFactory , host , cert , key , healthCheckURL )
if healthError == nil {
logrus . Infof ( "[remove/%s] etcd cluster health is healthy on [etcd-%s] after removing [etcd-%s]" , ETCDRole , host . HostnameOverride , toDeleteEtcdHost . HostnameOverride )
etcdHealthyTime := time . Now ( )
diffTime := etcdHealthyTime . Sub ( etcdMemberDeletedTime )
logrus . Debugf ( "Total time between etcd member deleted and etcd cluster healthy is: [%s]" , diffTime )
removed = true
break
}
logrus . Warn ( healthError )
2018-01-11 01:00:14 +00:00
}
if ! removed {
2020-09-29 11:53:45 +00:00
return fmt . Errorf ( "Failed to delete etcd member [etcd-%s] from etcd cluster" , toDeleteEtcdHost . HostnameOverride )
2017-10-29 09:45:21 +00:00
}
2020-09-29 11:53:45 +00:00
log . Infof ( ctx , "[remove/%s] Successfully removed member [etcd-%s] from etcd cluster" , ETCDRole , toDeleteEtcdHost . HostnameOverride )
2018-01-11 01:00:14 +00:00
return nil
2017-10-29 09:45:21 +00:00
}
2017-11-15 01:12:33 +00:00
2018-09-24 23:24:45 +00:00
func ReloadEtcdCluster ( ctx context . Context , readyEtcdHosts [ ] * hosts . Host , newHost * hosts . Host , localConnDialerFactory hosts . DialerFactory , cert , key [ ] byte , prsMap map [ string ] v3 . PrivateRegistry , etcdNodePlanMap map [ string ] v3 . RKEConfigNodePlan , alpineImage string ) error {
2019-08-14 10:53:32 +00:00
imageCfg , hostCfg , _ := GetProcessConfig ( etcdNodePlanMap [ newHost . Address ] . Processes [ EtcdContainerName ] , newHost )
2019-07-24 19:58:50 +00:00
if err := setEtcdPermissions ( ctx , newHost , prsMap , alpineImage , etcdNodePlanMap [ newHost . Address ] . Processes [ EtcdContainerName ] ) ; err != nil {
return err
}
2018-09-24 23:24:45 +00:00
if err := docker . DoRunContainer ( ctx , newHost . DClient , imageCfg , hostCfg , EtcdContainerName , newHost . Address , ETCDRole , prsMap ) ; err != nil {
return err
}
if err := createLogLink ( ctx , newHost , EtcdContainerName , ETCDRole , alpineImage , prsMap ) ; err != nil {
return err
}
2018-10-08 18:46:15 +00:00
time . Sleep ( EtcdInitWaitTime * time . Second )
2019-10-30 22:11:26 +00:00
var healthError error
var hosts [ ] string
2018-01-11 01:00:14 +00:00
for _ , host := range readyEtcdHosts {
2019-08-14 10:53:32 +00:00
_ , _ , healthCheckURL := GetProcessConfig ( etcdNodePlanMap [ host . Address ] . Processes [ EtcdContainerName ] , host )
2019-10-30 22:11:26 +00:00
healthError = isEtcdHealthy ( localConnDialerFactory , host , cert , key , healthCheckURL )
if healthError == nil {
2018-01-11 01:00:14 +00:00
break
}
2019-10-30 22:11:26 +00:00
logrus . Warn ( healthError )
hosts = append ( hosts , host . Address )
2018-01-11 01:00:14 +00:00
}
2019-10-30 22:11:26 +00:00
if healthError != nil {
return fmt . Errorf ( "etcd cluster is unhealthy: hosts [%s] failed to report healthy." +
" Check etcd container logs on each host for more information" , strings . Join ( hosts , "," ) )
2018-01-16 23:10:14 +00:00
}
2018-01-11 01:00:14 +00:00
return nil
2017-11-15 01:12:33 +00:00
}
2018-02-21 01:53:32 +00:00
func IsEtcdMember ( ctx context . Context , etcdHost * hosts . Host , etcdHosts [ ] * hosts . Host , localConnDialerFactory hosts . DialerFactory , cert , key [ ] byte ) ( bool , error ) {
var listErr error
peerURL := fmt . Sprintf ( "https://%s:2380" , etcdHost . InternalAddress )
for _ , host := range etcdHosts {
if host . Address == etcdHost . Address {
continue
}
etcdClient , err := getEtcdClient ( ctx , host , localConnDialerFactory , cert , key )
if err != nil {
listErr = errors . Wrapf ( err , "Failed to create etcd client for host [%s]" , host . Address )
logrus . Debugf ( "Failed to create etcd client for host [%s]: %v" , host . Address , err )
continue
}
memAPI := etcdclient . NewMembersAPI ( etcdClient )
members , err := memAPI . List ( ctx )
if err != nil {
listErr = errors . Wrapf ( err , "Failed to create etcd client for host [%s]" , host . Address )
logrus . Debugf ( "Failed to list etcd cluster members [%s]: %v" , etcdHost . Address , err )
continue
}
for _ , member := range members {
if strings . Contains ( member . PeerURLs [ 0 ] , peerURL ) {
logrus . Infof ( "[etcd] member [%s] is already part of the etcd cluster" , etcdHost . Address )
return true , nil
}
}
2018-03-06 22:32:50 +00:00
// reset the list of errors to handle new hosts
listErr = nil
break
2018-02-21 01:53:32 +00:00
}
if listErr != nil {
return false , listErr
}
return false , nil
}
2018-05-09 17:39:19 +00:00
2018-12-13 08:46:47 +00:00
func RunEtcdSnapshotSave ( ctx context . Context , etcdHost * hosts . Host , prsMap map [ string ] v3 . PrivateRegistry , etcdSnapshotImage string , name string , once bool , es v3 . ETCDService ) error {
2019-02-15 20:40:36 +00:00
backupCmd := "etcd-backup"
2019-03-15 17:21:34 +00:00
restartPolicy := "always"
2018-05-09 17:39:19 +00:00
imageCfg := & container . Config {
Cmd : [ ] string {
2018-08-02 00:33:46 +00:00
"/opt/rke-tools/rke-etcd-backup" ,
2019-02-15 20:40:36 +00:00
backupCmd ,
2018-12-13 08:46:47 +00:00
"save" ,
2018-05-09 17:39:19 +00:00
"--cacert" , pki . GetCertPath ( pki . CACertName ) ,
"--cert" , pki . GetCertPath ( pki . KubeNodeCertName ) ,
"--key" , pki . GetKeyPath ( pki . KubeNodeCertName ) ,
"--name" , name ,
2018-05-16 01:55:41 +00:00
"--endpoints=" + etcdHost . InternalAddress + ":2379" ,
2018-05-09 17:39:19 +00:00
} ,
2018-05-17 22:27:35 +00:00
Image : etcdSnapshotImage ,
2019-07-16 00:54:59 +00:00
Env : es . ExtraEnv ,
2018-05-09 17:39:19 +00:00
}
2019-08-21 21:08:30 +00:00
// Configure imageCfg for one time snapshot
2018-05-09 17:39:19 +00:00
if once {
imageCfg . Cmd = append ( imageCfg . Cmd , "--once" )
2019-03-15 17:21:34 +00:00
restartPolicy = "no"
2019-08-21 21:08:30 +00:00
// Configure imageCfg for rolling snapshots
2018-12-13 08:46:47 +00:00
} else if es . BackupConfig == nil {
imageCfg . Cmd = append ( imageCfg . Cmd , "--retention=" + es . Retention )
imageCfg . Cmd = append ( imageCfg . Cmd , "--creation=" + es . Creation )
2018-05-09 17:39:19 +00:00
}
2019-08-21 21:08:30 +00:00
// Configure imageCfg for S3 backups
2019-03-11 23:04:08 +00:00
if es . BackupConfig != nil {
2018-12-13 08:46:47 +00:00
imageCfg = configS3BackupImgCmd ( ctx , imageCfg , es . BackupConfig )
2018-05-09 17:39:19 +00:00
}
hostCfg := & container . HostConfig {
Binds : [ ] string {
2021-07-29 06:59:54 +00:00
fmt . Sprintf ( "%s:/backup:z" , EtcdSnapshotPath ) ,
fmt . Sprintf ( "%s:/etc/kubernetes:z" , path . Join ( etcdHost . PrefixPath , "/etc/kubernetes" ) ) } ,
2018-11-08 20:25:13 +00:00
NetworkMode : container . NetworkMode ( "host" ) ,
2019-03-15 17:21:34 +00:00
RestartPolicy : container . RestartPolicy { Name : restartPolicy } ,
2018-05-09 17:39:19 +00:00
}
if once {
2019-08-21 21:08:30 +00:00
log . Infof ( ctx , "[etcd] Running snapshot save once on host [%s]" , etcdHost . Address )
logrus . Debugf ( "[etcd] Using command [%s] for snapshot save once container [%s] on host [%s]" , getSanitizedSnapshotCmd ( imageCfg , es . BackupConfig ) , EtcdSnapshotOnceContainerName , etcdHost . Address )
2019-03-15 19:00:32 +00:00
if err := docker . DoRemoveContainer ( ctx , etcdHost . DClient , EtcdSnapshotOnceContainerName , etcdHost . Address ) ; err != nil {
return err
}
2018-05-17 22:27:35 +00:00
if err := docker . DoRunContainer ( ctx , etcdHost . DClient , imageCfg , hostCfg , EtcdSnapshotOnceContainerName , etcdHost . Address , ETCDRole , prsMap ) ; err != nil {
2018-05-09 17:39:19 +00:00
return err
}
2019-01-24 22:56:51 +00:00
status , _ , stderr , err := docker . GetContainerOutput ( ctx , etcdHost . DClient , EtcdSnapshotOnceContainerName , etcdHost . Address )
2018-05-09 17:39:19 +00:00
if status != 0 || err != nil {
2019-01-24 22:56:51 +00:00
if removeErr := docker . RemoveContainer ( ctx , etcdHost . DClient , etcdHost . Address , EtcdSnapshotOnceContainerName ) ; removeErr != nil {
2019-08-21 21:08:30 +00:00
log . Warnf ( ctx , "[etcd] Failed to remove container [%s] on host [%s]: %v" , removeErr , etcdHost . Address )
2019-01-24 22:56:51 +00:00
}
2018-12-13 08:46:47 +00:00
if err != nil {
2019-01-24 22:56:51 +00:00
return err
2018-12-13 08:46:47 +00:00
}
2019-08-21 21:08:30 +00:00
return fmt . Errorf ( "[etcd] Failed to take one-time snapshot on host [%s], exit code [%d]: %v" , etcdHost . Address , status , stderr )
2018-05-09 17:39:19 +00:00
}
2019-01-24 22:56:51 +00:00
2018-05-17 22:27:35 +00:00
return docker . RemoveContainer ( ctx , etcdHost . DClient , etcdHost . Address , EtcdSnapshotOnceContainerName )
2018-05-09 17:39:19 +00:00
}
2019-08-22 20:26:04 +00:00
log . Infof ( ctx , "[etcd] Running rolling snapshot container [%s] on host [%s]" , EtcdSnapshotOnceContainerName , etcdHost . Address )
logrus . Debugf ( "[etcd] Using command [%s] for rolling snapshot container [%s] on host [%s]" , getSanitizedSnapshotCmd ( imageCfg , es . BackupConfig ) , EtcdSnapshotContainerName , etcdHost . Address )
2019-03-15 19:00:32 +00:00
if err := docker . DoRemoveContainer ( ctx , etcdHost . DClient , EtcdSnapshotContainerName , etcdHost . Address ) ; err != nil {
return err
}
2018-11-08 20:25:13 +00:00
if err := docker . DoRunContainer ( ctx , etcdHost . DClient , imageCfg , hostCfg , EtcdSnapshotContainerName , etcdHost . Address , ETCDRole , prsMap ) ; err != nil {
return err
}
// check if the container exited with error
snapshotCont , err := docker . InspectContainer ( ctx , etcdHost . DClient , etcdHost . Address , EtcdSnapshotContainerName )
if err != nil {
return err
}
2018-11-12 23:49:58 +00:00
time . Sleep ( EtcdSnapshotWaitTime * time . Second )
2018-11-08 20:25:13 +00:00
if snapshotCont . State . Status == "exited" || snapshotCont . State . Restarting {
2019-08-21 21:08:30 +00:00
log . Warnf ( ctx , "[etcd] etcd rolling snapshot container failed to start correctly" )
2018-11-08 20:25:13 +00:00
return docker . RemoveContainer ( ctx , etcdHost . DClient , etcdHost . Address , EtcdSnapshotContainerName )
}
return nil
2018-05-09 17:39:19 +00:00
}
2020-03-30 19:16:47 +00:00
func RunGetStateFileFromSnapshot ( ctx context . Context , etcdHost * hosts . Host , prsMap map [ string ] v3 . PrivateRegistry , etcdSnapshotImage string , name string , es v3 . ETCDService ) ( string , error ) {
backupCmd := "etcd-backup"
imageCfg := & container . Config {
Cmd : [ ] string {
"/opt/rke-tools/rke-etcd-backup" ,
backupCmd ,
"extractstatefile" ,
"--name" , name ,
} ,
Image : etcdSnapshotImage ,
Env : es . ExtraEnv ,
}
// Configure imageCfg for S3 backups
if es . BackupConfig != nil {
imageCfg = configS3BackupImgCmd ( ctx , imageCfg , es . BackupConfig )
}
hostCfg := & container . HostConfig {
Binds : [ ] string {
2021-07-29 06:59:54 +00:00
fmt . Sprintf ( "%s:/backup:z" , EtcdSnapshotPath ) ,
2020-03-30 19:16:47 +00:00
} ,
NetworkMode : container . NetworkMode ( "host" ) ,
RestartPolicy : container . RestartPolicy { Name : "no" } ,
}
if err := docker . DoRemoveContainer ( ctx , etcdHost . DClient , EtcdStateFileContainerName , etcdHost . Address ) ; err != nil {
return "" , err
}
if err := docker . DoRunOnetimeContainer ( ctx , etcdHost . DClient , imageCfg , hostCfg , EtcdStateFileContainerName , etcdHost . Address , ETCDRole , prsMap ) ; err != nil {
return "" , err
}
statefile , err := docker . ReadFileFromContainer ( ctx , etcdHost . DClient , etcdHost . Address , EtcdStateFileContainerName , "/tmp/cluster.rkestate" )
if err != nil {
return "" , err
}
if err := docker . DoRemoveContainer ( ctx , etcdHost . DClient , EtcdStateFileContainerName , etcdHost . Address ) ; err != nil {
return "" , err
}
return statefile , nil
}
2019-01-28 17:17:44 +00:00
func DownloadEtcdSnapshotFromS3 ( ctx context . Context , etcdHost * hosts . Host , prsMap map [ string ] v3 . PrivateRegistry , etcdSnapshotImage string , name string , es v3 . ETCDService ) error {
2018-12-13 08:46:47 +00:00
s3Backend := es . BackupConfig . S3BackupConfig
if len ( s3Backend . Endpoint ) == 0 || len ( s3Backend . BucketName ) == 0 {
return fmt . Errorf ( "failed to get snapshot [%s] from s3 on host [%s], invalid s3 configurations" , name , etcdHost . Address )
}
imageCfg := & container . Config {
Cmd : [ ] string {
"/opt/rke-tools/rke-etcd-backup" ,
"etcd-backup" ,
"download" ,
"--name" , name ,
2019-01-28 17:17:44 +00:00
"--s3-backup=true" ,
2019-01-22 21:48:53 +00:00
"--s3-endpoint=" + s3Backend . Endpoint ,
"--s3-bucketName=" + s3Backend . BucketName ,
"--s3-region=" + s3Backend . Region ,
2018-12-13 08:46:47 +00:00
} ,
Image : etcdSnapshotImage ,
2019-07-16 00:54:59 +00:00
Env : es . ExtraEnv ,
2018-12-13 08:46:47 +00:00
}
2021-03-24 20:23:34 +00:00
// Base64 encoding S3 accessKey and secretKey before add them as env variables
if len ( s3Backend . AccessKey ) > 0 || len ( s3Backend . SecretKey ) > 0 {
env := [ ] string {
"S3_ACCESS_KEY=" + base64 . StdEncoding . EncodeToString ( [ ] byte ( s3Backend . AccessKey ) ) ,
"S3_SECRET_KEY=" + base64 . StdEncoding . EncodeToString ( [ ] byte ( s3Backend . SecretKey ) ) ,
}
imageCfg . Env = append ( imageCfg . Env , env ... )
}
2020-01-20 17:43:35 +00:00
s3Logline := fmt . Sprintf ( "[etcd] Snapshot [%s] will be downloaded on host [%s] from S3 compatible backend at [%s] from bucket [%s] using accesskey [%s]" , name , etcdHost . Address , s3Backend . Endpoint , s3Backend . BucketName , s3Backend . AccessKey )
if s3Backend . Region != "" {
s3Logline += fmt . Sprintf ( " and using region [%s]" , s3Backend . Region )
}
2019-06-25 20:12:17 +00:00
if s3Backend . CustomCA != "" {
caStr := base64 . StdEncoding . EncodeToString ( [ ] byte ( s3Backend . CustomCA ) )
2019-06-11 22:31:01 +00:00
imageCfg . Cmd = append ( imageCfg . Cmd , "--s3-endpoint-ca=" + caStr )
2020-01-20 17:43:35 +00:00
s3Logline += fmt . Sprintf ( " and using endpoint CA [%s]" , caStr )
2019-06-11 22:31:01 +00:00
}
2019-07-18 16:43:07 +00:00
if s3Backend . Folder != "" {
imageCfg . Cmd = append ( imageCfg . Cmd , "--s3-folder=" + s3Backend . Folder )
2020-01-20 17:43:35 +00:00
s3Logline += fmt . Sprintf ( " and using folder [%s]" , s3Backend . Folder )
2019-07-18 16:43:07 +00:00
}
2020-01-20 17:43:35 +00:00
log . Infof ( ctx , s3Logline )
2018-12-13 08:46:47 +00:00
hostCfg := & container . HostConfig {
Binds : [ ] string {
2021-07-29 06:59:54 +00:00
fmt . Sprintf ( "%s:/backup:z" , EtcdSnapshotPath ) ,
fmt . Sprintf ( "%s:/etc/kubernetes:z" , path . Join ( etcdHost . PrefixPath , "/etc/kubernetes" ) ) } ,
2018-12-13 08:46:47 +00:00
NetworkMode : container . NetworkMode ( "host" ) ,
2019-06-05 18:24:51 +00:00
RestartPolicy : container . RestartPolicy { Name : "no" } ,
2018-12-13 08:46:47 +00:00
}
2019-03-15 19:00:32 +00:00
if err := docker . DoRemoveContainer ( ctx , etcdHost . DClient , EtcdDownloadBackupContainerName , etcdHost . Address ) ; err != nil {
return err
}
2018-12-13 08:46:47 +00:00
if err := docker . DoRunContainer ( ctx , etcdHost . DClient , imageCfg , hostCfg , EtcdDownloadBackupContainerName , etcdHost . Address , ETCDRole , prsMap ) ; err != nil {
return err
}
2019-01-24 22:56:51 +00:00
status , _ , stderr , err := docker . GetContainerOutput ( ctx , etcdHost . DClient , EtcdDownloadBackupContainerName , etcdHost . Address )
2018-12-13 08:46:47 +00:00
if status != 0 || err != nil {
2019-01-24 22:56:51 +00:00
if removeErr := docker . RemoveContainer ( ctx , etcdHost . DClient , etcdHost . Address , EtcdDownloadBackupContainerName ) ; removeErr != nil {
log . Warnf ( ctx , "Failed to remove container [%s]: %v" , removeErr )
}
if err != nil {
return err
2018-12-13 08:46:47 +00:00
}
2019-01-24 22:56:51 +00:00
return fmt . Errorf ( "Failed to download etcd snapshot from s3, exit code [%d]: %v" , status , stderr )
2018-12-13 08:46:47 +00:00
}
return docker . RemoveContainer ( ctx , etcdHost . DClient , etcdHost . Address , EtcdDownloadBackupContainerName )
}
2019-12-03 23:45:26 +00:00
func RestoreEtcdSnapshot ( ctx context . Context , etcdHost * hosts . Host , prsMap map [ string ] v3 . PrivateRegistry ,
etcdRestoreImage , etcdBackupImage , snapshotName , initCluster string , es v3 . ETCDService ) error {
2018-05-17 22:27:35 +00:00
log . Infof ( ctx , "[etcd] Restoring [%s] snapshot on etcd host [%s]" , snapshotName , etcdHost . Address )
2019-07-24 20:25:14 +00:00
nodeName := pki . GetCrtNameForHost ( etcdHost , pki . EtcdCertName )
2018-09-05 19:07:10 +00:00
snapshotPath := fmt . Sprintf ( "%s%s" , EtcdSnapshotPath , snapshotName )
2018-05-09 17:39:19 +00:00
2018-07-18 20:44:55 +00:00
// make sure that restore path is empty otherwise etcd restore will fail
2018-05-09 17:39:19 +00:00
imageCfg := & container . Config {
Cmd : [ ] string {
2018-05-15 21:06:05 +00:00
"sh" , "-c" , strings . Join ( [ ] string {
2018-07-18 20:44:55 +00:00
"rm -rf" , EtcdRestorePath ,
"&& /usr/local/bin/etcdctl" ,
2018-05-16 01:55:41 +00:00
fmt . Sprintf ( "--endpoints=[%s:2379]" , etcdHost . InternalAddress ) ,
2018-05-15 21:06:05 +00:00
"--cacert" , pki . GetCertPath ( pki . CACertName ) ,
"--cert" , pki . GetCertPath ( nodeName ) ,
"--key" , pki . GetKeyPath ( nodeName ) ,
2018-05-17 22:27:35 +00:00
"snapshot" , "restore" , snapshotPath ,
2018-05-15 21:06:05 +00:00
"--data-dir=" + EtcdRestorePath ,
"--name=etcd-" + etcdHost . HostnameOverride ,
"--initial-cluster=" + initCluster ,
"--initial-cluster-token=etcd-cluster-1" ,
"--initial-advertise-peer-urls=https://" + etcdHost . InternalAddress + ":2380" ,
"&& mv" , EtcdRestorePath + "*" , EtcdDataDir ,
"&& rm -rf" , EtcdRestorePath ,
} , " " ) ,
2018-05-09 17:39:19 +00:00
} ,
2019-07-16 00:54:59 +00:00
Env : append ( [ ] string { "ETCDCTL_API=3" } , es . ExtraEnv ... ) ,
2018-05-09 17:39:19 +00:00
Image : etcdRestoreImage ,
}
hostCfg := & container . HostConfig {
Binds : [ ] string {
2021-07-29 06:59:54 +00:00
"/opt/rke/:/opt/rke/:z" ,
fmt . Sprintf ( "%s:/var/lib/rancher/etcd:z" , path . Join ( etcdHost . PrefixPath , "/var/lib/etcd" ) ) ,
fmt . Sprintf ( "%s:/etc/kubernetes:z" , path . Join ( etcdHost . PrefixPath , "/etc/kubernetes" ) ) } ,
2018-05-09 17:39:19 +00:00
NetworkMode : container . NetworkMode ( "host" ) ,
}
2019-03-15 19:00:32 +00:00
if err := docker . DoRemoveContainer ( ctx , etcdHost . DClient , EtcdRestoreContainerName , etcdHost . Address ) ; err != nil {
return err
}
2018-05-09 17:39:19 +00:00
if err := docker . DoRunContainer ( ctx , etcdHost . DClient , imageCfg , hostCfg , EtcdRestoreContainerName , etcdHost . Address , ETCDRole , prsMap ) ; err != nil {
return err
}
status , err := docker . WaitForContainer ( ctx , etcdHost . DClient , etcdHost . Address , EtcdRestoreContainerName )
if err != nil {
return err
}
if status != 0 {
2018-11-24 10:18:24 +00:00
containerLog , _ , err := docker . GetContainerLogsStdoutStderr ( ctx , etcdHost . DClient , EtcdRestoreContainerName , "5" , false )
2018-07-18 20:44:55 +00:00
if err != nil {
return err
}
if err := docker . RemoveContainer ( ctx , etcdHost . DClient , etcdHost . Address , EtcdRestoreContainerName ) ; err != nil {
return err
}
// printing the restore container's logs
return fmt . Errorf ( "Failed to run etcd restore container, exit status is: %d, container logs: %s" , status , containerLog )
2018-05-09 17:39:19 +00:00
}
2019-08-17 00:15:58 +00:00
if err := docker . RemoveContainer ( ctx , etcdHost . DClient , etcdHost . Address , EtcdRestoreContainerName ) ; err != nil {
return err
}
2019-12-03 23:45:26 +00:00
return RunEtcdSnapshotRemove ( ctx , etcdHost , prsMap , etcdBackupImage , snapshotName , true , es )
2018-05-09 17:39:19 +00:00
}
2018-11-24 10:18:24 +00:00
2019-08-17 00:15:58 +00:00
func RunEtcdSnapshotRemove ( ctx context . Context , etcdHost * hosts . Host , prsMap map [ string ] v3 . PrivateRegistry , etcdSnapshotImage string , name string , cleanupRestore bool , es v3 . ETCDService ) error {
2019-04-02 10:54:02 +00:00
log . Infof ( ctx , "[etcd] Removing snapshot [%s] from host [%s]" , name , etcdHost . Address )
imageCfg := & container . Config {
Image : etcdSnapshotImage ,
2019-07-16 00:54:59 +00:00
Env : es . ExtraEnv ,
2019-09-17 00:11:16 +00:00
Cmd : [ ] string {
"/opt/rke-tools/rke-etcd-backup" ,
"etcd-backup" ,
"delete" ,
"--name" , name ,
} ,
2019-04-02 10:54:02 +00:00
}
2019-08-17 00:15:58 +00:00
if cleanupRestore {
2019-09-17 00:11:16 +00:00
imageCfg . Cmd = append ( imageCfg . Cmd , "--cleanup" )
}
if es . BackupConfig != nil && es . BackupConfig . S3BackupConfig != nil {
s3cmd := [ ] string {
"--s3-backup" ,
"--s3-endpoint=" + es . BackupConfig . S3BackupConfig . Endpoint ,
"--s3-bucketName=" + es . BackupConfig . S3BackupConfig . BucketName ,
"--s3-region=" + es . BackupConfig . S3BackupConfig . Region ,
2019-08-17 00:15:58 +00:00
}
2021-03-24 20:23:34 +00:00
// Base64 encoding S3 accessKey and secretKey before add them as env variables
if len ( es . BackupConfig . S3BackupConfig . AccessKey ) > 0 || len ( es . BackupConfig . S3BackupConfig . SecretKey ) > 0 {
env := [ ] string {
"S3_ACCESS_KEY=" + base64 . StdEncoding . EncodeToString ( [ ] byte ( es . BackupConfig . S3BackupConfig . AccessKey ) ) ,
"S3_SECRET_KEY=" + base64 . StdEncoding . EncodeToString ( [ ] byte ( es . BackupConfig . S3BackupConfig . SecretKey ) ) ,
}
imageCfg . Env = append ( imageCfg . Env , env ... )
}
2019-09-17 00:11:16 +00:00
if es . BackupConfig . S3BackupConfig . CustomCA != "" {
caStr := base64 . StdEncoding . EncodeToString ( [ ] byte ( es . BackupConfig . S3BackupConfig . CustomCA ) )
s3cmd = append ( s3cmd , "--s3-endpoint-ca=" + caStr )
2019-08-17 00:15:58 +00:00
}
2019-09-17 00:11:16 +00:00
if es . BackupConfig . S3BackupConfig . Folder != "" {
s3cmd = append ( s3cmd , "--s3-folder=" + es . BackupConfig . S3BackupConfig . Folder )
}
imageCfg . Cmd = append ( imageCfg . Cmd , s3cmd ... )
2019-08-17 00:15:58 +00:00
}
2019-04-02 10:54:02 +00:00
hostCfg := & container . HostConfig {
Binds : [ ] string {
2021-07-29 06:59:54 +00:00
fmt . Sprintf ( "%s:/backup:z" , EtcdSnapshotPath ) ,
2019-04-02 10:54:02 +00:00
} ,
RestartPolicy : container . RestartPolicy { Name : "no" } ,
}
if err := docker . DoRemoveContainer ( ctx , etcdHost . DClient , EtcdSnapshotRemoveContainerName , etcdHost . Address ) ; err != nil {
return err
}
if err := docker . DoRunContainer ( ctx , etcdHost . DClient , imageCfg , hostCfg , EtcdSnapshotRemoveContainerName , etcdHost . Address , ETCDRole , prsMap ) ; err != nil {
return err
}
status , _ , stderr , err := docker . GetContainerOutput ( ctx , etcdHost . DClient , EtcdSnapshotRemoveContainerName , etcdHost . Address )
if status != 0 || err != nil {
if removeErr := docker . RemoveContainer ( ctx , etcdHost . DClient , etcdHost . Address , EtcdSnapshotRemoveContainerName ) ; removeErr != nil {
log . Warnf ( ctx , "Failed to remove container [%s]: %v" , removeErr )
}
if err != nil {
return err
}
return fmt . Errorf ( "Failed to remove snapshot [%s] on host [%s], exit code [%d]: %v" , EtcdSnapshotRemoveContainerName , etcdHost . Address , status , stderr )
}
return docker . RemoveContainer ( ctx , etcdHost . DClient , etcdHost . Address , EtcdSnapshotRemoveContainerName )
}
2018-11-24 10:18:24 +00:00
func GetEtcdSnapshotChecksum ( ctx context . Context , etcdHost * hosts . Host , prsMap map [ string ] v3 . PrivateRegistry , alpineImage , snapshotName string ) ( string , error ) {
var checksum string
var err error
2019-05-28 20:04:10 +00:00
var stderr string
2018-11-24 10:18:24 +00:00
2019-05-28 20:04:10 +00:00
// compressedSnapshotPath := fmt.Sprintf("%s%s.%s", EtcdSnapshotPath, snapshotName, EtcdSnapshotCompressedExtension)
2018-11-24 10:18:24 +00:00
snapshotPath := fmt . Sprintf ( "%s%s" , EtcdSnapshotPath , snapshotName )
imageCfg := & container . Config {
Cmd : [ ] string {
"sh" , "-c" , strings . Join ( [ ] string {
2019-07-12 20:04:03 +00:00
" if [ -f '" , snapshotPath , "' ]; then md5sum '" , snapshotPath , "' | cut -f1 -d' ' | tr -d '\n'; else echo 'snapshot file does not exist' >&2; fi" } , "" ) ,
2018-11-24 10:18:24 +00:00
} ,
Image : alpineImage ,
}
hostCfg := & container . HostConfig {
Binds : [ ] string {
2021-07-29 06:59:54 +00:00
"/opt/rke/:/opt/rke/:z" ,
2018-11-24 10:18:24 +00:00
} }
if err := docker . DoRunContainer ( ctx , etcdHost . DClient , imageCfg , hostCfg , EtcdChecksumContainerName , etcdHost . Address , ETCDRole , prsMap ) ; err != nil {
return checksum , err
}
if _ , err := docker . WaitForContainer ( ctx , etcdHost . DClient , etcdHost . Address , EtcdChecksumContainerName ) ; err != nil {
return checksum , err
}
2019-05-28 20:04:10 +00:00
stderr , checksum , err = docker . GetContainerLogsStdoutStderr ( ctx , etcdHost . DClient , EtcdChecksumContainerName , "1" , false )
2018-11-24 10:18:24 +00:00
if err != nil {
return checksum , err
}
2019-05-28 20:04:10 +00:00
if stderr != "" {
return checksum , fmt . Errorf ( "Error output not nil from snapshot checksum container [%s]: %s" , EtcdChecksumContainerName , stderr )
}
2018-11-24 10:18:24 +00:00
if err := docker . RemoveContainer ( ctx , etcdHost . DClient , etcdHost . Address , EtcdChecksumContainerName ) ; err != nil {
return checksum , err
}
return checksum , nil
}
2018-12-13 08:46:47 +00:00
func configS3BackupImgCmd ( ctx context . Context , imageCfg * container . Config , bc * v3 . BackupConfig ) * container . Config {
cmd := [ ] string {
"--creation=" + fmt . Sprintf ( "%dh" , bc . IntervalHours ) ,
"--retention=" + fmt . Sprintf ( "%dh" , bc . Retention * bc . IntervalHours ) ,
}
2019-01-14 20:20:10 +00:00
if bc . S3BackupConfig != nil {
cmd = append ( cmd , [ ] string {
"--s3-backup=true" ,
"--s3-endpoint=" + bc . S3BackupConfig . Endpoint ,
"--s3-bucketName=" + bc . S3BackupConfig . BucketName ,
"--s3-region=" + bc . S3BackupConfig . Region ,
} ... )
2021-03-24 20:23:34 +00:00
// Base64 encoding S3 accessKey and secretKey before add them as env variables
if len ( bc . S3BackupConfig . AccessKey ) > 0 || len ( bc . S3BackupConfig . SecretKey ) > 0 {
env := [ ] string {
"S3_ACCESS_KEY=" + base64 . StdEncoding . EncodeToString ( [ ] byte ( bc . S3BackupConfig . AccessKey ) ) ,
"S3_SECRET_KEY=" + base64 . StdEncoding . EncodeToString ( [ ] byte ( bc . S3BackupConfig . SecretKey ) ) ,
}
imageCfg . Env = append ( imageCfg . Env , env ... )
}
2020-01-20 17:43:35 +00:00
s3Logline := fmt . Sprintf ( "[etcd] Snapshots configured to S3 compatible backend at [%s] to bucket [%s] using accesskey [%s]" , bc . S3BackupConfig . Endpoint , bc . S3BackupConfig . BucketName , bc . S3BackupConfig . AccessKey )
if bc . S3BackupConfig . Region != "" {
s3Logline += fmt . Sprintf ( " and using region [%s]" , bc . S3BackupConfig . Region )
}
2019-06-25 20:12:17 +00:00
if bc . S3BackupConfig . CustomCA != "" {
caStr := base64 . StdEncoding . EncodeToString ( [ ] byte ( bc . S3BackupConfig . CustomCA ) )
2019-06-11 22:31:01 +00:00
cmd = append ( cmd , "--s3-endpoint-ca=" + caStr )
2019-08-21 21:08:30 +00:00
s3Logline += fmt . Sprintf ( " and using endpoint CA [%s]" , caStr )
2019-06-11 22:31:01 +00:00
}
2019-07-18 16:43:07 +00:00
if bc . S3BackupConfig . Folder != "" {
cmd = append ( cmd , "--s3-folder=" + bc . S3BackupConfig . Folder )
2019-08-21 21:08:30 +00:00
s3Logline += fmt . Sprintf ( " and using folder [%s]" , bc . S3BackupConfig . Folder )
2019-07-18 16:43:07 +00:00
}
2019-08-21 21:08:30 +00:00
log . Infof ( ctx , s3Logline )
2019-01-14 20:20:10 +00:00
}
2018-12-13 08:46:47 +00:00
imageCfg . Cmd = append ( imageCfg . Cmd , cmd ... )
return imageCfg
}
2019-01-28 17:17:44 +00:00
func StartBackupServer ( ctx context . Context , etcdHost * hosts . Host , prsMap map [ string ] v3 . PrivateRegistry , etcdSnapshotImage string , name string ) error {
log . Infof ( ctx , "[etcd] starting backup server on host [%s]" , etcdHost . Address )
imageCfg := & container . Config {
Cmd : [ ] string {
"/opt/rke-tools/rke-etcd-backup" ,
"etcd-backup" ,
"serve" ,
"--name" , name ,
"--cacert" , pki . GetCertPath ( pki . CACertName ) ,
"--cert" , pki . GetCertPath ( pki . KubeNodeCertName ) ,
"--key" , pki . GetKeyPath ( pki . KubeNodeCertName ) ,
} ,
Image : etcdSnapshotImage ,
}
2019-02-27 05:34:54 +00:00
2019-01-28 17:17:44 +00:00
hostCfg := & container . HostConfig {
Binds : [ ] string {
2021-07-29 06:59:54 +00:00
fmt . Sprintf ( "%s:/backup:z" , EtcdSnapshotPath ) ,
fmt . Sprintf ( "%s:/etc/kubernetes:z" , path . Join ( etcdHost . PrefixPath , "/etc/kubernetes" ) ) } ,
2019-01-28 17:17:44 +00:00
NetworkMode : container . NetworkMode ( "host" ) ,
2019-02-27 05:34:54 +00:00
RestartPolicy : container . RestartPolicy { Name : "no" } ,
}
2019-03-15 19:00:32 +00:00
if err := docker . DoRemoveContainer ( ctx , etcdHost . DClient , EtcdServeBackupContainerName , etcdHost . Address ) ; err != nil {
return err
}
2019-02-27 05:34:54 +00:00
if err := docker . DoRunContainer ( ctx , etcdHost . DClient , imageCfg , hostCfg , EtcdServeBackupContainerName , etcdHost . Address , ETCDRole , prsMap ) ; err != nil {
return err
}
2019-03-16 19:11:34 +00:00
time . Sleep ( EtcdSnapshotWaitTime * time . Second )
2019-02-27 05:34:54 +00:00
container , err := docker . InspectContainer ( ctx , etcdHost . DClient , etcdHost . Address , EtcdServeBackupContainerName )
if err != nil {
return err
}
if ! container . State . Running {
containerLog , _ , err := docker . GetContainerLogsStdoutStderr ( ctx , etcdHost . DClient , EtcdServeBackupContainerName , "1" , false )
if err != nil {
return err
}
if err := docker . RemoveContainer ( ctx , etcdHost . DClient , etcdHost . Address , EtcdServeBackupContainerName ) ; err != nil {
return err
}
// printing the restore container's logs
return fmt . Errorf ( "Failed to run backup server container, container logs: %s" , containerLog )
2019-01-28 17:17:44 +00:00
}
2019-02-27 05:34:54 +00:00
return nil
2019-01-28 17:17:44 +00:00
}
func DownloadEtcdSnapshotFromBackupServer ( ctx context . Context , etcdHost * hosts . Host , prsMap map [ string ] v3 . PrivateRegistry , etcdSnapshotImage , name string , backupServer * hosts . Host ) error {
log . Infof ( ctx , "[etcd] Get snapshot [%s] on host [%s]" , name , etcdHost . Address )
imageCfg := & container . Config {
Cmd : [ ] string {
"/opt/rke-tools/rke-etcd-backup" ,
"etcd-backup" ,
"download" ,
"--name" , name ,
2019-03-09 19:13:45 +00:00
"--local-endpoint" , backupServer . InternalAddress ,
2019-01-28 17:17:44 +00:00
"--cacert" , pki . GetCertPath ( pki . CACertName ) ,
"--cert" , pki . GetCertPath ( pki . KubeNodeCertName ) ,
"--key" , pki . GetKeyPath ( pki . KubeNodeCertName ) ,
} ,
Image : etcdSnapshotImage ,
}
hostCfg := & container . HostConfig {
Binds : [ ] string {
2021-07-29 06:59:54 +00:00
fmt . Sprintf ( "%s:/backup:z" , EtcdSnapshotPath ) ,
fmt . Sprintf ( "%s:/etc/kubernetes:z" , path . Join ( etcdHost . PrefixPath , "/etc/kubernetes" ) ) } ,
2019-01-28 17:17:44 +00:00
NetworkMode : container . NetworkMode ( "host" ) ,
RestartPolicy : container . RestartPolicy { Name : "on-failure" } ,
}
2019-03-15 19:00:32 +00:00
if err := docker . DoRemoveContainer ( ctx , etcdHost . DClient , EtcdDownloadBackupContainerName , etcdHost . Address ) ; err != nil {
return err
}
2019-01-28 17:17:44 +00:00
if err := docker . DoRunContainer ( ctx , etcdHost . DClient , imageCfg , hostCfg , EtcdDownloadBackupContainerName , etcdHost . Address , ETCDRole , prsMap ) ; err != nil {
return err
}
status , _ , stderr , err := docker . GetContainerOutput ( ctx , etcdHost . DClient , EtcdDownloadBackupContainerName , etcdHost . Address )
if status != 0 || err != nil {
if removeErr := docker . RemoveContainer ( ctx , etcdHost . DClient , etcdHost . Address , EtcdDownloadBackupContainerName ) ; removeErr != nil {
log . Warnf ( ctx , "Failed to remove container [%s]: %v" , removeErr )
}
if err != nil {
return err
}
return fmt . Errorf ( "Failed to download etcd snapshot from backup server [%s], exit code [%d]: %v" , backupServer . Address , status , stderr )
}
return docker . RemoveContainer ( ctx , etcdHost . DClient , etcdHost . Address , EtcdDownloadBackupContainerName )
}
2019-07-24 19:58:50 +00:00
func setEtcdPermissions ( ctx context . Context , etcdHost * hosts . Host , prsMap map [ string ] v3 . PrivateRegistry , alpineImage string , process v3 . Process ) error {
var dataBind string
cmd := fmt . Sprintf ( "chmod 700 %s" , EtcdDataDir )
if len ( process . User ) != 0 {
cmd = fmt . Sprintf ( "chmod 700 %s ; chown -R %s %s" , EtcdDataDir , process . User , EtcdDataDir )
}
imageCfg := & container . Config {
Cmd : [ ] string {
"sh" , "-c" ,
cmd ,
} ,
Image : alpineImage ,
}
for _ , bind := range process . Binds {
if strings . Contains ( bind , "/var/lib/etcd" ) {
dataBind = bind
}
}
hostCfg := & container . HostConfig {
Binds : [ ] string { dataBind } ,
}
2019-12-05 23:31:41 +00:00
if err := docker . DoRunOnetimeContainer ( ctx , etcdHost . DClient , imageCfg , hostCfg , EtcdPermFixContainerName ,
etcdHost . Address , ETCDRole , prsMap ) ; err != nil {
return err
}
return docker . DoRemoveContainer ( ctx , etcdHost . DClient , EtcdPermFixContainerName , etcdHost . Address )
2019-07-24 19:58:50 +00:00
}
2019-08-21 21:08:30 +00:00
// getSanitizedSnapshotCmd returns imageCfg.Cmd joined with single spaces, with
// every occurrence of the S3 secret key from bc replaced by "***".
//
// The command is returned unmodified when bc or bc.S3BackupConfig is nil, or
// when the secret key is empty. The empty-key guard matters: strings.Replace
// with an empty search string inserts the replacement between every character,
// which would turn the whole command into noise.
func getSanitizedSnapshotCmd(imageCfg *container.Config, bc *v3.BackupConfig) string {
	cmd := strings.Join(imageCfg.Cmd, " ")
	if bc == nil || bc.S3BackupConfig == nil || bc.S3BackupConfig.SecretKey == "" {
		return cmd
	}
	return strings.Replace(cmd, bc.S3BackupConfig.SecretKey, "***", -1)
}