diff --git a/README.md b/README.md index 3562d8e3..e0608a64 100644 --- a/README.md +++ b/README.md @@ -363,6 +363,7 @@ nodes: ``` ## Deploying Rancher 2.0 using rke + Using RKE's pluggable user addons, it's possible to deploy Rancher 2.0 server in HA with a single command. Depending how you want to manage your ssl certificates, there are 2 deployment options: @@ -416,6 +417,122 @@ kubectl -n cattle-system scale deployment cattle --replicas=3 # chown /var/run/docker.sock ``` +## Etcd Backup and Restoration + +You can configure a Rancher Kubernetes Engine (RKE) cluster to automatically create backups of etcd. In a disaster scenario, you can restore these backups, which are stored on other cluster nodes. + +### Etcd Regular Backup + +To schedule a recurring automatic etcd backup, enable the `etcd-backup` service. `etcd-backup` runs in a service container alongside the `etcd` container. `etcd-backup` automatically creates backups and stores them to its local disk. + +To enable `etcd-backup` in RKE CLI, configure the following three variables: + +``` +services: + etcd: + backup: true + creation: 5m0s + retention: 24h +``` + +- `backup`: Enables/disables etcd backups in the RKE cluster. + + Default value: `false`. +- `creation`: Time period in which `etcd-backup` creates and stores local backups. + + Default value: `5m0s` + +- `retention`: Time period before an etcd backup expires. Expired backups are purged. 
+ + Default value: `24h` + +After RKE runs, view the `etcd-backup` logs to confirm backups are being created automatically: +``` +# docker logs etcd-backup +time="2018-05-04T18:39:16Z" level=info msg="Initializing Rolling Backups" creation=1m0s retention=24h0m0s +time="2018-05-04T18:40:16Z" level=info msg="Created backup" name="2018-05-04T18:40:16Z_etcd" runtime=108.332814ms +time="2018-05-04T18:41:16Z" level=info msg="Created backup" name="2018-05-04T18:41:16Z_etcd" runtime=92.880112ms +time="2018-05-04T18:42:16Z" level=info msg="Created backup" name="2018-05-04T18:42:16Z_etcd" runtime=83.67642ms +time="2018-05-04T18:43:16Z" level=info msg="Created backup" name="2018-05-04T18:43:16Z_etcd" runtime=86.298499ms +``` +Backups are saved to the following directory: `/opt/rke/etcdbackup/`. Backups are created on each node that runs etcd. + + +### Etcd onetime Snapshots + +RKE also added two commands for etcd backup management: +``` +./rke etcd backup [NAME] +``` +and +``` +./rke etcd restore [NAME] +``` + +The backup command saves a snapshot of etcd in `/opt/rke/etcdbackup`. This command also creates a container for the backup. When the backup completes, the container is removed. 
+ +``` +# ./rke etcd backup --name snapshot + +INFO[0000] Starting Backup on etcd hosts +INFO[0000] [dialer] Setup tunnel for host [x.x.x.x] +INFO[0002] [dialer] Setup tunnel for host [y.y.y.y] +INFO[0004] [dialer] Setup tunnel for host [z.z.z.z] +INFO[0006] [etcd] Starting backup on host [x.x.x.x] +INFO[0007] [etcd] Successfully started [etcd-backup-once] container on host [x.x.x.x] +INFO[0007] [etcd] Starting backup on host [y.y.y.y] +INFO[0009] [etcd] Successfully started [etcd-backup-once] container on host [y.y.y.y] +INFO[0010] [etcd] Starting backup on host [z.z.z.z] +INFO[0011] [etcd] Successfully started [etcd-backup-once] container on host [z.z.z.z] +INFO[0011] Finished backup on all etcd hosts +``` +### Etcd Disaster recovery + +`etcd restore` is used for etcd disaster recovery. It reverts to any snapshot stored in `/opt/rke/etcdbackup` that you explicitly define. When you run `etcd restore`, RKE removes the old etcd container if it still exists. To restore operations, RKE creates a new etcd cluster using the snapshot you choose. + +>**Warning:** Restoring an etcd backup deletes your current etcd cluster and replaces it with a new one. Before you run the `etcd restore` command, back up any important data in your current cluster. 
+ +``` +./rke etcd restore --name snapshot --config test-aws.yml +INFO[0000] Starting restore on etcd hosts +INFO[0000] [dialer] Setup tunnel for host [x.x.x.x] +INFO[0002] [dialer] Setup tunnel for host [y.y.y.y] +INFO[0005] [dialer] Setup tunnel for host [z.z.z.z] +INFO[0007] [hosts] Cleaning up host [x.x.x.x] +INFO[0007] [hosts] Running cleaner container on host [x.x.x.x] +INFO[0008] [kube-cleaner] Successfully started [kube-cleaner] container on host [x.x.x.x] +INFO[0008] [hosts] Removing cleaner container on host [x.x.x.x] +INFO[0008] [hosts] Successfully cleaned up host [x.x.x.x] +INFO[0009] [hosts] Cleaning up host [y.y.y.y] +INFO[0009] [hosts] Running cleaner container on host [y.y.y.y] +INFO[0010] [kube-cleaner] Successfully started [kube-cleaner] container on host [y.y.y.y] +INFO[0010] [hosts] Removing cleaner container on host [y.y.y.y] +INFO[0010] [hosts] Successfully cleaned up host [y.y.y.y] +INFO[0011] [hosts] Cleaning up host [z.z.z.z] +INFO[0011] [hosts] Running cleaner container on host [z.z.z.z] +INFO[0012] [kube-cleaner] Successfully started [kube-cleaner] container on host [z.z.z.z] +INFO[0012] [hosts] Removing cleaner container on host [z.z.z.z] +INFO[0012] [hosts] Successfully cleaned up host [z.z.z.z] +INFO[0012] [etcd] Restoring [snapshot] snapshot on etcd host [x.x.x.x] +INFO[0013] [etcd] Successfully started [etcd-restore] container on host [x.x.x.x] +INFO[0014] [etcd] Restoring [snapshot] snapshot on etcd host [y.y.y.y] +INFO[0015] [etcd] Successfully started [etcd-restore] container on host [y.y.y.y] +INFO[0015] [etcd] Restoring [snapshot] snapshot on etcd host [z.z.z.z] +INFO[0016] [etcd] Successfully started [etcd-restore] container on host [z.z.z.z] +INFO[0017] [etcd] Building up etcd plane.. 
+INFO[0018] [etcd] Successfully started [etcd] container on host [x.x.x.x] +INFO[0020] [etcd] Successfully started [rke-log-linker] container on host [x.x.x.x] +INFO[0021] [remove/rke-log-linker] Successfully removed container on host [x.x.x.x] +INFO[0022] [etcd] Successfully started [etcd] container on host [y.y.y.y] +INFO[0023] [etcd] Successfully started [rke-log-linker] container on host [y.y.y.y] +INFO[0025] [remove/rke-log-linker] Successfully removed container on host [y.y.y.y] +INFO[0025] [etcd] Successfully started [etcd] container on host [z.z.z.z] +INFO[0027] [etcd] Successfully started [rke-log-linker] container on host [z.z.z.z] +INFO[0027] [remove/rke-log-linker] Successfully removed container on host [z.z.z.z] +INFO[0027] [etcd] Successfully started etcd plane.. +INFO[0027] Finished restoring on all etcd hosts +``` + ## License Copyright (c) 2018 [Rancher Labs, Inc.](http://rancher.com) diff --git a/cluster/certificates.go b/cluster/certificates.go index 8fdc0824..39caa7b8 100644 --- a/cluster/certificates.go +++ b/cluster/certificates.go @@ -228,3 +228,21 @@ func fetchBackupCertificates(ctx context.Context, backupHosts []*hosts.Host, kub // reporting the last error only. 
return nil, err } + +func fetchCertificatesFromEtcd(ctx context.Context, kubeCluster *Cluster) ([]byte, []byte, error) { + // Get kubernetes certificates from the etcd hosts + certificates := map[string]pki.CertificatePKI{} + var err error + for _, host := range kubeCluster.EtcdHosts { + certificates, err = pki.FetchCertificatesFromHost(ctx, kubeCluster.EtcdHosts, host, kubeCluster.SystemImages.Alpine, kubeCluster.LocalKubeConfigPath, kubeCluster.PrivateRegistriesMap) + if certificates != nil { + break + } + } + if err != nil || certificates == nil { + return nil, nil, fmt.Errorf("Failed to fetch certificates from etcd hosts: %v", err) + } + clientCert := cert.EncodeCertPEM(certificates[pki.KubeNodeCertName].Certificate) + clientkey := cert.EncodePrivateKeyPEM(certificates[pki.KubeNodeCertName].Key) + return clientCert, clientkey, nil +} diff --git a/cluster/cluster.go b/cluster/cluster.go index b538f822..9a8eaeee 100644 --- a/cluster/cluster.go +++ b/cluster/cluster.go @@ -77,7 +77,12 @@ func (c *Cluster) DeployControlPlane(ctx context.Context) error { if len(c.Services.Etcd.ExternalURLs) > 0 { log.Infof(ctx, "[etcd] External etcd connection string has been specified, skipping etcd plane") } else { - if err := services.RunEtcdPlane(ctx, c.EtcdHosts, etcdNodePlanMap, c.LocalConnDialerFactory, c.PrivateRegistriesMap, c.UpdateWorkersOnly, c.SystemImages.Alpine); err != nil { + etcdBackup := services.EtcdBackup{ + Backup: c.Services.Etcd.Backup, + Creation: c.Services.Etcd.Creation, + Retention: c.Services.Etcd.Retention, + } + if err := services.RunEtcdPlane(ctx, c.EtcdHosts, etcdNodePlanMap, c.LocalConnDialerFactory, c.PrivateRegistriesMap, c.UpdateWorkersOnly, c.SystemImages.Alpine, etcdBackup); err != nil { return fmt.Errorf("[etcd] Failed to bring up Etcd Plane: %v", err) } } diff --git a/cluster/defaults.go b/cluster/defaults.go index 246b5729..9d99d591 100644 --- a/cluster/defaults.go +++ b/cluster/defaults.go @@ -27,7 +27,9 @@ const ( DefaultNetworkPlugin = 
"canal" DefaultNetworkCloudProvider = "none" - DefaultIngressController = "nginx" + DefaultIngressController = "nginx" + DefaultEtcdBackupCreationPeriod = "5m0s" + DefaultEtcdBackupRetentionPeriod = "24h" ) func setDefaultIfEmptyMapValue(configMap map[string]string, key string, value string) { @@ -105,6 +107,8 @@ func (c *Cluster) setClusterServicesDefaults() { &c.Services.Kubelet.Image: c.SystemImages.Kubernetes, &c.Services.Kubeproxy.Image: c.SystemImages.Kubernetes, &c.Services.Etcd.Image: c.SystemImages.Etcd, + &c.Services.Etcd.Creation: DefaultEtcdBackupCreationPeriod, + &c.Services.Etcd.Retention: DefaultEtcdBackupRetentionPeriod, } for k, v := range serviceConfigDefaultsMap { setDefaultIfEmpty(k, v) diff --git a/cluster/etcd.go b/cluster/etcd.go new file mode 100644 index 00000000..93cdeaa2 --- /dev/null +++ b/cluster/etcd.go @@ -0,0 +1,63 @@ +package cluster + +import ( + "context" + "fmt" + "path" + + "github.com/rancher/rke/docker" + "github.com/rancher/rke/hosts" + "github.com/rancher/rke/services" + "github.com/rancher/types/apis/management.cattle.io/v3" +) + +func (c *Cluster) BackupEtcd(ctx context.Context, backupName string) error { + for _, host := range c.EtcdHosts { + if err := services.RunEtcdBackup(ctx, host, c.PrivateRegistriesMap, c.SystemImages.Alpine, c.Services.Etcd.Creation, c.Services.Etcd.Retention, backupName, true); err != nil { + return err + } + } + return nil +} + +func (c *Cluster) RestoreEtcdBackup(ctx context.Context, backupPath string) error { + // Stopping all etcd containers + for _, host := range c.EtcdHosts { + if err := tearDownOldEtcd(ctx, host, c.SystemImages.Alpine, c.PrivateRegistriesMap); err != nil { + return err + } + } + // Start restore process on all etcd hosts + initCluster := services.GetEtcdInitialCluster(c.EtcdHosts) + for _, host := range c.EtcdHosts { + if err := services.RestoreEtcdBackup(ctx, host, c.PrivateRegistriesMap, c.SystemImages.Etcd, backupPath, initCluster); err != nil { + return 
fmt.Errorf("[etcd] Failed to restore etcd backup: %v", err) + } + } + // Deploy Etcd Plane + etcdNodePlanMap := make(map[string]v3.RKEConfigNodePlan) + // Build etcd node plan map + for _, etcdHost := range c.EtcdHosts { + etcdNodePlanMap[etcdHost.Address] = BuildRKEConfigNodePlan(ctx, c, etcdHost, etcdHost.DockerInfo) + } + etcdBackup := services.EtcdBackup{ + Backup: c.Services.Etcd.Backup, + Creation: c.Services.Etcd.Creation, + Retention: c.Services.Etcd.Retention, + } + if err := services.RunEtcdPlane(ctx, c.EtcdHosts, etcdNodePlanMap, c.LocalConnDialerFactory, c.PrivateRegistriesMap, c.UpdateWorkersOnly, c.SystemImages.Alpine, etcdBackup); err != nil { + return fmt.Errorf("[etcd] Failed to bring up Etcd Plane: %v", err) + } + return nil +} + +func tearDownOldEtcd(ctx context.Context, host *hosts.Host, cleanupImage string, prsMap map[string]v3.PrivateRegistry) error { + if err := docker.DoRemoveContainer(ctx, host.DClient, services.EtcdContainerName, host.Address); err != nil { + return fmt.Errorf("[etcd] Failed to stop old etcd containers: %v", err) + } + // cleanup etcd data directory + toCleanPaths := []string{ + path.Join(host.PrefixPath, hosts.ToCleanEtcdDir), + } + return host.CleanUp(ctx, toCleanPaths, cleanupImage, prsMap) +} diff --git a/cluster/plan.go b/cluster/plan.go index ab10dae5..a8473ed2 100644 --- a/cluster/plan.go +++ b/cluster/plan.go @@ -593,7 +593,7 @@ func (c *Cluster) BuildEtcdProcess(host *hosts.Host, etcdHosts []*hosts.Host, pr } Binds := []string{ - fmt.Sprintf("%s:/var/lib/rancher/etcd:z", path.Join(prefixPath, "/var/lib/etcd")), + fmt.Sprintf("%s:/var/lib/rancher/:z", path.Join(prefixPath, "/var/lib/")), fmt.Sprintf("%s:/etc/kubernetes:z", path.Join(prefixPath, "/etc/kubernetes")), } diff --git a/cmd/etcd.go b/cmd/etcd.go new file mode 100644 index 00000000..520529a0 --- /dev/null +++ b/cmd/etcd.go @@ -0,0 +1,136 @@ +package cmd + +import ( + "context" + "fmt" + + "github.com/rancher/rke/cluster" + "github.com/rancher/rke/hosts" + 
"github.com/rancher/rke/log" + "github.com/rancher/rke/pki" + "github.com/rancher/types/apis/management.cattle.io/v3" + "github.com/urfave/cli" +) + +func EtcdCommand() cli.Command { + backupRestoreFlags := []cli.Flag{ + cli.StringFlag{ + Name: "name", + Usage: "Specify Backup name", + }, + cli.StringFlag{ + Name: "config", + Usage: "Specify an alternate cluster YAML file", + Value: pki.ClusterConfig, + EnvVar: "RKE_CONFIG", + }, + } + + backupRestoreFlags = append(backupRestoreFlags, sshCliOptions...) + + return cli.Command{ + Name: "etcd", + Usage: "etcd backup/restore operations in k8s cluster", + Subcommands: []cli.Command{ + { + Name: "backup", + Usage: "Take backup on all etcd hosts", + Flags: backupRestoreFlags, + Action: BackupEtcdHostsFromCli, + }, + { + Name: "restore", + Usage: "Restore existing backup", + Flags: backupRestoreFlags, + Action: RestoreEtcdBackupFromCli, + }, + }, + } +} + +func BackupEtcdHosts( + ctx context.Context, + rkeConfig *v3.RancherKubernetesEngineConfig, + dockerDialerFactory hosts.DialerFactory, + configDir, backupName string) error { + + log.Infof(ctx, "Starting Backup on etcd hosts") + kubeCluster, err := cluster.ParseCluster(ctx, rkeConfig, clusterFilePath, configDir, dockerDialerFactory, nil, nil) + if err != nil { + return err + } + + if err := kubeCluster.TunnelHosts(ctx, false); err != nil { + return err + } + if err := kubeCluster.BackupEtcd(ctx, backupName); err != nil { + return err + } + + log.Infof(ctx, "Finished backup on all etcd hosts") + return nil +} + +func RestoreEtcdBackup( + ctx context.Context, + rkeConfig *v3.RancherKubernetesEngineConfig, + dockerDialerFactory hosts.DialerFactory, + configDir, backupName string) error { + + log.Infof(ctx, "Starting restore on etcd hosts") + kubeCluster, err := cluster.ParseCluster(ctx, rkeConfig, clusterFilePath, configDir, dockerDialerFactory, nil, nil) + if err != nil { + return err + } + + if err := kubeCluster.TunnelHosts(ctx, false); err != nil { + return err + } + if 
err := kubeCluster.RestoreEtcdBackup(ctx, backupName); err != nil { + return err + } + + log.Infof(ctx, "Finished restoring on all etcd hosts") + return nil +} + +func BackupEtcdHostsFromCli(ctx *cli.Context) error { + clusterFile, filePath, err := resolveClusterFile(ctx) + if err != nil { + return fmt.Errorf("Failed to resolve cluster file: %v", err) + } + clusterFilePath = filePath + + rkeConfig, err := cluster.ParseConfig(clusterFile) + if err != nil { + return fmt.Errorf("Failed to parse cluster file: %v", err) + } + + rkeConfig, err = setOptionsFromCLI(ctx, rkeConfig) + if err != nil { + return err + } + + return BackupEtcdHosts(context.Background(), rkeConfig, nil, "", ctx.String("name")) +} + +func RestoreEtcdBackupFromCli(ctx *cli.Context) error { + clusterFile, filePath, err := resolveClusterFile(ctx) + if err != nil { + return fmt.Errorf("Failed to resolve cluster file: %v", err) + } + clusterFilePath = filePath + + rkeConfig, err := cluster.ParseConfig(clusterFile) + if err != nil { + return fmt.Errorf("Failed to parse cluster file: %v", err) + } + + rkeConfig, err = setOptionsFromCLI(ctx, rkeConfig) + if err != nil { + return err + } + + return RestoreEtcdBackup(context.Background(), rkeConfig, nil, "", ctx.String("name")) + +} diff --git a/docker/docker.go b/docker/docker.go index b8e04dc4..213ac4ec 100644 --- a/docker/docker.go +++ b/docker/docker.go @@ -266,23 +266,27 @@ func StopRenameContainer(ctx context.Context, dClient *client.Client, hostname s if err := StopContainer(ctx, dClient, hostname, oldContainerName); err != nil { return err } - if err := WaitForContainer(ctx, dClient, hostname, oldContainerName); err != nil { + if _, err := WaitForContainer(ctx, dClient, hostname, oldContainerName); err != nil { return nil } return RenameContainer(ctx, dClient, hostname, oldContainerName, newContainerName) } -func WaitForContainer(ctx context.Context, dClient *client.Client, hostname string, containerName string) error { +func WaitForContainer(ctx 
context.Context, dClient *client.Client, hostname string, containerName string) (int64, error) { + // We capture the status exit code of the container statusCh, errCh := dClient.ContainerWait(ctx, containerName, container.WaitConditionNotRunning) select { case err := <-errCh: if err != nil { - return fmt.Errorf("Error waiting for container [%s] on host [%s]: %v", containerName, hostname, err) + // if error is present return 1 exit code + return 1, fmt.Errorf("Error waiting for container [%s] on host [%s]: %v", containerName, hostname, err) } - case <-statusCh: + case status := <-statusCh: + // return the status exit code of the container + return status.StatusCode, nil } - return nil + return 0, nil } func IsContainerUpgradable(ctx context.Context, dClient *client.Client, imageCfg *container.Config, containerName string, hostname string, plane string) (bool, error) { diff --git a/hosts/hosts.go b/hosts/hosts.go index c47459b2..309394f6 100644 --- a/hosts/hosts.go +++ b/hosts/hosts.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "path" + "path/filepath" "github.com/docker/docker/api/types" "github.com/docker/docker/api/types/container" @@ -58,6 +59,7 @@ func (h *Host) CleanUpAll(ctx context.Context, cleanerImage string, prsMap map[s path.Join(h.PrefixPath, ToCleanTempCertPath), path.Join(h.PrefixPath, ToCleanCNILib), } + if !externalEtcd { toCleanPaths = append(toCleanPaths, path.Join(h.PrefixPath, ToCleanEtcdDir)) } @@ -116,7 +118,7 @@ func (h *Host) CleanUp(ctx context.Context, toCleanPaths []string, cleanerImage return err } - if err := docker.WaitForContainer(ctx, h.DClient, h.Address, CleanerContainerName); err != nil { + if _, err := docker.WaitForContainer(ctx, h.DClient, h.Address, CleanerContainerName); err != nil { return err } @@ -240,8 +242,12 @@ func buildCleanerConfig(host *Host, toCleanDirs []string, cleanerImage string) ( Cmd: cmd, } bindMounts := []string{} + bindMountsMap := make(map[string]string) for _, vol := range toCleanDirs { - bindMounts = 
append(bindMounts, fmt.Sprintf("%s:%s:z", vol, vol)) + bindMountsMap[filepath.Dir(vol)] = vol + } + for dir := range bindMountsMap { + bindMounts = append(bindMounts, fmt.Sprintf("%s:%s:z", dir, dir)) } hostCfg := &container.HostConfig{ Binds: bindMounts, diff --git a/main.go b/main.go index 3aa846f7..eec8e81c 100644 --- a/main.go +++ b/main.go @@ -34,6 +34,7 @@ func mainErr() error { cmd.RemoveCommand(), cmd.VersionCommand(), cmd.ConfigCommand(), + cmd.EtcdCommand(), } app.Flags = []cli.Flag{ cli.BoolFlag{ diff --git a/pki/deploy.go b/pki/deploy.go index 867cee58..652f4976 100644 --- a/pki/deploy.go +++ b/pki/deploy.go @@ -134,7 +134,7 @@ func FetchCertificatesFromHost(ctx context.Context, extraHosts []*hosts.Host, ho for certName, config := range crtList { certificate := CertificatePKI{} - crt, err := fetchFileFromHost(ctx, GetCertTempPath(certName), image, host, prsMap) + crt, err := FetchFileFromHost(ctx, GetCertTempPath(certName), image, host, prsMap) // I will only exit with an error if it's not a not-found-error and this is not an etcd certificate if err != nil && !strings.HasPrefix(certName, "kube-etcd") { if strings.Contains(err.Error(), "no such file or directory") || @@ -149,10 +149,10 @@ func FetchCertificatesFromHost(ctx context.Context, extraHosts []*hosts.Host, ho tmpCerts[certName] = CertificatePKI{} continue } - key, err := fetchFileFromHost(ctx, GetKeyTempPath(certName), image, host, prsMap) + key, err := FetchFileFromHost(ctx, GetKeyTempPath(certName), image, host, prsMap) if config { - config, err := fetchFileFromHost(ctx, GetConfigTempPath(certName), image, host, prsMap) + config, err := FetchFileFromHost(ctx, GetConfigTempPath(certName), image, host, prsMap) if err != nil { return nil, err } @@ -179,7 +179,7 @@ func FetchCertificatesFromHost(ctx context.Context, extraHosts []*hosts.Host, ho } -func fetchFileFromHost(ctx context.Context, filePath, image string, host *hosts.Host, prsMap map[string]v3.PrivateRegistry) (string, error) { +func 
FetchFileFromHost(ctx context.Context, filePath, image string, host *hosts.Host, prsMap map[string]v3.PrivateRegistry) (string, error) { imageCfg := &container.Config{ Image: image, diff --git a/services/etcd.go b/services/etcd.go index 10370666..545ae4ee 100644 --- a/services/etcd.go +++ b/services/etcd.go @@ -2,21 +2,42 @@ package services import ( "fmt" + "path" + "path/filepath" "strings" "time" "context" etcdclient "github.com/coreos/etcd/client" + "github.com/docker/docker/api/types/container" "github.com/pkg/errors" "github.com/rancher/rke/docker" "github.com/rancher/rke/hosts" "github.com/rancher/rke/log" + "github.com/rancher/rke/pki" "github.com/rancher/types/apis/management.cattle.io/v3" "github.com/sirupsen/logrus" ) -func RunEtcdPlane(ctx context.Context, etcdHosts []*hosts.Host, etcdNodePlanMap map[string]v3.RKEConfigNodePlan, localConnDialerFactory hosts.DialerFactory, prsMap map[string]v3.PrivateRegistry, updateWorkersOnly bool, alpineImage string) error { +type EtcdBackup struct { + // Enable or disable backup creation + Backup bool + // Creation period of the etcd backups + Creation string + // Retention period of the etcd backups + Retention string +} + +func RunEtcdPlane( + ctx context.Context, + etcdHosts []*hosts.Host, + etcdNodePlanMap map[string]v3.RKEConfigNodePlan, + localConnDialerFactory hosts.DialerFactory, + prsMap map[string]v3.PrivateRegistry, + updateWorkersOnly bool, + alpineImage string, + etcdBackup EtcdBackup) error { log.Infof(ctx, "[%s] Building up etcd plane..", ETCDRole) for _, host := range etcdHosts { if updateWorkersOnly { @@ -27,6 +48,11 @@ func RunEtcdPlane(ctx context.Context, etcdHosts []*hosts.Host, etcdNodePlanMap if err := docker.DoRunContainer(ctx, host.DClient, imageCfg, hostCfg, EtcdContainerName, host.Address, ETCDRole, prsMap); err != nil { return err } + if etcdBackup.Backup { + if err := RunEtcdBackup(ctx, host, prsMap, alpineImage, etcdBackup.Creation, etcdBackup.Retention, EtcdBackupContainerName, false); 
err != nil { + return err + } + } if err := createLogLink(ctx, host, EtcdContainerName, ETCDRole, alpineImage, prsMap); err != nil { return err } @@ -186,3 +212,84 @@ func IsEtcdMember(ctx context.Context, etcdHost *hosts.Host, etcdHosts []*hosts. } return false, nil } + +func RunEtcdBackup(ctx context.Context, etcdHost *hosts.Host, prsMap map[string]v3.PrivateRegistry, etcdBackupImage string, creation, retention, name string, once bool) error { + log.Infof(ctx, "[etcd] Starting backup on host [%s]", etcdHost.Address) + imageCfg := &container.Config{ + Cmd: []string{ + "/opt/rke/rke-etcd-backup", + "rolling-backup", + "--cacert", pki.GetCertPath(pki.CACertName), + "--cert", pki.GetCertPath(pki.KubeNodeCertName), + "--key", pki.GetKeyPath(pki.KubeNodeCertName), + "--name", name, + }, + Image: etcdBackupImage, + } + if once { + imageCfg.Cmd = append(imageCfg.Cmd, "--once") + } + if !once { + imageCfg.Cmd = append(imageCfg.Cmd, "--retention="+retention) + imageCfg.Cmd = append(imageCfg.Cmd, "--creation="+creation) + } + hostCfg := &container.HostConfig{ + Binds: []string{ + "/opt/rke/etcdbackup:/backup", + fmt.Sprintf("%s:/etc/kubernetes:z", path.Join(etcdHost.PrefixPath, "/etc/kubernetes"))}, + NetworkMode: container.NetworkMode("host"), + } + + if once { + if err := docker.DoRunContainer(ctx, etcdHost.DClient, imageCfg, hostCfg, EtcdBackupOnceContainerName, etcdHost.Address, ETCDRole, prsMap); err != nil { + return err + } + status, err := docker.WaitForContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdBackupOnceContainerName) + if status != 0 || err != nil { + return fmt.Errorf("Failed to take etcd backup exit code [%s]: %v", status, err) + } + return docker.RemoveContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdBackupOnceContainerName) + } + return docker.DoRunContainer(ctx, etcdHost.DClient, imageCfg, hostCfg, EtcdBackupContainerName, etcdHost.Address, ETCDRole, prsMap) +} + +func RestoreEtcdBackup(ctx context.Context, etcdHost *hosts.Host, prsMap 
map[string]v3.PrivateRegistry, etcdRestoreImage, backupName, initCluster string) error { + log.Infof(ctx, "[etcd] Restoring [%s] snapshot on etcd host [%s]", backupName, etcdHost.Address) + nodeName := pki.GetEtcdCrtName(etcdHost.InternalAddress) + backupPath := filepath.Join("/backup/", backupName) + + imageCfg := &container.Config{ + Cmd: []string{ + "/usr/local/bin/etcdctl", + "--cacert", pki.GetCertPath(pki.CACertName), + "--cert", pki.GetCertPath(nodeName), + "--key", pki.GetKeyPath(nodeName), + "snapshot", "restore", backupPath, + "--data-dir=/var/lib/rancher/etcd", + "--name=etcd-" + etcdHost.HostnameOverride, + "--initial-cluster=" + initCluster, + "--initial-cluster-token=etcd-cluster-1", + "--initial-advertise-peer-urls=https://" + etcdHost.InternalAddress + ":2380", + }, + Env: []string{"ETCDCTL_API=3"}, + Image: etcdRestoreImage, + } + hostCfg := &container.HostConfig{ + Binds: []string{ + "/opt/rke/etcdbackup:/backup:z", + fmt.Sprintf("%s:/var/lib/rancher/:z", path.Join(etcdHost.PrefixPath, "/var/lib/")), + fmt.Sprintf("%s:/etc/kubernetes:z", path.Join(etcdHost.PrefixPath, "/etc/kubernetes"))}, + NetworkMode: container.NetworkMode("host"), + } + if err := docker.DoRunContainer(ctx, etcdHost.DClient, imageCfg, hostCfg, EtcdRestoreContainerName, etcdHost.Address, ETCDRole, prsMap); err != nil { + return err + } + status, err := docker.WaitForContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdRestoreContainerName) + if err != nil { + return err + } + if status != 0 { + return fmt.Errorf("Failed to run etcd restore container, exit status is: %d", status) + } + return docker.RemoveContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdRestoreContainerName) +} diff --git a/services/services.go b/services/services.go index f57f5db5..e0a7f4f3 100644 --- a/services/services.go +++ b/services/services.go @@ -27,6 +27,9 @@ const ( KubeControllerContainerName = "kube-controller-manager" SchedulerContainerName = "kube-scheduler" EtcdContainerName = "etcd" + 
EtcdBackupContainerName = "etcd-backup" + EtcdBackupOnceContainerName = "etcd-backup-once" + EtcdRestoreContainerName = "etcd-restore" NginxProxyContainerName = "nginx-proxy" SidekickContainerName = "service-sidekick" LogLinkContainerName = "rke-log-linker"