Mirror of https://github.com/rancher/rke.git (synced 2025-07-05 19:47:47 +00:00)
Etcd Backup/restore
parent 5d718ad123
commit a3416e6b92
117 README.md
@@ -363,6 +363,7 @@ nodes:
```

## Deploying Rancher 2.0 using rke

Using RKE's pluggable user addons, it's possible to deploy Rancher 2.0 server in HA with a single command.

Depending how you want to manage your ssl certificates, there are 2 deployment options:

@@ -416,6 +417,122 @@ kubectl -n cattle-system scale deployment cattle --replicas=3
# chown <user> /var/run/docker.sock
```

## Etcd Backup and Restoration

You can configure a Rancher Kubernetes Engine (RKE) cluster to automatically create backups of etcd. In a disaster scenario, you can restore these backups, which are stored on other cluster nodes.

### Etcd Regular Backup

To schedule recurring automatic etcd backups, enable the `etcd-backup` service. `etcd-backup` runs in a service container alongside the `etcd` container and periodically creates backups, storing them on its local disk.

To enable `etcd-backup` in the RKE CLI, configure the following three variables:

```
services:
  etcd:
    backup: true
    creation: 5m0s
    retention: 24h
```

- `backup`: Enables or disables etcd backups in the RKE cluster.

  Default value: `false`.

- `creation`: Interval at which `etcd-backup` creates and stores local backups.

  Default value: `5m0s`

- `retention`: Time period before an etcd backup expires. Expired backups are purged.

  Default value: `24h`
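
The `creation` and `retention` periods above look like Go duration strings (the `etcd-backup` log further down echoes them back as `1m0s` and `24h0m0s`). A minimal sketch, assuming they are parsed with `time.ParseDuration`; this is an illustration, not RKE code:

```
package main

import (
	"fmt"
	"log"
	"time"
)

// parsePeriods validates backup period strings of the form "5m0s" or "24h".
func parsePeriods(creation, retention string) (time.Duration, time.Duration, error) {
	c, err := time.ParseDuration(creation)
	if err != nil {
		return 0, 0, fmt.Errorf("invalid creation period %q: %v", creation, err)
	}
	r, err := time.ParseDuration(retention)
	if err != nil {
		return 0, 0, fmt.Errorf("invalid retention period %q: %v", retention, err)
	}
	return c, r, nil
}

func main() {
	creation, retention, err := parsePeriods("5m0s", "24h")
	if err != nil {
		log.Fatal(err)
	}
	// Prints: creation=5m0s retention=24h0m0s
	fmt.Printf("creation=%s retention=%s\n", creation, retention)
}
```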

After RKE runs, view the `etcd-backup` logs to confirm backups are being created automatically:

```
# docker logs etcd-backup
time="2018-05-04T18:39:16Z" level=info msg="Initializing Rolling Backups" creation=1m0s retention=24h0m0s
time="2018-05-04T18:40:16Z" level=info msg="Created backup" name="2018-05-04T18:40:16Z_etcd" runtime=108.332814ms
time="2018-05-04T18:41:16Z" level=info msg="Created backup" name="2018-05-04T18:41:16Z_etcd" runtime=92.880112ms
time="2018-05-04T18:42:16Z" level=info msg="Created backup" name="2018-05-04T18:42:16Z_etcd" runtime=83.67642ms
time="2018-05-04T18:43:16Z" level=info msg="Created backup" name="2018-05-04T18:43:16Z_etcd" runtime=86.298499ms
```

Backups are saved to the following directory: `/opt/rke/etcdbackup/`. Backups are created on each node that runs etcd.
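
Presumably the rolling backup purges snapshots in this directory once they are older than the retention period. A rough, hypothetical sketch of such a purge step using only the standard library (not RKE's actual implementation):

```
package main

import (
	"fmt"
	"log"
	"os"
	"path/filepath"
	"time"
)

func main() {
	backupDir := "/opt/rke/etcdbackup" // directory used by etcd-backup
	retention := 24 * time.Hour        // matches the retention setting above
	cutoff := time.Now().Add(-retention)

	entries, err := os.ReadDir(backupDir)
	if err != nil {
		log.Fatal(err)
	}
	for _, e := range entries {
		info, err := e.Info()
		if err != nil {
			continue
		}
		// Remove backups whose modification time is older than the cutoff.
		if info.ModTime().Before(cutoff) {
			full := filepath.Join(backupDir, e.Name())
			fmt.Println("purging expired backup:", full)
			if err := os.Remove(full); err != nil {
				log.Printf("failed to remove %s: %v", full, err)
			}
		}
	}
}
```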

### Etcd One-time Snapshots

RKE also adds two commands for etcd backup management:

```
./rke etcd backup [NAME]
```

and

```
./rke etcd restore [NAME]
```

The backup command saves a snapshot of etcd in `/opt/rke/etcdbackup`. It runs a dedicated backup container on each etcd host and removes the container once the backup completes.
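
One way to observe that container lifecycle by hand is to wait for the one-time container and then remove it with the Docker CLI. The sketch below is illustrative only (RKE itself drives the Docker API through its `docker` package, as the `WaitForContainer` changes later in this commit show) and assumes the `etcd-backup-once` container name used elsewhere in the commit:

```
package main

import (
	"fmt"
	"log"
	"os/exec"
	"strconv"
	"strings"
)

func main() {
	name := "etcd-backup-once" // container name used by the one-time backup

	// `docker wait` blocks until the container exits and prints its exit code.
	out, err := exec.Command("docker", "wait", name).Output()
	if err != nil {
		log.Fatalf("waiting for %s: %v", name, err)
	}
	code, err := strconv.Atoi(strings.TrimSpace(string(out)))
	if err != nil {
		log.Fatalf("unexpected docker wait output %q: %v", out, err)
	}
	if code != 0 {
		log.Fatalf("backup container %s exited with code %d", name, code)
	}

	// Backup finished successfully; remove the one-time container.
	if err := exec.Command("docker", "rm", name).Run(); err != nil {
		log.Fatalf("removing %s: %v", name, err)
	}
	fmt.Println("backup complete, container removed")
}
```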

```
# ./rke etcd backup --name snapshot

INFO[0000] Starting Backup on etcd hosts
INFO[0000] [dialer] Setup tunnel for host [x.x.x.x]
INFO[0002] [dialer] Setup tunnel for host [y.y.y.y]
INFO[0004] [dialer] Setup tunnel for host [z.z.z.z]
INFO[0006] [etcd] Starting backup on host [x.x.x.x]
INFO[0007] [etcd] Successfully started [etcd-backup-once] container on host [x.x.x.x]
INFO[0007] [etcd] Starting backup on host [y.y.y.y]
INFO[0009] [etcd] Successfully started [etcd-backup-once] container on host [y.y.y.y]
INFO[0010] [etcd] Starting backup on host [z.z.z.z]
INFO[0011] [etcd] Successfully started [etcd-backup-once] container on host [z.z.z.z]
INFO[0011] Finished backup on all etcd hosts
```

### Etcd Disaster Recovery

`etcd restore` is used for etcd disaster recovery; it reverts the cluster to any snapshot stored in `/opt/rke/etcdbackup` that you explicitly specify. When you run `etcd restore`, RKE removes the old etcd containers if they still exist, then creates a new etcd cluster from the snapshot you chose.

> **Warning:** Restoring an etcd backup deletes your current etcd cluster and replaces it with a new one. Before you run the `etcd restore` command, back up any important data in your current cluster.

```
./rke etcd restore --name snapshot --config test-aws.yml
INFO[0000] Starting restore on etcd hosts
INFO[0000] [dialer] Setup tunnel for host [x.x.x.x]
INFO[0002] [dialer] Setup tunnel for host [y.y.y.y]
INFO[0005] [dialer] Setup tunnel for host [z.z.z.z]
INFO[0007] [hosts] Cleaning up host [x.x.x.x]
INFO[0007] [hosts] Running cleaner container on host [x.x.x.x]
INFO[0008] [kube-cleaner] Successfully started [kube-cleaner] container on host [x.x.x.x]
INFO[0008] [hosts] Removing cleaner container on host [x.x.x.x]
INFO[0008] [hosts] Successfully cleaned up host [x.x.x.x]
INFO[0009] [hosts] Cleaning up host [y.y.y.y]
INFO[0009] [hosts] Running cleaner container on host [y.y.y.y]
INFO[0010] [kube-cleaner] Successfully started [kube-cleaner] container on host [y.y.y.y]
INFO[0010] [hosts] Removing cleaner container on host [y.y.y.y]
INFO[0010] [hosts] Successfully cleaned up host [y.y.y.y]
INFO[0011] [hosts] Cleaning up host [z.z.z.z]
INFO[0011] [hosts] Running cleaner container on host [z.z.z.z]
INFO[0012] [kube-cleaner] Successfully started [kube-cleaner] container on host [z.z.z.z]
INFO[0012] [hosts] Removing cleaner container on host [z.z.z.z]
INFO[0012] [hosts] Successfully cleaned up host [z.z.z.z]
INFO[0012] [etcd] Restoring [snapshot] snapshot on etcd host [x.x.x.x]
INFO[0013] [etcd] Successfully started [etcd-restore] container on host [x.x.x.x]
INFO[0014] [etcd] Restoring [snapshot] snapshot on etcd host [y.y.y.y]
INFO[0015] [etcd] Successfully started [etcd-restore] container on host [y.y.y.y]
INFO[0015] [etcd] Restoring [snapshot] snapshot on etcd host [z.z.z.z]
INFO[0016] [etcd] Successfully started [etcd-restore] container on host [z.z.z.z]
INFO[0017] [etcd] Building up etcd plane..
INFO[0018] [etcd] Successfully started [etcd] container on host [x.x.x.x]
INFO[0020] [etcd] Successfully started [rke-log-linker] container on host [x.x.x.x]
INFO[0021] [remove/rke-log-linker] Successfully removed container on host [x.x.x.x]
INFO[0022] [etcd] Successfully started [etcd] container on host [y.y.y.y]
INFO[0023] [etcd] Successfully started [rke-log-linker] container on host [y.y.y.y]
INFO[0025] [remove/rke-log-linker] Successfully removed container on host [y.y.y.y]
INFO[0025] [etcd] Successfully started [etcd] container on host [z.z.z.z]
INFO[0027] [etcd] Successfully started [rke-log-linker] container on host [z.z.z.z]
INFO[0027] [remove/rke-log-linker] Successfully removed container on host [z.z.z.z]
INFO[0027] [etcd] Successfully started etcd plane..
INFO[0027] Finished restoring on all etcd hosts
```
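
Under the hood, the restore container runs `etcdctl snapshot restore` with an `--initial-cluster` string listing every etcd member (see `RestoreEtcdBackup` in `services/etcd.go` below). A rough sketch of how such a `name=peerURL` string could be assembled; the `etcdHost` type here is a simplified stand-in for RKE's host structure, not the real type:

```
package main

import (
	"fmt"
	"strings"
)

type etcdHost struct {
	HostnameOverride string // node name as known to RKE
	InternalAddress  string // address the peers use
}

// initialCluster joins every member as "etcd-<name>=https://<addr>:2380".
func initialCluster(hosts []etcdHost) string {
	parts := make([]string, 0, len(hosts))
	for _, h := range hosts {
		parts = append(parts, fmt.Sprintf("etcd-%s=https://%s:2380", h.HostnameOverride, h.InternalAddress))
	}
	return strings.Join(parts, ",")
}

func main() {
	hosts := []etcdHost{
		{HostnameOverride: "node1", InternalAddress: "x.x.x.x"},
		{HostnameOverride: "node2", InternalAddress: "y.y.y.y"},
	}
	// Prints: etcd-node1=https://x.x.x.x:2380,etcd-node2=https://y.y.y.y:2380
	fmt.Println(initialCluster(hosts))
}
```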

## License

Copyright (c) 2018 [Rancher Labs, Inc.](http://rancher.com)

@@ -228,3 +228,21 @@ func fetchBackupCertificates(ctx context.Context, backupHosts []*hosts.Host, kub
	// reporting the last error only.
	return nil, err
}

func fetchCertificatesFromEtcd(ctx context.Context, kubeCluster *Cluster) ([]byte, []byte, error) {
	// Get kubernetes certificates from the etcd hosts
	certificates := map[string]pki.CertificatePKI{}
	var err error
	for _, host := range kubeCluster.EtcdHosts {
		certificates, err = pki.FetchCertificatesFromHost(ctx, kubeCluster.EtcdHosts, host, kubeCluster.SystemImages.Alpine, kubeCluster.LocalKubeConfigPath, kubeCluster.PrivateRegistriesMap)
		if certificates != nil {
			break
		}
	}
	if err != nil || certificates == nil {
		return nil, nil, fmt.Errorf("Failed to fetch certificates from etcd hosts: %v", err)
	}
	clientCert := cert.EncodeCertPEM(certificates[pki.KubeNodeCertName].Certificate)
	clientkey := cert.EncodePrivateKeyPEM(certificates[pki.KubeNodeCertName].Key)
	return clientCert, clientkey, nil
}

@@ -77,7 +77,12 @@ func (c *Cluster) DeployControlPlane(ctx context.Context) error {
	if len(c.Services.Etcd.ExternalURLs) > 0 {
		log.Infof(ctx, "[etcd] External etcd connection string has been specified, skipping etcd plane")
	} else {
-		if err := services.RunEtcdPlane(ctx, c.EtcdHosts, etcdNodePlanMap, c.LocalConnDialerFactory, c.PrivateRegistriesMap, c.UpdateWorkersOnly, c.SystemImages.Alpine); err != nil {
+		etcdBackup := services.EtcdBackup{
+			Backup:    c.Services.Etcd.Backup,
+			Creation:  c.Services.Etcd.Creation,
+			Retention: c.Services.Etcd.Retention,
+		}
+		if err := services.RunEtcdPlane(ctx, c.EtcdHosts, etcdNodePlanMap, c.LocalConnDialerFactory, c.PrivateRegistriesMap, c.UpdateWorkersOnly, c.SystemImages.Alpine, etcdBackup); err != nil {
			return fmt.Errorf("[etcd] Failed to bring up Etcd Plane: %v", err)
		}
	}

@@ -28,6 +28,8 @@ const (
	DefaultNetworkCloudProvider = "none"

	DefaultIngressController         = "nginx"
+	DefaultEtcdBackupCreationPeriod  = "5m0s"
+	DefaultEtcdBackupRetentionPeriod = "24h"
)

func setDefaultIfEmptyMapValue(configMap map[string]string, key string, value string) {

@@ -105,6 +107,8 @@ func (c *Cluster) setClusterServicesDefaults() {
		&c.Services.Kubelet.Image:   c.SystemImages.Kubernetes,
		&c.Services.Kubeproxy.Image: c.SystemImages.Kubernetes,
		&c.Services.Etcd.Image:      c.SystemImages.Etcd,
+		&c.Services.Etcd.Creation:   DefaultEtcdBackupCreationPeriod,
+		&c.Services.Etcd.Retention:  DefaultEtcdBackupRetentionPeriod,
	}
	for k, v := range serviceConfigDefaultsMap {
		setDefaultIfEmpty(k, v)

63 cluster/etcd.go (new file)
@@ -0,0 +1,63 @@
package cluster

import (
	"context"
	"fmt"
	"path"

	"github.com/rancher/rke/docker"
	"github.com/rancher/rke/hosts"
	"github.com/rancher/rke/services"
	"github.com/rancher/types/apis/management.cattle.io/v3"
)

func (c *Cluster) BackupEtcd(ctx context.Context, backupName string) error {
	for _, host := range c.EtcdHosts {
		if err := services.RunEtcdBackup(ctx, host, c.PrivateRegistriesMap, c.SystemImages.Alpine, c.Services.Etcd.Creation, c.Services.Etcd.Retention, backupName, true); err != nil {
			return err
		}
	}
	return nil
}

func (c *Cluster) RestoreEtcdBackup(ctx context.Context, backupPath string) error {
	// Stopping all etcd containers
	for _, host := range c.EtcdHosts {
		if err := tearDownOldEtcd(ctx, host, c.SystemImages.Alpine, c.PrivateRegistriesMap); err != nil {
			return err
		}
	}
	// Start restore process on all etcd hosts
	initCluster := services.GetEtcdInitialCluster(c.EtcdHosts)
	for _, host := range c.EtcdHosts {
		if err := services.RestoreEtcdBackup(ctx, host, c.PrivateRegistriesMap, c.SystemImages.Etcd, backupPath, initCluster); err != nil {
			return fmt.Errorf("[etcd] Failed to restore etcd backup: %v", err)
		}
	}
	// Deploy Etcd Plane
	etcdNodePlanMap := make(map[string]v3.RKEConfigNodePlan)
	// Build etcd node plan map
	for _, etcdHost := range c.EtcdHosts {
		etcdNodePlanMap[etcdHost.Address] = BuildRKEConfigNodePlan(ctx, c, etcdHost, etcdHost.DockerInfo)
	}
	etcdBackup := services.EtcdBackup{
		Backup:    c.Services.Etcd.Backup,
		Creation:  c.Services.Etcd.Creation,
		Retention: c.Services.Etcd.Retention,
	}
	if err := services.RunEtcdPlane(ctx, c.EtcdHosts, etcdNodePlanMap, c.LocalConnDialerFactory, c.PrivateRegistriesMap, c.UpdateWorkersOnly, c.SystemImages.Alpine, etcdBackup); err != nil {
		return fmt.Errorf("[etcd] Failed to bring up Etcd Plane: %v", err)
	}
	return nil
}

func tearDownOldEtcd(ctx context.Context, host *hosts.Host, cleanupImage string, prsMap map[string]v3.PrivateRegistry) error {
	if err := docker.DoRemoveContainer(ctx, host.DClient, services.EtcdContainerName, host.Address); err != nil {
		return fmt.Errorf("[etcd] Failed to stop old etcd containers: %v", err)
	}
	// cleanup etcd data directory
	toCleanPaths := []string{
		path.Join(host.PrefixPath, hosts.ToCleanEtcdDir),
	}
	return host.CleanUp(ctx, toCleanPaths, cleanupImage, prsMap)
}

@@ -593,7 +593,7 @@ func (c *Cluster) BuildEtcdProcess(host *hosts.Host, etcdHosts []*hosts.Host, pr
	}

	Binds := []string{
-		fmt.Sprintf("%s:/var/lib/rancher/etcd:z", path.Join(prefixPath, "/var/lib/etcd")),
+		fmt.Sprintf("%s:/var/lib/rancher/:z", path.Join(prefixPath, "/var/lib/")),
		fmt.Sprintf("%s:/etc/kubernetes:z", path.Join(prefixPath, "/etc/kubernetes")),
	}

136 cmd/etcd.go (new file)
@@ -0,0 +1,136 @@
package cmd

import (
	"context"
	"fmt"

	"github.com/rancher/rke/cluster"
	"github.com/rancher/rke/hosts"
	"github.com/rancher/rke/log"
	"github.com/rancher/rke/pki"
	"github.com/rancher/types/apis/management.cattle.io/v3"
	"github.com/urfave/cli"
)

func EtcdCommand() cli.Command {
	backupRestoreFlags := []cli.Flag{
		cli.StringFlag{
			Name:  "name",
			Usage: "Specify Backup name",
		},
		cli.StringFlag{
			Name:   "config",
			Usage:  "Specify an alternate cluster YAML file",
			Value:  pki.ClusterConfig,
			EnvVar: "RKE_CONFIG",
		},
	}

	backupRestoreFlags = append(backupRestoreFlags, sshCliOptions...)

	return cli.Command{
		Name:  "etcd",
		Usage: "etcd backup/restore operations in k8s cluster",
		Subcommands: []cli.Command{
			{
				Name:   "backup",
				Usage:  "Take backup on all etcd hosts",
				Flags:  backupRestoreFlags,
				Action: BackupEtcdHostsFromCli,
			},
			{
				Name:   "restore",
				Usage:  "Restore existing backup",
				Flags:  backupRestoreFlags,
				Action: RestoreEtcdBackupFromCli,
			},
		},
	}
}

func BackupEtcdHosts(
	ctx context.Context,
	rkeConfig *v3.RancherKubernetesEngineConfig,
	dockerDialerFactory hosts.DialerFactory,
	configDir, backupName string) error {

	log.Infof(ctx, "Starting Backup on etcd hosts")
	kubeCluster, err := cluster.ParseCluster(ctx, rkeConfig, clusterFilePath, configDir, dockerDialerFactory, nil, nil)
	if err != nil {
		return err
	}

	if err := kubeCluster.TunnelHosts(ctx, false); err != nil {
		return err
	}
	if err := kubeCluster.BackupEtcd(ctx, backupName); err != nil {
		return err
	}

	log.Infof(ctx, "Finished backup on all etcd hosts")
	return nil
}

func RestoreEtcdBackup(
	ctx context.Context,
	rkeConfig *v3.RancherKubernetesEngineConfig,
	dockerDialerFactory hosts.DialerFactory,
	configDir, backupName string) error {

	log.Infof(ctx, "Starting restore on etcd hosts")
	kubeCluster, err := cluster.ParseCluster(ctx, rkeConfig, clusterFilePath, configDir, dockerDialerFactory, nil, nil)
	if err != nil {
		return err
	}

	if err := kubeCluster.TunnelHosts(ctx, false); err != nil {
		return err
	}
	if err := kubeCluster.RestoreEtcdBackup(ctx, backupName); err != nil {
		return err
	}

	log.Infof(ctx, "Finished restoring on all etcd hosts")
	return nil
}

func BackupEtcdHostsFromCli(ctx *cli.Context) error {
	clusterFile, filePath, err := resolveClusterFile(ctx)
	if err != nil {
		return fmt.Errorf("Failed to resolve cluster file: %v", err)
	}
	clusterFilePath = filePath

	rkeConfig, err := cluster.ParseConfig(clusterFile)
	if err != nil {
		return fmt.Errorf("Failed to parse cluster file: %v", err)
	}

	rkeConfig, err = setOptionsFromCLI(ctx, rkeConfig)
	if err != nil {
		return err
	}

	return BackupEtcdHosts(context.Background(), rkeConfig, nil, "", ctx.String("name"))
}

func RestoreEtcdBackupFromCli(ctx *cli.Context) error {
	clusterFile, filePath, err := resolveClusterFile(ctx)
	if err != nil {
		return fmt.Errorf("Failed to resolve cluster file: %v", err)
	}
	clusterFilePath = filePath

	rkeConfig, err := cluster.ParseConfig(clusterFile)
	if err != nil {
		return fmt.Errorf("Failed to parse cluster file: %v", err)
	}

	rkeConfig, err = setOptionsFromCLI(ctx, rkeConfig)
	if err != nil {
		return err
	}

	return RestoreEtcdBackup(context.Background(), rkeConfig, nil, "", ctx.String("name"))
}

@@ -266,23 +266,27 @@ func StopRenameContainer(ctx context.Context, dClient *client.Client, hostname s
	if err := StopContainer(ctx, dClient, hostname, oldContainerName); err != nil {
		return err
	}
-	if err := WaitForContainer(ctx, dClient, hostname, oldContainerName); err != nil {
+	if _, err := WaitForContainer(ctx, dClient, hostname, oldContainerName); err != nil {
		return nil
	}
	return RenameContainer(ctx, dClient, hostname, oldContainerName, newContainerName)

}

-func WaitForContainer(ctx context.Context, dClient *client.Client, hostname string, containerName string) error {
+func WaitForContainer(ctx context.Context, dClient *client.Client, hostname string, containerName string) (int64, error) {
+	// We capture the status exit code of the container
	statusCh, errCh := dClient.ContainerWait(ctx, containerName, container.WaitConditionNotRunning)
	select {
	case err := <-errCh:
		if err != nil {
-			return fmt.Errorf("Error waiting for container [%s] on host [%s]: %v", containerName, hostname, err)
+			// if error is present return 1 exit code
+			return 1, fmt.Errorf("Error waiting for container [%s] on host [%s]: %v", containerName, hostname, err)
		}
-	case <-statusCh:
+	case status := <-statusCh:
+		// return the status exit code of the container
+		return status.StatusCode, nil
	}
-	return nil
+	return 0, nil
}

func IsContainerUpgradable(ctx context.Context, dClient *client.Client, imageCfg *container.Config, containerName string, hostname string, plane string) (bool, error) {

@@ -4,6 +4,7 @@ import (
	"context"
	"fmt"
	"path"
+	"path/filepath"

	"github.com/docker/docker/api/types"
	"github.com/docker/docker/api/types/container"

@@ -58,6 +59,7 @@ func (h *Host) CleanUpAll(ctx context.Context, cleanerImage string, prsMap map[s
		path.Join(h.PrefixPath, ToCleanTempCertPath),
		path.Join(h.PrefixPath, ToCleanCNILib),
	}

	if !externalEtcd {
		toCleanPaths = append(toCleanPaths, path.Join(h.PrefixPath, ToCleanEtcdDir))
	}

@@ -116,7 +118,7 @@ func (h *Host) CleanUp(ctx context.Context, toCleanPaths []string, cleanerImage
		return err
	}

-	if err := docker.WaitForContainer(ctx, h.DClient, h.Address, CleanerContainerName); err != nil {
+	if _, err := docker.WaitForContainer(ctx, h.DClient, h.Address, CleanerContainerName); err != nil {
		return err
	}

@@ -240,8 +242,12 @@ func buildCleanerConfig(host *Host, toCleanDirs []string, cleanerImage string) (
		Cmd: cmd,
	}
	bindMounts := []string{}
+	bindMountsMap := make(map[string]string)
	for _, vol := range toCleanDirs {
-		bindMounts = append(bindMounts, fmt.Sprintf("%s:%s:z", vol, vol))
+		bindMountsMap[filepath.Dir(vol)] = vol
	}
+	for dir := range bindMountsMap {
+		bindMounts = append(bindMounts, fmt.Sprintf("%s:%s:z", dir, dir))
+	}
	hostCfg := &container.HostConfig{
		Binds: bindMounts,

1 main.go
@@ -34,6 +34,7 @@ func mainErr() error {
		cmd.RemoveCommand(),
		cmd.VersionCommand(),
		cmd.ConfigCommand(),
+		cmd.EtcdCommand(),
	}
	app.Flags = []cli.Flag{
		cli.BoolFlag{

@@ -134,7 +134,7 @@ func FetchCertificatesFromHost(ctx context.Context, extraHosts []*hosts.Host, ho

	for certName, config := range crtList {
		certificate := CertificatePKI{}
-		crt, err := fetchFileFromHost(ctx, GetCertTempPath(certName), image, host, prsMap)
+		crt, err := FetchFileFromHost(ctx, GetCertTempPath(certName), image, host, prsMap)
		// I will only exit with an error if it's not a not-found-error and this is not an etcd certificate
		if err != nil && !strings.HasPrefix(certName, "kube-etcd") {
			if strings.Contains(err.Error(), "no such file or directory") ||

@@ -149,10 +149,10 @@ func FetchCertificatesFromHost(ctx context.Context, extraHosts []*hosts.Host, ho
			tmpCerts[certName] = CertificatePKI{}
			continue
		}
-		key, err := fetchFileFromHost(ctx, GetKeyTempPath(certName), image, host, prsMap)
+		key, err := FetchFileFromHost(ctx, GetKeyTempPath(certName), image, host, prsMap)

		if config {
-			config, err := fetchFileFromHost(ctx, GetConfigTempPath(certName), image, host, prsMap)
+			config, err := FetchFileFromHost(ctx, GetConfigTempPath(certName), image, host, prsMap)
			if err != nil {
				return nil, err
			}

@@ -179,7 +179,7 @@ func FetchCertificatesFromHost(ctx context.Context, extraHosts []*hosts.Host, ho

}

-func fetchFileFromHost(ctx context.Context, filePath, image string, host *hosts.Host, prsMap map[string]v3.PrivateRegistry) (string, error) {
+func FetchFileFromHost(ctx context.Context, filePath, image string, host *hosts.Host, prsMap map[string]v3.PrivateRegistry) (string, error) {

	imageCfg := &container.Config{
		Image: image,

109 services/etcd.go
@@ -2,21 +2,42 @@ package services

import (
	"fmt"
	"path"
+	"path/filepath"
	"strings"
	"time"

	"context"

	etcdclient "github.com/coreos/etcd/client"
	"github.com/docker/docker/api/types/container"
	"github.com/pkg/errors"
	"github.com/rancher/rke/docker"
	"github.com/rancher/rke/hosts"
	"github.com/rancher/rke/log"
+	"github.com/rancher/rke/pki"
	"github.com/rancher/types/apis/management.cattle.io/v3"
	"github.com/sirupsen/logrus"
)

-func RunEtcdPlane(ctx context.Context, etcdHosts []*hosts.Host, etcdNodePlanMap map[string]v3.RKEConfigNodePlan, localConnDialerFactory hosts.DialerFactory, prsMap map[string]v3.PrivateRegistry, updateWorkersOnly bool, alpineImage string) error {
+type EtcdBackup struct {
+	// Enable or disable backup creation
+	Backup bool
+	// Creation period of the etcd backups
+	Creation string
+	// Retention period of the etcd backups
+	Retention string
+}
+
+func RunEtcdPlane(
+	ctx context.Context,
+	etcdHosts []*hosts.Host,
+	etcdNodePlanMap map[string]v3.RKEConfigNodePlan,
+	localConnDialerFactory hosts.DialerFactory,
+	prsMap map[string]v3.PrivateRegistry,
+	updateWorkersOnly bool,
+	alpineImage string,
+	etcdBackup EtcdBackup) error {
	log.Infof(ctx, "[%s] Building up etcd plane..", ETCDRole)
	for _, host := range etcdHosts {
		if updateWorkersOnly {

@@ -27,6 +48,11 @@ func RunEtcdPlane(ctx context.Context, etcdHosts []*hosts.Host, etcdNodePlanMap
		if err := docker.DoRunContainer(ctx, host.DClient, imageCfg, hostCfg, EtcdContainerName, host.Address, ETCDRole, prsMap); err != nil {
			return err
		}
+		if etcdBackup.Backup {
+			if err := RunEtcdBackup(ctx, host, prsMap, alpineImage, etcdBackup.Creation, etcdBackup.Retention, EtcdBackupContainerName, false); err != nil {
+				return err
+			}
+		}
		if err := createLogLink(ctx, host, EtcdContainerName, ETCDRole, alpineImage, prsMap); err != nil {
			return err
		}

@@ -186,3 +212,84 @@ func IsEtcdMember(ctx context.Context, etcdHost *hosts.Host, etcdHosts []*hosts.
	}
	return false, nil
}

func RunEtcdBackup(ctx context.Context, etcdHost *hosts.Host, prsMap map[string]v3.PrivateRegistry, etcdBackupImage string, creation, retention, name string, once bool) error {
	log.Infof(ctx, "[etcd] Starting backup on host [%s]", etcdHost.Address)
	imageCfg := &container.Config{
		Cmd: []string{
			"/opt/rke/rke-etcd-backup",
			"rolling-backup",
			"--cacert", pki.GetCertPath(pki.CACertName),
			"--cert", pki.GetCertPath(pki.KubeNodeCertName),
			"--key", pki.GetKeyPath(pki.KubeNodeCertName),
			"--name", name,
		},
		Image: etcdBackupImage,
	}
	if once {
		imageCfg.Cmd = append(imageCfg.Cmd, "--once")
	}
	if !once {
		imageCfg.Cmd = append(imageCfg.Cmd, "--retention="+retention)
		imageCfg.Cmd = append(imageCfg.Cmd, "--creation="+creation)
	}
	hostCfg := &container.HostConfig{
		Binds: []string{
			"/opt/rke/etcdbackup:/backup",
			fmt.Sprintf("%s:/etc/kubernetes:z", path.Join(etcdHost.PrefixPath, "/etc/kubernetes"))},
		NetworkMode: container.NetworkMode("host"),
	}

	if once {
		if err := docker.DoRunContainer(ctx, etcdHost.DClient, imageCfg, hostCfg, EtcdBackupOnceContainerName, etcdHost.Address, ETCDRole, prsMap); err != nil {
			return err
		}
		status, err := docker.WaitForContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdBackupOnceContainerName)
		if status != 0 || err != nil {
			return fmt.Errorf("Failed to take etcd backup, exit code [%d]: %v", status, err)
		}
		return docker.RemoveContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdBackupOnceContainerName)
	}
	return docker.DoRunContainer(ctx, etcdHost.DClient, imageCfg, hostCfg, EtcdBackupContainerName, etcdHost.Address, ETCDRole, prsMap)
}

func RestoreEtcdBackup(ctx context.Context, etcdHost *hosts.Host, prsMap map[string]v3.PrivateRegistry, etcdRestoreImage, backupName, initCluster string) error {
	log.Infof(ctx, "[etcd] Restoring [%s] snapshot on etcd host [%s]", backupName, etcdHost.Address)
	nodeName := pki.GetEtcdCrtName(etcdHost.InternalAddress)
	backupPath := filepath.Join("/backup/", backupName)

	imageCfg := &container.Config{
		Cmd: []string{
			"/usr/local/bin/etcdctl",
			"--cacert", pki.GetCertPath(pki.CACertName),
			"--cert", pki.GetCertPath(nodeName),
			"--key", pki.GetKeyPath(nodeName),
			"snapshot", "restore", backupPath,
			"--data-dir=/var/lib/rancher/etcd",
			"--name=etcd-" + etcdHost.HostnameOverride,
			"--initial-cluster=" + initCluster,
			"--initial-cluster-token=etcd-cluster-1",
			"--initial-advertise-peer-urls=https://" + etcdHost.InternalAddress + ":2380",
		},
		Env:   []string{"ETCDCTL_API=3"},
		Image: etcdRestoreImage,
	}
	hostCfg := &container.HostConfig{
		Binds: []string{
			"/opt/rke/etcdbackup:/backup:z",
			fmt.Sprintf("%s:/var/lib/rancher/:z", path.Join(etcdHost.PrefixPath, "/var/lib/")),
			fmt.Sprintf("%s:/etc/kubernetes:z", path.Join(etcdHost.PrefixPath, "/etc/kubernetes"))},
		NetworkMode: container.NetworkMode("host"),
	}
	if err := docker.DoRunContainer(ctx, etcdHost.DClient, imageCfg, hostCfg, EtcdRestoreContainerName, etcdHost.Address, ETCDRole, prsMap); err != nil {
		return err
	}
	status, err := docker.WaitForContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdRestoreContainerName)
	if err != nil {
		return err
	}
	if status != 0 {
		return fmt.Errorf("Failed to run etcd restore container, exit status is: %d", status)
	}
	return docker.RemoveContainer(ctx, etcdHost.DClient, etcdHost.Address, EtcdRestoreContainerName)
}

@@ -27,6 +27,9 @@ const (
	KubeControllerContainerName = "kube-controller-manager"
	SchedulerContainerName      = "kube-scheduler"
	EtcdContainerName           = "etcd"
+	EtcdBackupContainerName     = "etcd-backup"
+	EtcdBackupOnceContainerName = "etcd-backup-once"
+	EtcdRestoreContainerName    = "etcd-restore"
	NginxProxyContainerName     = "nginx-proxy"
	SidekickContainerName       = "service-sidekick"
	LogLinkContainerName        = "rke-log-linker"