diff --git a/cluster/state.go b/cluster/state.go index 0ab032d0..4be174a1 100644 --- a/cluster/state.go +++ b/cluster/state.go @@ -4,8 +4,10 @@ import ( "context" "fmt" "os" + "path" "time" + "github.com/rancher/rke/hosts" "github.com/rancher/rke/k8s" "github.com/rancher/rke/log" "github.com/rancher/rke/pki" @@ -30,7 +32,13 @@ func (c *Cluster) SaveClusterState(ctx context.Context, rkeConfig *v3.RancherKub } err = saveStateToKubernetes(ctx, c.KubeClient, c.LocalKubeConfigPath, rkeConfig) if err != nil { - return fmt.Errorf("[state] Failed to save configuration state: %v", err) + return fmt.Errorf("[state] Failed to save configuration state to k8s: %v", err) + } + // save state to the cluster nodes as a backup + uniqueHosts := hosts.GetUniqueHostList(c.EtcdHosts, c.ControlPlaneHosts, c.WorkerHosts) + err = saveStateToNodes(ctx, uniqueHosts, rkeConfig, c.SystemImages.Alpine, c.PrivateRegistriesMap) + if err != nil { + return fmt.Errorf("[state] Failed to save configuration state to nodes: %v", err) } } return nil @@ -58,7 +66,12 @@ func (c *Cluster) GetClusterState(ctx context.Context) (*Cluster, error) { return nil, nil } // Get previous kubernetes state - currentCluster = getStateFromKubernetes(ctx, c.KubeClient, c.LocalKubeConfigPath) + currentCluster, err = getStateFromKubernetes(ctx, c.KubeClient, c.LocalKubeConfigPath) + if err != nil { + // attempting to fetch state from nodes + uniqueHosts := hosts.GetUniqueHostList(c.EtcdHosts, c.ControlPlaneHosts, c.WorkerHosts) + currentCluster = getStateFromNodes(ctx, uniqueHosts, c.SystemImages.Alpine, c.PrivateRegistriesMap) + } // Get previous kubernetes certificates if currentCluster != nil { if err := currentCluster.InvertIndexHosts(); err != nil { @@ -69,11 +82,19 @@ func (c *Cluster) GetClusterState(ctx context.Context) (*Cluster, error) { activeEtcdHosts = removeFromHosts(inactiveHost, activeEtcdHosts) } currentCluster.Certificates, err = getClusterCerts(ctx, c.KubeClient, activeEtcdHosts) + // if getting certificates from k8s failed then we attempt to fetch the backup certs + if err != nil { + backupHosts := hosts.GetUniqueHostList(c.EtcdHosts, c.ControlPlaneHosts, nil) + currentCluster.Certificates, err = fetchBackupCertificates(ctx, backupHosts, c) + if err != nil { + return nil, fmt.Errorf("Failed to Get Kubernetes certificates: %v", err) + } + if currentCluster.Certificates != nil { + log.Infof(ctx, "[certificates] Certificate backup found on backup hosts") + } + } currentCluster.DockerDialerFactory = c.DockerDialerFactory currentCluster.LocalConnDialerFactory = c.LocalConnDialerFactory - if err != nil { - return nil, fmt.Errorf("Failed to Get Kubernetes certificates: %v", err) - } // make sure I have all the etcd certs, We need handle dialer failure for etcd nodes https://github.com/rancher/rancher/issues/12898 for _, host := range activeEtcdHosts { @@ -128,7 +149,21 @@ func saveStateToKubernetes(ctx context.Context, kubeClient *kubernetes.Clientset } } -func getStateFromKubernetes(ctx context.Context, kubeClient *kubernetes.Clientset, kubeConfigPath string) *Cluster { +func saveStateToNodes(ctx context.Context, uniqueHosts []*hosts.Host, clusterState *v3.RancherKubernetesEngineConfig, alpineImage string, prsMap map[string]v3.PrivateRegistry) error { + log.Infof(ctx, "[state] Saving cluster state to cluster nodes") + clusterFile, err := yaml.Marshal(*clusterState) + if err != nil { + return err + } + for _, host := range uniqueHosts { + if err := pki.DeployStateOnPlaneHost(ctx, host, alpineImage, prsMap, string(clusterFile)); err != nil { + return err + } + } + return nil +} + +func getStateFromKubernetes(ctx context.Context, kubeClient *kubernetes.Clientset, kubeConfigPath string) (*Cluster, error) { log.Infof(ctx, "[state] Fetching cluster state from Kubernetes") var cfgMap *v1.ConfigMap var currentCluster Cluster @@ -151,13 +186,38 @@ func getStateFromKubernetes(ctx context.Context, kubeClient *kubernetes.Clientse clusterData := cfgMap.Data[StateConfigMapName] err := yaml.Unmarshal([]byte(clusterData), ¤tCluster) if err != nil { - return nil + return nil, fmt.Errorf("Failed to unmarshal cluster data") } - return ¤tCluster + return ¤tCluster, nil case <-time.After(time.Second * GetStateTimeout): log.Infof(ctx, "Timed out waiting for kubernetes cluster to get state") + return nil, fmt.Errorf("Timeout waiting for kubernetes cluster to get state") + } +} + +func getStateFromNodes(ctx context.Context, uniqueHosts []*hosts.Host, alpineImage string, prsMap map[string]v3.PrivateRegistry) *Cluster { + log.Infof(ctx, "[state] Fetching cluster state from Nodes") + var currentCluster Cluster + var clusterFile string + var err error + + for _, host := range uniqueHosts { + filePath := path.Join(host.PrefixPath, pki.TempCertPath, pki.ClusterStateFile) + clusterFile, err = pki.FetchFileFromHost(ctx, filePath, alpineImage, host, prsMap, pki.StateDeployerContainerName, "state") + if err == nil { + break + } + } + if len(clusterFile) == 0 { return nil } + err = yaml.Unmarshal([]byte(clusterFile), ¤tCluster) + if err != nil { + logrus.Debugf("[state] Failed to unmarshal the cluster file fetched from nodes: %v", err) + return nil + } + return ¤tCluster + } func GetK8sVersion(localConfigPath string, k8sWrapTransport k8s.WrapTransport) (string, error) { diff --git a/pki/constants.go b/pki/constants.go index 452bc5a6..20026517 100644 --- a/pki/constants.go +++ b/pki/constants.go @@ -8,6 +8,8 @@ const ( CertificatesSecretName = "k8s-certs" TempCertPath = "/etc/kubernetes/.tmp/" ClusterConfig = "cluster.yml" + ClusterStateFile = "cluster-state.yml" + ClusterStateEnv = "CLUSTER_STATE" BundleCertPath = "/backup/pki.bundle.tar.gz" CACertName = "kube-ca" diff --git a/pki/deploy.go b/pki/deploy.go index 7c3fa32d..a544ccff 100644 --- a/pki/deploy.go +++ b/pki/deploy.go @@ -20,6 +20,10 @@ import ( "k8s.io/client-go/util/cert" ) +const ( + StateDeployerContainerName = "cluster-state-deployer" +) + func DeployCertificatesOnPlaneHost(ctx context.Context, host *hosts.Host, rkeConfig v3.RancherKubernetesEngineConfig, crtMap map[string]CertificatePKI, certDownloaderImage string, prsMap map[string]v3.PrivateRegistry) error { crtBundle := GenerateRKENodeCerts(ctx, rkeConfig, host.Address, crtMap) env := []string{} @@ -29,6 +33,38 @@ func DeployCertificatesOnPlaneHost(ctx context.Context, host *hosts.Host, rkeCon return doRunDeployer(ctx, host, env, certDownloaderImage, prsMap) } +func DeployStateOnPlaneHost(ctx context.Context, host *hosts.Host, stateDownloaderImage string, prsMap map[string]v3.PrivateRegistry, clusterState string) error { + // remove existing container. Only way it's still here is if previous deployment failed + if err := docker.DoRemoveContainer(ctx, host.DClient, StateDeployerContainerName, host.Address); err != nil { + return err + } + containerEnv := []string{ClusterStateEnv + "=" + clusterState} + ClusterStateFilePath := path.Join(host.PrefixPath, TempCertPath, ClusterStateFile) + imageCfg := &container.Config{ + Image: stateDownloaderImage, + Cmd: []string{ + "sh", + "-c", + fmt.Sprintf("t=$(mktemp); echo -e \"$%s\" > $t && mv $t %s && chmod 644 %s", ClusterStateEnv, ClusterStateFilePath, ClusterStateFilePath), + }, + Env: containerEnv, + } + hostCfg := &container.HostConfig{ + Binds: []string{ + fmt.Sprintf("%s:/etc/kubernetes:z", path.Join(host.PrefixPath, "/etc/kubernetes")), + }, + Privileged: true, + } + if err := docker.DoRunContainer(ctx, host.DClient, imageCfg, hostCfg, StateDeployerContainerName, host.Address, "state", prsMap); err != nil { + return err + } + if err := docker.DoRemoveContainer(ctx, host.DClient, StateDeployerContainerName, host.Address); err != nil { + return err + } + logrus.Debugf("[state] Successfully started state deployer container on node [%s]", host.Address) + return nil +} + func doRunDeployer(ctx context.Context, host *hosts.Host, containerEnv []string, certDownloaderImage string, prsMap map[string]v3.PrivateRegistry) error { // remove existing container. Only way it's still here is if previous deployment failed isRunning := false @@ -135,7 +171,7 @@ func FetchCertificatesFromHost(ctx context.Context, extraHosts []*hosts.Host, ho for certName, config := range crtList { certificate := CertificatePKI{} - crt, err := FetchFileFromHost(ctx, GetCertTempPath(certName), image, host, prsMap) + crt, err := FetchFileFromHost(ctx, GetCertTempPath(certName), image, host, prsMap, CertFetcherContainer, "certificates") // I will only exit with an error if it's not a not-found-error and this is not an etcd certificate if err != nil && (!strings.HasPrefix(certName, "kube-etcd") && !strings.Contains(certName, APIProxyClientCertName) && @@ -154,10 +190,10 @@ func FetchCertificatesFromHost(ctx context.Context, extraHosts []*hosts.Host, ho tmpCerts[certName] = CertificatePKI{} continue } - key, err := FetchFileFromHost(ctx, GetKeyTempPath(certName), image, host, prsMap) + key, err := FetchFileFromHost(ctx, GetKeyTempPath(certName), image, host, prsMap, CertFetcherContainer, "certificate") if config { - config, err := FetchFileFromHost(ctx, GetConfigTempPath(certName), image, host, prsMap) + config, err := FetchFileFromHost(ctx, GetConfigTempPath(certName), image, host, prsMap, CertFetcherContainer, "certificate") if err != nil { return nil, err } @@ -184,7 +220,7 @@ func FetchCertificatesFromHost(ctx context.Context, extraHosts []*hosts.Host, ho } -func FetchFileFromHost(ctx context.Context, filePath, image string, host *hosts.Host, prsMap map[string]v3.PrivateRegistry) (string, error) { +func FetchFileFromHost(ctx context.Context, filePath, image string, host *hosts.Host, prsMap map[string]v3.PrivateRegistry, containerName, state string) (string, error) { imageCfg := &container.Config{ Image: image, @@ -195,16 +231,16 @@ func FetchFileFromHost(ctx context.Context, filePath, image string, host *hosts. }, Privileged: true, } - isRunning, err := docker.IsContainerRunning(ctx, host.DClient, host.Address, CertFetcherContainer, true) + isRunning, err := docker.IsContainerRunning(ctx, host.DClient, host.Address, containerName, true) if err != nil { return "", err } if !isRunning { - if err := docker.DoRunContainer(ctx, host.DClient, imageCfg, hostCfg, CertFetcherContainer, host.Address, "certificates", prsMap); err != nil { + if err := docker.DoRunContainer(ctx, host.DClient, imageCfg, hostCfg, containerName, host.Address, state, prsMap); err != nil { return "", err } } - file, err := docker.ReadFileFromContainer(ctx, host.DClient, host.Address, CertFetcherContainer, filePath) + file, err := docker.ReadFileFromContainer(ctx, host.DClient, host.Address, containerName, filePath) if err != nil { return "", err }