mirror of
https://github.com/rancher/rke.git
synced 2025-09-05 00:40:10 +00:00
Save state to the nodes backup path
This commit is contained in:
committed by
Alena Prokharchyk
parent
00e317250d
commit
925df98ea6
@@ -4,8 +4,10 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
"path"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/rancher/rke/hosts"
|
||||||
"github.com/rancher/rke/k8s"
|
"github.com/rancher/rke/k8s"
|
||||||
"github.com/rancher/rke/log"
|
"github.com/rancher/rke/log"
|
||||||
"github.com/rancher/rke/pki"
|
"github.com/rancher/rke/pki"
|
||||||
@@ -30,7 +32,13 @@ func (c *Cluster) SaveClusterState(ctx context.Context, rkeConfig *v3.RancherKub
|
|||||||
}
|
}
|
||||||
err = saveStateToKubernetes(ctx, c.KubeClient, c.LocalKubeConfigPath, rkeConfig)
|
err = saveStateToKubernetes(ctx, c.KubeClient, c.LocalKubeConfigPath, rkeConfig)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("[state] Failed to save configuration state: %v", err)
|
return fmt.Errorf("[state] Failed to save configuration state to k8s: %v", err)
|
||||||
|
}
|
||||||
|
// save state to the cluster nodes as a backup
|
||||||
|
uniqueHosts := hosts.GetUniqueHostList(c.EtcdHosts, c.ControlPlaneHosts, c.WorkerHosts)
|
||||||
|
err = saveStateToNodes(ctx, uniqueHosts, rkeConfig, c.SystemImages.Alpine, c.PrivateRegistriesMap)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("[state] Failed to save configuration state to nodes: %v", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
@@ -58,7 +66,12 @@ func (c *Cluster) GetClusterState(ctx context.Context) (*Cluster, error) {
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
// Get previous kubernetes state
|
// Get previous kubernetes state
|
||||||
currentCluster = getStateFromKubernetes(ctx, c.KubeClient, c.LocalKubeConfigPath)
|
currentCluster, err = getStateFromKubernetes(ctx, c.KubeClient, c.LocalKubeConfigPath)
|
||||||
|
if err != nil {
|
||||||
|
// attempting to fetch state from nodes
|
||||||
|
uniqueHosts := hosts.GetUniqueHostList(c.EtcdHosts, c.ControlPlaneHosts, c.WorkerHosts)
|
||||||
|
currentCluster = getStateFromNodes(ctx, uniqueHosts, c.SystemImages.Alpine, c.PrivateRegistriesMap)
|
||||||
|
}
|
||||||
// Get previous kubernetes certificates
|
// Get previous kubernetes certificates
|
||||||
if currentCluster != nil {
|
if currentCluster != nil {
|
||||||
if err := currentCluster.InvertIndexHosts(); err != nil {
|
if err := currentCluster.InvertIndexHosts(); err != nil {
|
||||||
@@ -69,11 +82,19 @@ func (c *Cluster) GetClusterState(ctx context.Context) (*Cluster, error) {
|
|||||||
activeEtcdHosts = removeFromHosts(inactiveHost, activeEtcdHosts)
|
activeEtcdHosts = removeFromHosts(inactiveHost, activeEtcdHosts)
|
||||||
}
|
}
|
||||||
currentCluster.Certificates, err = getClusterCerts(ctx, c.KubeClient, activeEtcdHosts)
|
currentCluster.Certificates, err = getClusterCerts(ctx, c.KubeClient, activeEtcdHosts)
|
||||||
currentCluster.DockerDialerFactory = c.DockerDialerFactory
|
// if getting certificates from k8s failed then we attempt to fetch the backup certs
|
||||||
currentCluster.LocalConnDialerFactory = c.LocalConnDialerFactory
|
if err != nil {
|
||||||
|
backupHosts := hosts.GetUniqueHostList(c.EtcdHosts, c.ControlPlaneHosts, nil)
|
||||||
|
currentCluster.Certificates, err = fetchBackupCertificates(ctx, backupHosts, c)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("Failed to Get Kubernetes certificates: %v", err)
|
return nil, fmt.Errorf("Failed to Get Kubernetes certificates: %v", err)
|
||||||
}
|
}
|
||||||
|
if currentCluster.Certificates != nil {
|
||||||
|
log.Infof(ctx, "[certificates] Certificate backup found on backup hosts")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
currentCluster.DockerDialerFactory = c.DockerDialerFactory
|
||||||
|
currentCluster.LocalConnDialerFactory = c.LocalConnDialerFactory
|
||||||
|
|
||||||
// make sure I have all the etcd certs, We need handle dialer failure for etcd nodes https://github.com/rancher/rancher/issues/12898
|
// make sure I have all the etcd certs, We need handle dialer failure for etcd nodes https://github.com/rancher/rancher/issues/12898
|
||||||
for _, host := range activeEtcdHosts {
|
for _, host := range activeEtcdHosts {
|
||||||
@@ -128,7 +149,21 @@ func saveStateToKubernetes(ctx context.Context, kubeClient *kubernetes.Clientset
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func getStateFromKubernetes(ctx context.Context, kubeClient *kubernetes.Clientset, kubeConfigPath string) *Cluster {
|
func saveStateToNodes(ctx context.Context, uniqueHosts []*hosts.Host, clusterState *v3.RancherKubernetesEngineConfig, alpineImage string, prsMap map[string]v3.PrivateRegistry) error {
|
||||||
|
log.Infof(ctx, "[state] Saving cluster state to cluster nodes")
|
||||||
|
clusterFile, err := yaml.Marshal(*clusterState)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, host := range uniqueHosts {
|
||||||
|
if err := pki.DeployStateOnPlaneHost(ctx, host, alpineImage, prsMap, string(clusterFile)); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func getStateFromKubernetes(ctx context.Context, kubeClient *kubernetes.Clientset, kubeConfigPath string) (*Cluster, error) {
|
||||||
log.Infof(ctx, "[state] Fetching cluster state from Kubernetes")
|
log.Infof(ctx, "[state] Fetching cluster state from Kubernetes")
|
||||||
var cfgMap *v1.ConfigMap
|
var cfgMap *v1.ConfigMap
|
||||||
var currentCluster Cluster
|
var currentCluster Cluster
|
||||||
@@ -151,13 +186,38 @@ func getStateFromKubernetes(ctx context.Context, kubeClient *kubernetes.Clientse
|
|||||||
clusterData := cfgMap.Data[StateConfigMapName]
|
clusterData := cfgMap.Data[StateConfigMapName]
|
||||||
err := yaml.Unmarshal([]byte(clusterData), ¤tCluster)
|
err := yaml.Unmarshal([]byte(clusterData), ¤tCluster)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("Failed to unmarshal cluster data")
|
||||||
|
}
|
||||||
|
return ¤tCluster, nil
|
||||||
|
case <-time.After(time.Second * GetStateTimeout):
|
||||||
|
log.Infof(ctx, "Timed out waiting for kubernetes cluster to get state")
|
||||||
|
return nil, fmt.Errorf("Timeout waiting for kubernetes cluster to get state")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func getStateFromNodes(ctx context.Context, uniqueHosts []*hosts.Host, alpineImage string, prsMap map[string]v3.PrivateRegistry) *Cluster {
|
||||||
|
log.Infof(ctx, "[state] Fetching cluster state from Nodes")
|
||||||
|
var currentCluster Cluster
|
||||||
|
var clusterFile string
|
||||||
|
var err error
|
||||||
|
|
||||||
|
for _, host := range uniqueHosts {
|
||||||
|
filePath := path.Join(host.PrefixPath, pki.TempCertPath, pki.ClusterStateFile)
|
||||||
|
clusterFile, err = pki.FetchFileFromHost(ctx, filePath, alpineImage, host, prsMap, pki.StateDeployerContainerName, "state")
|
||||||
|
if err == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(clusterFile) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
err = yaml.Unmarshal([]byte(clusterFile), ¤tCluster)
|
||||||
|
if err != nil {
|
||||||
|
logrus.Debugf("[state] Failed to unmarshal the cluster file fetched from nodes: %v", err)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
return ¤tCluster
|
return ¤tCluster
|
||||||
case <-time.After(time.Second * GetStateTimeout):
|
|
||||||
log.Infof(ctx, "Timed out waiting for kubernetes cluster to get state")
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func GetK8sVersion(localConfigPath string, k8sWrapTransport k8s.WrapTransport) (string, error) {
|
func GetK8sVersion(localConfigPath string, k8sWrapTransport k8s.WrapTransport) (string, error) {
|
||||||
|
@@ -8,6 +8,8 @@ const (
|
|||||||
CertificatesSecretName = "k8s-certs"
|
CertificatesSecretName = "k8s-certs"
|
||||||
TempCertPath = "/etc/kubernetes/.tmp/"
|
TempCertPath = "/etc/kubernetes/.tmp/"
|
||||||
ClusterConfig = "cluster.yml"
|
ClusterConfig = "cluster.yml"
|
||||||
|
ClusterStateFile = "cluster-state.yml"
|
||||||
|
ClusterStateEnv = "CLUSTER_STATE"
|
||||||
BundleCertPath = "/backup/pki.bundle.tar.gz"
|
BundleCertPath = "/backup/pki.bundle.tar.gz"
|
||||||
|
|
||||||
CACertName = "kube-ca"
|
CACertName = "kube-ca"
|
||||||
|
@@ -20,6 +20,10 @@ import (
|
|||||||
"k8s.io/client-go/util/cert"
|
"k8s.io/client-go/util/cert"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
StateDeployerContainerName = "cluster-state-deployer"
|
||||||
|
)
|
||||||
|
|
||||||
func DeployCertificatesOnPlaneHost(ctx context.Context, host *hosts.Host, rkeConfig v3.RancherKubernetesEngineConfig, crtMap map[string]CertificatePKI, certDownloaderImage string, prsMap map[string]v3.PrivateRegistry) error {
|
func DeployCertificatesOnPlaneHost(ctx context.Context, host *hosts.Host, rkeConfig v3.RancherKubernetesEngineConfig, crtMap map[string]CertificatePKI, certDownloaderImage string, prsMap map[string]v3.PrivateRegistry) error {
|
||||||
crtBundle := GenerateRKENodeCerts(ctx, rkeConfig, host.Address, crtMap)
|
crtBundle := GenerateRKENodeCerts(ctx, rkeConfig, host.Address, crtMap)
|
||||||
env := []string{}
|
env := []string{}
|
||||||
@@ -29,6 +33,38 @@ func DeployCertificatesOnPlaneHost(ctx context.Context, host *hosts.Host, rkeCon
|
|||||||
return doRunDeployer(ctx, host, env, certDownloaderImage, prsMap)
|
return doRunDeployer(ctx, host, env, certDownloaderImage, prsMap)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func DeployStateOnPlaneHost(ctx context.Context, host *hosts.Host, stateDownloaderImage string, prsMap map[string]v3.PrivateRegistry, clusterState string) error {
|
||||||
|
// remove existing container. Only way it's still here is if previous deployment failed
|
||||||
|
if err := docker.DoRemoveContainer(ctx, host.DClient, StateDeployerContainerName, host.Address); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
containerEnv := []string{ClusterStateEnv + "=" + clusterState}
|
||||||
|
ClusterStateFilePath := path.Join(host.PrefixPath, TempCertPath, ClusterStateFile)
|
||||||
|
imageCfg := &container.Config{
|
||||||
|
Image: stateDownloaderImage,
|
||||||
|
Cmd: []string{
|
||||||
|
"sh",
|
||||||
|
"-c",
|
||||||
|
fmt.Sprintf("t=$(mktemp); echo -e \"$%s\" > $t && mv $t %s && chmod 644 %s", ClusterStateEnv, ClusterStateFilePath, ClusterStateFilePath),
|
||||||
|
},
|
||||||
|
Env: containerEnv,
|
||||||
|
}
|
||||||
|
hostCfg := &container.HostConfig{
|
||||||
|
Binds: []string{
|
||||||
|
fmt.Sprintf("%s:/etc/kubernetes:z", path.Join(host.PrefixPath, "/etc/kubernetes")),
|
||||||
|
},
|
||||||
|
Privileged: true,
|
||||||
|
}
|
||||||
|
if err := docker.DoRunContainer(ctx, host.DClient, imageCfg, hostCfg, StateDeployerContainerName, host.Address, "state", prsMap); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := docker.DoRemoveContainer(ctx, host.DClient, StateDeployerContainerName, host.Address); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
logrus.Debugf("[state] Successfully started state deployer container on node [%s]", host.Address)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func doRunDeployer(ctx context.Context, host *hosts.Host, containerEnv []string, certDownloaderImage string, prsMap map[string]v3.PrivateRegistry) error {
|
func doRunDeployer(ctx context.Context, host *hosts.Host, containerEnv []string, certDownloaderImage string, prsMap map[string]v3.PrivateRegistry) error {
|
||||||
// remove existing container. Only way it's still here is if previous deployment failed
|
// remove existing container. Only way it's still here is if previous deployment failed
|
||||||
isRunning := false
|
isRunning := false
|
||||||
@@ -135,7 +171,7 @@ func FetchCertificatesFromHost(ctx context.Context, extraHosts []*hosts.Host, ho
|
|||||||
|
|
||||||
for certName, config := range crtList {
|
for certName, config := range crtList {
|
||||||
certificate := CertificatePKI{}
|
certificate := CertificatePKI{}
|
||||||
crt, err := FetchFileFromHost(ctx, GetCertTempPath(certName), image, host, prsMap)
|
crt, err := FetchFileFromHost(ctx, GetCertTempPath(certName), image, host, prsMap, CertFetcherContainer, "certificates")
|
||||||
// I will only exit with an error if it's not a not-found-error and this is not an etcd certificate
|
// I will only exit with an error if it's not a not-found-error and this is not an etcd certificate
|
||||||
if err != nil && (!strings.HasPrefix(certName, "kube-etcd") &&
|
if err != nil && (!strings.HasPrefix(certName, "kube-etcd") &&
|
||||||
!strings.Contains(certName, APIProxyClientCertName) &&
|
!strings.Contains(certName, APIProxyClientCertName) &&
|
||||||
@@ -154,10 +190,10 @@ func FetchCertificatesFromHost(ctx context.Context, extraHosts []*hosts.Host, ho
|
|||||||
tmpCerts[certName] = CertificatePKI{}
|
tmpCerts[certName] = CertificatePKI{}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
key, err := FetchFileFromHost(ctx, GetKeyTempPath(certName), image, host, prsMap)
|
key, err := FetchFileFromHost(ctx, GetKeyTempPath(certName), image, host, prsMap, CertFetcherContainer, "certificate")
|
||||||
|
|
||||||
if config {
|
if config {
|
||||||
config, err := FetchFileFromHost(ctx, GetConfigTempPath(certName), image, host, prsMap)
|
config, err := FetchFileFromHost(ctx, GetConfigTempPath(certName), image, host, prsMap, CertFetcherContainer, "certificate")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -184,7 +220,7 @@ func FetchCertificatesFromHost(ctx context.Context, extraHosts []*hosts.Host, ho
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func FetchFileFromHost(ctx context.Context, filePath, image string, host *hosts.Host, prsMap map[string]v3.PrivateRegistry) (string, error) {
|
func FetchFileFromHost(ctx context.Context, filePath, image string, host *hosts.Host, prsMap map[string]v3.PrivateRegistry, containerName, state string) (string, error) {
|
||||||
|
|
||||||
imageCfg := &container.Config{
|
imageCfg := &container.Config{
|
||||||
Image: image,
|
Image: image,
|
||||||
@@ -195,16 +231,16 @@ func FetchFileFromHost(ctx context.Context, filePath, image string, host *hosts.
|
|||||||
},
|
},
|
||||||
Privileged: true,
|
Privileged: true,
|
||||||
}
|
}
|
||||||
isRunning, err := docker.IsContainerRunning(ctx, host.DClient, host.Address, CertFetcherContainer, true)
|
isRunning, err := docker.IsContainerRunning(ctx, host.DClient, host.Address, containerName, true)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
if !isRunning {
|
if !isRunning {
|
||||||
if err := docker.DoRunContainer(ctx, host.DClient, imageCfg, hostCfg, CertFetcherContainer, host.Address, "certificates", prsMap); err != nil {
|
if err := docker.DoRunContainer(ctx, host.DClient, imageCfg, hostCfg, containerName, host.Address, state, prsMap); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
file, err := docker.ReadFileFromContainer(ctx, host.DClient, host.Address, CertFetcherContainer, filePath)
|
file, err := docker.ReadFileFromContainer(ctx, host.DClient, host.Address, containerName, filePath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user