mirror of https://github.com/rancher/rke.git
key rotation as part of ClusterUp, more robust secrets rewrite, improved logging
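
Background for the diff below (not part of the commit): the kube-apiserver encrypts Secrets at rest according to an EncryptionConfiguration in which the first listed key encrypts new writes while every listed key can still decrypt. Rotating a key therefore means prepending the new key, rewriting every secret so it is re-encrypted under it, and only then dropping the old key. A minimal sketch of the intermediate two-key stage, built with the same k8s.io/apiserver config types this file imports; key names and secret values are illustrative placeholders:

	package main

	import (
		"fmt"

		metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
		apiserverconfigv1 "k8s.io/apiserver/pkg/apis/config/v1"
		sigsyaml "sigs.k8s.io/yaml"
	)

	func main() {
		// Stage two of a rotation: newKey listed first so it encrypts all new
		// writes; oldKey kept second so existing data can still be decrypted.
		conf := apiserverconfigv1.EncryptionConfiguration{
			TypeMeta: metav1.TypeMeta{APIVersion: "apiserver.config.k8s.io/v1", Kind: "EncryptionConfiguration"},
			Resources: []apiserverconfigv1.ResourceConfiguration{{
				Resources: []string{"secrets"},
				Providers: []apiserverconfigv1.ProviderConfiguration{
					{AESCBC: &apiserverconfigv1.AESConfiguration{
						Keys: []apiserverconfigv1.Key{
							{Name: "newKey", Secret: "BASE64-32-BYTE-KEY"}, // illustrative placeholder
							{Name: "oldKey", Secret: "BASE64-32-BYTE-KEY"}, // illustrative placeholder
						},
					}},
					{Identity: &apiserverconfigv1.IdentityConfiguration{}}, // reads data that was never encrypted
				},
			}},
		}
		out, _ := sigsyaml.Marshal(conf)
		fmt.Println(string(out))
	}
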
@@ -6,26 +6,30 @@ import (
 	"encoding/base64"
 	"encoding/json"
 	"fmt"
+	"strings"
+	"sync"
 
 	ghodssyaml "github.com/ghodss/yaml"
 	"github.com/pkg/errors"
 	normantypes "github.com/rancher/norman/types"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sync/errgroup"
+	sigsyaml "sigs.k8s.io/yaml"
 
 	"github.com/rancher/rke/k8s"
 	"github.com/rancher/rke/log"
 	"github.com/rancher/rke/services"
 	"github.com/rancher/rke/templates"
 	v3 "github.com/rancher/rke/types"
 	"github.com/rancher/rke/util"
-	"github.com/sirupsen/logrus"
-	"golang.org/x/sync/errgroup"
 	v1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/runtime/serializer"
 	apiserverconfig "k8s.io/apiserver/pkg/apis/config"
 	apiserverconfigv1 "k8s.io/apiserver/pkg/apis/config/v1"
 	"k8s.io/client-go/kubernetes"
-	sigsyaml "sigs.k8s.io/yaml"
+	"k8s.io/client-go/util/retry"
 )
 
 const (
@@ -113,62 +117,205 @@ func (c *Cluster) DisableSecretsEncryption(ctx context.Context, currentCluster *
 	return nil
 }
 
+const (
+	rewriteSecretsOperation = "rewrite-secrets"
+	secretBatchSize         = 250
+)
+
+// RewriteSecrets does the following:
+// - retrieves all cluster secrets in batches with size of <secretBatchSize>
+// - triggers rewrites with new encryption key by sending each secret over a channel consumed by workers that perform the rewrite
+// - logs progress of rewrite operation
 func (c *Cluster) RewriteSecrets(ctx context.Context) error {
 	log.Infof(ctx, "Rewriting cluster secrets")
-	var errgrp errgroup.Group
-	k8sClient, err := k8s.NewClient(c.LocalKubeConfigPath, c.K8sWrapTransport)
-	if err != nil {
-		return fmt.Errorf("failed to initialize new kubernetes client: %v", err)
-	}
-	secretsList, err := k8s.GetSecretsList(k8sClient, "")
-	if err != nil {
-		return err
+	k8sClient, cliErr := k8s.NewClient(c.LocalKubeConfigPath, c.K8sWrapTransport)
+	if cliErr != nil {
+		return fmt.Errorf("failed to initialize new kubernetes client: %v", cliErr)
 	}
-	secretsQueue := util.GetObjectQueue(secretsList.Items)
+
+	rewrites := make(chan interface{}, secretBatchSize)
+	go func() {
+		defer close(rewrites) // exiting this go routine triggers workers to exit
+
+		retryErr := func(err error) bool { // all returned errors can be retried
+			return true
+		}
+
+		var continueToken string
+		var secrets []v1.Secret
+		var restart bool
+		for {
+			err := retry.OnError(retry.DefaultRetry, retryErr, func() error {
+				l, err := k8sClient.CoreV1().Secrets("").List(ctx, metav1.ListOptions{
+					Limit:    secretBatchSize, // keep the per request secrets batch size small to avoid client timeouts
+					Continue: continueToken,
+				})
+				if err != nil {
+					if isExpiredTokenErr(err) { // restart list operation due to token expiration
+						logrus.Debugf("[%v] continue token expired, restarting list operation", rewriteSecretsOperation)
+						continueToken = ""
+						restart = true
+						return nil
+					}
+					return err
+				}
+
+				secrets = append(secrets, l.Items...)
+				continueToken = l.Continue
+
+				return nil
+			})
+			if err != nil {
+				cliErr = err
+				break
+			}
+
+			// send this batch to workers for rewrite
+			// duplicates are ok because we cache the names of secrets that have been rewritten, thus workers will only rewrite each secret once
+			for _, s := range secrets {
+				rewrites <- s
+			}
+			secrets = nil // reset secrets since they've been sent to workers
+
+			// if there's no continue token and the list operation doesn't need to be restarted, we've retrieved all secrets
+			if continueToken == "" && !restart {
+				break
+			}
+
+			restart = false
+		}
+
+		logrus.Debugf("[%v] All secrets retrieved and sent for rewrite", rewriteSecretsOperation)
+	}()
+
+	// NOTE: since we retrieve secrets in batches, we don't know total number of secrets up front.
+	// Telling the user how many we've rewritten so far is the best we can do
+	done := make(chan struct{}, SyncWorkers)
+	defer close(done)
+	go func() {
+		var rewritten int
+		for range done {
+			rewritten++
+			if rewritten%50 == 0 { // log a message every 50 secrets
+				log.Infof(ctx, "[%s] %v secrets rewritten", rewriteSecretsOperation, rewritten)
+			}
+		}
+	}()
+
+	getSecretID := func(s v1.Secret) string {
+		return strings.Join([]string{s.Namespace, s.Name}, "/")
+	}
+
+	// track secrets that have been rewritten
+	// this is needed in case the continue token expires and the list secrets operation needs to be restarted
+	rewritten := make(map[string]struct{})
+	var rmtx sync.RWMutex
+
+	// spawn workers to perform secret rewrites
+	var errgrp errgroup.Group
 	for w := 0; w < SyncWorkers; w++ {
 		errgrp.Go(func() error {
 			var errList []error
-			for secret := range secretsQueue {
+			for secret := range rewrites {
 				s := secret.(v1.Secret)
-				err := rewriteSecret(k8sClient, &s)
-				if err != nil {
-					errList = append(errList, err)
+				id := getSecretID(s)
+
+				rmtx.RLock()
+				_, ok := rewritten[id]
+				rmtx.RUnlock()
+
+				if !ok {
+					err := rewriteSecret(k8sClient, &s)
+					if err != nil {
+						errList = append(errList, err)
+					}
+
+					rmtx.Lock()
+					rewritten[id] = struct{}{}
+					rmtx.Unlock()
+
+					done <- struct{}{}
 				}
 			}
+
 			return util.ErrList(errList)
 		})
 	}
 	if err := errgrp.Wait(); err != nil {
-		return err
+		logrus.Errorf("[%v] error: %v", rewriteSecretsOperation, err)
+		return err // worker error from rewrites
 	}
-	log.Infof(ctx, "Cluster secrets rewritten successfully")
-	return nil
+
+	if cliErr != nil {
+		log.Infof(ctx, "[%s] Operation encountered error: %v", rewriteSecretsOperation, cliErr)
+	} else {
+		log.Infof(ctx, "[%s] Operation completed", rewriteSecretsOperation)
+	}
+
+	return cliErr
 }
 
 func (c *Cluster) RotateEncryptionKey(ctx context.Context, fullState *FullState) error {
-	//generate new key
+	// generate new key
 	newKey, err := generateEncryptionKey()
 	if err != nil {
 		return err
 	}
+
 	oldKey, err := c.extractActiveKey(c.EncryptionConfig.EncryptionProviderFile)
 	if err != nil {
 		return err
 	}
-	// reverse the keys order in the file, making newKey the Active Key
-	initialKeyList := []*encryptionKey{ // order is critical here!
-		newKey,
-		oldKey,
-	}
-	initialProviderConfig, err := providerFileFromKeyList(keyList{KeyList: initialKeyList})
+
+	logrus.Debug("adding new encryption key, provider config: [newKey, oldKey]")
+
+	// Ensure encryption is done with newKey
+	err = c.updateEncryptionProvider(ctx, []*encryptionKey{newKey, oldKey}, fullState)
 	if err != nil {
 		return err
 	}
-	c.EncryptionConfig.EncryptionProviderFile = initialProviderConfig
+
+	// rewrite secrets via updates to secrets
+	if err := c.RewriteSecrets(ctx); err != nil {
+		// if there's a rewrite error, the cluster will need to be restored, so redeploy the initial encryption provider config
+		var updateErr error
+		for i := 0; i < 3; i++ { // up to 3 retries
+			updateErr = c.updateEncryptionProvider(ctx, []*encryptionKey{oldKey}, fullState)
+			if updateErr == nil {
+				break
+			}
+		}
+
+		if updateErr != nil {
+			err = errors.Wrap(err, updateErr.Error())
+		}
+
+		return err
+	}
+
+	// At this point, all secrets have been rewritten using the newKey, so we remove the old one.
+	logrus.Debug("removing old encryption key, provider config: [newKey]")
+
+	err = c.updateEncryptionProvider(ctx, []*encryptionKey{newKey}, fullState)
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (c *Cluster) updateEncryptionProvider(ctx context.Context, keys []*encryptionKey, fullState *FullState) error {
+	providerConfig, err := providerFileFromKeyList(keyList{KeyList: keys})
+	if err != nil {
+		return err
+	}
+
+	c.EncryptionConfig.EncryptionProviderFile = providerConfig
+	if err := c.DeployEncryptionProviderFile(ctx); err != nil {
+		return err
+	}
+
+	// commit to state as soon as possible
+	logrus.Debugf("[%s] Updating cluster state", services.ControlRole)
+	if err := c.UpdateClusterCurrentState(ctx, fullState); err != nil {
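
Aside (not from the commit): the retrieval loop above combines three client-go techniques — paging with metav1.ListOptions Limit/Continue, per-page retries via retry.OnError, and a full restart when the continue token expires. A standalone sketch of the paging-plus-retry core, assuming an illustrative kubeconfig path:

	package main

	import (
		"context"
		"fmt"

		metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
		"k8s.io/client-go/kubernetes"
		"k8s.io/client-go/tools/clientcmd"
		"k8s.io/client-go/util/retry"
	)

	func main() {
		// illustrative kubeconfig path; RKE builds its client from c.LocalKubeConfigPath
		cfg, err := clientcmd.BuildConfigFromFlags("", "kube_config_cluster.yml")
		if err != nil {
			panic(err)
		}
		client := kubernetes.NewForConfigOrDie(cfg)

		retriable := func(error) bool { return true } // retry every error, as the commit does
		var continueToken string
		total := 0
		for {
			err := retry.OnError(retry.DefaultRetry, retriable, func() error {
				l, err := client.CoreV1().Secrets("").List(context.TODO(), metav1.ListOptions{
					Limit:    250, // small pages avoid client timeouts on large clusters
					Continue: continueToken,
				})
				if err != nil {
					return err
				}
				total += len(l.Items)
				continueToken = l.Continue
				return nil
			})
			if err != nil {
				panic(err)
			}
			if continueToken == "" {
				break // no more pages
			}
		}
		fmt.Printf("listed %d secrets\n", total)
	}
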
@@ -177,30 +324,7 @@ func (c *Cluster) RotateEncryptionKey(ctx context.Context, fullState *FullState)
 	if err := services.RestartKubeAPIWithHealthcheck(ctx, c.ControlPlaneHosts, c.LocalConnDialerFactory, c.Certificates); err != nil {
 		return err
 	}
-	// rewrite secrets
-	if err := c.RewriteSecrets(ctx); err != nil {
-		return err
-	}
-	// At this point, all secrets have been rewritten using the newKey, so we remove the old one.
-	finalKeyList := []*encryptionKey{
-		newKey,
-	}
-	finalProviderConfig, err := providerFileFromKeyList(keyList{KeyList: finalKeyList})
-	if err != nil {
-		return err
-	}
-	c.EncryptionConfig.EncryptionProviderFile = finalProviderConfig
-	if err := c.DeployEncryptionProviderFile(ctx); err != nil {
-		return err
-	}
-	// commit to state
-	logrus.Debugf("[%s] Updating cluster state", services.ControlRole)
-	if err := c.UpdateClusterCurrentState(ctx, fullState); err != nil {
-		return err
-	}
-	if err := services.RestartKubeAPIWithHealthcheck(ctx, c.ControlPlaneHosts, c.LocalConnDialerFactory, c.Certificates); err != nil {
-		return err
-	}
 
 	return nil
 }
 
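Aside (not from the commit): the worker-side deduplication that makes the removed single-pass rewrite above obsolete — the rewritten set guarded by a sync.RWMutex in RewriteSecrets — is easier to see in isolation. A minimal sketch with illustrative names; like the original, it tolerates the rare duplicate that slips in between the read lock and the write lock, since rewriting a secret twice is harmless:

	package main

	import (
		"fmt"
		"sync"

		"golang.org/x/sync/errgroup"
	)

	func main() {
		items := make(chan string, 16)
		go func() {
			defer close(items) // closing the channel lets every worker's range loop end
			for _, id := range []string{"ns1/a", "ns1/b", "ns1/a"} { // duplicate enqueued on purpose
				items <- id
			}
		}()

		seen := make(map[string]struct{})
		var mtx sync.RWMutex
		var grp errgroup.Group
		for w := 0; w < 4; w++ {
			grp.Go(func() error {
				for id := range items {
					mtx.RLock()
					_, ok := seen[id]
					mtx.RUnlock()
					if ok {
						continue // already processed by another worker
					}
					mtx.Lock()
					seen[id] = struct{}{}
					mtx.Unlock()
					fmt.Println("processing", id)
				}
				return nil
			})
		}
		if err := grp.Wait(); err != nil {
			panic(err)
		}
	}
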
@@ -301,6 +425,18 @@ func (c *Cluster) generateDisabledEncryptionProviderFile() (string, error) {
 	return disabledProviderFileFromKey(key)
 }
 
+const (
+	errExpiredToken = "The provided continue parameter is too old"
+)
+
+// isExpiredTokenErr returns true if the error passed in is due to a continue token expiring
+func isExpiredTokenErr(err error) bool {
+	if strings.Contains(err.Error(), errExpiredToken) {
+		return true
+	}
+	return false
+}
+
 func rewriteSecret(k8sClient *kubernetes.Clientset, secret *v1.Secret) error {
 	var err error
 	if err = k8s.UpdateSecret(k8sClient, secret); err == nil {
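
Aside (not from the commit): isExpiredTokenErr matches on the server's message text. The apimachinery errors package also ships typed predicates for the same condition — an expired continue token surfaces as HTTP 410 Gone with reason Expired — which allows a check that does not depend on the message wording. A sketch:

	package main

	import (
		"fmt"

		apierrors "k8s.io/apimachinery/pkg/api/errors"
	)

	// isExpiredTokenErrTyped checks the status reason instead of the message text.
	func isExpiredTokenErrTyped(err error) bool {
		return apierrors.IsResourceExpired(err) || apierrors.IsGone(err)
	}

	func main() {
		fmt.Println(isExpiredTokenErrTyped(nil)) // false: nil carries no status reason
	}
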
@@ -309,6 +445,10 @@ func rewriteSecret(k8sClient *kubernetes.Clientset, secret *v1.Secret) error {
 	if apierrors.IsConflict(err) {
 		secret, err = k8s.GetSecret(k8sClient, secret.Name, secret.Namespace)
 		if err != nil {
+			// if the secret no longer exists, we can skip it since it does not need to be rewritten
+			if apierrors.IsNotFound(err) {
+				return nil
+			}
 			return err
 		}
 		err = k8s.UpdateSecret(k8sClient, secret)
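
Aside (not from the commit): the conflict path above (re-Get on apierrors.IsConflict, then re-Update) hand-rolls what client-go's retry.RetryOnConflict helper provides. A sketch of the equivalent using that helper; the function name and the no-op-update approach mirror the surrounding code but are illustrative:

	package example

	import (
		"context"

		v1 "k8s.io/api/core/v1"
		apierrors "k8s.io/apimachinery/pkg/api/errors"
		metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
		"k8s.io/client-go/kubernetes"
		"k8s.io/client-go/util/retry"
	)

	// rewriteWithRetry re-applies an unmodified update until it lands on the
	// latest resourceVersion; a no-op update is enough to force re-encryption
	// at rest under the currently active key.
	func rewriteWithRetry(client kubernetes.Interface, secret *v1.Secret) error {
		return retry.RetryOnConflict(retry.DefaultRetry, func() error {
			latest, err := client.CoreV1().Secrets(secret.Namespace).Get(context.TODO(), secret.Name, metav1.GetOptions{})
			if apierrors.IsNotFound(err) {
				return nil // a deleted secret no longer needs rewriting
			}
			if err != nil {
				return err
			}
			_, err = client.CoreV1().Secrets(latest.Namespace).Update(context.TODO(), latest, metav1.UpdateOptions{})
			return err
		})
	}
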
@@ -335,6 +475,7 @@ func isEncryptionEnabled(rkeConfig *v3.RancherKubernetesEngineConfig) bool {
 	}
 	return false
 }
 
 func isEncryptionCustomConfig(rkeConfig *v3.RancherKubernetesEngineConfig) bool {
 	if isEncryptionEnabled(rkeConfig) &&
 		rkeConfig.Services.KubeAPI.SecretsEncryptionConfig.CustomConfig != nil {