diff --git a/pkg/kubelet/certificate/kubelet.go b/pkg/kubelet/certificate/kubelet.go index cea55f0c34e..31e77108ce0 100644 --- a/pkg/kubelet/certificate/kubelet.go +++ b/pkg/kubelet/certificate/kubelet.go @@ -25,7 +25,7 @@ import ( "sort" certificates "k8s.io/api/certificates/v1beta1" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" clientset "k8s.io/client-go/kubernetes" certificatesclient "k8s.io/client-go/kubernetes/typed/certificates/v1beta1" @@ -52,17 +52,38 @@ func NewKubeletServerCertificateManager(kubeClient clientset.Interface, kubeCfg if err != nil { return nil, fmt.Errorf("failed to initialize server certificate store: %v", err) } - var certificateExpiration = compbasemetrics.NewGauge( + certificateExpiration := compbasemetrics.NewGauge( &compbasemetrics.GaugeOpts{ - Namespace: metrics.KubeletSubsystem, - Subsystem: "certificate_manager", - Name: "server_expiration_seconds", + Subsystem: metrics.KubeletSubsystem, + Name: "certificate_manager_server_expiration_seconds", Help: "Gauge of the lifetime of a certificate. The value is the date the certificate will expire in seconds since January 1, 1970 UTC.", StabilityLevel: compbasemetrics.ALPHA, }, ) legacyregistry.MustRegister(certificateExpiration) + certificateRotationAge := compbasemetrics.NewHistogram( + &compbasemetrics.HistogramOpts{ + Subsystem: metrics.KubeletSubsystem, + Name: "certificate_manager_server_rotation_seconds", + Help: "Histogram of the number of seconds the previous certificate lived before being rotated.", + Buckets: []float64{ + 60, // 1 minute + 3600, // 1 hour + 14400, // 4 hours + 86400, // 1 day + 604800, // 1 week + 2592000, // 1 month + 7776000, // 3 months + 15552000, // 6 months + 31104000, // 1 year + 124416000, // 4 years + }, + StabilityLevel: compbasemetrics.ALPHA, + }, + ) + legacyregistry.MustRegister(certificateRotationAge) + getTemplate := func() *x509.CertificateRequest { hostnames, ips := addressesToHostnamesAndIPs(getAddresses()) // don't return a template if we have no addresses to request for @@ -100,6 +121,7 @@ func NewKubeletServerCertificateManager(kubeClient clientset.Interface, kubeCfg }, CertificateStore: certificateStore, CertificateExpiration: certificateExpiration, + CertificateRotation: certificateRotationAge, }) if err != nil { return nil, fmt.Errorf("failed to initialize server certificate manager: %v", err) diff --git a/staging/src/k8s.io/client-go/util/certificate/certificate_manager.go b/staging/src/k8s.io/client-go/util/certificate/certificate_manager.go index ef38791552d..55f5f5cbf6e 100644 --- a/staging/src/k8s.io/client-go/util/certificate/certificate_manager.go +++ b/staging/src/k8s.io/client-go/util/certificate/certificate_manager.go @@ -113,8 +113,17 @@ type Config struct { // quickly replaced with a unique cert/key pair. BootstrapKeyPEM []byte // CertificateExpiration will record a metric that shows the remaining - // lifetime of the certificate. + // lifetime of the certificate. This metric is a gauge because only the + // current cert expiry time is really useful. Reading this metric at any + // time simply gives the next expiration date, no need to keep some + // history (histogram) of all previous expiry dates. CertificateExpiration Gauge + // CertificateRotation will record a metric showing the time in seconds + // that certificates lived before being rotated. This metric is a histogram + // because there is value in keeping a history of rotation cadences. It + // allows one to setup monitoring and alerting of unexpected rotation + // behavior and track trends in rotation frequency. + CertificateRotation Histogram } // Store is responsible for getting and updating the current certificate. @@ -139,6 +148,12 @@ type Gauge interface { Set(float64) } +// Histogram will record the time a rotated certificate was used before being +// rotated. +type Histogram interface { + Observe(float64) +} + // NoCertKeyError indicates there is no cert/key currently available. type NoCertKeyError string @@ -163,6 +178,7 @@ type manager struct { certStore Store certificateExpiration Gauge + certificateRotation Histogram // the following variables must only be accessed under certAccessLock certAccessLock sync.RWMutex @@ -174,6 +190,9 @@ type manager struct { clientFn CSRClientFunc stopCh chan struct{} stopped bool + + // Set to time.Now but can be stubbed out for testing + now func() time.Time } // NewManager returns a new certificate manager. A certificate manager is @@ -203,6 +222,8 @@ func NewManager(config *Config) (Manager, error) { cert: cert, forceRotation: forceRotation, certificateExpiration: config.CertificateExpiration, + certificateRotation: config.CertificateRotation, + now: time.Now, } return &m, nil @@ -215,7 +236,7 @@ func NewManager(config *Config) (Manager, error) { func (m *manager) Current() *tls.Certificate { m.certAccessLock.RLock() defer m.certAccessLock.RUnlock() - if m.cert != nil && m.cert.Leaf != nil && time.Now().After(m.cert.Leaf.NotAfter) { + if m.cert != nil && m.cert.Leaf != nil && m.now().After(m.cert.Leaf.NotAfter) { klog.V(2).Infof("Current certificate is expired.") return nil } @@ -256,7 +277,7 @@ func (m *manager) Start() { templateChanged := make(chan struct{}) go wait.Until(func() { deadline := m.nextRotationDeadline() - if sleepInterval := deadline.Sub(time.Now()); sleepInterval > 0 { + if sleepInterval := deadline.Sub(m.now()); sleepInterval > 0 { klog.V(2).Infof("Waiting %v for next certificate rotation", sleepInterval) timer := time.NewTimer(sleepInterval) @@ -421,7 +442,10 @@ func (m *manager) rotateCerts() (bool, error) { return false, nil } - m.updateCached(cert) + if old := m.updateCached(cert); old != nil && m.certificateRotation != nil { + m.certificateRotation.Observe(m.now().Sub(old.Leaf.NotBefore).Seconds()) + } + return true, nil } @@ -490,14 +514,14 @@ func (m *manager) nextRotationDeadline() time.Time { // forceRotation is not protected by locks if m.forceRotation { m.forceRotation = false - return time.Now() + return m.now() } m.certAccessLock.RLock() defer m.certAccessLock.RUnlock() if !m.certSatisfiesTemplateLocked() { - return time.Now() + return m.now() } notAfter := m.cert.Leaf.NotAfter @@ -523,13 +547,15 @@ var jitteryDuration = func(totalDuration float64) time.Duration { return wait.Jitter(time.Duration(totalDuration), 0.2) - time.Duration(totalDuration*0.3) } -// updateCached sets the most recent retrieved cert. It also sets the server -// as assumed healthy. -func (m *manager) updateCached(cert *tls.Certificate) { +// updateCached sets the most recent retrieved cert and returns the old cert. +// It also sets the server as assumed healthy. +func (m *manager) updateCached(cert *tls.Certificate) *tls.Certificate { m.certAccessLock.Lock() defer m.certAccessLock.Unlock() m.serverHealth = true + old := m.cert m.cert = cert + return old } // updateServerError takes an error returned by the server and infers diff --git a/staging/src/k8s.io/client-go/util/certificate/certificate_manager_test.go b/staging/src/k8s.io/client-go/util/certificate/certificate_manager_test.go index 671cffc4cbb..85969857eb1 100644 --- a/staging/src/k8s.io/client-go/util/certificate/certificate_manager_test.go +++ b/staging/src/k8s.io/client-go/util/certificate/certificate_manager_test.go @@ -163,12 +163,17 @@ func TestNewManagerNoRotation(t *testing.T) { } } -type gaugeMock struct { +type metricMock struct { calls int lastValue float64 } -func (g *gaugeMock) Set(v float64) { +func (g *metricMock) Set(v float64) { + g.calls++ + g.lastValue = v +} + +func (g *metricMock) Observe(v float64) { g.calls++ g.lastValue = v } @@ -195,7 +200,7 @@ func TestSetRotationDeadline(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - g := gaugeMock{} + g := metricMock{} m := manager{ cert: &tls.Certificate{ Leaf: &x509.Certificate{ @@ -206,6 +211,7 @@ func TestSetRotationDeadline(t *testing.T) { getTemplate: func() *x509.CertificateRequest { return &x509.CertificateRequest{} }, usages: []certificates.KeyUsage{}, certificateExpiration: &g, + now: func() time.Time { return now }, } jitteryDuration = func(float64) time.Duration { return time.Duration(float64(tc.notAfter.Sub(tc.notBefore)) * 0.7) } lowerBound := tc.notBefore.Add(time.Duration(float64(tc.notAfter.Sub(tc.notBefore)) * 0.7)) @@ -383,6 +389,7 @@ func TestCertSatisfiesTemplate(t *testing.T) { m := manager{ cert: tlsCert, getTemplate: func() *x509.CertificateRequest { return tc.template }, + now: time.Now, } result := m.certSatisfiesTemplate() @@ -407,6 +414,7 @@ func TestRotateCertCreateCSRError(t *testing.T) { clientFn: func(_ *tls.Certificate) (certificatesclient.CertificateSigningRequestInterface, error) { return fakeClient{failureType: createError}, nil }, + now: func() time.Time { return now }, } if success, err := m.rotateCerts(); success { @@ -430,6 +438,7 @@ func TestRotateCertWaitingForResultError(t *testing.T) { clientFn: func(_ *tls.Certificate) (certificatesclient.CertificateSigningRequestInterface, error) { return fakeClient{failureType: watchError}, nil }, + now: func() time.Time { return now }, } defer func(t time.Duration) { certificateWaitTimeout = t }(certificateWaitTimeout) @@ -945,6 +954,40 @@ func TestServerHealth(t *testing.T) { } } +func TestRotationLogsDuration(t *testing.T) { + h := metricMock{} + now := time.Now() + certIss := now.Add(-2 * time.Hour) + m := manager{ + cert: &tls.Certificate{ + Leaf: &x509.Certificate{ + NotBefore: certIss, + NotAfter: now.Add(-1 * time.Hour), + }, + }, + certStore: &fakeStore{cert: expiredStoreCertData.certificate}, + getTemplate: func() *x509.CertificateRequest { return &x509.CertificateRequest{} }, + clientFn: func(_ *tls.Certificate) (certificatesclient.CertificateSigningRequestInterface, error) { + return &fakeClient{ + certificatePEM: apiServerCertData.certificatePEM, + }, nil + }, + certificateRotation: &h, + now: func() time.Time { return now }, + } + ok, err := m.rotateCerts() + if err != nil || !ok { + t.Errorf("failed to rotate certs: %v", err) + } + if h.calls != 1 { + t.Errorf("rotation metric was not called") + } + if h.lastValue != now.Sub(certIss).Seconds() { + t.Errorf("rotation metric did not record the right value got: %f; want %f", h.lastValue, now.Sub(certIss).Seconds()) + } + +} + type fakeClientFailureType int const (