From 7bd7ed86216089fd56de7d4b7aa29e5de41cf612 Mon Sep 17 00:00:00 2001 From: Samuel Davidson Date: Mon, 28 Oct 2019 14:09:40 -0700 Subject: [PATCH] Added rotation metric to certificate manager Kubernetes-commit: 7adb18120079016ed8aea1bd40e5cde161827a1d --- util/certificate/certificate_manager.go | 44 ++++++++++++++---- util/certificate/certificate_manager_test.go | 49 ++++++++++++++++++-- 2 files changed, 81 insertions(+), 12 deletions(-) diff --git a/util/certificate/certificate_manager.go b/util/certificate/certificate_manager.go index ef387915..55f5f5cb 100644 --- a/util/certificate/certificate_manager.go +++ b/util/certificate/certificate_manager.go @@ -113,8 +113,17 @@ type Config struct { // quickly replaced with a unique cert/key pair. BootstrapKeyPEM []byte // CertificateExpiration will record a metric that shows the remaining - // lifetime of the certificate. + // lifetime of the certificate. This metric is a gauge because only the + // current cert expiry time is really useful. Reading this metric at any + // time simply gives the next expiration date, no need to keep some + // history (histogram) of all previous expiry dates. CertificateExpiration Gauge + // CertificateRotation will record a metric showing the time in seconds + // that certificates lived before being rotated. This metric is a histogram + // because there is value in keeping a history of rotation cadences. It + // allows one to setup monitoring and alerting of unexpected rotation + // behavior and track trends in rotation frequency. + CertificateRotation Histogram } // Store is responsible for getting and updating the current certificate. @@ -139,6 +148,12 @@ type Gauge interface { Set(float64) } +// Histogram will record the time a rotated certificate was used before being +// rotated. +type Histogram interface { + Observe(float64) +} + // NoCertKeyError indicates there is no cert/key currently available. type NoCertKeyError string @@ -163,6 +178,7 @@ type manager struct { certStore Store certificateExpiration Gauge + certificateRotation Histogram // the following variables must only be accessed under certAccessLock certAccessLock sync.RWMutex @@ -174,6 +190,9 @@ type manager struct { clientFn CSRClientFunc stopCh chan struct{} stopped bool + + // Set to time.Now but can be stubbed out for testing + now func() time.Time } // NewManager returns a new certificate manager. A certificate manager is @@ -203,6 +222,8 @@ func NewManager(config *Config) (Manager, error) { cert: cert, forceRotation: forceRotation, certificateExpiration: config.CertificateExpiration, + certificateRotation: config.CertificateRotation, + now: time.Now, } return &m, nil @@ -215,7 +236,7 @@ func NewManager(config *Config) (Manager, error) { func (m *manager) Current() *tls.Certificate { m.certAccessLock.RLock() defer m.certAccessLock.RUnlock() - if m.cert != nil && m.cert.Leaf != nil && time.Now().After(m.cert.Leaf.NotAfter) { + if m.cert != nil && m.cert.Leaf != nil && m.now().After(m.cert.Leaf.NotAfter) { klog.V(2).Infof("Current certificate is expired.") return nil } @@ -256,7 +277,7 @@ func (m *manager) Start() { templateChanged := make(chan struct{}) go wait.Until(func() { deadline := m.nextRotationDeadline() - if sleepInterval := deadline.Sub(time.Now()); sleepInterval > 0 { + if sleepInterval := deadline.Sub(m.now()); sleepInterval > 0 { klog.V(2).Infof("Waiting %v for next certificate rotation", sleepInterval) timer := time.NewTimer(sleepInterval) @@ -421,7 +442,10 @@ func (m *manager) rotateCerts() (bool, error) { return false, nil } - m.updateCached(cert) + if old := m.updateCached(cert); old != nil && m.certificateRotation != nil { + m.certificateRotation.Observe(m.now().Sub(old.Leaf.NotBefore).Seconds()) + } + return true, nil } @@ -490,14 +514,14 @@ func (m *manager) nextRotationDeadline() time.Time { // forceRotation is not protected by locks if m.forceRotation { m.forceRotation = false - return time.Now() + return m.now() } m.certAccessLock.RLock() defer m.certAccessLock.RUnlock() if !m.certSatisfiesTemplateLocked() { - return time.Now() + return m.now() } notAfter := m.cert.Leaf.NotAfter @@ -523,13 +547,15 @@ var jitteryDuration = func(totalDuration float64) time.Duration { return wait.Jitter(time.Duration(totalDuration), 0.2) - time.Duration(totalDuration*0.3) } -// updateCached sets the most recent retrieved cert. It also sets the server -// as assumed healthy. -func (m *manager) updateCached(cert *tls.Certificate) { +// updateCached sets the most recent retrieved cert and returns the old cert. +// It also sets the server as assumed healthy. +func (m *manager) updateCached(cert *tls.Certificate) *tls.Certificate { m.certAccessLock.Lock() defer m.certAccessLock.Unlock() m.serverHealth = true + old := m.cert m.cert = cert + return old } // updateServerError takes an error returned by the server and infers diff --git a/util/certificate/certificate_manager_test.go b/util/certificate/certificate_manager_test.go index 671cffc4..85969857 100644 --- a/util/certificate/certificate_manager_test.go +++ b/util/certificate/certificate_manager_test.go @@ -163,12 +163,17 @@ func TestNewManagerNoRotation(t *testing.T) { } } -type gaugeMock struct { +type metricMock struct { calls int lastValue float64 } -func (g *gaugeMock) Set(v float64) { +func (g *metricMock) Set(v float64) { + g.calls++ + g.lastValue = v +} + +func (g *metricMock) Observe(v float64) { g.calls++ g.lastValue = v } @@ -195,7 +200,7 @@ func TestSetRotationDeadline(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - g := gaugeMock{} + g := metricMock{} m := manager{ cert: &tls.Certificate{ Leaf: &x509.Certificate{ @@ -206,6 +211,7 @@ func TestSetRotationDeadline(t *testing.T) { getTemplate: func() *x509.CertificateRequest { return &x509.CertificateRequest{} }, usages: []certificates.KeyUsage{}, certificateExpiration: &g, + now: func() time.Time { return now }, } jitteryDuration = func(float64) time.Duration { return time.Duration(float64(tc.notAfter.Sub(tc.notBefore)) * 0.7) } lowerBound := tc.notBefore.Add(time.Duration(float64(tc.notAfter.Sub(tc.notBefore)) * 0.7)) @@ -383,6 +389,7 @@ func TestCertSatisfiesTemplate(t *testing.T) { m := manager{ cert: tlsCert, getTemplate: func() *x509.CertificateRequest { return tc.template }, + now: time.Now, } result := m.certSatisfiesTemplate() @@ -407,6 +414,7 @@ func TestRotateCertCreateCSRError(t *testing.T) { clientFn: func(_ *tls.Certificate) (certificatesclient.CertificateSigningRequestInterface, error) { return fakeClient{failureType: createError}, nil }, + now: func() time.Time { return now }, } if success, err := m.rotateCerts(); success { @@ -430,6 +438,7 @@ func TestRotateCertWaitingForResultError(t *testing.T) { clientFn: func(_ *tls.Certificate) (certificatesclient.CertificateSigningRequestInterface, error) { return fakeClient{failureType: watchError}, nil }, + now: func() time.Time { return now }, } defer func(t time.Duration) { certificateWaitTimeout = t }(certificateWaitTimeout) @@ -945,6 +954,40 @@ func TestServerHealth(t *testing.T) { } } +func TestRotationLogsDuration(t *testing.T) { + h := metricMock{} + now := time.Now() + certIss := now.Add(-2 * time.Hour) + m := manager{ + cert: &tls.Certificate{ + Leaf: &x509.Certificate{ + NotBefore: certIss, + NotAfter: now.Add(-1 * time.Hour), + }, + }, + certStore: &fakeStore{cert: expiredStoreCertData.certificate}, + getTemplate: func() *x509.CertificateRequest { return &x509.CertificateRequest{} }, + clientFn: func(_ *tls.Certificate) (certificatesclient.CertificateSigningRequestInterface, error) { + return &fakeClient{ + certificatePEM: apiServerCertData.certificatePEM, + }, nil + }, + certificateRotation: &h, + now: func() time.Time { return now }, + } + ok, err := m.rotateCerts() + if err != nil || !ok { + t.Errorf("failed to rotate certs: %v", err) + } + if h.calls != 1 { + t.Errorf("rotation metric was not called") + } + if h.lastValue != now.Sub(certIss).Seconds() { + t.Errorf("rotation metric did not record the right value got: %f; want %f", h.lastValue, now.Sub(certIss).Seconds()) + } + +} + type fakeClientFailureType int const (