Added rotation metric to certificate manager

Kubernetes-commit: 7adb18120079016ed8aea1bd40e5cde161827a1d
This commit is contained in:
Samuel Davidson 2019-10-28 14:09:40 -07:00 committed by Kubernetes Publisher
parent 890ae18798
commit 7bd7ed8621
2 changed files with 81 additions and 12 deletions

View File

@ -113,8 +113,17 @@ type Config struct {
// quickly replaced with a unique cert/key pair.
BootstrapKeyPEM []byte
// CertificateExpiration will record a metric that shows the remaining
// lifetime of the certificate.
// lifetime of the certificate. This metric is a gauge because only the
// current cert expiry time is really useful. Reading this metric at any
// time simply gives the next expiration date, no need to keep some
// history (histogram) of all previous expiry dates.
CertificateExpiration Gauge
// CertificateRotation will record a metric showing the time in seconds
// that certificates lived before being rotated. This metric is a histogram
// because there is value in keeping a history of rotation cadences. It
// allows one to setup monitoring and alerting of unexpected rotation
// behavior and track trends in rotation frequency.
CertificateRotation Histogram
}
// Store is responsible for getting and updating the current certificate.
@ -139,6 +148,12 @@ type Gauge interface {
Set(float64)
}
// Histogram will record the time a rotated certificate was used before being
// rotated.
type Histogram interface {
Observe(float64)
}
// NoCertKeyError indicates there is no cert/key currently available.
type NoCertKeyError string
@ -163,6 +178,7 @@ type manager struct {
certStore Store
certificateExpiration Gauge
certificateRotation Histogram
// the following variables must only be accessed under certAccessLock
certAccessLock sync.RWMutex
@ -174,6 +190,9 @@ type manager struct {
clientFn CSRClientFunc
stopCh chan struct{}
stopped bool
// Set to time.Now but can be stubbed out for testing
now func() time.Time
}
// NewManager returns a new certificate manager. A certificate manager is
@ -203,6 +222,8 @@ func NewManager(config *Config) (Manager, error) {
cert: cert,
forceRotation: forceRotation,
certificateExpiration: config.CertificateExpiration,
certificateRotation: config.CertificateRotation,
now: time.Now,
}
return &m, nil
@ -215,7 +236,7 @@ func NewManager(config *Config) (Manager, error) {
func (m *manager) Current() *tls.Certificate {
m.certAccessLock.RLock()
defer m.certAccessLock.RUnlock()
if m.cert != nil && m.cert.Leaf != nil && time.Now().After(m.cert.Leaf.NotAfter) {
if m.cert != nil && m.cert.Leaf != nil && m.now().After(m.cert.Leaf.NotAfter) {
klog.V(2).Infof("Current certificate is expired.")
return nil
}
@ -256,7 +277,7 @@ func (m *manager) Start() {
templateChanged := make(chan struct{})
go wait.Until(func() {
deadline := m.nextRotationDeadline()
if sleepInterval := deadline.Sub(time.Now()); sleepInterval > 0 {
if sleepInterval := deadline.Sub(m.now()); sleepInterval > 0 {
klog.V(2).Infof("Waiting %v for next certificate rotation", sleepInterval)
timer := time.NewTimer(sleepInterval)
@ -421,7 +442,10 @@ func (m *manager) rotateCerts() (bool, error) {
return false, nil
}
m.updateCached(cert)
if old := m.updateCached(cert); old != nil && m.certificateRotation != nil {
m.certificateRotation.Observe(m.now().Sub(old.Leaf.NotBefore).Seconds())
}
return true, nil
}
@ -490,14 +514,14 @@ func (m *manager) nextRotationDeadline() time.Time {
// forceRotation is not protected by locks
if m.forceRotation {
m.forceRotation = false
return time.Now()
return m.now()
}
m.certAccessLock.RLock()
defer m.certAccessLock.RUnlock()
if !m.certSatisfiesTemplateLocked() {
return time.Now()
return m.now()
}
notAfter := m.cert.Leaf.NotAfter
@ -523,13 +547,15 @@ var jitteryDuration = func(totalDuration float64) time.Duration {
return wait.Jitter(time.Duration(totalDuration), 0.2) - time.Duration(totalDuration*0.3)
}
// updateCached sets the most recent retrieved cert. It also sets the server
// as assumed healthy.
func (m *manager) updateCached(cert *tls.Certificate) {
// updateCached sets the most recent retrieved cert and returns the old cert.
// It also sets the server as assumed healthy.
func (m *manager) updateCached(cert *tls.Certificate) *tls.Certificate {
m.certAccessLock.Lock()
defer m.certAccessLock.Unlock()
m.serverHealth = true
old := m.cert
m.cert = cert
return old
}
// updateServerError takes an error returned by the server and infers

View File

@ -163,12 +163,17 @@ func TestNewManagerNoRotation(t *testing.T) {
}
}
type gaugeMock struct {
type metricMock struct {
calls int
lastValue float64
}
func (g *gaugeMock) Set(v float64) {
func (g *metricMock) Set(v float64) {
g.calls++
g.lastValue = v
}
func (g *metricMock) Observe(v float64) {
g.calls++
g.lastValue = v
}
@ -195,7 +200,7 @@ func TestSetRotationDeadline(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
g := gaugeMock{}
g := metricMock{}
m := manager{
cert: &tls.Certificate{
Leaf: &x509.Certificate{
@ -206,6 +211,7 @@ func TestSetRotationDeadline(t *testing.T) {
getTemplate: func() *x509.CertificateRequest { return &x509.CertificateRequest{} },
usages: []certificates.KeyUsage{},
certificateExpiration: &g,
now: func() time.Time { return now },
}
jitteryDuration = func(float64) time.Duration { return time.Duration(float64(tc.notAfter.Sub(tc.notBefore)) * 0.7) }
lowerBound := tc.notBefore.Add(time.Duration(float64(tc.notAfter.Sub(tc.notBefore)) * 0.7))
@ -383,6 +389,7 @@ func TestCertSatisfiesTemplate(t *testing.T) {
m := manager{
cert: tlsCert,
getTemplate: func() *x509.CertificateRequest { return tc.template },
now: time.Now,
}
result := m.certSatisfiesTemplate()
@ -407,6 +414,7 @@ func TestRotateCertCreateCSRError(t *testing.T) {
clientFn: func(_ *tls.Certificate) (certificatesclient.CertificateSigningRequestInterface, error) {
return fakeClient{failureType: createError}, nil
},
now: func() time.Time { return now },
}
if success, err := m.rotateCerts(); success {
@ -430,6 +438,7 @@ func TestRotateCertWaitingForResultError(t *testing.T) {
clientFn: func(_ *tls.Certificate) (certificatesclient.CertificateSigningRequestInterface, error) {
return fakeClient{failureType: watchError}, nil
},
now: func() time.Time { return now },
}
defer func(t time.Duration) { certificateWaitTimeout = t }(certificateWaitTimeout)
@ -945,6 +954,40 @@ func TestServerHealth(t *testing.T) {
}
}
func TestRotationLogsDuration(t *testing.T) {
h := metricMock{}
now := time.Now()
certIss := now.Add(-2 * time.Hour)
m := manager{
cert: &tls.Certificate{
Leaf: &x509.Certificate{
NotBefore: certIss,
NotAfter: now.Add(-1 * time.Hour),
},
},
certStore: &fakeStore{cert: expiredStoreCertData.certificate},
getTemplate: func() *x509.CertificateRequest { return &x509.CertificateRequest{} },
clientFn: func(_ *tls.Certificate) (certificatesclient.CertificateSigningRequestInterface, error) {
return &fakeClient{
certificatePEM: apiServerCertData.certificatePEM,
}, nil
},
certificateRotation: &h,
now: func() time.Time { return now },
}
ok, err := m.rotateCerts()
if err != nil || !ok {
t.Errorf("failed to rotate certs: %v", err)
}
if h.calls != 1 {
t.Errorf("rotation metric was not called")
}
if h.lastValue != now.Sub(certIss).Seconds() {
t.Errorf("rotation metric did not record the right value got: %f; want %f", h.lastValue, now.Sub(certIss).Seconds())
}
}
type fakeClientFailureType int
const (