From afad3417596327fe3c8814808415c59b4d06414f Mon Sep 17 00:00:00 2001 From: Tim Allclair Date: Tue, 26 Oct 2021 18:19:10 -0700 Subject: [PATCH] Cache fast-path metrics & update unit tests --- .../pod-security-admission/api/helpers.go | 7 +- .../src/k8s.io/pod-security-admission/go.mod | 1 + .../pod-security-admission/metrics/metrics.go | 220 ++++++++++++++---- .../metrics/metrics_test.go | 126 +++++----- 4 files changed, 258 insertions(+), 96 deletions(-) diff --git a/staging/src/k8s.io/pod-security-admission/api/helpers.go b/staging/src/k8s.io/pod-security-admission/api/helpers.go index ee175993d9f..5e309fddc2c 100644 --- a/staging/src/k8s.io/pod-security-admission/api/helpers.go +++ b/staging/src/k8s.io/pod-security-admission/api/helpers.go @@ -21,6 +21,7 @@ import ( "regexp" "strconv" "strings" + "unicode" "k8s.io/apimachinery/pkg/util/validation/field" "k8s.io/component-base/version" @@ -76,7 +77,11 @@ func GetAPIVersion() Version { if err != nil { return v } - minor, err := strconv.Atoi(apiVersion.Minor) + // split the "normal" + and - for semver stuff to get the leading minor number + minorString := strings.FieldsFunc(apiVersion.Minor, func(r rune) bool { + return !unicode.IsDigit(r) + })[0] + minor, err := strconv.Atoi(minorString) if err != nil { return v } diff --git a/staging/src/k8s.io/pod-security-admission/go.mod b/staging/src/k8s.io/pod-security-admission/go.mod index f8156b818c0..38c4358b67f 100644 --- a/staging/src/k8s.io/pod-security-admission/go.mod +++ b/staging/src/k8s.io/pod-security-admission/go.mod @@ -5,6 +5,7 @@ module k8s.io/pod-security-admission go 1.16 require ( + github.com/blang/semver v3.5.1+incompatible github.com/google/go-cmp v0.5.5 github.com/spf13/cobra v1.2.1 github.com/spf13/pflag v1.0.5 diff --git a/staging/src/k8s.io/pod-security-admission/metrics/metrics.go b/staging/src/k8s.io/pod-security-admission/metrics/metrics.go index d6ab481f098..bb6d384b21d 100644 --- a/staging/src/k8s.io/pod-security-admission/metrics/metrics.go +++ b/staging/src/k8s.io/pod-security-admission/metrics/metrics.go @@ -19,7 +19,9 @@ package metrics import ( "strconv" "strings" + "sync" + "github.com/blang/semver" admissionv1 "k8s.io/api/admission/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime/schema" @@ -47,34 +49,18 @@ type Recorder interface { type PrometheusRecorder struct { apiVersion api.Version - evaluationsCounter *metrics.CounterVec - exemptionsCounter *metrics.CounterVec + evaluationsCounter *evaluationsCounter + exemptionsCounter *exemptionsCounter errorsCounter *metrics.CounterVec } var _ Recorder = &PrometheusRecorder{} func NewPrometheusRecorder(version api.Version) *PrometheusRecorder { - evaluationsCounter := metrics.NewCounterVec( - &metrics.CounterOpts{ - Name: "pod_security_evaluations_total", - Help: "Number of policy evaluations that occurred, not counting ignored or exempt requests.", - StabilityLevel: metrics.ALPHA, - }, - []string{"decision", "policy_level", "policy_version", "mode", "request_operation", "resource", "subresource"}, - ) - exemptionsCounter := metrics.NewCounterVec( - &metrics.CounterOpts{ - Name: "pod_security_exemptions_total", - Help: "Number of exempt requests, not counting ignored or out of scope requests.", - StabilityLevel: metrics.ALPHA, - }, - []string{"request_operation", "resource", "subresource"}, - ) errorsCounter := metrics.NewCounterVec( &metrics.CounterOpts{ Name: "pod_security_errors_total", - Help: "Number of errors prevent normal evaluation. Non-fatal errors are evaluated against a default policy.", + Help: "Number of errors preventing normal evaluation. Non-fatal errors may result in the latest restricted profile being used for evaluation.", StabilityLevel: metrics.ALPHA, }, []string{"fatal", "request_operation", "resource", "subresource"}, @@ -82,8 +68,8 @@ func NewPrometheusRecorder(version api.Version) *PrometheusRecorder { return &PrometheusRecorder{ apiVersion: version, - evaluationsCounter: evaluationsCounter, - exemptionsCounter: exemptionsCounter, + evaluationsCounter: newEvaluationsCounter(), + exemptionsCounter: newExemptionsCounter(), errorsCounter: errorsCounter, } } @@ -101,13 +87,8 @@ func (r *PrometheusRecorder) Reset() { } func (r *PrometheusRecorder) RecordEvaluation(decision Decision, policy api.LevelVersion, evalMode Mode, attrs api.Attributes) { - dec := string(decision) - operation := operationLabel(attrs.GetOperation()) - resource := resourceLabel(attrs.GetResource()) - subresource := attrs.GetSubresource() - var version string - if policy.Version.Latest() { + if policy.Version.Latest() || policy.Level == api.LevelPrivileged { // Privileged is always effectively latest. version = "latest" } else { if !r.apiVersion.Older(policy.Version) { @@ -117,29 +98,44 @@ func (r *PrometheusRecorder) RecordEvaluation(decision Decision, policy api.Leve } } - r.evaluationsCounter.WithLabelValues(dec, string(policy.Level), - version, string(evalMode), operation, resource, subresource).Inc() + r.evaluationsCounter.CachedInc(evaluationsLabels{ + decision: string(decision), + level: string(policy.Level), + version: version, + mode: string(evalMode), + operation: operationLabel(attrs.GetOperation()), + resource: resourceLabel(attrs.GetResource()), + subresource: attrs.GetSubresource(), + }) } func (r *PrometheusRecorder) RecordExemption(attrs api.Attributes) { - operation := operationLabel(attrs.GetOperation()) - resource := resourceLabel(attrs.GetResource()) - subresource := attrs.GetSubresource() - r.exemptionsCounter.WithLabelValues(operation, resource, subresource).Inc() + r.exemptionsCounter.CachedInc(exemptionsLabels{ + operation: operationLabel(attrs.GetOperation()), + resource: resourceLabel(attrs.GetResource()), + subresource: attrs.GetSubresource(), + }) } func (r *PrometheusRecorder) RecordError(fatal bool, attrs api.Attributes) { - operation := operationLabel(attrs.GetOperation()) - resource := resourceLabel(attrs.GetResource()) - subresource := attrs.GetSubresource() - r.errorsCounter.WithLabelValues(strconv.FormatBool(fatal), operation, resource, subresource).Inc() + r.errorsCounter.WithLabelValues( + strconv.FormatBool(fatal), + operationLabel(attrs.GetOperation()), + resourceLabel(attrs.GetResource()), + attrs.GetSubresource(), + ).Inc() } +var ( + podResource = corev1.Resource("pods") + namespaceResource = corev1.Resource("namespaces") +) + func resourceLabel(resource schema.GroupVersionResource) string { switch resource.GroupResource() { - case corev1.Resource("pods"): + case podResource: return "pod" - case corev1.Resource("namespace"): + case namespaceResource: return "namespace" default: // Assume any other resource is a valid input to pod-security, and therefore a controller. @@ -158,3 +154,149 @@ func operationLabel(op admissionv1.Operation) string { return strings.ToLower(string(op)) } } + +type evaluationsLabels struct { + decision string + level string + version string + mode string + operation string + resource string + subresource string +} + +func (l *evaluationsLabels) labels() []string { + return []string{l.decision, l.level, l.version, l.mode, l.operation, l.resource, l.subresource} +} + +type exemptionsLabels struct { + operation string + resource string + subresource string +} + +func (l *exemptionsLabels) labels() []string { + return []string{l.operation, l.resource, l.subresource} +} + +type evaluationsCounter struct { + *metrics.CounterVec + + cache map[evaluationsLabels]metrics.CounterMetric + cacheLock sync.RWMutex +} + +func newEvaluationsCounter() *evaluationsCounter { + return &evaluationsCounter{ + CounterVec: metrics.NewCounterVec( + &metrics.CounterOpts{ + Name: "pod_security_evaluations_total", + Help: "Number of policy evaluations that occurred, not counting ignored or exempt requests.", + StabilityLevel: metrics.ALPHA, + }, + []string{"decision", "policy_level", "policy_version", "mode", "request_operation", "resource", "subresource"}, + ), + cache: make(map[evaluationsLabels]metrics.CounterMetric), + } +} + +func (c *evaluationsCounter) CachedInc(l evaluationsLabels) { + c.cacheLock.RLock() + defer c.cacheLock.RUnlock() + + if cachedCounter, ok := c.cache[l]; ok { + cachedCounter.Inc() + } else { + c.CounterVec.WithLabelValues(l.labels()...).Inc() + } +} + +func (c *evaluationsCounter) Create(version *semver.Version) bool { + c.cacheLock.Lock() + defer c.cacheLock.Unlock() + if c.CounterVec.Create(version) { + c.populateCache() + return true + } else { + return false + } +} + +func (c *evaluationsCounter) Reset() { + c.cacheLock.Lock() + defer c.cacheLock.Unlock() + c.CounterVec.Reset() + c.populateCache() +} + +func (c *evaluationsCounter) populateCache() { + labelsToCache := []evaluationsLabels{ + {decision: "allow", level: "privileged", version: "latest", mode: "enforce", operation: "create", resource: "pod", subresource: ""}, + {decision: "allow", level: "privileged", version: "latest", mode: "enforce", operation: "update", resource: "pod", subresource: ""}, + } + for _, l := range labelsToCache { + c.cache[l] = c.CounterVec.WithLabelValues(l.labels()...) + } +} + +type exemptionsCounter struct { + *metrics.CounterVec + + cache map[exemptionsLabels]metrics.CounterMetric + cacheLock sync.RWMutex +} + +func newExemptionsCounter() *exemptionsCounter { + return &exemptionsCounter{ + CounterVec: metrics.NewCounterVec( + &metrics.CounterOpts{ + Name: "pod_security_exemptions_total", + Help: "Number of exempt requests, not counting ignored or out of scope requests.", + StabilityLevel: metrics.ALPHA, + }, + []string{"request_operation", "resource", "subresource"}, + ), + cache: make(map[exemptionsLabels]metrics.CounterMetric), + } +} + +func (c *exemptionsCounter) CachedInc(l exemptionsLabels) { + c.cacheLock.RLock() + defer c.cacheLock.RUnlock() + + if cachedCounter, ok := c.cache[l]; ok { + cachedCounter.Inc() + } else { + c.CounterVec.WithLabelValues(l.labels()...).Inc() + } +} + +func (c *exemptionsCounter) Create(version *semver.Version) bool { + c.cacheLock.Lock() + defer c.cacheLock.Unlock() + if c.CounterVec.Create(version) { + c.populateCache() + return true + } else { + return false + } +} + +func (c *exemptionsCounter) Reset() { + c.cacheLock.Lock() + defer c.cacheLock.Unlock() + c.CounterVec.Reset() + c.populateCache() +} + +func (c *exemptionsCounter) populateCache() { + labelsToCache := []exemptionsLabels{ + {operation: "create", resource: "pod", subresource: ""}, + {operation: "update", resource: "pod", subresource: ""}, + {operation: "create", resource: "controller", subresource: ""}, + {operation: "update", resource: "controller", subresource: ""}, + } + for _, l := range labelsToCache { + c.cache[l] = c.CounterVec.WithLabelValues(l.labels()...) + } +} diff --git a/staging/src/k8s.io/pod-security-admission/metrics/metrics_test.go b/staging/src/k8s.io/pod-security-admission/metrics/metrics_test.go index 72081a89434..657e1aa6d56 100644 --- a/staging/src/k8s.io/pod-security-admission/metrics/metrics_test.go +++ b/staging/src/k8s.io/pod-security-admission/metrics/metrics_test.go @@ -17,20 +17,21 @@ limitations under the License. package metrics import ( - "strconv" + "bytes" + "fmt" + "sort" "strings" "testing" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" admissionv1 "k8s.io/api/admission/v1" appsv1 "k8s.io/api/apps/v1" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/component-base/metrics" "k8s.io/component-base/metrics/testutil" "k8s.io/pod-security-admission/api" + + "github.com/stretchr/testify/assert" ) var ( @@ -71,21 +72,18 @@ func TestRecordEvaluation(t *testing.T) { Resource: resource, Operation: op, }) - expectedLabels := map[string]string{ - "decision": string(decision), - "policy_level": string(level), - "policy_version": expectedVersion, - "mode": string(mode), - "request_operation": strings.ToLower(string(op)), - "resource": expectedResource, - "subresource": "", - } - val, err := testutil.GetCounterMetricValue(recorder.evaluationsCounter.With(expectedLabels)) - require.NoError(t, err, expectedLabels) - if !assert.EqualValues(t, 1, val, expectedLabels) { - findMetric(t, registry, "pod_security_evaluations_total") + if level == api.LevelPrivileged { + expectedVersion = "latest" } + expected := fmt.Sprintf(` + # HELP pod_security_evaluations_total [ALPHA] Number of policy evaluations that occurred, not counting ignored or exempt requests. + # TYPE pod_security_evaluations_total counter + pod_security_evaluations_total{decision="%s",mode="%s",policy_level="%s",policy_version="%s",request_operation="%s",resource="%s",subresource=""} 1 + `, decision, mode, level, expectedVersion, strings.ToLower(string(op)), expectedResource) + expected = expectCachedMetrics("pod_security_evaluations_total", expected) + + assert.NoError(t, testutil.GatherAndCompare(registry, bytes.NewBufferString(expected), "pod_security_evaluations_total")) recorder.Reset() } @@ -103,23 +101,24 @@ func TestRecordExemption(t *testing.T) { for _, op := range operations { for resource, expectedResource := range resourceExpectations { - recorder.RecordExemption(&api.AttributesRecord{ - Resource: resource, - Operation: op, - }) - expectedLabels := map[string]string{ - "request_operation": strings.ToLower(string(op)), - "resource": expectedResource, - "subresource": "", - } - val, err := testutil.GetCounterMetricValue(recorder.exemptionsCounter.With(expectedLabels)) - require.NoError(t, err, expectedLabels) + for _, subresource := range []string{"", "ephemeralcontainers"} { + recorder.RecordExemption(&api.AttributesRecord{ + Resource: resource, + Operation: op, + Subresource: subresource, + }) - if !assert.EqualValues(t, 1, val, expectedLabels) { - findMetric(t, registry, "pod_security_exemptions_total") - } + expected := fmt.Sprintf(` + # HELP pod_security_exemptions_total [ALPHA] Number of exempt requests, not counting ignored or out of scope requests. + # TYPE pod_security_exemptions_total counter + pod_security_exemptions_total{request_operation="%s",resource="%s",subresource="%s"} 1 + `, strings.ToLower(string(op)), expectedResource, subresource) + expected = expectCachedMetrics("pod_security_exemptions_total", expected) - recorder.Reset() + assert.NoError(t, testutil.GatherAndCompare(registry, bytes.NewBufferString(expected), "pod_security_exemptions_total")) + + recorder.Reset() + } } } } @@ -136,18 +135,14 @@ func TestRecordError(t *testing.T) { Resource: resource, Operation: op, }) - expectedLabels := map[string]string{ - "fatal": strconv.FormatBool(fatal), - "request_operation": strings.ToLower(string(op)), - "resource": expectedResource, - "subresource": "", - } - val, err := testutil.GetCounterMetricValue(recorder.errorsCounter.With(expectedLabels)) - require.NoError(t, err, expectedLabels) - if !assert.EqualValues(t, 1, val, expectedLabels) { - findMetric(t, registry, "pod_security_errors_total") - } + expected := bytes.NewBufferString(fmt.Sprintf(` + # HELP pod_security_errors_total [ALPHA] Number of errors preventing normal evaluation. Non-fatal errors may result in the latest restricted profile being used for evaluation. + # TYPE pod_security_errors_total counter + pod_security_errors_total{fatal="%t",request_operation="%s",resource="%s",subresource=""} 1 + `, fatal, strings.ToLower(string(op)), expectedResource)) + + assert.NoError(t, testutil.GatherAndCompare(registry, expected, "pod_security_errors_total")) recorder.Reset() } @@ -164,19 +159,38 @@ func levelVersion(level api.Level, version string) api.LevelVersion { return lv } -// findMetric dumps non-zero metric samples for the metric with the given name, to help with debugging. -func findMetric(t *testing.T, gatherer metrics.Gatherer, metricName string) { - t.Helper() - m, _ := gatherer.Gather() - for _, mFamily := range m { - if mFamily.GetName() == metricName { - for _, metric := range mFamily.GetMetric() { - if metric.GetCounter().GetValue() > 0 { - t.Logf("Found metric: %s", metric.String()) - } - } - return +// The cached metrics should always be present (value 0 if not counted). +var expectedCachedMetrics = map[string][]string{ + "pod_security_evaluations_total": { + `pod_security_evaluations_total{decision="allow",mode="enforce",policy_level="privileged",policy_version="latest",request_operation="create",resource="pod",subresource=""}`, + `pod_security_evaluations_total{decision="allow",mode="enforce",policy_level="privileged",policy_version="latest",request_operation="update",resource="pod",subresource=""}`, + }, + "pod_security_exemptions_total": { + `pod_security_exemptions_total{request_operation="create",resource="controller",subresource=""}`, + `pod_security_exemptions_total{request_operation="create",resource="pod",subresource=""}`, + `pod_security_exemptions_total{request_operation="update",resource="controller",subresource=""}`, + `pod_security_exemptions_total{request_operation="update",resource="pod",subresource=""}`, + }, +} + +func expectCachedMetrics(metricName, expected string) string { + expectations := strings.Split(strings.TrimSpace(expected), "\n") + for i, expectation := range expectations { + expectations[i] = strings.TrimSpace(expectation) // Whitespace messes with sorting. + } + for _, cached := range expectedCachedMetrics[metricName] { + expectations = addZeroExpectation(expectations, cached) + } + sort.Strings(expectations[:len(expectations)-1]) + return "\n" + strings.Join(expectations, "\n") + "\n" +} + +// addZeroExpectation adds the mixin as an empty sample if not already present. +func addZeroExpectation(currentExpectations []string, mixin string) []string { + for _, current := range currentExpectations { + if strings.HasPrefix(current, mixin) { + return currentExpectations // Mixin value already present. } } - t.Errorf("Expected metric %s not found", metricName) + return append(currentExpectations, fmt.Sprintf("%s 0", mixin)) }