From afad3417596327fe3c8814808415c59b4d06414f Mon Sep 17 00:00:00 2001
From: Tim Allclair <timallclair@gmail.com>
Date: Tue, 26 Oct 2021 18:19:10 -0700
Subject: [PATCH] Cache fast-path metrics & update unit tests

---
 .../pod-security-admission/api/helpers.go     |   7 +-
 .../src/k8s.io/pod-security-admission/go.mod  |   1 +
 .../pod-security-admission/metrics/metrics.go | 220 ++++++++++++++----
 .../metrics/metrics_test.go                   | 126 +++++-----
 4 files changed, 258 insertions(+), 96 deletions(-)

diff --git a/staging/src/k8s.io/pod-security-admission/api/helpers.go b/staging/src/k8s.io/pod-security-admission/api/helpers.go
index ee175993d9f..5e309fddc2c 100644
--- a/staging/src/k8s.io/pod-security-admission/api/helpers.go
+++ b/staging/src/k8s.io/pod-security-admission/api/helpers.go
@@ -21,6 +21,7 @@ import (
 	"regexp"
 	"strconv"
 	"strings"
+	"unicode"
 
 	"k8s.io/apimachinery/pkg/util/validation/field"
 	"k8s.io/component-base/version"
@@ -76,7 +77,11 @@ func GetAPIVersion() Version {
 	if err != nil {
 		return v
 	}
-	minor, err := strconv.Atoi(apiVersion.Minor)
+	// split the "normal" + and - for semver stuff to get the leading minor number
+	minorString := strings.FieldsFunc(apiVersion.Minor, func(r rune) bool {
+		return !unicode.IsDigit(r)
+	})[0]
+	minor, err := strconv.Atoi(minorString)
 	if err != nil {
 		return v
 	}
diff --git a/staging/src/k8s.io/pod-security-admission/go.mod b/staging/src/k8s.io/pod-security-admission/go.mod
index f8156b818c0..38c4358b67f 100644
--- a/staging/src/k8s.io/pod-security-admission/go.mod
+++ b/staging/src/k8s.io/pod-security-admission/go.mod
@@ -5,6 +5,7 @@ module k8s.io/pod-security-admission
 go 1.16
 
 require (
+	github.com/blang/semver v3.5.1+incompatible
 	github.com/google/go-cmp v0.5.5
 	github.com/spf13/cobra v1.2.1
 	github.com/spf13/pflag v1.0.5
diff --git a/staging/src/k8s.io/pod-security-admission/metrics/metrics.go b/staging/src/k8s.io/pod-security-admission/metrics/metrics.go
index d6ab481f098..bb6d384b21d 100644
--- a/staging/src/k8s.io/pod-security-admission/metrics/metrics.go
+++ b/staging/src/k8s.io/pod-security-admission/metrics/metrics.go
@@ -19,7 +19,9 @@ package metrics
 import (
 	"strconv"
 	"strings"
+	"sync"
 
+	"github.com/blang/semver"
 	admissionv1 "k8s.io/api/admission/v1"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/runtime/schema"
@@ -47,34 +49,18 @@ type Recorder interface {
 type PrometheusRecorder struct {
 	apiVersion api.Version
 
-	evaluationsCounter *metrics.CounterVec
-	exemptionsCounter  *metrics.CounterVec
+	evaluationsCounter *evaluationsCounter
+	exemptionsCounter  *exemptionsCounter
 	errorsCounter      *metrics.CounterVec
 }
 
 var _ Recorder = &PrometheusRecorder{}
 
 func NewPrometheusRecorder(version api.Version) *PrometheusRecorder {
-	evaluationsCounter := metrics.NewCounterVec(
-		&metrics.CounterOpts{
-			Name:           "pod_security_evaluations_total",
-			Help:           "Number of policy evaluations that occurred, not counting ignored or exempt requests.",
-			StabilityLevel: metrics.ALPHA,
-		},
-		[]string{"decision", "policy_level", "policy_version", "mode", "request_operation", "resource", "subresource"},
-	)
-	exemptionsCounter := metrics.NewCounterVec(
-		&metrics.CounterOpts{
-			Name:           "pod_security_exemptions_total",
-			Help:           "Number of exempt requests, not counting ignored or out of scope requests.",
-			StabilityLevel: metrics.ALPHA,
-		},
-		[]string{"request_operation", "resource", "subresource"},
-	)
 	errorsCounter := metrics.NewCounterVec(
 		&metrics.CounterOpts{
 			Name:           "pod_security_errors_total",
-			Help:           "Number of errors prevent normal evaluation. Non-fatal errors are evaluated against a default policy.",
+			Help:           "Number of errors preventing normal evaluation. Non-fatal errors may result in the latest restricted profile being used for evaluation.",
 			StabilityLevel: metrics.ALPHA,
 		},
 		[]string{"fatal", "request_operation", "resource", "subresource"},
@@ -82,8 +68,8 @@ func NewPrometheusRecorder(version api.Version) *PrometheusRecorder {
 
 	return &PrometheusRecorder{
 		apiVersion:         version,
-		evaluationsCounter: evaluationsCounter,
-		exemptionsCounter:  exemptionsCounter,
+		evaluationsCounter: newEvaluationsCounter(),
+		exemptionsCounter:  newExemptionsCounter(),
 		errorsCounter:      errorsCounter,
 	}
 }
@@ -101,13 +87,8 @@ func (r *PrometheusRecorder) Reset() {
 }
 
 func (r *PrometheusRecorder) RecordEvaluation(decision Decision, policy api.LevelVersion, evalMode Mode, attrs api.Attributes) {
-	dec := string(decision)
-	operation := operationLabel(attrs.GetOperation())
-	resource := resourceLabel(attrs.GetResource())
-	subresource := attrs.GetSubresource()
-
 	var version string
-	if policy.Version.Latest() {
+	if policy.Version.Latest() || policy.Level == api.LevelPrivileged { // Privileged is always effectively latest.
 		version = "latest"
 	} else {
 		if !r.apiVersion.Older(policy.Version) {
@@ -117,29 +98,44 @@ func (r *PrometheusRecorder) RecordEvaluation(decision Decision, policy api.Leve
 		}
 	}
 
-	r.evaluationsCounter.WithLabelValues(dec, string(policy.Level),
-		version, string(evalMode), operation, resource, subresource).Inc()
+	r.evaluationsCounter.CachedInc(evaluationsLabels{
+		decision:    string(decision),
+		level:       string(policy.Level),
+		version:     version,
+		mode:        string(evalMode),
+		operation:   operationLabel(attrs.GetOperation()),
+		resource:    resourceLabel(attrs.GetResource()),
+		subresource: attrs.GetSubresource(),
+	})
 }
 
 func (r *PrometheusRecorder) RecordExemption(attrs api.Attributes) {
-	operation := operationLabel(attrs.GetOperation())
-	resource := resourceLabel(attrs.GetResource())
-	subresource := attrs.GetSubresource()
-	r.exemptionsCounter.WithLabelValues(operation, resource, subresource).Inc()
+	r.exemptionsCounter.CachedInc(exemptionsLabels{
+		operation:   operationLabel(attrs.GetOperation()),
+		resource:    resourceLabel(attrs.GetResource()),
+		subresource: attrs.GetSubresource(),
+	})
 }
 
 func (r *PrometheusRecorder) RecordError(fatal bool, attrs api.Attributes) {
-	operation := operationLabel(attrs.GetOperation())
-	resource := resourceLabel(attrs.GetResource())
-	subresource := attrs.GetSubresource()
-	r.errorsCounter.WithLabelValues(strconv.FormatBool(fatal), operation, resource, subresource).Inc()
+	r.errorsCounter.WithLabelValues(
+		strconv.FormatBool(fatal),
+		operationLabel(attrs.GetOperation()),
+		resourceLabel(attrs.GetResource()),
+		attrs.GetSubresource(),
+	).Inc()
 }
 
+var (
+	podResource       = corev1.Resource("pods")
+	namespaceResource = corev1.Resource("namespaces")
+)
+
 func resourceLabel(resource schema.GroupVersionResource) string {
 	switch resource.GroupResource() {
-	case corev1.Resource("pods"):
+	case podResource:
 		return "pod"
-	case corev1.Resource("namespace"):
+	case namespaceResource:
 		return "namespace"
 	default:
 		// Assume any other resource is a valid input to pod-security, and therefore a controller.
@@ -158,3 +154,149 @@ func operationLabel(op admissionv1.Operation) string {
 		return strings.ToLower(string(op))
 	}
 }
+
+type evaluationsLabels struct {
+	decision    string
+	level       string
+	version     string
+	mode        string
+	operation   string
+	resource    string
+	subresource string
+}
+
+func (l *evaluationsLabels) labels() []string {
+	return []string{l.decision, l.level, l.version, l.mode, l.operation, l.resource, l.subresource}
+}
+
+type exemptionsLabels struct {
+	operation   string
+	resource    string
+	subresource string
+}
+
+func (l *exemptionsLabels) labels() []string {
+	return []string{l.operation, l.resource, l.subresource}
+}
+
+type evaluationsCounter struct {
+	*metrics.CounterVec
+
+	cache     map[evaluationsLabels]metrics.CounterMetric
+	cacheLock sync.RWMutex
+}
+
+func newEvaluationsCounter() *evaluationsCounter {
+	return &evaluationsCounter{
+		CounterVec: metrics.NewCounterVec(
+			&metrics.CounterOpts{
+				Name:           "pod_security_evaluations_total",
+				Help:           "Number of policy evaluations that occurred, not counting ignored or exempt requests.",
+				StabilityLevel: metrics.ALPHA,
+			},
+			[]string{"decision", "policy_level", "policy_version", "mode", "request_operation", "resource", "subresource"},
+		),
+		cache: make(map[evaluationsLabels]metrics.CounterMetric),
+	}
+}
+
+func (c *evaluationsCounter) CachedInc(l evaluationsLabels) {
+	c.cacheLock.RLock()
+	defer c.cacheLock.RUnlock()
+
+	if cachedCounter, ok := c.cache[l]; ok {
+		cachedCounter.Inc()
+	} else {
+		c.CounterVec.WithLabelValues(l.labels()...).Inc()
+	}
+}
+
+func (c *evaluationsCounter) Create(version *semver.Version) bool {
+	c.cacheLock.Lock()
+	defer c.cacheLock.Unlock()
+	if c.CounterVec.Create(version) {
+		c.populateCache()
+		return true
+	} else {
+		return false
+	}
+}
+
+func (c *evaluationsCounter) Reset() {
+	c.cacheLock.Lock()
+	defer c.cacheLock.Unlock()
+	c.CounterVec.Reset()
+	c.populateCache()
+}
+
+func (c *evaluationsCounter) populateCache() {
+	labelsToCache := []evaluationsLabels{
+		{decision: "allow", level: "privileged", version: "latest", mode: "enforce", operation: "create", resource: "pod", subresource: ""},
+		{decision: "allow", level: "privileged", version: "latest", mode: "enforce", operation: "update", resource: "pod", subresource: ""},
+	}
+	for _, l := range labelsToCache {
+		c.cache[l] = c.CounterVec.WithLabelValues(l.labels()...)
+	}
+}
+
+type exemptionsCounter struct {
+	*metrics.CounterVec
+
+	cache     map[exemptionsLabels]metrics.CounterMetric
+	cacheLock sync.RWMutex
+}
+
+func newExemptionsCounter() *exemptionsCounter {
+	return &exemptionsCounter{
+		CounterVec: metrics.NewCounterVec(
+			&metrics.CounterOpts{
+				Name:           "pod_security_exemptions_total",
+				Help:           "Number of exempt requests, not counting ignored or out of scope requests.",
+				StabilityLevel: metrics.ALPHA,
+			},
+			[]string{"request_operation", "resource", "subresource"},
+		),
+		cache: make(map[exemptionsLabels]metrics.CounterMetric),
+	}
+}
+
+func (c *exemptionsCounter) CachedInc(l exemptionsLabels) {
+	c.cacheLock.RLock()
+	defer c.cacheLock.RUnlock()
+
+	if cachedCounter, ok := c.cache[l]; ok {
+		cachedCounter.Inc()
+	} else {
+		c.CounterVec.WithLabelValues(l.labels()...).Inc()
+	}
+}
+
+func (c *exemptionsCounter) Create(version *semver.Version) bool {
+	c.cacheLock.Lock()
+	defer c.cacheLock.Unlock()
+	if c.CounterVec.Create(version) {
+		c.populateCache()
+		return true
+	} else {
+		return false
+	}
+}
+
+func (c *exemptionsCounter) Reset() {
+	c.cacheLock.Lock()
+	defer c.cacheLock.Unlock()
+	c.CounterVec.Reset()
+	c.populateCache()
+}
+
+func (c *exemptionsCounter) populateCache() {
+	labelsToCache := []exemptionsLabels{
+		{operation: "create", resource: "pod", subresource: ""},
+		{operation: "update", resource: "pod", subresource: ""},
+		{operation: "create", resource: "controller", subresource: ""},
+		{operation: "update", resource: "controller", subresource: ""},
+	}
+	for _, l := range labelsToCache {
+		c.cache[l] = c.CounterVec.WithLabelValues(l.labels()...)
+	}
+}
diff --git a/staging/src/k8s.io/pod-security-admission/metrics/metrics_test.go b/staging/src/k8s.io/pod-security-admission/metrics/metrics_test.go
index 72081a89434..657e1aa6d56 100644
--- a/staging/src/k8s.io/pod-security-admission/metrics/metrics_test.go
+++ b/staging/src/k8s.io/pod-security-admission/metrics/metrics_test.go
@@ -17,20 +17,21 @@ limitations under the License.
 package metrics
 
 import (
-	"strconv"
+	"bytes"
+	"fmt"
+	"sort"
 	"strings"
 	"testing"
 
-	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
 	admissionv1 "k8s.io/api/admission/v1"
 	appsv1 "k8s.io/api/apps/v1"
 	batchv1 "k8s.io/api/batch/v1"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/runtime/schema"
-	"k8s.io/component-base/metrics"
 	"k8s.io/component-base/metrics/testutil"
 	"k8s.io/pod-security-admission/api"
+
+	"github.com/stretchr/testify/assert"
 )
 
 var (
@@ -71,21 +72,18 @@ func TestRecordEvaluation(t *testing.T) {
 								Resource:  resource,
 								Operation: op,
 							})
-							expectedLabels := map[string]string{
-								"decision":          string(decision),
-								"policy_level":      string(level),
-								"policy_version":    expectedVersion,
-								"mode":              string(mode),
-								"request_operation": strings.ToLower(string(op)),
-								"resource":          expectedResource,
-								"subresource":       "",
-							}
-							val, err := testutil.GetCounterMetricValue(recorder.evaluationsCounter.With(expectedLabels))
-							require.NoError(t, err, expectedLabels)
 
-							if !assert.EqualValues(t, 1, val, expectedLabels) {
-								findMetric(t, registry, "pod_security_evaluations_total")
+							if level == api.LevelPrivileged {
+								expectedVersion = "latest"
 							}
+							expected := fmt.Sprintf(`
+							# HELP pod_security_evaluations_total [ALPHA] Number of policy evaluations that occurred, not counting ignored or exempt requests.
+        	            	# TYPE pod_security_evaluations_total counter
+							pod_security_evaluations_total{decision="%s",mode="%s",policy_level="%s",policy_version="%s",request_operation="%s",resource="%s",subresource=""} 1
+							`, decision, mode, level, expectedVersion, strings.ToLower(string(op)), expectedResource)
+							expected = expectCachedMetrics("pod_security_evaluations_total", expected)
+
+							assert.NoError(t, testutil.GatherAndCompare(registry, bytes.NewBufferString(expected), "pod_security_evaluations_total"))
 
 							recorder.Reset()
 						}
@@ -103,23 +101,24 @@ func TestRecordExemption(t *testing.T) {
 
 	for _, op := range operations {
 		for resource, expectedResource := range resourceExpectations {
-			recorder.RecordExemption(&api.AttributesRecord{
-				Resource:  resource,
-				Operation: op,
-			})
-			expectedLabels := map[string]string{
-				"request_operation": strings.ToLower(string(op)),
-				"resource":          expectedResource,
-				"subresource":       "",
-			}
-			val, err := testutil.GetCounterMetricValue(recorder.exemptionsCounter.With(expectedLabels))
-			require.NoError(t, err, expectedLabels)
+			for _, subresource := range []string{"", "ephemeralcontainers"} {
+				recorder.RecordExemption(&api.AttributesRecord{
+					Resource:    resource,
+					Operation:   op,
+					Subresource: subresource,
+				})
 
-			if !assert.EqualValues(t, 1, val, expectedLabels) {
-				findMetric(t, registry, "pod_security_exemptions_total")
-			}
+				expected := fmt.Sprintf(`
+				# HELP pod_security_exemptions_total [ALPHA] Number of exempt requests, not counting ignored or out of scope requests.
+				# TYPE pod_security_exemptions_total counter
+				pod_security_exemptions_total{request_operation="%s",resource="%s",subresource="%s"} 1
+				`, strings.ToLower(string(op)), expectedResource, subresource)
+				expected = expectCachedMetrics("pod_security_exemptions_total", expected)
 
-			recorder.Reset()
+				assert.NoError(t, testutil.GatherAndCompare(registry, bytes.NewBufferString(expected), "pod_security_exemptions_total"))
+
+				recorder.Reset()
+			}
 		}
 	}
 }
@@ -136,18 +135,14 @@ func TestRecordError(t *testing.T) {
 					Resource:  resource,
 					Operation: op,
 				})
-				expectedLabels := map[string]string{
-					"fatal":             strconv.FormatBool(fatal),
-					"request_operation": strings.ToLower(string(op)),
-					"resource":          expectedResource,
-					"subresource":       "",
-				}
-				val, err := testutil.GetCounterMetricValue(recorder.errorsCounter.With(expectedLabels))
-				require.NoError(t, err, expectedLabels)
 
-				if !assert.EqualValues(t, 1, val, expectedLabels) {
-					findMetric(t, registry, "pod_security_errors_total")
-				}
+				expected := bytes.NewBufferString(fmt.Sprintf(`
+				# HELP pod_security_errors_total [ALPHA] Number of errors preventing normal evaluation. Non-fatal errors may result in the latest restricted profile being used for evaluation.
+				# TYPE pod_security_errors_total counter
+				pod_security_errors_total{fatal="%t",request_operation="%s",resource="%s",subresource=""} 1
+				`, fatal, strings.ToLower(string(op)), expectedResource))
+
+				assert.NoError(t, testutil.GatherAndCompare(registry, expected, "pod_security_errors_total"))
 
 				recorder.Reset()
 			}
@@ -164,19 +159,38 @@ func levelVersion(level api.Level, version string) api.LevelVersion {
 	return lv
 }
 
-// findMetric dumps non-zero metric samples for the metric with the given name, to help with debugging.
-func findMetric(t *testing.T, gatherer metrics.Gatherer, metricName string) {
-	t.Helper()
-	m, _ := gatherer.Gather()
-	for _, mFamily := range m {
-		if mFamily.GetName() == metricName {
-			for _, metric := range mFamily.GetMetric() {
-				if metric.GetCounter().GetValue() > 0 {
-					t.Logf("Found metric: %s", metric.String())
-				}
-			}
-			return
+// The cached metrics should always be present (value 0 if not counted).
+var expectedCachedMetrics = map[string][]string{
+	"pod_security_evaluations_total": {
+		`pod_security_evaluations_total{decision="allow",mode="enforce",policy_level="privileged",policy_version="latest",request_operation="create",resource="pod",subresource=""}`,
+		`pod_security_evaluations_total{decision="allow",mode="enforce",policy_level="privileged",policy_version="latest",request_operation="update",resource="pod",subresource=""}`,
+	},
+	"pod_security_exemptions_total": {
+		`pod_security_exemptions_total{request_operation="create",resource="controller",subresource=""}`,
+		`pod_security_exemptions_total{request_operation="create",resource="pod",subresource=""}`,
+		`pod_security_exemptions_total{request_operation="update",resource="controller",subresource=""}`,
+		`pod_security_exemptions_total{request_operation="update",resource="pod",subresource=""}`,
+	},
+}
+
+func expectCachedMetrics(metricName, expected string) string {
+	expectations := strings.Split(strings.TrimSpace(expected), "\n")
+	for i, expectation := range expectations {
+		expectations[i] = strings.TrimSpace(expectation) // Whitespace messes with sorting.
+	}
+	for _, cached := range expectedCachedMetrics[metricName] {
+		expectations = addZeroExpectation(expectations, cached)
+	}
+	sort.Strings(expectations[:len(expectations)-1])
+	return "\n" + strings.Join(expectations, "\n") + "\n"
+}
+
+// addZeroExpectation adds the mixin as an empty sample if not already present.
+func addZeroExpectation(currentExpectations []string, mixin string) []string {
+	for _, current := range currentExpectations {
+		if strings.HasPrefix(current, mixin) {
+			return currentExpectations // Mixin value already present.
 		}
 	}
-	t.Errorf("Expected metric %s not found", metricName)
+	return append(currentExpectations, fmt.Sprintf("%s 0", mixin))
 }