From bdc08eaa4b9ec38fa20573691a27e99862b4729c Mon Sep 17 00:00:00 2001
From: Francesco Romani
Date: Wed, 5 Oct 2022 19:21:27 +0200
Subject: [PATCH] e2e: node: add tests for cpumanager metrics

Add tests to verify the cpumanager metrics are populated.

Signed-off-by: Francesco Romani
---
 test/e2e_node/cpu_manager_metrics_test.go | 206 ++++++++++++++++++++++
 1 file changed, 206 insertions(+)
 create mode 100644 test/e2e_node/cpu_manager_metrics_test.go

diff --git a/test/e2e_node/cpu_manager_metrics_test.go b/test/e2e_node/cpu_manager_metrics_test.go
new file mode 100644
index 00000000000..a899d393210
--- /dev/null
+++ b/test/e2e_node/cpu_manager_metrics_test.go
@@ -0,0 +1,206 @@
+/*
+Copyright 2022 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package e2enode
+
+import (
+	"fmt"
+	"time"
+
+	"github.com/onsi/ginkgo/v2"
+	"github.com/onsi/gomega"
+	"github.com/onsi/gomega/gstruct"
+	"github.com/onsi/gomega/types"
+
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
+	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
+	"k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
+	"k8s.io/kubernetes/test/e2e/framework"
+	e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
+	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
+	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
+	admissionapi "k8s.io/pod-security-admission/api"
+)
+
+var _ = SIGDescribe("CPU Manager Metrics [Serial][Feature:CPUManager]", func() {
+	f := framework.NewDefaultFramework("cpumanager-metrics")
+	f.NamespacePodSecurityEnforceLevel = admissionapi.LevelPrivileged
+
+	ginkgo.Context("when querying /metrics", func() {
+		var oldCfg *kubeletconfig.KubeletConfiguration
+		var testPod *v1.Pod
+		var smtLevel int
+
+		ginkgo.BeforeEach(func() {
+			var err error
+			if oldCfg == nil {
+				oldCfg, err = getCurrentKubeletConfig()
+				framework.ExpectNoError(err)
+			}
+
+			fullCPUsOnlyOpt := fmt.Sprintf("option=%s", cpumanager.FullPCPUsOnlyOption)
+			_, cpuAlloc, _ := getLocalNodeCPUDetails(f)
+			smtLevel = getSMTLevel()
+
+			// strict SMT alignment is trivially verified and granted on non-SMT systems
+			if smtLevel < 2 {
+				e2eskipper.Skipf("Skipping CPU Manager %s tests since SMT is disabled", fullCPUsOnlyOpt)
+			}
+
+			// our tests want to allocate up to a full core, so we need at least 2*2=4 virtual cpus
+			if cpuAlloc < int64(smtLevel*2) {
+				e2eskipper.Skipf("Skipping CPU Manager %s tests since the CPU capacity < 4", fullCPUsOnlyOpt)
+			}
+
+			framework.Logf("SMT level %d", smtLevel)
+
+			// TODO: we assume the first available CPUID is 0, which is pretty fair, but we should probably
+			// check what we do have in the node.
+			cpuPolicyOptions := map[string]string{
+				cpumanager.FullPCPUsOnlyOption: "true",
+			}
+			newCfg := configureCPUManagerInKubelet(oldCfg,
+				&cpuManagerKubeletArguments{
+					policyName:              string(cpumanager.PolicyStatic),
+					enableCPUManager:        true,
+					reservedSystemCPUs:      cpuset.NewCPUSet(0),
+					enableCPUManagerOptions: true,
+					options:                 cpuPolicyOptions,
+				},
+			)
+			updateKubeletConfig(f, newCfg, true)
+		})
+
+		ginkgo.AfterEach(func() {
+			if testPod != nil {
+				deletePodSyncByName(f, testPod.Name)
+			}
+			updateKubeletConfig(f, oldCfg, true)
+		})
+
+		ginkgo.It("should report zero pinning counters after a fresh restart", func() {
+			// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
+			// being [Serial], we can also assume no one else but us is running pods.
+			ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with no pods running")
+
+			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
+				"kubelet_cpu_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
+					"": timelessSample(0),
+				}),
+				"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
+					"": timelessSample(0),
+				}),
+			})
+
+			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
+			gomega.Eventually(getCPUManagerMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
+			ginkgo.By("Ensuring the metrics match the expectations a few more times")
+			gomega.Consistently(getCPUManagerMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
+		})
+
+		ginkgo.It("should report pinning failures when the cpumanager allocation is known to fail", func() {
+			ginkgo.By("Creating the test pod which will be rejected for SMTAlignmentError")
+			testPod = e2epod.NewPodClient(f).Create(makeGuaranteedCPUExclusiveSleeperPod("smt-align-err", 1))
+
+			// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
+			// being [Serial], we can also assume no one else but us is running pods.
+			ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod failed to admit")
+
+			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
+				"kubelet_cpu_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
+					"": timelessSample(1),
+				}),
+				"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
+					"": timelessSample(1),
+				}),
+			})
+
+			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
+			gomega.Eventually(getCPUManagerMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
+			ginkgo.By("Ensuring the metrics match the expectations a few more times")
+			gomega.Consistently(getCPUManagerMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
+		})
+
+		ginkgo.It("should not report any pinning failures when the cpumanager allocation is expected to succeed", func() {
+			ginkgo.By("Creating the test pod")
+			testPod = e2epod.NewPodClient(f).Create(makeGuaranteedCPUExclusiveSleeperPod("smt-align-ok", smtLevel))
+
+			// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
+			// being [Serial], we can also assume no one else but us is running pods.
+ ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted") + + matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{ + "kubelet_cpu_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{ + "": timelessSample(1), + }), + "kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{ + "": timelessSample(0), + }), + }) + + ginkgo.By("Giving the Kubelet time to start up and produce metrics") + gomega.Eventually(getCPUManagerMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics) + ginkgo.By("Ensuring the metrics match the expectations a few more times") + gomega.Consistently(getCPUManagerMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics) + }) + }) +}) + +func getCPUManagerMetrics() (e2emetrics.KubeletMetrics, error) { + // we are running out of good names, so we need to be unnecessarily specific to avoid clashes + ginkgo.By("getting CPU Manager metrics from the metrics API") + return e2emetrics.GrabKubeletMetricsWithoutProxy(framework.TestContext.NodeName+":10255", "/metrics") +} + +func makeGuaranteedCPUExclusiveSleeperPod(name string, cpus int) *v1.Pod { + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name + "-pod", + }, + Spec: v1.PodSpec{ + RestartPolicy: v1.RestartPolicyNever, + Containers: []v1.Container{ + { + Name: name + "-cnt", + Image: busyboxImage, + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse(fmt.Sprintf("%d", cpus)), + v1.ResourceMemory: resource.MustParse("64Mi"), + }, + Limits: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse(fmt.Sprintf("%d", cpus)), + v1.ResourceMemory: resource.MustParse("64Mi"), + }, + }, + Command: []string{"sh", "-c", "sleep", "1d"}, + }, + }, + }, + } +} + +func timelessSample(value interface{}) types.GomegaMatcher { + return gstruct.PointTo(gstruct.MatchAllFields(gstruct.Fields{ + // We already check Metric when matching the Id + "Metric": gstruct.Ignore(), + "Value": gomega.BeNumerically("==", value), + "Timestamp": gstruct.Ignore(), + })) +}