node: memory manager: fix the mm metrics test

Fix the memory manager metrics tests by correctly restoring
the kubelet config after each test. We need to capture the
original config before any of the related tests run, to make
sure we restore the correct values.

Add more debug facilities to troubleshoot further failures.

Signed-off-by: Francesco Romani <fromani@redhat.com>
Francesco Romani 2024-03-04 13:46:09 +01:00
parent ef9965ebc6
commit a5d771c911
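
For context, here is a minimal sketch (not the committed code) of the
capture-once / restore-after-each pattern this change applies. It assumes
the existing e2e_node helpers getCurrentKubeletConfig and updateKubeletConfig
that appear in the diff below; the Describe text and framework name are
invented for illustration only.

package e2enode

import (
	"context"

	"github.com/onsi/ginkgo/v2"

	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
	"k8s.io/kubernetes/test/e2e/framework"
	admissionapi "k8s.io/pod-security-admission/api"
)

var _ = ginkgo.Describe("kubelet config restore pattern (sketch)", func() {
	f := framework.NewDefaultFramework("config-restore-sketch")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

	ginkgo.Context("when a test mutates the kubelet config", func() {
		// Declared at Context scope so the captured value survives across
		// BeforeEach invocations; re-declaring it inside BeforeEach would
		// shadow this variable and break the restore in AfterEach.
		var oldCfg *kubeletconfig.KubeletConfiguration

		ginkgo.BeforeEach(func(ctx context.Context) {
			var err error
			if oldCfg == nil {
				// Capture the pristine config only once, before any of the
				// related tests has had a chance to change it.
				oldCfg, err = getCurrentKubeletConfig(ctx)
				framework.ExpectNoError(err)
			}
			// ... apply the test-specific kubelet configuration here ...
		})

		ginkgo.AfterEach(func(ctx context.Context) {
			if oldCfg != nil {
				// Restore the configuration captured above so the next test
				// (and the rest of the suite) starts from the original values.
				updateKubeletConfig(ctx, f, oldCfg, true)
			}
		})
	})
})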

@@ -20,6 +20,7 @@ package e2enode
import (
"context"
"fmt"
"time"
"github.com/onsi/ginkgo/v2"
@@ -27,9 +28,13 @@ import (
"github.com/onsi/gomega/gstruct"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
clientset "k8s.io/client-go/kubernetes"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
"k8s.io/kubernetes/test/e2e/feature"
"k8s.io/kubernetes/test/e2e/framework"
e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
admissionapi "k8s.io/pod-security-admission/api"
)
@@ -39,10 +44,10 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
ginkgo.Context("when querying /metrics", func() {
var oldCfg *kubeletconfig.KubeletConfiguration
var testPod *v1.Pod
ginkgo.BeforeEach(func(ctx context.Context) {
var oldCfg *kubeletconfig.KubeletConfiguration
var err error
if oldCfg == nil {
oldCfg, err = getCurrentKubeletConfig(ctx)
@@ -73,6 +78,9 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
}
updateKubeletConfig(ctx, f, oldCfg, true)
})
count := printAllPodsOnNode(ctx, f.ClientSet, framework.TestContext.NodeName)
gomega.Expect(count).To(gomega.BeZero(), "unexpected pods on %q, please check output above", framework.TestContext.NodeName)
})
ginkgo.It("should report zero pinning counters after a fresh restart", func(ctx context.Context) {
@@ -82,10 +90,10 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
"kubelet_memory_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"": timelessSample(0),
"": timelessSample(0), // intentionally use stricter value
}),
"kubelet_memory_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"": timelessSample(0),
"": timelessSample(0), // intentionally use stricter value
}),
})
@@ -103,7 +111,8 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
ctnName: "memmngrcnt",
cpus: "100m",
memory: "1000Gi"},
}))
}),
)
// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
// being [Serial], we can also assume no one else but us is running pods.
@@ -122,6 +131,21 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
gomega.Eventually(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
ginkgo.By("Ensuring the metrics match the expectations a few more times")
gomega.Consistently(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
values, err := getKubeletMetrics(ctx)
framework.ExpectNoError(err, "error getting the kubelet metrics for sanity check")
err = validateMetrics(
values,
"kubelet_memory_manager_pinning_requests_total",
"kubelet_memory_manager_pinning_errors_total",
func(totVal, errVal float64) error {
if int64(totVal) != int64(errVal) {
return fmt.Errorf("expected total requests equal to total errors")
}
return nil
},
)
framework.ExpectNoError(err, "error validating the kubelet metrics between each other")
})
ginkgo.It("should not report any pinning failures when the memorymanager allocation is expected to succeed", func(ctx context.Context) {
@@ -131,8 +155,12 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
{
ctnName: "memmngrcnt",
cpus: "100m",
memory: "64Mi"},
}))
memory: "64Mi",
},
}),
)
printAllPodsOnNode(ctx, f.ClientSet, framework.TestContext.NodeName)
// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
// being [Serial], we can also assume no one else but us is running pods.
@@ -141,7 +169,7 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
"kubelet_memory_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"": timelessSample(1),
}),
"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"kubelet_memory_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"": timelessSample(0),
}),
})
@@ -150,6 +178,64 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
gomega.Eventually(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
ginkgo.By("Ensuring the metrics match the expectations a few more times")
gomega.Consistently(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
values, err := getKubeletMetrics(ctx)
framework.ExpectNoError(err, "error getting the kubelet metrics for sanity check")
err = validateMetrics(
values,
"kubelet_memory_manager_pinning_requests_total",
"kubelet_memory_manager_pinning_errors_total",
func(totVal, errVal float64) error {
if int64(totVal-errVal) < 1 {
return fmt.Errorf("expected total requests equal to total errors + 1")
}
return nil
},
)
framework.ExpectNoError(err, "error validating the kubelet metrics between each other")
})
})
})
func validateMetrics(values e2emetrics.KubeletMetrics, totalKey, errorKey string, checkFn func(totVal, errVal float64) error) error {
totalSamples := values[totalKey]
errorSamples := values[errorKey]
if len(totalSamples) != len(errorSamples) {
return fmt.Errorf("inconsistent samples, total=%d error=%d", len(totalSamples), len(errorSamples))
}
for idx := range totalSamples {
if err := checkFn(float64(totalSamples[idx].Value), float64(errorSamples[idx].Value)); err != nil {
return err
}
}
return nil
}
// printAllPodsOnNode outputs status of all kubelet pods into log.
// Note considering the e2e_node environment we will always have exactly 1 node, but still.
func printAllPodsOnNode(ctx context.Context, c clientset.Interface, nodeName string) int {
nodeSelector := fields.Set{
"spec.nodeName": nodeName,
}.AsSelector().String()
podList, err := c.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{
FieldSelector: nodeSelector,
})
if err != nil {
framework.Logf("Unable to retrieve pods for node %v: %v", nodeName, err)
return 0
}
count := 0
framework.Logf("begin listing pods: %d found", len(podList.Items))
for _, p := range podList.Items {
framework.Logf("%s/%s node %s (expected: %s) status %v QoS %s message %s reason %s (%d container statuses recorded)",
p.Namespace, p.Name, p.Spec.NodeName, nodeName, p.Status.Phase, p.Status.QOSClass, p.Status.Message, p.Status.Reason, len(p.Status.ContainerStatuses))
for _, c := range p.Status.ContainerStatuses {
framework.Logf("\tContainer %v ready: %v, restart count %v",
c.Name, c.Ready, c.RestartCount)
}
count++
}
framework.Logf("end listing pods: %d found", len(podList.Items))
return count
}