mirror of https://github.com/k3s-io/kubernetes.git
node: memory manager: fix the mm metrics test
Fix the memory manager metrics tests by correctly restoring the kubelet config after each test. We need to capture the original config before all the related tests run, in order to make sure we restore the correct values. Also add more debug facilities to troubleshoot further failures.

Signed-off-by: Francesco Romani <fromani@redhat.com>
parent ef9965ebc6
commit a5d771c911
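The core of the fix is visible in the first hunks below: the saved kubelet configuration (oldCfg) moves from a variable local to BeforeEach up to the Context scope, so it is captured once, before any test mutates the config, and AfterEach can always restore it. A minimal sketch of that pattern, using the existing e2e_node helpers getCurrentKubeletConfig and updateKubeletConfig that appear in the diff; the wrapper function name and the framework handle parameter are illustrative, not part of the commit:

package e2enode

import (
    "context"

    "github.com/onsi/ginkgo/v2"

    kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
    "k8s.io/kubernetes/test/e2e/framework"
)

// sketchConfigSaveRestore is an illustrative wrapper, not part of the commit.
func sketchConfigSaveRestore(f *framework.Framework) {
    ginkgo.Context("when querying /metrics", func() {
        // Held at Context scope: captured once, restored after every spec.
        var oldCfg *kubeletconfig.KubeletConfiguration

        ginkgo.BeforeEach(func(ctx context.Context) {
            var err error
            if oldCfg == nil {
                // Capture the pristine config only the first time around.
                oldCfg, err = getCurrentKubeletConfig(ctx)
                framework.ExpectNoError(err)
            }
            // ...mutate the config and restart the kubelet for the test...
        })

        ginkgo.AfterEach(func(ctx context.Context) {
            // Always roll back to the config captured before the tests ran.
            updateKubeletConfig(ctx, f, oldCfg, true)
        })
    })
}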
@@ -20,6 +20,7 @@ package e2enode
 import (
     "context"
+    "fmt"
     "time"

     "github.com/onsi/ginkgo/v2"
@@ -27,9 +28,13 @@ import (
     "github.com/onsi/gomega/gstruct"
     v1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/api/resource"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/fields"
+    clientset "k8s.io/client-go/kubernetes"
     kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
     "k8s.io/kubernetes/test/e2e/feature"
     "k8s.io/kubernetes/test/e2e/framework"
+    e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
     e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
     admissionapi "k8s.io/pod-security-admission/api"
 )
@@ -39,10 +44,10 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
     f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

     ginkgo.Context("when querying /metrics", func() {
+        var oldCfg *kubeletconfig.KubeletConfiguration
         var testPod *v1.Pod

         ginkgo.BeforeEach(func(ctx context.Context) {
-            var oldCfg *kubeletconfig.KubeletConfiguration
             var err error
             if oldCfg == nil {
                 oldCfg, err = getCurrentKubeletConfig(ctx)
@@ -73,6 +78,9 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
             }
             updateKubeletConfig(ctx, f, oldCfg, true)
+
+            count := printAllPodsOnNode(ctx, f.ClientSet, framework.TestContext.NodeName)
+            gomega.Expect(count).To(gomega.BeZero(), "unexpected pods on %q, please check output above", framework.TestContext.NodeName)
         })

         ginkgo.It("should report zero pinning counters after a fresh restart", func(ctx context.Context) {
@@ -82,10 +90,10 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
             matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
                 "kubelet_memory_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
-                    "": timelessSample(0),
+                    "": timelessSample(0), // intentionally use stricter value
                 }),
                 "kubelet_memory_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
-                    "": timelessSample(0),
+                    "": timelessSample(0), // intentionally use stricter value
                 }),
             })

@@ -103,7 +111,8 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
                     ctnName: "memmngrcnt",
                     cpus:    "100m",
                     memory:  "1000Gi"},
-            }))
+            }),
+            )

             // we updated the kubelet config in BeforeEach, so we can assume we start fresh.
             // being [Serial], we can also assume noone else but us is running pods.
@@ -122,6 +131,21 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
             gomega.Eventually(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
             ginkgo.By("Ensuring the metrics match the expectations a few more times")
             gomega.Consistently(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
+
+            values, err := getKubeletMetrics(ctx)
+            framework.ExpectNoError(err, "error getting the kubelet metrics for sanity check")
+            err = validateMetrics(
+                values,
+                "kubelet_memory_manager_pinning_requests_total",
+                "kubelet_memory_manager_pinning_errors_total",
+                func(totVal, errVal float64) error {
+                    if int64(totVal) != int64(errVal) {
+                        return fmt.Errorf("expected total requests equal to total errors")
+                    }
+                    return nil
+                },
+            )
+            framework.ExpectNoError(err, "error validating the kubelet metrics between each other")
         })

         ginkgo.It("should not report any pinning failures when the memorymanager allocation is expected to succeed", func(ctx context.Context) {
@@ -131,8 +155,12 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
                 {
                     ctnName: "memmngrcnt",
                     cpus:    "100m",
-                    memory:  "64Mi"},
-            }))
+                    memory:  "64Mi",
+                },
+            }),
+            )
+
+            printAllPodsOnNode(ctx, f.ClientSet, framework.TestContext.NodeName)

             // we updated the kubelet config in BeforeEach, so we can assume we start fresh.
             // being [Serial], we can also assume noone else but us is running pods.
@@ -141,7 +169,7 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
                 "kubelet_memory_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
                     "": timelessSample(1),
                 }),
-                "kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
+                "kubelet_memory_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
                     "": timelessSample(0),
                 }),
             })
@@ -150,6 +178,64 @@ var _ = SIGDescribe("Memory Manager Metrics", framework.WithSerial(), feature.Me
             gomega.Eventually(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
             ginkgo.By("Ensuring the metrics match the expectations a few more times")
             gomega.Consistently(getKubeletMetrics, 1*time.Minute, 15*time.Second).WithContext(ctx).Should(matchResourceMetrics)
+
+            values, err := getKubeletMetrics(ctx)
+            framework.ExpectNoError(err, "error getting the kubelet metrics for sanity check")
+            err = validateMetrics(
+                values,
+                "kubelet_memory_manager_pinning_requests_total",
+                "kubelet_memory_manager_pinning_errors_total",
+                func(totVal, errVal float64) error {
+                    if int64(totVal-errVal) < 1 {
+                        return fmt.Errorf("expected total requests equal to total errors + 1")
+                    }
+                    return nil
+                },
+            )
+            framework.ExpectNoError(err, "error validating the kubelet metrics between each other")
         })
     })
 })
+
+func validateMetrics(values e2emetrics.KubeletMetrics, totalKey, errorKey string, checkFn func(totVal, errVal float64) error) error {
+    totalSamples := values[totalKey]
+    errorSamples := values[errorKey]
+    if len(totalSamples) != len(errorSamples) {
+        return fmt.Errorf("inconsistent samples, total=%d error=%d", len(totalSamples), len(errorSamples))
+    }
+    for idx := range totalSamples {
+        if err := checkFn(float64(totalSamples[idx].Value), float64(errorSamples[idx].Value)); err != nil {
+            return err
+        }
+    }
+    return nil
+}
+
+// printAllPodsOnNode outputs status of all kubelet pods into log.
+// Note considering the e2e_node environment we will always have exactly 1 node, but still.
+func printAllPodsOnNode(ctx context.Context, c clientset.Interface, nodeName string) int {
+    nodeSelector := fields.Set{
+        "spec.nodeName": nodeName,
+    }.AsSelector().String()
+
+    podList, err := c.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{
+        FieldSelector: nodeSelector,
+    })
+    if err != nil {
+        framework.Logf("Unable to retrieve pods for node %v: %v", nodeName, err)
+        return 0
+    }
+    count := 0
+    framework.Logf("begin listing pods: %d found", len(podList.Items))
+    for _, p := range podList.Items {
+        framework.Logf("%s/%s node %s (expected: %s) status %v QoS %s message %s reason %s (%d container statuses recorded)",
+            p.Namespace, p.Name, p.Spec.NodeName, nodeName, p.Status.Phase, p.Status.QOSClass, p.Status.Message, p.Status.Reason, len(p.Status.ContainerStatuses))
+        for _, c := range p.Status.ContainerStatuses {
+            framework.Logf("\tContainer %v ready: %v, restart count %v",
+                c.Name, c.Ready, c.RestartCount)
+        }
+        count++
+    }
+    framework.Logf("end listing pods: %d found", len(podList.Items))
+    return count
+}
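As a usage note, the cross-check performed by the new validateMetrics helper can be exercised standalone. A minimal sketch, assuming the Prometheus common/model sample types that back e2emetrics.KubeletMetrics; the crossCheck name and the sample values are illustrative, not part of the commit:

package main

import (
    "fmt"

    "github.com/prometheus/common/model"
)

// crossCheck mirrors validateMetrics above: walk two sample sets in lockstep
// and apply checkFn to each (total, error) pair.
func crossCheck(totals, errs model.Samples, checkFn func(totVal, errVal float64) error) error {
    if len(totals) != len(errs) {
        return fmt.Errorf("inconsistent samples, total=%d error=%d", len(totals), len(errs))
    }
    for idx := range totals {
        if err := checkFn(float64(totals[idx].Value), float64(errs[idx].Value)); err != nil {
            return err
        }
    }
    return nil
}

func main() {
    // One successful pinning request, zero errors: the "allocation succeeds" case.
    totals := model.Samples{{Value: 1}}
    errs := model.Samples{{Value: 0}}
    err := crossCheck(totals, errs, func(totVal, errVal float64) error {
        if int64(totVal-errVal) < 1 {
            return fmt.Errorf("expected total requests equal to total errors + 1")
        }
        return nil
    })
    fmt.Println(err) // <nil>
}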