Merge pull request #32163 from mtaufen/more-eviction-logging

Automatic merge from submit-queue

Log pressure condition, memory usage, events in memory eviction test

I want to log this to help us debug some of the latest memory eviction test flakes, where we are seeing the burstable pod "fail" before the besteffort pod. In the logs I saw attempts by the eviction manager to evict the besteffort a while before the burstable's phase changed to "Failed", yet the besteffort's phase appeared to remain "Running". I want to see the pressure condition interleaved with the pod phases to get a sense of the eviction manager's knowledge versus the pod phases the test observes.
Kubernetes Submit Queue 2016-09-09 18:37:55 -07:00 committed by GitHub
commit 09efe0457d
2 changed files with 188 additions and 127 deletions

View File

@ -42,6 +42,7 @@ import (
"text/tabwriter" "text/tabwriter"
"time" "time"
"github.com/golang/glog"
"k8s.io/kubernetes/federation/client/clientset_generated/federation_release_1_4" "k8s.io/kubernetes/federation/client/clientset_generated/federation_release_1_4"
"k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api"
apierrs "k8s.io/kubernetes/pkg/api/errors" apierrs "k8s.io/kubernetes/pkg/api/errors"
@ -5170,3 +5171,14 @@ func CreateFileForGoBinData(gobindataPath, outputFilename string) error {
}
return nil
}
func ListNamespaceEvents(c *client.Client, ns string) error {
ls, err := c.Events(ns).List(api.ListOptions{})
if err != nil {
return err
}
for _, event := range ls.Items {
glog.Infof("Event(%#v): type: '%v' reason: '%v' %v", event.InvolvedObject, event.Type, event.Reason, event.Message)
}
return nil
}
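
The helper takes any namespace name; the test change below calls it once with the test's own namespace and once with an empty string, which lists events without a namespace scope. A minimal sketch of how a suite might wrap it (hypothetical package and helper name; only ListNamespaceEvents itself is part of this change):

package e2etest

import (
	"k8s.io/kubernetes/test/e2e/framework"
)

// dumpTestEvents is a hypothetical convenience wrapper around the new helper.
// It logs events from the test's own namespace and then events listed without
// a namespace scope (node events, for example), and only logs failures so that
// event dumping never fails a spec by itself.
func dumpTestEvents(f *framework.Framework) {
	if err := framework.ListNamespaceEvents(f.Client, f.Namespace.Name); err != nil {
		framework.Logf("failed to list events in namespace %q: %v", f.Namespace.Name, err)
	}
	if err := framework.ListNamespaceEvents(f.Client, ""); err != nil {
		framework.Logf("failed to list events without a namespace scope: %v", err)
	}
}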

View File

@ -36,6 +36,19 @@ import (
var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", func() {
f := framework.NewDefaultFramework("eviction-test")
// This is a dummy context to wrap the outer AfterEach, which will run after the inner AfterEach.
// We want to list all of the node and pod events, including any that occur while waiting for
// memory pressure reduction, even if we time out while waiting.
Context("", func() {
AfterEach(func() {
glog.Infof("Summary of node events during the memory eviction test:")
err := framework.ListNamespaceEvents(f.Client, f.Namespace.Name)
framework.ExpectNoError(err)
glog.Infof("Summary of pod events during the memory eviction test:")
err = framework.ListNamespaceEvents(f.Client, "")
framework.ExpectNoError(err)
})
Context("when there is memory pressure", func() { Context("when there is memory pressure", func() {
AfterEach(func() { AfterEach(func() {
// Wait for the memory pressure condition to disappear from the node status before continuing. // Wait for the memory pressure condition to disappear from the node status before continuing.
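
The dummy Context relies on a Ginkgo ordering rule: AfterEach blocks run innermost-first, so the event dump registered in the outer, unnamed Context runs after the inner AfterEach that waits for memory pressure to clear. A small self-contained sketch of that ordering (hypothetical spec, not part of this change):

package ordering_test

import (
	"fmt"
	"testing"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
)

func TestAfterEachOrdering(t *testing.T) {
	RegisterFailHandler(Fail)
	RunSpecs(t, "AfterEach ordering sketch")
}

var _ = Describe("AfterEach ordering", func() {
	Context("outer", func() {
		AfterEach(func() {
			// Runs second: the place to dump events, logs, etc.
			fmt.Println("outer AfterEach")
		})
		Context("inner", func() {
			AfterEach(func() {
				// Runs first: the place to wait for node state to settle.
				fmt.Println("inner AfterEach")
			})
			It("runs the spec body before either AfterEach", func() {
				Expect(true).To(BeTrue())
			})
		})
	})
})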
@ -155,6 +168,40 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", fu
glog.Infof("pod phase: guaranteed: %v, burstable: %v, besteffort: %v", gteedPh, burstPh, bestPh) glog.Infof("pod phase: guaranteed: %v, burstable: %v, besteffort: %v", gteedPh, burstPh, bestPh)
// NOTE/TODO(mtaufen): This should help us debug why burstable appears to fail before besteffort in some
// scenarios. We have seen some evidence that the eviction manager has in fact done the
// right thing and evicted the besteffort first, and attempted to change the besteffort phase
// to "Failed" when it evicts it, but that for some reason the test isn't seeing the updated
// phase. I'm trying to confirm or deny this.
// The eviction manager starts trying to evict things when the node comes under memory
// pressure, and the eviction manager reports this information in the pressure condition. If we
// see the eviction manager reporting a pressure condition for a while without the besteffort failing,
// and we see that the manager did in fact evict the besteffort (this should be in the Kubelet log), we
// will have more reason to believe the phase is out of date.
nodeList, err := f.Client.Nodes().List(api.ListOptions{})
if err != nil {
glog.Errorf("tried to get node list but got error: %v", err)
}
if len(nodeList.Items) != 1 {
glog.Errorf("expected 1 node, but see %d. List: %v", len(nodeList.Items), nodeList.Items)
}
node := nodeList.Items[0]
_, pressure := api.GetNodeCondition(&node.Status, api.NodeMemoryPressure)
glog.Infof("node pressure condition: %s", pressure)
// NOTE/TODO(mtaufen): Also log (at least temporarily) the actual memory consumption on the node.
// I used this to plot memory usage from a successful test run and it looks the
// way I would expect. I want to see what the plot from a flake looks like.
summary, err := getNodeSummary()
if err != nil {
return err
}
if summary.Node.Memory.WorkingSetBytes != nil {
wset := *summary.Node.Memory.WorkingSetBytes
glog.Infof("Node's working set is (bytes): %v", wset)
}
if bestPh == api.PodRunning {
Expect(burstPh).NotTo(Equal(api.PodFailed), "burstable pod failed before best effort pod")
Expect(gteedPh).NotTo(Equal(api.PodFailed), "guaranteed pod failed before best effort pod")
@ -172,6 +219,8 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", fu
})
})
})
})
func createMemhogPod(f *framework.Framework, genName string, ctnName string, res api.ResourceRequirements) *api.Pod {
@ -213,7 +262,7 @@ func createMemhogPod(f *framework.Framework, genName string, ctnName string, res
// 60 min timeout * 60s / tick per 10s = 360 ticks before timeout => ~11.11Mi/tick
// to fill ~4Gi of memory, so initial ballpark 12Mi/tick.
// We might see flakes due to timeout if the total memory on the nodes increases.
Args: []string{"-mem-alloc-size", "12Mi", "-mem-alloc-sleep", "10s", "-mem-total", memLimit}, Args: []string{"-mem-alloc-size", "120Mi", "-mem-alloc-sleep", "5s", "-mem-total", memLimit},
Resources: res,
},
},
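
For reference, the arithmetic behind the fill-rate comment above works out as follows (a standalone sketch; the ~4Gi figure is the comment's own ballpark for node memory):

package main

import "fmt"

func main() {
	const timeoutSeconds = 60 * 60        // 60 minute test timeout
	const tickSeconds = 10                // one allocation every 10 seconds
	ticks := timeoutSeconds / tickSeconds // 360 ticks before the timeout
	const totalMi = 4 * 1024.0            // ~4Gi to fill, expressed in Mi
	fmt.Printf("%d ticks, %.2f Mi per tick\n", ticks, totalMi/float64(ticks))
	// Prints: 360 ticks, 11.38 Mi per tick, i.e. the comment's ~11.11Mi/tick
	// (using 4000Mi for 4Gi) and hence the chosen ballpark of 12Mi per tick.
}

The updated Args above (120Mi every 5s) allocate roughly twenty times faster than the 12Mi-every-10s rate the comment was written for, so the same ~4Gi fills in minutes rather than most of the hour.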