Merge pull request #32163 from mtaufen/more-eviction-logging

Automatic merge from submit-queue

Log pressure condition, memory usage, events in memory eviction test

I want to log this to help us debug some of the latest memory eviction test flakes, where we are seeing the burstable pod "fail" before the besteffort pod. In the logs I saw the eviction manager attempt to evict the besteffort well before the burstable's phase changed to "Failed", yet the besteffort's phase appeared to remain "Running". I want to see the pressure condition interleaved with the pod phases to get a sense of the eviction manager's view of the node versus the pod phases the test observes.
Kubernetes Submit Queue 2016-09-09 18:37:55 -07:00 committed by GitHub
commit 09efe0457d
2 changed files with 188 additions and 127 deletions


@@ -42,6 +42,7 @@ import (
	"text/tabwriter"
	"time"

	"github.com/golang/glog"
	"k8s.io/kubernetes/federation/client/clientset_generated/federation_release_1_4"
	"k8s.io/kubernetes/pkg/api"
	apierrs "k8s.io/kubernetes/pkg/api/errors"
@@ -5170,3 +5171,14 @@ func CreateFileForGoBinData(gobindataPath, outputFilename string) error {
	}
	return nil
}

func ListNamespaceEvents(c *client.Client, ns string) error {
	ls, err := c.Events(ns).List(api.ListOptions{})
	if err != nil {
		return err
	}
	for _, event := range ls.Items {
		glog.Infof("Event(%#v): type: '%v' reason: '%v' %v", event.InvolvedObject, event.Type, event.Reason, event.Message)
	}
	return nil
}
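For orientation, here is a hedged sketch of how this helper is meant to be called. The wrapper function name is hypothetical, and the client import path is an assumption based on the 1.4-era unversioned client this file aliases as client; passing an empty namespace lists events cluster-wide, which is where node events land.

package example

import (
	"github.com/golang/glog"

	client "k8s.io/kubernetes/pkg/client/unversioned" // assumed 1.4-era client package
	"k8s.io/kubernetes/test/e2e/framework"
)

// dumpTestEvents is a hypothetical call site. A concrete namespace restricts
// the listing to that namespace's events (e.g. the test pods' events), while
// an empty namespace lists events across all namespaces.
func dumpTestEvents(c *client.Client, testNamespace string) {
	if err := framework.ListNamespaceEvents(c, testNamespace); err != nil {
		glog.Errorf("failed to list events in %q: %v", testNamespace, err)
	}
	if err := framework.ListNamespaceEvents(c, ""); err != nil {
		glog.Errorf("failed to list cluster-wide events: %v", err)
	}
}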


@@ -36,6 +36,19 @@ import (
var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", func() {
	f := framework.NewDefaultFramework("eviction-test")

	// This is a dummy context to wrap the outer AfterEach, which will run after the inner AfterEach.
	// We want to list all of the node and pod events, including any that occur while waiting for
	// memory pressure reduction, even if we time out while waiting.
	Context("", func() {
		AfterEach(func() {
			glog.Infof("Summary of node events during the memory eviction test:")
			err := framework.ListNamespaceEvents(f.Client, "")
			framework.ExpectNoError(err)
			glog.Infof("Summary of pod events during the memory eviction test:")
			err = framework.ListNamespaceEvents(f.Client, f.Namespace.Name)
			framework.ExpectNoError(err)
		})

		Context("when there is memory pressure", func() {
			AfterEach(func() {
				// Wait for the memory pressure condition to disappear from the node status before continuing.
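The empty-string Context above leans on Ginkgo's teardown order: BeforeEach blocks run outermost-first, but AfterEach blocks run innermost-first, so the wrapper's AfterEach fires only after the inner Context's AfterEach has finished waiting for pressure to clear. A minimal, self-contained sketch of that ordering (all names illustrative):

package ordering_test

import (
	"fmt"

	. "github.com/onsi/ginkgo"
)

var _ = Describe("AfterEach ordering", func() {
	Context("outer wrapper", func() {
		AfterEach(func() {
			fmt.Println("3: outer AfterEach runs last, after all inner teardown")
		})

		Context("inner", func() {
			AfterEach(func() {
				fmt.Println("2: inner AfterEach runs first")
			})

			It("runs the spec body before any AfterEach", func() {
				fmt.Println("1: spec body")
			})
		})
	})
})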
@@ -155,6 +168,40 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", func() {
					glog.Infof("pod phase: guaranteed: %v, burstable: %v, besteffort: %v", gteedPh, burstPh, bestPh)

					// NOTE/TODO(mtaufen): This should help us debug why burstable appears to fail before besteffort in some
					// scenarios. We have seen some evidence that the eviction manager has in fact done the
					// right thing and evicted the besteffort first, and attempted to change the besteffort phase
					// to "Failed" when it evicts it, but that for some reason the test isn't seeing the updated
					// phase. I'm trying to confirm or deny this.
					// The eviction manager starts trying to evict things when the node comes under memory
					// pressure, and it reports this information in the pressure condition. If we see the
					// eviction manager reporting a pressure condition for a while without the besteffort failing,
					// and we see that the manager did in fact evict the besteffort (this should be in the Kubelet
					// log), we will have more reason to believe the phase is out of date.
					nodeList, err := f.Client.Nodes().List(api.ListOptions{})
					if err != nil {
						glog.Errorf("tried to get node list but got error: %v", err)
					}
					if len(nodeList.Items) != 1 {
						glog.Errorf("expected 1 node, but saw %d. List: %v", len(nodeList.Items), nodeList.Items)
					}
					node := nodeList.Items[0]
					_, pressure := api.GetNodeCondition(&node.Status, api.NodeMemoryPressure)
					glog.Infof("node pressure condition: %s", pressure)

					// NOTE/TODO(mtaufen): Also log (at least temporarily) the actual memory consumption on the node.
					// I used this to plot memory usage from a successful test run and it looks the
					// way I would expect. I want to see what the plot from a flake looks like.
					summary, err := getNodeSummary()
					if err != nil {
						return err
					}
					if summary.Node.Memory.WorkingSetBytes != nil {
						wset := *summary.Node.Memory.WorkingSetBytes
						glog.Infof("Node's working set is (bytes): %v", wset)
					}

					if bestPh == api.PodRunning {
						Expect(burstPh).NotTo(Equal(api.PodFailed), "burstable pod failed before best effort pod")
						Expect(gteedPh).NotTo(Equal(api.PodFailed), "guaranteed pod failed before best effort pod")
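For readers unfamiliar with the condition helper used above: api.GetNodeCondition returns both the index of the condition within node.Status.Conditions and a pointer to it, or nil if the node has never reported that condition type, which is why the test discards the first return value. A hedged sketch of reading the result (the helper function is hypothetical; field names follow the 1.4-era API):

package example

import (
	"github.com/golang/glog"

	"k8s.io/kubernetes/pkg/api"
)

// logPressure shows how the (index, condition) pair returned by
// api.GetNodeCondition is meant to be read; the condition pointer is nil
// when the node has never reported a MemoryPressure condition.
func logPressure(node *api.Node) {
	_, pressure := api.GetNodeCondition(&node.Status, api.NodeMemoryPressure)
	switch {
	case pressure == nil:
		glog.Infof("node has no MemoryPressure condition yet")
	case pressure.Status == api.ConditionTrue:
		glog.Infof("node is under memory pressure (since %v): %s", pressure.LastTransitionTime, pressure.Message)
	default:
		glog.Infof("node is not under memory pressure: %s", pressure.Reason)
	}
}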
@@ -172,6 +219,8 @@ var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", func() {
			})
		})
	})
})

func createMemhogPod(f *framework.Framework, genName string, ctnName string, res api.ResourceRequirements) *api.Pod {
@@ -213,7 +262,7 @@ func createMemhogPod(f *framework.Framework, genName string, ctnName string, res api.ResourceRequirements) *api.Pod {
					// 60 min timeout * 60s / tick per 10s = 360 ticks before timeout => ~11.11Mi/tick
					// to fill ~4Gi of memory, so initial ballpark 12Mi/tick.
					// We might see flakes due to timeout if the total memory on the nodes increases.
-					Args:      []string{"-mem-alloc-size", "12Mi", "-mem-alloc-sleep", "10s", "-mem-total", memLimit},
+					Args:      []string{"-mem-alloc-size", "120Mi", "-mem-alloc-sleep", "5s", "-mem-total", memLimit},
					Resources: res,
				},
			},
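For scale: the new arguments allocate 120Mi every 5s, about 24Mi/s, so filling the ~4Gi described in the comment takes roughly (4096 / 120) * 5s ≈ 171 seconds, i.e. under 3 minutes. The old 12Mi-per-10s rate (1.2Mi/s) needed about 4096 / 1.2 ≈ 3413 seconds, or 57 minutes, uncomfortably close to the 60-minute timeout the comment mentions, which is presumably what motivated the change.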