From ab85b9a7d666c4b26b210074ca994e66a67ed184 Mon Sep 17 00:00:00 2001
From: Yu-Ju Hong
Date: Tue, 22 Sep 2015 15:08:53 -0700
Subject: [PATCH 1/2] e2e: add a function to reset the stats in resource monitor

The function can be called in a long-running test to clear existing stats
(free the memory) and start over.
---
 test/e2e/kubelet_stats.go | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/test/e2e/kubelet_stats.go b/test/e2e/kubelet_stats.go
index 26c2e971f9c..2e6c1a8b5e1 100644
--- a/test/e2e/kubelet_stats.go
+++ b/test/e2e/kubelet_stats.go
@@ -25,6 +25,7 @@ import (
 	"sort"
 	"strconv"
 	"strings"
+	"sync"
 	"text/tabwriter"
 	"time"
 
@@ -346,6 +347,7 @@ func computeContainerResourceUsage(name string, oldStats, newStats *cadvisor.Con
 // list of containers, computes and cache resource usage up to
 // maxEntriesPerContainer for each container.
 type resourceCollector struct {
+	lock       sync.RWMutex
 	node       string
 	containers []string
 	client     *client.Client
@@ -390,6 +392,8 @@ func (r *resourceCollector) collectStats(oldStats map[string]*cadvisor.Container
 		Logf("Error getting container info on %q, err: %v", r.node, err)
 		return
 	}
+	r.lock.Lock()
+	defer r.lock.Unlock()
 	for _, name := range r.containers {
 		info, ok := infos[name]
 		if !ok || len(info.Stats) < 1 {
@@ -405,6 +409,8 @@ func (r *resourceCollector) collectStats(oldStats map[string]*cadvisor.Container
 
 // LogLatest logs the latest resource usage of each container.
 func (r *resourceCollector) LogLatest() {
+	r.lock.RLock()
+	defer r.lock.RUnlock()
 	stats := make(map[string]*containerResourceUsage)
 	for _, name := range r.containers {
 		s := r.buffers[name][len(r.buffers)-1]
@@ -417,6 +423,15 @@ func (r *resourceCollector) LogLatest() {
 	Logf("\n%s", formatResourceUsageStats(r.node, stats))
 }
 
+// Reset frees the stats and starts over.
+func (r *resourceCollector) Reset() {
+	r.lock.Lock()
+	defer r.lock.Unlock()
+	for _, name := range r.containers {
+		r.buffers[name] = []*containerResourceUsage{}
+	}
+}
+
 type resourceUsageByCPU []*containerResourceUsage
 
 func (r resourceUsageByCPU) Len() int      { return len(r) }
@@ -428,6 +443,8 @@ var percentiles = [...]float64{0.05, 0.50, 0.90, 0.95}
 // GetBasicCPUStats returns the 5-th, 50-th, and 95-th, percentiles the cpu
 // usage in cores for containerName. This method examines all data currently in the buffer.
 func (r *resourceCollector) GetBasicCPUStats(containerName string) map[float64]float64 {
+	r.lock.RLock()
+	defer r.lock.RUnlock()
 	result := make(map[float64]float64, len(percentiles))
 	usages := r.buffers[containerName]
 	sort.Sort(resourceUsageByCPU(usages))
@@ -476,6 +493,12 @@ func (r *resourceMonitor) Stop() {
 	}
 }
 
+func (r *resourceMonitor) Reset() {
+	for _, collector := range r.collectors {
+		collector.Reset()
+	}
+}
+
 func (r *resourceMonitor) LogLatest() {
 	for _, collector := range r.collectors {
 		collector.LogLatest()

From 0b8244505c0487ea76a21b6c837b9820aedf2169 Mon Sep 17 00:00:00 2001
From: Yu-Ju Hong
Date: Tue, 22 Sep 2015 16:38:36 -0700
Subject: [PATCH 2/2] Add a slow e2e test to monitor kubelet resource usage

This test tracks kubelet resource usage over a long period of time (1 hour)
while running N pods per node (e.g., N=0, 50), and prints out the resource
usage at the end. This gives us an idea of how much management overhead the
kubelet incurs in a stable cluster.

Some follow-up items:
* Use a more realistic workload (e.g., including probing)
* Fail the test if the resource usage is too high.

Caveats:
* We assume the scheduler will do a decent job of distributing the pause
  pods, but we should double-check.
* Cluster addon pods could be unevenly distributed and skew the resource
  usage on nodes.
---
 hack/jenkins/e2e.sh       |   2 +
 test/e2e/kubelet.go       |  13 ----
 test/e2e/kubelet_perf.go  | 121 ++++++++++++++++++++++++++++++++++++++
 test/e2e/kubelet_stats.go |  13 ++--
 4 files changed, 131 insertions(+), 18 deletions(-)
 create mode 100644 test/e2e/kubelet_perf.go

diff --git a/hack/jenkins/e2e.sh b/hack/jenkins/e2e.sh
index 5e94e87af4f..876ece94541 100755
--- a/hack/jenkins/e2e.sh
+++ b/hack/jenkins/e2e.sh
@@ -134,6 +134,7 @@ GCE_FLAKY_TESTS=(
 GCE_SLOW_TESTS=(
     "SchedulerPredicates\svalidates\sMaxPods\slimit " # 8 min, file: scheduler_predicates.go, PR: #13315
     "Nodes\sResize" # 3 min 30 sec, file: resize_nodes.go, issue: #13323
+    "resource\susage\stracking" # 1 hour, file: kubelet_perf.go, slow by design
 )
 
 # Tests which are not able to be run in parallel.
@@ -147,6 +148,7 @@ GCE_PARALLEL_SKIP_TESTS=(
     "SchedulerPredicates"
     "Services.*restarting"
     "Shell.*services"
+    "resource\susage\stracking"
 )
 
 # Tests which are known to be flaky when run in parallel.
diff --git a/test/e2e/kubelet.go b/test/e2e/kubelet.go
index 3d18494e2ad..50c471e14f8 100644
--- a/test/e2e/kubelet.go
+++ b/test/e2e/kubelet.go
@@ -37,7 +37,6 @@ const (
 	pollInterval = 1 * time.Second
 	// Interval to poll /stats/container on a node
 	containerStatsPollingInterval = 5 * time.Second
-	resourceCollectionTime = 1 * time.Minute
 )
 
 // getPodMatches returns a set of pod names on the given node that matches the
@@ -160,16 +159,4 @@ var _ = Describe("kubelet", func() {
 			})
 		}
 	})
-
-	Describe("Monitor resource usage on node", func() {
-		It("Ask kubelet to report container resource usage", func() {
-			// TODO: After gathering some numbers, we should set a resource
-			// limit for each container and fail the test if the usage exceeds
-			// the preset limit.
-			By(fmt.Sprintf("Waiting %v to collect resource usage on node", resourceCollectionTime))
-			time.Sleep(resourceCollectionTime)
-			resourceMonitor.LogLatest()
-			resourceMonitor.LogCPUSummary()
-		})
-	})
 })
diff --git a/test/e2e/kubelet_perf.go b/test/e2e/kubelet_perf.go
new file mode 100644
index 00000000000..1febb72083b
--- /dev/null
+++ b/test/e2e/kubelet_perf.go
@@ -0,0 +1,121 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package e2e
+
+import (
+	"fmt"
+	"time"
+
+	client "k8s.io/kubernetes/pkg/client/unversioned"
+	"k8s.io/kubernetes/pkg/fields"
+	"k8s.io/kubernetes/pkg/labels"
+	"k8s.io/kubernetes/pkg/util"
+	"k8s.io/kubernetes/pkg/util/sets"
+
+	. "github.com/onsi/ginkgo"
+	. "github.com/onsi/gomega"
+)
+
+const (
+	// Interval to poll /stats/container on a node
+	containerStatsPollingPeriod = 10 * time.Second
+	// The monitoring time for one test.
+	monitoringTime = 30 * time.Minute
+	// The periodic reporting period.
+	reportingPeriod = 5 * time.Minute
+)
+
+func logPodsOnNodes(c *client.Client, nodeNames []string) {
+	for _, n := range nodeNames {
+		podList, err := GetKubeletPods(c, n)
+		if err != nil {
+			Logf("Unable to retrieve kubelet pods for node %v", n)
+			continue
+		}
+		Logf("%d pods are running on node %v", len(podList.Items), n)
+	}
+}
+
+var _ = Describe("Kubelet", func() {
+	var numNodes int
+	var nodeNames sets.String
+	framework := NewFramework("kubelet-perf")
+	var resourceMonitor *resourceMonitor
+
+	BeforeEach(func() {
+		nodes, err := framework.Client.Nodes().List(labels.Everything(), fields.Everything())
+		expectNoError(err)
+		numNodes = len(nodes.Items)
+		nodeNames = sets.NewString()
+		for _, node := range nodes.Items {
+			nodeNames.Insert(node.Name)
+		}
+		resourceMonitor = newResourceMonitor(framework.Client, targetContainers(), containerStatsPollingPeriod)
+		resourceMonitor.Start()
+	})
+
+	AfterEach(func() {
+		resourceMonitor.Stop()
+	})
+
+	Describe("resource usage tracking", func() {
+		density := []int{0, 50}
+		for _, podsPerNode := range density {
+			name := fmt.Sprintf(
+				"over %v with %d pods per node.", monitoringTime, podsPerNode)
+			It(name, func() {
+				totalPods := podsPerNode * numNodes
+				By(fmt.Sprintf("Creating an RC of %d pods and waiting until all pods of this RC are running", totalPods))
+				rcName := fmt.Sprintf("resource%d-%s", totalPods, string(util.NewUUID()))
+
+				// TODO: Use a more realistic workload
+				Expect(RunRC(RCConfig{
+					Client:    framework.Client,
+					Name:      rcName,
+					Namespace: framework.Namespace.Name,
+					Image:     "gcr.io/google_containers/pause:go",
+					Replicas:  totalPods,
+				})).NotTo(HaveOccurred())
+
+				// Log once and flush the stats.
+				resourceMonitor.LogLatest()
+				resourceMonitor.Reset()
+
+				By("Start monitoring resource usage")
+				// Periodically dump the CPU summary until the deadline is met.
+				// Note that without calling resourceMonitor.Reset(), the stats
+				// would occupy increasingly more memory. This should be fine
+				// for the current test duration, but we should reclaim the
+				// entries if we plan to monitor longer (e.g., 8 hours).
+				deadline := time.Now().Add(monitoringTime)
+				for time.Now().Before(deadline) {
+					time.Sleep(reportingPeriod)
+					Logf("Still running...")
+					logPodsOnNodes(framework.Client, nodeNames.List())
+				}
+
+				By("Reporting overall resource usage")
+				logPodsOnNodes(framework.Client, nodeNames.List())
+				resourceMonitor.LogCPUSummary()
+				resourceMonitor.LogLatest()
+
+				By("Deleting the RC")
+				DeleteRC(framework.Client, framework.Namespace.Name, rcName)
+			})
+		}
+	})
+})
diff --git a/test/e2e/kubelet_stats.go b/test/e2e/kubelet_stats.go
index 2e6c1a8b5e1..1d6998197c2 100644
--- a/test/e2e/kubelet_stats.go
+++ b/test/e2e/kubelet_stats.go
@@ -438,10 +438,11 @@ func (r resourceUsageByCPU) Len() int      { return len(r) }
 func (r resourceUsageByCPU) Swap(i, j int)      { r[i], r[j] = r[j], r[i] }
 func (r resourceUsageByCPU) Less(i, j int) bool { return r[i].CPUUsageInCores < r[j].CPUUsageInCores }
 
-var percentiles = [...]float64{0.05, 0.50, 0.90, 0.95}
+// The percentiles to report.
+var percentiles = [...]float64{0.05, 0.20, 0.50, 0.70, 0.90, 0.95, 0.99}
 
-// GetBasicCPUStats returns the 5-th, 50-th, and 95-th, percentiles the cpu
-// usage in cores for containerName. This method examines all data currently in the buffer.
+// GetBasicCPUStats returns the percentiles of the CPU usage in cores for
+// containerName. This method examines all data currently in the buffer.
 func (r *resourceCollector) GetBasicCPUStats(containerName string) map[float64]float64 {
 	r.lock.RLock()
 	defer r.lock.RUnlock()
@@ -451,7 +452,9 @@ func (r *resourceCollector) GetBasicCPUStats(containerName string) map[float64]f
 	for _, q := range percentiles {
 		index := int(float64(len(usages))*q) - 1
 		if index < 0 {
-			index = 0
+			// We don't have enough data.
+			result[q] = 0
+			continue
 		}
 		result[q] = usages[index].CPUUsageInCores
 	}
@@ -506,7 +509,7 @@ func (r *resourceMonitor) LogLatest() {
 }
 
 func (r *resourceMonitor) LogCPUSummary() {
-	// Example output for a node:
+	// Example output for a node (the percentiles may differ):
 	// CPU usage of containers on node "e2e-test-yjhong-minion-0vj7":
 	// container        5th%   50th%  90th%  95th%
 	// "/"              0.051  0.159  0.387  0.455
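For reference, the percentile lookup that GetBasicCPUStats performs after this change can be read in isolation: the samples are sorted ascending and each percentile q maps to index int(len(samples)*q) - 1, with 0 reported when there are too few samples for that percentile. The short standalone sketch below is illustrative only and not part of the patches; the sample data and the names basicCPUPercentiles and cpuSamples are made up.

package main

import (
	"fmt"
	"sort"
)

// basicCPUPercentiles mirrors the index math in GetBasicCPUStats: sort the
// samples ascending and read percentile q from index int(len(samples)*q) - 1.
// A negative index means there are not enough samples, so 0 is reported.
func basicCPUPercentiles(samples, percentiles []float64) map[float64]float64 {
	result := make(map[float64]float64, len(percentiles))
	sorted := append([]float64(nil), samples...)
	sort.Float64s(sorted)
	for _, q := range percentiles {
		index := int(float64(len(sorted))*q) - 1
		if index < 0 {
			// Not enough data for this percentile.
			result[q] = 0
			continue
		}
		result[q] = sorted[index]
	}
	return result
}

func main() {
	// Hypothetical CPU usage samples in cores.
	cpuSamples := []float64{0.051, 0.098, 0.120, 0.159, 0.387, 0.455}
	for q, v := range basicCPUPercentiles(cpuSamples, []float64{0.05, 0.50, 0.90, 0.95}) {
		fmt.Printf("%.0fth%%: %.3f cores\n", q*100, v)
	}
}

With six samples, the 5th percentile maps to index -1 and is reported as 0; before this change the index was clamped to 0, so the smallest sample was reported even when the buffer held too little data for that percentile to be meaningful.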