Mirror of https://github.com/k3s-io/kubernetes.git (synced 2025-08-03 17:30:00 +00:00)

Merge pull request #14453 from yujuhong/kubelet_benchmark

Add a slow e2e test to monitor kubelet resource usage

Commit: 10a7f2b1fa
@@ -136,6 +136,7 @@ GCE_FLAKY_TESTS=(
 GCE_SLOW_TESTS=(
     "SchedulerPredicates\svalidates\sMaxPods\slimit " # 8 min, file: scheduler_predicates.go, PR: #13315
     "Nodes\sResize" # 3 min 30 sec, file: resize_nodes.go, issue: #13323
+    "resource\susage\stracking" # 1 hour, file: kubelet_perf.go, slow by design
     )

 # Tests which are not able to be run in parallel.
@@ -149,6 +150,7 @@ GCE_PARALLEL_SKIP_TESTS=(
     "SchedulerPredicates"
     "Services.*restarting"
     "Shell.*services"
+    "resource\susage\stracking"
     )

 # Tests which are known to be flaky when run in parallel.
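As far as I can tell, these array entries are combined into the regular expression handed to Ginkgo's focus/skip flags, which is why whitespace inside a test name is written as \s. A small standalone Go sketch (not part of the e2e harness; the spec string is an approximation of what Ginkgo reports for the new test) showing that the added pattern matches:

package main

import (
    "fmt"
    "regexp"
)

func main() {
    // The entry added to GCE_SLOW_TESTS and GCE_PARALLEL_SKIP_TESTS above.
    pattern := `resource\susage\stracking`

    // Roughly the full spec name for one of the new cases in kubelet_perf.go:
    // Describe("Kubelet") > Describe("resource usage tracking") > It("over ... pods per node.")
    spec := "Kubelet resource usage tracking over 30m0s with 50 pods per node."

    matched, err := regexp.MatchString(pattern, spec)
    if err != nil {
        panic(err)
    }
    fmt.Println(matched) // prints: true
}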
@@ -37,7 +37,6 @@ const (
     pollInterval = 1 * time.Second
     // Interval to poll /stats/container on a node
     containerStatsPollingInterval = 5 * time.Second
-    resourceCollectionTime = 1 * time.Minute
 )

 // getPodMatches returns a set of pod names on the given node that matches the
@@ -160,16 +159,4 @@ var _ = Describe("kubelet", func() {
             })
         }
     })
-
-    Describe("Monitor resource usage on node", func() {
-        It("Ask kubelet to report container resource usage", func() {
-            // TODO: After gathering some numbers, we should set a resource
-            // limit for each container and fail the test if the usage exceeds
-            // the preset limit.
-            By(fmt.Sprintf("Waiting %v to collect resource usage on node", resourceCollectionTime))
-            time.Sleep(resourceCollectionTime)
-            resourceMonitor.LogLatest()
-            resourceMonitor.LogCPUSummary()
-        })
-    })
 })
test/e2e/kubelet_perf.go (new file, 121 lines)

@@ -0,0 +1,121 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package e2e
+
+import (
+    "fmt"
+    "time"
+
+    client "k8s.io/kubernetes/pkg/client/unversioned"
+    "k8s.io/kubernetes/pkg/fields"
+    "k8s.io/kubernetes/pkg/labels"
+    "k8s.io/kubernetes/pkg/util"
+    "k8s.io/kubernetes/pkg/util/sets"
+
+    . "github.com/onsi/ginkgo"
+    . "github.com/onsi/gomega"
+)
+
+const (
+    // Interval to poll /stats/container on a node
+    containerStatsPollingPeriod = 10 * time.Second
+    // The monitoring time for one test.
+    monitoringTime = 30 * time.Minute
+    // The periodic reporting period.
+    reportingPeriod = 5 * time.Minute
+)
+
+func logPodsOnNodes(c *client.Client, nodeNames []string) {
+    for _, n := range nodeNames {
+        podList, err := GetKubeletPods(c, n)
+        if err != nil {
+            Logf("Unable to retrieve kubelet pods for node %v", n)
+            continue
+        }
+        Logf("%d pods are running on node %v", len(podList.Items), n)
+    }
+}
+
+var _ = Describe("Kubelet", func() {
+    var numNodes int
+    var nodeNames sets.String
+    framework := NewFramework("kubelet-perf")
+    var resourceMonitor *resourceMonitor
+
+    BeforeEach(func() {
+        nodes, err := framework.Client.Nodes().List(labels.Everything(), fields.Everything())
+        expectNoError(err)
+        numNodes = len(nodes.Items)
+        nodeNames = sets.NewString()
+        for _, node := range nodes.Items {
+            nodeNames.Insert(node.Name)
+        }
+        resourceMonitor = newResourceMonitor(framework.Client, targetContainers(), containerStatsPollingPeriod)
+        resourceMonitor.Start()
+    })
+
+    AfterEach(func() {
+        resourceMonitor.Stop()
+    })
+
+    Describe("resource usage tracking", func() {
+        density := []int{0, 50}
+        for _, podsPerNode := range density {
+            name := fmt.Sprintf(
+                "over %v with %d pods per node.", monitoringTime, podsPerNode)
+            It(name, func() {
+                totalPods := podsPerNode * numNodes
+                By(fmt.Sprintf("Creating a RC of %d pods and wait until all pods of this RC are running", totalPods))
+                rcName := fmt.Sprintf("resource%d-%s", totalPods, string(util.NewUUID()))
+
+                // TODO: Use a more realistic workload
+                Expect(RunRC(RCConfig{
+                    Client:    framework.Client,
+                    Name:      rcName,
+                    Namespace: framework.Namespace.Name,
+                    Image:     "gcr.io/google_containers/pause:go",
+                    Replicas:  totalPods,
+                })).NotTo(HaveOccurred())
+
+                // Log once and flush the stats.
+                resourceMonitor.LogLatest()
+                resourceMonitor.Reset()
+
+                By("Start monitoring resource usage")
+                // Periodically dump the cpu summary until the deadline is met.
+                // Note that without calling resourceMonitor.Reset(), the stats
+                // would occupy increasingly more memory. This should be fine
+                // for the current test duration, but we should reclaim the
+                // entries if we plan to monitor longer (e.g., 8 hours).
+                deadline := time.Now().Add(monitoringTime)
+                for time.Now().Before(deadline) {
+                    time.Sleep(reportingPeriod)
+                    Logf("Still running...")
+                    logPodsOnNodes(framework.Client, nodeNames.List())
+                }
+
+                By("Reporting overall resource usage")
+                logPodsOnNodes(framework.Client, nodeNames.List())
+                resourceMonitor.LogCPUSummary()
+                resourceMonitor.LogLatest()
+
+                By("Deleting the RC")
+                DeleteRC(framework.Client, framework.Namespace.Name, rcName)
+            })
+        }
+    })
+})
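For reference, the It names built in the loop above format the monitoring duration with %v, so a time.Duration of 30 minutes renders as "30m0s". A standalone sketch of the two generated names (Ginkgo prepends the enclosing Describe strings when it reports the full spec):

package main

import (
    "fmt"
    "time"
)

func main() {
    monitoringTime := 30 * time.Minute
    for _, podsPerNode := range []int{0, 50} {
        // Same Sprintf as in the test above.
        name := fmt.Sprintf("over %v with %d pods per node.", monitoringTime, podsPerNode)
        fmt.Println(name)
    }
    // Output:
    // over 30m0s with 0 pods per node.
    // over 30m0s with 50 pods per node.
}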
@@ -25,6 +25,7 @@ import (
     "sort"
     "strconv"
     "strings"
+    "sync"
     "text/tabwriter"
     "time"

@@ -346,6 +347,7 @@ func computeContainerResourceUsage(name string, oldStats, newStats *cadvisor.Con
 // list of containers, computes and cache resource usage up to
 // maxEntriesPerContainer for each container.
 type resourceCollector struct {
+    lock       sync.RWMutex
     node       string
     containers []string
     client     *client.Client
@@ -390,6 +392,8 @@ func (r *resourceCollector) collectStats(oldStats map[string]*cadvisor.Container
         Logf("Error getting container info on %q, err: %v", r.node, err)
         return
     }
+    r.lock.Lock()
+    defer r.lock.Unlock()
     for _, name := range r.containers {
         info, ok := infos[name]
         if !ok || len(info.Stats) < 1 {
@@ -405,6 +409,8 @@ func (r *resourceCollector) collectStats(oldStats map[string]*cadvisor.Container

 // LogLatest logs the latest resource usage of each container.
 func (r *resourceCollector) LogLatest() {
+    r.lock.RLock()
+    defer r.lock.RUnlock()
     stats := make(map[string]*containerResourceUsage)
     for _, name := range r.containers {
         s := r.buffers[name][len(r.buffers)-1]
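The lock introduced in this diff guards the per-container sample buffers: the stats-polling goroutine appends to them while LogLatest and the summary methods read them concurrently. A minimal, self-contained sketch of that reader/writer pattern; the type and field names here are hypothetical stand-ins, not the actual e2e code:

package main

import (
    "fmt"
    "sync"
    "time"
)

// collector is a stand-in for resourceCollector: one goroutine appends
// samples while other callers read them, so all access goes through an RWMutex.
type collector struct {
    lock    sync.RWMutex
    buffers map[string][]float64 // per-container CPU samples, in cores
}

// record is the write path (the polling goroutine in the real code).
func (c *collector) record(container string, sample float64) {
    c.lock.Lock()
    defer c.lock.Unlock()
    c.buffers[container] = append(c.buffers[container], sample)
}

// latest is the read path (LogLatest / GetBasicCPUStats in the real code).
func (c *collector) latest(container string) float64 {
    c.lock.RLock()
    defer c.lock.RUnlock()
    b := c.buffers[container]
    if len(b) == 0 {
        return 0
    }
    return b[len(b)-1]
}

func main() {
    c := &collector{buffers: map[string][]float64{}}
    done := make(chan struct{})
    go func() { // stand-in for the stats-polling goroutine
        for i := 0; i < 10; i++ {
            c.record("/kubelet", float64(i)*0.01)
            time.Sleep(time.Millisecond)
        }
        close(done)
    }()
    <-done
    fmt.Printf("latest /kubelet sample: %.2f cores\n", c.latest("/kubelet"))
}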
@@ -417,24 +423,38 @@ func (r *resourceCollector) LogLatest() {
     Logf("\n%s", formatResourceUsageStats(r.node, stats))
 }

+// Reset frees the stats and start over.
+func (r *resourceCollector) Reset() {
+    r.lock.Lock()
+    defer r.lock.Unlock()
+    for _, name := range r.containers {
+        r.buffers[name] = []*containerResourceUsage{}
+    }
+}
+
 type resourceUsageByCPU []*containerResourceUsage

 func (r resourceUsageByCPU) Len() int           { return len(r) }
 func (r resourceUsageByCPU) Swap(i, j int)      { r[i], r[j] = r[j], r[i] }
 func (r resourceUsageByCPU) Less(i, j int) bool { return r[i].CPUUsageInCores < r[j].CPUUsageInCores }

-var percentiles = [...]float64{0.05, 0.50, 0.90, 0.95}
+// The percentiles to report.
+var percentiles = [...]float64{0.05, 0.20, 0.50, 0.70, 0.90, 0.95, 0.99}

-// GetBasicCPUStats returns the 5-th, 50-th, and 95-th, percentiles the cpu
-// usage in cores for containerName. This method examines all data currently in the buffer.
+// GetBasicCPUStats returns the percentiles the cpu usage in cores for
+// containerName. This method examines all data currently in the buffer.
 func (r *resourceCollector) GetBasicCPUStats(containerName string) map[float64]float64 {
+    r.lock.RLock()
+    defer r.lock.RUnlock()
     result := make(map[float64]float64, len(percentiles))
     usages := r.buffers[containerName]
     sort.Sort(resourceUsageByCPU(usages))
     for _, q := range percentiles {
         index := int(float64(len(usages))*q) - 1
         if index < 0 {
-            index = 0
+            // We don't have enough data.
+            result[q] = 0
+            continue
         }
         result[q] = usages[index].CPUUsageInCores
     }
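The index arithmetic in GetBasicCPUStats maps a quantile q onto the sorted samples: with roughly 180 samples (30 minutes at the new 10-second polling period), q=0.95 selects index int(180*0.95)-1 = 170, while a quantile too small for the sample count (len(usages)*q < 1) now reports 0 instead of falling back to index 0. A standalone sketch of the same computation on plain float64 samples:

package main

import (
    "fmt"
    "sort"
)

// basicCPUStats mirrors the index arithmetic in GetBasicCPUStats above,
// simplified to plain float64 samples instead of containerResourceUsage.
func basicCPUStats(usages []float64, quantiles []float64) map[float64]float64 {
    result := make(map[float64]float64, len(quantiles))
    sort.Float64s(usages)
    for _, q := range quantiles {
        index := int(float64(len(usages))*q) - 1
        if index < 0 {
            // Not enough samples for this quantile.
            result[q] = 0
            continue
        }
        result[q] = usages[index]
    }
    return result
}

func main() {
    // Roughly 30 minutes of samples at a 10-second polling period.
    usages := make([]float64, 180)
    for i := range usages {
        usages[i] = float64(i) / 1000 // fake, monotonically increasing usage
    }
    stats := basicCPUStats(usages, []float64{0.05, 0.50, 0.95})
    fmt.Printf("p05=%.3f p50=%.3f p95=%.3f\n", stats[0.05], stats[0.50], stats[0.95])
    // p05 -> usages[int(180*0.05)-1] = usages[8]   = 0.008
    // p50 -> usages[int(180*0.50)-1] = usages[89]  = 0.089
    // p95 -> usages[int(180*0.95)-1] = usages[170] = 0.170
}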
@@ -476,6 +496,12 @@ func (r *resourceMonitor) Stop() {
     }
 }

+func (r *resourceMonitor) Reset() {
+    for _, collector := range r.collectors {
+        collector.Reset()
+    }
+}
+
 func (r *resourceMonitor) LogLatest() {
     for _, collector := range r.collectors {
         collector.LogLatest()
@@ -483,7 +509,7 @@ func (r *resourceMonitor) LogLatest() {
 }

 func (r *resourceMonitor) LogCPUSummary() {
-    // Example output for a node:
+    // Example output for a node (the percentiles may differ):
     // CPU usage of containers on node "e2e-test-yjhong-minion-0vj7":
     // container 5th% 50th% 90th% 95th%
     // "/" 0.051 0.159 0.387 0.455
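The aligned columns in the example output above are presumably produced with text/tabwriter, which this file already imports (see the import hunk earlier). A standalone sketch, with made-up numbers, of rendering a percentile map as that kind of table; this is not the actual formatting code from the file being patched:

package main

import (
    "fmt"
    "os"
    "sort"
    "text/tabwriter"
)

func main() {
    // Made-up percentile summaries for two containers, keyed by quantile.
    summary := map[string]map[float64]float64{
        "/":        {0.05: 0.051, 0.50: 0.159, 0.90: 0.387, 0.95: 0.455},
        "/kubelet": {0.05: 0.004, 0.50: 0.010, 0.90: 0.021, 0.95: 0.024},
    }
    quantiles := []float64{0.05, 0.50, 0.90, 0.95}

    w := tabwriter.NewWriter(os.Stdout, 1, 0, 2, ' ', 0)
    fmt.Fprint(w, "container")
    for _, q := range quantiles {
        fmt.Fprintf(w, "\t%.0fth%%", q*100)
    }
    fmt.Fprintln(w)

    names := make([]string, 0, len(summary))
    for name := range summary {
        names = append(names, name)
    }
    sort.Strings(names)
    for _, name := range names {
        fmt.Fprintf(w, "%q", name)
        for _, q := range quantiles {
            fmt.Fprintf(w, "\t%.3f", summary[name][q])
        }
        fmt.Fprintln(w)
    }
    w.Flush()
}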