From a8c685921f908894be46e8934b37fd70c11d2e4e Mon Sep 17 00:00:00 2001 From: Yu-Ju Hong Date: Thu, 7 Apr 2016 15:20:50 -0700 Subject: [PATCH] e2e: adapt kubelet_perf.go to use the new summary metrics API This commit switches most functions in kubelet_stats.go to use the new API. However, the functions that perform one-time resource usage retrieval remain unchanged to be compatible with resource_usage_gatherer.go. They should be handled separately. Also, the new summary API does not provide the RSS memory yet, so all memory checking tests will *always* pass. We plan to add this metric in the API and restore the functionality of the test. --- test/e2e/kubelet_perf.go | 18 ++-- test/e2e/kubelet_stats.go | 215 +++++++++++++++++++++++--------------- 2 files changed, 142 insertions(+), 91 deletions(-) diff --git a/test/e2e/kubelet_perf.go b/test/e2e/kubelet_perf.go index dd7f0046b17..857afb46c8f 100644 --- a/test/e2e/kubelet_perf.go +++ b/test/e2e/kubelet_perf.go @@ -22,6 +22,7 @@ import ( "time" client "k8s.io/kubernetes/pkg/client/unversioned" + "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats" "k8s.io/kubernetes/pkg/util" "k8s.io/kubernetes/pkg/util/sets" @@ -31,7 +32,7 @@ import ( const ( // Interval to poll /stats/container on a node - containerStatsPollingPeriod = 3 * time.Second + containerStatsPollingPeriod = 10 * time.Second // The monitoring time for one test. monitoringTime = 20 * time.Minute // The periodic reporting period. @@ -210,27 +211,26 @@ var _ = KubeDescribe("Kubelet [Serial] [Slow]", func() { { podsPerNode: 0, cpuLimits: containersCPUSummary{ - "/kubelet": {0.50: 0.06, 0.95: 0.08}, - "/docker-daemon": {0.50: 0.05, 0.95: 0.06}, + stats.SystemContainerKubelet: {0.50: 0.06, 0.95: 0.08}, + stats.SystemContainerRuntime: {0.50: 0.05, 0.95: 0.06}, }, // We set the memory limits generously because the distribution // of the addon pods affect the memory usage on each node. 
memLimits: resourceUsagePerContainer{ - "/kubelet": &containerResourceUsage{MemoryRSSInBytes: 70 * 1024 * 1024}, - "/docker-daemon": &containerResourceUsage{MemoryRSSInBytes: 85 * 1024 * 1024}, + stats.SystemContainerKubelet: &containerResourceUsage{MemoryRSSInBytes: 70 * 1024 * 1024}, + stats.SystemContainerRuntime: &containerResourceUsage{MemoryRSSInBytes: 85 * 1024 * 1024}, }, }, { podsPerNode: 35, cpuLimits: containersCPUSummary{ - "/kubelet": {0.50: 0.12, 0.95: 0.14}, - "/docker-daemon": {0.50: 0.06, 0.95: 0.08}, + stats.SystemContainerKubelet: {0.50: 0.12, 0.95: 0.14}, + stats.SystemContainerRuntime: {0.50: 0.06, 0.95: 0.08}, }, // We set the memory limits generously because the distribution // of the addon pods affect the memory usage on each node. memLimits: resourceUsagePerContainer{ - "/kubelet": &containerResourceUsage{MemoryRSSInBytes: 75 * 1024 * 1024}, - "/docker-daemon": &containerResourceUsage{MemoryRSSInBytes: 100 * 1024 * 1024}, + stats.SystemContainerRuntime: &containerResourceUsage{MemoryRSSInBytes: 100 * 1024 * 1024}, }, }, { diff --git a/test/e2e/kubelet_stats.go b/test/e2e/kubelet_stats.go index 255c284597b..da92094575a 100644 --- a/test/e2e/kubelet_stats.go +++ b/test/e2e/kubelet_stats.go @@ -33,8 +33,9 @@ import ( "github.com/prometheus/common/model" "k8s.io/kubernetes/pkg/api" client "k8s.io/kubernetes/pkg/client/unversioned" + "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats" "k8s.io/kubernetes/pkg/kubelet/metrics" - "k8s.io/kubernetes/pkg/kubelet/server/stats" + kubeletstats "k8s.io/kubernetes/pkg/kubelet/server/stats" "k8s.io/kubernetes/pkg/master/ports" utilerrors "k8s.io/kubernetes/pkg/util/errors" "k8s.io/kubernetes/pkg/util/sets" @@ -148,7 +149,9 @@ func HighLatencyKubeletOperations(c *client.Client, threshold time.Duration, nod // getContainerInfo contacts kubelet for the container information. The "Stats" // in the returned ContainerInfo is subject to the requirements in statsRequest. 
-func getContainerInfo(c *client.Client, nodeName string, req *stats.StatsRequest) (map[string]cadvisorapi.ContainerInfo, error) { +// TODO: This function uses the deprecated kubelet stats API; it should be +// removed. +func getContainerInfo(c *client.Client, nodeName string, req *kubeletstats.StatsRequest) (map[string]cadvisorapi.ContainerInfo, error) { reqBody, err := json.Marshal(req) if err != nil { return nil, err @@ -191,47 +194,6 @@ func getContainerInfo(c *client.Client, nodeName string, req *stats.StatsRequest return containers, nil } -const ( - // cadvisor records stats about every second. - cadvisorStatsPollingIntervalInSeconds float64 = 1.0 - // cadvisor caches up to 2 minutes of stats (configured by kubelet). - maxNumStatsToRequest int = 120 -) - -// A list of containers for which we want to collect resource usage. -func targetContainers() []string { - if providerIs("gce", "gke") { - return []string{ - "/", - "/docker-daemon", - "/kubelet", - "/system", - } - } else { - return []string{ - "/", - } - } -} - -type containerResourceUsage struct { - Name string - Timestamp time.Time - CPUUsageInCores float64 - MemoryUsageInBytes uint64 - MemoryWorkingSetInBytes uint64 - MemoryRSSInBytes uint64 - // The interval used to calculate CPUUsageInCores. - CPUInterval time.Duration -} - -func (r *containerResourceUsage) isStrictlyGreaterThan(rhs *containerResourceUsage) bool { - return r.CPUUsageInCores > rhs.CPUUsageInCores && r.MemoryWorkingSetInBytes > rhs.MemoryWorkingSetInBytes -} - -type resourceUsagePerContainer map[string]*containerResourceUsage -type resourceUsagePerNode map[string]resourceUsagePerContainer - // getOneTimeResourceUsageOnNode queries the node's /stats/container endpoint // and returns the resource usage of all containerNames for the past // cpuInterval. @@ -252,6 +214,8 @@ type resourceUsagePerNode map[string]resourceUsagePerContainer // should fail if one of containers listed by containerNames is missing on any node // (useful e.g. 
when looking for system containers or daemons). If set to true function // is more forgiving and ignores missing containers. +// TODO: This function relies on the deprecated kubelet stats API and should be +// removed and/or rewritten. func getOneTimeResourceUsageOnNode( c *client.Client, nodeName string, @@ -259,12 +223,19 @@ func getOneTimeResourceUsageOnNode( containerNames func() []string, expectMissingContainers bool, ) (resourceUsagePerContainer, error) { + const ( + // cadvisor records stats about every second. + cadvisorStatsPollingIntervalInSeconds float64 = 1.0 + // cadvisor caches up to 2 minutes of stats (configured by kubelet). + maxNumStatsToRequest int = 120 + ) + numStats := int(float64(cpuInterval.Seconds()) / cadvisorStatsPollingIntervalInSeconds) if numStats < 2 || numStats > maxNumStatsToRequest { return nil, fmt.Errorf("numStats needs to be > 1 and < %d", maxNumStatsToRequest) } // Get information of all containers on the node. - containerInfos, err := getContainerInfo(c, nodeName, &stats.StatsRequest{ + containerInfos, err := getContainerInfo(c, nodeName, &kubeletstats.StatsRequest{ ContainerName: "/", NumStats: numStats, Subcontainers: true, @@ -272,6 +243,18 @@ func getOneTimeResourceUsageOnNode( if err != nil { return nil, err } + + f := func(name string, oldStats, newStats *cadvisorapi.ContainerStats) *containerResourceUsage { + return &containerResourceUsage{ + Name: name, + Timestamp: newStats.Timestamp, + CPUUsageInCores: float64(newStats.Cpu.Usage.Total-oldStats.Cpu.Usage.Total) / float64(newStats.Timestamp.Sub(oldStats.Timestamp).Nanoseconds()), + MemoryUsageInBytes: newStats.Memory.Usage, + MemoryWorkingSetInBytes: newStats.Memory.WorkingSet, + MemoryRSSInBytes: newStats.Memory.RSS, + CPUInterval: newStats.Timestamp.Sub(oldStats.Timestamp), + } + } // Process container infos that are relevant to us. 
containers := containerNames() usageMap := make(resourceUsagePerContainer, len(containers)) @@ -285,26 +268,96 @@ func getOneTimeResourceUsageOnNode( } first := info.Stats[0] last := info.Stats[len(info.Stats)-1] - usageMap[name] = computeContainerResourceUsage(name, first, last) + usageMap[name] = f(name, first, last) } return usageMap, nil } -// logOneTimeResourceUsageSummary collects container resource for the list of -// nodes, formats and logs the stats. -func logOneTimeResourceUsageSummary(c *client.Client, nodeNames []string, cpuInterval time.Duration) { - var summary []string - for _, nodeName := range nodeNames { - stats, err := getOneTimeResourceUsageOnNode(c, nodeName, cpuInterval, targetContainers, false) - if err != nil { - summary = append(summary, fmt.Sprintf("Error getting resource usage from node %q, err: %v", nodeName, err)) - } else { - summary = append(summary, formatResourceUsageStats(nodeName, stats)) - } +func getNodeStatsSummary(c *client.Client, nodeName string) (*stats.Summary, error) { + subResourceProxyAvailable, err := serverVersionGTE(subResourceServiceAndNodeProxyVersion, c) + if err != nil { + return nil, err } - Logf("\n%s", strings.Join(summary, "\n")) + + var data []byte + if subResourceProxyAvailable { + data, err = c.Get(). + Resource("nodes"). + SubResource("proxy"). + Name(fmt.Sprintf("%v:%v", nodeName, ports.KubeletPort)). + Suffix("stats/summary"). + SetHeader("Content-Type", "application/json"). + Do().Raw() + + } else { + data, err = c.Get(). + Prefix("proxy"). + Resource("nodes"). + Name(fmt.Sprintf("%v:%v", nodeName, ports.KubeletPort)). + Suffix("stats/summary"). + SetHeader("Content-Type", "application/json"). 
+ Do().Raw() + } + if err != nil { + return nil, err + } + + var summary *stats.Summary + err = json.Unmarshal(data, &summary) + if err != nil { + return nil, err + } + return summary, nil } +func getSystemContainerStats(summary *stats.Summary) map[string]*stats.ContainerStats { + statsList := summary.Node.SystemContainers + statsMap := make(map[string]*stats.ContainerStats) + for i := range statsList { + statsMap[statsList[i].Name] = &statsList[i] + } + + // Create a root container stats using information available in + // stats.NodeStats. This is necessary since it is a different type. + statsMap[rootContainerName] = &stats.ContainerStats{ + CPU: summary.Node.CPU, + Memory: summary.Node.Memory, + } + return statsMap +} + +const ( + rootContainerName = "/" +) + +// A list of containers for which we want to collect resource usage. +func targetContainers() []string { + return []string{ + rootContainerName, + stats.SystemContainerRuntime, + stats.SystemContainerKubelet, + stats.SystemContainerMisc, + } +} + +type containerResourceUsage struct { + Name string + Timestamp time.Time + CPUUsageInCores float64 + MemoryUsageInBytes uint64 + MemoryWorkingSetInBytes uint64 + MemoryRSSInBytes uint64 + // The interval used to calculate CPUUsageInCores. 
+ CPUInterval time.Duration +} + +func (r *containerResourceUsage) isStrictlyGreaterThan(rhs *containerResourceUsage) bool { + return r.CPUUsageInCores > rhs.CPUUsageInCores && r.MemoryWorkingSetInBytes > rhs.MemoryWorkingSetInBytes +} + +type resourceUsagePerContainer map[string]*containerResourceUsage +type resourceUsagePerNode map[string]resourceUsagePerContainer + func formatResourceUsageStats(nodeName string, containerStats resourceUsagePerContainer) string { // Example output: // @@ -395,15 +448,15 @@ func PrintAllKubeletPods(c *client.Client, nodeName string) { } } -func computeContainerResourceUsage(name string, oldStats, newStats *cadvisorapi.ContainerStats) *containerResourceUsage { +func computeContainerResourceUsage(name string, oldStats, newStats *stats.ContainerStats) *containerResourceUsage { return &containerResourceUsage{ Name: name, - Timestamp: newStats.Timestamp, - CPUUsageInCores: float64(newStats.Cpu.Usage.Total-oldStats.Cpu.Usage.Total) / float64(newStats.Timestamp.Sub(oldStats.Timestamp).Nanoseconds()), - MemoryUsageInBytes: newStats.Memory.Usage, - MemoryWorkingSetInBytes: newStats.Memory.WorkingSet, - MemoryRSSInBytes: newStats.Memory.RSS, - CPUInterval: newStats.Timestamp.Sub(oldStats.Timestamp), + Timestamp: newStats.CPU.Time.Time, + CPUUsageInCores: float64(*newStats.CPU.UsageCoreNanoSeconds-*oldStats.CPU.UsageCoreNanoSeconds) / float64(newStats.CPU.Time.Time.Sub(oldStats.CPU.Time.Time).Nanoseconds()), + MemoryUsageInBytes: *newStats.Memory.UsageBytes, + MemoryWorkingSetInBytes: *newStats.Memory.WorkingSetBytes, + MemoryRSSInBytes: *newStats.Memory.RSSBytes, + CPUInterval: newStats.CPU.Time.Time.Sub(oldStats.CPU.Time.Time), } } @@ -435,7 +488,7 @@ func newResourceCollector(c *client.Client, nodeName string, containerNames []st func (r *resourceCollector) Start() { r.stopCh = make(chan struct{}, 1) // Keep the last observed stats for comparison. 
- oldStats := make(map[string]*cadvisorapi.ContainerStats) + oldStats := make(map[string]*stats.ContainerStats) go wait.Until(func() { r.collectStats(oldStats) }, r.pollingInterval, r.stopCh) } @@ -444,35 +497,33 @@ func (r *resourceCollector) Stop() { close(r.stopCh) } -// collectStats gets the latest stats from kubelet's /stats/container, computes +// collectStats gets the latest stats from kubelet stats summary API, computes // the resource usage, and pushes it to the buffer. -func (r *resourceCollector) collectStats(oldStats map[string]*cadvisorapi.ContainerStats) { - infos, err := getContainerInfo(r.client, r.node, &stats.StatsRequest{ - ContainerName: "/", - NumStats: 1, - Subcontainers: true, - }) +func (r *resourceCollector) collectStats(oldStatsMap map[string]*stats.ContainerStats) { + summary, err := getNodeStatsSummary(r.client, r.node) if err != nil { - Logf("Error getting container info on %q, err: %v", r.node, err) + Logf("Error getting node stats summary on %q, err: %v", r.node, err) return } + cStatsMap := getSystemContainerStats(summary) r.lock.Lock() defer r.lock.Unlock() for _, name := range r.containers { - info, ok := infos[name] - if !ok || len(info.Stats) < 1 { + cStats, ok := cStatsMap[name] + if !ok { Logf("Missing info/stats for container %q on node %q", name, r.node) return } - if oldInfo, ok := oldStats[name]; ok { - newInfo := info.Stats[0] - if oldInfo.Timestamp.Equal(newInfo.Timestamp) { + + if oldStats, ok := oldStatsMap[name]; ok { + if oldStats.CPU.Time.Equal(cStats.CPU.Time) { // No change -> skip this stat. continue } - r.buffers[name] = append(r.buffers[name], computeContainerResourceUsage(name, oldInfo, newInfo)) + r.buffers[name] = append(r.buffers[name], computeContainerResourceUsage(name, oldStats, cStats)) } - oldStats[name] = info.Stats[0] + // Update the old stats. 
+ oldStatsMap[name] = cStats } } @@ -613,9 +664,9 @@ func (r *resourceMonitor) FormatCPUSummary(summary nodesCPUSummary) string { // CPU usage of containers on node "e2e-test-foo-minion-0vj7": // container 5th% 50th% 90th% 95th% // "/" 0.051 0.159 0.387 0.455 - // "/docker-daemon" 0.000 0.000 0.146 0.166 + // "/runtime" 0.000 0.000 0.146 0.166 // "/kubelet" 0.036 0.053 0.091 0.154 - // "/system" 0.001 0.001 0.001 0.002 + // "/misc" 0.001 0.001 0.001 0.002 var summaryStrings []string var header []string header = append(header, "container")