Merge pull request #60106 from dashpole/cadvisor_godep

Automatic merge from submit-queue (batch tested with PRs 60106, 59510, 60263, 60063, 59088). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>.

Update cadvisor godeps to v0.29.0 and ignore per-cpu metrics

**What this PR does / why we need it**:
Updates the cAdvisor dependency to the cAdvisor release associated with the kubernetes 1.10 release.

**Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*:
Fixes #60052

**Special notes for your reviewer**:
This PR also adds per-cpu metrics to the ignoreMetrics list.  This is a new metric that can be ignored in the most recent cAdvisor release.
The reason for not collecting per-cpu metrics is that it can cause severe scalability issues.
For example, if using a 128 core machine, and running 100 containers, we have 12800 different streams of metrics just for per-cpu metrics which cAdvisor needs to process and transmit.
Additionally, per-cpu metrics are not used by any kubernetes components, and if a user needs these metrics, they can run cAdvisor as a daemonset. 

**Release note**:
```release-note
Disable per-cpu metrics by default for scalability.
Fix inaccurate disk usage monitoring of overlayFs.
Retry docker connection on startup timeout to avoid permanent loss of metrics.
```

/assign @dchen1107
This commit is contained in:
Kubernetes Submit Queue
2018-02-23 02:59:38 -08:00
committed by GitHub
5 changed files with 110 additions and 94 deletions

View File

@@ -42,6 +42,7 @@ type MetricKind string
const (
CpuUsageMetrics MetricKind = "cpu"
PerCpuUsageMetrics MetricKind = "percpu"
MemoryUsageMetrics MetricKind = "memory"
CpuLoadMetrics MetricKind = "cpuLoad"
DiskIOMetrics MetricKind = "diskIO"

View File

@@ -113,7 +113,8 @@ func GetStats(cgroupManager cgroups.Manager, rootFs string, pid int, ignoreMetri
libcontainerStats := &libcontainer.Stats{
CgroupStats: cgroupStats,
}
stats := newContainerStats(libcontainerStats)
withPerCPU := !ignoreMetrics.Has(container.PerCpuUsageMetrics)
stats := newContainerStats(libcontainerStats, withPerCPU)
// If we know the pid then get network stats from /proc/<pid>/net/dev
if pid == 0 {
@@ -467,14 +468,17 @@ func minUint32(x, y uint32) uint32 {
var numCpusFunc = getNumberOnlineCPUs
// Convert libcontainer stats to info.ContainerStats.
func setCpuStats(s *cgroups.Stats, ret *info.ContainerStats) {
func setCpuStats(s *cgroups.Stats, ret *info.ContainerStats, withPerCPU bool) {
ret.Cpu.Usage.User = s.CpuStats.CpuUsage.UsageInUsermode
ret.Cpu.Usage.System = s.CpuStats.CpuUsage.UsageInKernelmode
ret.Cpu.Usage.Total = 0
ret.Cpu.Usage.Total = s.CpuStats.CpuUsage.TotalUsage
ret.Cpu.CFS.Periods = s.CpuStats.ThrottlingData.Periods
ret.Cpu.CFS.ThrottledPeriods = s.CpuStats.ThrottlingData.ThrottledPeriods
ret.Cpu.CFS.ThrottledTime = s.CpuStats.ThrottlingData.ThrottledTime
if !withPerCPU {
return
}
if len(s.CpuStats.CpuUsage.PercpuUsage) == 0 {
// libcontainer's 'GetStats' can leave 'PercpuUsage' nil if it skipped the
// cpuacct subsystem.
@@ -501,7 +505,6 @@ func setCpuStats(s *cgroups.Stats, ret *info.ContainerStats) {
for i := uint32(0); i < numActual; i++ {
ret.Cpu.Usage.PerCpu[i] = s.CpuStats.CpuUsage.PercpuUsage[i]
ret.Cpu.Usage.Total += s.CpuStats.CpuUsage.PercpuUsage[i]
}
}
@@ -587,13 +590,13 @@ func setNetworkStats(libcontainerStats *libcontainer.Stats, ret *info.ContainerS
}
}
func newContainerStats(libcontainerStats *libcontainer.Stats) *info.ContainerStats {
func newContainerStats(libcontainerStats *libcontainer.Stats, withPerCPU bool) *info.ContainerStats {
ret := &info.ContainerStats{
Timestamp: time.Now(),
}
if s := libcontainerStats.CgroupStats; s != nil {
setCpuStats(s, ret)
setCpuStats(s, ret, withPerCPU)
setDiskIoStats(s, ret)
setMemoryStats(s, ret)
}

View File

@@ -150,10 +150,18 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc) *PrometheusCo
},
}, {
name: "container_cpu_usage_seconds_total",
help: "Cumulative cpu time consumed per cpu in seconds.",
help: "Cumulative cpu time consumed in seconds.",
valueType: prometheus.CounterValue,
extraLabels: []string{"cpu"},
getValues: func(s *info.ContainerStats) metricValues {
if len(s.Cpu.Usage.PerCpu) == 0 {
if s.Cpu.Usage.Total > 0 {
return metricValues{{
value: float64(s.Cpu.Usage.Total) / float64(time.Second),
labels: []string{"total"},
}}
}
}
values := make(metricValues, 0, len(s.Cpu.Usage.PerCpu))
for i, value := range s.Cpu.Usage.PerCpu {
if value > 0 {