diff --git a/docs/design/data/metrics.yaml b/docs/design/data/metrics.yaml index 549883848..eeb8c5ced 100644 --- a/docs/design/data/metrics.yaml +++ b/docs/design/data/metrics.yaml @@ -1698,6 +1698,28 @@ components: fixed: false values: [] since: 2.0.0 + - name: kata_shim_pod_overhead_cpu + type: GAUGE + unit: "percent" + help: Kata Pod overhead for CPU resources(percent). + labels: + - name: sandbox_id + desc: "" + manually_edit: false + fixed: false + values: [] + since: 2.0.0 + - name: kata_shim_pod_overhead_memory_in_bytes + type: GAUGE + unit: bytes + help: Kata Pod overhead for memory resources(bytes). + labels: + - name: sandbox_id + desc: "" + manually_edit: false + fixed: false + values: [] + since: 2.0.0 - name: kata_shim_proc_stat type: GAUGE unit: "" diff --git a/docs/design/kata-2-0-metrics.md b/docs/design/kata-2-0-metrics.md index 72321d81b..a3321dbab 100644 --- a/docs/design/kata-2-0-metrics.md +++ b/docs/design/kata-2-0-metrics.md @@ -309,6 +309,8 @@ Metrics about Kata containerd shim v2 process. | `kata_shim_go_threads`:
Number of OS threads created. | `GAUGE` | | | 2.0.0 | | `kata_shim_io_stat`:
Kata containerd shim v2 process IO statistics. | `GAUGE` | | | 2.0.0 | | `kata_shim_netdev`:
Kata containerd shim v2 network devices statistics. | `GAUGE` | | | 2.0.0 | +| `kata_shim_pod_overhead_cpu`:
Kata Pod overhead for CPU resources(percent). | `GAUGE` | `percent` | | 2.0.0 | +| `kata_shim_pod_overhead_memory_in_bytes`:
Kata Pod overhead for memory resources(bytes). | `GAUGE` | `bytes` | | 2.0.0 | | `kata_shim_proc_stat`:
Kata containerd shim v2 process statistics. | `GAUGE` | | | 2.0.0 | | `kata_shim_proc_status`:
Kata containerd shim v2 process status. | `GAUGE` | | | 2.0.0 | | `kata_shim_process_cpu_seconds_total`:
Total user and system CPU time spent in seconds. | `COUNTER` | `seconds` | | 2.0.0 | diff --git a/src/runtime/containerd-shim-v2/shim_management.go b/src/runtime/containerd-shim-v2/shim_management.go index 2018347c0..69ba88e3f 100644 --- a/src/runtime/containerd-shim-v2/shim_management.go +++ b/src/runtime/containerd-shim-v2/shim_management.go @@ -60,7 +60,7 @@ func (s *service) serveMetrics(w http.ResponseWriter, r *http.Request) { agentMetrics, err := s.sandbox.GetAgentMetrics() if err != nil { logrus.WithError(err).Error("failed GetAgentMetrics") - if isGRPCErrorCode(codes.Unimplemented, err) { + if isGRPCErrorCode(codes.NotFound, err) { logrus.Warn("metrics API not supportted by this agent.") ifSupportAgentMetricsAPI = false return @@ -74,6 +74,11 @@ func (s *service) serveMetrics(w http.ResponseWriter, r *http.Request) { for _, mf := range list { encoder.Encode(mf) } + + // collect pod overhead metrics need sleep to get the changes of cpu/memory resources usage + // so here only trigger the collect operation, and the data will be gathered + // next time collection request from Prometheus server + go s.setPodOverheadMetrics() } func decodeAgentMetrics(body string) []*dto.MetricFamily { diff --git a/src/runtime/containerd-shim-v2/shim_metrics.go b/src/runtime/containerd-shim-v2/shim_metrics.go index e26da7beb..455af2a2b 100644 --- a/src/runtime/containerd-shim-v2/shim_metrics.go +++ b/src/runtime/containerd-shim-v2/shim_metrics.go @@ -6,7 +6,10 @@ package containerdshim import ( + "time" + mutils "github.com/kata-containers/kata-containers/src/runtime/pkg/utils" + vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/procfs" ) @@ -66,6 +69,18 @@ var ( Name: "fds", Help: "Kata containerd shim v2 open FDs.", }) + + katashimPodOverheadCPU = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: namespaceKatashim, + Name: "pod_overhead_cpu", + Help: "Kata Pod overhead for 
CPU resources(percent).", + }) + + katashimPodOverheadMemory = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: namespaceKatashim, + Name: "pod_overhead_memory_in_bytes", + Help: "Kata Pod overhead for memory resources(bytes).", + }) ) func registerMetrics() { @@ -76,6 +91,8 @@ func registerMetrics() { prometheus.MustRegister(katashimNetdev) prometheus.MustRegister(katashimIOStat) prometheus.MustRegister(katashimOpenFDs) + prometheus.MustRegister(katashimPodOverheadCPU) + prometheus.MustRegister(katashimPodOverheadMemory) } // updateShimMetrics will update metrics for kata shim process itself @@ -116,3 +133,78 @@ func updateShimMetrics() error { return nil } + +// statsSandbox returns a detailed sandbox stats. +func (s *service) statsSandbox() (vc.SandboxStats, []vc.ContainerStats, error) { + sandboxStats, err := s.sandbox.Stats() + if err != nil { + return vc.SandboxStats{}, []vc.ContainerStats{}, err + } + + containerStats := []vc.ContainerStats{} + for _, c := range s.sandbox.GetAllContainers() { + cstats, err := s.sandbox.StatsContainer(c.ID()) + if err != nil { + return vc.SandboxStats{}, []vc.ContainerStats{}, err + } + containerStats = append(containerStats, cstats) + } + + return sandboxStats, containerStats, nil +} + +func calcOverhead(initialSandboxStats, finishSandboxStats vc.SandboxStats, initialContainerStats, finishContainersStats []vc.ContainerStats, deltaTime float64) (float64, float64) { + hostInitCPU := initialSandboxStats.CgroupStats.CPUStats.CPUUsage.TotalUsage + guestInitCPU := uint64(0) + for _, cs := range initialContainerStats { + guestInitCPU += cs.CgroupStats.CPUStats.CPUUsage.TotalUsage + } + + hostFinalCPU := finishSandboxStats.CgroupStats.CPUStats.CPUUsage.TotalUsage + guestFinalCPU := uint64(0) + for _, cs := range finishContainersStats { + guestFinalCPU += cs.CgroupStats.CPUStats.CPUUsage.TotalUsage + } + + var guestMemoryUsage uint64 + for _, cs := range finishContainersStats { + guestMemoryUsage += 
cs.CgroupStats.MemoryStats.Usage.Usage + } + + hostMemoryUsage := finishSandboxStats.CgroupStats.MemoryStats.Usage.Usage + + cpuUsageGuest := float64(guestFinalCPU-guestInitCPU) / deltaTime * 100 + cpuUsageHost := float64(hostFinalCPU-hostInitCPU) / deltaTime * 100 + + return float64(hostMemoryUsage - guestMemoryUsage), float64(cpuUsageHost - cpuUsageGuest) +} + +func (s *service) getPodOverhead() (float64, float64, error) { + initTime := time.Now().UnixNano() + initialSandboxStats, initialContainerStats, err := s.statsSandbox() + if err != nil { + return 0, 0, err + } + + // Wait for 1 second to calculate CPU usage + time.Sleep(time.Second * 1) + finishtTime := time.Now().UnixNano() + deltaTime := float64(finishtTime - initTime) + + finishSandboxStats, finishContainersStats, err := s.statsSandbox() + if err != nil { + return 0, 0, err + } + mem, cpu := calcOverhead(initialSandboxStats, finishSandboxStats, initialContainerStats, finishContainersStats, deltaTime) + return mem, cpu, nil +} + +func (s *service) setPodOverheadMetrics() error { + mem, cpu, err := s.getPodOverhead() + if err != nil { + return err + } + katashimPodOverheadMemory.Set(mem) + katashimPodOverheadCPU.Set(cpu) + return nil +} diff --git a/src/runtime/containerd-shim-v2/shim_metrics_test.go b/src/runtime/containerd-shim-v2/shim_metrics_test.go new file mode 100644 index 000000000..ba1afe552 --- /dev/null +++ b/src/runtime/containerd-shim-v2/shim_metrics_test.go @@ -0,0 +1,114 @@ +// Copyright (c) 2020 Ant Financial +// +// SPDX-License-Identifier: Apache-2.0 +// + +package containerdshim + +import ( + "testing" + + vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers" + "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/vcmock" + + "github.com/stretchr/testify/assert" +) + +func getSandboxCPUFunc(c, m uint64) func() (vc.SandboxStats, error) { + return func() (vc.SandboxStats, error) { + return vc.SandboxStats{ + CgroupStats: vc.CgroupStats{ + 
CPUStats: vc.CPUStats{ + CPUUsage: vc.CPUUsage{ + TotalUsage: c * 1e9, + }, + }, + MemoryStats: vc.MemoryStats{ + Usage: vc.MemoryData{ + Usage: m, + }, + }, + }, + Cpus: 20, + }, nil + } +} + +func getStatsContainerCPUFunc(fooCPU, barCPU, fooMem, barMem uint64) func(contID string) (vc.ContainerStats, error) { + return func(contID string) (vc.ContainerStats, error) { + vCPU := fooCPU + vMem := fooMem + if contID == "bar" { + vCPU = barCPU + vMem = barMem + } + return vc.ContainerStats{ + CgroupStats: &vc.CgroupStats{ + CPUStats: vc.CPUStats{ + CPUUsage: vc.CPUUsage{ + TotalUsage: vCPU * 1e9, + }, + }, + MemoryStats: vc.MemoryStats{ + Usage: vc.MemoryData{ + Usage: vMem, + }, + }, + }, + }, nil + + } +} + +func TestStatsSandbox(t *testing.T) { + assert := assert.New(t) + + sandbox := &vcmock.Sandbox{ + MockID: testSandboxID, + StatsFunc: getSandboxCPUFunc(1000, 100000), + StatsContainerFunc: getStatsContainerCPUFunc(100, 200, 10000, 20000), + MockContainers: []*vcmock.Container{ + &vcmock.Container{ + MockID: "foo", + }, + &vcmock.Container{ + MockID: "bar", + }, + }, + } + + s := &service{ + id: testSandboxID, + sandbox: sandbox, + containers: make(map[string]*container), + } + + initialSandboxStats, initialContainerStats, err := s.statsSandbox() + assert.Nil(err) + assert.Equal(uint64(1000*1e9), initialSandboxStats.CgroupStats.CPUStats.CPUUsage.TotalUsage) + assert.Equal(2, len(initialContainerStats)) + assert.Equal(uint64(100*1e9), initialContainerStats[0].CgroupStats.CPUStats.CPUUsage.TotalUsage) + assert.Equal(uint64(200*1e9), initialContainerStats[1].CgroupStats.CPUStats.CPUUsage.TotalUsage) + assert.Equal(uint64(10000), initialContainerStats[0].CgroupStats.MemoryStats.Usage.Usage) + assert.Equal(uint64(20000), initialContainerStats[1].CgroupStats.MemoryStats.Usage.Usage) + + // get the 2nd stats + sandbox.StatsFunc = getSandboxCPUFunc(2000, 110000) + sandbox.StatsContainerFunc = getStatsContainerCPUFunc(200, 400, 20000, 40000) + + finishSandboxStats, 
finishContainersStats, err := s.statsSandbox() + + // calc overhead + mem, cpu := calcOverhead(initialSandboxStats, finishSandboxStats, initialContainerStats, finishContainersStats, 1e9) + + // 70000 = (host2.cpu - host1.cpu - (delta containers.1.cpu + delta containers.2.cpu)) * 100 + // = (2000 - 1000 - (200 -100 + 400 - 200)) * 100 + // = (1000 - 300) * 100 + // = 70000 + assert.Equal(float64(70000), cpu) + + // 50000 = 110000 - sum(containers) + // = 110000 - (20000 + 40000) + // = 50000 + assert.Equal(float64(50000), mem) +} diff --git a/src/runtime/virtcontainers/interfaces.go b/src/runtime/virtcontainers/interfaces.go index 95ac941e6..dab6ca2c7 100644 --- a/src/runtime/virtcontainers/interfaces.go +++ b/src/runtime/virtcontainers/interfaces.go @@ -68,6 +68,8 @@ type VCSandbox interface { ID() string SetAnnotations(annotations map[string]string) error + Stats() (SandboxStats, error) + Start() error Stop(force bool) error Release() error diff --git a/src/runtime/virtcontainers/pkg/vcmock/sandbox.go b/src/runtime/virtcontainers/pkg/vcmock/sandbox.go index 2019989e2..b92c1cf52 100644 --- a/src/runtime/virtcontainers/pkg/vcmock/sandbox.go +++ b/src/runtime/virtcontainers/pkg/vcmock/sandbox.go @@ -125,6 +125,9 @@ func (s *Sandbox) StatusContainer(contID string) (vc.ContainerStatus, error) { // StatsContainer implements the VCSandbox function of the same name. func (s *Sandbox) StatsContainer(contID string) (vc.ContainerStats, error) { + if s.StatsContainerFunc != nil { + return s.StatsContainerFunc(contID) + } return vc.ContainerStats{}, nil } @@ -232,3 +235,11 @@ func (s *Sandbox) GetAgentMetrics() (string, error) { } return "", nil } + +// Stats implements the VCSandbox function of the same name. 
+func (s *Sandbox) Stats() (vc.SandboxStats, error) { + if s.StatsFunc != nil { + return s.StatsFunc() + } + return vc.SandboxStats{}, nil +} diff --git a/src/runtime/virtcontainers/pkg/vcmock/types.go b/src/runtime/virtcontainers/pkg/vcmock/types.go index b58275827..89cf6ed33 100644 --- a/src/runtime/virtcontainers/pkg/vcmock/types.go +++ b/src/runtime/virtcontainers/pkg/vcmock/types.go @@ -66,6 +66,7 @@ type Sandbox struct { ListRoutesFunc func() ([]*vcTypes.Route, error) UpdateRuntimeMetricsFunc func() error GetAgentMetricsFunc func() (string, error) + StatsFunc func() (vc.SandboxStats, error) } // Container is a fake Container type used for testing