diff --git a/docs/design/data/metrics.yaml b/docs/design/data/metrics.yaml
index 549883848..eeb8c5ced 100644
--- a/docs/design/data/metrics.yaml
+++ b/docs/design/data/metrics.yaml
@@ -1698,6 +1698,28 @@ components:
fixed: false
values: []
since: 2.0.0
+ - name: kata_shim_pod_overhead_cpu
+ type: GAUGE
+ unit: "percent"
+ help: Kata Pod overhead for CPU resources(percent).
+ labels:
+ - name: sandbox_id
+ desc: ""
+ manually_edit: false
+ fixed: false
+ values: []
+ since: 2.0.0
+ - name: kata_shim_pod_overhead_memory_in_bytes
+ type: GAUGE
+ unit: bytes
+ help: Kata Pod overhead for memory resources(bytes).
+ labels:
+ - name: sandbox_id
+ desc: ""
+ manually_edit: false
+ fixed: false
+ values: []
+ since: 2.0.0
- name: kata_shim_proc_stat
type: GAUGE
unit: ""
diff --git a/docs/design/kata-2-0-metrics.md b/docs/design/kata-2-0-metrics.md
index 72321d81b..a3321dbab 100644
--- a/docs/design/kata-2-0-metrics.md
+++ b/docs/design/kata-2-0-metrics.md
@@ -309,6 +309,8 @@ Metrics about Kata containerd shim v2 process.
| `kata_shim_go_threads`:
Number of OS threads created. | `GAUGE` | |
| 2.0.0 |
| `kata_shim_io_stat`:
Kata containerd shim v2 process IO statistics. | `GAUGE` | | - `item` (see `/proc//io`)
- `cancelledwritebytes`
- `rchar`
- `readbytes`
- `syscr`
- `syscw`
- `wchar`
- `writebytes`
- `sandbox_id`
| 2.0.0 |
| `kata_shim_netdev`:
Kata containerd shim v2 network devices statistics. | `GAUGE` | | - `interface` (network device name)
- `item` (see `/proc/net/dev`)
- `recv_bytes`
- `recv_compressed`
- `recv_drop`
- `recv_errs`
- `recv_fifo`
- `recv_frame`
- `recv_multicast`
- `recv_packets`
- `sent_bytes`
- `sent_carrier`
- `sent_colls`
- `sent_compressed`
- `sent_drop`
- `sent_errs`
- `sent_fifo`
- `sent_packets`
- `sandbox_id`
| 2.0.0 |
+| `kata_shim_pod_overhead_cpu`:
Kata Pod overhead for CPU resources(percent). | `GAUGE` | `percent` | | 2.0.0 |
+| `kata_shim_pod_overhead_memory_in_bytes`:
Kata Pod overhead for memory resources(bytes). | `GAUGE` | `bytes` | | 2.0.0 |
| `kata_shim_proc_stat`:
Kata containerd shim v2 process statistics. | `GAUGE` | | - `item` (see `/proc//stat`)
- `cstime`
- `cutime`
- `stime`
- `utime`
- `sandbox_id`
| 2.0.0 |
| `kata_shim_proc_status`:
Kata containerd shim v2 process status. | `GAUGE` | | - `item` (see `/proc//status`)
- `hugetlbpages`
- `nonvoluntary_ctxt_switches`
- `rssanon`
- `rssfile`
- `rssshmem`
- `vmdata`
- `vmexe`
- `vmhwm`
- `vmlck`
- `vmlib`
- `vmpeak`
- `vmpin`
- `vmpmd`
- `vmpte`
- `vmrss`
- `vmsize`
- `vmstk`
- `vmswap`
- `voluntary_ctxt_switches`
- `sandbox_id`
| 2.0.0 |
| `kata_shim_process_cpu_seconds_total`:
Total user and system CPU time spent in seconds. | `COUNTER` | `seconds` | | 2.0.0 |
diff --git a/src/runtime/containerd-shim-v2/shim_management.go b/src/runtime/containerd-shim-v2/shim_management.go
index 2018347c0..69ba88e3f 100644
--- a/src/runtime/containerd-shim-v2/shim_management.go
+++ b/src/runtime/containerd-shim-v2/shim_management.go
@@ -60,7 +60,7 @@ func (s *service) serveMetrics(w http.ResponseWriter, r *http.Request) {
agentMetrics, err := s.sandbox.GetAgentMetrics()
if err != nil {
logrus.WithError(err).Error("failed GetAgentMetrics")
- if isGRPCErrorCode(codes.Unimplemented, err) {
+ if isGRPCErrorCode(codes.NotFound, err) {
logrus.Warn("metrics API not supportted by this agent.")
ifSupportAgentMetricsAPI = false
return
@@ -74,6 +74,11 @@ func (s *service) serveMetrics(w http.ResponseWriter, r *http.Request) {
for _, mf := range list {
encoder.Encode(mf)
}
+
+ // Collecting pod overhead metrics requires sleeping to observe the change in CPU/memory usage,
+ // so only trigger the collection here; the resulting data will be reported by
+ // the next collection request from the Prometheus server.
+ go s.setPodOverheadMetrics()
}
func decodeAgentMetrics(body string) []*dto.MetricFamily {
diff --git a/src/runtime/containerd-shim-v2/shim_metrics.go b/src/runtime/containerd-shim-v2/shim_metrics.go
index e26da7beb..455af2a2b 100644
--- a/src/runtime/containerd-shim-v2/shim_metrics.go
+++ b/src/runtime/containerd-shim-v2/shim_metrics.go
@@ -6,7 +6,10 @@
package containerdshim
import (
+ "time"
+
mutils "github.com/kata-containers/kata-containers/src/runtime/pkg/utils"
+ vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/procfs"
)
@@ -66,6 +69,18 @@ var (
Name: "fds",
Help: "Kata containerd shim v2 open FDs.",
})
+
+ katashimPodOverheadCPU = prometheus.NewGauge(prometheus.GaugeOpts{
+ Namespace: namespaceKatashim,
+ Name: "pod_overhead_cpu",
+ Help: "Kata Pod overhead for CPU resources(percent).",
+ })
+
+ katashimPodOverheadMemory = prometheus.NewGauge(prometheus.GaugeOpts{
+ Namespace: namespaceKatashim,
+ Name: "pod_overhead_memory_in_bytes",
+ Help: "Kata Pod overhead for memory resources(bytes).",
+ })
)
func registerMetrics() {
@@ -76,6 +91,8 @@ func registerMetrics() {
prometheus.MustRegister(katashimNetdev)
prometheus.MustRegister(katashimIOStat)
prometheus.MustRegister(katashimOpenFDs)
+ prometheus.MustRegister(katashimPodOverheadCPU)
+ prometheus.MustRegister(katashimPodOverheadMemory)
}
// updateShimMetrics will update metrics for kata shim process itself
@@ -116,3 +133,78 @@ func updateShimMetrics() error {
return nil
}
+
+// statsSandbox gathers the sandbox-level stats plus the stats of every
+// container reported by the sandbox. If any lookup fails, an empty
+// SandboxStats and an empty slice are returned together with the error.
+func (s *service) statsSandbox() (vc.SandboxStats, []vc.ContainerStats, error) {
+	sbStats, err := s.sandbox.Stats()
+	if err != nil {
+		return vc.SandboxStats{}, []vc.ContainerStats{}, err
+	}
+
+	ctrs := s.sandbox.GetAllContainers()
+	perContainer := make([]vc.ContainerStats, 0, len(ctrs))
+	for _, ctr := range ctrs {
+		st, statErr := s.sandbox.StatsContainer(ctr.ID())
+		if statErr != nil {
+			return vc.SandboxStats{}, []vc.ContainerStats{}, statErr
+		}
+		perContainer = append(perContainer, st)
+	}
+
+	return sbStats, perContainer, nil
+}
+
+// calcOverhead computes the pod overhead from two samples of sandbox and
+// container stats.
+//
+// The memory overhead is the sandbox memory usage of the final sample minus
+// the summed memory usage of the containers in the final sample. The CPU
+// overhead is the CPU time consumed by the sandbox between the two samples
+// minus the CPU time consumed by the containers over the same period, each
+// divided by deltaTime and multiplied by 100. deltaTime must be expressed in
+// the same unit as the TotalUsage counters (getPodOverhead passes elapsed
+// nanoseconds).
+//
+// The results are returned as (memory overhead, CPU overhead). Either value
+// may be negative when the containers report more usage than the sandbox or
+// when a counter went backwards between samples. If deltaTime is not
+// positive the CPU overhead is reported as 0.
+func calcOverhead(initialSandboxStats, finishSandboxStats vc.SandboxStats, initialContainerStats, finishContainersStats []vc.ContainerStats, deltaTime float64) (float64, float64) {
+	// signedDiff returns a-b as a float64. Subtracting in uint64 directly
+	// would wrap around to a huge positive number whenever b > a.
+	signedDiff := func(a, b uint64) float64 {
+		if a >= b {
+			return float64(a - b)
+		}
+		return -float64(b - a)
+	}
+
+	// sumUsage totals CPU time and memory usage over all containers.
+	// ContainerStats.CgroupStats is a pointer, so entries without cgroup
+	// stats are skipped instead of being dereferenced.
+	sumUsage := func(stats []vc.ContainerStats) (cpu, mem uint64) {
+		for _, cs := range stats {
+			if cs.CgroupStats == nil {
+				continue
+			}
+			cpu += cs.CgroupStats.CPUStats.CPUUsage.TotalUsage
+			mem += cs.CgroupStats.MemoryStats.Usage.Usage
+		}
+		return cpu, mem
+	}
+
+	guestInitCPU, _ := sumUsage(initialContainerStats)
+	guestFinalCPU, guestMemoryUsage := sumUsage(finishContainersStats)
+
+	hostInitCPU := initialSandboxStats.CgroupStats.CPUStats.CPUUsage.TotalUsage
+	hostFinalCPU := finishSandboxStats.CgroupStats.CPUStats.CPUUsage.TotalUsage
+	hostMemoryUsage := finishSandboxStats.CgroupStats.MemoryStats.Usage.Usage
+
+	memOverhead := signedDiff(hostMemoryUsage, guestMemoryUsage)
+
+	// Without a positive sampling interval the CPU ratio is undefined
+	// (division by zero would yield Inf or NaN).
+	if deltaTime <= 0 {
+		return memOverhead, 0
+	}
+
+	cpuUsageGuest := signedDiff(guestFinalCPU, guestInitCPU) / deltaTime * 100
+	cpuUsageHost := signedDiff(hostFinalCPU, hostInitCPU) / deltaTime * 100
+
+	return memOverhead, cpuUsageHost - cpuUsageGuest
+}
+
+// getPodOverhead samples the sandbox and container stats twice, about one
+// second apart, and returns the (memory, CPU) overhead that calcOverhead
+// derives from the two samples. It blocks for the duration of the sleep.
+// On any stats error it returns zero values and the error.
+func (s *service) getPodOverhead() (float64, float64, error) {
+	// Keep the time.Time value instead of UnixNano(): time.Since then uses
+	// the monotonic clock reading, so a wall-clock adjustment during the
+	// sleep cannot skew or negate the measured interval.
+	start := time.Now()
+	initialSandboxStats, initialContainerStats, err := s.statsSandbox()
+	if err != nil {
+		return 0, 0, err
+	}
+
+	// Wait for 1 second to calculate CPU usage
+	time.Sleep(time.Second)
+	// Elapsed nanoseconds between the two sampling points.
+	deltaTime := float64(time.Since(start).Nanoseconds())
+
+	finishSandboxStats, finishContainersStats, err := s.statsSandbox()
+	if err != nil {
+		return 0, 0, err
+	}
+	mem, cpu := calcOverhead(initialSandboxStats, finishSandboxStats, initialContainerStats, finishContainersStats, deltaTime)
+	return mem, cpu, nil
+}
+
+// setPodOverheadMetrics measures the pod overhead and stores the result in
+// the pod overhead memory and CPU gauges. If the measurement fails the
+// gauges are left untouched and the error is returned.
+func (s *service) setPodOverheadMetrics() error {
+	memOverhead, cpuOverhead, err := s.getPodOverhead()
+	if err == nil {
+		katashimPodOverheadMemory.Set(memOverhead)
+		katashimPodOverheadCPU.Set(cpuOverhead)
+	}
+	return err
+}
diff --git a/src/runtime/containerd-shim-v2/shim_metrics_test.go b/src/runtime/containerd-shim-v2/shim_metrics_test.go
new file mode 100644
index 000000000..ba1afe552
--- /dev/null
+++ b/src/runtime/containerd-shim-v2/shim_metrics_test.go
@@ -0,0 +1,114 @@
+// Copyright (c) 2020 Ant Financial
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+package containerdshim
+
+import (
+ "testing"
+
+ vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
+ "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/vcmock"
+
+ "github.com/stretchr/testify/assert"
+)
+
+// getSandboxCPUFunc returns a sandbox Stats stub that reports c*1e9 as the
+// total CPU usage, m as the memory usage and 20 CPUs, with a nil error.
+func getSandboxCPUFunc(c, m uint64) func() (vc.SandboxStats, error) {
+	return func() (vc.SandboxStats, error) {
+		var stats vc.SandboxStats
+		stats.Cpus = 20
+		stats.CgroupStats.CPUStats.CPUUsage.TotalUsage = c * 1e9
+		stats.CgroupStats.MemoryStats.Usage.Usage = m
+		return stats, nil
+	}
+}
+
+// getStatsContainerCPUFunc returns a StatsContainer stub. The container with
+// ID "bar" reports barCPU*1e9 total CPU usage and barMem memory usage; any
+// other ID reports fooCPU*1e9 and fooMem. The error is always nil.
+func getStatsContainerCPUFunc(fooCPU, barCPU, fooMem, barMem uint64) func(contID string) (vc.ContainerStats, error) {
+	return func(contID string) (vc.ContainerStats, error) {
+		cpu, mem := fooCPU, fooMem
+		if contID == "bar" {
+			cpu, mem = barCPU, barMem
+		}
+
+		cg := &vc.CgroupStats{}
+		cg.CPUStats.CPUUsage.TotalUsage = cpu * 1e9
+		cg.MemoryStats.Usage.Usage = mem
+
+		return vc.ContainerStats{CgroupStats: cg}, nil
+	}
+}
+
+// TestStatsSandbox checks that statsSandbox returns the values produced by
+// the mocked sandbox and that calcOverhead derives the expected memory and
+// CPU overhead from two consecutive samples.
+func TestStatsSandbox(t *testing.T) {
+	assert := assert.New(t)
+
+	sandbox := &vcmock.Sandbox{
+		MockID:             testSandboxID,
+		StatsFunc:          getSandboxCPUFunc(1000, 100000),
+		StatsContainerFunc: getStatsContainerCPUFunc(100, 200, 10000, 20000),
+		MockContainers: []*vcmock.Container{
+			{
+				MockID: "foo",
+			},
+			{
+				MockID: "bar",
+			},
+		},
+	}
+
+	s := &service{
+		id:         testSandboxID,
+		sandbox:    sandbox,
+		containers: make(map[string]*container),
+	}
+
+	initialSandboxStats, initialContainerStats, err := s.statsSandbox()
+	assert.Nil(err)
+	assert.Equal(uint64(1000*1e9), initialSandboxStats.CgroupStats.CPUStats.CPUUsage.TotalUsage)
+	assert.Equal(2, len(initialContainerStats))
+	assert.Equal(uint64(100*1e9), initialContainerStats[0].CgroupStats.CPUStats.CPUUsage.TotalUsage)
+	assert.Equal(uint64(200*1e9), initialContainerStats[1].CgroupStats.CPUStats.CPUUsage.TotalUsage)
+	assert.Equal(uint64(10000), initialContainerStats[0].CgroupStats.MemoryStats.Usage.Usage)
+	assert.Equal(uint64(20000), initialContainerStats[1].CgroupStats.MemoryStats.Usage.Usage)
+
+	// get the 2nd stats
+	sandbox.StatsFunc = getSandboxCPUFunc(2000, 110000)
+	sandbox.StatsContainerFunc = getStatsContainerCPUFunc(200, 400, 20000, 40000)
+
+	finishSandboxStats, finishContainersStats, err := s.statsSandbox()
+	// The second sample must succeed too, otherwise the overhead below
+	// would be computed from zero-value stats.
+	assert.Nil(err)
+
+	// calc overhead
+	mem, cpu := calcOverhead(initialSandboxStats, finishSandboxStats, initialContainerStats, finishContainersStats, 1e9)
+
+	// 70000 = (host2.cpu - host1.cpu - (delta containers.1.cpu + delta containers.2.cpu)) * 100
+	//       = (2000 - 1000 - (200 - 100 + 400 - 200)) * 100
+	//       = (1000 - 300) * 100
+	//       = 70000
+	assert.Equal(float64(70000), cpu)
+
+	// 50000 = 110000 - sum(containers)
+	//       = 110000 - (20000 + 40000)
+	//       = 50000
+	assert.Equal(float64(50000), mem)
+}
diff --git a/src/runtime/virtcontainers/interfaces.go b/src/runtime/virtcontainers/interfaces.go
index 95ac941e6..dab6ca2c7 100644
--- a/src/runtime/virtcontainers/interfaces.go
+++ b/src/runtime/virtcontainers/interfaces.go
@@ -68,6 +68,8 @@ type VCSandbox interface {
ID() string
SetAnnotations(annotations map[string]string) error
+ Stats() (SandboxStats, error)
+
Start() error
Stop(force bool) error
Release() error
diff --git a/src/runtime/virtcontainers/pkg/vcmock/sandbox.go b/src/runtime/virtcontainers/pkg/vcmock/sandbox.go
index 2019989e2..b92c1cf52 100644
--- a/src/runtime/virtcontainers/pkg/vcmock/sandbox.go
+++ b/src/runtime/virtcontainers/pkg/vcmock/sandbox.go
@@ -125,6 +125,9 @@ func (s *Sandbox) StatusContainer(contID string) (vc.ContainerStatus, error) {
// StatsContainer implements the VCSandbox function of the same name.
func (s *Sandbox) StatsContainer(contID string) (vc.ContainerStats, error) {
+ // Delegate to the StatsContainerFunc hook when one is set.
+ if s.StatsContainerFunc != nil {
+ return s.StatsContainerFunc(contID)
+ }
+ // No hook configured: report empty stats and no error.
return vc.ContainerStats{}, nil
}
@@ -232,3 +235,11 @@ func (s *Sandbox) GetAgentMetrics() (string, error) {
}
return "", nil
}
+
+// Stats implements the VCSandbox function of the same name. It returns the
+// result of the StatsFunc hook when one is set, and empty stats with a nil
+// error otherwise.
+func (s *Sandbox) Stats() (vc.SandboxStats, error) {
+	if s.StatsFunc == nil {
+		return vc.SandboxStats{}, nil
+	}
+	return s.StatsFunc()
+}
diff --git a/src/runtime/virtcontainers/pkg/vcmock/types.go b/src/runtime/virtcontainers/pkg/vcmock/types.go
index b58275827..89cf6ed33 100644
--- a/src/runtime/virtcontainers/pkg/vcmock/types.go
+++ b/src/runtime/virtcontainers/pkg/vcmock/types.go
@@ -66,6 +66,7 @@ type Sandbox struct {
ListRoutesFunc func() ([]*vcTypes.Route, error)
UpdateRuntimeMetricsFunc func() error
GetAgentMetricsFunc func() (string, error)
+ StatsFunc func() (vc.SandboxStats, error)
}
// Container is a fake Container type used for testing