runtime: add pod overhead metrics

Add pod overhead metrics for CPU and memory to help with calculating sandbox overhead.

Fixes: #399

Signed-off-by: bin liu <bin@hyper.sh>
This commit is contained in:
bin liu 2020-07-08 18:00:55 +08:00
parent 7eedc95de4
commit 0790ca4933
8 changed files with 250 additions and 1 deletions

View File

@ -1698,6 +1698,28 @@ components:
fixed: false
values: []
since: 2.0.0
- name: kata_shim_pod_overhead_cpu
type: GAUGE
unit: "percent"
help: Kata Pod overhead for CPU resources(percent).
labels:
- name: sandbox_id
desc: ""
manually_edit: false
fixed: false
values: []
since: 2.0.0
- name: kata_shim_pod_overhead_memory_in_bytes
type: GAUGE
unit: bytes
help: Kata Pod overhead for memory resources(bytes).
labels:
- name: sandbox_id
desc: ""
manually_edit: false
fixed: false
values: []
since: 2.0.0
- name: kata_shim_proc_stat
type: GAUGE
unit: ""

View File

@ -309,6 +309,8 @@ Metrics about Kata containerd shim v2 process.
| `kata_shim_go_threads`: <br> Number of OS threads created. | `GAUGE` | | <ul><li>`sandbox_id`</li></ul> | 2.0.0 |
| `kata_shim_io_stat`: <br> Kata containerd shim v2 process IO statistics. | `GAUGE` | | <ul><li>`item` (see `/proc/<pid>/io`)<ul><li>`cancelledwritebytes`</li><li>`rchar`</li><li>`readbytes`</li><li>`syscr`</li><li>`syscw`</li><li>`wchar`</li><li>`writebytes`</li></ul></li><li>`sandbox_id`</li></ul> | 2.0.0 |
| `kata_shim_netdev`: <br> Kata containerd shim v2 network devices statistics. | `GAUGE` | | <ul><li>`interface` (network device name)</li><li>`item` (see `/proc/net/dev`)<ul><li>`recv_bytes`</li><li>`recv_compressed`</li><li>`recv_drop`</li><li>`recv_errs`</li><li>`recv_fifo`</li><li>`recv_frame`</li><li>`recv_multicast`</li><li>`recv_packets`</li><li>`sent_bytes`</li><li>`sent_carrier`</li><li>`sent_colls`</li><li>`sent_compressed`</li><li>`sent_drop`</li><li>`sent_errs`</li><li>`sent_fifo`</li><li>`sent_packets`</li></ul></li><li>`sandbox_id`</li></ul> | 2.0.0 |
| `kata_shim_pod_overhead_cpu`: <br> Kata Pod overhead for CPU resources(percent). | `GAUGE` | `percent` | <ul><li>`sandbox_id`</li></ul> | 2.0.0 |
| `kata_shim_pod_overhead_memory_in_bytes`: <br> Kata Pod overhead for memory resources(bytes). | `GAUGE` | `bytes` | <ul><li>`sandbox_id`</li></ul> | 2.0.0 |
| `kata_shim_proc_stat`: <br> Kata containerd shim v2 process statistics. | `GAUGE` | | <ul><li>`item` (see `/proc/<pid>/stat`)<ul><li>`cstime`</li><li>`cutime`</li><li>`stime`</li><li>`utime`</li></ul></li><li>`sandbox_id`</li></ul> | 2.0.0 |
| `kata_shim_proc_status`: <br> Kata containerd shim v2 process status. | `GAUGE` | | <ul><li>`item` (see `/proc/<pid>/status`)<ul><li>`hugetlbpages`</li><li>`nonvoluntary_ctxt_switches`</li><li>`rssanon`</li><li>`rssfile`</li><li>`rssshmem`</li><li>`vmdata`</li><li>`vmexe`</li><li>`vmhwm`</li><li>`vmlck`</li><li>`vmlib`</li><li>`vmpeak`</li><li>`vmpin`</li><li>`vmpmd`</li><li>`vmpte`</li><li>`vmrss`</li><li>`vmsize`</li><li>`vmstk`</li><li>`vmswap`</li><li>`voluntary_ctxt_switches`</li></ul></li><li>`sandbox_id`</li></ul> | 2.0.0 |
| `kata_shim_process_cpu_seconds_total`: <br> Total user and system CPU time spent in seconds. | `COUNTER` | `seconds` | <ul><li>`sandbox_id`</li></ul> | 2.0.0 |

View File

@ -60,7 +60,7 @@ func (s *service) serveMetrics(w http.ResponseWriter, r *http.Request) {
agentMetrics, err := s.sandbox.GetAgentMetrics()
if err != nil {
logrus.WithError(err).Error("failed GetAgentMetrics")
if isGRPCErrorCode(codes.Unimplemented, err) {
if isGRPCErrorCode(codes.NotFound, err) {
logrus.Warn("metrics API not supportted by this agent.")
ifSupportAgentMetricsAPI = false
return
@ -74,6 +74,11 @@ func (s *service) serveMetrics(w http.ResponseWriter, r *http.Request) {
for _, mf := range list {
encoder.Encode(mf)
}
// Collecting pod overhead metrics requires sleeping to observe the change in CPU/memory usage,
// so only trigger the collection here; the resulting data will be reported
// on the next collection request from the Prometheus server.
go s.setPodOverheadMetrics()
}
func decodeAgentMetrics(body string) []*dto.MetricFamily {

View File

@ -6,7 +6,10 @@
package containerdshim
import (
"time"
mutils "github.com/kata-containers/kata-containers/src/runtime/pkg/utils"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/procfs"
)
@ -66,6 +69,18 @@ var (
Name: "fds",
Help: "Kata containerd shim v2 open FDs.",
})
katashimPodOverheadCPU = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespaceKatashim,
Name: "pod_overhead_cpu",
Help: "Kata Pod overhead for CPU resources(percent).",
})
katashimPodOverheadMemory = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespaceKatashim,
Name: "pod_overhead_memory_in_bytes",
Help: "Kata Pod overhead for memory resources(bytes).",
})
)
func registerMetrics() {
@ -76,6 +91,8 @@ func registerMetrics() {
prometheus.MustRegister(katashimNetdev)
prometheus.MustRegister(katashimIOStat)
prometheus.MustRegister(katashimOpenFDs)
prometheus.MustRegister(katashimPodOverheadCPU)
prometheus.MustRegister(katashimPodOverheadMemory)
}
// updateShimMetrics will update metrics for kata shim process itself
@ -116,3 +133,78 @@ func updateShimMetrics() error {
return nil
}
// statsSandbox gathers the sandbox-level stats together with the stats of
// every container currently reported by the sandbox.
//
// If the sandbox stats or any single container's stats cannot be read, the
// error is returned along with empty values; no partial result is exposed.
func (s *service) statsSandbox() (vc.SandboxStats, []vc.ContainerStats, error) {
	sbStats, err := s.sandbox.Stats()
	if err != nil {
		return vc.SandboxStats{}, []vc.ContainerStats{}, err
	}

	perContainer := []vc.ContainerStats{}
	for _, ctr := range s.sandbox.GetAllContainers() {
		stats, statsErr := s.sandbox.StatsContainer(ctr.ID())
		if statsErr != nil {
			return vc.SandboxStats{}, []vc.ContainerStats{}, statsErr
		}
		perContainer = append(perContainer, stats)
	}

	return sbStats, perContainer, nil
}
// calcOverhead derives the pod overhead from two samples of sandbox and
// container stats.
//
// The CPU overhead is the sandbox (host side) CPU usage delta between the two
// samples minus the summed container (guest side) CPU usage delta, each
// divided by deltaTime and multiplied by 100. The memory overhead is the
// sandbox memory usage minus the summed container memory usage, both taken
// from the finish sample only.
//
// deltaTime must use the same unit as CPUUsage.TotalUsage for the CPU result
// to be a percentage (getPodOverhead passes elapsed nanoseconds; TotalUsage is
// presumably nanoseconds as well -- TODO confirm). When deltaTime is not
// positive no rate can be computed and the CPU overhead is reported as 0.
//
// Container entries whose CgroupStats pointer is nil contribute nothing.
//
// It returns the memory overhead first and the CPU overhead second. Either
// value may be negative when the guest side reports more than the host side.
func calcOverhead(initialSandboxStats, finishSandboxStats vc.SandboxStats, initialContainerStats, finishContainersStats []vc.ContainerStats, deltaTime float64) (float64, float64) {
	// signedDelta returns a-b as a float64. Subtracting in uint64 first would
	// wrap around to a huge positive number whenever b > a.
	signedDelta := func(a, b uint64) float64 {
		if a >= b {
			return float64(a - b)
		}
		return -float64(b - a)
	}

	var guestInitCPU, guestFinalCPU, guestMemoryUsage uint64
	for _, cs := range initialContainerStats {
		// CgroupStats is a pointer on container stats and may be unset.
		if cs.CgroupStats == nil {
			continue
		}
		guestInitCPU += cs.CgroupStats.CPUStats.CPUUsage.TotalUsage
	}
	for _, cs := range finishContainersStats {
		if cs.CgroupStats == nil {
			continue
		}
		guestFinalCPU += cs.CgroupStats.CPUStats.CPUUsage.TotalUsage
		guestMemoryUsage += cs.CgroupStats.MemoryStats.Usage.Usage
	}

	hostInitCPU := initialSandboxStats.CgroupStats.CPUStats.CPUUsage.TotalUsage
	hostFinalCPU := finishSandboxStats.CgroupStats.CPUStats.CPUUsage.TotalUsage
	hostMemoryUsage := finishSandboxStats.CgroupStats.MemoryStats.Usage.Usage

	memOverhead := signedDelta(hostMemoryUsage, guestMemoryUsage)

	// Avoid publishing Inf/NaN when the sampling interval is degenerate.
	if deltaTime <= 0 {
		return memOverhead, 0
	}

	cpuUsageGuest := signedDelta(guestFinalCPU, guestInitCPU) / deltaTime * 100
	cpuUsageHost := signedDelta(hostFinalCPU, hostInitCPU) / deltaTime * 100

	return memOverhead, cpuUsageHost - cpuUsageGuest
}
// getPodOverhead samples the sandbox and container stats twice, one second
// apart, and returns the overhead computed by calcOverhead from the two
// samples: the memory overhead first, the CPU overhead second.
//
// The call blocks for at least one second. If either sample cannot be
// collected, zero values and the error are returned.
func (s *service) getPodOverhead() (float64, float64, error) {
	start := time.Now()
	initialSandboxStats, initialContainerStats, err := s.statsSandbox()
	if err != nil {
		return 0, 0, err
	}

	// Wait for 1 second to calculate CPU usage
	time.Sleep(time.Second)

	// time.Since relies on the monotonic clock reading carried by start, so
	// the elapsed time stays correct even if the wall clock is adjusted
	// between the two samples.
	deltaTime := float64(time.Since(start).Nanoseconds())

	finishSandboxStats, finishContainersStats, err := s.statsSandbox()
	if err != nil {
		return 0, 0, err
	}

	mem, cpu := calcOverhead(initialSandboxStats, finishSandboxStats, initialContainerStats, finishContainersStats, deltaTime)
	return mem, cpu, nil
}
// setPodOverheadMetrics measures the current pod overhead and publishes it
// through the katashimPodOverheadMemory and katashimPodOverheadCPU gauges.
//
// When the measurement fails the gauges are left untouched and the error is
// returned.
func (s *service) setPodOverheadMetrics() error {
	memOverhead, cpuOverhead, err := s.getPodOverhead()
	if err == nil {
		katashimPodOverheadMemory.Set(memOverhead)
		katashimPodOverheadCPU.Set(cpuOverhead)
	}
	return err
}

View File

@ -0,0 +1,114 @@
// Copyright (c) 2020 Ant Financial
//
// SPDX-License-Identifier: Apache-2.0
//
package containerdshim
import (
"testing"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/vcmock"
"github.com/stretchr/testify/assert"
)
// getSandboxCPUFunc returns a stub suitable for vcmock.Sandbox.StatsFunc.
//
// The stub reports a total CPU usage of c*1e9, a memory usage of m and a
// fixed Cpus value of 20, and never fails.
func getSandboxCPUFunc(c, m uint64) func() (vc.SandboxStats, error) {
	return func() (vc.SandboxStats, error) {
		var stats vc.SandboxStats
		stats.CgroupStats.CPUStats.CPUUsage.TotalUsage = c * 1e9
		stats.CgroupStats.MemoryStats.Usage.Usage = m
		stats.Cpus = 20
		return stats, nil
	}
}
// getStatsContainerCPUFunc returns a stub suitable for
// vcmock.Sandbox.StatsContainerFunc.
//
// The stub reports barCPU/barMem for the container whose ID is "bar" and
// fooCPU/fooMem for any other ID. The CPU value is scaled by 1e9 before being
// stored as the total usage. The stub never fails.
func getStatsContainerCPUFunc(fooCPU, barCPU, fooMem, barMem uint64) func(contID string) (vc.ContainerStats, error) {
	return func(contID string) (vc.ContainerStats, error) {
		cpu, mem := fooCPU, fooMem
		if contID == "bar" {
			cpu, mem = barCPU, barMem
		}

		// A fresh CgroupStats is allocated on every call.
		cgroup := &vc.CgroupStats{}
		cgroup.CPUStats.CPUUsage.TotalUsage = cpu * 1e9
		cgroup.MemoryStats.Usage.Usage = mem

		return vc.ContainerStats{CgroupStats: cgroup}, nil
	}
}
// TestStatsSandbox checks that statsSandbox returns the values served by the
// mocked sandbox for the sandbox itself and for each of its containers, and
// that calcOverhead turns two such samples into the expected CPU and memory
// overhead.
func TestStatsSandbox(t *testing.T) {
	assert := assert.New(t)

	sandbox := &vcmock.Sandbox{
		MockID:             testSandboxID,
		StatsFunc:          getSandboxCPUFunc(1000, 100000),
		StatsContainerFunc: getStatsContainerCPUFunc(100, 200, 10000, 20000),
		MockContainers: []*vcmock.Container{
			{MockID: "foo"},
			{MockID: "bar"},
		},
	}

	s := &service{
		id:         testSandboxID,
		sandbox:    sandbox,
		containers: make(map[string]*container),
	}

	// get the 1st stats
	initialSandboxStats, initialContainerStats, err := s.statsSandbox()
	assert.Nil(err)
	assert.Equal(uint64(1000*1e9), initialSandboxStats.CgroupStats.CPUStats.CPUUsage.TotalUsage)
	assert.Equal(2, len(initialContainerStats))
	assert.Equal(uint64(100*1e9), initialContainerStats[0].CgroupStats.CPUStats.CPUUsage.TotalUsage)
	assert.Equal(uint64(200*1e9), initialContainerStats[1].CgroupStats.CPUStats.CPUUsage.TotalUsage)
	assert.Equal(uint64(10000), initialContainerStats[0].CgroupStats.MemoryStats.Usage.Usage)
	assert.Equal(uint64(20000), initialContainerStats[1].CgroupStats.MemoryStats.Usage.Usage)

	// get the 2nd stats
	sandbox.StatsFunc = getSandboxCPUFunc(2000, 110000)
	sandbox.StatsContainerFunc = getStatsContainerCPUFunc(200, 400, 20000, 40000)

	finishSandboxStats, finishContainersStats, err := s.statsSandbox()
	// The second sample must succeed too, otherwise the overhead below would
	// be computed from empty stats.
	assert.Nil(err)

	// calc overhead
	mem, cpu := calcOverhead(initialSandboxStats, finishSandboxStats, initialContainerStats, finishContainersStats, 1e9)

	// 70000 = (host2.cpu - host1.cpu - (delta containers.1.cpu + delta containers.2.cpu)) * 100
	//       = (2000 - 1000 - (200 - 100 + 400 - 200)) * 100
	//       = (1000 - 300) * 100
	//       = 70000
	assert.Equal(float64(70000), cpu)

	// 50000 = 110000 - sum(containers)
	//       = 110000 - (20000 + 40000)
	//       = 50000
	assert.Equal(float64(50000), mem)
}

View File

@ -68,6 +68,8 @@ type VCSandbox interface {
ID() string
SetAnnotations(annotations map[string]string) error
Stats() (SandboxStats, error)
Start() error
Stop(force bool) error
Release() error

View File

@ -125,6 +125,9 @@ func (s *Sandbox) StatusContainer(contID string) (vc.ContainerStatus, error) {
// StatsContainer implements the VCSandbox function of the same name.
// It delegates to StatsContainerFunc when one is set; otherwise it returns
// empty container stats and no error.
func (s *Sandbox) StatsContainer(contID string) (vc.ContainerStats, error) {
	if s.StatsContainerFunc == nil {
		return vc.ContainerStats{}, nil
	}
	return s.StatsContainerFunc(contID)
}
@ -232,3 +235,11 @@ func (s *Sandbox) GetAgentMetrics() (string, error) {
}
return "", nil
}
// Stats implements the VCSandbox function of the same name.
// It delegates to StatsFunc when one is set; otherwise it returns empty
// sandbox stats and no error.
func (s *Sandbox) Stats() (vc.SandboxStats, error) {
	if s.StatsFunc == nil {
		return vc.SandboxStats{}, nil
	}
	return s.StatsFunc()
}

View File

@ -66,6 +66,7 @@ type Sandbox struct {
ListRoutesFunc func() ([]*vcTypes.Route, error)
UpdateRuntimeMetricsFunc func() error
GetAgentMetricsFunc func() (string, error)
StatsFunc func() (vc.SandboxStats, error)
}
// Container is a fake Container type used for testing