mirror of
https://github.com/kata-containers/kata-containers.git
synced 2025-04-29 12:14:48 +00:00
runtime: add pod overhead metrics
Add pod overhead metrics for CPU and memory to help with calculating sandbox overhead. Fixes: #399 Signed-off-by: bin liu <bin@hyper.sh>
This commit is contained in:
parent
7eedc95de4
commit
0790ca4933
@ -1698,6 +1698,28 @@ components:
|
||||
fixed: false
|
||||
values: []
|
||||
since: 2.0.0
|
||||
- name: kata_shim_pod_overhead_cpu
|
||||
type: GAUGE
|
||||
unit: "percent"
|
||||
help: Kata Pod overhead for CPU resources(percent).
|
||||
labels:
|
||||
- name: sandbox_id
|
||||
desc: ""
|
||||
manually_edit: false
|
||||
fixed: false
|
||||
values: []
|
||||
since: 2.0.0
|
||||
- name: kata_shim_pod_overhead_memory_in_bytes
|
||||
type: GAUGE
|
||||
unit: bytes
|
||||
help: Kata Pod overhead for memory resources(bytes).
|
||||
labels:
|
||||
- name: sandbox_id
|
||||
desc: ""
|
||||
manually_edit: false
|
||||
fixed: false
|
||||
values: []
|
||||
since: 2.0.0
|
||||
- name: kata_shim_proc_stat
|
||||
type: GAUGE
|
||||
unit: ""
|
||||
|
@ -309,6 +309,8 @@ Metrics about Kata containerd shim v2 process.
|
||||
| `kata_shim_go_threads`: <br> Number of OS threads created. | `GAUGE` | | <ul><li>`sandbox_id`</li></ul> | 2.0.0 |
|
||||
| `kata_shim_io_stat`: <br> Kata containerd shim v2 process IO statistics. | `GAUGE` | | <ul><li>`item` (see `/proc/<pid>/io`)<ul><li>`cancelledwritebytes`</li><li>`rchar`</li><li>`readbytes`</li><li>`syscr`</li><li>`syscw`</li><li>`wchar`</li><li>`writebytes`</li></ul></li><li>`sandbox_id`</li></ul> | 2.0.0 |
|
||||
| `kata_shim_netdev`: <br> Kata containerd shim v2 network devices statistics. | `GAUGE` | | <ul><li>`interface` (network device name)</li><li>`item` (see `/proc/net/dev`)<ul><li>`recv_bytes`</li><li>`recv_compressed`</li><li>`recv_drop`</li><li>`recv_errs`</li><li>`recv_fifo`</li><li>`recv_frame`</li><li>`recv_multicast`</li><li>`recv_packets`</li><li>`sent_bytes`</li><li>`sent_carrier`</li><li>`sent_colls`</li><li>`sent_compressed`</li><li>`sent_drop`</li><li>`sent_errs`</li><li>`sent_fifo`</li><li>`sent_packets`</li></ul></li><li>`sandbox_id`</li></ul> | 2.0.0 |
|
||||
| `kata_shim_pod_overhead_cpu`: <br> Kata Pod overhead for CPU resources(percent). | `GAUGE` | `percent` | <ul><li>`sandbox_id`</li></ul> | 2.0.0 |
|
||||
| `kata_shim_pod_overhead_memory_in_bytes`: <br> Kata Pod overhead for memory resources(bytes). | `GAUGE` | `bytes` | <ul><li>`sandbox_id`</li></ul> | 2.0.0 |
|
||||
| `kata_shim_proc_stat`: <br> Kata containerd shim v2 process statistics. | `GAUGE` | | <ul><li>`item` (see `/proc/<pid>/stat`)<ul><li>`cstime`</li><li>`cutime`</li><li>`stime`</li><li>`utime`</li></ul></li><li>`sandbox_id`</li></ul> | 2.0.0 |
|
||||
| `kata_shim_proc_status`: <br> Kata containerd shim v2 process status. | `GAUGE` | | <ul><li>`item` (see `/proc/<pid>/status`)<ul><li>`hugetlbpages`</li><li>`nonvoluntary_ctxt_switches`</li><li>`rssanon`</li><li>`rssfile`</li><li>`rssshmem`</li><li>`vmdata`</li><li>`vmexe`</li><li>`vmhwm`</li><li>`vmlck`</li><li>`vmlib`</li><li>`vmpeak`</li><li>`vmpin`</li><li>`vmpmd`</li><li>`vmpte`</li><li>`vmrss`</li><li>`vmsize`</li><li>`vmstk`</li><li>`vmswap`</li><li>`voluntary_ctxt_switches`</li></ul></li><li>`sandbox_id`</li></ul> | 2.0.0 |
|
||||
| `kata_shim_process_cpu_seconds_total`: <br> Total user and system CPU time spent in seconds. | `COUNTER` | `seconds` | <ul><li>`sandbox_id`</li></ul> | 2.0.0 |
|
||||
|
@ -60,7 +60,7 @@ func (s *service) serveMetrics(w http.ResponseWriter, r *http.Request) {
|
||||
agentMetrics, err := s.sandbox.GetAgentMetrics()
|
||||
if err != nil {
|
||||
logrus.WithError(err).Error("failed GetAgentMetrics")
|
||||
if isGRPCErrorCode(codes.Unimplemented, err) {
|
||||
if isGRPCErrorCode(codes.NotFound, err) {
|
||||
logrus.Warn("metrics API not supportted by this agent.")
|
||||
ifSupportAgentMetricsAPI = false
|
||||
return
|
||||
@ -74,6 +74,11 @@ func (s *service) serveMetrics(w http.ResponseWriter, r *http.Request) {
|
||||
for _, mf := range list {
|
||||
encoder.Encode(mf)
|
||||
}
|
||||
|
||||
// Collecting pod overhead metrics requires sleeping to observe the change in cpu/memory resource usage,
|
||||
// so here we only trigger the collection; the data will be served
|
||||
// on the next collection request from the Prometheus server.
|
||||
go s.setPodOverheadMetrics()
|
||||
}
|
||||
|
||||
func decodeAgentMetrics(body string) []*dto.MetricFamily {
|
||||
|
@ -6,7 +6,10 @@
|
||||
package containerdshim
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
mutils "github.com/kata-containers/kata-containers/src/runtime/pkg/utils"
|
||||
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/procfs"
|
||||
)
|
||||
@ -66,6 +69,18 @@ var (
|
||||
Name: "fds",
|
||||
Help: "Kata containerd shim v2 open FDs.",
|
||||
})
|
||||
|
||||
katashimPodOverheadCPU = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||
Namespace: namespaceKatashim,
|
||||
Name: "pod_overhead_cpu",
|
||||
Help: "Kata Pod overhead for CPU resources(percent).",
|
||||
})
|
||||
|
||||
katashimPodOverheadMemory = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||
Namespace: namespaceKatashim,
|
||||
Name: "pod_overhead_memory_in_bytes",
|
||||
Help: "Kata Pod overhead for memory resources(bytes).",
|
||||
})
|
||||
)
|
||||
|
||||
func registerMetrics() {
|
||||
@ -76,6 +91,8 @@ func registerMetrics() {
|
||||
prometheus.MustRegister(katashimNetdev)
|
||||
prometheus.MustRegister(katashimIOStat)
|
||||
prometheus.MustRegister(katashimOpenFDs)
|
||||
prometheus.MustRegister(katashimPodOverheadCPU)
|
||||
prometheus.MustRegister(katashimPodOverheadMemory)
|
||||
}
|
||||
|
||||
// updateShimMetrics will update metrics for kata shim process itself
|
||||
@ -116,3 +133,78 @@ func updateShimMetrics() error {
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// statsSandbox returns a detailed sandbox stats.
|
||||
func (s *service) statsSandbox() (vc.SandboxStats, []vc.ContainerStats, error) {
|
||||
sandboxStats, err := s.sandbox.Stats()
|
||||
if err != nil {
|
||||
return vc.SandboxStats{}, []vc.ContainerStats{}, err
|
||||
}
|
||||
|
||||
containerStats := []vc.ContainerStats{}
|
||||
for _, c := range s.sandbox.GetAllContainers() {
|
||||
cstats, err := s.sandbox.StatsContainer(c.ID())
|
||||
if err != nil {
|
||||
return vc.SandboxStats{}, []vc.ContainerStats{}, err
|
||||
}
|
||||
containerStats = append(containerStats, cstats)
|
||||
}
|
||||
|
||||
return sandboxStats, containerStats, nil
|
||||
}
|
||||
|
||||
func calcOverhead(initialSandboxStats, finishSandboxStats vc.SandboxStats, initialContainerStats, finishContainersStats []vc.ContainerStats, deltaTime float64) (float64, float64) {
|
||||
hostInitCPU := initialSandboxStats.CgroupStats.CPUStats.CPUUsage.TotalUsage
|
||||
guestInitCPU := uint64(0)
|
||||
for _, cs := range initialContainerStats {
|
||||
guestInitCPU += cs.CgroupStats.CPUStats.CPUUsage.TotalUsage
|
||||
}
|
||||
|
||||
hostFinalCPU := finishSandboxStats.CgroupStats.CPUStats.CPUUsage.TotalUsage
|
||||
guestFinalCPU := uint64(0)
|
||||
for _, cs := range finishContainersStats {
|
||||
guestFinalCPU += cs.CgroupStats.CPUStats.CPUUsage.TotalUsage
|
||||
}
|
||||
|
||||
var guestMemoryUsage uint64
|
||||
for _, cs := range finishContainersStats {
|
||||
guestMemoryUsage += cs.CgroupStats.MemoryStats.Usage.Usage
|
||||
}
|
||||
|
||||
hostMemoryUsage := finishSandboxStats.CgroupStats.MemoryStats.Usage.Usage
|
||||
|
||||
cpuUsageGuest := float64(guestFinalCPU-guestInitCPU) / deltaTime * 100
|
||||
cpuUsageHost := float64(hostFinalCPU-hostInitCPU) / deltaTime * 100
|
||||
|
||||
return float64(hostMemoryUsage - guestMemoryUsage), float64(cpuUsageHost - cpuUsageGuest)
|
||||
}
|
||||
|
||||
func (s *service) getPodOverhead() (float64, float64, error) {
|
||||
initTime := time.Now().UnixNano()
|
||||
initialSandboxStats, initialContainerStats, err := s.statsSandbox()
|
||||
if err != nil {
|
||||
return 0, 0, err
|
||||
}
|
||||
|
||||
// Wait for 1 second to calculate CPU usage
|
||||
time.Sleep(time.Second * 1)
|
||||
finishtTime := time.Now().UnixNano()
|
||||
deltaTime := float64(finishtTime - initTime)
|
||||
|
||||
finishSandboxStats, finishContainersStats, err := s.statsSandbox()
|
||||
if err != nil {
|
||||
return 0, 0, err
|
||||
}
|
||||
mem, cpu := calcOverhead(initialSandboxStats, finishSandboxStats, initialContainerStats, finishContainersStats, deltaTime)
|
||||
return mem, cpu, nil
|
||||
}
|
||||
|
||||
func (s *service) setPodOverheadMetrics() error {
|
||||
mem, cpu, err := s.getPodOverhead()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
katashimPodOverheadMemory.Set(mem)
|
||||
katashimPodOverheadCPU.Set(cpu)
|
||||
return nil
|
||||
}
|
||||
|
114
src/runtime/containerd-shim-v2/shim_metrics_test.go
Normal file
114
src/runtime/containerd-shim-v2/shim_metrics_test.go
Normal file
@ -0,0 +1,114 @@
|
||||
// Copyright (c) 2020 Ant Financial
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
package containerdshim
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
|
||||
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/vcmock"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func getSandboxCPUFunc(c, m uint64) func() (vc.SandboxStats, error) {
|
||||
return func() (vc.SandboxStats, error) {
|
||||
return vc.SandboxStats{
|
||||
CgroupStats: vc.CgroupStats{
|
||||
CPUStats: vc.CPUStats{
|
||||
CPUUsage: vc.CPUUsage{
|
||||
TotalUsage: c * 1e9,
|
||||
},
|
||||
},
|
||||
MemoryStats: vc.MemoryStats{
|
||||
Usage: vc.MemoryData{
|
||||
Usage: m,
|
||||
},
|
||||
},
|
||||
},
|
||||
Cpus: 20,
|
||||
}, nil
|
||||
}
|
||||
}
|
||||
|
||||
func getStatsContainerCPUFunc(fooCPU, barCPU, fooMem, barMem uint64) func(contID string) (vc.ContainerStats, error) {
|
||||
return func(contID string) (vc.ContainerStats, error) {
|
||||
vCPU := fooCPU
|
||||
vMem := fooMem
|
||||
if contID == "bar" {
|
||||
vCPU = barCPU
|
||||
vMem = barMem
|
||||
}
|
||||
return vc.ContainerStats{
|
||||
CgroupStats: &vc.CgroupStats{
|
||||
CPUStats: vc.CPUStats{
|
||||
CPUUsage: vc.CPUUsage{
|
||||
TotalUsage: vCPU * 1e9,
|
||||
},
|
||||
},
|
||||
MemoryStats: vc.MemoryStats{
|
||||
Usage: vc.MemoryData{
|
||||
Usage: vMem,
|
||||
},
|
||||
},
|
||||
},
|
||||
}, nil
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
func TestStatsSandbox(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
sandbox := &vcmock.Sandbox{
|
||||
MockID: testSandboxID,
|
||||
StatsFunc: getSandboxCPUFunc(1000, 100000),
|
||||
StatsContainerFunc: getStatsContainerCPUFunc(100, 200, 10000, 20000),
|
||||
MockContainers: []*vcmock.Container{
|
||||
&vcmock.Container{
|
||||
MockID: "foo",
|
||||
},
|
||||
&vcmock.Container{
|
||||
MockID: "bar",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
s := &service{
|
||||
id: testSandboxID,
|
||||
sandbox: sandbox,
|
||||
containers: make(map[string]*container),
|
||||
}
|
||||
|
||||
initialSandboxStats, initialContainerStats, err := s.statsSandbox()
|
||||
assert.Nil(err)
|
||||
assert.Equal(uint64(1000*1e9), initialSandboxStats.CgroupStats.CPUStats.CPUUsage.TotalUsage)
|
||||
assert.Equal(2, len(initialContainerStats))
|
||||
assert.Equal(uint64(100*1e9), initialContainerStats[0].CgroupStats.CPUStats.CPUUsage.TotalUsage)
|
||||
assert.Equal(uint64(200*1e9), initialContainerStats[1].CgroupStats.CPUStats.CPUUsage.TotalUsage)
|
||||
assert.Equal(uint64(10000), initialContainerStats[0].CgroupStats.MemoryStats.Usage.Usage)
|
||||
assert.Equal(uint64(20000), initialContainerStats[1].CgroupStats.MemoryStats.Usage.Usage)
|
||||
|
||||
// get the 2nd stats
|
||||
sandbox.StatsFunc = getSandboxCPUFunc(2000, 110000)
|
||||
sandbox.StatsContainerFunc = getStatsContainerCPUFunc(200, 400, 20000, 40000)
|
||||
|
||||
finishSandboxStats, finishContainersStats, err := s.statsSandbox()
|
||||
|
||||
// calc overhead
|
||||
mem, cpu := calcOverhead(initialSandboxStats, finishSandboxStats, initialContainerStats, finishContainersStats, 1e9)
|
||||
|
||||
// 70000 = (host2.cpu - host1.cpu - (delta containers.1.cpu + delta containers.2.cpu)) * 100
|
||||
// = (2000 - 1000 - (200 -100 + 400 - 200)) * 100
|
||||
// = (1000 - 300) * 100
|
||||
// = 70000
|
||||
assert.Equal(float64(70000), cpu)
|
||||
|
||||
// 50000 = 110000 - sum(containers)
|
||||
// = 110000 - (20000 + 40000)
|
||||
// = 50000
|
||||
assert.Equal(float64(50000), mem)
|
||||
}
|
@ -68,6 +68,8 @@ type VCSandbox interface {
|
||||
ID() string
|
||||
SetAnnotations(annotations map[string]string) error
|
||||
|
||||
Stats() (SandboxStats, error)
|
||||
|
||||
Start() error
|
||||
Stop(force bool) error
|
||||
Release() error
|
||||
|
@ -125,6 +125,9 @@ func (s *Sandbox) StatusContainer(contID string) (vc.ContainerStatus, error) {
|
||||
|
||||
// StatsContainer implements the VCSandbox function of the same name.
|
||||
func (s *Sandbox) StatsContainer(contID string) (vc.ContainerStats, error) {
|
||||
if s.StatsContainerFunc != nil {
|
||||
return s.StatsContainerFunc(contID)
|
||||
}
|
||||
return vc.ContainerStats{}, nil
|
||||
}
|
||||
|
||||
@ -232,3 +235,11 @@ func (s *Sandbox) GetAgentMetrics() (string, error) {
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
// Stats implements the VCSandbox function of the same name.
|
||||
func (s *Sandbox) Stats() (vc.SandboxStats, error) {
|
||||
if s.StatsFunc != nil {
|
||||
return s.StatsFunc()
|
||||
}
|
||||
return vc.SandboxStats{}, nil
|
||||
}
|
||||
|
@ -66,6 +66,7 @@ type Sandbox struct {
|
||||
ListRoutesFunc func() ([]*vcTypes.Route, error)
|
||||
UpdateRuntimeMetricsFunc func() error
|
||||
GetAgentMetricsFunc func() (string, error)
|
||||
StatsFunc func() (vc.SandboxStats, error)
|
||||
}
|
||||
|
||||
// Container is a fake Container type used for testing
|
||||
|
Loading…
Reference in New Issue
Block a user