From cd85d4b3fb0a7c07ba0a6a5b511627eb7c1d85e4 Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Thu, 10 Jun 2021 15:23:27 -0400 Subject: [PATCH 1/7] features: add podAndContainerStatsFromCRI to allow users to specify whether the Kubelet should pull pod and container stats strictly from the CRI, rather than a mixture of CRI and cAdvsior Signed-off-by: Peter Hunt --- pkg/features/kube_features.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index 337963f182a..26cf549dbfd 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -791,6 +791,14 @@ const ( // node affinity, selector and tolerations. This is allowed only for suspended jobs // that have never been unsuspended before. JobMutableNodeSchedulingDirectives featuregate.Feature = "JobMutableNodeSchedulingDirectives" + + // owner: @haircommander + // kep: http://kep.k8s.io/2364 + // alpha: v1.23 + // + // Configures the Kubelet to use the CRI to populate pod and container stats, instead of supplimenting with stats from cAdvisor. + // Requires the CRI implementation supports supplying the required stats. + PodAndContainerStatsFromCRI featuregate.Feature = "PodAndContainerStatsFromCRI" ) func init() { @@ -907,6 +915,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS CPUManagerPolicyBetaOptions: {Default: true, PreRelease: featuregate.Beta}, JobMutableNodeSchedulingDirectives: {Default: true, PreRelease: featuregate.Beta}, IdentifyPodOS: {Default: false, PreRelease: featuregate.Alpha}, + PodAndContainerStatsFromCRI: {Default: false, PreRelease: featuregate.Alpha}, // inherited features from generic apiserver, relisted here to get a conflict if it is changed // unintentionally on either side: From 7866287ba1da391bd03a44510f691f3dc6f9e6ce Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Thu, 10 Jun 2021 15:28:06 -0400 Subject: [PATCH 2/7] kubelet stats: wire up podAndContainerStatsFromCRI feature gate though it is currently unused Signed-off-by: Peter Hunt --- pkg/kubelet/kubelet.go | 3 ++- pkg/kubelet/stats/cri_stats_provider.go | 14 ++++++++++++-- pkg/kubelet/stats/cri_stats_provider_test.go | 4 ++++ pkg/kubelet/stats/provider.go | 4 ++-- 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index f174235d262..2b286285411 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -723,7 +723,8 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, kubeDeps.RemoteRuntimeService, kubeDeps.RemoteImageService, hostStatsProvider, - utilfeature.DefaultFeatureGate.Enabled(features.DisableAcceleratorUsageMetrics)) + utilfeature.DefaultFeatureGate.Enabled(features.DisableAcceleratorUsageMetrics), + utilfeature.DefaultFeatureGate.Enabled(features.PodAndContainerStatsFromCRI)) } klet.pleg = pleg.NewGenericPLEG(klet.containerRuntime, plegChannelCapacity, plegRelistPeriod, klet.podCache, clock.RealClock{}) diff --git a/pkg/kubelet/stats/cri_stats_provider.go b/pkg/kubelet/stats/cri_stats_provider.go index e68aa7c42ac..1c237282344 100644 --- a/pkg/kubelet/stats/cri_stats_provider.go +++ b/pkg/kubelet/stats/cri_stats_provider.go @@ -70,6 +70,7 @@ type criStatsProvider struct { cpuUsageCache map[string]*cpuUsageRecord mutex sync.RWMutex disableAcceleratorUsageMetrics bool + podAndContainerStatsFromCRI bool } // newCRIStatsProvider returns a containerStatsProvider implementation that @@ -80,7 +81,8 @@ func newCRIStatsProvider( runtimeService internalapi.RuntimeService, imageService internalapi.ImageManagerService, hostStatsProvider HostStatsProvider, - disableAcceleratorUsageMetrics bool, + disableAcceleratorUsageMetrics, + podAndContainerStatsFromCRI bool, ) containerStatsProvider { return &criStatsProvider{ cadvisor: cadvisor, @@ -90,6 +92,7 @@ func newCRIStatsProvider( hostStatsProvider: hostStatsProvider, cpuUsageCache: make(map[string]*cpuUsageRecord), disableAcceleratorUsageMetrics: disableAcceleratorUsageMetrics, + podAndContainerStatsFromCRI: podAndContainerStatsFromCRI, } } @@ -137,6 +140,10 @@ func (p *criStatsProvider) listPodStats(updateCPUNanoCoreUsage bool) ([]statsapi for _, s := range podSandboxes { podSandboxMap[s.Id] = s } + + if p.podAndContainerStatsFromCRI { + return p.listPodStatsStrictlyFromCRI(updateCPUNanoCoreUsage, containers, podSandboxes) + } // fsIDtoInfo is a map from filesystem id to its stats. This will be used // as a cache to avoid querying cAdvisor for the filesystem stats with the // same filesystem id many times. @@ -156,7 +163,6 @@ func (p *criStatsProvider) listPodStats(updateCPUNanoCoreUsage bool) ([]statsapi for _, c := range containers { containerMap[c.Id] = c } - allInfos, err := getCadvisorContainerInfo(p.cadvisor) if err != nil { return nil, fmt.Errorf("failed to fetch cadvisor stats: %v", err) @@ -218,6 +224,10 @@ func (p *criStatsProvider) listPodStats(updateCPUNanoCoreUsage bool) ([]statsapi return result, nil } +func (p *criStatsProvider) listPodStatsStrictlyFromCRI(updateCPUNanoCoreUsage bool, containers []*runtimeapi.Container, podSandboxes []*runtimeapi.PodSandbox) ([]statsapi.PodStats, error) { + return []statsapi.PodStats{}, nil +} + // ListPodCPUAndMemoryStats returns the CPU and Memory stats of all the pod-managed containers. func (p *criStatsProvider) ListPodCPUAndMemoryStats() ([]statsapi.PodStats, error) { containers, err := p.runtimeService.ListContainers(&runtimeapi.ContainerFilter{}) diff --git a/pkg/kubelet/stats/cri_stats_provider_test.go b/pkg/kubelet/stats/cri_stats_provider_test.go index a5857c1a0a9..52a3ea069a0 100644 --- a/pkg/kubelet/stats/cri_stats_provider_test.go +++ b/pkg/kubelet/stats/cri_stats_provider_test.go @@ -236,6 +236,7 @@ func TestCRIListPodStats(t *testing.T) { fakeImageService, NewFakeHostStatsProviderWithData(fakeStats, fakeOS), false, + false, ) stats, err := provider.ListPodStats() @@ -396,6 +397,7 @@ func TestAcceleratorUsageStatsCanBeDisabled(t *testing.T) { fakeImageService, NewFakeHostStatsProvider(), true, // this is what the test is actually testing + false, ) stats, err := provider.ListPodStats() @@ -541,6 +543,7 @@ func TestCRIListPodCPUAndMemoryStats(t *testing.T) { nil, NewFakeHostStatsProvider(), false, + false, ) stats, err := provider.ListPodCPUAndMemoryStats() @@ -671,6 +674,7 @@ func TestCRIImagesFsStats(t *testing.T) { fakeImageService, NewFakeHostStatsProvider(), false, + false, ) stats, err := provider.ImageFsStats() diff --git a/pkg/kubelet/stats/provider.go b/pkg/kubelet/stats/provider.go index fd3c5dd8248..7c06a550949 100644 --- a/pkg/kubelet/stats/provider.go +++ b/pkg/kubelet/stats/provider.go @@ -42,10 +42,10 @@ func NewCRIStatsProvider( runtimeService internalapi.RuntimeService, imageService internalapi.ImageManagerService, hostStatsProvider HostStatsProvider, - disableAcceleratorUsageMetrics bool, + disableAcceleratorUsageMetrics, podAndContainerStatsFromCRI bool, ) *Provider { return newStatsProvider(cadvisor, podManager, runtimeCache, newCRIStatsProvider(cadvisor, resourceAnalyzer, - runtimeService, imageService, hostStatsProvider, disableAcceleratorUsageMetrics)) + runtimeService, imageService, hostStatsProvider, disableAcceleratorUsageMetrics, podAndContainerStatsFromCRI)) } // NewCadvisorStatsProvider returns a containerStatsProvider that provides both From d2c436700eaeee91b011fca49b956d4bcd45cd6d Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Mon, 21 Jun 2021 15:48:33 -0400 Subject: [PATCH 3/7] kubelet stats: add support for podAndContainerStatsFromCRI This commit adds an initial implementation of translating from the new CRI fields to the /stats/summary PodStats object Signed-off-by: Peter Hunt --- pkg/kubelet/stats/cri_stats_provider.go | 123 +++++++++++++++++++++--- 1 file changed, 112 insertions(+), 11 deletions(-) diff --git a/pkg/kubelet/stats/cri_stats_provider.go b/pkg/kubelet/stats/cri_stats_provider.go index 1c237282344..58368d54234 100644 --- a/pkg/kubelet/stats/cri_stats_provider.go +++ b/pkg/kubelet/stats/cri_stats_provider.go @@ -141,8 +141,15 @@ func (p *criStatsProvider) listPodStats(updateCPUNanoCoreUsage bool) ([]statsapi podSandboxMap[s.Id] = s } + containers = removeTerminatedContainers(containers) + // Creates container map. + containerMap := make(map[string]*runtimeapi.Container) + for _, c := range containers { + containerMap[c.Id] = c + } + if p.podAndContainerStatsFromCRI { - return p.listPodStatsStrictlyFromCRI(updateCPUNanoCoreUsage, containers, podSandboxes) + return p.listPodStatsStrictlyFromCRI(updateCPUNanoCoreUsage, containerMap, podSandboxMap, &rootFsInfo) } // fsIDtoInfo is a map from filesystem id to its stats. This will be used // as a cache to avoid querying cAdvisor for the filesystem stats with the @@ -156,13 +163,6 @@ func (p *criStatsProvider) listPodStats(updateCPUNanoCoreUsage bool) ([]statsapi if err != nil { return nil, fmt.Errorf("failed to list all container stats: %v", err) } - - containers = removeTerminatedContainers(containers) - // Creates container map. - containerMap := make(map[string]*runtimeapi.Container) - for _, c := range containers { - containerMap[c.Id] = c - } allInfos, err := getCadvisorContainerInfo(p.cadvisor) if err != nil { return nil, fmt.Errorf("failed to fetch cadvisor stats: %v", err) @@ -224,8 +224,38 @@ func (p *criStatsProvider) listPodStats(updateCPUNanoCoreUsage bool) ([]statsapi return result, nil } -func (p *criStatsProvider) listPodStatsStrictlyFromCRI(updateCPUNanoCoreUsage bool, containers []*runtimeapi.Container, podSandboxes []*runtimeapi.PodSandbox) ([]statsapi.PodStats, error) { - return []statsapi.PodStats{}, nil +func (p *criStatsProvider) listPodStatsStrictlyFromCRI(updateCPUNanoCoreUsage bool, containerMap map[string]*runtimeapi.Container, podSandboxMap map[string]*runtimeapi.PodSandbox, rootFsInfo *cadvisorapiv2.FsInfo) ([]statsapi.PodStats, error) { + criSandboxStats, err := p.runtimeService.ListPodSandboxStats(&runtimeapi.PodSandboxStatsFilter{}) + if err != nil { + return nil, err + } + + fsIDtoInfo := make(map[runtimeapi.FilesystemIdentifier]*cadvisorapiv2.FsInfo) + summarySandboxStats := make([]statsapi.PodStats, 0, len(podSandboxMap)) + for _, criSandboxStat := range criSandboxStats { + podSandbox, found := podSandboxMap[criSandboxStat.Attributes.Id] + if !found { + continue + } + ps := buildPodStats(podSandbox) + // TODO FIXME(haircommander): resolve timestamp by taking the latest of each that were collected + for _, criContainerStat := range criSandboxStat.Linux.Containers { + container, found := containerMap[criContainerStat.Attributes.Id] + if !found { + continue + } + // Fill available stats for full set of required pod stats + cs := p.makeContainerStats(criContainerStat, container, rootFsInfo, fsIDtoInfo, podSandbox.GetMetadata(), updateCPUNanoCoreUsage) + ps.Containers = append(ps.Containers, *cs) + } + addCRIPodNetworkStats(ps, criSandboxStat) + addCRIPodCPUStats(ps, criSandboxStat) + addCRIPodMemoryStats(ps, criSandboxStat) + addCRIPodProcessStats(ps, criSandboxStat) + p.makePodStorageStats(ps, rootFsInfo) + summarySandboxStats = append(summarySandboxStats, *ps) + } + return summarySandboxStats, nil } // ListPodCPUAndMemoryStats returns the CPU and Memory stats of all the pod-managed containers. @@ -261,6 +291,26 @@ func (p *criStatsProvider) ListPodCPUAndMemoryStats() ([]statsapi.PodStats, erro containerMap[c.Id] = c } + result := make([]statsapi.PodStats, 0, len(sandboxIDToPodStats)) + + if p.podAndContainerStatsFromCRI { + criSandboxStats, err := p.runtimeService.ListPodSandboxStats(&runtimeapi.PodSandboxStatsFilter{}) + if err != nil { + return nil, err + } + for _, criSandboxStat := range criSandboxStats { + podSandbox, found := podSandboxMap[criSandboxStat.Attributes.Id] + if !found { + continue + } + ps := buildPodStats(podSandbox) + addCRIPodCPUStats(ps, criSandboxStat) + addCRIPodMemoryStats(ps, criSandboxStat) + result = append(result, *ps) + } + return result, err + } + allInfos, err := getCadvisorContainerInfo(p.cadvisor) if err != nil { return nil, fmt.Errorf("failed to fetch cadvisor stats: %v", err) @@ -305,7 +355,6 @@ func (p *criStatsProvider) ListPodCPUAndMemoryStats() ([]statsapi.PodStats, erro // cleanup outdated caches. p.cleanupOutdatedCaches() - result := make([]statsapi.PodStats, 0, len(sandboxIDToPodStats)) for _, s := range sandboxIDToPodStats { result = append(result, *s) } @@ -857,3 +906,55 @@ func extractIDFromCgroupPath(cgroupPath string) string { } return id } + +func addCRIPodNetworkStats(ps *statsapi.PodStats, criPodStat *runtimeapi.PodSandboxStats) { + criNetwork := criPodStat.Linux.Network + iStats := statsapi.NetworkStats{ + Time: metav1.NewTime(time.Unix(0, criNetwork.Timestamp)), + InterfaceStats: criInterfaceToSummary(criNetwork.DefaultInterface), + Interfaces: make([]statsapi.InterfaceStats, 0, len(criNetwork.Interfaces)), + } + for _, iface := range criNetwork.Interfaces { + iStats.Interfaces = append(iStats.Interfaces, criInterfaceToSummary(iface)) + } + ps.Network = &iStats +} + +func criInterfaceToSummary(criIface *runtimeapi.NetworkInterfaceUsage) statsapi.InterfaceStats { + return statsapi.InterfaceStats{ + Name: criIface.Name, + RxBytes: &criIface.RxBytes.Value, + RxErrors: &criIface.RxErrors.Value, + TxBytes: &criIface.TxBytes.Value, + TxErrors: &criIface.TxErrors.Value, + } +} + +func addCRIPodCPUStats(ps *statsapi.PodStats, criPodStat *runtimeapi.PodSandboxStats) { + criCPU := criPodStat.Linux.Cpu + ps.CPU = &statsapi.CPUStats{ + Time: metav1.NewTime(time.Unix(0, criCPU.Timestamp)), + UsageNanoCores: &criCPU.UsageNanoCores.Value, + UsageCoreNanoSeconds: &criCPU.UsageCoreNanoSeconds.Value, + } +} + +func addCRIPodMemoryStats(ps *statsapi.PodStats, criPodStat *runtimeapi.PodSandboxStats) { + criMemory := criPodStat.Linux.Memory + ps.Memory = &statsapi.MemoryStats{ + Time: metav1.NewTime(time.Unix(0, criMemory.Timestamp)), + AvailableBytes: &criMemory.AvailableBytes.Value, + UsageBytes: &criMemory.UsageBytes.Value, + WorkingSetBytes: &criMemory.WorkingSetBytes.Value, + RSSBytes: &criMemory.RssBytes.Value, + PageFaults: &criMemory.PageFaults.Value, + MajorPageFaults: &criMemory.MajorPageFaults.Value, + } +} + +func addCRIPodProcessStats(ps *statsapi.PodStats, criPodStat *runtimeapi.PodSandboxStats) { + criProcess := criPodStat.Linux.Process + ps.ProcessStats = &statsapi.ProcessStats{ + ProcessCount: &criProcess.ProcessCount.Value, + } +} From ffdb4b9c4aa2b23b64dc2a809de330f1bec44f90 Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Wed, 23 Jun 2021 14:04:00 -0400 Subject: [PATCH 4/7] kubelet: slightly move around some cri stats functions to reduce duplication and add clarity Signed-off-by: Peter Hunt --- pkg/kubelet/stats/cri_stats_provider.go | 93 +++++++++++-------------- 1 file changed, 42 insertions(+), 51 deletions(-) diff --git a/pkg/kubelet/stats/cri_stats_provider.go b/pkg/kubelet/stats/cri_stats_provider.go index 58368d54234..e666b92b69a 100644 --- a/pkg/kubelet/stats/cri_stats_provider.go +++ b/pkg/kubelet/stats/cri_stats_provider.go @@ -125,32 +125,18 @@ func (p *criStatsProvider) listPodStats(updateCPUNanoCoreUsage bool) ([]statsapi return nil, fmt.Errorf("failed to get rootFs info: %v", err) } - containers, err := p.runtimeService.ListContainers(&runtimeapi.ContainerFilter{}) + containerMap, podSandboxMap, err := p.getPodAndContainerMaps() if err != nil { - return nil, fmt.Errorf("failed to list all containers: %v", err) - } - - // Creates pod sandbox map. - podSandboxMap := make(map[string]*runtimeapi.PodSandbox) - podSandboxes, err := p.runtimeService.ListPodSandbox(&runtimeapi.PodSandboxFilter{}) - if err != nil { - return nil, fmt.Errorf("failed to list all pod sandboxes: %v", err) - } - podSandboxes = removeTerminatedPods(podSandboxes) - for _, s := range podSandboxes { - podSandboxMap[s.Id] = s - } - - containers = removeTerminatedContainers(containers) - // Creates container map. - containerMap := make(map[string]*runtimeapi.Container) - for _, c := range containers { - containerMap[c.Id] = c + return nil, fmt.Errorf("failed to get pod or container map: %v", err) } if p.podAndContainerStatsFromCRI { return p.listPodStatsStrictlyFromCRI(updateCPUNanoCoreUsage, containerMap, podSandboxMap, &rootFsInfo) } + return p.listPodStatsPartiallyFromCRI(updateCPUNanoCoreUsage, containerMap, podSandboxMap, &rootFsInfo) +} + +func (p *criStatsProvider) listPodStatsPartiallyFromCRI(updateCPUNanoCoreUsage bool, containerMap map[string]*runtimeapi.Container, podSandboxMap map[string]*runtimeapi.PodSandbox, rootFsInfo *cadvisorapiv2.FsInfo) ([]statsapi.PodStats, error) { // fsIDtoInfo is a map from filesystem id to its stats. This will be used // as a cache to avoid querying cAdvisor for the filesystem stats with the // same filesystem id many times. @@ -198,7 +184,7 @@ func (p *criStatsProvider) listPodStats(updateCPUNanoCoreUsage bool) ([]statsapi } // Fill available stats for full set of required pod stats - cs := p.makeContainerStats(stats, container, &rootFsInfo, fsIDtoInfo, podSandbox.GetMetadata(), updateCPUNanoCoreUsage) + cs := p.makeContainerStats(stats, container, rootFsInfo, fsIDtoInfo, podSandbox.GetMetadata(), updateCPUNanoCoreUsage) p.addPodNetworkStats(ps, podSandboxID, caInfos, cs, containerNetworkStats[podSandboxID]) p.addPodCPUMemoryStats(ps, types.UID(podSandbox.Metadata.Uid), allInfos, cs) p.addProcessStats(ps, types.UID(podSandbox.Metadata.Uid), allInfos, cs) @@ -218,7 +204,7 @@ func (p *criStatsProvider) listPodStats(updateCPUNanoCoreUsage bool) ([]statsapi result := make([]statsapi.PodStats, 0, len(sandboxIDToPodStats)) for _, s := range sandboxIDToPodStats { - p.makePodStorageStats(s, &rootFsInfo) + p.makePodStorageStats(s, rootFsInfo) result = append(result, *s) } return result, nil @@ -238,7 +224,6 @@ func (p *criStatsProvider) listPodStatsStrictlyFromCRI(updateCPUNanoCoreUsage bo continue } ps := buildPodStats(podSandbox) - // TODO FIXME(haircommander): resolve timestamp by taking the latest of each that were collected for _, criContainerStat := range criSandboxStat.Linux.Containers { container, found := containerMap[criContainerStat.Attributes.Id] if !found { @@ -260,39 +245,14 @@ func (p *criStatsProvider) listPodStatsStrictlyFromCRI(updateCPUNanoCoreUsage bo // ListPodCPUAndMemoryStats returns the CPU and Memory stats of all the pod-managed containers. func (p *criStatsProvider) ListPodCPUAndMemoryStats() ([]statsapi.PodStats, error) { - containers, err := p.runtimeService.ListContainers(&runtimeapi.ContainerFilter{}) - if err != nil { - return nil, fmt.Errorf("failed to list all containers: %v", err) - } - - // Creates pod sandbox map. - podSandboxMap := make(map[string]*runtimeapi.PodSandbox) - podSandboxes, err := p.runtimeService.ListPodSandbox(&runtimeapi.PodSandboxFilter{}) - if err != nil { - return nil, fmt.Errorf("failed to list all pod sandboxes: %v", err) - } - podSandboxes = removeTerminatedPods(podSandboxes) - for _, s := range podSandboxes { - podSandboxMap[s.Id] = s - } - // sandboxIDToPodStats is a temporary map from sandbox ID to its pod stats. sandboxIDToPodStats := make(map[string]*statsapi.PodStats) - - resp, err := p.runtimeService.ListContainerStats(&runtimeapi.ContainerStatsFilter{}) + containerMap, podSandboxMap, err := p.getPodAndContainerMaps() if err != nil { - return nil, fmt.Errorf("failed to list all container stats: %v", err) + return nil, fmt.Errorf("failed to get pod or container map: %v", err) } - containers = removeTerminatedContainers(containers) - // Creates container map. - containerMap := make(map[string]*runtimeapi.Container) - for _, c := range containers { - containerMap[c.Id] = c - } - - result := make([]statsapi.PodStats, 0, len(sandboxIDToPodStats)) - + result := make([]statsapi.PodStats, 0, len(podSandboxMap)) if p.podAndContainerStatsFromCRI { criSandboxStats, err := p.runtimeService.ListPodSandboxStats(&runtimeapi.PodSandboxStatsFilter{}) if err != nil { @@ -311,6 +271,11 @@ func (p *criStatsProvider) ListPodCPUAndMemoryStats() ([]statsapi.PodStats, erro return result, err } + resp, err := p.runtimeService.ListContainerStats(&runtimeapi.ContainerStatsFilter{}) + if err != nil { + return nil, fmt.Errorf("failed to list all container stats: %v", err) + } + allInfos, err := getCadvisorContainerInfo(p.cadvisor) if err != nil { return nil, fmt.Errorf("failed to fetch cadvisor stats: %v", err) @@ -361,6 +326,32 @@ func (p *criStatsProvider) ListPodCPUAndMemoryStats() ([]statsapi.PodStats, erro return result, nil } +func (p *criStatsProvider) getPodAndContainerMaps() (map[string]*runtimeapi.Container, map[string]*runtimeapi.PodSandbox, error) { + containers, err := p.runtimeService.ListContainers(&runtimeapi.ContainerFilter{}) + if err != nil { + return nil, nil, fmt.Errorf("failed to list all containers: %v", err) + } + + // Creates pod sandbox map between the pod sandbox ID and the PodSandbox object. + podSandboxMap := make(map[string]*runtimeapi.PodSandbox) + podSandboxes, err := p.runtimeService.ListPodSandbox(&runtimeapi.PodSandboxFilter{}) + if err != nil { + return nil, nil, fmt.Errorf("failed to list all pod sandboxes: %v", err) + } + podSandboxes = removeTerminatedPods(podSandboxes) + for _, s := range podSandboxes { + podSandboxMap[s.Id] = s + } + + containers = removeTerminatedContainers(containers) + // Creates container map between the container ID and the Container object. + containerMap := make(map[string]*runtimeapi.Container) + for _, c := range containers { + containerMap[c.Id] = c + } + return containerMap, podSandboxMap, nil +} + // ImageFsStats returns the stats of the image filesystem. func (p *criStatsProvider) ImageFsStats() (*statsapi.FsStats, error) { resp, err := p.imageService.ImageFsInfo() From 85e8a4bf73fd343bc890d5cd3de076e379d88aae Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Wed, 23 Jun 2021 15:03:17 -0400 Subject: [PATCH 5/7] kubelet stats: use UsageNanoCores if available Signed-off-by: Peter Hunt --- pkg/kubelet/stats/cri_stats_provider.go | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/pkg/kubelet/stats/cri_stats_provider.go b/pkg/kubelet/stats/cri_stats_provider.go index e666b92b69a..79438e62c92 100644 --- a/pkg/kubelet/stats/cri_stats_provider.go +++ b/pkg/kubelet/stats/cri_stats_provider.go @@ -678,12 +678,18 @@ func (p *criStatsProvider) makeContainerCPUAndMemoryStats( return result } -// getContainerUsageNanoCores gets the cached usageNanoCores. +// getContainerUsageNanoCores first attempts to get the usage nano cores from the stats reported +// by the CRI. If it is unable to, it gets the information from the cache instead. func (p *criStatsProvider) getContainerUsageNanoCores(stats *runtimeapi.ContainerStats) *uint64 { if stats == nil || stats.Attributes == nil { return nil } + // Bypass the cache if the CRI implementation specified the UsageNanoCores. + if stats.Cpu.UsageNanoCores != nil { + return &stats.Cpu.UsageNanoCores.Value + } + p.mutex.RLock() defer p.mutex.RUnlock() @@ -696,11 +702,19 @@ func (p *criStatsProvider) getContainerUsageNanoCores(stats *runtimeapi.Containe return &latestUsage } -// getContainerUsageNanoCores computes usageNanoCores based on the given and -// the cached usageCoreNanoSeconds, updates the cache with the computed -// usageNanoCores, and returns the usageNanoCores. +// getAndUpdateContainerUsageNanoCores first attempts to get the usage nano cores from the stats reported +// by the CRI. If it is unable to, it computes usageNanoCores based on the given and the cached usageCoreNanoSeconds, +// updates the cache with the computed usageNanoCores, and returns the usageNanoCores. func (p *criStatsProvider) getAndUpdateContainerUsageNanoCores(stats *runtimeapi.ContainerStats) *uint64 { - if stats == nil || stats.Attributes == nil || stats.Cpu == nil || stats.Cpu.UsageCoreNanoSeconds == nil { + if stats == nil || stats.Attributes == nil || stats.Cpu == nil { + return nil + } + // Bypass the cache if the CRI implementation specified the UsageNanoCores. + if stats.Cpu.UsageNanoCores != nil { + return &stats.Cpu.UsageNanoCores.Value + } + // If there is no UsageNanoCores, nor UsageCoreNanoSeconds, there is no information to use + if stats.Cpu.UsageCoreNanoSeconds == nil { return nil } id := stats.Attributes.Id From feb5f5e0ed30e9a51fffea00e600eafb28f6b005 Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Fri, 2 Jul 2021 16:31:16 -0400 Subject: [PATCH 6/7] kubelet: use helper function to check for nil fields in sandbox stats Signed-off-by: Peter Hunt --- pkg/kubelet/stats/cri_stats_provider.go | 52 ++++++++++++++++++------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/pkg/kubelet/stats/cri_stats_provider.go b/pkg/kubelet/stats/cri_stats_provider.go index 79438e62c92..21f8643f3cb 100644 --- a/pkg/kubelet/stats/cri_stats_provider.go +++ b/pkg/kubelet/stats/cri_stats_provider.go @@ -219,6 +219,10 @@ func (p *criStatsProvider) listPodStatsStrictlyFromCRI(updateCPUNanoCoreUsage bo fsIDtoInfo := make(map[runtimeapi.FilesystemIdentifier]*cadvisorapiv2.FsInfo) summarySandboxStats := make([]statsapi.PodStats, 0, len(podSandboxMap)) for _, criSandboxStat := range criSandboxStats { + if criSandboxStat == nil || criSandboxStat.Attributes == nil { + klog.V(5).InfoS("Unable to find CRI stats for sandbox") + continue + } podSandbox, found := podSandboxMap[criSandboxStat.Attributes.Id] if !found { continue @@ -686,7 +690,7 @@ func (p *criStatsProvider) getContainerUsageNanoCores(stats *runtimeapi.Containe } // Bypass the cache if the CRI implementation specified the UsageNanoCores. - if stats.Cpu.UsageNanoCores != nil { + if stats.Cpu != nil && stats.Cpu.UsageNanoCores != nil { return &stats.Cpu.UsageNanoCores.Value } @@ -913,6 +917,9 @@ func extractIDFromCgroupPath(cgroupPath string) string { } func addCRIPodNetworkStats(ps *statsapi.PodStats, criPodStat *runtimeapi.PodSandboxStats) { + if criPodStat == nil || criPodStat.Linux == nil || criPodStat.Linux.Network == nil { + return + } criNetwork := criPodStat.Linux.Network iStats := statsapi.NetworkStats{ Time: metav1.NewTime(time.Unix(0, criNetwork.Timestamp)), @@ -928,38 +935,53 @@ func addCRIPodNetworkStats(ps *statsapi.PodStats, criPodStat *runtimeapi.PodSand func criInterfaceToSummary(criIface *runtimeapi.NetworkInterfaceUsage) statsapi.InterfaceStats { return statsapi.InterfaceStats{ Name: criIface.Name, - RxBytes: &criIface.RxBytes.Value, - RxErrors: &criIface.RxErrors.Value, - TxBytes: &criIface.TxBytes.Value, - TxErrors: &criIface.TxErrors.Value, + RxBytes: valueOfUInt64Value(criIface.RxBytes), + RxErrors: valueOfUInt64Value(criIface.RxErrors), + TxBytes: valueOfUInt64Value(criIface.TxBytes), + TxErrors: valueOfUInt64Value(criIface.TxErrors), } } func addCRIPodCPUStats(ps *statsapi.PodStats, criPodStat *runtimeapi.PodSandboxStats) { + if criPodStat == nil || criPodStat.Linux == nil || criPodStat.Linux.Cpu == nil { + return + } criCPU := criPodStat.Linux.Cpu ps.CPU = &statsapi.CPUStats{ Time: metav1.NewTime(time.Unix(0, criCPU.Timestamp)), - UsageNanoCores: &criCPU.UsageNanoCores.Value, - UsageCoreNanoSeconds: &criCPU.UsageCoreNanoSeconds.Value, + UsageNanoCores: valueOfUInt64Value(criCPU.UsageNanoCores), + UsageCoreNanoSeconds: valueOfUInt64Value(criCPU.UsageCoreNanoSeconds), } } func addCRIPodMemoryStats(ps *statsapi.PodStats, criPodStat *runtimeapi.PodSandboxStats) { + if criPodStat == nil || criPodStat.Linux == nil || criPodStat.Linux.Memory == nil { + return + } criMemory := criPodStat.Linux.Memory ps.Memory = &statsapi.MemoryStats{ Time: metav1.NewTime(time.Unix(0, criMemory.Timestamp)), - AvailableBytes: &criMemory.AvailableBytes.Value, - UsageBytes: &criMemory.UsageBytes.Value, - WorkingSetBytes: &criMemory.WorkingSetBytes.Value, - RSSBytes: &criMemory.RssBytes.Value, - PageFaults: &criMemory.PageFaults.Value, - MajorPageFaults: &criMemory.MajorPageFaults.Value, + AvailableBytes: valueOfUInt64Value(criMemory.AvailableBytes), + UsageBytes: valueOfUInt64Value(criMemory.UsageBytes), + WorkingSetBytes: valueOfUInt64Value(criMemory.WorkingSetBytes), + RSSBytes: valueOfUInt64Value(criMemory.RssBytes), + PageFaults: valueOfUInt64Value(criMemory.PageFaults), + MajorPageFaults: valueOfUInt64Value(criMemory.MajorPageFaults), } } func addCRIPodProcessStats(ps *statsapi.PodStats, criPodStat *runtimeapi.PodSandboxStats) { - criProcess := criPodStat.Linux.Process + if criPodStat == nil || criPodStat.Linux == nil || criPodStat.Linux.Process == nil { + return + } ps.ProcessStats = &statsapi.ProcessStats{ - ProcessCount: &criProcess.ProcessCount.Value, + ProcessCount: valueOfUInt64Value(criPodStat.Linux.Process.ProcessCount), } } + +func valueOfUInt64Value(value *runtimeapi.UInt64Value) *uint64 { + if value == nil { + return nil + } + return &value.Value +} From 6b3f8e56622c4d6b1c2ad028b7f8fad5ef2a2e80 Mon Sep 17 00:00:00 2001 From: Peter Hunt Date: Mon, 11 Oct 2021 19:37:48 -0400 Subject: [PATCH 7/7] kubelet: fallback to partial CRI stats if full fails This is partially to allow the kube alpha tests to pass until CRI implementations have support, but also to handle this error situation a bit more elegantly Signed-off-by: Peter Hunt --- pkg/kubelet/stats/cri_stats_provider.go | 49 ++++++++++++++++++------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/pkg/kubelet/stats/cri_stats_provider.go b/pkg/kubelet/stats/cri_stats_provider.go index 21f8643f3cb..50f182e73b9 100644 --- a/pkg/kubelet/stats/cri_stats_provider.go +++ b/pkg/kubelet/stats/cri_stats_provider.go @@ -27,6 +27,8 @@ import ( cadvisorfs "github.com/google/cadvisor/fs" cadvisorapiv2 "github.com/google/cadvisor/info/v2" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" internalapi "k8s.io/cri-api/pkg/apis" @@ -131,7 +133,18 @@ func (p *criStatsProvider) listPodStats(updateCPUNanoCoreUsage bool) ([]statsapi } if p.podAndContainerStatsFromCRI { - return p.listPodStatsStrictlyFromCRI(updateCPUNanoCoreUsage, containerMap, podSandboxMap, &rootFsInfo) + _, err := p.listPodStatsStrictlyFromCRI(updateCPUNanoCoreUsage, containerMap, podSandboxMap, &rootFsInfo) + if err != nil { + s, ok := status.FromError(err) + // Legitimate failure, rather than the CRI implementation does not support ListPodSandboxStats. + if !ok || s.Code() != codes.Unimplemented { + return nil, err + } + // CRI implementation doesn't support ListPodSandboxStats, warn and fallback. + klog.V(5).ErrorS(err, + "CRI implementation must be updated to support ListPodSandboxStats if PodAndContainerStatsFromCRI feature gate is enabled. Falling back to populating with cAdvisor; this call will fail in the future.", + ) + } } return p.listPodStatsPartiallyFromCRI(updateCPUNanoCoreUsage, containerMap, podSandboxMap, &rootFsInfo) } @@ -259,20 +272,30 @@ func (p *criStatsProvider) ListPodCPUAndMemoryStats() ([]statsapi.PodStats, erro result := make([]statsapi.PodStats, 0, len(podSandboxMap)) if p.podAndContainerStatsFromCRI { criSandboxStats, err := p.runtimeService.ListPodSandboxStats(&runtimeapi.PodSandboxStatsFilter{}) - if err != nil { + // Call succeeded + if err == nil { + for _, criSandboxStat := range criSandboxStats { + podSandbox, found := podSandboxMap[criSandboxStat.Attributes.Id] + if !found { + continue + } + ps := buildPodStats(podSandbox) + addCRIPodCPUStats(ps, criSandboxStat) + addCRIPodMemoryStats(ps, criSandboxStat) + result = append(result, *ps) + } + return result, err + } + // Call failed, why? + s, ok := status.FromError(err) + // Legitimate failure, rather than the CRI implementation does not support ListPodSandboxStats. + if !ok || s.Code() != codes.Unimplemented { return nil, err } - for _, criSandboxStat := range criSandboxStats { - podSandbox, found := podSandboxMap[criSandboxStat.Attributes.Id] - if !found { - continue - } - ps := buildPodStats(podSandbox) - addCRIPodCPUStats(ps, criSandboxStat) - addCRIPodMemoryStats(ps, criSandboxStat) - result = append(result, *ps) - } - return result, err + // CRI implementation doesn't support ListPodSandboxStats, warn and fallback. + klog.ErrorS(err, + "CRI implementation must be updated to support ListPodSandboxStats if PodAndContainerStatsFromCRI feature gate is enabled. Falling back to populating with cAdvisor; this call will fail in the future.", + ) } resp, err := p.runtimeService.ListContainerStats(&runtimeapi.ContainerStatsFilter{})