mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-25 04:33:26 +00:00
Update running_pod_count and running_container_count metric
As already mentioned in this issue https://github.com/kubernetes/kubernetes/issues/79286, some metrics like "running_pod_count" and "running_container_count" use non-standard prometheus metrics, this change converts them to be standard prometheus gauges Minor refactor in kubelet/pleg/generic.go and added some test for running container and running pod metrics Fixed issues related to github CI pipeline failure * Updated bazel for new deps * Add comment for exported metrics variables, RunningContainerCount and RunningPodCount * Specify keys explicitly in Gauge metric instantiation Fix go lint errors Replace "+=1" with "++", as reported by go lint Set container state as a label for the metrics "running_container_count" As per the metrics name "running_container_count" it should "ideally" be showing the number of containers in "running" state, but it was showing all the container count, irrespective of the state it is in. This commit adds a new label "container_state" to the metrics "running_container_count", which doesn't change the base metrics but adds the option to query the metrics with "container_state" such as "running"/"unknown"/... remove unused methods reported by staticcheck Remove variables while instantiating gauge(vec) which are default set to nil Convert kubelet metrics(running_pod_count and running_container_count) to standard gauges and added label to running_container_count metrics. Currently kubelet metrics(running_pod_count and running_container_count) use non-standard prometheus collectors, this change converts them to standard prometheus gauges. Also this adds a new label(container_state) to running_container_count which does a breakdown of containers tracked by kubelet based on the containers' state(running/unknown/created/exited). Set stability explicitly for running_pod_count and running_container_count and reformat test register metrics explicitly in test, so that they don't become no-op
This commit is contained in:
parent
06dc8cf4cb
commit
c02d49d775
@ -461,6 +461,26 @@ var (
|
||||
},
|
||||
[]string{"runtime_handler"},
|
||||
)
|
||||
|
||||
// RunningPodCount is a gauge that tracks the number of Pods currently running
|
||||
RunningPodCount = metrics.NewGauge(
|
||||
&metrics.GaugeOpts{
|
||||
Subsystem: KubeletSubsystem,
|
||||
Name: "running_pod_count",
|
||||
Help: "Number of pods currently running",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
)
|
||||
// RunningContainerCount is a gauge vector that tracks the number of containers, partitioned by the "container_state" label
|
||||
RunningContainerCount = metrics.NewGaugeVec(
|
||||
&metrics.GaugeOpts{
|
||||
Subsystem: KubeletSubsystem,
|
||||
Name: "running_container_count",
|
||||
Help: "Number of containers currently running",
|
||||
StabilityLevel: metrics.ALPHA,
|
||||
},
|
||||
[]string{"container_state"},
|
||||
)
|
||||
)
|
||||
|
||||
var registerMetrics sync.Once
|
||||
@ -475,7 +495,6 @@ func Register(containerCache kubecontainer.RuntimeCache, collectors ...prometheu
|
||||
legacyregistry.MustRegister(CgroupManagerDuration)
|
||||
legacyregistry.MustRegister(PodWorkerStartDuration)
|
||||
legacyregistry.MustRegister(ContainersPerPodCount)
|
||||
legacyregistry.RawMustRegister(newPodAndContainerCollector(containerCache))
|
||||
legacyregistry.MustRegister(PLEGRelistDuration)
|
||||
legacyregistry.MustRegister(PLEGDiscardEvents)
|
||||
legacyregistry.MustRegister(PLEGRelistInterval)
|
||||
@ -498,6 +517,8 @@ func Register(containerCache kubecontainer.RuntimeCache, collectors ...prometheu
|
||||
legacyregistry.MustRegister(DeprecatedEvictionStatsAge)
|
||||
legacyregistry.MustRegister(DeprecatedDevicePluginRegistrationCount)
|
||||
legacyregistry.MustRegister(DeprecatedDevicePluginAllocationLatency)
|
||||
legacyregistry.MustRegister(RunningContainerCount)
|
||||
legacyregistry.MustRegister(RunningPodCount)
|
||||
if utilfeature.DefaultFeatureGate.Enabled(features.DynamicKubeletConfig) {
|
||||
legacyregistry.MustRegister(AssignedConfig)
|
||||
legacyregistry.MustRegister(ActiveConfig)
|
||||
@ -520,60 +541,6 @@ func SinceInSeconds(start time.Time) float64 {
|
||||
return time.Since(start).Seconds()
|
||||
}
|
||||
|
||||
func newPodAndContainerCollector(containerCache kubecontainer.RuntimeCache) *podAndContainerCollector {
|
||||
return &podAndContainerCollector{
|
||||
containerCache: containerCache,
|
||||
}
|
||||
}
|
||||
|
||||
// Custom collector for current pod and container counts.
|
||||
type podAndContainerCollector struct {
|
||||
// Cache for accessing information about running containers.
|
||||
containerCache kubecontainer.RuntimeCache
|
||||
}
|
||||
|
||||
// TODO(vmarmol): Split by source?
|
||||
var (
|
||||
runningPodCountDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName("", KubeletSubsystem, "running_pod_count"),
|
||||
"Number of pods currently running",
|
||||
nil, nil)
|
||||
runningContainerCountDesc = prometheus.NewDesc(
|
||||
prometheus.BuildFQName("", KubeletSubsystem, "running_container_count"),
|
||||
"Number of containers currently running",
|
||||
nil, nil)
|
||||
)
|
||||
|
||||
// Describe implements Prometheus' Describe method from the Collector interface. It sends all
|
||||
// available descriptions to the provided channel and retunrs once the last description has been sent.
|
||||
func (pc *podAndContainerCollector) Describe(ch chan<- *prometheus.Desc) {
|
||||
ch <- runningPodCountDesc
|
||||
ch <- runningContainerCountDesc
|
||||
}
|
||||
|
||||
// Collect implements Prometheus' Collect method from the Collector interface. It's called by the Prometheus
|
||||
// registry when collecting metrics.
|
||||
func (pc *podAndContainerCollector) Collect(ch chan<- prometheus.Metric) {
|
||||
runningPods, err := pc.containerCache.GetPods()
|
||||
if err != nil {
|
||||
klog.Warningf("Failed to get running container information while collecting metrics: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
runningContainers := 0
|
||||
for _, p := range runningPods {
|
||||
runningContainers += len(p.Containers)
|
||||
}
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
runningPodCountDesc,
|
||||
prometheus.GaugeValue,
|
||||
float64(len(runningPods)))
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
runningContainerCountDesc,
|
||||
prometheus.GaugeValue,
|
||||
float64(runningContainers))
|
||||
}
|
||||
|
||||
const configMapAPIPathFmt = "/api/v1/namespaces/%s/configmaps/%s"
|
||||
|
||||
func configLabels(source *corev1.NodeConfigSource) (map[string]string, error) {
|
||||
|
@ -33,9 +33,11 @@ go_test(
|
||||
deps = [
|
||||
"//pkg/kubelet/container:go_default_library",
|
||||
"//pkg/kubelet/container/testing:go_default_library",
|
||||
"//pkg/kubelet/metrics:go_default_library",
|
||||
"//staging/src/k8s.io/apimachinery/pkg/types:go_default_library",
|
||||
"//staging/src/k8s.io/apimachinery/pkg/util/clock:go_default_library",
|
||||
"//staging/src/k8s.io/apimachinery/pkg/util/diff:go_default_library",
|
||||
"//vendor/github.com/prometheus/client_model/go:go_default_library",
|
||||
"//vendor/github.com/stretchr/testify/assert:go_default_library",
|
||||
],
|
||||
)
|
||||
|
@ -209,6 +209,8 @@ func (g *GenericPLEG) relist() {
|
||||
g.updateRelistTime(timestamp)
|
||||
|
||||
pods := kubecontainer.Pods(podList)
|
||||
// update running pod and container count
|
||||
updateRunningPodAndContainerMetrics(pods)
|
||||
g.podRecords.setCurrent(pods)
|
||||
|
||||
// Compare the old and the current pods, and generate events.
|
||||
@ -431,6 +433,24 @@ func getContainerState(pod *kubecontainer.Pod, cid *kubecontainer.ContainerID) p
|
||||
return state
|
||||
}
|
||||
|
||||
func updateRunningPodAndContainerMetrics(pods []*kubecontainer.Pod) {
|
||||
// Set the number of running pods in the parameter
|
||||
metrics.RunningPodCount.Set(float64(len(pods)))
|
||||
// intermediate map to store the count of each "container_state"
|
||||
containerStateCount := make(map[string]int)
|
||||
|
||||
for _, pod := range pods {
|
||||
containers := pod.Containers
|
||||
for _, container := range containers {
|
||||
// update the corresponding "container_state" in map to set value for the gaugeVec metrics
|
||||
containerStateCount[string(container.State)]++
|
||||
}
|
||||
}
|
||||
for key, value := range containerStateCount {
|
||||
metrics.RunningContainerCount.WithLabelValues(key).Set(float64(value))
|
||||
}
|
||||
}
|
||||
|
||||
func (pr podRecords) getOld(id types.UID) *kubecontainer.Pod {
|
||||
r, ok := pr[id]
|
||||
if !ok {
|
||||
|
@ -24,12 +24,14 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
dto "github.com/prometheus/client_model/go"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
"k8s.io/apimachinery/pkg/util/clock"
|
||||
"k8s.io/apimachinery/pkg/util/diff"
|
||||
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
||||
containertest "k8s.io/kubernetes/pkg/kubelet/container/testing"
|
||||
"k8s.io/kubernetes/pkg/kubelet/metrics"
|
||||
)
|
||||
|
||||
const (
|
||||
@ -643,3 +645,49 @@ func TestRelistIPChange(t *testing.T) {
|
||||
assert.Exactly(t, []*PodLifecycleEvent{event}, actualEvents)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunningPodAndContainerCount(t *testing.T) {
|
||||
fakeRuntime := &containertest.FakeRuntime{}
|
||||
runtimeCache, _ := kubecontainer.NewRuntimeCache(fakeRuntime)
|
||||
metrics.Register(runtimeCache)
|
||||
testPleg := newTestGenericPLEG()
|
||||
pleg, runtime := testPleg.pleg, testPleg.runtime
|
||||
|
||||
runtime.AllPodList = []*containertest.FakePod{
|
||||
{Pod: &kubecontainer.Pod{
|
||||
ID: "1234",
|
||||
Containers: []*kubecontainer.Container{
|
||||
createTestContainer("c1", kubecontainer.ContainerStateRunning),
|
||||
createTestContainer("c2", kubecontainer.ContainerStateUnknown),
|
||||
createTestContainer("c3", kubecontainer.ContainerStateUnknown),
|
||||
},
|
||||
}},
|
||||
{Pod: &kubecontainer.Pod{
|
||||
ID: "4567",
|
||||
Containers: []*kubecontainer.Container{
|
||||
createTestContainer("c1", kubecontainer.ContainerStateExited),
|
||||
},
|
||||
}},
|
||||
}
|
||||
|
||||
pleg.relist()
|
||||
|
||||
// assert for container count with label "running"
|
||||
actualMetricRunningContainerCount := &dto.Metric{}
|
||||
expectedMetricRunningContainerCount := float64(1)
|
||||
metrics.RunningContainerCount.WithLabelValues(string(kubecontainer.ContainerStateRunning)).Write(actualMetricRunningContainerCount)
|
||||
assert.Equal(t, expectedMetricRunningContainerCount, actualMetricRunningContainerCount.GetGauge().GetValue())
|
||||
|
||||
// assert for container count with label "unknown"
|
||||
actualMetricUnknownContainerCount := &dto.Metric{}
|
||||
expectedMetricUnknownContainerCount := float64(2)
|
||||
metrics.RunningContainerCount.WithLabelValues(string(kubecontainer.ContainerStateUnknown)).Write(actualMetricUnknownContainerCount)
|
||||
assert.Equal(t, expectedMetricUnknownContainerCount, actualMetricUnknownContainerCount.GetGauge().GetValue())
|
||||
|
||||
// assert for running pod count
|
||||
actualMetricRunningPodCount := &dto.Metric{}
|
||||
metrics.RunningPodCount.Write(actualMetricRunningPodCount)
|
||||
expectedMetricRunningPodCount := float64(2)
|
||||
assert.Equal(t, expectedMetricRunningPodCount, actualMetricRunningPodCount.GetGauge().GetValue())
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user