Update the running_pod_count and running_container_count metrics

As already mentioned in https://github.com/kubernetes/kubernetes/issues/79286, some metrics such as
"running_pod_count" and "running_container_count" use a non-standard Prometheus collector. This change converts them to
standard Prometheus gauges that are updated from the PLEG relist loop instead of being computed at scrape time.

Minor refactor in kubelet/pleg/generic.go and added tests for the running container and running pod metrics

Fixed issues related to GitHub CI pipeline failures:

* Updated Bazel for the new deps
* Added comments for the exported metric variables RunningContainerCount and RunningPodCount
* Specified keys explicitly in Gauge metric instantiation

Fix golint errors

Replace "+=1" with "++", as reported by go lint

Set container state as a label on the "running_container_count" metric

As its name suggests, "running_container_count" should ideally report the number of containers in the "running"
state, but it was reporting the total container count, irrespective of state.
This commit adds a new label, "container_state", to "running_container_count"; it doesn't change the base metric but adds the
option to query the metric by state, such as "running", "unknown", etc.
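For example (assuming the metric keeps the kubelet subsystem prefix, so the full name is kubelet_running_container_count), a query such as kubelet_running_container_count{container_state="running"} selects only running containers, while summing the series over all container_state values recovers the previous all-states total.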

Remove unused methods reported by staticcheck

Remove fields that default to nil when instantiating the Gauge(Vec) metrics

Convert the kubelet metrics running_pod_count and running_container_count to standard gauges and add a label to the running_container_count metric.

Currently the kubelet metrics running_pod_count and running_container_count use a non-standard Prometheus collector; this change
converts them to standard Prometheus gauges. It also adds a new label (container_state) to running_container_count, which breaks down the
containers tracked by the kubelet by state (running/unknown/created/exited).
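As a rough sketch of the resulting scrape output (assuming the kubelet subsystem prefix; only states that are actually observed get a series), the pod set used in the new test below would be exposed as:

kubelet_running_pod_count 2
kubelet_running_container_count{container_state="exited"} 1
kubelet_running_container_count{container_state="running"} 1
kubelet_running_container_count{container_state="unknown"} 2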

Set the stability level explicitly for running_pod_count and running_container_count and reformat the tests

Register the metrics explicitly in the test so that they don't become no-ops
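A minimal sketch of the no-op behaviour this refers to, assuming the gauges use the component-base metrics stability framework as in the diff below; the import paths and the standalone main are my illustration, while the opts fields and the registration call mirror the diff:

package main

import (
	"fmt"

	"k8s.io/component-base/metrics"
	"k8s.io/component-base/metrics/legacyregistry"
)

// A gauge built through the stability framework stays a no-op until it is
// registered, which is why the test calls metrics.Register(...) first.
var runningPodCount = metrics.NewGauge(
	&metrics.GaugeOpts{
		Subsystem:      "kubelet",
		Name:           "running_pod_count",
		Help:           "Number of pods currently running",
		StabilityLevel: metrics.ALPHA,
	},
)

func main() {
	runningPodCount.Set(1)                       // dropped: the metric is not registered yet
	legacyregistry.MustRegister(runningPodCount) // registration initializes the underlying gauge
	runningPodCount.Set(2)                       // now recorded and visible to Write()/scrapes
	fmt.Println("running_pod_count registered and set")
}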
Author: Rajdeep Das, 2019-08-18 20:55:06 +02:00
parent 06dc8cf4cb
commit c02d49d775
4 changed files with 92 additions and 55 deletions


@@ -461,6 +461,26 @@ var (
		},
		[]string{"runtime_handler"},
	)
	// RunningPodCount is a gauge that tracks the number of Pods currently running
	RunningPodCount = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           "running_pod_count",
			Help:           "Number of pods currently running",
			StabilityLevel: metrics.ALPHA,
		},
	)
	// RunningContainerCount is a gauge that tracks the number of containers currently running
	RunningContainerCount = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      KubeletSubsystem,
			Name:           "running_container_count",
			Help:           "Number of containers currently running",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"container_state"},
	)
)
var registerMetrics sync.Once
@@ -475,7 +495,6 @@ func Register(containerCache kubecontainer.RuntimeCache, collectors ...prometheu
	legacyregistry.MustRegister(CgroupManagerDuration)
	legacyregistry.MustRegister(PodWorkerStartDuration)
	legacyregistry.MustRegister(ContainersPerPodCount)
	legacyregistry.RawMustRegister(newPodAndContainerCollector(containerCache))
	legacyregistry.MustRegister(PLEGRelistDuration)
	legacyregistry.MustRegister(PLEGDiscardEvents)
	legacyregistry.MustRegister(PLEGRelistInterval)
@@ -498,6 +517,8 @@ func Register(containerCache kubecontainer.RuntimeCache, collectors ...prometheu
	legacyregistry.MustRegister(DeprecatedEvictionStatsAge)
	legacyregistry.MustRegister(DeprecatedDevicePluginRegistrationCount)
	legacyregistry.MustRegister(DeprecatedDevicePluginAllocationLatency)
	legacyregistry.MustRegister(RunningContainerCount)
	legacyregistry.MustRegister(RunningPodCount)
	if utilfeature.DefaultFeatureGate.Enabled(features.DynamicKubeletConfig) {
		legacyregistry.MustRegister(AssignedConfig)
		legacyregistry.MustRegister(ActiveConfig)
@@ -520,60 +541,6 @@ func SinceInSeconds(start time.Time) float64 {
	return time.Since(start).Seconds()
}
func newPodAndContainerCollector(containerCache kubecontainer.RuntimeCache) *podAndContainerCollector {
	return &podAndContainerCollector{
		containerCache: containerCache,
	}
}
// Custom collector for current pod and container counts.
type podAndContainerCollector struct {
	// Cache for accessing information about running containers.
	containerCache kubecontainer.RuntimeCache
}
// TODO(vmarmol): Split by source?
var (
	runningPodCountDesc = prometheus.NewDesc(
		prometheus.BuildFQName("", KubeletSubsystem, "running_pod_count"),
		"Number of pods currently running",
		nil, nil)
	runningContainerCountDesc = prometheus.NewDesc(
		prometheus.BuildFQName("", KubeletSubsystem, "running_container_count"),
		"Number of containers currently running",
		nil, nil)
)
// Describe implements Prometheus' Describe method from the Collector interface. It sends all
// available descriptions to the provided channel and returns once the last description has been sent.
func (pc *podAndContainerCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- runningPodCountDesc
	ch <- runningContainerCountDesc
}
// Collect implements Prometheus' Collect method from the Collector interface. It's called by the Prometheus
// registry when collecting metrics.
func (pc *podAndContainerCollector) Collect(ch chan<- prometheus.Metric) {
	runningPods, err := pc.containerCache.GetPods()
	if err != nil {
		klog.Warningf("Failed to get running container information while collecting metrics: %v", err)
		return
	}
	runningContainers := 0
	for _, p := range runningPods {
		runningContainers += len(p.Containers)
	}
	ch <- prometheus.MustNewConstMetric(
		runningPodCountDesc,
		prometheus.GaugeValue,
		float64(len(runningPods)))
	ch <- prometheus.MustNewConstMetric(
		runningContainerCountDesc,
		prometheus.GaugeValue,
		float64(runningContainers))
}
const configMapAPIPathFmt = "/api/v1/namespaces/%s/configmaps/%s"
func configLabels(source *corev1.NodeConfigSource) (map[string]string, error) {


@@ -33,9 +33,11 @@ go_test(
    deps = [
        "//pkg/kubelet/container:go_default_library",
        "//pkg/kubelet/container/testing:go_default_library",
        "//pkg/kubelet/metrics:go_default_library",
        "//staging/src/k8s.io/apimachinery/pkg/types:go_default_library",
        "//staging/src/k8s.io/apimachinery/pkg/util/clock:go_default_library",
        "//staging/src/k8s.io/apimachinery/pkg/util/diff:go_default_library",
        "//vendor/github.com/prometheus/client_model/go:go_default_library",
        "//vendor/github.com/stretchr/testify/assert:go_default_library",
    ],
)


@@ -209,6 +209,8 @@ func (g *GenericPLEG) relist() {
	g.updateRelistTime(timestamp)
	pods := kubecontainer.Pods(podList)
	// update running pod and container count
	updateRunningPodAndContainerMetrics(pods)
	g.podRecords.setCurrent(pods)
	// Compare the old and the current pods, and generate events.
@@ -431,6 +433,24 @@ func getContainerState(pod *kubecontainer.Pod, cid *kubecontainer.ContainerID) p
	return state
}
func updateRunningPodAndContainerMetrics(pods []*kubecontainer.Pod) {
	// Set the gauge to the number of pods in the current relist.
	metrics.RunningPodCount.Set(float64(len(pods)))
	// Intermediate map that counts the containers in each "container_state".
	containerStateCount := make(map[string]int)
	for _, pod := range pods {
		containers := pod.Containers
		for _, container := range containers {
			// Bump the count for this container's state; the totals are used to set the GaugeVec below.
			containerStateCount[string(container.State)]++
		}
	}
	for key, value := range containerStateCount {
		metrics.RunningContainerCount.WithLabelValues(key).Set(float64(value))
	}
}
func (pr podRecords) getOld(id types.UID) *kubecontainer.Pod {
r, ok := pr[id]
if !ok {


@@ -24,12 +24,14 @@ import (
	"testing"
	"time"
	dto "github.com/prometheus/client_model/go"
	"github.com/stretchr/testify/assert"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/clock"
	"k8s.io/apimachinery/pkg/util/diff"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
	containertest "k8s.io/kubernetes/pkg/kubelet/container/testing"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
)
const (
@@ -643,3 +645,49 @@ func TestRelistIPChange(t *testing.T) {
		assert.Exactly(t, []*PodLifecycleEvent{event}, actualEvents)
	}
}
func TestRunningPodAndContainerCount(t *testing.T) {
	fakeRuntime := &containertest.FakeRuntime{}
	runtimeCache, _ := kubecontainer.NewRuntimeCache(fakeRuntime)
	metrics.Register(runtimeCache)
	testPleg := newTestGenericPLEG()
	pleg, runtime := testPleg.pleg, testPleg.runtime
	runtime.AllPodList = []*containertest.FakePod{
		{Pod: &kubecontainer.Pod{
			ID: "1234",
			Containers: []*kubecontainer.Container{
				createTestContainer("c1", kubecontainer.ContainerStateRunning),
				createTestContainer("c2", kubecontainer.ContainerStateUnknown),
				createTestContainer("c3", kubecontainer.ContainerStateUnknown),
			},
		}},
		{Pod: &kubecontainer.Pod{
			ID: "4567",
			Containers: []*kubecontainer.Container{
				createTestContainer("c1", kubecontainer.ContainerStateExited),
			},
		}},
	}
	pleg.relist()
	// assert for container count with label "running"
	actualMetricRunningContainerCount := &dto.Metric{}
	expectedMetricRunningContainerCount := float64(1)
	metrics.RunningContainerCount.WithLabelValues(string(kubecontainer.ContainerStateRunning)).Write(actualMetricRunningContainerCount)
	assert.Equal(t, expectedMetricRunningContainerCount, actualMetricRunningContainerCount.GetGauge().GetValue())
	// assert for container count with label "unknown"
	actualMetricUnknownContainerCount := &dto.Metric{}
	expectedMetricUnknownContainerCount := float64(2)
	metrics.RunningContainerCount.WithLabelValues(string(kubecontainer.ContainerStateUnknown)).Write(actualMetricUnknownContainerCount)
	assert.Equal(t, expectedMetricUnknownContainerCount, actualMetricUnknownContainerCount.GetGauge().GetValue())
	// assert for running pod count
	actualMetricRunningPodCount := &dto.Metric{}
	metrics.RunningPodCount.Write(actualMetricRunningPodCount)
	expectedMetricRunningPodCount := float64(2)
	assert.Equal(t, expectedMetricRunningPodCount, actualMetricRunningPodCount.GetGauge().GetValue())
}
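To run only the new test (assuming it lives in the same package as pkg/kubelet/pleg/generic.go, as the refactor above suggests): go test ./pkg/kubelet/pleg/ -run TestRunningPodAndContainerCount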