sort stability list of metrics in the shell command (#120197)

* sort stability list of metrics in the shell command

* remove unused vars

* fix verify errors
This commit is contained in:
Han Kang 2023-08-28 04:43:54 -07:00 committed by GitHub
parent faf1b5d655
commit dbbce2aaba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 488 additions and 391 deletions

View File

@ -0,0 +1,86 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import (
"fmt"
"os"
"sort"
flag "github.com/spf13/pflag"
"gopkg.in/yaml.v2"
"k8s.io/component-base/metrics"
)
// main reads the YAML metrics list named by the --sort-file flag, orders the
// entries with byFQName, and prints the re-marshalled YAML to stdout.
//
// Every failure (unreadable file, invalid YAML, marshal error) is reported on
// stderr and terminates the process with exit status 1. An unreadable input
// must not look like success: callers redirect stdout into a file, and a
// silent exit 0 would leave them with an empty result.
func main() {
	var sortFile string
	flag.StringVar(&sortFile, "sort-file", "", "file of metrics to sort")
	flag.Parse()

	dat, err := os.ReadFile(sortFile)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%s\n", err)
		os.Exit(1)
	}

	var parsedMetrics []metric
	if err := yaml.Unmarshal(dat, &parsedMetrics); err != nil {
		fmt.Fprintf(os.Stderr, "%s\n", err)
		os.Exit(1)
	}

	sort.Sort(byFQName(parsedMetrics))

	data, err := yaml.Marshal(parsedMetrics)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%s\n", err)
		os.Exit(1)
	}
	fmt.Print(string(data))
}
// metric is a single entry of the metrics list that this tool parses and
// re-emits. Every field carries matching yaml and json keys; all keys except
// "name" are marked omitempty, so zero-valued fields are left out on output.
//
// Keep the field order as is: the struct is marshalled back to YAML, and
// reordering fields would reorder the keys of every emitted entry.
type metric struct {
	// Components handed to metrics.BuildFQName (see BuildFQName below).
	Name      string `yaml:"name" json:"name"`
	Subsystem string `yaml:"subsystem,omitempty" json:"subsystem,omitempty"`
	Namespace string `yaml:"namespace,omitempty" json:"namespace,omitempty"`

	// Descriptive attributes. StabilityLevel is the primary sort key used by
	// byFQName.
	Help              string `yaml:"help,omitempty" json:"help,omitempty"`
	Type              string `yaml:"type,omitempty" json:"type,omitempty"`
	DeprecatedVersion string `yaml:"deprecatedVersion,omitempty" json:"deprecatedVersion,omitempty"`
	StabilityLevel    string `yaml:"stabilityLevel,omitempty" json:"stabilityLevel,omitempty"`

	// Remaining attributes are carried through unchanged by this tool.
	Labels      []string            `yaml:"labels,omitempty" json:"labels,omitempty"`
	Buckets     []float64           `yaml:"buckets,omitempty" json:"buckets,omitempty"`
	Objectives  map[float64]float64 `yaml:"objectives,omitempty" json:"objectives,omitempty"`
	AgeBuckets  uint32              `yaml:"ageBuckets,omitempty" json:"ageBuckets,omitempty"`
	BufCap      uint32              `yaml:"bufCap,omitempty" json:"bufCap,omitempty"`
	MaxAge      int64               `yaml:"maxAge,omitempty" json:"maxAge,omitempty"`
	ConstLabels map[string]string   `yaml:"constLabels,omitempty" json:"constLabels,omitempty"`
}
// BuildFQName returns the fully-qualified name of m, as produced by
// metrics.BuildFQName from the metric's namespace, subsystem and name
// (passed in that order).
func (m metric) BuildFQName() string {
	namespace, subsystem, name := m.Namespace, m.Subsystem, m.Name
	return metrics.BuildFQName(namespace, subsystem, name)
}
// byFQName implements sort.Interface over a slice of metrics. Despite the
// name, the primary key is StabilityLevel (plain string comparison); the
// fully-qualified name only breaks ties between entries of equal stability.
type byFQName []metric

// Len reports the number of metrics in the slice.
func (ms byFQName) Len() int {
	return len(ms)
}

// Less reports whether the metric at index i sorts before the one at index j:
// first by StabilityLevel, then by BuildFQName.
func (ms byFQName) Less(i, j int) bool {
	left, right := &ms[i], &ms[j]
	if left.StabilityLevel != right.StabilityLevel {
		return left.StabilityLevel < right.StabilityLevel
	}
	return left.BuildFQName() < right.BuildFQName()
}

// Swap exchanges the metrics at indices i and j.
func (ms byFQName) Swap(i, j int) {
	held := ms[i]
	ms[i] = ms[j]
	ms[j] = held
}

View File

@ -59,6 +59,7 @@ reset=$(tput sgr0)
function kube::validate::stablemetrics() {
stability_check_setup
temp_file=$(mktemp)
temp_file2=$(mktemp)
doValidate=$(find_files_to_check -z \
| sort -z \
| KUBE_ROOT=${KUBE_ROOT} xargs -0 -L 200 \
@ -73,12 +74,16 @@ function kube::validate::stablemetrics() {
if $doValidate; then
echo -e "${green}Diffing test/instrumentation/testdata/stable-metrics-list.yaml\n${reset}"
if diff -u "$KUBE_ROOT/test/instrumentation/testdata/stable-metrics-list.yaml" "$temp_file"; then
echo -e "${green}\nPASS metrics stability verification ${reset}"
return 0
fi
fi
doSort=$(KUBE_ROOT=${KUBE_ROOT} go run "test/instrumentation/sort/main.go" --sort-file="${temp_file}" 1>"${temp_file2}")
if ! $doSort; then
echo "${red}!!! sorting metrics has failed! ${reset}" >&2
exit 1
fi
if diff -u "$KUBE_ROOT/test/instrumentation/testdata/stable-metrics-list.yaml" "$temp_file2"; then
echo -e "${green}\nPASS metrics stability verification ${reset}"
return 0
fi
echo "${red}!!! Metrics Stability static analysis has failed!${reset}" >&2
echo "${red}!!! Please run ./hack/update-generated-stable-metrics.sh to update the golden list.${reset}" >&2
exit 1
@ -115,7 +120,7 @@ function kube::validate::test::stablemetrics() {
function kube::update::stablemetrics() {
stability_check_setup
temp_file=$(mktemp)
temp_file2=$(mktemp)
doCheckStability=$(find_files_to_check -z \
| sort -z \
| KUBE_ROOT=${KUBE_ROOT} xargs -0 -L 200 \
@ -133,6 +138,12 @@ function kube::update::stablemetrics() {
exit 1
fi
mv -f "$temp_file" "${KUBE_ROOT}/test/instrumentation/testdata/stable-metrics-list.yaml"
doSort=$(go run "test/instrumentation/sort/main.go" --sort-file="${KUBE_ROOT}/test/instrumentation/testdata/stable-metrics-list.yaml" 1>"${temp_file2}")
if ! $doSort; then
echo "${red}!!! sorting metrics has failed! ${reset}" >&2
exit 1
fi
mv -f "$temp_file2" "${KUBE_ROOT}/test/instrumentation/testdata/stable-metrics-list.yaml"
echo "${green}Updated golden list of stable metrics.${reset}"
}

View File

@ -1,129 +1,126 @@
- name: job_creation_skew_duration_seconds
subsystem: cronjob_controller
help: Time between when a cronjob is scheduled to be run, and when the corresponding
job is created
- name: current_executing_requests
subsystem: flowcontrol
namespace: apiserver
help: Number of requests in initial (for a WATCH) or any (for a non-WATCH) execution
stage in the API Priority and Fairness subsystem
type: Gauge
stabilityLevel: BETA
labels:
- flow_schema
- priority_level
- name: current_executing_seats
subsystem: flowcontrol
namespace: apiserver
help: Concurrency (number of seats) occupied by the currently executing (initial
stage for a WATCH, any stage otherwise) requests in the API Priority and Fairness
subsystem
type: Gauge
stabilityLevel: BETA
labels:
- flow_schema
- priority_level
- name: current_inqueue_requests
subsystem: flowcontrol
namespace: apiserver
help: Number of requests currently pending in queues of the API Priority and Fairness
subsystem
type: Gauge
stabilityLevel: BETA
labels:
- flow_schema
- priority_level
- name: dispatched_requests_total
subsystem: flowcontrol
namespace: apiserver
help: Number of requests executed by API Priority and Fairness subsystem
type: Counter
stabilityLevel: BETA
labels:
- flow_schema
- priority_level
- name: nominal_limit_seats
subsystem: flowcontrol
namespace: apiserver
help: Nominal number of execution seats configured for each priority level
type: Gauge
stabilityLevel: BETA
labels:
- priority_level
- name: rejected_requests_total
subsystem: flowcontrol
namespace: apiserver
help: Number of requests rejected by API Priority and Fairness subsystem
type: Counter
stabilityLevel: BETA
labels:
- flow_schema
- priority_level
- reason
- name: request_wait_duration_seconds
subsystem: flowcontrol
namespace: apiserver
help: Length of time a request spent waiting in its queue
type: Histogram
stabilityLevel: STABLE
stabilityLevel: BETA
labels:
- execute
- flow_schema
- priority_level
buckets:
- 0
- 0.005
- 0.02
- 0.05
- 0.1
- 0.2
- 0.5
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
- name: job_pods_finished_total
subsystem: job_controller
help: The number of finished Pods that are fully tracked
- 5
- 10
- 15
- 30
- name: disabled_metrics_total
help: The count of disabled metrics.
type: Counter
stabilityLevel: STABLE
labels:
- completion_mode
- result
- name: job_sync_duration_seconds
subsystem: job_controller
help: The time it took to sync a job
type: Histogram
stabilityLevel: STABLE
labels:
- action
- completion_mode
- result
buckets:
- 0.001
- 0.002
- 0.004
- 0.008
- 0.016
- 0.032
- 0.064
- 0.128
- 0.256
- 0.512
- 1.024
- 2.048
- 4.096
- 8.192
- 16.384
- name: job_syncs_total
subsystem: job_controller
help: The number of job syncs
stabilityLevel: BETA
- name: hidden_metrics_total
help: The count of hidden metrics.
type: Counter
stabilityLevel: STABLE
stabilityLevel: BETA
- name: feature_enabled
namespace: kubernetes
help: This metric records the data about the stage and enablement of a k8s feature.
type: Gauge
stabilityLevel: BETA
labels:
- action
- completion_mode
- result
- name: jobs_finished_total
subsystem: job_controller
help: The number of finished jobs
- name
- stage
- name: healthcheck
namespace: kubernetes
help: This metric records the result of a single healthcheck.
type: Gauge
stabilityLevel: BETA
labels:
- name
- type
- name: healthchecks_total
namespace: kubernetes
help: This metric records the results of all healthcheck.
type: Counter
stabilityLevel: STABLE
stabilityLevel: BETA
labels:
- completion_mode
- reason
- result
- name: evictions_total
subsystem: node_collector
help: Number of Node evictions that happened since current instance of NodeController
started.
- name
- status
- type
- name: registered_metrics_total
help: The count of registered metrics broken by stability level and deprecation
version.
type: Counter
stabilityLevel: STABLE
stabilityLevel: BETA
labels:
- zone
- name: container_cpu_usage_seconds_total
help: Cumulative cpu time consumed by the container in core-seconds
type: Custom
stabilityLevel: STABLE
labels:
- container
- pod
- namespace
- name: container_memory_working_set_bytes
help: Current working set of the container in bytes
type: Custom
stabilityLevel: STABLE
labels:
- container
- pod
- namespace
- name: container_start_time_seconds
help: Start time of the container since unix epoch in seconds
type: Custom
stabilityLevel: STABLE
labels:
- container
- pod
- namespace
- name: node_cpu_usage_seconds_total
help: Cumulative cpu time consumed by the node in core-seconds
type: Custom
stabilityLevel: STABLE
- name: node_memory_working_set_bytes
help: Current working set of the node in bytes
type: Custom
stabilityLevel: STABLE
- name: pod_cpu_usage_seconds_total
help: Cumulative cpu time consumed by the pod in core-seconds
type: Custom
stabilityLevel: STABLE
labels:
- pod
- namespace
- name: pod_memory_working_set_bytes
help: Current working set of the pod in bytes
type: Custom
stabilityLevel: STABLE
labels:
- pod
- namespace
- name: resource_scrape_error
help: 1 if there was an error while getting container metrics, 0 otherwise
type: Custom
stabilityLevel: STABLE
- deprecated_version
- stability_level
- name: pod_scheduling_sli_duration_seconds
subsystem: scheduler
help: E2e latency for a pod being scheduled, from the time the pod enters the scheduling
@ -153,167 +150,6 @@
- 1310.72
- 2621.44
- 5242.88
- name: kube_pod_resource_limit
help: Resources limit for workloads on the cluster, broken down by pod. This shows
the resource usage the scheduler and kubelet expect per pod for resources along
with the unit for the resource if any.
type: Custom
stabilityLevel: STABLE
labels:
- namespace
- pod
- node
- scheduler
- priority
- resource
- unit
- name: kube_pod_resource_request
help: Resources requested by workloads on the cluster, broken down by pod. This
shows the resource usage the scheduler and kubelet expect per pod for resources
along with the unit for the resource if any.
type: Custom
stabilityLevel: STABLE
labels:
- namespace
- pod
- node
- scheduler
- priority
- resource
- unit
- name: framework_extension_point_duration_seconds
subsystem: scheduler
help: Latency for running all plugins of a specific extension point.
type: Histogram
stabilityLevel: STABLE
labels:
- extension_point
- profile
- status
buckets:
- 0.0001
- 0.0002
- 0.0004
- 0.0008
- 0.0016
- 0.0032
- 0.0064
- 0.0128
- 0.0256
- 0.0512
- 0.1024
- 0.2048
- name: pending_pods
subsystem: scheduler
help: Number of pending pods, by the queue type. 'active' means number of pods in
activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number
of pods in unschedulablePods that the scheduler attempted to schedule and failed;
'gated' is the number of unschedulable pods that the scheduler never attempted
to schedule because they are gated.
type: Gauge
stabilityLevel: STABLE
labels:
- queue
- name: pod_scheduling_attempts
subsystem: scheduler
help: Number of attempts to successfully schedule a pod.
type: Histogram
stabilityLevel: STABLE
buckets:
- 1
- 2
- 4
- 8
- 16
- name: pod_scheduling_duration_seconds
subsystem: scheduler
help: E2e latency for a pod being scheduled which may include multiple scheduling
attempts.
type: Histogram
deprecatedVersion: 1.28.0
stabilityLevel: STABLE
labels:
- attempts
buckets:
- 0.01
- 0.02
- 0.04
- 0.08
- 0.16
- 0.32
- 0.64
- 1.28
- 2.56
- 5.12
- 10.24
- 20.48
- 40.96
- 81.92
- 163.84
- 327.68
- 655.36
- 1310.72
- 2621.44
- 5242.88
- name: preemption_attempts_total
subsystem: scheduler
help: Total preemption attempts in the cluster till now
type: Counter
stabilityLevel: STABLE
- name: preemption_victims
subsystem: scheduler
help: Number of selected preemption victims
type: Histogram
stabilityLevel: STABLE
buckets:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- name: queue_incoming_pods_total
subsystem: scheduler
help: Number of pods added to scheduling queues by event and queue type.
type: Counter
stabilityLevel: STABLE
labels:
- event
- queue
- name: schedule_attempts_total
subsystem: scheduler
help: Number of attempts to schedule pods, by the result. 'unschedulable' means
a pod could not be scheduled, while 'error' means an internal scheduler problem.
type: Counter
stabilityLevel: STABLE
labels:
- profile
- result
- name: scheduling_attempt_duration_seconds
subsystem: scheduler
help: Scheduling attempt latency in seconds (scheduling algorithm + binding)
type: Histogram
stabilityLevel: STABLE
labels:
- profile
- result
buckets:
- 0.001
- 0.002
- 0.004
- 0.008
- 0.016
- 0.032
- 0.064
- 0.128
- 0.256
- 0.512
- 1.024
- 2.048
- 4.096
- 8.192
- 16.384
- name: controller_admission_duration_seconds
subsystem: admission
namespace: apiserver
@ -489,126 +325,290 @@
stabilityLevel: STABLE
labels:
- resource
- name: current_executing_requests
subsystem: flowcontrol
namespace: apiserver
help: Number of requests in initial (for a WATCH) or any (for a non-WATCH) execution
stage in the API Priority and Fairness subsystem
type: Gauge
stabilityLevel: BETA
- name: container_cpu_usage_seconds_total
help: Cumulative cpu time consumed by the container in core-seconds
type: Custom
stabilityLevel: STABLE
labels:
- flow_schema
- priority_level
- name: current_executing_seats
subsystem: flowcontrol
namespace: apiserver
help: Concurrency (number of seats) occupied by the currently executing (initial
stage for a WATCH, any stage otherwise) requests in the API Priority and Fairness
subsystem
type: Gauge
stabilityLevel: BETA
- container
- pod
- namespace
- name: container_memory_working_set_bytes
help: Current working set of the container in bytes
type: Custom
stabilityLevel: STABLE
labels:
- flow_schema
- priority_level
- name: current_inqueue_requests
subsystem: flowcontrol
namespace: apiserver
help: Number of requests currently pending in queues of the API Priority and Fairness
subsystem
type: Gauge
stabilityLevel: BETA
- container
- pod
- namespace
- name: container_start_time_seconds
help: Start time of the container since unix epoch in seconds
type: Custom
stabilityLevel: STABLE
labels:
- flow_schema
- priority_level
- name: dispatched_requests_total
subsystem: flowcontrol
namespace: apiserver
help: Number of requests executed by API Priority and Fairness subsystem
type: Counter
stabilityLevel: BETA
labels:
- flow_schema
- priority_level
- name: nominal_limit_seats
subsystem: flowcontrol
namespace: apiserver
help: Nominal number of execution seats configured for each priority level
type: Gauge
stabilityLevel: BETA
labels:
- priority_level
- name: rejected_requests_total
subsystem: flowcontrol
namespace: apiserver
help: Number of requests rejected by API Priority and Fairness subsystem
type: Counter
stabilityLevel: BETA
labels:
- flow_schema
- priority_level
- reason
- name: request_wait_duration_seconds
subsystem: flowcontrol
namespace: apiserver
help: Length of time a request spent waiting in its queue
- container
- pod
- namespace
- name: job_creation_skew_duration_seconds
subsystem: cronjob_controller
help: Time between when a cronjob is scheduled to be run, and when the corresponding
job is created
type: Histogram
stabilityLevel: BETA
labels:
- execute
- flow_schema
- priority_level
stabilityLevel: STABLE
buckets:
- 0
- 0.005
- 0.02
- 0.05
- 0.1
- 0.2
- 0.5
- 1
- 2
- 5
- 10
- 15
- 30
- name: disabled_metrics_total
help: The count of disabled metrics.
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 512
- name: job_pods_finished_total
subsystem: job_controller
help: The number of finished Pods that are fully tracked
type: Counter
stabilityLevel: BETA
- name: hidden_metrics_total
help: The count of hidden metrics.
stabilityLevel: STABLE
labels:
- completion_mode
- result
- name: job_sync_duration_seconds
subsystem: job_controller
help: The time it took to sync a job
type: Histogram
stabilityLevel: STABLE
labels:
- action
- completion_mode
- result
buckets:
- 0.001
- 0.002
- 0.004
- 0.008
- 0.016
- 0.032
- 0.064
- 0.128
- 0.256
- 0.512
- 1.024
- 2.048
- 4.096
- 8.192
- 16.384
- name: job_syncs_total
subsystem: job_controller
help: The number of job syncs
type: Counter
stabilityLevel: BETA
- name: feature_enabled
namespace: kubernetes
help: This metric records the data about the stage and enablement of a k8s feature.
type: Gauge
stabilityLevel: BETA
stabilityLevel: STABLE
labels:
- name
- stage
- name: healthcheck
namespace: kubernetes
help: This metric records the result of a single healthcheck.
type: Gauge
stabilityLevel: BETA
labels:
- name
- type
- name: healthchecks_total
namespace: kubernetes
help: This metric records the results of all healthcheck.
- action
- completion_mode
- result
- name: jobs_finished_total
subsystem: job_controller
help: The number of finished jobs
type: Counter
stabilityLevel: BETA
stabilityLevel: STABLE
labels:
- name
- completion_mode
- reason
- result
- name: kube_pod_resource_limit
help: Resources limit for workloads on the cluster, broken down by pod. This shows
the resource usage the scheduler and kubelet expect per pod for resources along
with the unit for the resource if any.
type: Custom
stabilityLevel: STABLE
labels:
- namespace
- pod
- node
- scheduler
- priority
- resource
- unit
- name: kube_pod_resource_request
help: Resources requested by workloads on the cluster, broken down by pod. This
shows the resource usage the scheduler and kubelet expect per pod for resources
along with the unit for the resource if any.
type: Custom
stabilityLevel: STABLE
labels:
- namespace
- pod
- node
- scheduler
- priority
- resource
- unit
- name: evictions_total
subsystem: node_collector
help: Number of Node evictions that happened since current instance of NodeController
started.
type: Counter
stabilityLevel: STABLE
labels:
- zone
- name: node_cpu_usage_seconds_total
help: Cumulative cpu time consumed by the node in core-seconds
type: Custom
stabilityLevel: STABLE
- name: node_memory_working_set_bytes
help: Current working set of the node in bytes
type: Custom
stabilityLevel: STABLE
- name: pod_cpu_usage_seconds_total
help: Cumulative cpu time consumed by the pod in core-seconds
type: Custom
stabilityLevel: STABLE
labels:
- pod
- namespace
- name: pod_memory_working_set_bytes
help: Current working set of the pod in bytes
type: Custom
stabilityLevel: STABLE
labels:
- pod
- namespace
- name: resource_scrape_error
help: 1 if there was an error while getting container metrics, 0 otherwise
type: Custom
stabilityLevel: STABLE
- name: framework_extension_point_duration_seconds
subsystem: scheduler
help: Latency for running all plugins of a specific extension point.
type: Histogram
stabilityLevel: STABLE
labels:
- extension_point
- profile
- status
- type
- name: registered_metrics_total
help: The count of registered metrics broken by stability level and deprecation
version.
type: Counter
stabilityLevel: BETA
buckets:
- 0.0001
- 0.0002
- 0.0004
- 0.0008
- 0.0016
- 0.0032
- 0.0064
- 0.0128
- 0.0256
- 0.0512
- 0.1024
- 0.2048
- name: pending_pods
subsystem: scheduler
help: Number of pending pods, by the queue type. 'active' means number of pods in
activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number
of pods in unschedulablePods that the scheduler attempted to schedule and failed;
'gated' is the number of unschedulable pods that the scheduler never attempted
to schedule because they are gated.
type: Gauge
stabilityLevel: STABLE
labels:
- deprecated_version
- stability_level
- queue
- name: pod_scheduling_attempts
subsystem: scheduler
help: Number of attempts to successfully schedule a pod.
type: Histogram
stabilityLevel: STABLE
buckets:
- 1
- 2
- 4
- 8
- 16
- name: pod_scheduling_duration_seconds
subsystem: scheduler
help: E2e latency for a pod being scheduled which may include multiple scheduling
attempts.
type: Histogram
deprecatedVersion: 1.28.0
stabilityLevel: STABLE
labels:
- attempts
buckets:
- 0.01
- 0.02
- 0.04
- 0.08
- 0.16
- 0.32
- 0.64
- 1.28
- 2.56
- 5.12
- 10.24
- 20.48
- 40.96
- 81.92
- 163.84
- 327.68
- 655.36
- 1310.72
- 2621.44
- 5242.88
- name: preemption_attempts_total
subsystem: scheduler
help: Total preemption attempts in the cluster till now
type: Counter
stabilityLevel: STABLE
- name: preemption_victims
subsystem: scheduler
help: Number of selected preemption victims
type: Histogram
stabilityLevel: STABLE
buckets:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- name: queue_incoming_pods_total
subsystem: scheduler
help: Number of pods added to scheduling queues by event and queue type.
type: Counter
stabilityLevel: STABLE
labels:
- event
- queue
- name: schedule_attempts_total
subsystem: scheduler
help: Number of attempts to schedule pods, by the result. 'unschedulable' means
a pod could not be scheduled, while 'error' means an internal scheduler problem.
type: Counter
stabilityLevel: STABLE
labels:
- profile
- result
- name: scheduling_attempt_duration_seconds
subsystem: scheduler
help: Scheduling attempt latency in seconds (scheduling algorithm + binding)
type: Histogram
stabilityLevel: STABLE
labels:
- profile
- result
buckets:
- 0.001
- 0.002
- 0.004
- 0.008
- 0.016
- 0.032
- 0.064
- 0.128
- 0.256
- 0.512
- 1.024
- 2.048
- 4.096
- 8.192
- 16.384