Remove Prometheus addon and its tests

The Prometheus addon was developed for experimental and test purposes only.
As its README states, it should not be used by anyone.
Marek Siarkowicz 2019-10-03 10:18:32 +02:00
parent 35d68586db
commit 887e84e330
24 changed files with 0 additions and 1199 deletions

@@ -1,13 +0,0 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- kawych
- piosz
- serathius
- brancz
reviewers:
- kawych
- piosz
- serathius
- brancz

@@ -1,5 +0,0 @@
# Prometheus Add-on
This add-on is an experimental Prometheus-based monitoring configuration for Kubernetes, used only for e2e tests.
For production use, check out more mature setups such as [Prometheus Operator](https://github.com/coreos/prometheus-operator) and [kube-prometheus](https://github.com/coreos/prometheus-operator/tree/master/contrib/kube-prometheus).
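The stack was deployed only when KUBE_ENABLE_PROMETHEUS_MONITORING=true was set at cluster bring-up (see the cluster configuration changes at the bottom of this commit), so default clusters never ran it.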

@@ -1,18 +0,0 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: alertmanager-config
namespace: kube-system
labels:
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: EnsureExists
data:
alertmanager.yml: |
global: null
receivers:
- name: default-receiver
route:
group_interval: 5m
group_wait: 10s
receiver: default-receiver
repeat_interval: 3h
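Note: the default-receiver above has no notification integration attached, so alerts are grouped and timed but never delivered anywhere, which fits a configuration meant only for e2e tests. For contrast, a minimal sketch of a receiver that actually forwards alerts (the webhook URL is a hypothetical placeholder, not part of this commit):

receivers:
- name: default-receiver
  webhook_configs:
  # Hypothetical in-cluster webhook endpoint; replace with a real notifier.
  - url: http://example-webhook.kube-system.svc:8080/alerts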

@@ -1,76 +0,0 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: alertmanager
namespace: kube-system
labels:
k8s-app: alertmanager
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
version: v0.14.0
spec:
replicas: 1
selector:
matchLabels:
k8s-app: alertmanager
version: v0.14.0
template:
metadata:
labels:
k8s-app: alertmanager
version: v0.14.0
spec:
priorityClassName: system-cluster-critical
containers:
- name: prometheus-alertmanager
image: "prom/alertmanager:v0.14.0"
imagePullPolicy: "IfNotPresent"
args:
- --config.file=/etc/config/alertmanager.yml
- --storage.path=/data
- --web.external-url=/
ports:
- containerPort: 9093
readinessProbe:
httpGet:
path: /#/status
port: 9093
initialDelaySeconds: 30
timeoutSeconds: 30
volumeMounts:
- name: config-volume
mountPath: /etc/config
- name: storage-volume
mountPath: "/data"
subPath: ""
resources:
limits:
cpu: 10m
memory: 50Mi
requests:
cpu: 10m
memory: 50Mi
- name: prometheus-alertmanager-configmap-reload
image: "jimmidyson/configmap-reload:v0.1"
imagePullPolicy: "IfNotPresent"
args:
- --volume-dir=/etc/config
- --webhook-url=http://localhost:9093/-/reload
volumeMounts:
- name: config-volume
mountPath: /etc/config
readOnly: true
resources:
limits:
cpu: 10m
memory: 10Mi
requests:
cpu: 10m
memory: 10Mi
volumes:
- name: config-volume
configMap:
name: alertmanager-config
- name: storage-volume
persistentVolumeClaim:
claimName: alertmanager

@@ -1,15 +0,0 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: alertmanager
namespace: kube-system
labels:
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: EnsureExists
spec:
storageClassName: standard
accessModes:
- ReadWriteOnce
resources:
requests:
storage: "2Gi"

@@ -1,18 +0,0 @@
apiVersion: v1
kind: Service
metadata:
name: alertmanager
namespace: kube-system
labels:
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
kubernetes.io/name: "Alertmanager"
spec:
ports:
- name: http
port: 80
protocol: TCP
targetPort: 9093
selector:
k8s-app: alertmanager
type: "ClusterIP"

@@ -1,89 +0,0 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: kube-state-metrics
namespace: kube-system
labels:
k8s-app: kube-state-metrics
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
version: v1.3.0
spec:
selector:
matchLabels:
k8s-app: kube-state-metrics
version: v1.3.0
replicas: 1
template:
metadata:
labels:
k8s-app: kube-state-metrics
version: v1.3.0
spec:
priorityClassName: system-cluster-critical
serviceAccountName: kube-state-metrics
containers:
- name: kube-state-metrics
image: quay.io/coreos/kube-state-metrics:v1.3.0
ports:
- name: http-metrics
containerPort: 8080
- name: telemetry
containerPort: 8081
readinessProbe:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 5
timeoutSeconds: 5
- name: addon-resizer
image: k8s.gcr.io/addon-resizer:1.8.6
resources:
limits:
cpu: 100m
memory: 30Mi
requests:
cpu: 100m
memory: 30Mi
env:
- name: MY_POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: MY_POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
volumeMounts:
- name: config-volume
mountPath: /etc/config
command:
- /pod_nanny
- --config-dir=/etc/config
- --container=kube-state-metrics
- --cpu=100m
- --extra-cpu=1m
- --memory=100Mi
- --extra-memory=2Mi
- --threshold=5
- --deployment=kube-state-metrics
volumes:
- name: config-volume
configMap:
name: kube-state-metrics-config
---
# Config map for resource configuration.
apiVersion: v1
kind: ConfigMap
metadata:
name: kube-state-metrics-config
namespace: kube-system
labels:
k8s-app: kube-state-metrics
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
data:
NannyConfiguration: |-
apiVersion: nannyconfig/v1alpha1
kind: NannyConfiguration
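Note: the pod_nanny flags above follow the addon-resizer's base-plus-per-node model: kube-state-metrics is sized at roughly the 100m CPU / 100Mi memory base plus 1m CPU and 2Mi memory per node, and is only resized when its requests drift from that target by more than the 5% threshold, so a 100-node cluster would settle at about 200m CPU and 300Mi memory.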

@@ -1,103 +0,0 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: kube-state-metrics
namespace: kube-system
labels:
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: kube-state-metrics
labels:
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
rules:
- apiGroups: [""]
resources:
- configmaps
- secrets
- nodes
- pods
- services
- resourcequotas
- replicationcontrollers
- limitranges
- persistentvolumeclaims
- persistentvolumes
- namespaces
- endpoints
verbs: ["list", "watch"]
- apiGroups: ["extensions"]
resources:
- daemonsets
- deployments
- replicasets
verbs: ["list", "watch"]
- apiGroups: ["apps"]
resources:
- statefulsets
verbs: ["list", "watch"]
- apiGroups: ["batch"]
resources:
- cronjobs
- jobs
verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
resources:
- horizontalpodautoscalers
verbs: ["list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: kube-state-metrics-resizer
namespace: kube-system
labels:
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
rules:
- apiGroups: [""]
resources:
- pods
verbs: ["get"]
- apiGroups: ["extensions"]
resources:
- deployments
resourceNames: ["kube-state-metrics"]
verbs: ["get", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: kube-state-metrics
labels:
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kube-state-metrics
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: kube-state-metrics
namespace: kube-system
labels:
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: kube-state-metrics-resizer
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: kube-system

@@ -1,23 +0,0 @@
apiVersion: v1
kind: Service
metadata:
name: kube-state-metrics
namespace: kube-system
labels:
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
kubernetes.io/name: "kube-state-metrics"
annotations:
prometheus.io/scrape: 'true'
spec:
ports:
- name: http-metrics
port: 8080
targetPort: http-metrics
protocol: TCP
- name: telemetry
port: 8081
targetPort: telemetry
protocol: TCP
selector:
k8s-app: kube-state-metrics

@@ -1,57 +0,0 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: node-exporter
namespace: kube-system
labels:
k8s-app: node-exporter
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
version: v0.15.2
spec:
selector:
matchLabels:
k8s-app: node-exporter
version: v0.15.2
updateStrategy:
type: OnDelete
template:
metadata:
labels:
k8s-app: node-exporter
version: v0.15.2
spec:
priorityClassName: system-node-critical
containers:
- name: prometheus-node-exporter
image: "prom/node-exporter:v0.15.2"
imagePullPolicy: "IfNotPresent"
args:
- --path.procfs=/host/proc
- --path.sysfs=/host/sys
ports:
- name: metrics
containerPort: 9100
hostPort: 9100
volumeMounts:
- name: proc
mountPath: /host/proc
readOnly: true
- name: sys
mountPath: /host/sys
readOnly: true
resources:
limits:
memory: 50Mi
requests:
cpu: 100m
memory: 50Mi
hostNetwork: true
hostPID: true
volumes:
- name: proc
hostPath:
path: /proc
- name: sys
hostPath:
path: /sys

@@ -1,20 +0,0 @@
apiVersion: v1
kind: Service
metadata:
name: node-exporter
namespace: kube-system
annotations:
prometheus.io/scrape: "true"
labels:
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
kubernetes.io/name: "NodeExporter"
spec:
clusterIP: None
ports:
- name: metrics
port: 9100
protocol: TCP
targetPort: 9100
selector:
k8s-app: node-exporter

@@ -1,171 +0,0 @@
# Prometheus configuration format https://prometheus.io/docs/prometheus/latest/configuration/configuration/
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: kube-system
labels:
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: EnsureExists
data:
prometheus.yml: |
scrape_configs:
- job_name: prometheus
static_configs:
- targets:
- localhost:9090
- job_name: kubernetes-apiservers
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- action: keep
regex: default;kubernetes;https
source_labels:
- __meta_kubernetes_namespace
- __meta_kubernetes_service_name
- __meta_kubernetes_endpoint_port_name
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
- job_name: kubernetes-nodes-kubelet
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
- job_name: kubernetes-nodes-cadvisor
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __metrics_path__
replacement: /metrics/cadvisor
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
- job_name: kubernetes-service-endpoints
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape
- action: replace
regex: (https?)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scheme
target_label: __scheme__
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
source_labels:
- __address__
- __meta_kubernetes_service_annotation_prometheus_io_port
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: kubernetes_namespace
- action: replace
source_labels:
- __meta_kubernetes_service_name
target_label: kubernetes_name
- job_name: kubernetes-services
kubernetes_sd_configs:
- role: service
metrics_path: /probe
params:
module:
- http_2xx
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_probe
- source_labels:
- __address__
target_label: __param_target
- replacement: blackbox
target_label: __address__
- source_labels:
- __param_target
target_label: instance
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels:
- __meta_kubernetes_namespace
target_label: kubernetes_namespace
- source_labels:
- __meta_kubernetes_service_name
target_label: kubernetes_name
- job_name: kubernetes-pods
kubernetes_sd_configs:
- role: pod
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scrape
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
source_labels:
- __address__
- __meta_kubernetes_pod_annotation_prometheus_io_port
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: kubernetes_namespace
- action: replace
source_labels:
- __meta_kubernetes_pod_name
target_label: kubernetes_pod_name
alerting:
alertmanagers:
- kubernetes_sd_configs:
- role: pod
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace]
regex: kube-system
action: keep
- source_labels: [__meta_kubernetes_pod_label_k8s_app]
regex: alertmanager
action: keep
- source_labels: [__meta_kubernetes_pod_container_port_number]
regex:
action: drop
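Note: in the kubernetes-pods job above, the keep rule retains only pods annotated prometheus.io/scrape: "true", the path annotation rewrites __metrics_path__, and the ([^:]+)(?::\d+)?;(\d+) -> $1:$2 rule splices the annotated port into the address, so an __address__ of 10.8.0.5:8080 with prometheus.io/port: "9102" becomes 10.8.0.5:9102. A minimal sketch of a pod this job would have scraped (name and image are hypothetical, not part of this commit):

apiVersion: v1
kind: Pod
metadata:
  name: example-exporter          # hypothetical name
  annotations:
    prometheus.io/scrape: "true"  # required by the keep rule
    prometheus.io/path: "/metrics" # rewrites __metrics_path__
    prometheus.io/port: "9102"    # spliced into __address__
spec:
  containers:
  - name: exporter
    image: example.com/exporter:latest # hypothetical image
    ports:
    - containerPort: 9102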

@@ -1,55 +0,0 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: kube-system
labels:
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
labels:
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
rules:
- apiGroups:
- ""
resources:
- nodes
- nodes/metrics
- services
- endpoints
- pods
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- configmaps
verbs:
- get
- nonResourceURLs:
- "/metrics"
verbs:
- get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
labels:
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: kube-system

@@ -1,17 +0,0 @@
kind: Service
apiVersion: v1
metadata:
name: prometheus
namespace: kube-system
labels:
kubernetes.io/name: "Prometheus"
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
spec:
ports:
- name: http
port: 9090
protocol: TCP
targetPort: 9090
selector:
k8s-app: prometheus

@@ -1,107 +0,0 @@
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: prometheus
namespace: kube-system
labels:
k8s-app: prometheus
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
version: v2.2.1
spec:
serviceName: "prometheus"
replicas: 1
podManagementPolicy: "Parallel"
updateStrategy:
type: "RollingUpdate"
selector:
matchLabels:
k8s-app: prometheus
template:
metadata:
labels:
k8s-app: prometheus
spec:
priorityClassName: system-cluster-critical
serviceAccountName: prometheus
initContainers:
- name: "init-chown-data"
image: "busybox:latest"
imagePullPolicy: "IfNotPresent"
command: ["chown", "-R", "65534:65534", "/data"]
volumeMounts:
- name: prometheus-data
mountPath: /data
subPath: ""
containers:
- name: prometheus-server-configmap-reload
image: "jimmidyson/configmap-reload:v0.1"
imagePullPolicy: "IfNotPresent"
args:
- --volume-dir=/etc/config
- --webhook-url=http://localhost:9090/-/reload
volumeMounts:
- name: config-volume
mountPath: /etc/config
readOnly: true
resources:
limits:
cpu: 10m
memory: 10Mi
requests:
cpu: 10m
memory: 10Mi
- name: prometheus-server
image: "prom/prometheus:v2.2.1"
imagePullPolicy: "IfNotPresent"
args:
- --config.file=/etc/config/prometheus.yml
- --storage.tsdb.path=/data
- --web.console.libraries=/etc/prometheus/console_libraries
- --web.console.templates=/etc/prometheus/consoles
- --web.enable-lifecycle
ports:
- containerPort: 9090
readinessProbe:
httpGet:
path: /-/ready
port: 9090
initialDelaySeconds: 30
timeoutSeconds: 30
livenessProbe:
httpGet:
path: /-/healthy
port: 9090
initialDelaySeconds: 30
timeoutSeconds: 30
# based on 10 running nodes with 30 pods each
resources:
limits:
cpu: 200m
memory: 1000Mi
requests:
cpu: 200m
memory: 1000Mi
volumeMounts:
- name: config-volume
mountPath: /etc/config
- name: prometheus-data
mountPath: /data
subPath: ""
terminationGracePeriodSeconds: 300
volumes:
- name: config-volume
configMap:
name: prometheus-config
volumeClaimTemplates:
- metadata:
name: prometheus-data
spec:
storageClassName: standard
accessModes:
- ReadWriteOnce
resources:
requests:
storage: "16Gi"
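Note: the init-chown-data container above is needed because the prom/prometheus image runs as the unprivileged nobody user (UID 65534), so the freshly provisioned data volume, owned by root, must be chowned to 65534:65534 before Prometheus can write its TSDB under /data.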

@@ -160,9 +160,6 @@ ENABLE_L7_LOADBALANCING="${KUBE_ENABLE_L7_LOADBALANCING:-glbc}"
# standalone - Heapster only. Metrics available via Heapster REST API.
ENABLE_CLUSTER_MONITORING="${KUBE_ENABLE_CLUSTER_MONITORING:-standalone}"
# Optional: Enable deploying a separate Prometheus stack for monitoring the Kubernetes cluster
ENABLE_PROMETHEUS_MONITORING="${KUBE_ENABLE_PROMETHEUS_MONITORING:-false}"
# Optional: Enable Metrics Server. Metrics Server should be enabled everywhere,
# since it's a critical component, but in the first release we need a way to disable
# this in case of stability issues.

@@ -172,9 +172,6 @@ ENABLE_L7_LOADBALANCING="${KUBE_ENABLE_L7_LOADBALANCING:-glbc}"
# standalone - Heapster only. Metrics available via Heapster REST API.
ENABLE_CLUSTER_MONITORING="${KUBE_ENABLE_CLUSTER_MONITORING:-standalone}"
# Optional: Enable deploying a separate Prometheus stack for monitoring the Kubernetes cluster
ENABLE_PROMETHEUS_MONITORING="${KUBE_ENABLE_PROMETHEUS_MONITORING:-false}"
# Optional: Enable Metrics Server. Metrics Server should be enabled everywhere,
# since it's a critical component, but in the first release we need a way to disable
# this in case of stability issues.

@@ -2702,10 +2702,6 @@ EOF
prepare-kube-proxy-manifest-variables "$src_dir/kube-proxy/kube-proxy-ds.yaml"
setup-addon-manifests "addons" "kube-proxy"
fi
# Set up the Prometheus stack for monitoring the Kubernetes cluster
if [[ "${ENABLE_PROMETHEUS_MONITORING:-}" == "true" ]]; then
setup-addon-manifests "addons" "prometheus"
fi
# Setup cluster monitoring using heapster
if [[ "${ENABLE_CLUSTER_MONITORING:-}" == "influxdb" ]] || \
[[ "${ENABLE_CLUSTER_MONITORING:-}" == "google" ]] || \

@@ -1124,7 +1124,6 @@ SERVICE_CLUSTER_IP_RANGE: $(yaml-quote ${SERVICE_CLUSTER_IP_RANGE})
KUBERNETES_MASTER_NAME: $(yaml-quote ${KUBERNETES_MASTER_NAME})
ALLOCATE_NODE_CIDRS: $(yaml-quote ${ALLOCATE_NODE_CIDRS:-false})
ENABLE_CLUSTER_MONITORING: $(yaml-quote ${ENABLE_CLUSTER_MONITORING:-none})
ENABLE_PROMETHEUS_MONITORING: $(yaml-quote ${ENABLE_PROMETHEUS_MONITORING:-false})
ENABLE_METRICS_SERVER: $(yaml-quote ${ENABLE_METRICS_SERVER:-false})
ENABLE_METADATA_AGENT: $(yaml-quote ${ENABLE_METADATA_AGENT:-none})
METADATA_AGENT_CPU_REQUEST: $(yaml-quote ${METADATA_AGENT_CPU_REQUEST:-})

@@ -163,7 +163,6 @@ export PATH
--node-tag="${NODE_TAG:-}" \
--master-tag="${MASTER_TAG:-}" \
--cluster-monitoring-mode="${KUBE_ENABLE_CLUSTER_MONITORING:-standalone}" \
--prometheus-monitoring="${KUBE_ENABLE_PROMETHEUS_MONITORING:-false}" \
--dns-domain="${KUBE_DNS_DOMAIN:-cluster.local}" \
--ginkgo.slowSpecThreshold="${GINKGO_SLOW_SPEC_THRESHOLD:-300}" \
${KUBE_CONTAINER_RUNTIME:+"--container-runtime=${KUBE_CONTAINER_RUNTIME}"} \

@@ -108,13 +108,6 @@ func SkipIfMultizone(c clientset.Interface) {
}
}
// SkipUnlessPrometheusMonitoringIsEnabled skips if Prometheus monitoring is not enabled.
func SkipUnlessPrometheusMonitoringIsEnabled(supportedMonitoring ...string) {
if !TestContext.EnablePrometheusMonitoring {
skipInternalf(1, "Skipped because prometheus monitoring is not enabled")
}
}
// SkipUnlessMasterOSDistroIs skips if the master OS distro is not included in the supportedMasterOsDistros.
func SkipUnlessMasterOSDistroIs(supportedMasterOsDistros ...string) {
if !MasterOSDistroIs(supportedMasterOsDistros...) {

@@ -151,8 +151,6 @@ type TestContextType struct {
NodeTestContextType
// Monitoring solution that is used in the current cluster.
ClusterMonitoringMode string
// Separate Prometheus monitoring deployed in cluster
EnablePrometheusMonitoring bool
// Indicates what path the kubernetes-anywhere is installed on
KubernetesAnywherePath string
@@ -313,7 +311,6 @@ func RegisterClusterFlags(flags *flag.FlagSet) {
flags.StringVar(&TestContext.MasterOSDistro, "master-os-distro", "debian", "The OS distribution of cluster master (debian, ubuntu, gci, coreos, or custom).")
flags.StringVar(&TestContext.NodeOSDistro, "node-os-distro", "debian", "The OS distribution of cluster VM instances (debian, ubuntu, gci, coreos, or custom).")
flags.StringVar(&TestContext.ClusterMonitoringMode, "cluster-monitoring-mode", "standalone", "The monitoring solution that is used in the cluster.")
flags.BoolVar(&TestContext.EnablePrometheusMonitoring, "prometheus-monitoring", false, "Separate Prometheus monitoring deployed in cluster.")
flags.StringVar(&TestContext.ClusterDNSDomain, "dns-domain", "cluster.local", "The DNS Domain of the cluster.")
// TODO: Flags per provider? Rename gce-project/gce-zone?

@@ -13,7 +13,6 @@ go_library(
"custom_metrics_deployments.go",
"custom_metrics_stackdriver.go",
"metrics_grabber.go",
"prometheus.go",
"stackdriver.go",
"stackdriver_metadata_agent.go",
],
@@ -46,7 +45,6 @@ go_library(
"//test/utils/image:go_default_library",
"//vendor/github.com/onsi/ginkgo:go_default_library",
"//vendor/github.com/onsi/gomega:go_default_library",
"//vendor/github.com/prometheus/common/model:go_default_library",
"//vendor/golang.org/x/oauth2/google:go_default_library",
"//vendor/google.golang.org/api/monitoring/v3:go_default_library",
],

@@ -1,388 +0,0 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package monitoring
import (
"context"
"encoding/json"
"fmt"
"math"
"time"
"github.com/prometheus/common/model"
"github.com/onsi/ginkgo"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/kubernetes/test/e2e/common"
"k8s.io/kubernetes/test/e2e/framework"
instrumentation "k8s.io/kubernetes/test/e2e/instrumentation/common"
)
const (
prometheusQueryStep = time.Minute
prometheusMetricErrorTolerance = 0.25
prometheusMetricValidationDuration = time.Minute * 2
prometheusRate = time.Minute * 2
prometheusRequiredNodesUpDuration = time.Minute * 5
prometheusService = "prometheus"
prometheusSleepBetweenAttempts = time.Second * 30
prometheusTestTimeout = time.Minute * 5
customMetricValue = 1000
targetCPUUsage = 0.1
)
var _ = instrumentation.SIGDescribe("[Feature:PrometheusMonitoring] Prometheus", func() {
ginkgo.BeforeEach(func() {
framework.SkipUnlessPrometheusMonitoringIsEnabled()
})
f := framework.NewDefaultFramework("prometheus-monitoring")
ginkgo.It("should scrape container metrics from all nodes.", func() {
expectedNodes, err := getAllNodes(f.ClientSet)
framework.ExpectNoError(err)
retryUntilSucceeds(func() error {
return validateMetricAvailableForAllNodes(f.ClientSet, `container_cpu_usage_seconds_total`, expectedNodes)
}, prometheusTestTimeout)
})
ginkgo.It("should successfully scrape all targets", func() {
retryUntilSucceeds(func() error {
return validateAllActiveTargetsAreHealthy(f.ClientSet)
}, prometheusTestTimeout)
})
ginkgo.It("should contain correct container CPU metric.", func() {
query := prometheusCPUQuery(f.Namespace.Name, "prometheus-cpu-consumer", prometheusRate)
consumer := consumeCPUResources(f, "prometheus-cpu-consumer", targetCPUUsage*1000)
defer consumer.CleanUp()
retryUntilSucceeds(func() error {
return validateQueryReturnsCorrectValues(f.ClientSet, query, targetCPUUsage, 3, prometheusMetricErrorTolerance)
}, prometheusTestTimeout)
})
ginkgo.It("should scrape metrics from annotated pods.", func() {
query := prometheusPodCustomMetricQuery(f.Namespace.Name, "prometheus-custom-pod-metric")
consumer := exportCustomMetricFromPod(f, "prometheus-custom-pod-metric", customMetricValue)
defer consumer.CleanUp()
retryUntilSucceeds(func() error {
return validateQueryReturnsCorrectValues(f.ClientSet, query, customMetricValue, 1, prometheusMetricErrorTolerance)
}, prometheusTestTimeout)
})
ginkgo.It("should scrape metrics from annotated services.", func() {
query := prometheusServiceCustomMetricQuery(f.Namespace.Name, "prometheus-custom-service-metric")
consumer := exportCustomMetricFromService(f, "prometheus-custom-service-metric", customMetricValue)
defer consumer.CleanUp()
retryUntilSucceeds(func() error {
return validateQueryReturnsCorrectValues(f.ClientSet, query, customMetricValue, 1, prometheusMetricErrorTolerance)
}, prometheusTestTimeout)
})
})
func prometheusCPUQuery(namespace, podNamePrefix string, rate time.Duration) string {
return fmt.Sprintf(`sum(irate(container_cpu_usage_seconds_total{namespace="%v",pod=~"%v.*",image!=""}[%vm]))`,
namespace, podNamePrefix, int64(rate.Minutes()))
}
func prometheusServiceCustomMetricQuery(namespace, service string) string {
return fmt.Sprintf(`sum(QPS{kubernetes_namespace="%v",kubernetes_name="%v"})`, namespace, service)
}
func prometheusPodCustomMetricQuery(namespace, podNamePrefix string) string {
return fmt.Sprintf(`sum(QPS{kubernetes_namespace="%s",kubernetes_pod_name=~"%s.*"})`, namespace, podNamePrefix)
}
func consumeCPUResources(f *framework.Framework, consumerName string, cpuUsage int) *common.ResourceConsumer {
return common.NewDynamicResourceConsumer(consumerName, f.Namespace.Name, common.KindDeployment, 1, cpuUsage,
memoryUsed, 0, int64(cpuUsage), memoryLimit, f.ClientSet, f.ScalesGetter)
}
func exportCustomMetricFromPod(f *framework.Framework, consumerName string, metricValue int) *common.ResourceConsumer {
podAnnotations := map[string]string{
"prometheus.io/scrape": "true",
"prometheus.io/path": "/metrics",
"prometheus.io/port": "8080",
}
return common.NewMetricExporter(consumerName, f.Namespace.Name, podAnnotations, nil, metricValue, f.ClientSet, f.ScalesGetter)
}
func exportCustomMetricFromService(f *framework.Framework, consumerName string, metricValue int) *common.ResourceConsumer {
serviceAnnotations := map[string]string{
"prometheus.io/scrape": "true",
"prometheus.io/path": "/metrics",
"prometheus.io/port": "8080",
}
return common.NewMetricExporter(consumerName, f.Namespace.Name, nil, serviceAnnotations, metricValue, f.ClientSet, f.ScalesGetter)
}
func validateMetricAvailableForAllNodes(c clientset.Interface, metric string, expectedNodesNames []string) error {
instanceLabels, err := getInstanceLabelsAvailableForMetric(c, prometheusRequiredNodesUpDuration, metric)
if err != nil {
return err
}
nodesWithMetric := make(map[string]bool)
for _, instance := range instanceLabels {
nodesWithMetric[instance] = true
}
missedNodesCount := 0
for _, nodeName := range expectedNodesNames {
if _, found := nodesWithMetric[nodeName]; !found {
missedNodesCount++
}
}
if missedNodesCount > 0 {
return fmt.Errorf("Metric not found for %v out of %v nodes", missedNodesCount, len(expectedNodesNames))
}
return nil
}
func validateAllActiveTargetsAreHealthy(c clientset.Interface) error {
discovery, err := fetchPrometheusTargetDiscovery(c)
if err != nil {
return err
}
if len(discovery.ActiveTargets) == 0 {
return fmt.Errorf("Prometheus is not scraping any targets, at least one target is required")
}
for _, target := range discovery.ActiveTargets {
if target.Health != HealthGood {
return fmt.Errorf("Target health not good. Target: %v", target)
}
}
return nil
}
func validateQueryReturnsCorrectValues(c clientset.Interface, query string, expectedValue float64, minSamplesCount int, errorTolerance float64) error {
samples, err := fetchQueryValues(c, query, prometheusMetricValidationDuration)
if err != nil {
return err
}
if len(samples) < minSamplesCount {
return fmt.Errorf("Not enough samples for query '%v', got %v", query, samples)
}
framework.Logf("Executed query '%v' returned %v", query, samples)
for _, value := range samples {
relativeError := math.Abs(value-expectedValue) / expectedValue
if relativeError >= errorTolerance {
return fmt.Errorf("Query result values outside expected value tolerance. Expected error below %v, got %v", errorTolerance, relativeError)
}
}
return nil
}
func fetchQueryValues(c clientset.Interface, query string, duration time.Duration) ([]float64, error) {
now := time.Now()
response, err := queryPrometheus(c, query, now.Add(-duration), now, prometheusQueryStep)
if err != nil {
return nil, err
}
m, ok := response.(model.Matrix)
if !ok {
return nil, fmt.Errorf("Expected matric response, got: %T", response)
}
values := make([]float64, 0)
for _, stream := range m {
for _, sample := range stream.Values {
values = append(values, float64(sample.Value))
}
}
return values, nil
}
func getInstanceLabelsAvailableForMetric(c clientset.Interface, duration time.Duration, metric string) ([]string, error) {
var instance model.LabelValue
now := time.Now()
query := fmt.Sprintf(`sum(%v)by(instance)`, metric)
result, err := queryPrometheus(c, query, now.Add(-duration), now, prometheusQueryStep)
if err != nil {
return nil, err
}
instanceLabels := make([]string, 0)
m, ok := result.(model.Matrix)
if !ok {
framework.Failf("Expected matrix response for query '%v', got: %T", query, result)
return instanceLabels, nil
}
for _, stream := range m {
if instance, ok = stream.Metric["instance"]; !ok {
continue
}
instanceLabels = append(instanceLabels, string(instance))
}
return instanceLabels, nil
}
func fetchPrometheusTargetDiscovery(c clientset.Interface) (TargetDiscovery, error) {
ctx, cancel := context.WithTimeout(context.Background(), framework.SingleCallTimeout)
defer cancel()
response, err := c.CoreV1().RESTClient().Get().
Context(ctx).
Namespace("kube-system").
Resource("services").
Name(prometheusService+":9090").
SubResource("proxy").
Suffix("api", "v1", "targets").
Do().
Raw()
var qres promTargetsResponse
if err != nil {
framework.Logf(string(response))
return qres.Data, err
}
err = json.Unmarshal(response, &qres)
// Propagate any unmarshal error instead of silently dropping it.
return qres.Data, err
}
type promTargetsResponse struct {
Status string `json:"status"`
Data TargetDiscovery `json:"data"`
}
// TargetDiscovery has all the active targets.
type TargetDiscovery struct {
ActiveTargets []*Target `json:"activeTargets"`
DroppedTargets []*DroppedTarget `json:"droppedTargets"`
}
// Target has the information for one target.
type Target struct {
DiscoveredLabels map[string]string `json:"discoveredLabels"`
Labels map[string]string `json:"labels"`
ScrapeURL string `json:"scrapeUrl"`
LastError string `json:"lastError"`
LastScrape time.Time `json:"lastScrape"`
Health TargetHealth `json:"health"`
}
// DroppedTarget has the information for one target that was dropped during relabelling.
type DroppedTarget struct {
// Labels before any processing.
DiscoveredLabels map[string]string `json:"discoveredLabels"`
}
// The possible health states of a target based on the last performed scrape.
const (
HealthUnknown TargetHealth = "unknown"
HealthGood TargetHealth = "up"
HealthBad TargetHealth = "down"
)
// TargetHealth describes the health state of a target.
type TargetHealth string
func queryPrometheus(c clientset.Interface, query string, start, end time.Time, step time.Duration) (model.Value, error) {
ctx, cancel := context.WithTimeout(context.Background(), framework.SingleCallTimeout)
defer cancel()
response, err := c.CoreV1().RESTClient().Get().
Context(ctx).
Namespace("kube-system").
Resource("services").
Name(prometheusService+":9090").
SubResource("proxy").
Suffix("api", "v1", "query_range").
Param("query", query).
Param("start", fmt.Sprintf("%v", start.Unix())).
Param("end", fmt.Sprintf("%v", end.Unix())).
Param("step", fmt.Sprintf("%vs", step.Seconds())).
Do().
Raw()
if err != nil {
framework.Logf(string(response))
return nil, err
}
var qres promQueryResponse
err = json.Unmarshal(response, &qres)
return model.Value(qres.Data.v), err
}
type promQueryResponse struct {
Status string `json:"status"`
Data responseData `json:"data"`
}
type responseData struct {
Type model.ValueType `json:"resultType"`
Result interface{} `json:"result"`
// The decoded value.
v model.Value
}
func (qr *responseData) UnmarshalJSON(b []byte) error {
v := struct {
Type model.ValueType `json:"resultType"`
Result json.RawMessage `json:"result"`
}{}
err := json.Unmarshal(b, &v)
if err != nil {
return err
}
switch v.Type {
case model.ValScalar:
var sv model.Scalar
err = json.Unmarshal(v.Result, &sv)
qr.v = &sv
case model.ValVector:
var vv model.Vector
err = json.Unmarshal(v.Result, &vv)
qr.v = vv
case model.ValMatrix:
var mv model.Matrix
err = json.Unmarshal(v.Result, &mv)
qr.v = mv
default:
err = fmt.Errorf("unexpected value type %q", v.Type)
}
return err
}
func retryUntilSucceeds(validator func() error, timeout time.Duration) {
startTime := time.Now()
var err error
for {
err = validator()
if err == nil {
return
}
if time.Since(startTime) >= timeout {
break
}
framework.Logf(err.Error())
time.Sleep(prometheusSleepBetweenAttempts)
}
framework.Failf(err.Error())
}
func getAllNodes(c clientset.Interface) ([]string, error) {
nodeList, err := c.CoreV1().Nodes().List(metav1.ListOptions{})
if err != nil {
return nil, err
}
result := []string{}
for _, node := range nodeList.Items {
result = append(result, node.Name)
}
return result, nil
}