diff --git a/cluster/addons/prometheus/OWNERS b/cluster/addons/prometheus/OWNERS
deleted file mode 100644
index f4401a08dca..00000000000
--- a/cluster/addons/prometheus/OWNERS
+++ /dev/null
@@ -1,13 +0,0 @@
-# See the OWNERS docs at https://go.k8s.io/owners
-
-approvers:
-- kawych
-- piosz
-- serathius
-- brancz
-reviewers:
-- kawych
-- piosz
-- serathius
-- brancz
-
diff --git a/cluster/addons/prometheus/README.md b/cluster/addons/prometheus/README.md
deleted file mode 100644
index ef7288f172b..00000000000
--- a/cluster/addons/prometheus/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Prometheus Add-on
-
-This add-on is an experimental configuration of k8s monitoring using Prometheus used for e2e tests.
-
-For production use check out more mature setups like [Prometheus Operator](https://github.com/coreos/prometheus-operator) and [kube-prometheus](https://github.com/coreos/prometheus-operator/tree/master/contrib/kube-prometheus).
\ No newline at end of file
diff --git a/cluster/addons/prometheus/alertmanager-configmap.yaml b/cluster/addons/prometheus/alertmanager-configmap.yaml
deleted file mode 100644
index 0890a8b14db..00000000000
--- a/cluster/addons/prometheus/alertmanager-configmap.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: alertmanager-config
-  namespace: kube-system
-  labels:
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: EnsureExists
-data:
-  alertmanager.yml: |
-    global: null
-    receivers:
-    - name: default-receiver
-    route:
-      group_interval: 5m
-      group_wait: 10s
-      receiver: default-receiver
-      repeat_interval: 3h
diff --git a/cluster/addons/prometheus/alertmanager-deployment.yaml b/cluster/addons/prometheus/alertmanager-deployment.yaml
deleted file mode 100644
index 38ec99d3055..00000000000
--- a/cluster/addons/prometheus/alertmanager-deployment.yaml
+++ /dev/null
@@ -1,76 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: alertmanager
-  namespace: kube-system
-  labels:
-    k8s-app: alertmanager
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: Reconcile
-    version: v0.14.0
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      k8s-app: alertmanager
-      version: v0.14.0
-  template:
-    metadata:
-      labels:
-        k8s-app: alertmanager
-        version: v0.14.0
-    spec:
-      priorityClassName: system-cluster-critical
-      containers:
-        - name: prometheus-alertmanager
-          image: "prom/alertmanager:v0.14.0"
-          imagePullPolicy: "IfNotPresent"
-          args:
-            - --config.file=/etc/config/alertmanager.yml
-            - --storage.path=/data
-            - --web.external-url=/
-          ports:
-            - containerPort: 9093
-          readinessProbe:
-            httpGet:
-              path: /#/status
-              port: 9093
-            initialDelaySeconds: 30
-            timeoutSeconds: 30
-          volumeMounts:
-            - name: config-volume
-              mountPath: /etc/config
-            - name: storage-volume
-              mountPath: "/data"
-              subPath: ""
-          resources:
-            limits:
-              cpu: 10m
-              memory: 50Mi
-            requests:
-              cpu: 10m
-              memory: 50Mi
-        - name: prometheus-alertmanager-configmap-reload
-          image: "jimmidyson/configmap-reload:v0.1"
-          imagePullPolicy: "IfNotPresent"
-          args:
-            - --volume-dir=/etc/config
-            - --webhook-url=http://localhost:9093/-/reload
-          volumeMounts:
-            - name: config-volume
-              mountPath: /etc/config
-              readOnly: true
-          resources:
-            limits:
-              cpu: 10m
-              memory: 10Mi
-            requests:
-              cpu: 10m
-              memory: 10Mi
-      volumes:
-        - name: config-volume
-          configMap:
-            name: alertmanager-config
-        - name: storage-volume
-          persistentVolumeClaim:
-            claimName: alertmanager
diff --git a/cluster/addons/prometheus/alertmanager-pvc.yaml b/cluster/addons/prometheus/alertmanager-pvc.yaml
deleted file mode 100644
index bc83c5f4761..00000000000
--- a/cluster/addons/prometheus/alertmanager-pvc.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: alertmanager
-  namespace: kube-system
-  labels:
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: EnsureExists
-spec:
-  storageClassName: standard
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: "2Gi"
diff --git a/cluster/addons/prometheus/alertmanager-service.yaml b/cluster/addons/prometheus/alertmanager-service.yaml
deleted file mode 100644
index 3edb8c20fe5..00000000000
--- a/cluster/addons/prometheus/alertmanager-service.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: alertmanager
-  namespace: kube-system
-  labels:
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: Reconcile
-    kubernetes.io/name: "Alertmanager"
-spec:
-  ports:
-    - name: http
-      port: 80
-      protocol: TCP
-      targetPort: 9093
-  selector:
-    k8s-app: alertmanager
-  type: "ClusterIP"
diff --git a/cluster/addons/prometheus/kube-state-metrics-deployment.yaml b/cluster/addons/prometheus/kube-state-metrics-deployment.yaml
deleted file mode 100644
index 4f0be672bde..00000000000
--- a/cluster/addons/prometheus/kube-state-metrics-deployment.yaml
+++ /dev/null
@@ -1,89 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: kube-state-metrics
-  namespace: kube-system
-  labels:
-    k8s-app: kube-state-metrics
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: Reconcile
-    version: v1.3.0
-spec:
-  selector:
-    matchLabels:
-      k8s-app: kube-state-metrics
-      version: v1.3.0
-  replicas: 1
-  template:
-    metadata:
-      labels:
-        k8s-app: kube-state-metrics
-        version: v1.3.0
-    spec:
-      priorityClassName: system-cluster-critical
-      serviceAccountName: kube-state-metrics
-      containers:
-      - name: kube-state-metrics
-        image: quay.io/coreos/kube-state-metrics:v1.3.0
-        ports:
-        - name: http-metrics
-          containerPort: 8080
-        - name: telemetry
-          containerPort: 8081
-        readinessProbe:
-          httpGet:
-            path: /healthz
-            port: 8080
-          initialDelaySeconds: 5
-          timeoutSeconds: 5
-      - name: addon-resizer
-        image: k8s.gcr.io/addon-resizer:1.8.6
-        resources:
-          limits:
-            cpu: 100m
-            memory: 30Mi
-          requests:
-            cpu: 100m
-            memory: 30Mi
-        env:
-          - name: MY_POD_NAME
-            valueFrom:
-              fieldRef:
-                fieldPath: metadata.name
-          - name: MY_POD_NAMESPACE
-            valueFrom:
-              fieldRef:
-                fieldPath: metadata.namespace
-        volumeMounts:
-        - name: config-volume
-          mountPath: /etc/config
-        command:
-          - /pod_nanny
-          - --config-dir=/etc/config
-          - --container=kube-state-metrics
-          - --cpu=100m
-          - --extra-cpu=1m
-          - --memory=100Mi
-          - --extra-memory=2Mi
-          - --threshold=5
-          - --deployment=kube-state-metrics
-      volumes:
-      - name: config-volume
-        configMap:
-          name: kube-state-metrics-config
----
-# Config map for resource configuration.
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: kube-state-metrics-config
-  namespace: kube-system
-  labels:
-    k8s-app: kube-state-metrics
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: Reconcile
-data:
-  NannyConfiguration: |-
-    apiVersion: nannyconfig/v1alpha1
-    kind: NannyConfiguration
-
diff --git a/cluster/addons/prometheus/kube-state-metrics-rbac.yaml b/cluster/addons/prometheus/kube-state-metrics-rbac.yaml
deleted file mode 100644
index 9fae18b2601..00000000000
--- a/cluster/addons/prometheus/kube-state-metrics-rbac.yaml
+++ /dev/null
@@ -1,103 +0,0 @@
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: kube-state-metrics
-  namespace: kube-system
-  labels:
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: Reconcile
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
-  name: kube-state-metrics
-  labels:
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: Reconcile
-rules:
-- apiGroups: [""]
-  resources:
-  - configmaps
-  - secrets
-  - nodes
-  - pods
-  - services
-  - resourcequotas
-  - replicationcontrollers
-  - limitranges
-  - persistentvolumeclaims
-  - persistentvolumes
-  - namespaces
-  - endpoints
-  verbs: ["list", "watch"]
-- apiGroups: ["extensions"]
-  resources:
-  - daemonsets
-  - deployments
-  - replicasets
-  verbs: ["list", "watch"]
-- apiGroups: ["apps"]
-  resources:
-  - statefulsets
-  verbs: ["list", "watch"]
-- apiGroups: ["batch"]
-  resources:
-  - cronjobs
-  - jobs
-  verbs: ["list", "watch"]
-- apiGroups: ["autoscaling"]
-  resources:
-  - horizontalpodautoscalers
-  verbs: ["list", "watch"]
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: Role
-metadata:
-  name: kube-state-metrics-resizer
-  namespace: kube-system
-  labels:
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: Reconcile
-rules:
-- apiGroups: [""]
-  resources:
-  - pods
-  verbs: ["get"]
-- apiGroups: ["extensions"]
-  resources:
-  - deployments
-  resourceNames: ["kube-state-metrics"]
-  verbs: ["get", "update"]
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
-  name: kube-state-metrics
-  labels:
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: Reconcile
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: kube-state-metrics
-subjects:
-- kind: ServiceAccount
-  name: kube-state-metrics
-  namespace: kube-system
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: RoleBinding
-metadata:
-  name: kube-state-metrics
-  namespace: kube-system
-  labels:
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: Reconcile
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: Role
-  name: kube-state-metrics-resizer
-subjects:
-- kind: ServiceAccount
-  name: kube-state-metrics
-  namespace: kube-system
diff --git a/cluster/addons/prometheus/kube-state-metrics-service.yaml b/cluster/addons/prometheus/kube-state-metrics-service.yaml
deleted file mode 100644
index bad3ffd4600..00000000000
--- a/cluster/addons/prometheus/kube-state-metrics-service.yaml
+++ /dev/null
@@ -1,23 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: kube-state-metrics
-  namespace: kube-system
-  labels:
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: Reconcile
-    kubernetes.io/name: "kube-state-metrics"
-  annotations:
-    prometheus.io/scrape: 'true'
-spec:
-  ports:
-  - name: http-metrics
-    port: 8080
-    targetPort: http-metrics
-    protocol: TCP
-  - name: telemetry
-    port: 8081
-    targetPort: telemetry
-    protocol: TCP
-  selector:
-    k8s-app: kube-state-metrics
diff --git a/cluster/addons/prometheus/node-exporter-ds.yml b/cluster/addons/prometheus/node-exporter-ds.yml
deleted file mode 100644
index bc1766a1d38..00000000000
--- a/cluster/addons/prometheus/node-exporter-ds.yml
+++ /dev/null
@@ -1,57 +0,0 @@
-apiVersion: apps/v1
-kind: DaemonSet
-metadata:
-  name: node-exporter
-  namespace: kube-system
-  labels:
-    k8s-app: node-exporter
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: Reconcile
-    version: v0.15.2
-spec:
-  selector:
-    matchLabels:
-      k8s-app: node-exporter
-      version: v0.15.2
-  updateStrategy:
-    type: OnDelete
-  template:
-    metadata:
-      labels:
-        k8s-app: node-exporter
-        version: v0.15.2
-    spec:
-      priorityClassName: system-node-critical
-      containers:
-        - name: prometheus-node-exporter
-          image: "prom/node-exporter:v0.15.2"
-          imagePullPolicy: "IfNotPresent"
-          args:
-            - --path.procfs=/host/proc
-            - --path.sysfs=/host/sys
-          ports:
-            - name: metrics
-              containerPort: 9100
-              hostPort: 9100
-          volumeMounts:
-            - name: proc
-              mountPath: /host/proc
-              readOnly: true
-            - name: sys
-              mountPath: /host/sys
-              readOnly: true
-          resources:
-            limits:
-              memory: 50Mi
-            requests:
-              cpu: 100m
-              memory: 50Mi
-      hostNetwork: true
-      hostPID: true
-      volumes:
-        - name: proc
-          hostPath:
-            path: /proc
-        - name: sys
-          hostPath:
-            path: /sys
diff --git a/cluster/addons/prometheus/node-exporter-service.yaml b/cluster/addons/prometheus/node-exporter-service.yaml
deleted file mode 100644
index e386330f6b3..00000000000
--- a/cluster/addons/prometheus/node-exporter-service.yaml
+++ /dev/null
@@ -1,20 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: node-exporter
-  namespace: kube-system
-  annotations:
-    prometheus.io/scrape: "true"
-  labels:
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: Reconcile
-    kubernetes.io/name: "NodeExporter"
-spec:
-  clusterIP: None
-  ports:
-    - name: metrics
-      port: 9100
-      protocol: TCP
-      targetPort: 9100
-  selector:
-    k8s-app: node-exporter
diff --git a/cluster/addons/prometheus/prometheus-configmap.yaml b/cluster/addons/prometheus/prometheus-configmap.yaml
deleted file mode 100644
index de490f1e928..00000000000
--- a/cluster/addons/prometheus/prometheus-configmap.yaml
+++ /dev/null
@@ -1,171 +0,0 @@
-# Prometheus configuration format https://prometheus.io/docs/prometheus/latest/configuration/configuration/
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: prometheus-config
-  namespace: kube-system
-  labels:
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: EnsureExists
-data:
-  prometheus.yml: |
-    scrape_configs:
-    - job_name: prometheus
-      static_configs:
-      - targets:
-        - localhost:9090
-
-    - job_name: kubernetes-apiservers
-      kubernetes_sd_configs:
-      - role: endpoints
-      relabel_configs:
-      - action: keep
-        regex: default;kubernetes;https
-        source_labels:
-        - __meta_kubernetes_namespace
-        - __meta_kubernetes_service_name
-        - __meta_kubernetes_endpoint_port_name
-      scheme: https
-      tls_config:
-        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-        insecure_skip_verify: true
-      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-
-    - job_name: kubernetes-nodes-kubelet
-      kubernetes_sd_configs:
-      - role: node
-      relabel_configs:
-      - action: labelmap
-        regex: __meta_kubernetes_node_label_(.+)
-      scheme: https
-      tls_config:
-        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-        insecure_skip_verify: true
-      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-
-    - job_name: kubernetes-nodes-cadvisor
-      kubernetes_sd_configs:
-      - role: node
-      relabel_configs:
-      - action: labelmap
-        regex: __meta_kubernetes_node_label_(.+)
-      - target_label: __metrics_path__
-        replacement: /metrics/cadvisor
-      scheme: https
-      tls_config:
-        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-        insecure_skip_verify: true
-      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-
-    - job_name: kubernetes-service-endpoints
-      kubernetes_sd_configs:
-      - role: endpoints
-      relabel_configs:
-      - action: keep
-        regex: true
-        source_labels:
-        - __meta_kubernetes_service_annotation_prometheus_io_scrape
-      - action: replace
-        regex: (https?)
-        source_labels:
-        - __meta_kubernetes_service_annotation_prometheus_io_scheme
-        target_label: __scheme__
-      - action: replace
-        regex: (.+)
-        source_labels:
-        - __meta_kubernetes_service_annotation_prometheus_io_path
-        target_label: __metrics_path__
-      - action: replace
-        regex: ([^:]+)(?::\d+)?;(\d+)
-        replacement: $1:$2
-        source_labels:
-        - __address__
-        - __meta_kubernetes_service_annotation_prometheus_io_port
-        target_label: __address__
-      - action: labelmap
-        regex: __meta_kubernetes_service_label_(.+)
-      - action: replace
-        source_labels:
-        - __meta_kubernetes_namespace
-        target_label: kubernetes_namespace
-      - action: replace
-        source_labels:
-        - __meta_kubernetes_service_name
-        target_label: kubernetes_name
-
-    - job_name: kubernetes-services
-      kubernetes_sd_configs:
-      - role: service
-      metrics_path: /probe
-      params:
-        module:
-        - http_2xx
-      relabel_configs:
-      - action: keep
-        regex: true
-        source_labels:
-        - __meta_kubernetes_service_annotation_prometheus_io_probe
-      - source_labels:
-        - __address__
-        target_label: __param_target
-      - replacement: blackbox
-        target_label: __address__
-      - source_labels:
-        - __param_target
-        target_label: instance
-      - action: labelmap
-        regex: __meta_kubernetes_service_label_(.+)
-      - source_labels:
-        - __meta_kubernetes_namespace
-        target_label: kubernetes_namespace
-      - source_labels:
-        - __meta_kubernetes_service_name
-        target_label: kubernetes_name
-
-    - job_name: kubernetes-pods
-      kubernetes_sd_configs:
-      - role: pod
-      relabel_configs:
-      - action: keep
-        regex: true
-        source_labels:
-        - __meta_kubernetes_pod_annotation_prometheus_io_scrape
-      - action: replace
-        regex: (.+)
-        source_labels:
-        - __meta_kubernetes_pod_annotation_prometheus_io_path
-        target_label: __metrics_path__
-      - action: replace
-        regex: ([^:]+)(?::\d+)?;(\d+)
-        replacement: $1:$2
-        source_labels:
-        - __address__
-        - __meta_kubernetes_pod_annotation_prometheus_io_port
-        target_label: __address__
-      - action: labelmap
-        regex: __meta_kubernetes_pod_label_(.+)
-      - action: replace
-        source_labels:
-        - __meta_kubernetes_namespace
-        target_label: kubernetes_namespace
-      - action: replace
-        source_labels:
-        - __meta_kubernetes_pod_name
-        target_label: kubernetes_pod_name
-    alerting:
-      alertmanagers:
-      - kubernetes_sd_configs:
-        - role: pod
-        tls_config:
-          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-        relabel_configs:
-        - source_labels: [__meta_kubernetes_namespace]
-          regex: kube-system
-          action: keep
-        - source_labels: [__meta_kubernetes_pod_label_k8s_app]
-          regex: alertmanager
-          action: keep
-        - source_labels: [__meta_kubernetes_pod_container_port_number]
-          regex:
-          action: drop
diff --git a/cluster/addons/prometheus/prometheus-rbac.yaml b/cluster/addons/prometheus/prometheus-rbac.yaml
deleted file mode 100644
index 2a5acdec950..00000000000
--- a/cluster/addons/prometheus/prometheus-rbac.yaml
+++ /dev/null
@@ -1,55 +0,0 @@
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: prometheus
-  namespace: kube-system
-  labels:
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: Reconcile
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
-  name: prometheus
-  labels:
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: Reconcile
-rules:
-  - apiGroups:
-      - ""
-    resources:
-      - nodes
-      - nodes/metrics
-      - services
-      - endpoints
-      - pods
-    verbs:
-      - get
-      - list
-      - watch
-  - apiGroups:
-      - ""
-    resources:
-      - configmaps
-    verbs:
-      - get
-  - nonResourceURLs:
-      - "/metrics"
-    verbs:
-      - get
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
-  name: prometheus
-  labels:
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: Reconcile
-roleRef:
-  apiGroup: rbac.authorization.k8s.io
-  kind: ClusterRole
-  name: prometheus
-subjects:
-- kind: ServiceAccount
-  name: prometheus
-  namespace: kube-system
diff --git a/cluster/addons/prometheus/prometheus-service.yaml b/cluster/addons/prometheus/prometheus-service.yaml
deleted file mode 100644
index cc85ee5a8f6..00000000000
--- a/cluster/addons/prometheus/prometheus-service.yaml
+++ /dev/null
@@ -1,17 +0,0 @@
-kind: Service
-apiVersion: v1
-metadata:
-  name: prometheus
-  namespace: kube-system
-  labels:
-    kubernetes.io/name: "Prometheus"
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: Reconcile
-spec:
-  ports:
-    - name: http
-      port: 9090
-      protocol: TCP
-      targetPort: 9090
-  selector:
-    k8s-app: prometheus
diff --git a/cluster/addons/prometheus/prometheus-statefulset.yaml b/cluster/addons/prometheus/prometheus-statefulset.yaml
deleted file mode 100644
index 01a12d0a969..00000000000
--- a/cluster/addons/prometheus/prometheus-statefulset.yaml
+++ /dev/null
@@ -1,107 +0,0 @@
-apiVersion: apps/v1
-kind: StatefulSet
-metadata:
-  name: prometheus
-  namespace: kube-system
-  labels:
-    k8s-app: prometheus
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: Reconcile
-    version: v2.2.1
-spec:
-  serviceName: "prometheus"
-  replicas: 1
-  podManagementPolicy: "Parallel"
-  updateStrategy:
-    type: "RollingUpdate"
-  selector:
-    matchLabels:
-      k8s-app: prometheus
-  template:
-    metadata:
-      labels:
-        k8s-app: prometheus
-    spec:
-      priorityClassName: system-cluster-critical
-      serviceAccountName: prometheus
-      initContainers:
-      - name: "init-chown-data"
-        image: "busybox:latest"
-        imagePullPolicy: "IfNotPresent"
-        command: ["chown", "-R", "65534:65534", "/data"]
-        volumeMounts:
-        - name: prometheus-data
-          mountPath: /data
-          subPath: ""
-      containers:
-        - name: prometheus-server-configmap-reload
-          image: "jimmidyson/configmap-reload:v0.1"
-          imagePullPolicy: "IfNotPresent"
-          args:
-            - --volume-dir=/etc/config
-            - --webhook-url=http://localhost:9090/-/reload
-          volumeMounts:
-            - name: config-volume
-              mountPath: /etc/config
-              readOnly: true
-          resources:
-            limits:
-              cpu: 10m
-              memory: 10Mi
-            requests:
-              cpu: 10m
-              memory: 10Mi
-
-        - name: prometheus-server
-          image: "prom/prometheus:v2.2.1"
-          imagePullPolicy: "IfNotPresent"
-          args:
-            - --config.file=/etc/config/prometheus.yml
-            - --storage.tsdb.path=/data
-            - --web.console.libraries=/etc/prometheus/console_libraries
-            - --web.console.templates=/etc/prometheus/consoles
-            - --web.enable-lifecycle
-          ports:
-            - containerPort: 9090
-          readinessProbe:
-            httpGet:
-              path: /-/ready
-              port: 9090
-            initialDelaySeconds: 30
-            timeoutSeconds: 30
-          livenessProbe:
-            httpGet:
-              path: /-/healthy
-              port: 9090
-            initialDelaySeconds: 30
-            timeoutSeconds: 30
-          # based on 10 running nodes with 30 pods each
-          resources:
-            limits:
-              cpu: 200m
-              memory: 1000Mi
-            requests:
-              cpu: 200m
-              memory: 1000Mi
-
-          volumeMounts:
-            - name: config-volume
-              mountPath: /etc/config
-            - name: prometheus-data
-              mountPath: /data
-              subPath: ""
-      terminationGracePeriodSeconds: 300
-      volumes:
-        - name: config-volume
-          configMap:
-            name: prometheus-config
-  volumeClaimTemplates:
-    - metadata:
-        name: prometheus-data
-      spec:
-        storageClassName: standard
-        accessModes:
-          - ReadWriteOnce
-        resources:
-          requests:
-            storage: "16Gi"
diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh
index 914154657ab..63301e02215 100755
--- a/cluster/gce/config-default.sh
+++ b/cluster/gce/config-default.sh
@@ -160,9 +160,6 @@ ENABLE_L7_LOADBALANCING="${KUBE_ENABLE_L7_LOADBALANCING:-glbc}"
 #   standalone - Heapster only. Metrics available via Heapster REST API.
 ENABLE_CLUSTER_MONITORING="${KUBE_ENABLE_CLUSTER_MONITORING:-standalone}"
 
-# Optional: Enable deploying separate prometheus stack for monitoring kubernetes cluster
-ENABLE_PROMETHEUS_MONITORING="${KUBE_ENABLE_PROMETHEUS_MONITORING:-false}"
-
 # Optional: Enable Metrics Server. Metrics Server should be enable everywhere,
 # since it's a critical component, but in the first release we need a way to disable
 # this in case of stability issues.
diff --git a/cluster/gce/config-test.sh b/cluster/gce/config-test.sh
index f14d2d587c5..badcba72f15 100755
--- a/cluster/gce/config-test.sh
+++ b/cluster/gce/config-test.sh
@@ -172,9 +172,6 @@ ENABLE_L7_LOADBALANCING="${KUBE_ENABLE_L7_LOADBALANCING:-glbc}"
 #   standalone - Heapster only. Metrics available via Heapster REST API.
 ENABLE_CLUSTER_MONITORING="${KUBE_ENABLE_CLUSTER_MONITORING:-standalone}"
 
-# Optional: Enable deploying separate prometheus stack for monitoring kubernetes cluster
-ENABLE_PROMETHEUS_MONITORING="${KUBE_ENABLE_PROMETHEUS_MONITORING:-false}"
-
 # Optional: Enable Metrics Server. Metrics Server should be enable everywhere,
 # since it's a critical component, but in the first release we need a way to disable
 # this in case of stability issues.
diff --git a/cluster/gce/gci/configure-helper.sh b/cluster/gce/gci/configure-helper.sh
index 953c53c858f..582f1bdc131 100644
--- a/cluster/gce/gci/configure-helper.sh
+++ b/cluster/gce/gci/configure-helper.sh
@@ -2702,10 +2702,6 @@ EOF
     prepare-kube-proxy-manifest-variables "$src_dir/kube-proxy/kube-proxy-ds.yaml"
     setup-addon-manifests "addons" "kube-proxy"
   fi
-  # Setup prometheus stack for monitoring kubernetes cluster
-  if [[ "${ENABLE_PROMETHEUS_MONITORING:-}" == "true" ]]; then
-    setup-addon-manifests "addons" "prometheus"
-  fi
   # Setup cluster monitoring using heapster
   if [[ "${ENABLE_CLUSTER_MONITORING:-}" == "influxdb" ]] || \
      [[ "${ENABLE_CLUSTER_MONITORING:-}" == "google" ]] || \
diff --git a/cluster/gce/util.sh b/cluster/gce/util.sh
index 460e24d6772..b196fb33b44 100755
--- a/cluster/gce/util.sh
+++ b/cluster/gce/util.sh
@@ -1124,7 +1124,6 @@ SERVICE_CLUSTER_IP_RANGE: $(yaml-quote ${SERVICE_CLUSTER_IP_RANGE})
 KUBERNETES_MASTER_NAME: $(yaml-quote ${KUBERNETES_MASTER_NAME})
 ALLOCATE_NODE_CIDRS: $(yaml-quote ${ALLOCATE_NODE_CIDRS:-false})
 ENABLE_CLUSTER_MONITORING: $(yaml-quote ${ENABLE_CLUSTER_MONITORING:-none})
-ENABLE_PROMETHEUS_MONITORING: $(yaml-quote ${ENABLE_PROMETHEUS_MONITORING:-false})
 ENABLE_METRICS_SERVER: $(yaml-quote ${ENABLE_METRICS_SERVER:-false})
 ENABLE_METADATA_AGENT: $(yaml-quote ${ENABLE_METADATA_AGENT:-none})
 METADATA_AGENT_CPU_REQUEST: $(yaml-quote ${METADATA_AGENT_CPU_REQUEST:-})
diff --git a/hack/ginkgo-e2e.sh b/hack/ginkgo-e2e.sh
index f048eb04aec..08e5188e03d 100755
--- a/hack/ginkgo-e2e.sh
+++ b/hack/ginkgo-e2e.sh
@@ -163,7 +163,6 @@ export PATH
   --node-tag="${NODE_TAG:-}" \
   --master-tag="${MASTER_TAG:-}" \
   --cluster-monitoring-mode="${KUBE_ENABLE_CLUSTER_MONITORING:-standalone}" \
-  --prometheus-monitoring="${KUBE_ENABLE_PROMETHEUS_MONITORING:-false}" \
   --dns-domain="${KUBE_DNS_DOMAIN:-cluster.local}" \
   --ginkgo.slowSpecThreshold="${GINKGO_SLOW_SPEC_THRESHOLD:-300}" \
   ${KUBE_CONTAINER_RUNTIME:+"--container-runtime=${KUBE_CONTAINER_RUNTIME}"} \
diff --git a/test/e2e/framework/skip.go b/test/e2e/framework/skip.go
index 16ae55c80e3..139f289bcd6 100644
--- a/test/e2e/framework/skip.go
+++ b/test/e2e/framework/skip.go
@@ -108,13 +108,6 @@ func SkipIfMultizone(c clientset.Interface) {
 	}
 }
 
-// SkipUnlessPrometheusMonitoringIsEnabled skips if the prometheus monitoring is not enabled.
-func SkipUnlessPrometheusMonitoringIsEnabled(supportedMonitoring ...string) {
-	if !TestContext.EnablePrometheusMonitoring {
-		skipInternalf(1, "Skipped because prometheus monitoring is not enabled")
-	}
-}
-
 // SkipUnlessMasterOSDistroIs skips if the master OS distro is not included in the supportedMasterOsDistros.
 func SkipUnlessMasterOSDistroIs(supportedMasterOsDistros ...string) {
 	if !MasterOSDistroIs(supportedMasterOsDistros...) {
diff --git a/test/e2e/framework/test_context.go b/test/e2e/framework/test_context.go
index 75fdcf0c304..ea68ed23504 100644
--- a/test/e2e/framework/test_context.go
+++ b/test/e2e/framework/test_context.go
@@ -151,8 +151,6 @@ type TestContextType struct {
 	NodeTestContextType
 
 	// Monitoring solution that is used in current cluster.
 	ClusterMonitoringMode string
-	// Separate Prometheus monitoring deployed in cluster
-	EnablePrometheusMonitoring bool
 
 	// Indicates what path the kubernetes-anywhere is installed on
 	KubernetesAnywherePath string
@@ -313,7 +311,6 @@ func RegisterClusterFlags(flags *flag.FlagSet) {
 	flags.StringVar(&TestContext.MasterOSDistro, "master-os-distro", "debian", "The OS distribution of cluster master (debian, ubuntu, gci, coreos, or custom).")
 	flags.StringVar(&TestContext.NodeOSDistro, "node-os-distro", "debian", "The OS distribution of cluster VM instances (debian, ubuntu, gci, coreos, or custom).")
 	flags.StringVar(&TestContext.ClusterMonitoringMode, "cluster-monitoring-mode", "standalone", "The monitoring solution that is used in the cluster.")
-	flags.BoolVar(&TestContext.EnablePrometheusMonitoring, "prometheus-monitoring", false, "Separate Prometheus monitoring deployed in cluster.")
 	flags.StringVar(&TestContext.ClusterDNSDomain, "dns-domain", "cluster.local", "The DNS Domain of the cluster.")
 
 	// TODO: Flags per provider? Rename gce-project/gce-zone?
diff --git a/test/e2e/instrumentation/monitoring/BUILD b/test/e2e/instrumentation/monitoring/BUILD
index 2a574f728ab..c8b13be0361 100644
--- a/test/e2e/instrumentation/monitoring/BUILD
+++ b/test/e2e/instrumentation/monitoring/BUILD
@@ -13,7 +13,6 @@ go_library(
         "custom_metrics_deployments.go",
         "custom_metrics_stackdriver.go",
         "metrics_grabber.go",
-        "prometheus.go",
         "stackdriver.go",
         "stackdriver_metadata_agent.go",
     ],
@@ -46,7 +45,6 @@ go_library(
        "//test/utils/image:go_default_library",
        "//vendor/github.com/onsi/ginkgo:go_default_library",
        "//vendor/github.com/onsi/gomega:go_default_library",
-        "//vendor/github.com/prometheus/common/model:go_default_library",
        "//vendor/golang.org/x/oauth2/google:go_default_library",
        "//vendor/google.golang.org/api/monitoring/v3:go_default_library",
    ],
diff --git a/test/e2e/instrumentation/monitoring/prometheus.go b/test/e2e/instrumentation/monitoring/prometheus.go
deleted file mode 100644
index 530101dcf09..00000000000
--- a/test/e2e/instrumentation/monitoring/prometheus.go
+++ /dev/null
@@ -1,388 +0,0 @@
-/*
-Copyright 2018 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package monitoring
-
-import (
-	"context"
-	"encoding/json"
-	"fmt"
-	"math"
-	"time"
-
-	"github.com/prometheus/common/model"
-
-	"github.com/onsi/ginkgo"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	clientset "k8s.io/client-go/kubernetes"
-	"k8s.io/kubernetes/test/e2e/common"
-	"k8s.io/kubernetes/test/e2e/framework"
-	instrumentation "k8s.io/kubernetes/test/e2e/instrumentation/common"
-)
-
-const (
-	prometheusQueryStep                = time.Minute
-	prometheusMetricErrorTolerance     = 0.25
-	prometheusMetricValidationDuration = time.Minute * 2
-	prometheusRate                     = time.Minute * 2
-	prometheusRequiredNodesUpDuration  = time.Minute * 5
-	prometheusService                  = "prometheus"
-	prometheusSleepBetweenAttempts     = time.Second * 30
-	prometheusTestTimeout              = time.Minute * 5
-	customMetricValue                  = 1000
-	targetCPUUsage                     = 0.1
-)
-
-var _ = instrumentation.SIGDescribe("[Feature:PrometheusMonitoring] Prometheus", func() {
-	ginkgo.BeforeEach(func() {
-		framework.SkipUnlessPrometheusMonitoringIsEnabled()
-	})
-
-	f := framework.NewDefaultFramework("prometheus-monitoring")
-	ginkgo.It("should scrape container metrics from all nodes.", func() {
-		expectedNodes, err := getAllNodes(f.ClientSet)
-		framework.ExpectNoError(err)
-		retryUntilSucceeds(func() error {
-			return validateMetricAvailableForAllNodes(f.ClientSet, `container_cpu_usage_seconds_total`, expectedNodes)
-		}, prometheusTestTimeout)
-	})
-	ginkgo.It("should successfully scrape all targets", func() {
-		retryUntilSucceeds(func() error {
-			return validateAllActiveTargetsAreHealthy(f.ClientSet)
-		}, prometheusTestTimeout)
-	})
-	ginkgo.It("should contain correct container CPU metric.", func() {
-		query := prometheusCPUQuery(f.Namespace.Name, "prometheus-cpu-consumer", prometheusRate)
-		consumer := consumeCPUResources(f, "prometheus-cpu-consumer", targetCPUUsage*1000)
-		defer consumer.CleanUp()
-		retryUntilSucceeds(func() error {
-			return validateQueryReturnsCorrectValues(f.ClientSet, query, targetCPUUsage, 3, prometheusMetricErrorTolerance)
-		}, prometheusTestTimeout)
-	})
-	ginkgo.It("should scrape metrics from annotated pods.", func() {
-		query := prometheusPodCustomMetricQuery(f.Namespace.Name, "prometheus-custom-pod-metric")
-		consumer := exportCustomMetricFromPod(f, "prometheus-custom-pod-metric", customMetricValue)
-		defer consumer.CleanUp()
-		retryUntilSucceeds(func() error {
-			return validateQueryReturnsCorrectValues(f.ClientSet, query, customMetricValue, 1, prometheusMetricErrorTolerance)
-		}, prometheusTestTimeout)
-	})
-	ginkgo.It("should scrape metrics from annotated services.", func() {
-		query := prometheusServiceCustomMetricQuery(f.Namespace.Name, "prometheus-custom-service-metric")
-		consumer := exportCustomMetricFromService(f, "prometheus-custom-service-metric", customMetricValue)
-		defer consumer.CleanUp()
-		retryUntilSucceeds(func() error {
-			return validateQueryReturnsCorrectValues(f.ClientSet, query, customMetricValue, 1, prometheusMetricErrorTolerance)
-		}, prometheusTestTimeout)
-	})
-})
-
-func prometheusCPUQuery(namespace, podNamePrefix string, rate time.Duration) string {
-	return fmt.Sprintf(`sum(irate(container_cpu_usage_seconds_total{namespace="%v",pod=~"%v.*",image!=""}[%vm]))`,
-		namespace, podNamePrefix, int64(rate.Minutes()))
-}
-
-func prometheusServiceCustomMetricQuery(namespace, service string) string {
-	return fmt.Sprintf(`sum(QPS{kubernetes_namespace="%v",kubernetes_name="%v"})`, namespace, service)
-}
-
-func prometheusPodCustomMetricQuery(namespace, podNamePrefix string) string {
-	return fmt.Sprintf(`sum(QPS{kubernetes_namespace="%s",kubernetes_pod_name=~"%s.*"})`, namespace, podNamePrefix)
-}
-
-func consumeCPUResources(f *framework.Framework, consumerName string, cpuUsage int) *common.ResourceConsumer {
-	return common.NewDynamicResourceConsumer(consumerName, f.Namespace.Name, common.KindDeployment, 1, cpuUsage,
-		memoryUsed, 0, int64(cpuUsage), memoryLimit, f.ClientSet, f.ScalesGetter)
-}
-
-func exportCustomMetricFromPod(f *framework.Framework, consumerName string, metricValue int) *common.ResourceConsumer {
-	podAnnotations := map[string]string{
-		"prometheus.io/scrape": "true",
-		"prometheus.io/path":   "/metrics",
-		"prometheus.io/port":   "8080",
-	}
-	return common.NewMetricExporter(consumerName, f.Namespace.Name, podAnnotations, nil, metricValue, f.ClientSet, f.ScalesGetter)
-}
-
-func exportCustomMetricFromService(f *framework.Framework, consumerName string, metricValue int) *common.ResourceConsumer {
-	serviceAnnotations := map[string]string{
-		"prometheus.io/scrape": "true",
-		"prometheus.io/path":   "/metrics",
-		"prometheus.io/port":   "8080",
-	}
-	return common.NewMetricExporter(consumerName, f.Namespace.Name, nil, serviceAnnotations, metricValue, f.ClientSet, f.ScalesGetter)
-}
-
-func validateMetricAvailableForAllNodes(c clientset.Interface, metric string, expectedNodesNames []string) error {
-	instanceLabels, err := getInstanceLabelsAvailableForMetric(c, prometheusRequiredNodesUpDuration, metric)
-	if err != nil {
-		return err
-	}
-	nodesWithMetric := make(map[string]bool)
-	for _, instance := range instanceLabels {
-		nodesWithMetric[instance] = true
-	}
-	missedNodesCount := 0
-	for _, nodeName := range expectedNodesNames {
-		if _, found := nodesWithMetric[nodeName]; !found {
-			missedNodesCount++
-		}
-	}
-	if missedNodesCount > 0 {
-		return fmt.Errorf("Metric not found for %v out of %v nodes", missedNodesCount, len(expectedNodesNames))
-	}
-	return nil
-}
-
-func validateAllActiveTargetsAreHealthy(c clientset.Interface) error {
-	discovery, err := fetchPrometheusTargetDiscovery(c)
-	if err != nil {
-		return err
-	}
-	if len(discovery.ActiveTargets) == 0 {
-		return fmt.Errorf("Prometheus is not scraping any targets, at least one target is required")
-	}
-	for _, target := range discovery.ActiveTargets {
-		if target.Health != HealthGood {
-			return fmt.Errorf("Target health not good. Target: %v", target)
-		}
-	}
-	return nil
-}
-
-func validateQueryReturnsCorrectValues(c clientset.Interface, query string, expectedValue float64, minSamplesCount int, errorTolerance float64) error {
-	samples, err := fetchQueryValues(c, query, prometheusMetricValidationDuration)
-	if err != nil {
-		return err
-	}
-	if len(samples) < minSamplesCount {
-		return fmt.Errorf("Not enough samples for query '%v', got %v", query, samples)
-	}
-	framework.Logf("Executed query '%v' returned %v", query, samples)
-	for _, value := range samples {
-		error := math.Abs(value-expectedValue) / expectedValue
-		if error >= errorTolerance {
-			return fmt.Errorf("Query result values outside expected value tolerance. Expected error below %v, got %v", errorTolerance, error)
-		}
-	}
-	return nil
-}
-
-func fetchQueryValues(c clientset.Interface, query string, duration time.Duration) ([]float64, error) {
-	now := time.Now()
-	response, err := queryPrometheus(c, query, now.Add(-duration), now, prometheusQueryStep)
-	if err != nil {
-		return nil, err
-	}
-	m, ok := response.(model.Matrix)
-	if !ok {
-		return nil, fmt.Errorf("Expected matric response, got: %T", response)
-	}
-	values := make([]float64, 0)
-	for _, stream := range m {
-		for _, sample := range stream.Values {
-			values = append(values, float64(sample.Value))
-		}
-	}
-	return values, nil
-}
-
-func getInstanceLabelsAvailableForMetric(c clientset.Interface, duration time.Duration, metric string) ([]string, error) {
-	var instance model.LabelValue
-	now := time.Now()
-	query := fmt.Sprintf(`sum(%v)by(instance)`, metric)
-	result, err := queryPrometheus(c, query, now.Add(-duration), now, prometheusQueryStep)
-	if err != nil {
-		return nil, err
-	}
-	instanceLabels := make([]string, 0)
-	m, ok := result.(model.Matrix)
-	if !ok {
-		framework.Failf("Expected matrix response for query '%v', got: %T", query, result)
-		return instanceLabels, nil
-	}
-	for _, stream := range m {
-		if instance, ok = stream.Metric["instance"]; !ok {
-			continue
-		}
-		instanceLabels = append(instanceLabels, string(instance))
-	}
-	return instanceLabels, nil
-}
-
-func fetchPrometheusTargetDiscovery(c clientset.Interface) (TargetDiscovery, error) {
-	ctx, cancel := context.WithTimeout(context.Background(), framework.SingleCallTimeout)
-	defer cancel()
-
-	response, err := c.CoreV1().RESTClient().Get().
-		Context(ctx).
-		Namespace("kube-system").
-		Resource("services").
-		Name(prometheusService+":9090").
-		SubResource("proxy").
-		Suffix("api", "v1", "targets").
-		Do().
-		Raw()
-	var qres promTargetsResponse
-	if err != nil {
-		framework.Logf(string(response))
-		return qres.Data, err
-	}
-	err = json.Unmarshal(response, &qres)
-
-	return qres.Data, nil
-}
-
-type promTargetsResponse struct {
-	Status string          `json:"status"`
-	Data   TargetDiscovery `json:"data"`
-}
-
-// TargetDiscovery has all the active targets.
-type TargetDiscovery struct {
-	ActiveTargets  []*Target        `json:"activeTargets"`
-	DroppedTargets []*DroppedTarget `json:"droppedTargets"`
-}
-
-// Target has the information for one target.
-type Target struct {
-	DiscoveredLabels map[string]string `json:"discoveredLabels"`
-	Labels           map[string]string `json:"labels"`
-
-	ScrapeURL string `json:"scrapeUrl"`
-
-	LastError  string       `json:"lastError"`
-	LastScrape time.Time    `json:"lastScrape"`
-	Health     TargetHealth `json:"health"`
-}
-
-// DroppedTarget has the information for one target that was dropped during relabelling.
-type DroppedTarget struct {
-	// Labels before any processing.
-	DiscoveredLabels map[string]string `json:"discoveredLabels"`
-}
-
-// The possible health states of a target based on the last performed scrape.
-const (
-	HealthUnknown TargetHealth = "unknown"
-	HealthGood    TargetHealth = "up"
-	HealthBad     TargetHealth = "down"
-)
-
-// TargetHealth describes the health state of a target.
-type TargetHealth string
-
-func queryPrometheus(c clientset.Interface, query string, start, end time.Time, step time.Duration) (model.Value, error) {
-	ctx, cancel := context.WithTimeout(context.Background(), framework.SingleCallTimeout)
-	defer cancel()
-
-	response, err := c.CoreV1().RESTClient().Get().
-		Context(ctx).
-		Namespace("kube-system").
-		Resource("services").
-		Name(prometheusService+":9090").
-		SubResource("proxy").
-		Suffix("api", "v1", "query_range").
-		Param("query", query).
-		Param("start", fmt.Sprintf("%v", start.Unix())).
-		Param("end", fmt.Sprintf("%v", end.Unix())).
-		Param("step", fmt.Sprintf("%vs", step.Seconds())).
-		Do().
-		Raw()
-	if err != nil {
-		framework.Logf(string(response))
-		return nil, err
-	}
-	var qres promQueryResponse
-	err = json.Unmarshal(response, &qres)
-
-	return model.Value(qres.Data.v), err
-}
-
-type promQueryResponse struct {
-	Status string       `json:"status"`
-	Data   responseData `json:"data"`
-}
-
-type responseData struct {
-	Type   model.ValueType `json:"resultType"`
-	Result interface{}     `json:"result"`
-
-	// The decoded value.
-	v model.Value
-}
-
-func (qr *responseData) UnmarshalJSON(b []byte) error {
-	v := struct {
-		Type   model.ValueType `json:"resultType"`
-		Result json.RawMessage `json:"result"`
-	}{}
-
-	err := json.Unmarshal(b, &v)
-	if err != nil {
-		return err
-	}
-
-	switch v.Type {
-	case model.ValScalar:
-		var sv model.Scalar
-		err = json.Unmarshal(v.Result, &sv)
-		qr.v = &sv
-
-	case model.ValVector:
-		var vv model.Vector
-		err = json.Unmarshal(v.Result, &vv)
-		qr.v = vv
-
-	case model.ValMatrix:
-		var mv model.Matrix
-		err = json.Unmarshal(v.Result, &mv)
-		qr.v = mv
-
-	default:
-		err = fmt.Errorf("unexpected value type %q", v.Type)
-	}
-	return err
-}
-
-func retryUntilSucceeds(validator func() error, timeout time.Duration) {
-	startTime := time.Now()
-	var err error
-	for {
-		err = validator()
-		if err == nil {
-			return
-		}
-		if time.Since(startTime) >= timeout {
-			break
-		}
-		framework.Logf(err.Error())
-		time.Sleep(prometheusSleepBetweenAttempts)
-	}
-	framework.Failf(err.Error())
-}
-
-func getAllNodes(c clientset.Interface) ([]string, error) {
-	nodeList, err := c.CoreV1().Nodes().List(metav1.ListOptions{})
-	if err != nil {
-		return nil, err
-	}
-	result := []string{}
-	for _, node := range nodeList.Items {
-		result = append(result, node.Name)
-	}
-	return result, nil
-}