From 125f7599076fe1bf91467fb1706bb4736e2f3328 Mon Sep 17 00:00:00 2001 From: Mik Vyatskov Date: Wed, 12 Jul 2017 17:02:40 +0200 Subject: [PATCH] Change fluentd-gcp monitoring to use metrics exposed by SD plugin --- .../fluentd-gcp/fluentd-gcp-configmap.yaml | 128 ++++++------------ .../addons/fluentd-gcp/fluentd-gcp-ds.yaml | 6 +- 2 files changed, 47 insertions(+), 87 deletions(-) diff --git a/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml b/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml index a1d7a5d1abf..5c38ed99b1b 100644 --- a/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml +++ b/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml @@ -70,27 +70,14 @@ data: # Detect exceptions in the log output and forward them as one log entry. - @type copy + @type detect_exceptions - - @type prometheus - - - type counter - name logging_line_count - desc Total number of lines generated by application containers - - - - @type detect_exceptions - - remove_tag_prefix raw - message log - stream stream - multiline_flush_interval 5 - max_bytes 500000 - max_lines 1000 - + remove_tag_prefix raw + message log + stream stream + multiline_flush_interval 5 + max_bytes 500000 + max_lines 1000 system.input.conf: |- # Example: @@ -342,77 +329,50 @@ data: # compute.googleapis.com service rather than container.googleapis.com to keep # them separate since most users don't care about the node logs. - @type copy + @type google_cloud - - @type google_cloud - - # Set the buffer type to file to improve the reliability and reduce the memory consumption - buffer_type file - buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer - # Set queue_full action to block because we want to pause gracefully - # in case of the off-the-limits load instead of throwing an exception - buffer_queue_full_action block - # Set the chunk limit conservatively to avoid exceeding the GCL limit - # of 10MiB per write request. - buffer_chunk_limit 2M - # Cap the combined memory usage of this buffer and the one below to - # 2MiB/chunk * (6 + 2) chunks = 16 MiB - buffer_queue_limit 6 - # Never wait more than 5 seconds before flushing logs in the non-error case. - flush_interval 5s - # Never wait longer than 30 seconds between retries. - max_retry_wait 30 - # Disable the limit on the number of retries (retry forever). - disable_retry_limit - # Use multiple threads for processing. - num_threads 2 - - - @type prometheus - - - type counter - name logging_entry_count - desc Total number of log entries generated by either application containers or system components - - component container - - - + # Collect metrics in Prometheus registry about plugin activity. + enable_monitoring true + monitoring_type prometheus + # Set the buffer type to file to improve the reliability and reduce the memory consumption + buffer_type file + buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer + # Set queue_full action to block because we want to pause gracefully + # in case of the off-the-limits load instead of throwing an exception + buffer_queue_full_action block + # Set the chunk limit conservatively to avoid exceeding the GCL limit + # of 10MiB per write request. + buffer_chunk_limit 2M + # Cap the combined memory usage of this buffer and the one below to + # 2MiB/chunk * (6 + 2) chunks = 16 MiB + buffer_queue_limit 6 + # Never wait more than 5 seconds before flushing logs in the non-error case. + flush_interval 5s + # Never wait longer than 30 seconds between retries. + max_retry_wait 30 + # Disable the limit on the number of retries (retry forever). + disable_retry_limit + # Use multiple threads for processing. + num_threads 2 # Keep a smaller buffer here since these logs are less important than the user's # container logs. - @type copy + @type google_cloud - - @type google_cloud - - detect_subservice false - buffer_type file - buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer - buffer_queue_full_action block - buffer_chunk_limit 2M - buffer_queue_limit 2 - flush_interval 5s - max_retry_wait 30 - disable_retry_limit - num_threads 2 - - - @type prometheus - - - type counter - name logging_entry_count - desc Total number of log entries generated by either application containers or system components - - component system - - - + enable_monitoring true + monitoring_type prometheus + detect_subservice false + buffer_type file + buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer + buffer_queue_full_action block + buffer_chunk_limit 2M + buffer_queue_limit 2 + flush_interval 5s + max_retry_wait 30 + disable_retry_limit + num_threads 2 metadata: name: fluentd-gcp-config-v1.1 diff --git a/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml b/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml index d9e5775bb21..c4304f5c156 100644 --- a/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml +++ b/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml @@ -27,7 +27,7 @@ spec: hostNetwork: true containers: - name: fluentd-gcp - image: gcr.io/google-containers/fluentd-gcp:2.0.7 + image: gcr.io/google-containers/fluentd-gcp:2.0.8 # If fluentd consumes its own logs, the following situation may happen: # fluentd fails to send a chunk to the server => writes it to the log => # tries to send this message to the server => fails to send a chunk and so on. @@ -90,13 +90,13 @@ spec: exit 1; fi; - name: prometheus-to-sd-exporter - image: gcr.io/google-containers/prometheus-to-sd:v0.1.0 + image: gcr.io/google-containers/prometheus-to-sd:v0.1.3 command: - /monitor - --component=fluentd - --target-port=31337 - --stackdriver-prefix=container.googleapis.com/internal/addons - - --whitelisted-metrics=logging_line_count,logging_entry_count + - --whitelisted-metrics=stackdriver_successful_requests_count,stackdriver_failed_requests_count,stackdriver_ingested_entries_count,stackdriver_dropped_entries_count volumeMounts: - name: ssl-certs mountPath: /etc/ssl/certs