From 125f7599076fe1bf91467fb1706bb4736e2f3328 Mon Sep 17 00:00:00 2001
From: Mik Vyatskov <vmik@google.com>
Date: Wed, 12 Jul 2017 17:02:40 +0200
Subject: [PATCH] Change fluentd-gcp monitoring to use metrics exposed by SD
 plugin

---
 .../fluentd-gcp/fluentd-gcp-configmap.yaml    | 128 ++++++------------
 .../addons/fluentd-gcp/fluentd-gcp-ds.yaml    |   6 +-
 2 files changed, 47 insertions(+), 87 deletions(-)
diff --git a/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml b/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml
index a1d7a5d1abf..5c38ed99b1b 100644
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml
@@ -70,27 +70,14 @@ data:
 
     # Detect exceptions in the log output and forward them as one log entry.
     <match raw.kubernetes.**>
-      @type copy
+      @type detect_exceptions
 
-      <store>
-        @type prometheus
-
-        <metric>
-          type counter
-          name logging_line_count
-          desc Total number of lines generated by application containers
-        </metric>
-      </store>
-      <store>
-        @type detect_exceptions
-
-        remove_tag_prefix raw
-        message log
-        stream stream
-        multiline_flush_interval 5
-        max_bytes 500000
-        max_lines 1000
-      </store>
+      remove_tag_prefix raw
+      message log
+      stream stream
+      multiline_flush_interval 5
+      max_bytes 500000
+      max_lines 1000
     </match>
   system.input.conf: |-
     # Example:
@@ -342,77 +329,50 @@ data:
     # compute.googleapis.com service rather than container.googleapis.com to keep
     # them separate since most users don't care about the node logs.
     <match kubernetes.**>
-      @type copy
+      @type google_cloud
 
-      <store>
-        @type google_cloud
-
-        # Set the buffer type to file to improve the reliability and reduce the memory consumption
-        buffer_type file
-        buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer
-        # Set queue_full action to block because we want to pause gracefully
-        # in case of the off-the-limits load instead of throwing an exception
-        buffer_queue_full_action block
-        # Set the chunk limit conservatively to avoid exceeding the GCL limit
-        # of 10MiB per write request.
-        buffer_chunk_limit 2M
-        # Cap the combined memory usage of this buffer and the one below to
-        # 2MiB/chunk * (6 + 2) chunks = 16 MiB
-        buffer_queue_limit 6
-        # Never wait more than 5 seconds before flushing logs in the non-error case.
-        flush_interval 5s
-        # Never wait longer than 30 seconds between retries.
-        max_retry_wait 30
-        # Disable the limit on the number of retries (retry forever).
-        disable_retry_limit
-        # Use multiple threads for processing.
-        num_threads 2
-      </store>
-      <store>
-        @type prometheus
-
-        <metric>
-          type counter
-          name logging_entry_count
-          desc Total number of log entries generated by either application containers or system components
-          <labels>
-            component container
-          </labels>
-        </metric>
-      </store>
+      # Record metrics about the plugin's activity in the Prometheus registry.
+      enable_monitoring true
+      monitoring_type prometheus
+      # Set the buffer type to file to improve reliability and reduce memory consumption.
+      buffer_type file
+      buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer
+      # Set the queue_full action to block because we want to pause gracefully
+      # when the load exceeds the limits instead of throwing an exception.
+      buffer_queue_full_action block
+      # Set the chunk limit conservatively to avoid exceeding the GCL limit
+      # of 10MiB per write request.
+      buffer_chunk_limit 2M
+      # Cap the combined memory usage of this buffer and the one below to
+      # 2MiB/chunk * (6 + 2) chunks = 16 MiB
+      buffer_queue_limit 6
+      # Never wait more than 5 seconds before flushing logs in the non-error case.
+      flush_interval 5s
+      # Never wait longer than 30 seconds between retries.
+      max_retry_wait 30
+      # Disable the limit on the number of retries (retry forever).
+      disable_retry_limit
+      # Use multiple threads for processing.
+      num_threads 2
     </match>
 
     # Keep a smaller buffer here since these logs are less important than the user's
     # container logs.
     <match **>
-      @type copy
+      @type google_cloud
 
-      <store>
-        @type google_cloud
-
-        detect_subservice false
-        buffer_type file
-        buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer
-        buffer_queue_full_action block
-        buffer_chunk_limit 2M
-        buffer_queue_limit 2
-        flush_interval 5s
-        max_retry_wait 30
-        disable_retry_limit
-        num_threads 2
-      </store>
-      <store>
-        @type prometheus
-
-        <metric>
-          type counter
-          name logging_entry_count
-          desc Total number of log entries generated by either application containers or system components
-          <labels>
-            component system
-          </labels>
-        </metric>
-      </store>
+      enable_monitoring true
+      monitoring_type prometheus
+      detect_subservice false
+      buffer_type file
+      buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer
+      buffer_queue_full_action block
+      buffer_chunk_limit 2M
+      buffer_queue_limit 2
+      flush_interval 5s
+      max_retry_wait 30
+      disable_retry_limit
+      num_threads 2
     </match>
 metadata:
   name: fluentd-gcp-config-v1.1
diff --git a/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml b/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml
index d9e5775bb21..c4304f5c156 100644
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml
@@ -27,7 +27,7 @@ spec:
       hostNetwork: true
       containers:
       - name: fluentd-gcp
-        image: gcr.io/google-containers/fluentd-gcp:2.0.7
+        image: gcr.io/google-containers/fluentd-gcp:2.0.8
         # If fluentd consumes its own logs, the following situation may happen:
         # fluentd fails to send a chunk to the server => writes it to the log =>
         # tries to send this message to the server => fails to send a chunk and so on.
@@ -90,13 +90,13 @@ spec:
                 exit 1;
               fi;
       - name: prometheus-to-sd-exporter
-        image: gcr.io/google-containers/prometheus-to-sd:v0.1.0
+        image: gcr.io/google-containers/prometheus-to-sd:v0.1.3
         command:
           - /monitor
           - --component=fluentd
           - --target-port=31337
           - --stackdriver-prefix=container.googleapis.com/internal/addons
-          - --whitelisted-metrics=logging_line_count,logging_entry_count
+          - --whitelisted-metrics=stackdriver_successful_requests_count,stackdriver_failed_requests_count,stackdriver_ingested_entries_count,stackdriver_dropped_entries_count
         volumeMounts:
         - name: ssl-certs
           mountPath: /etc/ssl/certs