Merge pull request #41567 from Crassirostris/fluentd-gcp-monitoring

Automatic merge from submit-queue (batch tested with PRs 39855, 41433, 41567, 41887, 41652)

Add fluentd monitoring to fluentd-gcp image

Right now we are not able to monitor the state of fluentd in the cluster, which may result in the logging subsystem quietly failing. This PR tries to address that problem by introducing monitoring of the fluentd container:

* fluentd internal metrics, like the number of buffers and the amount of data in buffers
* `logging_line_count`, the number of lines read by fluentd from application containers' logs
  * Has a `tag` label, corresponding to the fluentd tag of the entry
* `logging_entry_count`, the number of entries emitted to the output plugin
  * With label `component` set to `container`: generated by application containers
  * With label `component` set to `system`: generated by system components like kubelet, docker, scheduler, etc.
  * Has a `tag` label, corresponding to the fluentd tag of the entry

CC @fabxc @igorpeshansky @edsiper
2025-07-31 07:20:13 +00:00 · 2017-02-23 09:36:33 -08:00 · 2017-02-23 09:36:33 -08:00 · bb5fdff58b
commit bb5fdff58b
parent 346a8a778f 8d2d91070a
3 changed files with 110 additions and 41 deletions
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-image/Dockerfile
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-image/Dockerfile
@ -22,7 +22,6 @@

 FROM gcr.io/google_containers/ubuntu-slim:0.6

-
 # Disable prompts from apt
 ENV DEBIAN_FRONTEND noninteractive

@ -37,6 +36,7 @@ RUN apt-get -qq update && \
    td-agent-gem install --no-document fluent-plugin-systemd -v 0.0.5 && \
    td-agent-gem install --no-document fluent-plugin-google-cloud -v 0.5.6 && \
    td-agent-gem install --no-document fluent-plugin-detect-exceptions -v 0.0.4 && \
+    td-agent-gem install --no-document fluent-plugin-prometheus -v 0.2.1 && \
    # Remove build tools
    apt-get remove -y -qq gcc make && \
    apt-get autoremove -y -qq && \
@ -56,5 +56,7 @@ COPY fluent.conf /etc/td-agent/td-agent.conf
 # Copy the entrypoint for the container
 COPY run.sh /run.sh

+EXPOSE 80
+
 # Start Fluentd to pick up our config that watches Docker container logs.
 CMD /run.sh $FLUENTD_ARGS
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-image/Makefile
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-image/Makefile
@ -26,7 +26,7 @@
 .PHONY:	build push

 PREFIX=gcr.io/google_containers
-TAG = 1.38
+TAG = 1.40

 build:
 	docker build --pull -t $(PREFIX)/fluentd-gcp:$(TAG) .
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-image/fluent.conf
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-image/fluent.conf
@ -75,13 +75,30 @@

 # Detect exceptions in the log output and forward them as one log entry.
 <match raw.kubernetes.**>
-  type detect_exceptions
-  remove_tag_prefix raw
-  message log
-  stream stream
-  multiline_flush_interval 5
-  max_bytes 500000
-  max_lines 1000
+  @type copy
+
+  <store>
+    @type prometheus
+
+    <metric>
+      type counter
+      name logging_line_count
+      desc Total number of lines generated by application containers
+      <labels>
+        tag ${tag}
+      </labels>
+    </metric>
+  </store>
+  <store>
+    @type detect_exceptions
+
+    remove_tag_prefix raw
+    message log
+    stream stream
+    multiline_flush_interval 5
+    max_bytes 500000
+    max_lines 1000
+  </store>
 </match>

 # Example:
@ -283,46 +300,96 @@
  tag kubelet
 </source>

+# Prometheus monitoring
+<source>
+  @type prometheus
+  port 80
+</source>
+
+<source>
+  @type prometheus_monitor
+</source>
+
+<match fluent.**>
+  @type null
+</match>
+
 # We use 2 output stanzas - one to handle the container logs and one to handle
 # the node daemon logs, the latter of which explicitly sends its logs to the
 # compute.googleapis.com service rather than container.googleapis.com to keep
 # them separate since most users don't care about the node logs.
 <match kubernetes.**>
-  type google_cloud
-  # Set the buffer type to file to improve the reliability and reduce the memory consumption
-  buffer_type file
-  buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer
-  # Set queue_full action to block because we want to pause gracefully
-  # in case of the off-the-limits load instead of throwing an exception
-  buffer_queue_full_action block
-  # Set the chunk limit conservatively to avoid exceeding the GCL limit
-  # of 10MiB per write request.
-  buffer_chunk_limit 2M
-  # Cap the combined memory usage of this buffer and the one below to
-  # 2MiB/chunk * (6 + 2) chunks = 16 MiB
-  buffer_queue_limit 6
-  # Never wait more than 5 seconds before flushing logs in the non-error case.
-  flush_interval 5s
-  # Never wait longer than 30 seconds between retries.
-  max_retry_wait 30
-  # Disable the limit on the number of retries (retry forever).
-  disable_retry_limit
-  # Use multiple threads for processing.
-  num_threads 2
+  @type copy
+
+  <store>
+    @type google_cloud
+
+    # Set the buffer type to file to improve the reliability and reduce the memory consumption
+    buffer_type file
+    buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer
+    # Set queue_full action to block because we want to pause gracefully
+    # in case of the off-the-limits load instead of throwing an exception
+    buffer_queue_full_action block
+    # Set the chunk limit conservatively to avoid exceeding the GCL limit
+    # of 10MiB per write request.
+    buffer_chunk_limit 2M
+    # Cap the combined memory usage of this buffer and the one below to
+    # 2MiB/chunk * (6 + 2) chunks = 16 MiB
+    buffer_queue_limit 6
+    # Never wait more than 5 seconds before flushing logs in the non-error case.
+    flush_interval 5s
+    # Never wait longer than 30 seconds between retries.
+    max_retry_wait 30
+    # Disable the limit on the number of retries (retry forever).
+    disable_retry_limit
+    # Use multiple threads for processing.
+    num_threads 2
+  </store>
+  <store>
+    @type prometheus
+
+    <metric>
+      type counter
+      name logging_entry_count
+      desc Total number of log entries generated by application containers
+      <labels>
+        tag ${tag}
+        component container
+      </labels>
+    </metric>
+  </store>
 </match>

 # Keep a smaller buffer here since these logs are less important than the user's
 # container logs.
 <match **>
-  type google_cloud
-  detect_subservice false
-  buffer_type file
-  buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer
-  buffer_queue_full_action block
-  buffer_chunk_limit 2M
-  buffer_queue_limit 2
-  flush_interval 5s
-  max_retry_wait 30
-  disable_retry_limit
-  num_threads 2
+  @type copy
+
+  <store>
+    @type google_cloud
+
+    detect_subservice false
+    buffer_type file
+    buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer
+    buffer_queue_full_action block
+    buffer_chunk_limit 2M
+    buffer_queue_limit 2
+    flush_interval 5s
+    max_retry_wait 30
+    disable_retry_limit
+    num_threads 2
+  </store>
+  <store>
+    @type prometheus
+
+    <metric>
+      type counter
+      name logging_entry_count
+      desc Total number of log entries generated by system components
+      <labels>
+        tag ${tag}
+        component system
+      </labels>
+    </metric>
+  </store>
 </match>