Merge pull request #41567 from Crassirostris/fluentd-gcp-monitoring

Automatic merge from submit-queue (batch tested with PRs 39855, 41433, 41567, 41887, 41652)

Add fluentd monitoring to fluentd-gcp image

Right now we are not able to monitor the state of fluentd in the cluster, which may result in the logging subsystem quietly failing. This PR tries to address that problem by introducing fluentd container monitoring:

* fluentd internal metrics, such as the number of buffers and the amount of data in buffers
* `logging_line_count`, the number of lines read by fluentd from application containers' logs
  * Has a `tag` label, corresponding to the fluentd tag of the entry
* `logging_entry_count`, the number of entries emitted to the output plugin
  * With label `component` set to `container`: generated by application containers
  * With label `component` set to `system`: generated by system components like kubelet, docker, scheduler, etc.
  * Has a `tag` label, corresponding to the fluentd tag of the entry

CC @fabxc @igorpeshansky @edsiper
2025-07-31 15:25:57 +00:00 · 2017-02-23 09:36:33 -08:00 · 2017-02-23 09:36:33 -08:00 · bb5fdff58b
commit bb5fdff58b
parent 346a8a778f 8d2d91070a
3 changed files with 110 additions and 41 deletions
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-image/Dockerfile
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-image/Dockerfile
@ -22,7 +22,6 @@
 FROM gcr.io/google_containers/ubuntu-slim:0.6
 # Disable prompts from apt
 ENV DEBIAN_FRONTEND noninteractive
@ -37,6 +36,7 @@ RUN apt-get -qq update && \
    td-agent-gem install --no-document fluent-plugin-systemd -v 0.0.5 && \
    td-agent-gem install --no-document fluent-plugin-google-cloud -v 0.5.6 && \
    td-agent-gem install --no-document fluent-plugin-detect-exceptions -v 0.0.4 && \
    td-agent-gem install --no-document fluent-plugin-prometheus -v 0.2.1 && \
    # Remove build tools
    apt-get remove -y -qq gcc make && \
    apt-get autoremove -y -qq && \
@ -56,5 +56,7 @@ COPY fluent.conf /etc/td-agent/td-agent.conf
 # Copy the entrypoint for the container
 COPY run.sh /run.sh
 EXPOSE 80
 # Start Fluentd to pick up our config that watches Docker container logs.
 CMD /run.sh $FLUENTD_ARGS
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-image/Makefile
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-image/Makefile
@ -26,7 +26,7 @@
 .PHONY:	build push
 PREFIX=gcr.io/google_containers
-TAG = 1.38
+TAG = 1.40
 build:
 	docker build --pull -t $(PREFIX)/fluentd-gcp:$(TAG) .
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-image/fluent.conf
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-image/fluent.conf
@ -75,13 +75,30 @@
 # Detect exceptions in the log output and forward them as one log entry.
 <match raw.kubernetes.**>
-  type detect_exceptions
+  @type copy
-  remove_tag_prefix raw
+
-  message log
+  <store>
-  stream stream
+    @type prometheus
-  multiline_flush_interval 5
+
-  max_bytes 500000
+    <metric>
-  max_lines 1000
+      type counter
      name logging_line_count
      desc Total number of lines generated by application containers
      <labels>
        tag ${tag}
      </labels>
    </metric>
  </store>
  <store>
    @type detect_exceptions
    remove_tag_prefix raw
    message log
    stream stream
    multiline_flush_interval 5
    max_bytes 500000
    max_lines 1000
  </store>
 </match>
 # Example:
@ -283,46 +300,96 @@
  tag kubelet
 </source>
 # Prometheus monitoring
 <source>
  @type prometheus
  port 80
 </source>
 <source>
  @type prometheus_monitor
 </source>
 <match fluent.**>
  @type null
 </match>
 # We use 2 output stanzas - one to handle the container logs and one to handle
 # the node daemon logs, the latter of which explicitly sends its logs to the
 # compute.googleapis.com service rather than container.googleapis.com to keep
 # them separate since most users don't care about the node logs.
 <match kubernetes.**>
-  type google_cloud
+  @type copy
-  # Set the buffer type to file to improve the reliability and reduce the memory consumption
+
-  buffer_type file
+  <store>
-  buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer
+    @type google_cloud
-  # Set queue_full action to block because we want to pause gracefully
+
-  # in case of the off-the-limits load instead of throwing an exception
+    # Set the buffer type to file to improve the reliability and reduce the memory consumption
-  buffer_queue_full_action block
+    buffer_type file
-  # Set the chunk limit conservatively to avoid exceeding the GCL limit
+    buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer
-  # of 10MiB per write request.
+    # Set queue_full action to block because we want to pause gracefully
-  buffer_chunk_limit 2M
+    # in case of the off-the-limits load instead of throwing an exception
-  # Cap the combined memory usage of this buffer and the one below to
+    buffer_queue_full_action block
-  # 2MiB/chunk * (6 + 2) chunks = 16 MiB
+    # Set the chunk limit conservatively to avoid exceeding the GCL limit
-  buffer_queue_limit 6
+    # of 10MiB per write request.
-  # Never wait more than 5 seconds before flushing logs in the non-error case.
+    buffer_chunk_limit 2M
-  flush_interval 5s
+    # Cap the combined memory usage of this buffer and the one below to
-  # Never wait longer than 30 seconds between retries.
+    # 2MiB/chunk * (6 + 2) chunks = 16 MiB
-  max_retry_wait 30
+    buffer_queue_limit 6
-  # Disable the limit on the number of retries (retry forever).
+    # Never wait more than 5 seconds before flushing logs in the non-error case.
-  disable_retry_limit
+    flush_interval 5s
-  # Use multiple threads for processing.
+    # Never wait longer than 30 seconds between retries.
-  num_threads 2
+    max_retry_wait 30
    # Disable the limit on the number of retries (retry forever).
    disable_retry_limit
    # Use multiple threads for processing.
    num_threads 2
  </store>
  <store>
    @type prometheus
    <metric>
      type counter
      name logging_entry_count
      desc Total number of log entries generated by application containers
      <labels>
        tag ${tag}
        component container
      </labels>
    </metric>
  </store>
 </match>
 # Keep a smaller buffer here since these logs are less important than the user's
 # container logs.
 <match **>
-  type google_cloud
+  @type copy
-  detect_subservice false
+
-  buffer_type file
+  <store>
-  buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer
+    @type google_cloud
-  buffer_queue_full_action block
+
-  buffer_chunk_limit 2M
+    detect_subservice false
-  buffer_queue_limit 2
+    buffer_type file
-  flush_interval 5s
+    buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer
-  max_retry_wait 30
+    buffer_queue_full_action block
-  disable_retry_limit
+    buffer_chunk_limit 2M
-  num_threads 2
+    buffer_queue_limit 2
    flush_interval 5s
    max_retry_wait 30
    disable_retry_limit
    num_threads 2
  </store>
  <store>
    @type prometheus
    <metric>
      type counter
      name logging_entry_count
      desc Total number of log entries generated by system components
      <labels>
        tag ${tag}
        component system
      </labels>
    </metric>
  </store>
 </match>