A large set of improvements to the Stackdriver components.

Metadata Agent Improvements:
- Bump metadata agent version to 0.2-0.0.21-1.
- Expand the metadata agent's access to all API groups.
- Remove metadata agent config maps in favor of command line flags.
- Update the metadata agent's liveness probe to a new /healthz handler.

Logging Agent Improvements:
- Bump logging agent version to 0.2-1.5.33-1-k8s-1.
- Appropriately set log severity for k8s_container.
- Fix detect exceptions plugin to analyze message field instead of log field.
- Fix detect exceptions plugin to analyze streams based on local resource id.
- Disable the metadata agent for monitored resource construction in logging.
- Disable timestamp adjustment in logs to optimize performance.
- Reduce logging agent buffer chunk limit to 512k to optimize performance.
2025-09-06 03:33:26 +00:00 · 2018-07-22 11:10:54 -04:00
parent fda2b024d2
commit 32c2bfadfd
7 changed files with 47 additions and 73 deletions
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml
@@ -98,6 +98,8 @@ data:
        # instead of jsonPayload after extracting 'time', 'severity' and
        # 'stream' from the record.
        message ${record['log']}
+        # If 'severity' is not set, assume stderr is ERROR and stdout is INFO.
+        severity ${record['severity'] || if record['stream'] == 'stderr' then 'ERROR' else 'INFO' end}
      </record>
      tag ${if record['stream'] == 'stderr' then 'raw.stderr' else 'raw.stdout' end}
      remove_keys stream,log
@@ -109,7 +111,7 @@ data:

      remove_tag_prefix raw
      message message
-      stream stream
+      stream "logging.googleapis.com/local_resource_id"
      multiline_flush_interval 5
      max_bytes 500000
      max_lines 1000
@@ -408,9 +410,9 @@ data:
      buffer_queue_full_action block
      # Set the chunk limit conservatively to avoid exceeding the recommended
      # chunk size of 5MB per write request.
-      buffer_chunk_limit 1M
+      buffer_chunk_limit 512k
      # Cap the combined memory usage of this buffer and the one below to
-      # 1MiB/chunk * (6 + 2) chunks = 8 MiB
+      # 512KiB/chunk * (6 + 2) chunks = 4 MiB
      buffer_queue_limit 6
      # Never wait more than 5 seconds before flushing logs in the non-error case.
      flush_interval 5s
@@ -421,8 +423,9 @@ data:
      # Use multiple threads for processing.
      num_threads 2
      use_grpc true
-      # Use Metadata Agent to get monitored resource.
-      enable_metadata_agent true
+      # Skip timestamp adjustment as this is in a controlled environment with
+      # known timestamp format. This helps with CPU usage.
+      adjust_invalid_timestamps false
    </match>

    # Attach local_resource_id for 'k8s_node' monitored resource.
@@ -450,15 +453,16 @@ data:
      buffer_type file
      buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer
      buffer_queue_full_action block
-      buffer_chunk_limit 1M
+      buffer_chunk_limit 512k
      buffer_queue_limit 2
      flush_interval 5s
      max_retry_wait 30
      disable_retry_limit
      num_threads 2
      use_grpc true
-      # Use Metadata Agent to get monitored resource.
-      enable_metadata_agent true
+      # Skip timestamp adjustment as this is in a controlled environment with
+      # known timestamp format. This helps with CPU usage.
+      adjust_invalid_timestamps false
    </match>
 metadata:
  name: fluentd-gcp-config-v1.2.5
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml
@@ -1,13 +1,13 @@
 apiVersion: extensions/v1beta1
 kind: DaemonSet
 metadata:
-  name: fluentd-gcp-v3.0.0
+  name: fluentd-gcp-{{ fluentd_gcp_yaml_version }}
  namespace: kube-system
  labels:
    k8s-app: fluentd-gcp
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
-    version: v3.0.0
+    version: {{ fluentd_gcp_yaml_version }}
 spec:
  updateStrategy:
    type: RollingUpdate
@@ -16,7 +16,7 @@ spec:
      labels:
        k8s-app: fluentd-gcp
        kubernetes.io/cluster-service: "true"
-        version: v3.0.0
+        version: {{ fluentd_gcp_yaml_version }}
      # This annotation ensures that fluentd does not get evicted if the node
      # supports critical pod annotation based priority scheme.
      # Note that this does not guarantee admission on the nodes (#40573).