From 32c2bfadfdd4e74e42f55810460640df4436b5f6 Mon Sep 17 00:00:00 2001 From: Bryan Moyles Date: Sun, 22 Jul 2018 11:10:54 -0400 Subject: [PATCH] A large set of improvements to the Stackdriver components. Metadata Agent Improvements Bump metadata agent version to 0.2-0.0.21-1. Expand the metadata agent's access to all API groups. Remove metadata agent config maps in favor of command line flags. Update the metadata agent's liveness probe to a new /healthz handler. Logging Agent Improvements Bump logging agent version to 0.2-1.5.33-1-k8s-1. Appropriately set log severity for k8s_container. Fix detect exceptions plugin to analyze message field instead of log field. Fix detect exceptions plugin to analyze streams based on local resource id. Disable the metadata agent for monitored resource construction in logging. Disable timestamp adjustment in logs to optimize performance. Reduce logging agent buffer chunk limit to 512k to optimize performance. --- .../fluentd-gcp/fluentd-gcp-configmap.yaml | 20 +++-- .../addons/fluentd-gcp/fluentd-gcp-ds.yaml | 6 +- .../stackdriver/metadata-agent-rbac.yaml | 4 +- .../stackdriver/metadata-agent.yaml | 78 ++++++------------- cluster/gce/config-default.sh | 4 +- cluster/gce/config-test.sh | 4 +- cluster/gce/gci/configure-helper.sh | 4 +- 7 files changed, 47 insertions(+), 73 deletions(-) diff --git a/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml b/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml index 25a884b5435..e345d593f4c 100644 --- a/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml +++ b/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml @@ -98,6 +98,8 @@ data: # instead of jsonPayload after extracting 'time', 'severity' and # 'stream' from the record. message ${record['log']} + # If 'severity' is not set, assume stderr is ERROR and stdout is INFO. 
+ severity ${record['severity'] || if record['stream'] == 'stderr' then 'ERROR' else 'INFO' end} tag ${if record['stream'] == 'stderr' then 'raw.stderr' else 'raw.stdout' end} remove_keys stream,log @@ -109,7 +111,7 @@ data: remove_tag_prefix raw message message - stream stream + stream "logging.googleapis.com/local_resource_id" multiline_flush_interval 5 max_bytes 500000 max_lines 1000 @@ -408,9 +410,9 @@ data: buffer_queue_full_action block # Set the chunk limit conservatively to avoid exceeding the recommended # chunk size of 5MB per write request. - buffer_chunk_limit 1M + buffer_chunk_limit 512k # Cap the combined memory usage of this buffer and the one below to - # 1MiB/chunk * (6 + 2) chunks = 8 MiB + # 512KiB/chunk * (6 + 2) chunks = 4 MiB buffer_queue_limit 6 # Never wait more than 5 seconds before flushing logs in the non-error case. flush_interval 5s @@ -421,8 +423,9 @@ data: # Use multiple threads for processing. num_threads 2 use_grpc true - # Use Metadata Agent to get monitored resource. - enable_metadata_agent true + # Skip timestamp adjustment as this is in a controlled environment with + # known timestamp format. This helps with CPU usage. + adjust_invalid_timestamps false # Attach local_resource_id for 'k8s_node' monitored resource. @@ -450,15 +453,16 @@ data: buffer_type file buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer buffer_queue_full_action block - buffer_chunk_limit 1M + buffer_chunk_limit 512k buffer_queue_limit 2 flush_interval 5s max_retry_wait 30 disable_retry_limit num_threads 2 use_grpc true - # Use Metadata Agent to get monitored resource. - enable_metadata_agent true + # Skip timestamp adjustment as this is in a controlled environment with + # known timestamp format. This helps with CPU usage. 
+ adjust_invalid_timestamps false metadata: name: fluentd-gcp-config-v1.2.5 diff --git a/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml b/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml index b801c291a58..3a82519e3d1 100644 --- a/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml +++ b/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml @@ -1,13 +1,13 @@ apiVersion: extensions/v1beta1 kind: DaemonSet metadata: - name: fluentd-gcp-v3.0.0 + name: fluentd-gcp-{{ fluentd_gcp_yaml_version }} namespace: kube-system labels: k8s-app: fluentd-gcp kubernetes.io/cluster-service: "true" addonmanager.kubernetes.io/mode: Reconcile - version: v3.0.0 + version: {{ fluentd_gcp_yaml_version }} spec: updateStrategy: type: RollingUpdate @@ -16,7 +16,7 @@ spec: labels: k8s-app: fluentd-gcp kubernetes.io/cluster-service: "true" - version: v3.0.0 + version: {{ fluentd_gcp_yaml_version }} # This annotation ensures that fluentd does not get evicted if the node # supports critical pod annotation based priority scheme. # Note that this does not guarantee admission on the nodes (#40573). 
diff --git a/cluster/addons/metadata-agent/stackdriver/metadata-agent-rbac.yaml b/cluster/addons/metadata-agent/stackdriver/metadata-agent-rbac.yaml index dfcada4d585..532b4b82133 100644 --- a/cluster/addons/metadata-agent/stackdriver/metadata-agent-rbac.yaml +++ b/cluster/addons/metadata-agent/stackdriver/metadata-agent-rbac.yaml @@ -7,9 +7,7 @@ metadata: addonmanager.kubernetes.io/mode: Reconcile rules: - apiGroups: - - "" - - "apps" - - "extensions" + - "*" resources: - "*" verbs: diff --git a/cluster/addons/metadata-agent/stackdriver/metadata-agent.yaml b/cluster/addons/metadata-agent/stackdriver/metadata-agent.yaml index 0d9ea98b6f7..d02362cb037 100644 --- a/cluster/addons/metadata-agent/stackdriver/metadata-agent.yaml +++ b/cluster/addons/metadata-agent/stackdriver/metadata-agent.yaml @@ -7,22 +7,6 @@ metadata: kubernetes.io/cluster-service: "true" addonmanager.kubernetes.io/mode: Reconcile --- -apiVersion: v1 -kind: ConfigMap -metadata: - name: metadata-agent-config - namespace: kube-system - labels: - kubernetes.io/cluster-service: "true" - addonmanager.kubernetes.io/mode: Reconcile -data: - node_level.conf: |- - KubernetesUseWatch: true - KubernetesClusterLevelMetadata: false - cluster_level.conf: |- - KubernetesUseWatch: true - KubernetesClusterLevelMetadata: true ---- kind: DaemonSet apiVersion: extensions/v1beta1 metadata: @@ -45,27 +29,22 @@ spec: spec: serviceAccountName: metadata-agent containers: - - image: gcr.io/stackdriver-agents/stackdriver-metadata-agent:0.2-0.0.19-1 + - image: gcr.io/stackdriver-agents/stackdriver-metadata-agent:0.2-0.0.21-1 imagePullPolicy: IfNotPresent name: metadata-agent livenessProbe: - exec: - command: - - /bin/bash - - -c - - | - if [[ -f /var/run/metadata-agent/health/unhealthy ]]; then - exit 1; - fi - periodSeconds: 10 + httpGet: + path: /healthz + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 60 + timeoutSeconds: 5 failureThreshold: 1 successThreshold: 1 - volumeMounts: - - name: metadata-agent-config-volume 
- mountPath: /etc/config - command: - - /opt/stackdriver/metadata/sbin/metadatad - - --config-file=/etc/config/node_level.conf + args: + - -o KubernetesUseWatch=true + - -o KubernetesClusterLevelMetadata=false + - -o MetadataReporterPurgeDeleted=true ports: - containerPort: 8000 hostPort: 8799 @@ -78,10 +57,6 @@ spec: restartPolicy: Always schedulerName: default-scheduler terminationGracePeriodSeconds: 30 - volumes: - - name: metadata-agent-config-volume - configMap: - name: metadata-agent-config updateStrategy: rollingUpdate: maxUnavailable: 1 @@ -110,27 +85,22 @@ spec: spec: serviceAccountName: metadata-agent containers: - - image: gcr.io/stackdriver-agents/stackdriver-metadata-agent:0.2-0.0.19-1 + - image: gcr.io/stackdriver-agents/stackdriver-metadata-agent:0.2-0.0.21-1 imagePullPolicy: IfNotPresent name: metadata-agent livenessProbe: - exec: - command: - - /bin/bash - - -c - - | - if [[ -f /var/run/metadata-agent/health/unhealthy ]]; then - exit 1; - fi - periodSeconds: 10 + httpGet: + path: /healthz + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 60 + timeoutSeconds: 5 failureThreshold: 1 successThreshold: 1 - volumeMounts: - - name: metadata-agent-config-volume - mountPath: /etc/config - command: - - /opt/stackdriver/metadata/sbin/metadatad - - --config-file=/etc/config/cluster_level.conf + args: + - -o KubernetesUseWatch=true + - -o KubernetesClusterLevelMetadata=true + - -o MetadataReporterPurgeDeleted=true ports: - containerPort: 8000 protocol: TCP @@ -142,10 +112,6 @@ spec: restartPolicy: Always schedulerName: default-scheduler terminationGracePeriodSeconds: 30 - volumes: - - name: metadata-agent-config-volume - configMap: - name: metadata-agent-config strategy: rollingUpdate: maxUnavailable: 1 diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh index fbfe95bb7be..74bc1f2f5ff 100755 --- a/cluster/gce/config-default.sh +++ b/cluster/gce/config-default.sh @@ -404,6 +404,8 @@ if [[ -n "${LOGROTATE_MAX_SIZE:-}" ]]; then fi # 
Fluentd requirements +# The YAML version exists to trigger a configuration refresh when changes are made. +FLUENTD_GCP_YAML_VERSION="v3.1.0" FLUENTD_GCP_VERSION="${FLUENTD_GCP_VERSION:-0.2-1.5.30-1-k8s}" FLUENTD_GCP_MEMORY_LIMIT="${FLUENTD_GCP_MEMORY_LIMIT:-}" FLUENTD_GCP_CPU_REQUEST="${FLUENTD_GCP_CPU_REQUEST:-}" @@ -422,7 +424,7 @@ CUSTOM_KUBE_DASHBOARD_BANNER="${CUSTOM_KUBE_DASHBOARD_BANNER:-}" LOGGING_STACKDRIVER_RESOURCE_TYPES="${LOGGING_STACKDRIVER_RESOURCE_TYPES:-old}" # Adding to PROVIDER_VARS, since this is GCP-specific. -PROVIDER_VARS="${PROVIDER_VARS:-} FLUENTD_GCP_VERSION FLUENTD_GCP_MEMORY_LIMIT FLUENTD_GCP_CPU_REQUEST FLUENTD_GCP_MEMORY_REQUEST HEAPSTER_GCP_BASE_MEMORY HEAPSTER_GCP_MEMORY_PER_NODE HEAPSTER_GCP_BASE_CPU HEAPSTER_GCP_CPU_PER_NODE CUSTOM_KUBE_DASHBOARD_BANNER LOGGING_STACKDRIVER_RESOURCE_TYPES" +PROVIDER_VARS="${PROVIDER_VARS:-} FLUENTD_GCP_YAML_VERSION FLUENTD_GCP_VERSION FLUENTD_GCP_MEMORY_LIMIT FLUENTD_GCP_CPU_REQUEST FLUENTD_GCP_MEMORY_REQUEST HEAPSTER_GCP_BASE_MEMORY HEAPSTER_GCP_MEMORY_PER_NODE HEAPSTER_GCP_BASE_CPU HEAPSTER_GCP_CPU_PER_NODE CUSTOM_KUBE_DASHBOARD_BANNER LOGGING_STACKDRIVER_RESOURCE_TYPES" # Fluentd configuration for node-journal ENABLE_NODE_JOURNAL="${ENABLE_NODE_JOURNAL:-false}" diff --git a/cluster/gce/config-test.sh b/cluster/gce/config-test.sh index a4a0f058973..d48177a70c0 100755 --- a/cluster/gce/config-test.sh +++ b/cluster/gce/config-test.sh @@ -420,6 +420,8 @@ if [[ -n "${LOGROTATE_MAX_SIZE:-}" ]]; then fi # Fluentd requirements +# The YAML version exists to trigger a configuration refresh when changes are made.
+FLUENTD_GCP_YAML_VERSION="v3.1.0" FLUENTD_GCP_VERSION="${FLUENTD_GCP_VERSION:-0.2-1.5.30-1-k8s}" FLUENTD_GCP_MEMORY_LIMIT="${FLUENTD_GCP_MEMORY_LIMIT:-}" FLUENTD_GCP_CPU_REQUEST="${FLUENTD_GCP_CPU_REQUEST:-}" @@ -438,7 +440,7 @@ CUSTOM_KUBE_DASHBOARD_BANNER="${CUSTOM_KUBE_DASHBOARD_BANNER:-}" LOGGING_STACKDRIVER_RESOURCE_TYPES="${LOGGING_STACKDRIVER_RESOURCE_TYPES:-old}" # Adding to PROVIDER_VARS, since this is GCP-specific. -PROVIDER_VARS="${PROVIDER_VARS:-} FLUENTD_GCP_VERSION FLUENTD_GCP_MEMORY_LIMIT FLUENTD_GCP_CPU_REQUEST FLUENTD_GCP_MEMORY_REQUEST HEAPSTER_GCP_BASE_MEMORY HEAPSTER_GCP_MEMORY_PER_NODE HEAPSTER_GCP_BASE_CPU HEAPSTER_GCP_CPU_PER_NODE CUSTOM_KUBE_DASHBOARD_BANNER LOGGING_STACKDRIVER_RESOURCE_TYPES" +PROVIDER_VARS="${PROVIDER_VARS:-} FLUENTD_GCP_YAML_VERSION FLUENTD_GCP_VERSION FLUENTD_GCP_MEMORY_LIMIT FLUENTD_GCP_CPU_REQUEST FLUENTD_GCP_MEMORY_REQUEST HEAPSTER_GCP_BASE_MEMORY HEAPSTER_GCP_MEMORY_PER_NODE HEAPSTER_GCP_BASE_CPU HEAPSTER_GCP_CPU_PER_NODE CUSTOM_KUBE_DASHBOARD_BANNER LOGGING_STACKDRIVER_RESOURCE_TYPES" # Fluentd configuration for node-journal ENABLE_NODE_JOURNAL="${ENABLE_NODE_JOURNAL:-false}" diff --git a/cluster/gce/gci/configure-helper.sh b/cluster/gce/gci/configure-helper.sh index 1330b3edfdd..6f8a4549c76 100644 --- a/cluster/gce/gci/configure-helper.sh +++ b/cluster/gce/gci/configure-helper.sh @@ -2237,7 +2237,9 @@ function setup-fluentd { fluentd_gcp_configmap_name="fluentd-gcp-config-old" fi sed -i -e "s@{{ fluentd_gcp_configmap_name }}@${fluentd_gcp_configmap_name}@g" "${fluentd_gcp_yaml}" - fluentd_gcp_version="${FLUENTD_GCP_VERSION:-0.2-1.5.30-1-k8s}" + fluentd_gcp_yaml_version="${FLUENTD_GCP_YAML_VERSION:-v3.1.0}" + sed -i -e "s@{{ fluentd_gcp_yaml_version }}@${fluentd_gcp_yaml_version}@g" "${fluentd_gcp_yaml}" + fluentd_gcp_version="${FLUENTD_GCP_VERSION:-0.3-1.5.34-1-k8s-1}" sed -i -e "s@{{ fluentd_gcp_version }}@${fluentd_gcp_version}@g" "${fluentd_gcp_yaml}" update-prometheus-to-sd-parameters ${fluentd_gcp_yaml} 
start-fluentd-resource-update ${fluentd_gcp_yaml}