diff --git a/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml b/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml index 25a884b5435..e345d593f4c 100644 --- a/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml +++ b/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml @@ -98,6 +98,8 @@ data: # instead of jsonPayload after extracting 'time', 'severity' and # 'stream' from the record. message ${record['log']} + # If 'severity' is not set, assume stderr is ERROR and stdout is INFO. + severity ${record['severity'] || if record['stream'] == 'stderr' then 'ERROR' else 'INFO' end} tag ${if record['stream'] == 'stderr' then 'raw.stderr' else 'raw.stdout' end} remove_keys stream,log @@ -109,7 +111,7 @@ data: remove_tag_prefix raw message message - stream stream + stream "logging.googleapis.com/local_resource_id" multiline_flush_interval 5 max_bytes 500000 max_lines 1000 @@ -408,9 +410,9 @@ data: buffer_queue_full_action block # Set the chunk limit conservatively to avoid exceeding the recommended # chunk size of 5MB per write request. - buffer_chunk_limit 1M + buffer_chunk_limit 512k # Cap the combined memory usage of this buffer and the one below to - # 1MiB/chunk * (6 + 2) chunks = 8 MiB + # 512KiB/chunk * (6 + 2) chunks = 4 MiB buffer_queue_limit 6 # Never wait more than 5 seconds before flushing logs in the non-error case. flush_interval 5s @@ -421,8 +423,9 @@ data: # Use multiple threads for processing. num_threads 2 use_grpc true - # Use Metadata Agent to get monitored resource. - enable_metadata_agent true + # Skip timestamp adjustment as this is in a controlled environment with + # known timestamp format. This helps with CPU usage. + adjust_invalid_timestamps false # Attach local_resource_id for 'k8s_node' monitored resource. 
@@ -450,15 +453,16 @@ data: buffer_type file buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer buffer_queue_full_action block - buffer_chunk_limit 1M + buffer_chunk_limit 512k buffer_queue_limit 2 flush_interval 5s max_retry_wait 30 disable_retry_limit num_threads 2 use_grpc true - # Use Metadata Agent to get monitored resource. - enable_metadata_agent true + # Skip timestamp adjustment as this is in a controlled environment with + # known timestamp format. This helps with CPU usage. + adjust_invalid_timestamps false metadata: name: fluentd-gcp-config-v1.2.5 diff --git a/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml b/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml index b801c291a58..3a82519e3d1 100644 --- a/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml +++ b/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml @@ -1,13 +1,13 @@ apiVersion: extensions/v1beta1 kind: DaemonSet metadata: - name: fluentd-gcp-v3.0.0 + name: fluentd-gcp-{{ fluentd_gcp_yaml_version }} namespace: kube-system labels: k8s-app: fluentd-gcp kubernetes.io/cluster-service: "true" addonmanager.kubernetes.io/mode: Reconcile - version: v3.0.0 + version: {{ fluentd_gcp_yaml_version }} spec: updateStrategy: type: RollingUpdate @@ -16,7 +16,7 @@ spec: labels: k8s-app: fluentd-gcp kubernetes.io/cluster-service: "true" - version: v3.0.0 + version: {{ fluentd_gcp_yaml_version }} # This annotation ensures that fluentd does not get evicted if the node # supports critical pod annotation based priority scheme. # Note that this does not guarantee admission on the nodes (#40573). 
diff --git a/cluster/addons/metadata-agent/stackdriver/metadata-agent-rbac.yaml b/cluster/addons/metadata-agent/stackdriver/metadata-agent-rbac.yaml index dfcada4d585..532b4b82133 100644 --- a/cluster/addons/metadata-agent/stackdriver/metadata-agent-rbac.yaml +++ b/cluster/addons/metadata-agent/stackdriver/metadata-agent-rbac.yaml @@ -7,9 +7,7 @@ metadata: addonmanager.kubernetes.io/mode: Reconcile rules: - apiGroups: - - "" - - "apps" - - "extensions" + - "*" resources: - "*" verbs: diff --git a/cluster/addons/metadata-agent/stackdriver/metadata-agent.yaml b/cluster/addons/metadata-agent/stackdriver/metadata-agent.yaml index 0d9ea98b6f7..d02362cb037 100644 --- a/cluster/addons/metadata-agent/stackdriver/metadata-agent.yaml +++ b/cluster/addons/metadata-agent/stackdriver/metadata-agent.yaml @@ -7,22 +7,6 @@ metadata: kubernetes.io/cluster-service: "true" addonmanager.kubernetes.io/mode: Reconcile --- -apiVersion: v1 -kind: ConfigMap -metadata: - name: metadata-agent-config - namespace: kube-system - labels: - kubernetes.io/cluster-service: "true" - addonmanager.kubernetes.io/mode: Reconcile -data: - node_level.conf: |- - KubernetesUseWatch: true - KubernetesClusterLevelMetadata: false - cluster_level.conf: |- - KubernetesUseWatch: true - KubernetesClusterLevelMetadata: true ---- kind: DaemonSet apiVersion: extensions/v1beta1 metadata: @@ -45,27 +29,22 @@ spec: spec: serviceAccountName: metadata-agent containers: - - image: gcr.io/stackdriver-agents/stackdriver-metadata-agent:0.2-0.0.19-1 + - image: gcr.io/stackdriver-agents/stackdriver-metadata-agent:0.2-0.0.21-1 imagePullPolicy: IfNotPresent name: metadata-agent livenessProbe: - exec: - command: - - /bin/bash - - -c - - | - if [[ -f /var/run/metadata-agent/health/unhealthy ]]; then - exit 1; - fi - periodSeconds: 10 + httpGet: + path: /healthz + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 60 + timeoutSeconds: 5 failureThreshold: 1 successThreshold: 1 - volumeMounts: - - name: metadata-agent-config-volume 
- mountPath: /etc/config - command: - - /opt/stackdriver/metadata/sbin/metadatad - - --config-file=/etc/config/node_level.conf + args: + - -o KubernetesUseWatch=true + - -o KubernetesClusterLevelMetadata=false + - -o MetadataReporterPurgeDeleted=true ports: - containerPort: 8000 hostPort: 8799 @@ -78,10 +57,6 @@ spec: restartPolicy: Always schedulerName: default-scheduler terminationGracePeriodSeconds: 30 - volumes: - - name: metadata-agent-config-volume - configMap: - name: metadata-agent-config updateStrategy: rollingUpdate: maxUnavailable: 1 @@ -110,27 +85,22 @@ spec: spec: serviceAccountName: metadata-agent containers: - - image: gcr.io/stackdriver-agents/stackdriver-metadata-agent:0.2-0.0.19-1 + - image: gcr.io/stackdriver-agents/stackdriver-metadata-agent:0.2-0.0.21-1 imagePullPolicy: IfNotPresent name: metadata-agent livenessProbe: - exec: - command: - - /bin/bash - - -c - - | - if [[ -f /var/run/metadata-agent/health/unhealthy ]]; then - exit 1; - fi - periodSeconds: 10 + httpGet: + path: /healthz + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 60 + timeoutSeconds: 5 failureThreshold: 1 successThreshold: 1 - volumeMounts: - - name: metadata-agent-config-volume - mountPath: /etc/config - command: - - /opt/stackdriver/metadata/sbin/metadatad - - --config-file=/etc/config/cluster_level.conf + args: + - -o KubernetesUseWatch=true + - -o KubernetesClusterLevelMetadata=true + - -o MetadataReporterPurgeDeleted=true ports: - containerPort: 8000 protocol: TCP @@ -142,10 +112,6 @@ spec: restartPolicy: Always schedulerName: default-scheduler terminationGracePeriodSeconds: 30 - volumes: - - name: metadata-agent-config-volume - configMap: - name: metadata-agent-config strategy: rollingUpdate: maxUnavailable: 1 diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh index fbfe95bb7be..74bc1f2f5ff 100755 --- a/cluster/gce/config-default.sh +++ b/cluster/gce/config-default.sh @@ -404,6 +404,8 @@ if [[ -n "${LOGROTATE_MAX_SIZE:-}" ]]; then fi # 
Fluentd requirements +# FLUENTD_GCP_YAML_VERSION versions the fluentd-gcp YAML; changing it triggers a configuration refresh. +FLUENTD_GCP_YAML_VERSION="v3.1.0" FLUENTD_GCP_VERSION="${FLUENTD_GCP_VERSION:-0.2-1.5.30-1-k8s}" FLUENTD_GCP_MEMORY_LIMIT="${FLUENTD_GCP_MEMORY_LIMIT:-}" FLUENTD_GCP_CPU_REQUEST="${FLUENTD_GCP_CPU_REQUEST:-}" @@ -422,7 +424,7 @@ CUSTOM_KUBE_DASHBOARD_BANNER="${CUSTOM_KUBE_DASHBOARD_BANNER:-}" LOGGING_STACKDRIVER_RESOURCE_TYPES="${LOGGING_STACKDRIVER_RESOURCE_TYPES:-old}" # Adding to PROVIDER_VARS, since this is GCP-specific. -PROVIDER_VARS="${PROVIDER_VARS:-} FLUENTD_GCP_VERSION FLUENTD_GCP_MEMORY_LIMIT FLUENTD_GCP_CPU_REQUEST FLUENTD_GCP_MEMORY_REQUEST HEAPSTER_GCP_BASE_MEMORY HEAPSTER_GCP_MEMORY_PER_NODE HEAPSTER_GCP_BASE_CPU HEAPSTER_GCP_CPU_PER_NODE CUSTOM_KUBE_DASHBOARD_BANNER LOGGING_STACKDRIVER_RESOURCE_TYPES" +PROVIDER_VARS="${PROVIDER_VARS:-} FLUENTD_GCP_YAML_VERSION FLUENTD_GCP_VERSION FLUENTD_GCP_MEMORY_LIMIT FLUENTD_GCP_CPU_REQUEST FLUENTD_GCP_MEMORY_REQUEST HEAPSTER_GCP_BASE_MEMORY HEAPSTER_GCP_MEMORY_PER_NODE HEAPSTER_GCP_BASE_CPU HEAPSTER_GCP_CPU_PER_NODE CUSTOM_KUBE_DASHBOARD_BANNER LOGGING_STACKDRIVER_RESOURCE_TYPES" # Fluentd configuration for node-journal ENABLE_NODE_JOURNAL="${ENABLE_NODE_JOURNAL:-false}" diff --git a/cluster/gce/config-test.sh b/cluster/gce/config-test.sh index a4a0f058973..d48177a70c0 100755 --- a/cluster/gce/config-test.sh +++ b/cluster/gce/config-test.sh @@ -420,6 +420,8 @@ if [[ -n "${LOGROTATE_MAX_SIZE:-}" ]]; then fi # Fluentd requirements +# FLUENTD_GCP_YAML_VERSION versions the fluentd-gcp YAML; changing it triggers a configuration refresh. 
+FLUENTD_GCP_YAML_VERSION="v3.1.0" FLUENTD_GCP_VERSION="${FLUENTD_GCP_VERSION:-0.2-1.5.30-1-k8s}" FLUENTD_GCP_MEMORY_LIMIT="${FLUENTD_GCP_MEMORY_LIMIT:-}" FLUENTD_GCP_CPU_REQUEST="${FLUENTD_GCP_CPU_REQUEST:-}" @@ -438,7 +440,7 @@ CUSTOM_KUBE_DASHBOARD_BANNER="${CUSTOM_KUBE_DASHBOARD_BANNER:-}" LOGGING_STACKDRIVER_RESOURCE_TYPES="${LOGGING_STACKDRIVER_RESOURCE_TYPES:-old}" # Adding to PROVIDER_VARS, since this is GCP-specific. -PROVIDER_VARS="${PROVIDER_VARS:-} FLUENTD_GCP_VERSION FLUENTD_GCP_MEMORY_LIMIT FLUENTD_GCP_CPU_REQUEST FLUENTD_GCP_MEMORY_REQUEST HEAPSTER_GCP_BASE_MEMORY HEAPSTER_GCP_MEMORY_PER_NODE HEAPSTER_GCP_BASE_CPU HEAPSTER_GCP_CPU_PER_NODE CUSTOM_KUBE_DASHBOARD_BANNER LOGGING_STACKDRIVER_RESOURCE_TYPES" +PROVIDER_VARS="${PROVIDER_VARS:-} FLUENTD_GCP_YAML_VERSION FLUENTD_GCP_VERSION FLUENTD_GCP_MEMORY_LIMIT FLUENTD_GCP_CPU_REQUEST FLUENTD_GCP_MEMORY_REQUEST HEAPSTER_GCP_BASE_MEMORY HEAPSTER_GCP_MEMORY_PER_NODE HEAPSTER_GCP_BASE_CPU HEAPSTER_GCP_CPU_PER_NODE CUSTOM_KUBE_DASHBOARD_BANNER LOGGING_STACKDRIVER_RESOURCE_TYPES" # Fluentd configuration for node-journal ENABLE_NODE_JOURNAL="${ENABLE_NODE_JOURNAL:-false}" diff --git a/cluster/gce/gci/configure-helper.sh b/cluster/gce/gci/configure-helper.sh index 1330b3edfdd..6f8a4549c76 100644 --- a/cluster/gce/gci/configure-helper.sh +++ b/cluster/gce/gci/configure-helper.sh @@ -2237,7 +2237,9 @@ function setup-fluentd { fluentd_gcp_configmap_name="fluentd-gcp-config-old" fi sed -i -e "s@{{ fluentd_gcp_configmap_name }}@${fluentd_gcp_configmap_name}@g" "${fluentd_gcp_yaml}" - fluentd_gcp_version="${FLUENTD_GCP_VERSION:-0.2-1.5.30-1-k8s}" + fluentd_gcp_yaml_version="${FLUENTD_GCP_YAML_VERSION:-v3.1.0}" + sed -i -e "s@{{ fluentd_gcp_yaml_version }}@${fluentd_gcp_yaml_version}@g" "${fluentd_gcp_yaml}" + fluentd_gcp_version="${FLUENTD_GCP_VERSION:-0.3-1.5.34-1-k8s-1}" sed -i -e "s@{{ fluentd_gcp_version }}@${fluentd_gcp_version}@g" "${fluentd_gcp_yaml}" update-prometheus-to-sd-parameters ${fluentd_gcp_yaml} 
start-fluentd-resource-update ${fluentd_gcp_yaml}