Cleanup fluentd-gcp image, rebase on debian-base

2025-07-24 12:15:52 +00:00 · 2017-02-23 11:29:51 -08:00 · 2017-02-23 11:29:51 -08:00 · 4b4c3e4944
commit 4b4c3e4944
parent 17375fc59f
6 changed files with 60 additions and 428 deletions
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-image/Dockerfile
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-image/Dockerfile
@ -20,43 +20,39 @@
 # scope and that the Logging API has been enabled for the project
 # in the Google Developer Console.

-FROM gcr.io/google_containers/ubuntu-slim:0.6
+FROM gcr.io/google-containers/debian-base-amd64:0.1

-# Disable prompts from apt
-ENV DEBIAN_FRONTEND noninteractive
+COPY Gemfile /Gemfile

-# Install build tools
-RUN apt-get -qq update && \
-    apt-get install -y -qq curl ca-certificates gcc make bash sudo && \
-    apt-get install -y -qq --reinstall lsb-base lsb-release && \
-    # Install logging agent and required gems
-    /usr/bin/curl -sSL https://toolbelt.treasuredata.com/sh/install-ubuntu-xenial-td-agent2.sh | sh && \
-    sed -i -e "s/USER=td-agent/USER=root/" -e "s/GROUP=td-agent/GROUP=root/" /etc/init.d/td-agent && \
-    td-agent-gem install --no-document fluent-plugin-record-reformer -v 0.8.2 && \
-    td-agent-gem install --no-document fluent-plugin-systemd -v 0.0.5 && \
-    td-agent-gem install --no-document fluent-plugin-google-cloud -v 0.5.6 && \
-    td-agent-gem install --no-document fluent-plugin-detect-exceptions -v 0.0.4 && \
-    td-agent-gem install --no-document fluent-plugin-prometheus -v 0.2.1 && \
-    # Remove build tools
-    apt-get remove -y -qq gcc make && \
-    apt-get autoremove -y -qq && \
-    apt-get clean -qq && \
-    # Remove unnecessary files
-    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
-        /opt/td-agent/embedded/share/doc \
-        /opt/td-agent/embedded/share/gtk-doc \
-        /opt/td-agent/embedded/lib/postgresql \
-        /opt/td-agent/embedded/bin/postgres \
-        /opt/td-agent/embedded/share/postgresql \
-        /etc/td-agent/td-agent.conf
+# 1. Install & configure dependencies.
+# 2. Install fluentd via ruby.
+# 3. Remove build dependencies.
+# 4. Cleanup leftover caches & files.
+RUN BUILD_DEPS="make gcc g++ libc-dev ruby-dev" \
+    && clean-install $BUILD_DEPS \
+                     ca-certificates \
+                     libjemalloc1 \
+                     liblz4-1 \
+                     ruby \
+    && echo 'gem: --no-document' >> /etc/gemrc \
+    && gem install --file Gemfile \
+    && apt-get purge -y --auto-remove \
+                     -o APT::AutoRemove::RecommendsImportant=false \
+                     $BUILD_DEPS \
+    && rm -rf /tmp/* \
+              /var/lib/apt/lists/* \
+              /usr/lib/ruby/gems/*/cache/*.gem \
+              /var/log/* \
+              /var/tmp/*

 # Copy the Fluentd configuration file for logging Docker container logs.
-COPY fluent.conf /etc/td-agent/td-agent.conf
-
-# Copy the entrypoint for the container
+COPY fluent.conf /etc/fluent/fluent.conf
 COPY run.sh /run.sh

+# Expose prometheus metrics.
 EXPOSE 80

+ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.1
+
 # Start Fluentd to pick up our config that watches Docker container logs.
 CMD /run.sh $FLUENTD_ARGS
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-image/Gemfile
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-image/Gemfile
@ -0,0 +1,9 @@
+source 'https://rubygems.org'
+
+gem 'fluentd', '~>0.12.32'
+gem 'fluent-plugin-record-reformer', '~>0.8.3'
+gem 'fluent-plugin-systemd', '~>0.0.7'
+gem 'fluent-plugin-google-cloud', '~>0.5.6'
+gem 'fluent-plugin-detect-exceptions', '~>0.0.5'
+gem 'fluent-plugin-prometheus', '~>0.2.1'
+gem 'oj', '~>2.18.1'
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-image/Makefile
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-image/Makefile
@ -25,8 +25,8 @@

 .PHONY:	build push

-PREFIX=gcr.io/google_containers
-TAG = 1.40
+PREFIX=gcr.io/google-containers
+TAG = 2.0

 build:
 	docker build --pull -t $(PREFIX)/fluentd-gcp:$(TAG) .
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-image/README.md
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-image/README.md
@ -6,6 +6,20 @@ This image is designed to be used as part of the [Kubernetes](https://github.com
 cluster bring up process. The image resides at DockerHub under the name
 [kubernetes/fluentd-gcp](https://registry.hub.docker.com/u/kubernetes/fluentd-gcp/).

+# Usage
+
+The image is built with its own set of plugins which you can later use
+in the configuration. The set of plugin is enumerated in a Gemfile in the
+image's directory. You can find details about fluentd configuration on the
+[official site](http://docs.fluentd.org/articles/config-file).
+
+In order to configure fluentd image, you should mount a directory with `.conf`
+files to `/etc/fluent/config.d` or add files to that directory by building
+a new image on top. All `.conf` files in the `/etc/fluent/config.d` directory
+will be included to the final fluentd configuration.
+
+Command line arguments to the fluentd executable are passed
+via environment variable `FLUENTD_ARGS`.


 [![Analytics](https://kubernetes-site.appspot.com/UA-36037335-10/GitHub/cluster/addons/fluentd-gcp/fluentd-gcp-image/README.md?pixel)]()
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-image/fluent.conf
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-image/fluent.conf
@ -1,395 +1,8 @@
-# This configuration file for Fluentd / td-agent is used
-# to watch changes to Docker log files that live in the
-# directory /var/lib/docker/containers/ and are symbolically
-# linked to from the /var/log directory using names that capture the
-# pod name and container name. These logs are then submitted to
-# Google Cloud Logging which assumes the installation of the cloud-logging plug-in.
-#
-# Example
-# =======
-# A line in the Docker log file might like like this JSON:
-#
-# {"log":"2014/09/25 21:15:03 Got request with path wombat\n",
-#  "stream":"stderr",
-#   "time":"2014-09-25T21:15:03.499185026Z"}
-#
-# The record reformer is used to write the tag to focus on the pod name
-# and the Kubernetes container name. For example a Docker container's logs
-# might be in the directory:
-#  /var/lib/docker/containers/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b
-# and in the file:
-#  997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b-json.log
-# where 997599971ee6... is the Docker ID of the running container.
-# The Kubernetes kubelet makes a symbolic link to this file on the host machine
-# in the /var/log/containers directory which includes the pod name and the Kubernetes
-# container name:
-#    synthetic-logger-0.25lps-pod_default-synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log 
-#    -> 
-#    /var/lib/docker/containers/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b-json.log
-# The /var/log directory on the host is mapped to the /var/log directory in the container
-# running this instance of Fluentd and we end up collecting the file:
-#   /var/log/containers/synthetic-logger-0.25lps-pod_default-synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
-# This results in the tag:
-#  var.log.containers.synthetic-logger-0.25lps-pod_default-synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
-# The record reformer is used is discard the var.log.containers prefix and
-# the Docker container ID suffix and "kubernetes." is pre-pended giving the
-# final tag which is ingested into Elasticsearch:
-#   kubernetes.synthetic-logger-0.25lps-pod_default-synth-lgr
-# This makes it easier for users to search for logs by pod name or by
-# the name of the Kubernetes container regardless of how many times the
-# Kubernetes pod has been restarted (resulting in a several Docker container IDs).
+# This is the root config file, which only includes components of the actual configuration

-# Prevent fluentd from handling records containing its own logs. Otherwise
-# it can lead to an infinite loop, when error in sending one message generates
-# another message which also fails to be sent and so on.
+# Do not collect fluentd's own logs to avoid infinite loops.
 <match fluent.**>
  type null
 </match>

-# Example:
-# {"log":"[info:2016-02-16T16:04:05.930-08:00] Some log text here\n","stream":"stdout","time":"2016-02-17T00:04:05.931087621Z"}
-<source>
-  type tail
-  format json
-  time_key time
-  path /var/log/containers/*.log
-  pos_file /var/log/gcp-containers.log.pos
-  time_format %Y-%m-%dT%H:%M:%S.%N%Z
-  tag reform.*
-  read_from_head true
-</source>
-
-<filter reform.**>
-  type parser
-  format /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<log>.*)/
-  reserve_data true
-  suppress_parse_error_log true
-  key_name log
-</filter>
-
-<match reform.**>
-  type record_reformer
-  enable_ruby true
-  tag raw.kubernetes.${tag_suffix[4].split('-')[0..-2].join('-')}
-</match>
-
-# Detect exceptions in the log output and forward them as one log entry.
-<match raw.kubernetes.**>
-  @type copy
-
-  <store>
-    @type prometheus
-
-    <metric>
-      type counter
-      name logging_line_count
-      desc Total number of lines generated by application containers
-      <labels>
-        tag ${tag}
-      </labels>
-    </metric>
-  </store>
-  <store>
-    @type detect_exceptions
-
-    remove_tag_prefix raw
-    message log
-    stream stream
-    multiline_flush_interval 5
-    max_bytes 500000
-    max_lines 1000
-  </store>
-</match>
-
-# Example:
-# 2015-12-21 23:17:22,066 [salt.state       ][INFO    ] Completed state [net.ipv4.ip_forward] at time 23:17:22.066081
-<source>
-  type tail
-  format /^(?<time>[^ ]* [^ ,]*)[^\[]*\[[^\]]*\]\[(?<severity>[^ \]]*) *\] (?<message>.*)$/
-  time_format %Y-%m-%d %H:%M:%S
-  path /var/log/salt/minion
-  pos_file /var/log/gcp-salt.pos
-  tag salt
-</source>
-
-# Example:
-# Dec 21 23:17:22 gke-foo-1-1-4b5cbd14-node-4eoj startupscript: Finished running startup script /var/run/google.startup.script
-<source>
-  type tail
-  format syslog
-  path /var/log/startupscript.log
-  pos_file /var/log/gcp-startupscript.log.pos
-  tag startupscript
-</source>
-
-# Examples:
-# time="2016-02-04T06:51:03.053580605Z" level=info msg="GET /containers/json"
-# time="2016-02-04T07:53:57.505612354Z" level=error msg="HTTP Error" err="No such image: -f" statusCode=404
-<source>
-  type tail
-  format /^time="(?<time>[^)]*)" level=(?<severity>[^ ]*) msg="(?<message>[^"]*)"( err="(?<error>[^"]*)")?( statusCode=($<status_code>\d+))?/
-  path /var/log/docker.log
-  pos_file /var/log/gcp-docker.log.pos
-  tag docker
-</source>
-
-# Example:
-# 2016/02/04 06:52:38 filePurge: successfully removed file /var/etcd/data/member/wal/00000000000006d0-00000000010a23d1.wal
-<source>
-  type tail
-  # Not parsing this, because it doesn't have anything particularly useful to
-  # parse out of it (like severities).
-  format none
-  path /var/log/etcd.log
-  pos_file /var/log/gcp-etcd.log.pos
-  tag etcd
-</source>
-
-# Multi-line parsing is required for all the kube logs because very large log
-# statements, such as those that include entire object bodies, get split into
-# multiple lines by glog.
-
-# Example:
-# I0204 07:32:30.020537    3368 server.go:1048] POST /stats/container/: (13.972191ms) 200 [[Go-http-client/1.1] 10.244.1.3:40537]
-<source>
-  type tail
-  format multiline
-  multiline_flush_interval 5s
-  format_firstline /^\w\d{4}/
-  format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
-  time_format %m%d %H:%M:%S.%N
-  path /var/log/kubelet.log
-  pos_file /var/log/gcp-kubelet.log.pos
-  tag kubelet
-</source>
-
-# Example:
-# I1118 21:26:53.975789       6 proxier.go:1096] Port "nodePort for kube-system/default-http-backend:http" (:31429/tcp) was open before and is still needed
-<source>
-  type tail
-  format multiline
-  multiline_flush_interval 5s
-  format_firstline /^\w\d{4}/
-  format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
-  time_format %m%d %H:%M:%S.%N
-  path /var/log/kube-proxy.log
-  pos_file /var/log/gcp-kube-proxy.log.pos
-  tag kube-proxy
-</source>
-
-# Example:
-# I0204 07:00:19.604280       5 handlers.go:131] GET /api/v1/nodes: (1.624207ms) 200 [[kube-controller-manager/v1.1.3 (linux/amd64) kubernetes/6a81b50] 127.0.0.1:38266]
-<source>
-  type tail
-  format multiline
-  multiline_flush_interval 5s
-  format_firstline /^\w\d{4}/
-  format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
-  time_format %m%d %H:%M:%S.%N
-  path /var/log/kube-apiserver.log
-  pos_file /var/log/gcp-kube-apiserver.log.pos
-  tag kube-apiserver
-</source>
-
-# Example:
-# 2017-02-09T00:15:57.992775796Z AUDIT: id="90c73c7c-97d6-4b65-9461-f94606ff825f" ip="104.132.1.72" method="GET" user="kubecfg" as="<self>" asgroups="<lookup>" namespace="default" uri="/api/v1/namespaces/default/pods"
-# 2017-02-09T00:15:57.993528822Z AUDIT: id="90c73c7c-97d6-4b65-9461-f94606ff825f" response="200"
-<source>
-  type tail
-  format multiline
-  multiline_flush_interval 5s
-  format_firstline /^\S+\s+AUDIT:/
-  # Fields must be explicitly captured by name to be parsed into the record.
-  # Fields may not always be present, and order may change, so this just looks
-  # for a list of key="\"quoted\" value" pairs separated by spaces.
-  # Unknown fields are ignored.
-  # Note: We can't separate query/response lines as format1/format2 because
-  #       they don't always come one after the other for a given query.
-  # TODO: Maybe add a JSON output mode to audit log so we can get rid of this?
-  format1 /^(?<time>\S+) AUDIT:(?: (?:id="(?<id>(?:[^"\\]|\\.)*)"|ip="(?<ip>(?:[^"\\]|\\.)*)"|method="(?<method>(?:[^"\\]|\\.)*)"|user="(?<user>(?:[^"\\]|\\.)*)"|groups="(?<groups>(?:[^"\\]|\\.)*)"|as="(?<as>(?:[^"\\]|\\.)*)"|asgroups="(?<asgroups>(?:[^"\\]|\\.)*)"|namespace="(?<namespace>(?:[^"\\]|\\.)*)"|uri="(?<uri>(?:[^"\\]|\\.)*)"|response="(?<response>(?:[^"\\]|\\.)*)"|\w+="(?:[^"\\]|\\.)*"))*/
-  time_format %FT%T.%L%Z
-  path /var/log/kube-apiserver-audit.log
-  pos_file /var/log/gcp-kube-apiserver-audit.log.pos
-  tag kube-apiserver-audit
-</source>
-
-# Example:
-# I0204 06:55:31.872680       5 servicecontroller.go:277] LB already exists and doesn't need update for service kube-system/kube-ui
-<source>
-  type tail
-  format multiline
-  multiline_flush_interval 5s
-  format_firstline /^\w\d{4}/
-  format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
-  time_format %m%d %H:%M:%S.%N
-  path /var/log/kube-controller-manager.log
-  pos_file /var/log/gcp-kube-controller-manager.log.pos
-  tag kube-controller-manager
-</source>
-
-# Example:
-# W0204 06:49:18.239674       7 reflector.go:245] pkg/scheduler/factory/factory.go:193: watch of *api.Service ended with: 401: The event in requested index is outdated and cleared (the requested history has been cleared [2578313/2577886]) [2579312]
-<source>
-  type tail
-  format multiline
-  multiline_flush_interval 5s
-  format_firstline /^\w\d{4}/
-  format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
-  time_format %m%d %H:%M:%S.%N
-  path /var/log/kube-scheduler.log
-  pos_file /var/log/gcp-kube-scheduler.log.pos
-  tag kube-scheduler
-</source>
-
-# Example:
-# I1104 10:36:20.242766       5 rescheduler.go:73] Running Rescheduler
-<source>
-  type tail
-  format multiline
-  multiline_flush_interval 5s
-  format_firstline /^\w\d{4}/
-  format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
-  time_format %m%d %H:%M:%S.%N
-  path /var/log/rescheduler.log
-  pos_file /var/log/gcp-rescheduler.log.pos
-  tag rescheduler
-</source>
-
-# Example:
-# I0603 15:31:05.793605       6 cluster_manager.go:230] Reading config from path /etc/gce.conf
-<source>
-  type tail
-  format multiline
-  multiline_flush_interval 5s
-  format_firstline /^\w\d{4}/
-  format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
-  time_format %m%d %H:%M:%S.%N
-  path /var/log/glbc.log
-  pos_file /var/log/gcp-glbc.log.pos
-  tag glbc
-</source>
-
-# Example:
-# I0603 15:31:05.793605       6 cluster_manager.go:230] Reading config from path /etc/gce.conf
-<source>
-  type tail
-  format multiline
-  multiline_flush_interval 5s
-  format_firstline /^\w\d{4}/
-  format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
-  time_format %m%d %H:%M:%S.%N
-  path /var/log/cluster-autoscaler.log
-  pos_file /var/log/gcp-cluster-autoscaler.log.pos
-  tag cluster-autoscaler
-</source>
-
-# Logs from systemd-journal for interesting services.
-<source>
-  type systemd
-  filters [{ "_SYSTEMD_UNIT": "docker.service" }]
-  pos_file /var/log/gcp-journald-docker.pos
-  read_from_head true
-  tag docker
-</source>
-
-<source>
-  type systemd
-  filters [{ "_SYSTEMD_UNIT": "kubelet.service" }]
-  pos_file /var/log/gcp-journald-kubelet.pos
-  read_from_head true
-  tag kubelet
-</source>
-
-# Prometheus monitoring
-<source>
-  @type prometheus
-  port 80
-</source>
-
-<source>
-  @type prometheus_monitor
-</source>
-
-<match fluent.**>
-  @type null
-</match>
-
-# We use 2 output stanzas - one to handle the container logs and one to handle
-# the node daemon logs, the latter of which explicitly sends its logs to the
-# compute.googleapis.com service rather than container.googleapis.com to keep
-# them separate since most users don't care about the node logs.
-<match kubernetes.**>
-  @type copy
-
-  <store>
-    @type google_cloud
-
-    # Set the buffer type to file to improve the reliability and reduce the memory consumption
-    buffer_type file
-    buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer
-    # Set queue_full action to block because we want to pause gracefully
-    # in case of the off-the-limits load instead of throwing an exception
-    buffer_queue_full_action block
-    # Set the chunk limit conservatively to avoid exceeding the GCL limit
-    # of 10MiB per write request.
-    buffer_chunk_limit 2M
-    # Cap the combined memory usage of this buffer and the one below to
-    # 2MiB/chunk * (6 + 2) chunks = 16 MiB
-    buffer_queue_limit 6
-    # Never wait more than 5 seconds before flushing logs in the non-error case.
-    flush_interval 5s
-    # Never wait longer than 30 seconds between retries.
-    max_retry_wait 30
-    # Disable the limit on the number of retries (retry forever).
-    disable_retry_limit
-    # Use multiple threads for processing.
-    num_threads 2
-  </store>
-  <store>
-    @type prometheus
-
-    <metric>
-      type counter
-      name logging_entry_count
-      desc Total number of log entries generated by application containers
-      <labels>
-        tag ${tag}
-        component container
-      </labels>
-    </metric>
-  </store>
-</match>
-
-# Keep a smaller buffer here since these logs are less important than the user's
-# container logs.
-<match **>
-  @type copy
-
-  <store>
-    @type google_cloud
-
-    detect_subservice false
-    buffer_type file
-    buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer
-    buffer_queue_full_action block
-    buffer_chunk_limit 2M
-    buffer_queue_limit 2
-    flush_interval 5s
-    max_retry_wait 30
-    disable_retry_limit
-    num_threads 2
-  </store>
-  <store>
-    @type prometheus
-
-    <metric>
-      type counter
-      name logging_entry_count
-      desc Total number of log entries generated by system components
-      <labels>
-        tag ${tag}
-        component system
-      </labels>
-    </metric>
-  </store>
-</match>
+@include /etc/fluent/config.d/*.conf
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-image/run.sh
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-image/run.sh
@ -14,16 +14,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+# These steps must be executed once the host /var and /lib volumes have
+# been mounted, and therefore cannot be done in the docker build stage.
+
 # For systems without journald
 mkdir -p /var/log/journal

-if [ ! -z "`ls /host/lib/libsystemd* 2>/dev/null`" ]
-then
+# Copy host libsystemd into image to avoid compatibility issues.
+if [ ! -z "$(ls /host/lib/libsystemd* 2>/dev/null)" ]; then
  rm /lib/x86_64-linux-gnu/libsystemd*
  cp -a /host/lib/libsystemd* /lib/x86_64-linux-gnu/
 fi

-LD_PRELOAD=/opt/td-agent/embedded/lib/libjemalloc.so
-RUBY_GC_HEAP_OLDOBJECT_LIMIT_FACTOR=0.9
-
-/usr/sbin/td-agent $@
+/usr/local/bin/fluentd $@