Add fluentd monitoring to fluentd-gcp image

This commit is contained in:
Mik Vyatskov 2017-02-16 17:04:13 +01:00
parent 8ecc256e88
commit 8d2d91070a
3 changed files with 110 additions and 41 deletions

View File

@@ -22,7 +22,6 @@
FROM gcr.io/google_containers/ubuntu-slim:0.6 FROM gcr.io/google_containers/ubuntu-slim:0.6
# Disable prompts from apt # Disable prompts from apt
ENV DEBIAN_FRONTEND noninteractive ENV DEBIAN_FRONTEND noninteractive
@@ -37,6 +36,7 @@ RUN apt-get -qq update && \
td-agent-gem install --no-document fluent-plugin-systemd -v 0.0.5 && \ td-agent-gem install --no-document fluent-plugin-systemd -v 0.0.5 && \
td-agent-gem install --no-document fluent-plugin-google-cloud -v 0.5.2 && \ td-agent-gem install --no-document fluent-plugin-google-cloud -v 0.5.2 && \
td-agent-gem install --no-document fluent-plugin-detect-exceptions -v 0.0.4 && \ td-agent-gem install --no-document fluent-plugin-detect-exceptions -v 0.0.4 && \
td-agent-gem install --no-document fluent-plugin-prometheus -v 0.2.1 && \
# Remove build tools # Remove build tools
apt-get remove -y -qq gcc make && \ apt-get remove -y -qq gcc make && \
apt-get autoremove -y -qq && \ apt-get autoremove -y -qq && \
@@ -56,5 +56,7 @@ COPY fluent.conf /etc/td-agent/td-agent.conf
# Copy the entrypoint for the container # Copy the entrypoint for the container
COPY run.sh /run.sh COPY run.sh /run.sh
EXPOSE 80
# Start Fluentd to pick up our config that watches Docker container logs. # Start Fluentd to pick up our config that watches Docker container logs.
CMD /run.sh $FLUENTD_ARGS CMD /run.sh $FLUENTD_ARGS

View File

@@ -26,7 +26,7 @@
.PHONY: build push .PHONY: build push
PREFIX=gcr.io/google_containers PREFIX=gcr.io/google_containers
TAG = 1.38 TAG = 1.40
build: build:
docker build --pull -t $(PREFIX)/fluentd-gcp:$(TAG) . docker build --pull -t $(PREFIX)/fluentd-gcp:$(TAG) .

View File

@@ -75,13 +75,30 @@
# Detect exceptions in the log output and forward them as one log entry. # Detect exceptions in the log output and forward them as one log entry.
<match raw.kubernetes.**> <match raw.kubernetes.**>
type detect_exceptions @type copy
remove_tag_prefix raw
message log <store>
stream stream @type prometheus
multiline_flush_interval 5
max_bytes 500000 <metric>
max_lines 1000 type counter
name logging_line_count
desc Total number of lines generated by application containers
<labels>
tag ${tag}
</labels>
</metric>
</store>
<store>
@type detect_exceptions
remove_tag_prefix raw
message log
stream stream
multiline_flush_interval 5
max_bytes 500000
max_lines 1000
</store>
</match> </match>
# Example: # Example:
@@ -283,46 +300,96 @@
tag kubelet tag kubelet
</source> </source>
# Prometheus monitoring
<source>
@type prometheus
port 80
</source>
<source>
@type prometheus_monitor
</source>
<match fluent.**>
@type null
</match>
# We use 2 output stanzas - one to handle the container logs and one to handle # We use 2 output stanzas - one to handle the container logs and one to handle
# the node daemon logs, the latter of which explicitly sends its logs to the # the node daemon logs, the latter of which explicitly sends its logs to the
# compute.googleapis.com service rather than container.googleapis.com to keep # compute.googleapis.com service rather than container.googleapis.com to keep
# them separate since most users don't care about the node logs. # them separate since most users don't care about the node logs.
<match kubernetes.**> <match kubernetes.**>
type google_cloud @type copy
# Set the buffer type to file to improve the reliability and reduce the memory consumption
buffer_type file <store>
buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer @type google_cloud
# Set queue_full action to block because we want to pause gracefully
# in case of the off-the-limits load instead of throwing an exception # Set the buffer type to file to improve the reliability and reduce the memory consumption
buffer_queue_full_action block buffer_type file
# Set the chunk limit conservatively to avoid exceeding the GCL limit buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer
# of 10MiB per write request. # Set queue_full action to block because we want to pause gracefully
buffer_chunk_limit 2M # in case of the off-the-limits load instead of throwing an exception
# Cap the combined memory usage of this buffer and the one below to buffer_queue_full_action block
# 2MiB/chunk * (6 + 2) chunks = 16 MiB # Set the chunk limit conservatively to avoid exceeding the GCL limit
buffer_queue_limit 6 # of 10MiB per write request.
# Never wait more than 5 seconds before flushing logs in the non-error case. buffer_chunk_limit 2M
flush_interval 5s # Cap the combined memory usage of this buffer and the one below to
# Never wait longer than 30 seconds between retries. # 2MiB/chunk * (6 + 2) chunks = 16 MiB
max_retry_wait 30 buffer_queue_limit 6
# Disable the limit on the number of retries (retry forever). # Never wait more than 5 seconds before flushing logs in the non-error case.
disable_retry_limit flush_interval 5s
# Use multiple threads for processing. # Never wait longer than 30 seconds between retries.
num_threads 2 max_retry_wait 30
# Disable the limit on the number of retries (retry forever).
disable_retry_limit
# Use multiple threads for processing.
num_threads 2
</store>
<store>
@type prometheus
<metric>
type counter
name logging_entry_count
desc Total number of log entries generated by application containers
<labels>
tag ${tag}
component container
</labels>
</metric>
</store>
</match> </match>
# Keep a smaller buffer here since these logs are less important than the user's # Keep a smaller buffer here since these logs are less important than the user's
# container logs. # container logs.
<match **> <match **>
type google_cloud @type copy
detect_subservice false
buffer_type file <store>
buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer @type google_cloud
buffer_queue_full_action block
buffer_chunk_limit 2M detect_subservice false
buffer_queue_limit 2 buffer_type file
flush_interval 5s buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer
max_retry_wait 30 buffer_queue_full_action block
disable_retry_limit buffer_chunk_limit 2M
num_threads 2 buffer_queue_limit 2
flush_interval 5s
max_retry_wait 30
disable_retry_limit
num_threads 2
</store>
<store>
@type prometheus
<metric>
type counter
name logging_entry_count
desc Total number of log entries generated by system components
<labels>
tag ${tag}
component system
</labels>
</metric>
</store>
</match> </match>