Merge pull request #41567 from Crassirostris/fluentd-gcp-monitoring

Automatic merge from submit-queue (batch tested with PRs 39855, 41433, 41567, 41887, 41652)

Add fluentd monitoring to fluentd-gcp image

Right now we are not able to monitor the state of fluentd in cluster, which may result in logging subsystem quietly failing. This PR tries to address that problem by introducing the fluentd container monitoring:

* fluentd internal metrics, like number of buffers and number of data in buffers
* `logging_line_count`, number of lines, read by fluentd from application containers' logs
    * Has `tag` label, corresponding to the fluentd tag of the entry
* `logging_entry_count`, number of entries, emitted to the output plugin
    * With label `component` set to `container`, generated by application containers
    * With label `component` set to `system`, generated by system components like kubelet, docker, scheduler, etc.
    * Has `tag` label, corresponding to the fluentd tag of the entry

CC @fabxc @igorpeshansky @edsiper
This commit is contained in:
Kubernetes Submit Queue 2017-02-23 09:36:33 -08:00 committed by GitHub
commit bb5fdff58b
3 changed files with 110 additions and 41 deletions

View File

@ -22,7 +22,6 @@
FROM gcr.io/google_containers/ubuntu-slim:0.6
# Disable prompts from apt
ENV DEBIAN_FRONTEND noninteractive
@ -37,6 +36,7 @@ RUN apt-get -qq update && \
td-agent-gem install --no-document fluent-plugin-systemd -v 0.0.5 && \
td-agent-gem install --no-document fluent-plugin-google-cloud -v 0.5.6 && \
td-agent-gem install --no-document fluent-plugin-detect-exceptions -v 0.0.4 && \
td-agent-gem install --no-document fluent-plugin-prometheus -v 0.2.1 && \
# Remove build tools
apt-get remove -y -qq gcc make && \
apt-get autoremove -y -qq && \
@ -56,5 +56,7 @@ COPY fluent.conf /etc/td-agent/td-agent.conf
# Copy the entrypoint for the container
COPY run.sh /run.sh
EXPOSE 80
# Start Fluentd to pick up our config that watches Docker container logs.
CMD /run.sh $FLUENTD_ARGS

View File

@ -26,7 +26,7 @@
.PHONY: build push
PREFIX=gcr.io/google_containers
TAG = 1.38
TAG = 1.40
build:
docker build --pull -t $(PREFIX)/fluentd-gcp:$(TAG) .

View File

@ -75,13 +75,30 @@
# Detect exceptions in the log output and forward them as one log entry.
<match raw.kubernetes.**>
type detect_exceptions
remove_tag_prefix raw
message log
stream stream
multiline_flush_interval 5
max_bytes 500000
max_lines 1000
@type copy
<store>
@type prometheus
<metric>
type counter
name logging_line_count
desc Total number of lines generated by application containers
<labels>
tag ${tag}
</labels>
</metric>
</store>
<store>
@type detect_exceptions
remove_tag_prefix raw
message log
stream stream
multiline_flush_interval 5
max_bytes 500000
max_lines 1000
</store>
</match>
# Example:
@ -283,46 +300,96 @@
tag kubelet
</source>
# Prometheus monitoring
<source>
@type prometheus
port 80
</source>
<source>
@type prometheus_monitor
</source>
<match fluent.**>
@type null
</match>
# We use 2 output stanzas - one to handle the container logs and one to handle
# the node daemon logs, the latter of which explicitly sends its logs to the
# compute.googleapis.com service rather than container.googleapis.com to keep
# them separate since most users don't care about the node logs.
<match kubernetes.**>
type google_cloud
# Set the buffer type to file to improve the reliability and reduce the memory consumption
buffer_type file
buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer
# Set queue_full action to block because we want to pause gracefully
# in case of the off-the-limits load instead of throwing an exception
buffer_queue_full_action block
# Set the chunk limit conservatively to avoid exceeding the GCL limit
# of 10MiB per write request.
buffer_chunk_limit 2M
# Cap the combined memory usage of this buffer and the one below to
# 2MiB/chunk * (6 + 2) chunks = 16 MiB
buffer_queue_limit 6
# Never wait more than 5 seconds before flushing logs in the non-error case.
flush_interval 5s
# Never wait longer than 30 seconds between retries.
max_retry_wait 30
# Disable the limit on the number of retries (retry forever).
disable_retry_limit
# Use multiple threads for processing.
num_threads 2
@type copy
<store>
@type google_cloud
# Set the buffer type to file to improve the reliability and reduce the memory consumption
buffer_type file
buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer
# Set queue_full action to block because we want to pause gracefully
# in case of the off-the-limits load instead of throwing an exception
buffer_queue_full_action block
# Set the chunk limit conservatively to avoid exceeding the GCL limit
# of 10MiB per write request.
buffer_chunk_limit 2M
# Cap the combined memory usage of this buffer and the one below to
# 2MiB/chunk * (6 + 2) chunks = 16 MiB
buffer_queue_limit 6
# Never wait more than 5 seconds before flushing logs in the non-error case.
flush_interval 5s
# Never wait longer than 30 seconds between retries.
max_retry_wait 30
# Disable the limit on the number of retries (retry forever).
disable_retry_limit
# Use multiple threads for processing.
num_threads 2
</store>
<store>
@type prometheus
<metric>
type counter
name logging_entry_count
desc Total number of log entries generated by application containers
<labels>
tag ${tag}
component container
</labels>
</metric>
</store>
</match>
# Keep a smaller buffer here since these logs are less important than the user's
# container logs.
<match **>
type google_cloud
detect_subservice false
buffer_type file
buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer
buffer_queue_full_action block
buffer_chunk_limit 2M
buffer_queue_limit 2
flush_interval 5s
max_retry_wait 30
disable_retry_limit
num_threads 2
@type copy
<store>
@type google_cloud
detect_subservice false
buffer_type file
buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer
buffer_queue_full_action block
buffer_chunk_limit 2M
buffer_queue_limit 2
flush_interval 5s
max_retry_wait 30
disable_retry_limit
num_threads 2
</store>
<store>
@type prometheus
<metric>
type counter
name logging_entry_count
desc Total number of log entries generated by system components
<labels>
tag ${tag}
component system
</labels>
</metric>
</store>
</match>