From bbe7b2ebed1a58ea2d18c526a95001702c162c6a Mon Sep 17 00:00:00 2001 From: Barni S Date: Tue, 11 Aug 2020 19:09:47 -0400 Subject: [PATCH] Use fluentbit as logging agent. Replace stackdriver --- cluster/gce/windows/configure.ps1 | 2 +- cluster/gce/windows/k8s-node-setup.psm1 | 618 +++++++++++++----------- 2 files changed, 345 insertions(+), 275 deletions(-) diff --git a/cluster/gce/windows/configure.ps1 b/cluster/gce/windows/configure.ps1 index 7674efc9846..6cd31b48280 100644 --- a/cluster/gce/windows/configure.ps1 +++ b/cluster/gce/windows/configure.ps1 @@ -148,7 +148,7 @@ try { Configure-GcePdTools Configure-Kubelet - # Even if Stackdriver is already installed, the function will still [re]start the service. + # Even if Logging agent is already installed, the function will still [re]start the service. if (IsLoggingEnabled $kube_env) { Install-LoggingAgent Configure-LoggingAgent diff --git a/cluster/gce/windows/k8s-node-setup.psm1 b/cluster/gce/windows/k8s-node-setup.psm1 index df55317419e..845990521b0 100644 --- a/cluster/gce/windows/k8s-node-setup.psm1 +++ b/cluster/gce/windows/k8s-node-setup.psm1 @@ -1529,349 +1529,419 @@ function Start_Containerd { Log-Output "Starting containerd service" Start-Service containerd } - -# TODO(pjh): move the Stackdriver logging agent code below into a separate +# TODO(pjh): move the logging agent code below into a separate # module; it was put here temporarily to avoid disrupting the file layout in # the K8s release machinery. -$STACKDRIVER_VERSION = 'v1-11' -$STACKDRIVER_ROOT = 'C:\Program Files (x86)\Stackdriver' +$LOGGINGAGENT_VERSION = '1.6.0' +$LOGGINGAGENT_ROOT = 'C:\fluent-bit' +$LOGGINGAGENT_SERVICE = 'fluent-bit' +$LOGGINGAGENT_CMDLINE = '*fluent-bit.exe*' -# Restarts the Stackdriver logging agent, or starts it if it is not currently -# running. A standard `Restart-Service StackdriverLogging` may fail because -# StackdriverLogging sometimes is unstoppable, so this function works around it -# by killing the processes. 
+$LOGGINGEXPORTER_VERSION = 'v0.10.3' +$LOGGINGEXPORTER_ROOT = 'C:\flb-exporter' +$LOGGINGEXPORTER_SERVICE = 'flb-exporter' +$LOGGINGEXPORTER_CMDLINE = '*flb-exporter.exe*' + +# Restarts the logging exporter and logging agent services, or starts them if they are not currently running. function Restart-LoggingAgent { - Stop-Service -NoWait -ErrorAction Ignore StackdriverLogging + Restart-LogService $LOGGINGEXPORTER_SERVICE $LOGGINGEXPORTER_CMDLINE + Restart-LogService $LOGGINGAGENT_SERVICE $LOGGINGAGENT_CMDLINE +} + +# Restarts the service $service, or starts it if it is not currently +# running. A standard `Restart-Service` may fail because the process is +# sometimes unstoppable, so if the service is not stopped after ~10 seconds +# this force-kills the processes whose command line is -like $cmdline. +function Restart-LogService([string]$service, [string]$cmdline) { + Stop-Service -NoWait -ErrorAction Ignore $service # Wait (if necessary) for service to stop. $timeout = 10 - $stopped = (Get-service StackdriverLogging).Status -eq 'Stopped' + $stopped = (Get-service $service).Status -eq 'Stopped' for ($i = 0; $i -lt $timeout -and !($stopped); $i++) { Start-Sleep 1 - $stopped = (Get-service StackdriverLogging).Status -eq 'Stopped' + $stopped = (Get-service $service).Status -eq 'Stopped' } - if ((Get-service StackdriverLogging).Status -ne 'Stopped') { + if ((Get-service $service).Status -ne 'Stopped') { # Force kill the processes. Stop-Process -Force -PassThru -Id (Get-WmiObject win32_process | - Where CommandLine -Like '*Stackdriver/logging*').ProcessId + Where CommandLine -Like $cmdline).ProcessId # Wait until process has stopped. 
$waited = 0 $log_period = 10 $timeout = 60 - while ((Get-service StackdriverLogging).Status -ne 'Stopped' -and $waited -lt $timeout) { + while ((Get-service $service).Status -ne 'Stopped' -and $waited -lt $timeout) { Start-Sleep 1 $waited++ if ($waited % $log_period -eq 0) { - Log-Output "Waiting for StackdriverLogging service to stop" + Log-Output "Waiting for ${service} service to stop" } } # Timeout occurred if ($waited -ge $timeout) { - Throw ("Timeout while waiting for StackdriverLogging service to stop") + Throw ("Timeout while waiting for ${service} service to stop") } } - Start-Service StackdriverLogging + Start-Service $service } # Check whether the logging agent is installed by whether it's registered as service function IsLoggingAgentInstalled { - $stackdriver_status = (Get-Service StackdriverLogging -ErrorAction Ignore).Status - return -not [string]::IsNullOrEmpty($stackdriver_status) + $logging_status = (Get-Service $LOGGINGAGENT_SERVICE -ErrorAction Ignore).Status + return -not [string]::IsNullOrEmpty($logging_status) } -# Clean up the logging agent's registry key and root folder if they exist from a prior installation. -# Try to uninstall it first, if it failed, remove the registry key at least, -# as the registry key will block the silent installation later on. 
-function Cleanup-LoggingAgent { - # For 64 bits app, the registry path is 'HKLM:\SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall' - # for 32 bits app, it's 'HKLM:\SOFTWARE\Wow6432Node\Microsoft\Windows\CurrentVersion\Uninstall' - # StackdriverLogging is installed as 32 bits app - $x32_app_reg = 'HKLM:\SOFTWARE\Wow6432Node\Microsoft\Windows\CurrentVersion\Uninstall' - $uninstall_string = (Get-ChildItem $x32_app_reg | Get-ItemProperty | Where-Object {$_.DisplayName -match "Stackdriver"}).UninstallString - if (-not [string]::IsNullOrEmpty($uninstall_string)) { - try { - Start-Process -FilePath "$uninstall_string" -ArgumentList "/S" -Wait - } catch { - Log-Output "Exception happens during uninstall logging agent, so remove the registry key at least" - Remove-Item -Path "$x32_app_reg\GoogleStackdriverLoggingAgent\" - } - } - - # If we chose reboot after uninstallation, the root folder would be clean. - # But since we couldn't reboot, so some files & folders would be left there, - # which could block the re-installation later on, so clean it up - if(Test-Path $STACKDRIVER_ROOT){ - Remove-Item -Force -Recurse $STACKDRIVER_ROOT - } -} - -# Installs the Stackdriver logging agent according to -# https://cloud.google.com/logging/docs/agent/installation. -# TODO(yujuhong): Update to a newer Stackdriver agent once it is released to -# support kubernetes metadata properly. The current version does not recognizes -# the local resource key "logging.googleapis.com/local_resource_id", and fails -# to label namespace, pod and container names on the logs. +# Installs the Fluent Bit logging agent according to https://docs.fluentbit.io/manual/installation/windows +# and also installs the Fluent Bit Stackdriver exporter. Skipped when the agent service is already registered. function Install-LoggingAgent { - # Remove the existing storage.json file if it exists. This is a workaround - # for the bug where the logging agent cannot start up if the file is - # corrupted. 
- Remove-Item ` - -Force ` - -ErrorAction Ignore ` - ("$STACKDRIVER_ROOT\LoggingAgent\Main\pos\winevtlog.pos\worker0\" + - "storage.json") - if (IsLoggingAgentInstalled) { - # Note: we should reinstall the Stackdriver agent if $REDO_STEPS is true + # Note: we should reinstall the agent if $REDO_STEPS is true # here, but we don't know how to run the installer without it prompting - # when Stackdriver is already installed. We dumped the strings in the + # when the logging agent is already installed. We dumped the strings in the # installer binary and searched for flags to do this but found nothing. Oh # well. - Log-Output ("Skip: Stackdriver logging agent is already installed") + Log-Output ("Skip: Fluentbit logging agent is already installed") return } - - # After a crash, the StackdriverLogging service could be missing, but its files will still be present - Cleanup-LoggingAgent - - $url = ("https://storage.googleapis.com/gke-release/winnode/stackdriver/" + - "StackdriverLogging-${STACKDRIVER_VERSION}.exe") - $tmp_dir = 'C:\stackdriver_tmp' - New-Item $tmp_dir -ItemType 'directory' -Force | Out-Null - $installer_file = "${tmp_dir}\StackdriverLogging-${STACKDRIVER_VERSION}.exe" - MustDownload-File -OutFile $installer_file -URLs $url - - # Start the installer silently. This automatically starts the - # "StackdriverLogging" service. - Log-Output 'Invoking Stackdriver installer' - Start-Process $installer_file -ArgumentList "/S" -Wait - - # Install the record-reformer plugin. - Start-Process "$STACKDRIVER_ROOT\LoggingAgent\Main\bin\fluent-gem" ` - -ArgumentList "install","fluent-plugin-record-reformer" ` - -Wait - - # Install the multi-format-parser plugin. - Start-Process "$STACKDRIVER_ROOT\LoggingAgent\Main\bin\fluent-gem" ` - -ArgumentList "install","fluent-plugin-multi-format-parser" ` - -Wait - - Remove-Item -Force -Recurse $tmp_dir + + DownloadAndInstall-LoggingAgents + Create-LoggingAgentServices } -# Writes the logging configuration file for Stackdriver. 
Restart-LoggingAgent
+function DownloadAndInstall-LoggingAgents {
+  # Install the logging agent (Fluent Bit) from the gke-release bucket if not present
+  if (ShouldWrite-File $LOGGINGAGENT_ROOT\td-agent-bit-${LOGGINGAGENT_VERSION}-win64) {
+    $install_dir = 'C:\flb-installers'
+    $url = ("https://storage.googleapis.com/gke-release/winnode/fluentbit/td-agent-bit-${LOGGINGAGENT_VERSION}-win64.zip")
+
+    Log-Output 'Downloading Logging agent'
+    New-Item $install_dir -ItemType 'directory' -Force | Out-Null
+    MustDownload-File -OutFile $install_dir\td.zip -URLs $url
+
+    cd $install_dir
+    Log-Output 'Extracting Logging agent'
+    Expand-Archive td.zip
+    mv .\td\td-agent-bit-${LOGGINGAGENT_VERSION}-win64\ $LOGGINGAGENT_ROOT
+    Remove-Item -Force -Recurse $install_dir
+  }
+
+  # Download the logging exporter if needed; the URL is double-quoted so ${LOGGINGEXPORTER_VERSION} expands
+  if (ShouldWrite-File $LOGGINGEXPORTER_ROOT\flb-exporter.exe) {
+    Log-Output 'Downloading logging exporter'
+    New-Item $LOGGINGEXPORTER_ROOT -ItemType 'directory' -Force | Out-Null
+    MustDownload-File `
+        -OutFile $LOGGINGEXPORTER_ROOT\flb-exporter.exe `
+        -URLs "https://storage.googleapis.com/gke-release/winnode/fluentbit-exporter/${LOGGINGEXPORTER_VERSION}/flb-exporter-${LOGGINGEXPORTER_VERSION}.exe"
+  }
+}
+
+function Create-LoggingAgentServices {
+  cd $LOGGINGAGENT_ROOT
+
+  Log-Output "Creating service: ${LOGGINGAGENT_SERVICE}"
+  sc.exe create $LOGGINGAGENT_SERVICE binpath= "${LOGGINGAGENT_ROOT}\bin\fluent-bit.exe -c ${LOGGINGAGENT_ROOT}\conf\fluent-bit.conf"
+  sc.exe failure $LOGGINGAGENT_SERVICE reset= 30 actions= restart/5000
+  sc.exe query $LOGGINGAGENT_SERVICE
+
+  Log-Output "Creating service: ${LOGGINGEXPORTER_SERVICE}"
+  sc.exe create $LOGGINGEXPORTER_SERVICE binpath= "${LOGGINGEXPORTER_ROOT}\flb-exporter.exe --kubernetes-separator=_ --stackdriver-resource-model=k8s --enable-pod-label-discovery --logtostderr --winsvc --pod-label-dot-replacement=_"
+  sc.exe failure $LOGGINGEXPORTER_SERVICE reset= 30 actions= restart/5000
+  sc.exe query $LOGGINGEXPORTER_SERVICE
+}
+
+# Writes the logging configuration file for
Logging agent. Restart-LoggingAgent # should then be called to pick up the new configuration. function Configure-LoggingAgent { - $fluentd_config_dir = "$STACKDRIVER_ROOT\LoggingAgent\config.d" - $fluentd_config_file = "$fluentd_config_dir\k8s_containers.conf" + $fluentbit_config_file = "$LOGGINGAGENT_ROOT\conf\fluent-bit.conf" + $FLUENTBIT_CONFIG | Out-File -FilePath $fluentbit_config_file -Encoding ASCII + Log-Output "Wrote logging config to $fluentbit_config_file" - # Create a configuration file for kubernetes containers. - # The config.d directory should have already been created automatically, but - # try creating again just in case. - New-Item $fluentd_config_dir -ItemType 'directory' -Force | Out-Null - - $config = $FLUENTD_CONFIG.replace('NODE_NAME', (hostname)) - $config | Out-File -FilePath $fluentd_config_file -Encoding ASCII - Log-Output "Wrote fluentd logging config to $fluentd_config_file" + $fluentbit_parser_file = "$LOGGINGAGENT_ROOT\conf\parsers.conf" + $PARSERS_CONFIG | Out-File -FilePath $fluentbit_parser_file -Encoding ASCII + Log-Output "Wrote logging config to $fluentbit_parser_file" } -# The NODE_NAME placeholder must be replaced with the node's name (hostname). -$FLUENTD_CONFIG = @' -# This configuration file for Fluentd is used to watch changes to kubernetes -# container logs in the directory /var/lib/docker/containers/ and submit the -# log records to Google Cloud Logging using the cloud-logging plugin. -# -# Example -# ======= -# A line in the Docker log file might look like this JSON: -# -# {"log":"2014/09/25 21:15:03 Got request with path wombat\\n", -# "stream":"stderr", -# "time":"2014-09-25T21:15:03.499185026Z"} -# -# The original tag is derived from the log file's location. 
-# For example a Docker container's logs might be in the directory: -# /var/lib/docker/containers/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b -# and in the file: -# 997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b-json.log -# where 997599971ee6... is the Docker ID of the running container. -# The Kubernetes kubelet makes a symbolic link to this file on the host -# machine in the /var/log/containers directory which includes the pod name, -# the namespace name and the Kubernetes container name: -# synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log -# -> -# /var/lib/docker/containers/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b-json.log -# The /var/log directory on the host is mapped to the /var/log directory in the container -# running this instance of Fluentd and we end up collecting the file: -# /var/log/containers/synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log -# This results in the tag: -# var.log.containers.synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log -# where 'synthetic-logger-0.25lps-pod' is the pod name, 'default' is the -# namespace name, 'synth-lgr' is the container name and '997599971ee6..' is -# the container ID. -# The record reformer is used to extract pod_name, namespace_name and -# container_name from the tag and set them in a local_resource_id in the -# format of: -# 'k8s_container...'. -# The reformer also changes the tags to 'stderr' or 'stdout' based on the -# value of 'stream'. -# local_resource_id is later used by google_cloud plugin to determine the -# monitored resource to ingest logs against. 
+# Fluentbit main config file +$FLUENTBIT_CONFIG = @' +[SERVICE] + Flush 5 + Grace 120 + Log_Level debug + Log_File /var/log/fluentbit.log + Daemon off + Parsers_File parsers.conf + HTTP_Server off + HTTP_Listen 0.0.0.0 + HTTP_PORT 2020 + plugins_file plugins.conf + + # Storage + # ======= + # Fluent Bit can use memory and filesystem buffering based mechanisms + # + # - https://docs.fluentbit.io/manual/administration/buffering-and-storage + # + # storage metrics + # --------------- + # publish storage pipeline metrics in '/api/v1/storage'. The metrics are + # exported only if the 'http_server' option is enabled. + # + # storage.metrics on + + # storage.path + # ------------ + # absolute file system path to store filesystem data buffers (chunks). + # + # storage.path /tmp/storage + + # storage.sync + # ------------ + # configure the synchronization mode used to store the data into the + # filesystem. It can take the values normal or full. + # + # storage.sync normal + + # storage.checksum + # ---------------- + # enable the data integrity check when writing and reading data from the + # filesystem. The storage layer uses the CRC32 algorithm. + # + # storage.checksum off + + # storage.backlog.mem_limit + # ------------------------- + # if storage.path is set, Fluent Bit will look for data chunks that were + # not delivered and are still in the storage layer, these are called + # backlog data. This option configure a hint of maximum value of memory + # to use when processing these records. 
+ # + # storage.backlog.mem_limit 5M + + +[INPUT] + Name winlog + Interval_Sec 2 + # Channels Setup,Windows PowerShell + Channels application,system,security + Tag winevent.raw + DB winlog.sqlite # + # Json Log Example: # {"log":"[info:2016-02-16T16:04:05.930-08:00] Some log text here\n","stream":"stdout","time":"2016-02-17T00:04:05.931087621Z"} -# CRI Log Example: -# 2016-02-17T00:04:05.931087621Z stdout F [info:2016-02-16T16:04:05.930-08:00] Some log text here - - @type tail - path /var/log/containers/*.log - pos_file /var/log/gcp-containers.log.pos - # Tags at this point are in the format of: - # reform.var.log.containers.__-.log - tag reform.* - read_from_head true - - @type multi_format - - format json - time_key time - time_format %Y-%m-%dT%H:%M:%S.%NZ - - - format /^(? - - +[INPUT] + Name tail + Alias kube_containers + Tag kube___ + Tag_Regex (?[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?[^_]+)_(?.+)- + Path /var/log/containers/*.log + Mem_Buf_Limit 5MB + Skip_Long_Lines On + Refresh_Interval 5 + DB flb_kube.db + + # Settings from fluentd missing here. + # tag reform.* + # format json + # time_key time + # time_format %Y-%m-%dT%H:%M:%S.%NZ + # Example: # I0204 07:32:30.020537 3368 server.go:1048] POST /stats/container/: (13.972191ms) 200 [[Go-http-client/1.1] 10.244.1.3:40537] - - @type tail - format multiline - multiline_flush_interval 5s - format_firstline /^\w\d{4}/ - format1 /^(?\w)(?