feat: initial Prometheus analyzers (#855)

* feat: initial Prometheus analyzers

Added a Prometheus integration with two analyzers:
1. PrometheusConfigValidate
2. PrometheusConfigRelabelReport

The integration does not deploy any Prometheus stack in the cluster.
Instead, it searches the provided --namespace for a Prometheus
configuration, stored in a ConfigMap or Secret. If it finds one, it
unmarshals it into memory and runs the analyzers on it.

PrometheusConfigValidate checks whether the discovered Prometheus
configuration is valid and reports any errors it finds.
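
For example (a minimal sketch, not code from this change), feeding a config
with a malformed scrape_interval to the upstream parser yields exactly the
kind of error this analyzer surfaces, since Prometheus embeds validation in
UnmarshalYAML:

package main

import (
	"fmt"

	promconfig "github.com/prometheus/prometheus/config"
	yaml "gopkg.in/yaml.v2"
)

func main() {
	bad := []byte("scrape_configs:\n- job_name: example\n  scrape_interval: 1x\n")
	var cfg promconfig.Config
	if err := yaml.Unmarshal(bad, &cfg); err != nil {
		// e.g.: not a valid duration string: "1x"
		fmt.Println("validation error:", err)
	}
}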

PrometheusConfigRelabelReport tries to distill the scrape-config
relabeling rules into the concise label set, per job, that targets need
to carry in order to be scraped. This analyzer is unconventional in that
its findings do not necessarily indicate issues with the config. It
merely tries to give a human-readable explanation of the relabel rules
it discovers, leaning on the LLM and prompt.
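
For instance (a hypothetical input, not taken from this change), the report
would reduce a scrape config like the following to the statement that pods
must carry the label app=my-app to be scraped by job kube-pods:

// Hypothetical scrape config the relabel report might summarize.
const exampleScrapeConfig = `
job_name: kube-pods
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_label_app]
  regex: my-app
  action: keep
`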

Tested on both kube-prometheus and Google Managed Prometheus
stacks.

Signed-off-by: Daniel Clark <danielclark@google.com>

* review: feedback cycle 1

Simplify ConfigValidate prompt and add comments.

Signed-off-by: Daniel Clark <danielclark@google.com>

* review: feedback cycle 2

Add Prometheus configuration discovery to integration activate command.

Also improve logging to make this more clear to users.

Signed-off-by: Daniel Clark <danielclark@google.com>

---------

Signed-off-by: Daniel Clark <danielclark@google.com>
Daniel Clark authored 2024-01-12 04:58:09 -05:00; committed via GitHub
commit 45fa827c04 (parent 4106d39c32)
7 changed files with 1810 additions and 8 deletions

go.mod (11 changed lines)

@@ -7,7 +7,8 @@ require (
 github.com/fatih/color v1.16.0
 github.com/magiconair/properties v1.8.7
 github.com/mittwald/go-helm-client v0.12.5
-github.com/sashabaranov/go-openai v1.17.11
+github.com/prometheus/prometheus v1.8.2-0.20211119115433-692a54649ed7
+github.com/sashabaranov/go-openai v1.17.10
 github.com/schollz/progressbar/v3 v3.14.1
 github.com/spf13/cobra v1.8.0
 github.com/spf13/viper v1.18.2
@@ -49,6 +50,7 @@ require (
 github.com/Azure/azure-sdk-for-go/sdk/internal v1.5.1 // indirect
 github.com/AzureAD/microsoft-authentication-library-for-go v1.2.0 // indirect
 github.com/Microsoft/hcsshim v0.11.4 // indirect
+github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 // indirect
 github.com/anchore/go-struct-converter v0.0.0-20230627203149-c72ef8859ca9 // indirect
 github.com/cohere-ai/tokenizer v1.1.1 // indirect
 github.com/containerd/log v0.1.0 // indirect
@@ -56,6 +58,8 @@ require (
 github.com/dlclark/regexp2 v1.10.0 // indirect
 github.com/evanphx/json-patch/v5 v5.7.0 // indirect
 github.com/felixge/httpsnoop v1.0.4 // indirect
+github.com/go-kit/log v0.2.1 // indirect
+github.com/go-logfmt/logfmt v0.5.1 // indirect
 github.com/golang-jwt/jwt/v5 v5.0.0 // indirect
 github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
 github.com/google/gnostic-models v0.6.9-0.20230804172637-c7be7c783f49 // indirect
@@ -63,9 +67,12 @@ require (
 github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect
 github.com/googleapis/gax-go/v2 v2.12.0 // indirect
 github.com/jmespath/go-jmespath v0.4.0 // indirect
+github.com/jpillora/backoff v1.0.0 // indirect
 github.com/kylelemons/godebug v1.1.0 // indirect
 github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect
+github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
 github.com/pkg/browser v0.0.0-20210911075715-681adbf594b8 // indirect
+github.com/prometheus/common/sigv4 v0.1.0 // indirect
 github.com/sagikazarmark/locafero v0.4.0 // indirect
 github.com/sagikazarmark/slog-shim v0.1.0 // indirect
 github.com/sourcegraph/conc v0.3.0 // indirect
@@ -209,7 +216,7 @@ require (
 google.golang.org/protobuf v1.31.0 // indirect
 gopkg.in/inf.v0 v0.9.1 // indirect
 gopkg.in/ini.v1 v1.67.0 // indirect
-gopkg.in/yaml.v2 v2.4.0 // indirect
+gopkg.in/yaml.v2 v2.4.0
 gopkg.in/yaml.v3 v3.0.1 // indirect
 k8s.io/apiextensions-apiserver v0.28.4
 k8s.io/apiserver v0.28.4 // indirect

go.sum (1275 changed lines)

Diff suppressed because it is too large.

pkg/ai/prompts.go

@@ -8,10 +8,52 @@ const (
 `
 trivy_vuln_prompt = "Explain the following trivy scan result and the detail risk or root cause of the CVE ID, then provide a solution. Response in %s: %s"
 trivy_conf_prompt = "Explain the following trivy scan result and the detail risk or root cause of the security check, then provide a solution."
+prom_conf_prompt = `Simplify the following Prometheus error message delimited by triple dashes written in --- %s --- language; --- %s ---.
+This error came when validating the Prometheus configuration file.
+Provide step by step instructions to fix, with suggestions, referencing Prometheus documentation if relevant.
+Write the output in the following format in no more than 300 characters:
+Error: {Explain error here}
+Solution: {Step by step solution here}
+`
+prom_relabel_prompt = `
+Return your prompt in this language: %s, beginning with
+The following is a list of the form:
+job_name:
+{Prometheus job_name}
+relabel_configs:
+{Prometheus relabel_configs}
+kubernetes_sd_configs:
+{Prometheus service discovery config}
+---
+%s
+---
+For each job_name, describe the Kubernetes service and pod labels,
+namespaces, ports, and containers they match.
+Return the message:
+Discovered and parsed Prometheus scrape configurations.
+For targets to be scraped by Prometheus, ensure they are running with
+at least one of the following label sets:
+Then for each job, write this format:
+- Job: {job_name}
+- Service Labels:
+- {list of service labels}
+- Pod Labels:
+- {list of pod labels}
+- Namespaces:
+- {list of namespaces}
+- Ports:
+- {list of ports}
+- Containers:
+- {list of container names}
+`
 )

 var PromptMap = map[string]string{
-"default":             default_prompt,
-"VulnerabilityReport": trivy_vuln_prompt, // for Trivy integration, the key should match `Result.Kind` in pkg/common/types.go
-"ConfigAuditReport":   trivy_conf_prompt,
+"default":                       default_prompt,
+"VulnerabilityReport":           trivy_vuln_prompt, // for Trivy integration, the key should match `Result.Kind` in pkg/common/types.go
+"ConfigAuditReport":             trivy_conf_prompt,
+"PrometheusConfigValidate":      prom_conf_prompt,
+"PrometheusConfigRelabelReport": prom_relabel_prompt,
 }
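
Downstream, the validate prompt is filled with the output language and the
failure text. A rough sketch of the call site (the consumer is not part of
this diff; the variable names here are assumptions):

language := "English"                          // assumed: configured output language
failure := `not a valid duration string: "1x"` // assumed: analyzer failure text
prompt := fmt.Sprintf(PromptMap["PrometheusConfigValidate"], language, failure)
_ = prompt // handed to the configured AI backend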

pkg/integration/integration.go

@@ -18,6 +18,7 @@ import (
 "fmt"
 "github.com/k8sgpt-ai/k8sgpt/pkg/common"
+"github.com/k8sgpt-ai/k8sgpt/pkg/integration/prometheus"
 "github.com/k8sgpt-ai/k8sgpt/pkg/integration/trivy"
 "github.com/k8sgpt-ai/k8sgpt/pkg/util"
 "github.com/spf13/viper"
@@ -44,7 +45,8 @@ type Integration struct {
 }
 var integrations = map[string]IIntegration{
-"trivy": trivy.NewTrivy(),
+"trivy":      trivy.NewTrivy(),
+"prometheus": prometheus.NewPrometheus(),
 }
 func NewIntegration() *Integration {
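
With the map entry in place, the integration can be resolved by name. A small
sketch (assuming IIntegration exposes GetAnalyzerName, which the Prometheus
type below implements):

if i, ok := integrations["prometheus"]; ok {
	// Lists PrometheusConfigValidate and PrometheusConfigRelabelReport.
	fmt.Println(i.GetAnalyzerName())
}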

pkg/integration/prometheus/config_analyzer.go (new file)

@@ -0,0 +1,290 @@
package prometheus
import (
"bytes"
"compress/gzip"
"context"
"errors"
"fmt"
"io"
"net/http"
"path/filepath"
"strings"
"github.com/k8sgpt-ai/k8sgpt/pkg/common"
"github.com/k8sgpt-ai/k8sgpt/pkg/util"
promconfig "github.com/prometheus/prometheus/config"
yaml "gopkg.in/yaml.v2"
corev1 "k8s.io/api/core/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
)
const (
prometheusContainerName = "prometheus"
configReloaderContainerName = "config-reloader"
prometheusConfigFlag = "--config.file="
configReloaderConfigFlag = "--config-file="
)
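// Labels used by common Prometheus deployments (e.g. kube-prometheus and
// Google Managed Prometheus) to identify Prometheus pods.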
var prometheusPodLabels = map[string]string{
"app": "prometheus",
"app.kubernetes.io/name": "prometheus",
}
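// ConfigAnalyzer implements the PrometheusConfigValidate analyzer.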
type ConfigAnalyzer struct {
}
// podConfig groups a specific pod with the Prometheus configuration and any
// other state used for informing the common.Result.
type podConfig struct {
b []byte
pod *corev1.Pod
}
func (c *ConfigAnalyzer) Analyze(a common.Analyzer) ([]common.Result, error) {
ctx := a.Context
client := a.Client.GetClient()
namespace := a.Namespace
kind := ConfigValidate
podConfigs, err := findPrometheusPodConfigs(ctx, client, namespace)
if err != nil {
return nil, err
}
var preAnalysis = map[string]common.PreAnalysis{}
for _, pc := range podConfigs {
var failures []common.Failure
pod := pc.pod
// Check upstream validation.
// The Prometheus configuration structs do not generally have validation
// methods and embed their validation logic in the UnmarshalYAML methods.
config, err := unmarshalPromConfigBytes(pc.b)
if err != nil {
failures = append(failures, common.Failure{
Text: fmt.Sprintf("error validating Prometheus YAML configuration: %s", err),
})
}
_, err = yaml.Marshal(config)
if err != nil {
failures = append(failures, common.Failure{
Text: fmt.Sprintf("error validating Prometheus struct configuration: %s", err),
})
}
// Check for an empty scrape config. Guard against a nil config in case
// unmarshalling failed above.
if config != nil && len(config.ScrapeConfigs) == 0 {
failures = append(failures, common.Failure{
Text: "no scrape configurations. Prometheus will not scrape any metrics.",
})
}
if len(failures) > 0 {
preAnalysis[fmt.Sprintf("%s/%s", pod.Namespace, pod.Name)] = common.PreAnalysis{
Pod: *pod,
FailureDetails: failures,
}
}
}
for key, value := range preAnalysis {
var currentAnalysis = common.Result{
Kind: kind,
Name: key,
Error: value.FailureDetails,
}
parent, _ := util.GetParent(a.Client, value.Pod.ObjectMeta)
currentAnalysis.ParentObject = parent
a.Results = append(a.Results, currentAnalysis)
}
return a.Results, nil
}
func configKey(namespace string, volume *corev1.Volume) (string, error) {
if volume.ConfigMap != nil {
return fmt.Sprintf("configmap/%s/%s", namespace, volume.ConfigMap.Name), nil
} else if volume.Secret != nil {
return fmt.Sprintf("secret/%s/%s", namespace, volume.Secret.SecretName), nil
} else {
return "", errors.New("volume format must be ConfigMap or Secret")
}
}
func findPrometheusPodConfigs(ctx context.Context, client kubernetes.Interface, namespace string) ([]podConfig, error) {
var configs []podConfig
pods, err := findPrometheusPods(ctx, client, namespace)
if err != nil {
return nil, err
}
var configCache = make(map[string]bool)
for _, pod := range pods {
// Extract volume of Prometheus config.
volume, key, err := findPrometheusConfigVolumeAndKey(ctx, client, &pod)
if err != nil {
return nil, err
}
// See if we processed it already; if so, don't process again.
ck, err := configKey(pod.Namespace, volume)
if err != nil {
return nil, err
}
_, ok := configCache[ck]
if ok {
continue
}
configCache[ck] = true
// Extract Prometheus config bytes from volume.
b, err := extractPrometheusConfigFromVolume(ctx, client, volume, pod.Namespace, key)
if err != nil {
return nil, err
}
configs = append(configs, podConfig{
pod: &pod,
b: b,
})
}
return configs, nil
}
func findPrometheusPods(ctx context.Context, client kubernetes.Interface, namespace string) ([]corev1.Pod, error) {
var proms []corev1.Pod
for k, v := range prometheusPodLabels {
pods, err := util.GetPodListByLabels(client, namespace, map[string]string{
k: v,
})
if err != nil {
return nil, err
}
proms = append(proms, pods.Items...)
}
// If we still haven't found any Prometheus pods, make a last-ditch effort to
// scrape the namespace for "prometheus" containers.
if len(proms) == 0 {
pods, err := client.CoreV1().Pods(namespace).List(ctx, v1.ListOptions{})
if err != nil {
return nil, err
}
for _, pod := range pods.Items {
for _, c := range pod.Spec.Containers {
if c.Name == prometheusContainerName {
proms = append(proms, pod)
}
}
}
}
return proms, nil
}
func findPrometheusConfigPath(ctx context.Context, client kubernetes.Interface, pod *corev1.Pod) (string, error) {
var path string
var err error
for _, container := range pod.Spec.Containers {
for _, arg := range container.Args {
// Prefer the config-reloader container's config file, as it normally
// references the ConfigMap or Secret volume mount.
// Fall back to the prometheus container's flag if that's not found.
if strings.HasPrefix(arg, prometheusConfigFlag) {
// TrimPrefix, not TrimLeft: TrimLeft treats the flag as a character set
// and can eat leading characters of the path itself.
path = strings.TrimPrefix(arg, prometheusConfigFlag)
}
if strings.HasPrefix(arg, configReloaderConfigFlag) {
path = strings.TrimPrefix(arg, configReloaderConfigFlag)
}
}
if container.Name == configReloaderContainerName {
return path, nil
}
}
if path == "" {
err = fmt.Errorf("prometheus config path not found in pod: %s", pod.Name)
}
return path, err
}
func findPrometheusConfigVolumeAndKey(ctx context.Context, client kubernetes.Interface, pod *corev1.Pod) (*corev1.Volume, string, error) {
path, err := findPrometheusConfigPath(ctx, client, pod)
if err != nil {
return nil, "", err
}
// Find the volumeMount the config path is pointing to.
var volumeName = ""
for _, container := range pod.Spec.Containers {
for _, vm := range container.VolumeMounts {
if strings.HasPrefix(path, vm.MountPath) {
volumeName = vm.Name
break
}
}
}
// Get the actual Volume from the name.
for _, volume := range pod.Spec.Volumes {
if volume.Name == volumeName {
return &volume, filepath.Base(path), nil
}
}
return nil, "", errors.New("volume for Prometheus config not found")
}
func extractPrometheusConfigFromVolume(ctx context.Context, client kubernetes.Interface, volume *corev1.Volume, namespace, key string) ([]byte, error) {
var b []byte
var ok bool
// Check for Secret volume.
if vs := volume.Secret; vs != nil {
s, err := client.CoreV1().Secrets(namespace).Get(ctx, vs.SecretName, v1.GetOptions{})
if err != nil {
return nil, err
}
b, ok = s.Data[key]
if !ok {
return nil, fmt.Errorf("unable to find file key in secret: %s", key)
}
}
// Check for ConfigMap volume.
if vcm := volume.ConfigMap; vcm != nil {
cm, err := client.CoreV1().ConfigMaps(namespace).Get(ctx, vcm.Name, v1.GetOptions{})
if err != nil {
return nil, err
}
s, ok := cm.Data[key]
if !ok {
return nil, fmt.Errorf("unable to find file key in configmap: %s", key)
}
b = []byte(s)
}
return b, nil
}
func unmarshalPromConfigBytes(b []byte) (*promconfig.Config, error) {
var config promconfig.Config
// Unmarshal the data into a Prometheus config.
if err := yaml.Unmarshal(b, &config); err == nil {
return &config, nil
// If there was an error, try gunzipping the data.
} else if content := http.DetectContentType(b); content == "application/x-gzip" {
r, err := gzip.NewReader(bytes.NewBuffer(b))
if err != nil {
return &config, err
}
gunzipBytes, err := io.ReadAll(r)
if err != nil {
return &config, err
}
err = yaml.Unmarshal(gunzipBytes, &config)
if err != nil {
return nil, err
}
return &config, nil
} else {
return &config, err
}
}
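
A short sketch of the gzip fallback above: the prometheus-operator
(kube-prometheus) stores the generated config gzipped in a Secret, which this
helper transparently decompresses. This assumes the package's own
unmarshalPromConfigBytes:

var buf bytes.Buffer
zw := gzip.NewWriter(&buf)
zw.Write([]byte("global:\n  scrape_interval: 30s\n"))
zw.Close()

// http.DetectContentType recognizes the gzip magic bytes, so the reader path runs.
cfg, err := unmarshalPromConfigBytes(buf.Bytes())
fmt.Println(cfg.GlobalConfig.ScrapeInterval, err) // 30s <nil>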

pkg/integration/prometheus/prometheus.go (new file)

@@ -0,0 +1,105 @@
package prometheus
import (
"context"
"errors"
"fmt"
"os"
"github.com/fatih/color"
"github.com/k8sgpt-ai/k8sgpt/pkg/common"
"github.com/k8sgpt-ai/k8sgpt/pkg/kubernetes"
"github.com/spf13/viper"
)
const (
ConfigValidate = "PrometheusConfigValidate"
ConfigRelabel = "PrometheusConfigRelabelReport"
)
type Prometheus struct {
}
func NewPrometheus() *Prometheus {
return &Prometheus{}
}
func (p *Prometheus) Deploy(namespace string) error {
// This deploys nothing into the cluster; it only verifies that an existing
// Prometheus deployment can be discovered.
color.Green("Activating prometheus integration...")
// TODO(pintohutch): add timeout or inherit an upstream context
// for better signal management.
ctx := context.Background()
kubecontext := viper.GetString("kubecontext")
kubeconfig := viper.GetString("kubeconfig")
client, err := kubernetes.NewClient(kubecontext, kubeconfig)
if err != nil {
color.Red("Error initialising kubernetes client: %v", err)
os.Exit(1)
}
// We just care about existing deployments.
// Try and find Prometheus configurations in the cluster using the provided namespace.
//
// Note: We could cache this state and inject it into the various analyzers
// to save additional parsing later.
// However, the state of the cluster can change from activation to analysis,
// so we would want to run this again on each analyze call anyway.
//
// One consequence of this is one can run `activate` in one namespace
// and run `analyze` in another, without issues, as long as Prometheus
// is found in both.
// We accept this as a trade-off for the time-being to avoid having the tool
// manage Prometheus on the behalf of users.
podConfigs, err := findPrometheusPodConfigs(ctx, client.GetClient(), namespace)
if err != nil {
color.Red("Error discovering Prometheus worklads: %v", err)
os.Exit(1)
}
if len(podConfigs) == 0 {
color.Yellow(fmt.Sprintf(`Prometheus installation not found in namespace: %s.
Please ensure Prometheus is deployed to analyze.`, namespace))
return errors.New("no prometheus installation found")
}
// Discovery succeeded. State is intentionally not cached; the analyzers
// rediscover the configuration on each analyze call (see above).
color.Green("Found existing installation")
return nil
}
func (p *Prometheus) UnDeploy(_ string) error {
// no-op
// We just care about existing deployments.
color.Yellow("Integration will leave Prometheus resources deployed. This is an effective no-op in the cluster.")
return nil
}
func (p *Prometheus) AddAnalyzer(mergedMap *map[string]common.IAnalyzer) {
(*mergedMap)[ConfigValidate] = &ConfigAnalyzer{}
(*mergedMap)[ConfigRelabel] = &RelabelAnalyzer{}
}
func (p *Prometheus) GetAnalyzerName() []string {
return []string{ConfigValidate, ConfigRelabel}
}
func (p *Prometheus) GetNamespace() (string, error) {
return "", nil
}
func (p *Prometheus) OwnsAnalyzer(analyzer string) bool {
return (analyzer == ConfigValidate) || (analyzer == ConfigRelabel)
}
func (p *Prometheus) IsActivate() bool {
activeFilters := viper.GetStringSlice("active_filters")
for _, filter := range p.GetAnalyzerName() {
for _, af := range activeFilters {
if af == filter {
return true
}
}
}
return false
}
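
For reference, a quick sketch of how activation is detected: IsActivate scans
viper's active_filters for either analyzer name.

viper.Set("active_filters", []string{ConfigValidate})

p := NewPrometheus()
fmt.Println(p.IsActivate()) // true: one of the Prometheus analyzers is active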

pkg/integration/prometheus/relabel_analyzer.go (new file)

@@ -0,0 +1,85 @@
package prometheus
import (
"fmt"
"github.com/k8sgpt-ai/k8sgpt/pkg/common"
"github.com/k8sgpt-ai/k8sgpt/pkg/util"
discoverykube "github.com/prometheus/prometheus/discovery/kubernetes"
"gopkg.in/yaml.v2"
)
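// RelabelAnalyzer implements the PrometheusConfigRelabelReport analyzer.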
type RelabelAnalyzer struct {
}
func (r *RelabelAnalyzer) Analyze(a common.Analyzer) ([]common.Result, error) {
ctx := a.Context
client := a.Client.GetClient()
namespace := a.Namespace
kind := ConfigRelabel
podConfigs, err := findPrometheusPodConfigs(ctx, client, namespace)
if err != nil {
return nil, err
}
var preAnalysis = map[string]common.PreAnalysis{}
for _, pc := range podConfigs {
var failures []common.Failure
pod := pc.pod
// Unmarshal the configuration. Validation errors are already surfaced by
// the PrometheusConfigValidate analyzer, so they are not duplicated here.
config, _ := unmarshalPromConfigBytes(pc.b)
if config == nil {
continue
}
// Limit output for brevity.
limit := 6
i := 0
for _, sc := range config.ScrapeConfigs {
if i == limit {
break
}
if sc == nil {
continue
}
brc, _ := yaml.Marshal(sc.RelabelConfigs)
var bsd []byte
for _, cfg := range sc.ServiceDiscoveryConfigs {
ks, ok := cfg.(*discoverykube.SDConfig)
if !ok {
continue
}
bsd, _ = yaml.Marshal(ks)
}
// Don't bother with relabel analysis if the scrape config
// or service discovery config are empty.
if len(brc) == 0 || len(bsd) == 0 {
continue
}
failures = append(failures, common.Failure{
Text: fmt.Sprintf("job_name:\n%s\nrelabel_configs:\n%s\nkubernetes_sd_configs:\n%s\n", sc.JobName, string(brc), string(bsd)),
})
i++
}
if len(failures) > 0 {
preAnalysis[fmt.Sprintf("%s/%s", pod.Namespace, pod.Name)] = common.PreAnalysis{
Pod: *pod,
FailureDetails: failures,
}
}
}
for key, value := range preAnalysis {
var currentAnalysis = common.Result{
Kind: kind,
Name: key,
Error: value.FailureDetails,
}
parent, _ := util.GetParent(a.Client, value.Pod.ObjectMeta)
currentAnalysis.ParentObject = parent
a.Results = append(a.Results, currentAnalysis)
}
return a.Results, nil
}
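
To make the relabel report's payload concrete, a hedged sketch of one Failure
entry as built above (the YAML fragments are illustrative placeholders, not
real marshaled output):

failure := common.Failure{
	Text: fmt.Sprintf("job_name:\n%s\nrelabel_configs:\n%s\nkubernetes_sd_configs:\n%s\n",
		"kube-pods",          // sc.JobName
		"- action: keep ...", // yaml.Marshal(sc.RelabelConfigs), truncated
		"- role: pod ...",    // yaml.Marshal of the kubernetes SD config, truncated
	),
}
_ = failure // at most 6 such entries per pod, to keep the prompt small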