kubeshark/agent/pkg/oas/gibberish.go
Andrey Pokhilko a2118b869e
OAS: Improve URL clustering (#711)
* Evidence of problems

* Improving it

* Progressing

* Progress

* Fixes

* progress

* log is not right

* Fixing thresholds

* Almost there

* Imptovements

* Move some false negatives into TODO

* cosmetics

* linter (I disagree)

* linter (I disagree)

* fix test
2022-02-02 12:33:24 +02:00

186 lines
3.8 KiB
Go

package oas
import (
"math"
"regexp"
"strings"
"unicode"
)
var (
patUuid4 = regexp.MustCompile(`(?i)[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}`)
patEmail = regexp.MustCompile(`^\w+([-+.']\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*$`)
patLongNum = regexp.MustCompile(`^\d{3,}$`)
patLongNumB = regexp.MustCompile(`[^\d]\d{3,}`)
patLongNumA = regexp.MustCompile(`\d{3,}[^\d]`)
)
func IsGibberish(str string) bool {
if IsVersionString(str) {
return false
}
if patEmail.MatchString(str) {
return true
}
if patUuid4.MatchString(str) {
return true
}
if patLongNum.MatchString(str) || patLongNumB.MatchString(str) || patLongNumA.MatchString(str) {
return true
}
//alNum := cleanStr(str, isAlNumRune)
//alpha := cleanStr(str, isAlphaRune)
// noiseAll := isNoisy(alNum)
//triAll := isTrigramBad(strings.ToLower(alpha))
// _ = noiseAll
isNotAlNum := func(r rune) bool { return !isAlNumRune(r) }
chunks := strings.FieldsFunc(str, isNotAlNum)
noisyLen := 0
alnumLen := 0
for _, chunk := range chunks {
alnumLen += len(chunk)
noise := isNoisy(chunk)
tri := isTrigramBad(strings.ToLower(chunk))
if noise || tri {
noisyLen += len(chunk)
}
}
return float64(noisyLen) > 0
//if float64(noisyLen) > 0 {
// return true
//}
//if len(chunks) > 0 && float64(noisyLen) >= float64(alnumLen)/3.0 {
// return true
//}
//if triAll {
//return true
//}
// return false
}
func noiseLevel(str string) (score float64) {
// opinionated algo of certain char pairs marking the non-human strings
prev := *new(rune)
cnt := 0.0
for _, char := range str {
cnt += 1
if prev > 0 {
switch {
// continued class of upper/lower/digit adds no noise
case unicode.IsUpper(prev) && unicode.IsUpper(char):
case unicode.IsLower(prev) && unicode.IsLower(char):
case unicode.IsDigit(prev) && unicode.IsDigit(char):
// upper =>
case unicode.IsUpper(prev) && unicode.IsLower(char):
score += 0.10
case unicode.IsUpper(prev) && unicode.IsDigit(char):
score += 0.5
// lower =>
case unicode.IsLower(prev) && unicode.IsUpper(char):
score += 0.75
case unicode.IsLower(prev) && unicode.IsDigit(char):
score += 0.5
// digit =>
case unicode.IsDigit(prev) && unicode.IsUpper(char):
score += 0.75
case unicode.IsDigit(prev) && unicode.IsLower(char):
score += 1.0
// the rest is 100% noise
default:
score += 1
}
}
prev = char
}
return score
}
func IsVersionString(component string) bool {
if component == "" {
return false
}
hasV := false
if strings.HasPrefix(component, "v") {
component = component[1:]
hasV = true
}
for _, c := range component {
if string(c) != "." && !unicode.IsDigit(c) {
return false
}
}
if !hasV && !strings.Contains(component, ".") {
return false
}
return true
}
func trigramScore(str string) (float64, int) {
tgScore := 0.0
trigrams := ngrams(str, 3)
if len(trigrams) > 0 {
for _, trigram := range trigrams {
score, found := corpus_trigrams[trigram]
if found {
tgScore += score
}
}
}
return tgScore, len(trigrams)
}
func isTrigramBad(s string) bool {
tgScore, cnt := trigramScore(s)
if cnt > 0 {
val := math.Sqrt(tgScore) / float64(cnt)
val2 := tgScore / float64(cnt)
threshold := 0.005
bad := val < threshold
threshold2 := math.Log(float64(cnt)-2) * 0.1
bad2 := val2 < threshold2
return bad && bad2 // TODO: improve this logic to be clearer
}
return false
}
func isNoisy(s string) bool {
noise := noiseLevel(s)
if len(s) > 0 {
val := (noise * noise) / float64(len(s))
threshold := 0.6
bad := val > threshold
return bad
}
return false
}
func ngrams(s string, n int) []string {
result := make([]string, 0)
for i := 0; i < len(s)-n+1; i++ {
result = append(result, s[i:i+n])
}
return result
}