mirror of
https://github.com/kubeshark/kubeshark.git
synced 2025-06-20 21:44:42 +00:00
* Evidence of problems * Improving it * Progressing * Progress * Fixes * progress * log is not right * Fixing thresholds * Almost there * Imptovements * Move some false negatives into TODO * cosmetics * linter (I disagree) * linter (I disagree) * fix test
186 lines
3.8 KiB
Go
186 lines
3.8 KiB
Go
package oas
|
|
|
|
import (
|
|
"math"
|
|
"regexp"
|
|
"strings"
|
|
"unicode"
|
|
)
|
|
|
|
var (
|
|
patUuid4 = regexp.MustCompile(`(?i)[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}`)
|
|
patEmail = regexp.MustCompile(`^\w+([-+.']\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*$`)
|
|
patLongNum = regexp.MustCompile(`^\d{3,}$`)
|
|
patLongNumB = regexp.MustCompile(`[^\d]\d{3,}`)
|
|
patLongNumA = regexp.MustCompile(`\d{3,}[^\d]`)
|
|
)
|
|
|
|
func IsGibberish(str string) bool {
|
|
if IsVersionString(str) {
|
|
return false
|
|
}
|
|
|
|
if patEmail.MatchString(str) {
|
|
return true
|
|
}
|
|
|
|
if patUuid4.MatchString(str) {
|
|
return true
|
|
}
|
|
|
|
if patLongNum.MatchString(str) || patLongNumB.MatchString(str) || patLongNumA.MatchString(str) {
|
|
return true
|
|
}
|
|
|
|
//alNum := cleanStr(str, isAlNumRune)
|
|
//alpha := cleanStr(str, isAlphaRune)
|
|
// noiseAll := isNoisy(alNum)
|
|
//triAll := isTrigramBad(strings.ToLower(alpha))
|
|
// _ = noiseAll
|
|
|
|
isNotAlNum := func(r rune) bool { return !isAlNumRune(r) }
|
|
chunks := strings.FieldsFunc(str, isNotAlNum)
|
|
noisyLen := 0
|
|
alnumLen := 0
|
|
for _, chunk := range chunks {
|
|
alnumLen += len(chunk)
|
|
noise := isNoisy(chunk)
|
|
tri := isTrigramBad(strings.ToLower(chunk))
|
|
if noise || tri {
|
|
noisyLen += len(chunk)
|
|
}
|
|
}
|
|
|
|
return float64(noisyLen) > 0
|
|
|
|
//if float64(noisyLen) > 0 {
|
|
// return true
|
|
//}
|
|
|
|
//if len(chunks) > 0 && float64(noisyLen) >= float64(alnumLen)/3.0 {
|
|
// return true
|
|
//}
|
|
|
|
//if triAll {
|
|
//return true
|
|
//}
|
|
|
|
// return false
|
|
}
|
|
|
|
func noiseLevel(str string) (score float64) {
|
|
// opinionated algo of certain char pairs marking the non-human strings
|
|
prev := *new(rune)
|
|
cnt := 0.0
|
|
for _, char := range str {
|
|
cnt += 1
|
|
if prev > 0 {
|
|
switch {
|
|
// continued class of upper/lower/digit adds no noise
|
|
case unicode.IsUpper(prev) && unicode.IsUpper(char):
|
|
case unicode.IsLower(prev) && unicode.IsLower(char):
|
|
case unicode.IsDigit(prev) && unicode.IsDigit(char):
|
|
|
|
// upper =>
|
|
case unicode.IsUpper(prev) && unicode.IsLower(char):
|
|
score += 0.10
|
|
case unicode.IsUpper(prev) && unicode.IsDigit(char):
|
|
score += 0.5
|
|
|
|
// lower =>
|
|
case unicode.IsLower(prev) && unicode.IsUpper(char):
|
|
score += 0.75
|
|
case unicode.IsLower(prev) && unicode.IsDigit(char):
|
|
score += 0.5
|
|
|
|
// digit =>
|
|
case unicode.IsDigit(prev) && unicode.IsUpper(char):
|
|
score += 0.75
|
|
case unicode.IsDigit(prev) && unicode.IsLower(char):
|
|
score += 1.0
|
|
|
|
// the rest is 100% noise
|
|
default:
|
|
score += 1
|
|
}
|
|
}
|
|
prev = char
|
|
}
|
|
|
|
return score
|
|
}
|
|
|
|
func IsVersionString(component string) bool {
|
|
if component == "" {
|
|
return false
|
|
}
|
|
|
|
hasV := false
|
|
if strings.HasPrefix(component, "v") {
|
|
component = component[1:]
|
|
hasV = true
|
|
}
|
|
|
|
for _, c := range component {
|
|
if string(c) != "." && !unicode.IsDigit(c) {
|
|
return false
|
|
}
|
|
}
|
|
|
|
if !hasV && !strings.Contains(component, ".") {
|
|
return false
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
func trigramScore(str string) (float64, int) {
|
|
tgScore := 0.0
|
|
trigrams := ngrams(str, 3)
|
|
if len(trigrams) > 0 {
|
|
for _, trigram := range trigrams {
|
|
score, found := corpus_trigrams[trigram]
|
|
if found {
|
|
tgScore += score
|
|
}
|
|
}
|
|
}
|
|
|
|
return tgScore, len(trigrams)
|
|
}
|
|
|
|
func isTrigramBad(s string) bool {
|
|
tgScore, cnt := trigramScore(s)
|
|
|
|
if cnt > 0 {
|
|
val := math.Sqrt(tgScore) / float64(cnt)
|
|
val2 := tgScore / float64(cnt)
|
|
threshold := 0.005
|
|
bad := val < threshold
|
|
threshold2 := math.Log(float64(cnt)-2) * 0.1
|
|
bad2 := val2 < threshold2
|
|
return bad && bad2 // TODO: improve this logic to be clearer
|
|
}
|
|
return false
|
|
}
|
|
|
|
func isNoisy(s string) bool {
|
|
noise := noiseLevel(s)
|
|
|
|
if len(s) > 0 {
|
|
val := (noise * noise) / float64(len(s))
|
|
threshold := 0.6
|
|
bad := val > threshold
|
|
return bad
|
|
}
|
|
return false
|
|
}
|
|
|
|
func ngrams(s string, n int) []string {
|
|
result := make([]string, 0)
|
|
for i := 0; i < len(s)-n+1; i++ {
|
|
result = append(result, s[i:i+n])
|
|
}
|
|
return result
|
|
}
|