OAS: Improve URL clustering (#711)

* Evidence of problems

* Improving it

* Progressing

* Progress

* Fixes

* progress

* log is not right

* Fixing thresholds

* Almost there

* Imptovements

* Move some false negatives into TODO

* cosmetics

* linter (I disagree)

* linter (I disagree)

* fix test
This commit is contained in:
Andrey Pokhilko 2022-02-02 13:33:24 +03:00 committed by GitHub
parent c37f6478f3
commit a2118b869e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 8034 additions and 72 deletions

View File

@ -1,22 +1,26 @@
package oas
import (
"math"
"regexp"
"strings"
"unicode"
)
var (
patBase64 = regexp.MustCompile(`^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?$`)
patUuid4 = regexp.MustCompile(`(?i)[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}`)
patEmail = regexp.MustCompile(`^\w+([-+.']\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*$`)
patHexLower = regexp.MustCompile(`(0x)?[0-9a-f]{6,}`)
patHexUpper = regexp.MustCompile(`(0x)?[0-9A-F]{6,}`)
patLongNum = regexp.MustCompile(`\d{6,}`)
patLongNum = regexp.MustCompile(`^\d{3,}$`)
patLongNumB = regexp.MustCompile(`[^\d]\d{3,}`)
patLongNumA = regexp.MustCompile(`\d{3,}[^\d]`)
)
func IsGibberish(str string) bool {
if patBase64.MatchString(str) && len(str) > 32 {
if IsVersionString(str) {
return false
}
if patEmail.MatchString(str) {
return true
}
@ -24,16 +28,44 @@ func IsGibberish(str string) bool {
return true
}
if patEmail.MatchString(str) {
if patLongNum.MatchString(str) || patLongNumB.MatchString(str) || patLongNumA.MatchString(str) {
return true
}
if patHexLower.MatchString(str) || patHexUpper.MatchString(str) || patLongNum.MatchString(str) {
return true
//alNum := cleanStr(str, isAlNumRune)
//alpha := cleanStr(str, isAlphaRune)
// noiseAll := isNoisy(alNum)
//triAll := isTrigramBad(strings.ToLower(alpha))
// _ = noiseAll
isNotAlNum := func(r rune) bool { return !isAlNumRune(r) }
chunks := strings.FieldsFunc(str, isNotAlNum)
noisyLen := 0
alnumLen := 0
for _, chunk := range chunks {
alnumLen += len(chunk)
noise := isNoisy(chunk)
tri := isTrigramBad(strings.ToLower(chunk))
if noise || tri {
noisyLen += len(chunk)
}
}
noise := noiseLevel(str)
return noise >= 0.2
return float64(noisyLen) > 0
//if float64(noisyLen) > 0 {
// return true
//}
//if len(chunks) > 0 && float64(noisyLen) >= float64(alnumLen)/3.0 {
// return true
//}
//if triAll {
//return true
//}
// return false
}
func noiseLevel(str string) (score float64) {
@ -51,21 +83,21 @@ func noiseLevel(str string) (score float64) {
// upper =>
case unicode.IsUpper(prev) && unicode.IsLower(char):
score += 0.25
score += 0.10
case unicode.IsUpper(prev) && unicode.IsDigit(char):
score += 0.25
score += 0.5
// lower =>
case unicode.IsLower(prev) && unicode.IsUpper(char):
score += 0.75
case unicode.IsLower(prev) && unicode.IsDigit(char):
score += 0.25
score += 0.5
// digit =>
case unicode.IsDigit(prev) && unicode.IsUpper(char):
score += 0.75
case unicode.IsDigit(prev) && unicode.IsLower(char):
score += 0.75
score += 1.0
// the rest is 100% noise
default:
@ -75,8 +107,6 @@ func noiseLevel(str string) (score float64) {
prev = char
}
score /= cnt // weigh it
return score
}
@ -97,9 +127,59 @@ func IsVersionString(component string) bool {
}
}
if !hasV && strings.Contains(component, ".") {
if !hasV && !strings.Contains(component, ".") {
return false
}
return true
}
func trigramScore(str string) (float64, int) {
tgScore := 0.0
trigrams := ngrams(str, 3)
if len(trigrams) > 0 {
for _, trigram := range trigrams {
score, found := corpus_trigrams[trigram]
if found {
tgScore += score
}
}
}
return tgScore, len(trigrams)
}
func isTrigramBad(s string) bool {
tgScore, cnt := trigramScore(s)
if cnt > 0 {
val := math.Sqrt(tgScore) / float64(cnt)
val2 := tgScore / float64(cnt)
threshold := 0.005
bad := val < threshold
threshold2 := math.Log(float64(cnt)-2) * 0.1
bad2 := val2 < threshold2
return bad && bad2 // TODO: improve this logic to be clearer
}
return false
}
func isNoisy(s string) bool {
noise := noiseLevel(s)
if len(s) > 0 {
val := (noise * noise) / float64(len(s))
threshold := 0.6
bad := val > threshold
return bad
}
return false
}
func ngrams(s string, n int) []string {
result := make([]string, 0)
for i := 0; i < len(s)-n+1; i++ {
result = append(result, s[i:i+n])
}
return result
}

View File

@ -1,17 +1,95 @@
package oas
import "testing"
import (
"testing"
)
func TestNegative(t *testing.T) {
cases := []string{
"",
"{}",
"0.0.29",
"0.1",
"1.0",
"1.0.0",
"2.1.73",
"abTestV2",
"actionText,setName,setAttribute,save,ignore,onEnd,getContext,end,get",
"AddUserGroupLink",
"advert-management.adBlockerMessage.html",
"agents.author.1.json",
"animated-gif",
"b", // can be valid hexadecimal
"GetUniversalVariableUser",
"big-danger-coronavirus-panic-greater-crisis",
"breakout-box",
"callback",
"runs",
"tcfv2",
"StartUpCheckout",
"core.algorithm_execution.view",
"core.devices.view",
"data.json",
"dialog.overlay.infinity.json",
"domain-input",
"embeddable", // lul, it's a valid HEX!
"embeddable_blip",
"E PLURIBUS UNUM",
"etc",
"eu-central-1a",
"fcgi-bin",
"footer.include.html",
"fullHashes:find",
"generate-feed",
"GetAds",
"GetCart",
"GetUniversalVariableUser",
"github-audit-exports",
"g.js",
"g.pixel",
".html",
"Hugo Michiels",
"image.sbix",
"index.html",
"iPad",
"Joanna Mazewski",
"LibGit2Sharp",
"Michael_Vaughan1.png",
"New RSS feed has been generated",
"nick-clegg",
"opt-out",
"pixel_details.html",
"post.json",
"profile-method-info",
"project-id",
"publisha.1.json",
"publish_and_moderate",
"Ronna McDaniel",
"rtb-h",
"runs",
"sign-up",
"some-uuid-maybe",
"stable-4.0-version.json",
"StartUpCheckout",
"Steve Flunk",
"sync_a9",
"Ted Cruz",
"test.png",
"token",
"ToList",
"v2.1.3",
"VersionCheck.php",
"v Rusiji",
"Walnut St",
"web_widget",
"zoom_in.cur",
"xray",
"web",
"vipbets1",
"trcc",
"fbpixel",
// TODO below
// "tcfv2",
// "Matt-cartoon-255x206px-small.png",
// "TheTelegraph_portal_white-320-small.png",
// "testdata-10kB.js",
}
for _, str := range cases {
@ -23,44 +101,92 @@ func TestNegative(t *testing.T) {
func TestPositive(t *testing.T) {
cases := []string{
"e21f7112-3d3b-4632-9da3-a4af2e0e9166",
"952bea17-3776-11ea-9341-42010a84012a",
"456795af-b48f-4a8d-9b37-3e932622c2f0",
"0a0d0174-b338-4520-a1c3-24f7e3d5ec50.html",
"6120c057c7a97b03f6986f1b",
"1024807212418223",
"11ca096cbc224a67360493d44a9903",
"1553183382779",
"1554507871",
"19180481",
"203ef0f713abcebd8d62c35c0e3f12f87d71e5e4",
"456795af-b48f-4a8d-9b37-3e932622c2f0",
"601a2bdcc5b69137248ddbbf",
"60fe9aaeaefe2400012df94f",
"610bc3fd5a77a7fa25033fb0",
"610bd0315a77a7fa25034368",
"610bd0315a77a7fa25034368zh",
"6120c057c7a97b03f6986f1b",
"710a462e",
"1554507871",
"qwerqwerasdfqwer@protonmai.com",
"john.dow.1981@protonmail.com",
"ci12NC01YzkyNTEzYzllMDRhLTAtYy5tb25pdG9yaW5nLmpzb24=", // long base64
"11ca096cbc224a67360493d44a9903",
"c738338322370b47a79251f7510dd", // prefixed hex
"QgAAAC6zw0qH2DJtnXe8Z7rUJP0FgAFKkOhcHdFWzL1ZYggtwBgiB3LSoele9o3ZqFh7iCBhHbVLAnMuJ0HF8hEw7UKecE6wd-MBXgeRMdubGydhAMZSmuUjRpqplML40bmrb8VjJKNZswD1Cg",
"QgAAAC6zw0qH2DJtnXe8Z7rUJP0rG4sjLa_KVLlww5WEDJ__30J15en-K_6Y68jb_rU93e2TFY6fb0MYiQ1UrLNMQufqODHZUl39Lo6cXAOVOThjAMZSmuVH7n85JOYSCgzpvowMAVueGG0Xxg",
"203ef0f713abcebd8d62c35c0e3f12f87d71e5e4",
"MDEyOk9yZ2FuaXphdGlvbjU3MzI0Nzk1",
"730970532670-compute@developer.gserviceaccount.com",
"arn-aws-ecs-eu-west-2-396248696294-cluster-london-01-ECSCluster-27iuIYva8nO4", // ?
"819db2242a648b305395537022523d65",
"952bea17-3776-11ea-9341-42010a84012a",
"a3226860758.html",
"AAAA028295945",
"sp_ANQXRpqH_urn$3Auri$3Abase64$3A6698b0a3-97ad-52ce-8fc3-17d99e37a726",
"arn-aws-ecs-eu-west-2-396248696294-cluster-london-01-ECSCluster-27iuIYva8nO4",
"arn-aws-ecs-eu-west-2-396248696294-cluster-london-01-ECSCluster-27iuIYva8nO4", // ?
"bnjksfd897345nl098asd53412kl98",
"c738338322370b47a79251f7510dd", // prefixed hex
"ci12NC01YzkyNTEzYzllMDRhLTAtYy5tb25pdG9yaW5nLmpzb24=", // long base64
"css/login.0f48c49a34eb53ea4623.min.css",
"d_fLLxlhzDilixeBEimaZ5",
"e21f7112-3d3b-4632-9da3-a4af2e0e9166",
"e8782afc112720300c049ff124434b79",
"fb6cjraf9cejut2a",
"i-0236530c66ed02200",
"JEHJW4BKVFDRTMTUQLHKK5WVAU",
"john.dow.1981@protonmail.com",
"MDEyOk9yZ2FuaXphdGlvbjU3MzI0Nzk1",
"MNUTGVFMGLEMFTBH0XSE5E02F6J2DS",
"n63nd45qsj",
"n9z9QGNiz",
"NC4WTmcy",
"proxy.3d2100fd7107262ecb55ce6847f01fa5.html",
"QgAAAC6zw0qH2DJtnXe8Z7rUJP0FgAFKkOhcHdFWzL1ZYggtwBgiB3LSoele9o3ZqFh7iCBhHbVLAnMuJ0HF8hEw7UKecE6wd-MBXgeRMdubGydhAMZSmuUjRpqplML40bmrb8VjJKNZswD1Cg",
"QgAAAC6zw0qH2DJtnXe8Z7rUJP0rG4sjLa_KVLlww5WEDJ__30J15en-K_6Y68jb_rU93e2TFY6fb0MYiQ1UrLNMQufqODHZUl39Lo6cXAOVOThjAMZSmuVH7n85JOYSCgzpvowMAVueGG0Xxg",
"qwerqwerasdfqwer@protonmai.com",
"r-ext-5579e00a95c90",
"r-ext-5579e8b12f11e",
"r-v4-5c92513c9e04a",
"r-v4-5c92513c9e04a-0-c.monitoring.json",
"segments-1563566437171.639994",
"sp_ANQXRpqH_urn$3Auri$3Abase64$3A6698b0a3-97ad-52ce-8fc3-17d99e37a726",
"sp_dxJTfx11_576742227280287872",
"sp_NnUPB5wj_601a2bdcc5b69137248ddbbf",
"sp_NxITuoE4_premiumchron-article-14302157_c_ryGQBs_r_yIWvwP",
"t_52d94268-8810-4a7e-ba87-ffd657a6752f",
"timeouts-1563566437171.639994",
"u_YPF3GsGKMo02",
"0000000000 65535 f",
"0000000178 00000 n",
"0-10000",
"01526123,",
"0,18168,183955,3,4,1151616,5663,731,223,5104,207,3204,10,1051,175,364,1435,4,60,576,241,383,246,5,1102",
"05/10/2020",
"14336456724940333",
"fb6cjraf9cejut2a",
"JEHJW4BKVFDRTMTUQLHKK5WVAU",
// TODO
// "fb6cjraf9cejut2a",
// "Fxvd1timk", // questionable
// "JEHJW4BKVFDRTMTUQLHKK5WVAU",
/*
"0,20",
"0.001",
"YISAtiX1",
"Fxvd1timk", // questionable
"B4GCSkORAJs",
"D_4EDAqenHQ",
"EICJp29EGOk",
"Fxvd1timk",
"GTqMZELYfQQ",
"GZPTpLPEGmwHGWPC",
"_HChnE9NDPY",
"NwhjgIWHgGg",
"production/tsbqksph4xswqjexfbec",
"p/u/bguhrxupr23mw3nwxcrw",
"nRSNapbJZnc",
"zgfpbtolciznub5egzxk",
"zufnu7aimadua9wrgwwo",
"zznto1jzch9yjsbtbrul",
*/
}
for _, str := range cases {
@ -69,3 +195,18 @@ func TestPositive(t *testing.T) {
}
}
}
func TestVersionStrings(t *testing.T) {
cases := []string{
"1.0",
"1.0.0",
"v2.1.3",
"2.1.73",
}
for _, str := range cases {
if !IsVersionString(str) {
t.Errorf("Mistakenly false: %s", str)
}
}
}

View File

@ -12,9 +12,9 @@ var ignoredHeaders = []string{
"authorization", "cache-control", "connection", "content-encoding", "content-length", "content-type", "cookie",
"date", "dnt", "expect", "forwarded", "from", "front-end-https", "host", "http2-settings",
"max-forwards", "origin", "pragma", "proxy-authorization", "proxy-connection", "range", "referer",
"save-data", "te", "trailer", "transfer-encoding", "upgrade", "upgrade-insecure-requests",
"server", "user-agent", "via", "warning", "strict-transport-security",
"x-att-deviceid", "x-correlation-id", "correlation-id", "x-client-data",
"save-data", "te", "trailer", "transfer-encoding", "upgrade", "upgrade-insecure-requests", "x-download-options",
"server", "user-agent", "via", "warning", "strict-transport-security", "x-permitted-cross-domain-policies",
"x-att-deviceid", "x-correlation-id", "correlation-id", "x-client-data", "x-dns-prefetch-control",
"x-http-method-override", "x-real-ip", "x-request-id", "x-request-start", "x-requested-with", "x-uidh",
"x-same-domain", "x-content-type-options", "x-frame-options", "x-xss-protection",
"x-wap-profile", "x-scheme", "status", "x-cache", "x-application-context", "retry-after",
@ -31,7 +31,7 @@ var ignoredHeaderPrefixes = []string{
":", "accept-", "access-control-", "if-", "sec-", "grpc-",
"x-forwarded-", "x-original-", "cf-",
"x-up9-", "x-envoy-", "x-hasura-", "x-b3-", "x-datadog-", "x-envoy-", "x-amz-", "x-newrelic-", "x-prometheus-",
"x-akamai-", "x-spotim-", "x-amzn-", "x-ratelimit-",
"x-akamai-", "x-spotim-", "x-amzn-", "x-ratelimit-", "x-goog-",
}
func isCtypeIgnored(ctype string) bool {

View File

@ -160,22 +160,6 @@ func suggestTags(oas *openapi.OpenAPI) {
}
}
func getSimilarPrefix(strs []string) string {
chunked := make([][]string, 0)
for _, item := range strs {
chunked = append(chunked, strings.Split(item, "/"))
}
cmn := longestCommonXfix(chunked, true)
res := make([]string, 0)
for _, chunk := range cmn {
if chunk != "api" && !IsVersionString(chunk) && !strings.HasPrefix(chunk, "{") {
res = append(res, chunk)
}
}
return strings.Join(res[1:], ".")
}
func deleteFromSlice(s []string, val string) []string {
temp := s[:0]
for _, x := range s {

View File

@ -62,7 +62,6 @@ func (n *Node) getOrSet(path NodePath, existingPathObj *openapi.PathObj) (node *
if paramObj != nil {
node.pathParam = paramObj
} else if chunkIsGibberish {
newParam := n.createParam()
node.pathParam = newParam
} else {
@ -100,11 +99,14 @@ func (n *Node) createParam() *openapi.ParameterObj {
} else if strings.HasSuffix(*n.constant, "s") && len(*n.constant) > 3 {
name = *n.constant
name = name[:len(name)-1] + "Id"
} else if isAlpha(*n.constant) {
} else {
name = *n.constant + "Id"
}
name = cleanNonAlnum([]byte(name))
name = cleanStr(name, isAlNumRune)
if !isAlphaRune(rune(name[0])) {
name = "_" + name
}
}
newParam := createSimpleParam(name, "path", "string")

View File

@ -15,7 +15,7 @@ func TestTree(t *testing.T) {
{"/", 0, ""},
{"/v1.0.0/config/launcher/sp_nKNHCzsN/f34efcae-6583-11eb-908a-00b0fcb9d4f6/vendor,init,conversation", 1, "vendor,init,conversation"},
{"/v1.0.0/config/launcher/sp_nKNHCzsN/{f34efcae-6583-11eb-908a-00b0fcb9d4f6}/vendor,init,conversation", 0, "vendor,init,conversation"},
{"/getSvgs/size/small/brand/SFLY/layoutId/170943/layoutVersion/1/sizeId/742/surface/0/isLandscape/true/childSkus/%7B%7D", 1, ""},
{"/getSvgs/size/small/brand/SFLY/layoutId/170943/layoutVersion/1/sizeId/742/surface/0/isLandscape/true/childSkus/%7B%7D", 1, "{}"},
}
tree := new(Node)

7707
agent/pkg/oas/trigrams.go Normal file

File diff suppressed because it is too large Load Diff

View File

@ -261,6 +261,22 @@ func longestCommonXfix(strs [][]string, pre bool) []string { // https://github.c
return xfix
}
func getSimilarPrefix(strs []string) string {
chunked := make([][]string, 0)
for _, item := range strs {
chunked = append(chunked, strings.Split(item, "/"))
}
cmn := longestCommonXfix(chunked, true)
res := make([]string, 0)
for _, chunk := range cmn {
if chunk != "api" && !IsVersionString(chunk) && !strings.HasPrefix(chunk, "{") {
res = append(res, chunk)
}
}
return strings.Join(res[1:], ".")
}
// returns all non-nil ops in PathObj
func getOps(pathObj *openapi.PathObj) []*openapi.Operation {
ops := []**openapi.Operation{&pathObj.Get, &pathObj.Patch, &pathObj.Put, &pathObj.Options, &pathObj.Post, &pathObj.Trace, &pathObj.Head, &pathObj.Delete}
@ -324,13 +340,11 @@ func anyJSON(text string) (anyVal interface{}, isJSON bool) {
return nil, false
}
func cleanNonAlnum(s []byte) string {
func cleanStr(str string, criterion func(r rune) bool) string {
s := []byte(str)
j := 0
for _, b := range s {
if ('a' <= b && b <= 'z') ||
('A' <= b && b <= 'Z') ||
('0' <= b && b <= '9') ||
b == ' ' {
if criterion(rune(b)) {
s[j] = b
j++
}
@ -338,11 +352,21 @@ func cleanNonAlnum(s []byte) string {
return string(s[:j])
}
/*
func isAlpha(s string) bool {
for _, r := range s {
if (r < 'a' || r > 'z') && (r < 'A' || r > 'Z') {
if isAlphaRune(r) {
return false
}
}
return true
}
*/
func isAlphaRune(r rune) bool {
return !((r < 'a' || r > 'z') && (r < 'A' || r > 'Z'))
}
func isAlNumRune(b rune) bool {
return isAlphaRune(b) || ('0' <= b && b <= '9')
}

View File

@ -33,3 +33,27 @@ func TestAnyJSON(t *testing.T) {
}
}
}
func TestStrRunes(t *testing.T) {
if isAlphaRune('5') {
t.Logf("Failed")
}
if !isAlphaRune('a') {
t.Logf("Failed")
}
if !isAlNumRune('5') {
t.Logf("Failed")
}
if isAlNumRune(' ') {
t.Logf("Failed")
}
if cleanStr("-abc_567", isAlphaRune) != "abc" {
t.Logf("Failed")
}
if cleanStr("-abc_567", isAlNumRune) != "abc567" {
t.Logf("Failed")
}
}