Update bluemonday to v1.0.15 (#16379) (#16380)

* Update bluemonday to v1.0.15 (#16379)

* Fix TESTS
Authored by 6543 on 2021-07-09 02:47:27 +02:00, committed by GitHub
parent ac0f452b30, commit d98694e6ca
174 changed files with 14579 additions and 11967 deletions


@@ -31,6 +31,7 @@ package bluemonday
 import (
 	"bytes"
+	"fmt"
 	"io"
 	"net/url"
 	"regexp"
 	"strings"
@@ -47,10 +48,11 @@ var (
 	dataAttributeXMLPrefix    = regexp.MustCompile("^xml.+")
 	dataAttributeInvalidChars = regexp.MustCompile("[A-Z;]+")
 	cssUnicodeChar            = regexp.MustCompile(`\\[0-9a-f]{1,6} ?`)
+	dataURIbase64Prefix       = regexp.MustCompile(`^data:[^,]*;base64,`)
 )
 
 // Sanitize takes a string that contains a HTML fragment or document and applies
-// the given policy whitelist.
+// the given policy allowlist.
 //
 // It returns a HTML string that has been sanitized by the policy or an empty
 // string if an error has occurred (most likely as a consequence of extremely
@@ -60,11 +62,11 @@ func (p *Policy) Sanitize(s string) string {
 		return s
 	}
 
-	return p.sanitize(strings.NewReader(s)).String()
+	return p.sanitizeWithBuff(strings.NewReader(s)).String()
 }
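
For context, the string-based entry point keeps its signature; only the internal call changes. A minimal usage sketch, assuming the stock UGCPolicy (the output comment is indicative):

    package main

    import (
        "fmt"

        "github.com/microcosm-cc/bluemonday"
    )

    func main() {
        p := bluemonday.UGCPolicy()
        // The allowlist keeps the anchor but strips the script element.
        out := p.Sanitize(`<a href="/x">ok</a><script>alert(1)</script>`)
        fmt.Println(out) // <a href="/x" rel="nofollow">ok</a>
    }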
 
 // SanitizeBytes takes a []byte that contains a HTML fragment or document and applies
-// the given policy whitelist.
+// the given policy allowlist.
 //
 // It returns a []byte containing the HTML that has been sanitized by the policy
 // or an empty []byte if an error has occurred (most likely as a consequence of
@@ -74,26 +76,32 @@ func (p *Policy) SanitizeBytes(b []byte) []byte {
 		return b
 	}
 
-	return p.sanitize(bytes.NewReader(b)).Bytes()
+	return p.sanitizeWithBuff(bytes.NewReader(b)).Bytes()
 }
 
 // SanitizeReader takes an io.Reader that contains a HTML fragment or document
-// and applies the given policy whitelist.
+// and applies the given policy allowlist.
 //
 // It returns a bytes.Buffer containing the HTML that has been sanitized by the
 // policy. Errors during sanitization will merely return an empty result.
 func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
-	return p.sanitize(r)
+	return p.sanitizeWithBuff(r)
 }
 
+// SanitizeReaderToWriter takes an io.Reader that contains a HTML fragment or document
+// and applies the given policy allowlist and writes to the provided writer returning
+// an error if there is one.
+func (p *Policy) SanitizeReaderToWriter(r io.Reader, w io.Writer) error {
+	return p.sanitize(r, w)
+}
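
SanitizeReaderToWriter is the new streaming entry point: unlike the buffer-based methods, it surfaces errors instead of swallowing them into an empty result. A sketch (the input file name is hypothetical):

    package main

    import (
        "log"
        "os"

        "github.com/microcosm-cc/bluemonday"
    )

    func main() {
        p := bluemonday.UGCPolicy()
        f, err := os.Open("comment.html") // hypothetical input
        if err != nil {
            log.Fatal(err)
        }
        defer f.Close()
        // Sanitized output streams to the writer; write errors propagate.
        if err := p.SanitizeReaderToWriter(f, os.Stdout); err != nil {
            log.Fatal(err)
        }
    }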
 
 const escapedURLChars = "'<>\"\r"
 
-func escapeUrlComponent(val string) string {
-	w := bytes.NewBufferString("")
+func escapeUrlComponent(w stringWriterWriter, val string) error {
 	i := strings.IndexAny(val, escapedURLChars)
 	for i != -1 {
 		if _, err := w.WriteString(val[:i]); err != nil {
-			return w.String()
+			return err
 		}
 		var esc string
 		switch val[i] {
@@ -114,15 +122,15 @@ func escapeUrlComponent(val string) string {
 		}
 		val = val[i+1:]
 		if _, err := w.WriteString(esc); err != nil {
-			return w.String()
+			return err
 		}
 		i = strings.IndexAny(val, escapedURLChars)
 	}
-	w.WriteString(val)
-	return w.String()
+	_, err := w.WriteString(val)
+	return err
 }
 
-// Query represents a query
+// Query represents a single part of the query string, a query param
 type Query struct {
 	Key      string
 	Value    string
@@ -130,6 +138,10 @@ type Query struct {
 }
 
 func parseQuery(query string) (values []Query, err error) {
+	// This is essentially a copy of parseQuery from
+	// https://golang.org/src/net/url/url.go but adjusted to build our values
+	// based on our type, which we need to preserve the ordering of the query
+	// string
 	for query != "" {
 		key := query
 		if i := strings.IndexAny(key, "&;"); i >= 0 {
@@ -170,18 +182,18 @@ func parseQuery(query string) (values []Query, err error) {
 }
 
 func encodeQueries(queries []Query) string {
-	var b strings.Builder
+	var buff bytes.Buffer
 	for i, query := range queries {
-		b.WriteString(url.QueryEscape(query.Key))
+		buff.WriteString(url.QueryEscape(query.Key))
 		if query.HasValue {
-			b.WriteString("=")
-			b.WriteString(url.QueryEscape(query.Value))
+			buff.WriteString("=")
+			buff.WriteString(url.QueryEscape(query.Value))
 		}
 		if i < len(queries)-1 {
-			b.WriteString("&")
+			buff.WriteString("&")
 		}
 	}
-	return b.String()
+	return buff.String()
 }
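
parseQuery and encodeQueries exist because net/url's Values is a map and re-encodes in sorted key order, while the []Query slice preserves the order found in the document. The helpers are unexported, so this standalone sketch only mirrors the idea:

    package main

    import (
        "fmt"
        "net/url"
    )

    // query mirrors the unexported Query type above.
    type query struct {
        key, value string
        hasValue   bool
    }

    func main() {
        // net/url re-encodes in sorted key order, losing "b" before "a":
        v, _ := url.ParseQuery("b=2&a=1")
        fmt.Println(v.Encode()) // a=1&b=2

        // An ordered slice keeps the original parameter order:
        qs := []query{{"b", "2", true}, {"a", "1", true}}
        out := ""
        for i, q := range qs {
            out += url.QueryEscape(q.key)
            if q.hasValue {
                out += "=" + url.QueryEscape(q.value)
            }
            if i < len(qs)-1 {
                out += "&"
            }
        }
        fmt.Println(out) // b=2&a=1
    }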
 
 func sanitizedURL(val string) (string, error) {
@@ -205,45 +217,24 @@ func sanitizedURL(val string) (string, error) {
 	return u.String(), nil
 }
 
-func (p *Policy) writeLinkableBuf(buff *bytes.Buffer, token *html.Token) {
-	// do not escape multiple query parameters
-	tokenBuff := bytes.NewBufferString("")
-	tokenBuff.WriteString("<")
-	tokenBuff.WriteString(token.Data)
-	for _, attr := range token.Attr {
-		tokenBuff.WriteByte(' ')
-		tokenBuff.WriteString(attr.Key)
-		tokenBuff.WriteString(`="`)
-		switch attr.Key {
-		case "href", "src":
-			u, ok := p.validURL(attr.Val)
-			if !ok {
-				tokenBuff.WriteString(html.EscapeString(attr.Val))
-				continue
-			}
-			u, err := sanitizedURL(u)
-			if err == nil {
-				tokenBuff.WriteString(u)
-			} else {
-				// fallthrough
-				tokenBuff.WriteString(html.EscapeString(attr.Val))
-			}
-		default:
-			// re-apply
-			tokenBuff.WriteString(html.EscapeString(attr.Val))
-		}
-		tokenBuff.WriteByte('"')
-	}
-	if token.Type == html.SelfClosingTagToken {
-		tokenBuff.WriteString("/")
-	}
-	tokenBuff.WriteString(">")
-	buff.WriteString(tokenBuff.String())
-}
-
 // Performs the actual sanitization process.
-func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
+func (p *Policy) sanitizeWithBuff(r io.Reader) *bytes.Buffer {
+	var buff bytes.Buffer
+	if err := p.sanitize(r, &buff); err != nil {
+		return &bytes.Buffer{}
+	}
+	return &buff
+}
+
+type asStringWriter struct {
+	io.Writer
+}
+
+func (a *asStringWriter) WriteString(s string) (int, error) {
+	return a.Write([]byte(s))
+}
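
asStringWriter is a small adapter: sanitize wants a stringWriterWriter (an io.Writer that also has WriteString), and the adapter supplies the missing method via a []byte conversion when the caller's writer lacks one. The same pattern, sketched with the standard io.StringWriter interface:

    package main

    import (
        "io"
        "os"
    )

    // asStringWriter (as in the diff) wraps a plain io.Writer so it also
    // satisfies io.StringWriter, at the cost of one []byte copy per call.
    type asStringWriter struct {
        io.Writer
    }

    func (a *asStringWriter) WriteString(s string) (int, error) {
        return a.Write([]byte(s))
    }

    // toStringWriter prefers the writer's native WriteString when present.
    func toStringWriter(w io.Writer) io.StringWriter {
        if sw, ok := w.(io.StringWriter); ok {
            return sw
        }
        return &asStringWriter{w}
    }

    func main() {
        toStringWriter(os.Stdout).WriteString("hello\n")
    }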
+func (p *Policy) sanitize(r io.Reader, w io.Writer) error {
 	// It is possible that the developer has created the policy via:
 	// p := bluemonday.Policy{}
 	// rather than:
@@ -252,8 +243,12 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
 	// would initiliaze the maps, then we need to do that.
 	p.init()
 
+	buff, ok := w.(stringWriterWriter)
+	if !ok {
+		buff = &asStringWriter{w}
+	}
+
 	var (
-		buff                     bytes.Buffer
 		skipElementContent       bool
 		skippingElementsCount    int64
 		skipClosingTag           bool
@@ -267,11 +262,11 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
 			err := tokenizer.Err()
 			if err == io.EOF {
 				// End of input means end of processing
-				return &buff
+				return nil
 			}
 
 			// Raw tokenizer error
-			return &bytes.Buffer{}
+			return err
 		}
 
 		token := tokenizer.Token()
@@ -289,6 +284,10 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
 		case html.CommentToken:
 
 			// Comments are ignored by default
 			if p.allowComments {
 				// But if allowed then write the comment out as-is
-				buff.WriteString(token.String())
+				if _, err := buff.WriteString(token.String()); err != nil {
+					return err
+				}
 			}
 
 		case html.StartTagToken:
@@ -303,14 +302,18 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
 					skippingElementsCount++
 				}
 				if p.addSpaces {
-					buff.WriteString(" ")
+					if _, err := buff.WriteString(" "); err != nil {
+						return err
+					}
 				}
 				break
 			}
 			aps = aa
 		}
 
 		if len(token.Attr) != 0 {
-			token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
+			token.Attr = escapeAttributes(
+				p.sanitizeAttrs(token.Data, token.Attr, aps),
+			)
 		}
 
 		if len(token.Attr) == 0 {
@@ -318,18 +321,17 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
 				skipClosingTag = true
 				closingTagToSkipStack = append(closingTagToSkipStack, token.Data)
 				if p.addSpaces {
-					buff.WriteString(" ")
+					if _, err := buff.WriteString(" "); err != nil {
+						return err
+					}
 				}
 				break
 			}
 		}
 
 		if !skipElementContent {
-			// do not escape multiple query parameters
-			if linkable(token.Data) {
-				p.writeLinkableBuf(&buff, &token)
-			} else {
-				buff.WriteString(token.String())
+			if _, err := buff.WriteString(token.String()); err != nil {
+				return err
 			}
 		}
@@ -345,7 +347,9 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
 					skipClosingTag = false
 				}
 				if p.addSpaces {
-					buff.WriteString(" ")
+					if _, err := buff.WriteString(" "); err != nil {
+						return err
+					}
 				}
 				break
 			}
@@ -366,14 +370,18 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
 				}
 				if !match {
 					if p.addSpaces {
-						buff.WriteString(" ")
+						if _, err := buff.WriteString(" "); err != nil {
+							return err
+						}
 					}
 					break
 				}
 			}
 
 			if !skipElementContent {
-				buff.WriteString(token.String())
+				if _, err := buff.WriteString(token.String()); err != nil {
+					return err
+				}
 			}
 
 		case html.SelfClosingTagToken:
@@ -383,7 +391,9 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
 			aa, matched := p.matchRegex(token.Data)
 			if !matched {
 				if p.addSpaces && !matched {
-					buff.WriteString(" ")
+					if _, err := buff.WriteString(" "); err != nil {
+						return err
+					}
 				}
 				break
 			}
@@ -391,21 +401,20 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
 		}
 
 		if len(token.Attr) != 0 {
-			token.Attr = p.sanitizeAttrs(token.Data, token.Attr, aps)
+			token.Attr = escapeAttributes(p.sanitizeAttrs(token.Data, token.Attr, aps))
 		}
 
 		if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
 			if p.addSpaces {
-				buff.WriteString(" ")
+				if _, err := buff.WriteString(" "); err != nil {
+					return err
+				}
 				break
 			}
 		}
 
 		if !skipElementContent {
-			// do not escape multiple query parameters
-			if linkable(token.Data) {
-				p.writeLinkableBuf(&buff, &token)
-			} else {
-				buff.WriteString(token.String())
+			if _, err := buff.WriteString(token.String()); err != nil {
+				return err
 			}
 		}
@@ -416,20 +425,26 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
 			case `script`:
 				// not encouraged, but if a policy allows JavaScript we
 				// should not HTML escape it as that would break the output
-				buff.WriteString(token.Data)
-			case `style`:
+				if _, err := buff.WriteString(token.Data); err != nil {
+					return err
+				}
+			case "style":
 				// not encouraged, but if a policy allows CSS styles we
 				// should not HTML escape it as that would break the output
-				buff.WriteString(token.Data)
+				if _, err := buff.WriteString(token.Data); err != nil {
+					return err
+				}
 			default:
 				// HTML escape the text
-				buff.WriteString(token.String())
+				if _, err := buff.WriteString(token.String()); err != nil {
+					return err
+				}
 			}
 		}
 
 	default:
 		// A token that didn't exist in the html package when we wrote this
-		return &bytes.Buffer{}
+		return fmt.Errorf("unknown token: %v", token)
 	}
 }
 }
@@ -440,7 +455,7 @@ func (p *Policy) sanitize(r io.Reader) *bytes.Buffer {
 func (p *Policy) sanitizeAttrs(
 	elementName string,
 	attrs []html.Attribute,
-	aps map[string]attrPolicy,
+	aps map[string][]attrPolicy,
 ) []html.Attribute {
 
 	if len(attrs) == 0 {
@@ -465,8 +480,9 @@ func (p *Policy) sanitizeAttrs(
 	}
 
 	// Builds a new attribute slice based on the whether the attribute has been
-	// whitelisted explicitly or globally.
+	// allowed explicitly or globally.
 	cleanAttrs := []html.Attribute{}
+attrsLoop:
 	for _, htmlAttr := range attrs {
 		if p.allowDataAttributes {
 			// If we see a data attribute, let it through.
@@ -489,27 +505,32 @@ func (p *Policy) sanitizeAttrs(
 		}
 
 		// Is there an element specific attribute policy that applies?
-		if ap, ok := aps[htmlAttr.Key]; ok {
-			if ap.regexp != nil {
-				if ap.regexp.MatchString(htmlAttr.Val) {
-					cleanAttrs = append(cleanAttrs, htmlAttr)
-					continue
-				}
-			} else {
-				cleanAttrs = append(cleanAttrs, htmlAttr)
-				continue
+		if apl, ok := aps[htmlAttr.Key]; ok {
+			for _, ap := range apl {
+				if ap.regexp != nil {
+					if ap.regexp.MatchString(htmlAttr.Val) {
+						cleanAttrs = append(cleanAttrs, htmlAttr)
+						continue attrsLoop
+					}
+				} else {
+					cleanAttrs = append(cleanAttrs, htmlAttr)
+					continue attrsLoop
+				}
 			}
 		}
 
 		// Is there a global attribute policy that applies?
-		if ap, ok := p.globalAttrs[htmlAttr.Key]; ok {
-			if ap.regexp != nil {
-				if ap.regexp.MatchString(htmlAttr.Val) {
-					cleanAttrs = append(cleanAttrs, htmlAttr)
-				}
-			} else {
-				cleanAttrs = append(cleanAttrs, htmlAttr)
+		if apl, ok := p.globalAttrs[htmlAttr.Key]; ok {
+			for _, ap := range apl {
+				if ap.regexp != nil {
+					if ap.regexp.MatchString(htmlAttr.Val) {
+						htmlAttr.Val = escapeAttribute(htmlAttr.Val)
+						cleanAttrs = append(cleanAttrs, htmlAttr)
+					}
+				} else {
+					htmlAttr.Val = escapeAttribute(htmlAttr.Val)
+					cleanAttrs = append(cleanAttrs, htmlAttr)
+				}
 			}
 		}
 	}
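
The move from map[string]attrPolicy to map[string][]attrPolicy means repeated attribute rules for the same key now accumulate instead of overwriting each other, and the attrsLoop label lets the first accepting policy settle an attribute. A sketch against bluemonday's public builder API (NewPolicy, AllowAttrs, Matching, OnElements):

    package main

    import (
        "fmt"
        "regexp"

        "github.com/microcosm-cc/bluemonday"
    )

    func main() {
        p := bluemonday.NewPolicy()
        p.AllowElements("span")
        // Both Matching rules for "class" are now kept; either regexp
        // can admit the value.
        p.AllowAttrs("class").Matching(regexp.MustCompile(`^allowed$`)).OnElements("span")
        p.AllowAttrs("class").Matching(regexp.MustCompile(`^also-ok$`)).OnElements("span")

        fmt.Println(p.Sanitize(`<span class="also-ok">hi</span>`))
        // <span class="also-ok">hi</span>
    }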
@@ -533,7 +554,7 @@ func (p *Policy) sanitizeAttrs(
 		tmpAttrs := []html.Attribute{}
 		for _, htmlAttr := range cleanAttrs {
 			switch elementName {
-			case "a", "area", "link":
+			case "a", "area", "base", "link":
 				if htmlAttr.Key == "href" {
 					if u, ok := p.validURL(htmlAttr.Val); ok {
 						htmlAttr.Val = u
@@ -542,7 +563,7 @@ func (p *Policy) sanitizeAttrs(
 					break
 				}
 				tmpAttrs = append(tmpAttrs, htmlAttr)
-			case "blockquote", "q":
+			case "blockquote", "del", "ins", "q":
 				if htmlAttr.Key == "cite" {
 					if u, ok := p.validURL(htmlAttr.Val); ok {
 						htmlAttr.Val = u
@@ -551,7 +572,7 @@ func (p *Policy) sanitizeAttrs(
 					break
 				}
 				tmpAttrs = append(tmpAttrs, htmlAttr)
-			case "img", "script":
+			case "audio", "embed", "iframe", "img", "script", "source", "track", "video":
 				if htmlAttr.Key == "src" {
 					if u, ok := p.validURL(htmlAttr.Val); ok {
 						htmlAttr.Val = u
@@ -576,7 +597,7 @@ func (p *Policy) sanitizeAttrs(
 
 	// Add rel="nofollow" if a "href" exists
 	switch elementName {
-	case "a", "area", "link":
+	case "a", "area", "base", "link":
 		var hrefFound bool
 		var externalLink bool
 		for _, htmlAttr := range cleanAttrs {
@@ -753,14 +774,14 @@ func (p *Policy) sanitizeAttrs(
 func (p *Policy) sanitizeStyles(attr html.Attribute, elementName string) html.Attribute {
 	sps := p.elsAndStyles[elementName]
 	if len(sps) == 0 {
-		sps = map[string]stylePolicy{}
+		sps = map[string][]stylePolicy{}
 		// check for any matching elements, if we don't already have a policy found
 		// if multiple matches are found they will be overwritten, it's best
 		// to not have overlapping matchers
 		for regex, policies := range p.elsMatchingAndStyles {
 			if regex.MatchString(elementName) {
 				for k, v := range policies {
-					sps[k] = v
+					sps[k] = append(sps[k], v...)
 				}
 			}
 		}
@@ -778,46 +799,51 @@ func (p *Policy) sanitizeStyles(attr html.Attribute, elementName string) html.At
 	clean := []string{}
 	prefixes := []string{"-webkit-", "-moz-", "-ms-", "-o-", "mso-", "-xv-", "-atsc-", "-wap-", "-khtml-", "prince-", "-ah-", "-hp-", "-ro-", "-rim-", "-tc-"}
 
+decLoop:
 	for _, dec := range decs {
-		addedProperty := false
 		tempProperty := strings.ToLower(dec.Property)
 		tempValue := removeUnicode(strings.ToLower(dec.Value))
 		for _, i := range prefixes {
 			tempProperty = strings.TrimPrefix(tempProperty, i)
 		}
-		if sp, ok := sps[tempProperty]; ok {
-			if sp.handler != nil {
-				if sp.handler(tempValue) {
-					clean = append(clean, dec.Property+": "+dec.Value)
-					addedProperty = true
-				}
-			} else if len(sp.enum) > 0 {
-				if stringInSlice(tempValue, sp.enum) {
-					clean = append(clean, dec.Property+": "+dec.Value)
-					addedProperty = true
-				}
-			} else if sp.regexp != nil {
-				if sp.regexp.MatchString(tempValue) {
-					clean = append(clean, dec.Property+": "+dec.Value)
-					addedProperty = true
-				}
-			}
-			continue
-		}
-		if sp, ok := p.globalStyles[tempProperty]; ok && !addedProperty {
-			if sp.handler != nil {
-				if sp.handler(tempValue) {
-					clean = append(clean, dec.Property+": "+dec.Value)
-				}
-			} else if len(sp.enum) > 0 {
-				if stringInSlice(tempValue, sp.enum) {
-					clean = append(clean, dec.Property+": "+dec.Value)
-				}
-			} else if sp.regexp != nil {
-				if sp.regexp.MatchString(tempValue) {
-					clean = append(clean, dec.Property+": "+dec.Value)
-				}
-			}
-			continue
-		}
+		if spl, ok := sps[tempProperty]; ok {
+			for _, sp := range spl {
+				if sp.handler != nil {
+					if sp.handler(tempValue) {
+						clean = append(clean, dec.Property+": "+dec.Value)
+						continue decLoop
+					}
+				} else if len(sp.enum) > 0 {
+					if stringInSlice(tempValue, sp.enum) {
+						clean = append(clean, dec.Property+": "+dec.Value)
+						continue decLoop
+					}
+				} else if sp.regexp != nil {
+					if sp.regexp.MatchString(tempValue) {
+						clean = append(clean, dec.Property+": "+dec.Value)
+						continue decLoop
+					}
+				}
+			}
+		}
+		if spl, ok := p.globalStyles[tempProperty]; ok {
+			for _, sp := range spl {
+				if sp.handler != nil {
+					if sp.handler(tempValue) {
+						clean = append(clean, dec.Property+": "+dec.Value)
+						continue decLoop
+					}
+				} else if len(sp.enum) > 0 {
+					if stringInSlice(tempValue, sp.enum) {
+						clean = append(clean, dec.Property+": "+dec.Value)
+						continue decLoop
+					}
+				} else if sp.regexp != nil {
+					if sp.regexp.MatchString(tempValue) {
+						clean = append(clean, dec.Property+": "+dec.Value)
+						continue decLoop
+					}
+				}
+			}
+		}
 	}
@@ -848,11 +874,28 @@ func (p *Policy) validURL(rawurl string) (string, bool) {
 		rawurl = strings.TrimSpace(rawurl)
 
 		// URLs cannot contain whitespace, unless it is a data-uri
-		if (strings.Contains(rawurl, " ") ||
+		if strings.Contains(rawurl, " ") ||
 			strings.Contains(rawurl, "\t") ||
-			strings.Contains(rawurl, "\n")) &&
-			!strings.HasPrefix(rawurl, `data:`) {
-			return "", false
+			strings.Contains(rawurl, "\n") {
+			if !strings.HasPrefix(rawurl, `data:`) {
+				return "", false
+			}
+
+			// Remove \r and \n from base64 encoded data to pass url.Parse.
+			matched := dataURIbase64Prefix.FindString(rawurl)
+			if matched != "" {
+				rawurl = matched + strings.Replace(
+					strings.Replace(
+						rawurl[len(matched):],
+						"\r",
+						"",
+						-1,
+					),
+					"\n",
+					"",
+					-1,
+				)
+			}
 		}
 
 		// URLs are valid if they parse
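
The new dataURIbase64Prefix handling exists because base64 payloads are commonly wrapped across lines, and url.Parse rejects control characters such as \n. A standalone sketch of the same cleanup:

    package main

    import (
        "fmt"
        "net/url"
        "regexp"
        "strings"
    )

    // mirrors the dataURIbase64Prefix pattern in the diff
    var dataURIbase64Prefix = regexp.MustCompile(`^data:[^,]*;base64,`)

    func main() {
        // Wrapped base64: the raw \n would make url.Parse fail.
        raw := "data:image/png;base64,iVBORw0K\nGgoAAAAN"
        if m := dataURIbase64Prefix.FindString(raw); m != "" {
            raw = m + strings.NewReplacer("\r", "", "\n", "").Replace(raw[len(m):])
        }
        u, err := url.Parse(raw)
        if err != nil {
            fmt.Println("parse failed:", err)
            return
        }
        fmt.Println("scheme:", u.Scheme) // scheme: data
    }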
@@ -863,16 +906,21 @@ func (p *Policy) validURL(rawurl string) (string, bool) {
 		if u.Scheme != "" {
 
-			urlPolicy, ok := p.allowURLSchemes[u.Scheme]
+			urlPolicies, ok := p.allowURLSchemes[u.Scheme]
 			if !ok {
 				return "", false
 			}
 
-			if urlPolicy == nil || urlPolicy(u) == true {
+			if len(urlPolicies) == 0 {
 				return u.String(), true
 			}
 
+			for _, urlPolicy := range urlPolicies {
+				if urlPolicy(u) == true {
+					return u.String(), true
+				}
+			}
+
 			return "", false
 		}
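
allowURLSchemes becoming a slice per scheme means several custom scheme policies can coexist: a URL is accepted if any one of them matches, and a scheme with no registered policies is a plain allow. A sketch assuming the existing AllowURLSchemeWithCustomPolicy builder:

    package main

    import (
        "fmt"
        "net/url"

        "github.com/microcosm-cc/bluemonday"
    )

    func main() {
        p := bluemonday.UGCPolicy()
        // Two policies for https: a URL passes if either host check matches.
        p.AllowURLSchemeWithCustomPolicy("https", func(u *url.URL) bool {
            return u.Host == "example.com"
        })
        p.AllowURLSchemeWithCustomPolicy("https", func(u *url.URL) bool {
            return u.Host == "example.org"
        })

        fmt.Println(p.Sanitize(`<a href="https://example.org/">ok</a>`))
        fmt.Println(p.Sanitize(`<a href="https://attacker.test/">href dropped</a>`))
    }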
@@ -890,7 +938,14 @@ func (p *Policy) validURL(rawurl string) (string, bool) {
 func linkable(elementName string) bool {
 	switch elementName {
-	case "a", "area", "blockquote", "img", "link", "script":
+	case "a", "area", "base", "link":
+		// elements that allow .href
+		return true
+	case "blockquote", "del", "ins", "q":
+		// elements that allow .cite
+		return true
+	case "audio", "embed", "iframe", "img", "input", "script", "track", "video":
+		// elements that allow .src
 		return true
 	default:
 		return false
@@ -957,14 +1012,14 @@ func removeUnicode(value string) string {
 	return substitutedValue
 }
 
-func (p *Policy) matchRegex(elementName string) (map[string]attrPolicy, bool) {
-	aps := make(map[string]attrPolicy, 0)
+func (p *Policy) matchRegex(elementName string) (map[string][]attrPolicy, bool) {
+	aps := make(map[string][]attrPolicy, 0)
 	matched := false
 	for regex, attrs := range p.elsMatchingAndAttrs {
 		if regex.MatchString(elementName) {
 			matched = true
 			for k, v := range attrs {
-				aps[k] = v
+				aps[k] = append(aps[k], v...)
 			}
 		}
 	}
@@ -989,3 +1044,18 @@ func normaliseElementName(str string) string {
 		`"`,
 	)
 }
+
+func escapeAttributes(attrs []html.Attribute) []html.Attribute {
+	escapedAttrs := []html.Attribute{}
+	for _, attr := range attrs {
+		attr.Val = escapeAttribute(attr.Val)
+		escapedAttrs = append(escapedAttrs, attr)
+	}
+	return escapedAttrs
+}
+
+func escapeAttribute(val string) string {
+	val = strings.Replace(val, string([]rune{'\u00A0'}), `&nbsp;`, -1)
+	val = strings.Replace(val, `"`, `&quot;`, -1)
+	return val
+}
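
escapeAttribute is the security-relevant piece of this update: a raw double quote surviving into an attribute value could otherwise close the attribute and smuggle in new ones. A standalone illustration that mirrors the function above:

    package main

    import (
        "fmt"
        "strings"
    )

    // mirrors escapeAttribute from the diff
    func escapeAttribute(val string) string {
        val = strings.Replace(val, string([]rune{'\u00A0'}), `&nbsp;`, -1)
        val = strings.Replace(val, `"`, `&quot;`, -1)
        return val
    }

    func main() {
        // Without escaping, this value would terminate the title
        // attribute and inject an onmouseover handler.
        val := `x" onmouseover="alert(1)`
        fmt.Printf("<span title=\"%s\">\n", escapeAttribute(val))
        // <span title="x&quot; onmouseover=&quot;alert(1)">
    }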