1
0
mirror of https://github.com/go-gitea/gitea.git synced 2025-04-29 20:25:17 +00:00
gitea/modules/markup/html.go
Dejan Kitic af6be75adb
Valid email address should only start with alphanumeric ()
This fixes issue  where regular expression allowed email address
to start with special symbols. Valid email addresses should start with
alphanumeric character, and as such will be rendered as email.

Added test cases from the bug report to validate, such input will not be
rendered anymore as email address.

---------

Co-authored-by: wxiaoguang <wxiaoguang@gmail.com>
2025-04-20 19:18:14 +08:00

422 lines
14 KiB
Go

// Copyright 2017 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package markup
import (
"bytes"
"fmt"
"io"
"regexp"
"strings"
"sync"
"code.gitea.io/gitea/modules/markup/common"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
"mvdan.cc/xurls/v2"
)
// Issue name styles
const (
IssueNameStyleNumeric = "numeric"
IssueNameStyleAlphanumeric = "alphanumeric"
IssueNameStyleRegexp = "regexp"
)
type globalVarsType struct {
hashCurrentPattern *regexp.Regexp
shortLinkPattern *regexp.Regexp
anyHashPattern *regexp.Regexp
comparePattern *regexp.Regexp
fullURLPattern *regexp.Regexp
emailRegex *regexp.Regexp
emojiShortCodeRegex *regexp.Regexp
issueFullPattern *regexp.Regexp
filesChangedFullPattern *regexp.Regexp
codePreviewPattern *regexp.Regexp
tagCleaner *regexp.Regexp
nulCleaner *strings.Replacer
}
var globalVars = sync.OnceValue(func() *globalVarsType {
v := &globalVarsType{}
// NOTE: All below regex matching do not perform any extra validation.
// Thus a link is produced even if the linked entity does not exist.
// While fast, this is also incorrect and lead to false positives.
// TODO: fix invalid linking issue (update: stale TODO, what issues? maybe no TODO anymore)
// valid chars in encoded path and parameter: [-+~_%.a-zA-Z0-9/]
// hashCurrentPattern matches string that represents a commit SHA, e.g. d8a994ef243349f321568f9e36d5c3f444b99cae
// Although SHA1 hashes are 40 chars long, SHA256 are 64, the regex matches the hash from 7 to 64 chars in length
// so that abbreviated hash links can be used as well. This matches git and GitHub usability.
v.hashCurrentPattern = regexp.MustCompile(`(?:\s|^|\(|\[)([0-9a-f]{7,64})(?:\s|$|\)|\]|[.,:](\s|$))`)
// shortLinkPattern matches short but difficult to parse [[name|link|arg=test]] syntax
v.shortLinkPattern = regexp.MustCompile(`\[\[(.*?)\]\](\w*)`)
// anyHashPattern splits url containing SHA into parts
v.anyHashPattern = regexp.MustCompile(`https?://(?:\S+/){4,5}([0-9a-f]{40,64})(/[-+~%./\w]+)?(\?[-+~%.\w&=]+)?(#[-+~%.\w]+)?`)
// comparePattern matches "http://domain/org/repo/compare/COMMIT1...COMMIT2#hash"
v.comparePattern = regexp.MustCompile(`https?://(?:\S+/){4,5}([0-9a-f]{7,64})(\.\.\.?)([0-9a-f]{7,64})?(#[-+~_%.a-zA-Z0-9]+)?`)
// fullURLPattern matches full URL like "mailto:...", "https://..." and "ssh+git://..."
v.fullURLPattern = regexp.MustCompile(`^[a-z][-+\w]+:`)
// emailRegex is definitely not perfect with edge cases,
// it is still accepted by the CommonMark specification, as well as the HTML5 spec:
// http://spec.commonmark.org/0.28/#email-address
// https://html.spec.whatwg.org/multipage/input.html#e-mail-state-(type%3Demail)
// At the moment, we use stricter rule for rendering purpose: only allow the "name" part starting after the word boundary
v.emailRegex = regexp.MustCompile(`\b([-\w.!#$%&'*+/=?^{|}~]*@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9]{2,}(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)+)\b`)
// emojiShortCodeRegex find emoji by alias like :smile:
v.emojiShortCodeRegex = regexp.MustCompile(`:[-+\w]+:`)
// example: https://domain/org/repo/pulls/27#hash
v.issueFullPattern = regexp.MustCompile(`https?://(?:\S+/)[\w_.-]+/[\w_.-]+/(?:issues|pulls)/((?:\w{1,10}-)?[1-9][0-9]*)([\?|#](\S+)?)?\b`)
// example: https://domain/org/repo/pulls/27/files#hash
v.filesChangedFullPattern = regexp.MustCompile(`https?://(?:\S+/)[\w_.-]+/[\w_.-]+/pulls/((?:\w{1,10}-)?[1-9][0-9]*)/files([\?|#](\S+)?)?\b`)
// codePreviewPattern matches "http://domain/.../{owner}/{repo}/src/commit/{commit}/{filepath}#L10-L20"
v.codePreviewPattern = regexp.MustCompile(`https?://\S+/([^\s/]+)/([^\s/]+)/src/commit/([0-9a-f]{7,64})(/\S+)#(L\d+(-L\d+)?)`)
// cleans: "<foo/bar", "<any words/", ("<html", "<head", "<script", "<style")
v.tagCleaner = regexp.MustCompile(`(?i)<(/?\w+/\w+|/[\w ]+/|/?(html|head|script|style\b))`)
v.nulCleaner = strings.NewReplacer("\000", "")
return v
})
func IsFullURLString(link string) bool {
return globalVars().fullURLPattern.MatchString(link)
}
func IsNonEmptyRelativePath(link string) bool {
return link != "" && !IsFullURLString(link) && link[0] != '?' && link[0] != '#'
}
// CustomLinkURLSchemes allows for additional schemes to be detected when parsing links within text
func CustomLinkURLSchemes(schemes []string) {
schemes = append(schemes, "http", "https")
withAuth := make([]string, 0, len(schemes))
validScheme := regexp.MustCompile(`^[a-z]+$`)
for _, s := range schemes {
if !validScheme.MatchString(s) {
continue
}
without := false
for _, sna := range xurls.SchemesNoAuthority {
if s == sna {
without = true
break
}
}
if without {
s += ":"
} else {
s += "://"
}
withAuth = append(withAuth, s)
}
common.GlobalVars().LinkRegex, _ = xurls.StrictMatchingScheme(strings.Join(withAuth, "|"))
}
type processor func(ctx *RenderContext, node *html.Node)
// PostProcessDefault does the final required transformations to the passed raw HTML
// data, and ensures its validity. Transformations include: replacing links and
// emails with HTML links, parsing shortlinks in the format of [[Link]], like
// MediaWiki, linking issues in the format #ID, and mentions in the format
// @user, and others.
func PostProcessDefault(ctx *RenderContext, input io.Reader, output io.Writer) error {
procs := []processor{
fullIssuePatternProcessor,
comparePatternProcessor,
codePreviewPatternProcessor,
fullHashPatternProcessor,
shortLinkProcessor,
linkProcessor,
mentionProcessor,
issueIndexPatternProcessor,
commitCrossReferencePatternProcessor,
hashCurrentPatternProcessor,
emailAddressProcessor,
emojiProcessor,
emojiShortCodeProcessor,
}
return postProcess(ctx, procs, input, output)
}
// PostProcessCommitMessage will use the same logic as PostProcess, but will disable
// the shortLinkProcessor.
func PostProcessCommitMessage(ctx *RenderContext, content string) (string, error) {
procs := []processor{
fullIssuePatternProcessor,
comparePatternProcessor,
fullHashPatternProcessor,
linkProcessor,
mentionProcessor,
issueIndexPatternProcessor,
commitCrossReferencePatternProcessor,
hashCurrentPatternProcessor,
emailAddressProcessor,
emojiProcessor,
emojiShortCodeProcessor,
}
return postProcessString(ctx, procs, content)
}
var emojiProcessors = []processor{
emojiShortCodeProcessor,
emojiProcessor,
}
// PostProcessCommitMessageSubject will use the same logic as PostProcess and
// PostProcessCommitMessage, but will disable the shortLinkProcessor and
// emailAddressProcessor, will add a defaultLinkProcessor if defaultLink is set,
// which changes every text node into a link to the passed default link.
func PostProcessCommitMessageSubject(ctx *RenderContext, defaultLink, content string) (string, error) {
procs := []processor{
fullIssuePatternProcessor,
comparePatternProcessor,
fullHashPatternProcessor,
linkProcessor,
mentionProcessor,
issueIndexPatternProcessor,
commitCrossReferencePatternProcessor,
hashCurrentPatternProcessor,
emojiShortCodeProcessor,
emojiProcessor,
}
procs = append(procs, func(ctx *RenderContext, node *html.Node) {
ch := &html.Node{Parent: node, Type: html.TextNode, Data: node.Data}
node.Type = html.ElementNode
node.Data = "a"
node.DataAtom = atom.A
node.Attr = []html.Attribute{{Key: "href", Val: defaultLink}, {Key: "class", Val: "muted"}}
node.FirstChild, node.LastChild = ch, ch
})
return postProcessString(ctx, procs, content)
}
// PostProcessIssueTitle to process title on individual issue/pull page
func PostProcessIssueTitle(ctx *RenderContext, title string) (string, error) {
return postProcessString(ctx, []processor{
issueIndexPatternProcessor,
commitCrossReferencePatternProcessor,
hashCurrentPatternProcessor,
emojiShortCodeProcessor,
emojiProcessor,
}, title)
}
// PostProcessDescriptionHTML will use similar logic as PostProcess, but will
// use a single special linkProcessor.
func PostProcessDescriptionHTML(ctx *RenderContext, content string) (string, error) {
return postProcessString(ctx, []processor{
descriptionLinkProcessor,
emojiShortCodeProcessor,
emojiProcessor,
}, content)
}
// PostProcessEmoji for when we want to just process emoji and shortcodes
// in various places it isn't already run through the normal markdown processor
func PostProcessEmoji(ctx *RenderContext, content string) (string, error) {
return postProcessString(ctx, emojiProcessors, content)
}
func postProcessString(ctx *RenderContext, procs []processor, content string) (string, error) {
var buf strings.Builder
if err := postProcess(ctx, procs, strings.NewReader(content), &buf); err != nil {
return "", err
}
return buf.String(), nil
}
func postProcess(ctx *RenderContext, procs []processor, input io.Reader, output io.Writer) error {
if !ctx.usedByRender && ctx.RenderHelper != nil {
defer ctx.RenderHelper.CleanUp()
}
// FIXME: don't read all content to memory
rawHTML, err := io.ReadAll(input)
if err != nil {
return err
}
// parse the HTML
node, err := html.Parse(io.MultiReader(
// prepend "<html><body>"
strings.NewReader("<html><body>"),
// Strip out nuls - they're always invalid
bytes.NewReader(globalVars().tagCleaner.ReplaceAll([]byte(globalVars().nulCleaner.Replace(string(rawHTML))), []byte("&lt;$1"))),
// close the tags
strings.NewReader("</body></html>"),
))
if err != nil {
return fmt.Errorf("markup.postProcess: invalid HTML: %w", err)
}
if node.Type == html.DocumentNode {
node = node.FirstChild
}
visitNode(ctx, procs, node)
newNodes := make([]*html.Node, 0, 5)
if node.Data == "html" {
node = node.FirstChild
for node != nil && node.Data != "body" {
node = node.NextSibling
}
}
if node != nil {
if node.Data == "body" {
child := node.FirstChild
for child != nil {
newNodes = append(newNodes, child)
child = child.NextSibling
}
} else {
newNodes = append(newNodes, node)
}
}
// Render everything to buf.
for _, node := range newNodes {
if err := html.Render(output, node); err != nil {
return fmt.Errorf("markup.postProcess: html.Render: %w", err)
}
}
return nil
}
func isEmojiNode(node *html.Node) bool {
if node.Type == html.ElementNode && node.Data == atom.Span.String() {
for _, attr := range node.Attr {
if (attr.Key == "class" || attr.Key == "data-attr-class") && strings.Contains(attr.Val, "emoji") {
return true
}
}
}
return false
}
func visitNode(ctx *RenderContext, procs []processor, node *html.Node) *html.Node {
if node.Type == html.TextNode {
for _, proc := range procs {
proc(ctx, node) // it might add siblings
}
return node.NextSibling
}
if node.Type != html.ElementNode {
return node.NextSibling
}
processNodeAttrID(node)
if isEmojiNode(node) {
// TextNode emoji will be converted to `<span class="emoji">`, then the next iteration will visit the "span"
// if we don't stop it, it will go into the TextNode again and create an infinite recursion
return node.NextSibling
} else if node.Data == "code" || node.Data == "pre" {
return node.NextSibling // ignore code and pre nodes
} else if node.Data == "img" {
return visitNodeImg(ctx, node)
} else if node.Data == "video" {
return visitNodeVideo(ctx, node)
}
if node.Data == "a" {
processNodeA(ctx, node)
// only use emoji processors for the content in the "A" tag,
// because the content there is not processable, for example: the content is a commit id or a full URL.
procs = emojiProcessors
}
for n := node.FirstChild; n != nil; {
n = visitNode(ctx, procs, n)
}
return node.NextSibling
}
// createKeyword() renders a highlighted version of an action keyword
func createKeyword(ctx *RenderContext, content string) *html.Node {
// CSS class for action keywords (e.g. "closes: #1")
const keywordClass = "issue-keyword"
span := &html.Node{
Type: html.ElementNode,
Data: atom.Span.String(),
Attr: []html.Attribute{},
}
span.Attr = append(span.Attr, ctx.RenderInternal.NodeSafeAttr("class", keywordClass))
text := &html.Node{
Type: html.TextNode,
Data: content,
}
span.AppendChild(text)
return span
}
func createLink(ctx *RenderContext, href, content, class string) *html.Node {
a := &html.Node{
Type: html.ElementNode,
Data: atom.A.String(),
Attr: []html.Attribute{{Key: "href", Val: href}},
}
if !RenderBehaviorForTesting.DisableAdditionalAttributes {
a.Attr = append(a.Attr, html.Attribute{Key: "data-markdown-generated-content"})
}
if class != "" {
a.Attr = append(a.Attr, ctx.RenderInternal.NodeSafeAttr("class", class))
}
text := &html.Node{
Type: html.TextNode,
Data: content,
}
a.AppendChild(text)
return a
}
// replaceContent takes text node, and in its content it replaces a section of
// it with the specified newNode.
func replaceContent(node *html.Node, i, j int, newNode *html.Node) {
replaceContentList(node, i, j, []*html.Node{newNode})
}
// replaceContentList takes text node, and in its content it replaces a section of
// it with the specified newNodes. An example to visualize how this can work can
// be found here: https://play.golang.org/p/5zP8NnHZ03s
func replaceContentList(node *html.Node, i, j int, newNodes []*html.Node) {
// get the data before and after the match
before := node.Data[:i]
after := node.Data[j:]
// Replace in the current node the text, so that it is only what it is
// supposed to have.
node.Data = before
// Get the current next sibling, before which we place the replaced data,
// and after that we place the new text node.
nextSibling := node.NextSibling
for _, n := range newNodes {
node.Parent.InsertBefore(n, nextSibling)
}
if after != "" {
node.Parent.InsertBefore(&html.Node{
Type: html.TextNode,
Data: after,
}, nextSibling)
}
}