From 49b8c975d8ea6830350dcda03b6d82ed3ee94319 Mon Sep 17 00:00:00 2001 From: "Tim St. Clair" Date: Mon, 15 Jun 2015 11:54:48 -0700 Subject: [PATCH] Update the URL-rewriting logic to make minimal modifications. The new approach avoids building a complete parse tree, instead using the lower level token stream. Doing so removes the need for creating "missing" HTML elements, reducing the collateral changes to the rewritten HTML. Resolves: #9766 --- pkg/util/proxy/transport.go | 129 ++++++++++++++++--------------- pkg/util/proxy/transport_test.go | 28 ++----- 2 files changed, 75 insertions(+), 82 deletions(-) diff --git a/pkg/util/proxy/transport.go b/pkg/util/proxy/transport.go index 23e76c53064..c0544dfa8e6 100644 --- a/pkg/util/proxy/transport.go +++ b/pkg/util/proxy/transport.go @@ -29,39 +29,40 @@ import ( "github.com/golang/glog" "golang.org/x/net/html" + "golang.org/x/net/html/atom" "github.com/GoogleCloudPlatform/kubernetes/pkg/util" ) -// tagsToAttrs states which attributes of which tags require URL substitution. +// atomsToAttrs states which attributes of which tags require URL substitution. // Sources: http://www.w3.org/TR/REC-html40/index/attributes.html // http://www.w3.org/html/wg/drafts/html/master/index.html#attributes-1 -var tagsToAttrs = map[string]util.StringSet{ - "a": util.NewStringSet("href"), - "applet": util.NewStringSet("codebase"), - "area": util.NewStringSet("href"), - "audio": util.NewStringSet("src"), - "base": util.NewStringSet("href"), - "blockquote": util.NewStringSet("cite"), - "body": util.NewStringSet("background"), - "button": util.NewStringSet("formaction"), - "command": util.NewStringSet("icon"), - "del": util.NewStringSet("cite"), - "embed": util.NewStringSet("src"), - "form": util.NewStringSet("action"), - "frame": util.NewStringSet("longdesc", "src"), - "head": util.NewStringSet("profile"), - "html": util.NewStringSet("manifest"), - "iframe": util.NewStringSet("longdesc", "src"), - "img": util.NewStringSet("longdesc", "src", "usemap"), - "input": util.NewStringSet("src", "usemap", "formaction"), - "ins": util.NewStringSet("cite"), - "link": util.NewStringSet("href"), - "object": util.NewStringSet("classid", "codebase", "data", "usemap"), - "q": util.NewStringSet("cite"), - "script": util.NewStringSet("src"), - "source": util.NewStringSet("src"), - "video": util.NewStringSet("poster", "src"), +var atomsToAttrs = map[atom.Atom]util.StringSet{ + atom.A: util.NewStringSet("href"), + atom.Applet: util.NewStringSet("codebase"), + atom.Area: util.NewStringSet("href"), + atom.Audio: util.NewStringSet("src"), + atom.Base: util.NewStringSet("href"), + atom.Blockquote: util.NewStringSet("cite"), + atom.Body: util.NewStringSet("background"), + atom.Button: util.NewStringSet("formaction"), + atom.Command: util.NewStringSet("icon"), + atom.Del: util.NewStringSet("cite"), + atom.Embed: util.NewStringSet("src"), + atom.Form: util.NewStringSet("action"), + atom.Frame: util.NewStringSet("longdesc", "src"), + atom.Head: util.NewStringSet("profile"), + atom.Html: util.NewStringSet("manifest"), + atom.Iframe: util.NewStringSet("longdesc", "src"), + atom.Img: util.NewStringSet("longdesc", "src", "usemap"), + atom.Input: util.NewStringSet("src", "usemap", "formaction"), + atom.Ins: util.NewStringSet("cite"), + atom.Link: util.NewStringSet("href"), + atom.Object: util.NewStringSet("classid", "codebase", "data", "usemap"), + atom.Q: util.NewStringSet("cite"), + atom.Script: util.NewStringSet("src"), + atom.Source: util.NewStringSet("src"), + atom.Video: util.NewStringSet("poster", "src"), // TODO: css URLs hidden in style elements. } @@ -108,7 +109,7 @@ func (t *Transport) RoundTrip(req *http.Request) (*http.Response, error) { return resp, nil } - return t.fixLinks(req, resp) + return t.rewriteResponse(req, resp) } // rewriteURL rewrites a single URL to go through the proxy, if the URL refers @@ -139,36 +140,42 @@ func (t *Transport) rewriteURL(targetURL string, sourceURL *url.URL) string { return url.String() } -// updateURLs checks and updates any of n's attributes that are listed in tagsToAttrs. -// Any URLs found are, if they share the source host, updated with the necessary changes -// to make a visit to that URL also go through the proxy. -// sourceURL is the URL of the page which we're currently on. -func (t *Transport) updateURLs(n *html.Node, sourceURL *url.URL) { - if n.Type != html.ElementNode { - return - } - attrs, ok := tagsToAttrs[n.Data] - if !ok { - return - } - for i, attr := range n.Attr { - if !attrs.Has(attr.Key) { - continue +// rewriteHTML scans the HTML for tags with url-valued attributes, and updates +// those values with the urlRewriter function. The updated HTML is output to the +// writer. +func rewriteHTML(reader io.Reader, writer io.Writer, urlRewriter func(string) string) error { + // Note: This assumes the content is UTF-8. + tokenizer := html.NewTokenizer(reader) + + var err error + for err == nil { + tokenType := tokenizer.Next() + switch tokenType { + case html.ErrorToken: + err = tokenizer.Err() + case html.StartTagToken, html.SelfClosingTagToken: + token := tokenizer.Token() + if urlAttrs, ok := atomsToAttrs[token.DataAtom]; ok { + for i, attr := range token.Attr { + if urlAttrs.Has(attr.Key) { + token.Attr[i].Val = urlRewriter(attr.Val) + } + } + } + _, err = writer.Write([]byte(token.String())) + default: + _, err = writer.Write(tokenizer.Raw()) } - n.Attr[i].Val = t.rewriteURL(attr.Val, sourceURL) } + if err != io.EOF { + return err + } + return nil } -// scan recursively calls f for every n and every subnode of n. -func (t *Transport) scan(n *html.Node, f func(*html.Node)) { - f(n) - for c := n.FirstChild; c != nil; c = c.NextSibling { - t.scan(c, f) - } -} - -// fixLinks modifies links in an HTML file such that they will be redirected through the proxy if needed. -func (t *Transport) fixLinks(req *http.Request, resp *http.Response) (*http.Response, error) { +// rewriteResponse modifies an HTML response by updating absolute links refering +// to the original host to instead refer to the proxy transport. +func (t *Transport) rewriteResponse(req *http.Request, resp *http.Response) (*http.Response, error) { origBody := resp.Body defer origBody.Close() @@ -195,15 +202,13 @@ func (t *Transport) fixLinks(req *http.Request, resp *http.Response) (*http.Resp return resp, nil } - doc, err := html.Parse(reader) - if err != nil { - glog.Errorf("Parse failed: %v", err) - return resp, err + urlRewriter := func(targetUrl string) string { + return t.rewriteURL(targetUrl, req.URL) } - - t.scan(doc, func(n *html.Node) { t.updateURLs(n, req.URL) }) - if err := html.Render(writer, doc); err != nil { - glog.Errorf("Failed to render: %v", err) + err := rewriteHTML(reader, writer, urlRewriter) + if err != nil { + glog.Errorf("Failed to rewrite URLs: %v", err) + return resp, err } resp.Body = ioutil.NopCloser(newContent) diff --git a/pkg/util/proxy/transport_test.go b/pkg/util/proxy/transport_test.go index e140407ea57..4f5fb01ced9 100644 --- a/pkg/util/proxy/transport_test.go +++ b/pkg/util/proxy/transport_test.go @@ -17,7 +17,6 @@ limitations under the License. package proxy import ( - "bytes" "fmt" "io/ioutil" "net/http" @@ -25,8 +24,6 @@ import ( "net/url" "strings" "testing" - - "golang.org/x/net/html" ) func parseURLOrDie(inURL string) *url.URL { @@ -37,19 +34,6 @@ func parseURLOrDie(inURL string) *url.URL { return parsed } -// fmtHTML parses and re-emits 'in', effectively canonicalizing it. -func fmtHTML(in string) string { - doc, err := html.Parse(strings.NewReader(in)) - if err != nil { - panic(err) - } - out := &bytes.Buffer{} - if err := html.Render(out, doc); err != nil { - panic(err) - } - return string(out.Bytes()) -} - func TestProxyTransport(t *testing.T) { testTransport := &Transport{ Scheme: "http", @@ -81,6 +65,14 @@ func TestProxyTransport(t *testing.T) { contentType: "text/html", forwardedURI: "/proxy/minion/minion1:10250/logs/log.log", }, + "full document": { + input: `
kubelet.loggoogle.log
`, + sourceURL: "http://myminion.com/logs/log.log", + transport: testTransport, + output: `
kubelet.loggoogle.log
`, + contentType: "text/html", + forwardedURI: "/proxy/minion/minion1:10250/logs/log.log", + }, "trailing slash": { input: `
kubelet.loggoogle.log
`, sourceURL: "http://myminion.com/logs/log.log", @@ -161,10 +153,6 @@ func TestProxyTransport(t *testing.T) { } testItem := func(name string, item *Item) { - // Canonicalize the html so we can diff. - item.input = fmtHTML(item.input) - item.output = fmtHTML(item.output) - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { // Check request headers. if got, want := r.Header.Get("X-Forwarded-Uri"), item.forwardedURI; got != want {