Merge pull request #9827 from timstclair/html-rewrite

Html rewrite
This commit is contained in:
Saad Ali 2015-06-17 11:35:30 -07:00
commit f8e58f7efe
2 changed files with 75 additions and 82 deletions

View File

@ -29,39 +29,40 @@ import (
"github.com/golang/glog" "github.com/golang/glog"
"golang.org/x/net/html" "golang.org/x/net/html"
"golang.org/x/net/html/atom"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util" "github.com/GoogleCloudPlatform/kubernetes/pkg/util"
) )
// tagsToAttrs states which attributes of which tags require URL substitution. // atomsToAttrs states which attributes of which tags require URL substitution.
// Sources: http://www.w3.org/TR/REC-html40/index/attributes.html // Sources: http://www.w3.org/TR/REC-html40/index/attributes.html
// http://www.w3.org/html/wg/drafts/html/master/index.html#attributes-1 // http://www.w3.org/html/wg/drafts/html/master/index.html#attributes-1
var tagsToAttrs = map[string]util.StringSet{ var atomsToAttrs = map[atom.Atom]util.StringSet{
"a": util.NewStringSet("href"), atom.A: util.NewStringSet("href"),
"applet": util.NewStringSet("codebase"), atom.Applet: util.NewStringSet("codebase"),
"area": util.NewStringSet("href"), atom.Area: util.NewStringSet("href"),
"audio": util.NewStringSet("src"), atom.Audio: util.NewStringSet("src"),
"base": util.NewStringSet("href"), atom.Base: util.NewStringSet("href"),
"blockquote": util.NewStringSet("cite"), atom.Blockquote: util.NewStringSet("cite"),
"body": util.NewStringSet("background"), atom.Body: util.NewStringSet("background"),
"button": util.NewStringSet("formaction"), atom.Button: util.NewStringSet("formaction"),
"command": util.NewStringSet("icon"), atom.Command: util.NewStringSet("icon"),
"del": util.NewStringSet("cite"), atom.Del: util.NewStringSet("cite"),
"embed": util.NewStringSet("src"), atom.Embed: util.NewStringSet("src"),
"form": util.NewStringSet("action"), atom.Form: util.NewStringSet("action"),
"frame": util.NewStringSet("longdesc", "src"), atom.Frame: util.NewStringSet("longdesc", "src"),
"head": util.NewStringSet("profile"), atom.Head: util.NewStringSet("profile"),
"html": util.NewStringSet("manifest"), atom.Html: util.NewStringSet("manifest"),
"iframe": util.NewStringSet("longdesc", "src"), atom.Iframe: util.NewStringSet("longdesc", "src"),
"img": util.NewStringSet("longdesc", "src", "usemap"), atom.Img: util.NewStringSet("longdesc", "src", "usemap"),
"input": util.NewStringSet("src", "usemap", "formaction"), atom.Input: util.NewStringSet("src", "usemap", "formaction"),
"ins": util.NewStringSet("cite"), atom.Ins: util.NewStringSet("cite"),
"link": util.NewStringSet("href"), atom.Link: util.NewStringSet("href"),
"object": util.NewStringSet("classid", "codebase", "data", "usemap"), atom.Object: util.NewStringSet("classid", "codebase", "data", "usemap"),
"q": util.NewStringSet("cite"), atom.Q: util.NewStringSet("cite"),
"script": util.NewStringSet("src"), atom.Script: util.NewStringSet("src"),
"source": util.NewStringSet("src"), atom.Source: util.NewStringSet("src"),
"video": util.NewStringSet("poster", "src"), atom.Video: util.NewStringSet("poster", "src"),
// TODO: CSS URLs hidden in style elements. // TODO: CSS URLs hidden in style elements.
} }
@ -108,7 +109,7 @@ func (t *Transport) RoundTrip(req *http.Request) (*http.Response, error) {
return resp, nil return resp, nil
} }
return t.fixLinks(req, resp) return t.rewriteResponse(req, resp)
} }
// rewriteURL rewrites a single URL to go through the proxy, if the URL refers // rewriteURL rewrites a single URL to go through the proxy, if the URL refers
@ -139,36 +140,42 @@ func (t *Transport) rewriteURL(targetURL string, sourceURL *url.URL) string {
return url.String() return url.String()
} }
// updateURLs checks and updates any of n's attributes that are listed in tagsToAttrs. // rewriteHTML scans the HTML for tags with url-valued attributes, and updates
// Any URLs found are, if they share the source host, updated with the necessary changes // those values with the urlRewriter function. The updated HTML is output to the
// to make a visit to that URL also go through the proxy. // writer.
// sourceURL is the URL of the page which we're currently on. func rewriteHTML(reader io.Reader, writer io.Writer, urlRewriter func(string) string) error {
func (t *Transport) updateURLs(n *html.Node, sourceURL *url.URL) { // Note: This assumes the content is UTF-8.
if n.Type != html.ElementNode { tokenizer := html.NewTokenizer(reader)
return
} var err error
attrs, ok := tagsToAttrs[n.Data] for err == nil {
if !ok { tokenType := tokenizer.Next()
return switch tokenType {
} case html.ErrorToken:
for i, attr := range n.Attr { err = tokenizer.Err()
if !attrs.Has(attr.Key) { case html.StartTagToken, html.SelfClosingTagToken:
continue token := tokenizer.Token()
if urlAttrs, ok := atomsToAttrs[token.DataAtom]; ok {
for i, attr := range token.Attr {
if urlAttrs.Has(attr.Key) {
token.Attr[i].Val = urlRewriter(attr.Val)
}
}
}
_, err = writer.Write([]byte(token.String()))
default:
_, err = writer.Write(tokenizer.Raw())
} }
n.Attr[i].Val = t.rewriteURL(attr.Val, sourceURL)
} }
if err != io.EOF {
return err
}
return nil
} }
// scan recursively calls f for every n and every subnode of n. // rewriteResponse modifies an HTML response by updating absolute links referring
func (t *Transport) scan(n *html.Node, f func(*html.Node)) { // to the original host to instead refer to the proxy transport.
f(n) func (t *Transport) rewriteResponse(req *http.Request, resp *http.Response) (*http.Response, error) {
for c := n.FirstChild; c != nil; c = c.NextSibling {
t.scan(c, f)
}
}
// fixLinks modifies links in an HTML file such that they will be redirected through the proxy if needed.
func (t *Transport) fixLinks(req *http.Request, resp *http.Response) (*http.Response, error) {
origBody := resp.Body origBody := resp.Body
defer origBody.Close() defer origBody.Close()
@ -195,15 +202,13 @@ func (t *Transport) fixLinks(req *http.Request, resp *http.Response) (*http.Resp
return resp, nil return resp, nil
} }
doc, err := html.Parse(reader) urlRewriter := func(targetUrl string) string {
if err != nil { return t.rewriteURL(targetUrl, req.URL)
glog.Errorf("Parse failed: %v", err)
return resp, err
} }
err := rewriteHTML(reader, writer, urlRewriter)
t.scan(doc, func(n *html.Node) { t.updateURLs(n, req.URL) }) if err != nil {
if err := html.Render(writer, doc); err != nil { glog.Errorf("Failed to rewrite URLs: %v", err)
glog.Errorf("Failed to render: %v", err) return resp, err
} }
resp.Body = ioutil.NopCloser(newContent) resp.Body = ioutil.NopCloser(newContent)

View File

@ -17,7 +17,6 @@ limitations under the License.
package proxy package proxy
import ( import (
"bytes"
"fmt" "fmt"
"io/ioutil" "io/ioutil"
"net/http" "net/http"
@ -25,8 +24,6 @@ import (
"net/url" "net/url"
"strings" "strings"
"testing" "testing"
"golang.org/x/net/html"
) )
func parseURLOrDie(inURL string) *url.URL { func parseURLOrDie(inURL string) *url.URL {
@ -37,19 +34,6 @@ func parseURLOrDie(inURL string) *url.URL {
return parsed return parsed
} }
// fmtHTML parses and re-emits 'in', effectively canonicalizing it.
func fmtHTML(in string) string {
doc, err := html.Parse(strings.NewReader(in))
if err != nil {
panic(err)
}
out := &bytes.Buffer{}
if err := html.Render(out, doc); err != nil {
panic(err)
}
return string(out.Bytes())
}
func TestProxyTransport(t *testing.T) { func TestProxyTransport(t *testing.T) {
testTransport := &Transport{ testTransport := &Transport{
Scheme: "http", Scheme: "http",
@ -81,6 +65,14 @@ func TestProxyTransport(t *testing.T) {
contentType: "text/html", contentType: "text/html",
forwardedURI: "/proxy/minion/minion1:10250/logs/log.log", forwardedURI: "/proxy/minion/minion1:10250/logs/log.log",
}, },
"full document": {
input: `<html><header></header><body><pre><a href="kubelet.log">kubelet.log</a><a href="/google.log">google.log</a></pre></body></html>`,
sourceURL: "http://myminion.com/logs/log.log",
transport: testTransport,
output: `<html><header></header><body><pre><a href="kubelet.log">kubelet.log</a><a href="http://foo.com/proxy/minion/minion1:10250/google.log">google.log</a></pre></body></html>`,
contentType: "text/html",
forwardedURI: "/proxy/minion/minion1:10250/logs/log.log",
},
"trailing slash": { "trailing slash": {
input: `<pre><a href="kubelet.log">kubelet.log</a><a href="/google.log/">google.log</a></pre>`, input: `<pre><a href="kubelet.log">kubelet.log</a><a href="/google.log/">google.log</a></pre>`,
sourceURL: "http://myminion.com/logs/log.log", sourceURL: "http://myminion.com/logs/log.log",
@ -161,10 +153,6 @@ func TestProxyTransport(t *testing.T) {
} }
testItem := func(name string, item *Item) { testItem := func(name string, item *Item) {
// Canonicalize the html so we can diff.
item.input = fmtHTML(item.input)
item.output = fmtHTML(item.output)
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
// Check request headers. // Check request headers.
if got, want := r.Header.Get("X-Forwarded-Uri"), item.forwardedURI; got != want { if got, want := r.Header.Get("X-Forwarded-Uri"), item.forwardedURI; got != want {