sanitizehtml: add a package to sanitize HTML

It is based on an existing HTML parser in golang.org/x/net/html.
The motivation is to support user-supplied HTML in step_text in Milo.

R=dnj@chromium.org, vadimsh@chromium.org
BUG=704387

Review-Url: https://codereview.chromium.org/2849353002
diff --git a/common/data/text/sanitizehtml/sanitize.go b/common/data/text/sanitizehtml/sanitize.go new file mode 100644 index 0000000..ce06f04 --- /dev/null +++ b/common/data/text/sanitizehtml/sanitize.go
@@ -0,0 +1,203 @@ +// Copyright 2017 The LUCI Authors. All rights reserved. +// Use of this source code is governed under the Apache License, Version 2.0 +// that can be found in the LICENSE file. + +// Package sanitizehtml implements a sanitizer of a very limited HTML. +// See Sanitize comment. +package sanitizehtml + +import ( + "bufio" + "io" + "net/url" + "strings" + + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +// attrValueSanitizer sanitizes an attribute value. +type attrValueSanitizer func(string) string + +func alwaysSafe(s string) string { + return s +} + +func sanitizeURL(s string) string { + const sanitizedPrefix = "about:invalid#sanitized&reason=" + switch u, err := url.Parse(s); { + case err != nil: + return sanitizedPrefix + "malformed-url" + + case u.Scheme != "http" && u.Scheme != "https": + return sanitizedPrefix + "disallowed-scheme" + + case u.Host == "": + return sanitizedPrefix + "relative-url" + + default: + // re-serialize the URL to ensure that what we return is what we think + // we parsed. + return u.String() + } +} + +type attrMap map[string]attrValueSanitizer + +var ( + anchorAttrs = attrMap{ + "alt": alwaysSafe, + "href": sanitizeURL, + } + trAttrs = attrMap{ + "rowspan": alwaysSafe, + "colspan": alwaysSafe, + } + tdAttrs = attrMap{ + "rowspan": alwaysSafe, + "colspan": alwaysSafe, + } +) + +type stringWriter interface { + WriteString(string) (int, error) +} + +type sanitizer struct { + sw stringWriter + err error +} + +// p prints the text, unless there was an error before. 
+func (s *sanitizer) p(safeMarkup string) { + if s.err == nil { + _, s.err = s.sw.WriteString(safeMarkup) + } +} + +// printAttrs sanitizes and prints a whitelist of attributes in el +func (s *sanitizer) printAttrs(el *html.Node, whitelist attrMap) { + for _, a := range el.Attr { + key := strings.ToLower(a.Key) + if sanitizer, ok := whitelist[key]; a.Namespace == "" && ok { + s.p(" ") + s.p(key) + s.p("=\"") + s.p(html.EscapeString(sanitizer(a.Val))) + s.p("\"") + } + } +} + +// printElem prints the safe element with a whitelist of attributes. +// If allowedAttrs is nil, all attributes are omitted. +// +// Do not call for unsafe elements. +func (s *sanitizer) printElem(safeElement *html.Node, allowedAttrs attrMap) { + tag := safeElement.DataAtom.String() + s.p("<") + s.p(tag) + if allowedAttrs == nil { + // ignore attributes + } else { + s.printAttrs(safeElement, allowedAttrs) + } + s.p(">") + + s.visitChildren(safeElement) + + s.p("</") + s.p(tag) + s.p(">") +} + +func (s *sanitizer) visit(n *html.Node) { + switch n.Type { + case html.TextNode: + // print it escaped. + s.p(html.EscapeString(n.Data)) + + case html.ElementNode: + // This switch statement defines what HTML elements we allow. + switch n.DataAtom { + case atom.Br: + // br is allowed and it should not be closed + s.p("<br>") + + case atom.Script, atom.Style: + // ignore entirely + // do not visit children so we don't print inner text + + case atom.A: + s.p(`<a rel="noopener" target="_blank"`) + s.printAttrs(n, anchorAttrs) + s.p(">") + s.visitChildren(n) + s.p("</a>") + + case atom.P, atom.Ol, atom.Ul, atom.Li, atom.Table, atom.Strong, atom.Em: + // print without attributes + s.printElem(n, nil) + + case atom.Tr: + s.printElem(n, trAttrs) + + case atom.Td: + s.printElem(n, tdAttrs) + + default: + // ignore the element, but visit children. + s.visitChildren(n) + } + + default: + // ignore the node, but visit children. 
+ s.visitChildren(n) + } +} + +func (s *sanitizer) visitChildren(n *html.Node) { + for c := n.FirstChild; c != nil; c = c.NextSibling { + s.visit(c) + } +} + +// Sanitize strips all HTML nodes except allowed ones. +// +// Unless explicitly specified, attributes are stripped. +// Allowed elements: +// - p, br +// - strong, em +// - a +// - if href attribute is not a valid absolute HTTP(s) link, it is replaced +// with an innocuous one. +// - alt attribute is allowed +// - ul, ol, li +// - table +// - tr, td. Attributes rowspan/colspan are allowed. +// +// Elements <script> and <style> are ignored entirely. +// For all other HTML nodes, Sanitize ignores the node, but visits its children. +func Sanitize(w io.Writer, r io.Reader) (err error) { + var root *html.Node + root, err = html.Parse(r) + if err != nil { + return err + } + + sw, ok := w.(stringWriter) + if !ok { + bw := bufio.NewWriter(w) + defer func() { + ferr := bw.Flush() + if err == nil { + err = ferr + } + }() + sw = bw + } + + s := sanitizer{sw: sw} + s.visit(root) + return s.err +}
diff --git a/common/data/text/sanitizehtml/sanitize_test.go b/common/data/text/sanitizehtml/sanitize_test.go new file mode 100644 index 0000000..179e3af --- /dev/null +++ b/common/data/text/sanitizehtml/sanitize_test.go
@@ -0,0 +1,141 @@ +// Copyright 2017 The LUCI Authors. All rights reserved. +// Use of this source code is governed under the Apache License, Version 2.0 +// that can be found in the LICENSE file. + +package sanitizehtml + +import ( + "bytes" + "strings" + "testing" + + . "github.com/smartystreets/goconvey/convey" +) + +func TestSanitize(t *testing.T) { + t.Parallel() + + cases := []struct { + in, out string + }{ + // Scripts + { + `<script src="evil.js"/>`, + ``, + }, + + // Paragraphs + { + `<p style="font-size: 100">hi</p>`, + `<p>hi</p>`, + }, + { + `<P>hi</P>`, + `<p>hi</p>`, + }, + { + `a<br>b`, + `a<br>b`, + }, + + // Lists + { + `<ul foo="bar"> + <li x="y">a</li> + <li>a</li> + </ul>`, + `<ul> + <li>a</li> + <li>a</li> + </ul>`, + }, + + // Links + { + `<a href="https://ci.chromium.org" alt="x">link</a>`, + `<a rel="noopener" target="_blank" href="https://ci.chromium.org" alt="x">link</a>`, + }, + { + `<a href="javascript:evil.js">link</a>`, + `<a rel="noopener" target="_blank" href="about:invalid#sanitized&reason=disallowed-scheme">link</a>`, + }, + { + `<a href="about:blank">link</a>`, + `<a rel="noopener" target="_blank" href="about:invalid#sanitized&reason=disallowed-scheme">link</a>`, + }, + { + `<a href="%">link</a>`, + `<a rel="noopener" target="_blank" href="about:invalid#sanitized&reason=malformed-url">link</a>`, + }, + { + `<a href="/foo">link</a>`, + `<a rel="noopener" target="_blank" href="about:invalid#sanitized&reason=disallowed-scheme">link</a>`, + }, + { + `<a href="https:///foo">link</a>`, + `<a rel="noopener" target="_blank" href="about:invalid#sanitized&reason=relative-url">link</a>`, + }, + { + `<<a href=abc>`, + `<<a rel="noopener" target="_blank" href="about:invalid#sanitized&reason=disallowed-scheme"></a>`, + }, + + // Tables + { + `<table> + <tr colspan="2"> + <td rowspan=2>a</td> + </tr> + <tr style=""> + <td>b</td> + <td>c</td> + </tr> + </table>`, + `<table> + <tr colspan="2"> + <td rowspan="2">a</td> + </tr> + <tr> + <td>b</td> + 
<td>c</td> + </tr> + </table>`, + }, + + // Other + { + `<div><strong>hello</strong></div>`, + `<strong>hello</strong>`, + }, + { + `<`, + `<`, + }, + { + `&foobar;`, + `&foobar;`, + }, + { + `<div><p>foo</p>`, + `<p>foo</p>`, + }, + { + `<p></a alt="blah"></p>`, + `<p></p>`, + }, + { + `<p><a>blah</p></a>`, + `<p><a rel="noopener" target="_blank">blah</a></p>`, + }, + } + + for _, c := range cases { + c := c + Convey(c.in, t, func() { + buf := &bytes.Buffer{} + err := Sanitize(buf, strings.NewReader(c.in)) + So(err, ShouldBeNil) + So(buf.String(), ShouldEqual, c.out) + }) + } +}