blob: f61d98f59ed3fa2f983d3836440eb0ba1b900e28 [file] [log] [blame]
// Copyright (c) 2014, David Kitchen <david@buro9.com>
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * Neither the name of the organisation (Microcosm) nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package bluemonday
import (
"net/url"
"regexp"
"strings"
)
// Policy encapsulates the whitelist of HTML elements and attributes that will
// be applied to the sanitised HTML.
//
// You should use bluemonday.NewPolicy() to create a blank policy as the
// unexported fields contain maps that need to be initialized.
type Policy struct {
// Declares whether the maps have been initialized, used as a cheap check to
// ensure that those using Policy{} directly won't cause nil pointer
// exceptions
initialized bool
// If true then we add spaces when stripping tags, specifically the closing
// tag is replaced by a space character.
addSpaces bool
// When true, add rel="nofollow" to HTML anchors
requireNoFollow bool
// When true, add rel="nofollow" to HTML anchors
// Will add for href="http://foo"
// Will skip for href="/foo" or href="foo"
requireNoFollowFullyQualifiedLinks bool
// When true add target="_blank" to fully qualified links
// Will add for href="http://foo"
// Will skip for href="/foo" or href="foo"
addTargetBlankToFullyQualifiedLinks bool
// When true, URLs must be parseable by "net/url" url.Parse()
requireParseableURLs bool
// When true, u, _ := url.Parse("url"); !u.IsAbs() is permitted
allowRelativeURLs bool
// When true, allow data attributes.
allowDataAttributes bool
// map[htmlElementName]map[htmlAttributeName]attrPolicy
elsAndAttrs map[string]map[string]attrPolicy
// map[htmlAttributeName]attrPolicy
globalAttrs map[string]attrPolicy
// If urlPolicy is nil, all URLs with matching schema are allowed.
// Otherwise, only the URLs with matching schema and urlPolicy(url)
// returning true are allowed.
allowURLSchemes map[string]urlPolicy
// If an element has had all attributes removed as a result of a policy
// being applied, then the element would be removed from the output.
//
// However some elements are valid and have strong layout meaning without
// any attributes, i.e. <table>. To prevent those being removed we maintain
// a list of elements that are allowed to have no attributes and that will
// be maintained in the output HTML.
setOfElementsAllowedWithoutAttrs map[string]struct{}
setOfElementsToSkipContent map[string]struct{}
}
type attrPolicy struct {
// optional pattern to match, when not nil the regexp needs to match
// otherwise the attribute is removed
regexp *regexp.Regexp
}
type attrPolicyBuilder struct {
p *Policy
attrNames []string
regexp *regexp.Regexp
allowEmpty bool
}
type urlPolicy func(url *url.URL) (allowUrl bool)
// init initializes the maps if this has not been done already
func (p *Policy) init() {
if !p.initialized {
p.elsAndAttrs = make(map[string]map[string]attrPolicy)
p.globalAttrs = make(map[string]attrPolicy)
p.allowURLSchemes = make(map[string]urlPolicy)
p.setOfElementsAllowedWithoutAttrs = make(map[string]struct{})
p.setOfElementsToSkipContent = make(map[string]struct{})
p.initialized = true
}
}
// NewPolicy returns a blank policy with nothing whitelisted or permitted. This
// is the recommended way to start building a policy and you should now use
// AllowAttrs() and/or AllowElements() to construct the whitelist of HTML
// elements and attributes.
func NewPolicy() *Policy {
p := Policy{}
p.addDefaultElementsWithoutAttrs()
p.addDefaultSkipElementContent()
return &p
}
// AllowAttrs takes a range of HTML attribute names and returns an
// attribute policy builder that allows you to specify the pattern and scope of
// the whitelisted attribute.
//
// The attribute policy is only added to the core policy when either Globally()
// or OnElements(...) are called.
func (p *Policy) AllowAttrs(attrNames ...string) *attrPolicyBuilder {
p.init()
abp := attrPolicyBuilder{
p: p,
allowEmpty: false,
}
for _, attrName := range attrNames {
abp.attrNames = append(abp.attrNames, strings.ToLower(attrName))
}
return &abp
}
// AllowDataAttributes whitelists all data attributes. We can't specify the name
// of each attribute exactly as they are customized.
//
// NOTE: These values are not sanitized and applications that evaluate or process
// them without checking and verification of the input may be at risk if this option
// is enabled. This is a 'caveat emptor' option and the person enabling this option
// needs to fully understand the potential impact with regards to whatever application
// will be consuming the sanitized HTML afterwards, i.e. if you know you put a link in a
// data attribute and use that to automatically load some new window then you're giving
// the author of a HTML fragment the means to open a malicious destination automatically.
// Use with care!
func (p *Policy) AllowDataAttributes() {
p.allowDataAttributes = true
}
// AllowNoAttrs says that attributes on element are optional.
//
// The attribute policy is only added to the core policy when OnElements(...)
// are called.
func (p *Policy) AllowNoAttrs() *attrPolicyBuilder {
p.init()
abp := attrPolicyBuilder{
p: p,
allowEmpty: true,
}
return &abp
}
// AllowNoAttrs says that attributes on element are optional.
//
// The attribute policy is only added to the core policy when OnElements(...)
// are called.
func (abp *attrPolicyBuilder) AllowNoAttrs() *attrPolicyBuilder {
abp.allowEmpty = true
return abp
}
// Matching allows a regular expression to be applied to a nascent attribute
// policy, and returns the attribute policy. Calling this more than once will
// replace the existing regexp.
func (abp *attrPolicyBuilder) Matching(regex *regexp.Regexp) *attrPolicyBuilder {
abp.regexp = regex
return abp
}
// OnElements will bind an attribute policy to a given range of HTML elements
// and return the updated policy
func (abp *attrPolicyBuilder) OnElements(elements ...string) *Policy {
for _, element := range elements {
element = strings.ToLower(element)
for _, attr := range abp.attrNames {
if _, ok := abp.p.elsAndAttrs[element]; !ok {
abp.p.elsAndAttrs[element] = make(map[string]attrPolicy)
}
ap := attrPolicy{}
if abp.regexp != nil {
ap.regexp = abp.regexp
}
abp.p.elsAndAttrs[element][attr] = ap
}
if abp.allowEmpty {
abp.p.setOfElementsAllowedWithoutAttrs[element] = struct{}{}
if _, ok := abp.p.elsAndAttrs[element]; !ok {
abp.p.elsAndAttrs[element] = make(map[string]attrPolicy)
}
}
}
return abp.p
}
// Globally will bind an attribute policy to all HTML elements and return the
// updated policy
func (abp *attrPolicyBuilder) Globally() *Policy {
for _, attr := range abp.attrNames {
if _, ok := abp.p.globalAttrs[attr]; !ok {
abp.p.globalAttrs[attr] = attrPolicy{}
}
ap := attrPolicy{}
if abp.regexp != nil {
ap.regexp = abp.regexp
}
abp.p.globalAttrs[attr] = ap
}
return abp.p
}
// AllowElements will append HTML elements to the whitelist without applying an
// attribute policy to those elements (the elements are permitted
// sans-attributes)
func (p *Policy) AllowElements(names ...string) *Policy {
p.init()
for _, element := range names {
element = strings.ToLower(element)
if _, ok := p.elsAndAttrs[element]; !ok {
p.elsAndAttrs[element] = make(map[string]attrPolicy)
}
}
return p
}
// RequireNoFollowOnLinks will result in all <a> tags having a rel="nofollow"
// added to them if one does not already exist
//
// Note: This requires p.RequireParseableURLs(true) and will enable it.
func (p *Policy) RequireNoFollowOnLinks(require bool) *Policy {
p.requireNoFollow = require
p.requireParseableURLs = true
return p
}
// RequireNoFollowOnFullyQualifiedLinks will result in all <a> tags that point
// to a non-local destination (i.e. starts with a protocol and has a host)
// having a rel="nofollow" added to them if one does not already exist
//
// Note: This requires p.RequireParseableURLs(true) and will enable it.
func (p *Policy) RequireNoFollowOnFullyQualifiedLinks(require bool) *Policy {
p.requireNoFollowFullyQualifiedLinks = require
p.requireParseableURLs = true
return p
}
// AddTargetBlankToFullyQualifiedLinks will result in all <a> tags that point
// to a non-local destination (i.e. starts with a protocol and has a host)
// having a target="_blank" added to them if one does not already exist
//
// Note: This requires p.RequireParseableURLs(true) and will enable it.
func (p *Policy) AddTargetBlankToFullyQualifiedLinks(require bool) *Policy {
p.addTargetBlankToFullyQualifiedLinks = require
p.requireParseableURLs = true
return p
}
// RequireParseableURLs will result in all URLs requiring that they be parseable
// by "net/url" url.Parse()
// This applies to:
// - a.href
// - area.href
// - blockquote.cite
// - img.src
// - link.href
// - script.src
func (p *Policy) RequireParseableURLs(require bool) *Policy {
p.requireParseableURLs = require
return p
}
// AllowRelativeURLs enables RequireParseableURLs and then permits URLs that
// are parseable, have no schema information and url.IsAbs() returns false
// This permits local URLs
func (p *Policy) AllowRelativeURLs(require bool) *Policy {
p.RequireParseableURLs(true)
p.allowRelativeURLs = require
return p
}
// AllowURLSchemes will append URL schemes to the whitelist
// Example: p.AllowURLSchemes("mailto", "http", "https")
func (p *Policy) AllowURLSchemes(schemes ...string) *Policy {
p.init()
p.RequireParseableURLs(true)
for _, scheme := range schemes {
scheme = strings.ToLower(scheme)
// Allow all URLs with matching scheme.
p.allowURLSchemes[scheme] = nil
}
return p
}
// AllowURLSchemeWithCustomPolicy will append URL schemes with
// a custom URL policy to the whitelist.
// Only the URLs with matching schema and urlPolicy(url)
// returning true will be allowed.
func (p *Policy) AllowURLSchemeWithCustomPolicy(
scheme string,
urlPolicy func(url *url.URL) (allowUrl bool),
) *Policy {
p.init()
p.RequireParseableURLs(true)
scheme = strings.ToLower(scheme)
p.allowURLSchemes[scheme] = urlPolicy
return p
}
// AddSpaceWhenStrippingTag states whether to add a single space " " when
// removing tags that are not whitelisted by the policy.
//
// This is useful if you expect to strip tags in dense markup and may lose the
// value of whitespace.
//
// For example: "<p>Hello</p><p>World</p>"" would be sanitized to "HelloWorld"
// with the default value of false, but you may wish to sanitize this to
// " Hello World " by setting AddSpaceWhenStrippingTag to true as this would
// retain the intent of the text.
func (p *Policy) AddSpaceWhenStrippingTag(allow bool) *Policy {
p.addSpaces = allow
return p
}
// SkipElementsContent adds the HTML elements whose tags is needed to be removed
// with its content.
func (p *Policy) SkipElementsContent(names ...string) *Policy {
p.init()
for _, element := range names {
element = strings.ToLower(element)
if _, ok := p.setOfElementsToSkipContent[element]; !ok {
p.setOfElementsToSkipContent[element] = struct{}{}
}
}
return p
}
// AllowElementsContent marks the HTML elements whose content should be
// retained after removing the tag.
func (p *Policy) AllowElementsContent(names ...string) *Policy {
p.init()
for _, element := range names {
delete(p.setOfElementsToSkipContent, strings.ToLower(element))
}
return p
}
// addDefaultElementsWithoutAttrs adds the HTML elements that we know are valid
// without any attributes to an internal map.
// i.e. we know that <table> is valid, but <bdo> isn't valid as the "dir" attr
// is mandatory
func (p *Policy) addDefaultElementsWithoutAttrs() {
p.init()
p.setOfElementsAllowedWithoutAttrs["abbr"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["acronym"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["address"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["article"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["aside"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["audio"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["b"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["bdi"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["blockquote"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["body"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["br"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["button"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["canvas"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["caption"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["center"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["cite"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["code"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["col"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["colgroup"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["datalist"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["dd"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["del"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["details"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["dfn"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["div"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["dl"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["dt"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["em"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["fieldset"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["figcaption"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["figure"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["footer"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["h1"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["h2"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["h3"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["h4"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["h5"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["h6"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["head"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["header"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["hgroup"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["hr"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["html"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["i"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["ins"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["kbd"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["li"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["mark"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["marquee"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["nav"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["ol"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["optgroup"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["option"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["p"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["pre"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["q"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["rp"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["rt"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["ruby"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["s"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["samp"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["script"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["section"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["select"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["small"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["span"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["strike"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["strong"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["style"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["sub"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["summary"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["sup"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["svg"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["table"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["tbody"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["td"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["textarea"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["tfoot"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["th"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["thead"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["title"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["time"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["tr"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["tt"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["u"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["ul"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["var"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["video"] = struct{}{}
p.setOfElementsAllowedWithoutAttrs["wbr"] = struct{}{}
}
// addDefaultSkipElementContent adds the HTML elements that we should skip
// rendering the character content of, if the element itself is not allowed.
// This is all character data that the end user would not normally see.
// i.e. if we exclude a <script> tag then we shouldn't render the JavaScript or
// anything else until we encounter the closing </script> tag.
func (p *Policy) addDefaultSkipElementContent() {
p.init()
p.setOfElementsToSkipContent["frame"] = struct{}{}
p.setOfElementsToSkipContent["frameset"] = struct{}{}
p.setOfElementsToSkipContent["iframe"] = struct{}{}
p.setOfElementsToSkipContent["noembed"] = struct{}{}
p.setOfElementsToSkipContent["noframes"] = struct{}{}
p.setOfElementsToSkipContent["noscript"] = struct{}{}
p.setOfElementsToSkipContent["nostyle"] = struct{}{}
p.setOfElementsToSkipContent["object"] = struct{}{}
p.setOfElementsToSkipContent["script"] = struct{}{}
p.setOfElementsToSkipContent["style"] = struct{}{}
p.setOfElementsToSkipContent["title"] = struct{}{}
}