blob: 3e2cb12838a89eb0846c31c0c2504336122dc32c [file]
// Copyright 2021 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
/**
 * Implements a limited HTML sanitizer based on the sanitizehtml package at
 * https://pkg.go.dev/github.com/luci/luci-go/common/data/text/sanitizehtml
 *
 * See the doc comment on sanitize() for the list of allowed elements.
 */
// Characters in an HTML string to escape, mapped to the entity that replaces
// each of them. '&' must map to '&amp;' (not to itself): otherwise input text
// such as "&lt;" would be emitted unchanged and be read back as an entity
// instead of as the literal characters.
const ESCAPES: {[key: string]: string} = {
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
  '"': '&#034;',
  "'": '&#039;',
};
// Regular expression matching any single character that is a key of ESCAPES.
// None of the keys has a special meaning inside a character class, so they
// are joined without further escaping. The 'g' flag makes replacements apply
// to every occurrence.
const ESCAPE_REGEX = new RegExp(`[${Object.keys(ESCAPES).join('')}]`, 'g');
/**
 * Walks a DOM tree and accumulates a sanitized HTML serialization of it.
 *
 * Output fragments are appended to safeString in document order; call
 * getSafeString() to obtain the joined result.
 */
class Sanitizer {
  // Elements that are re-emitted as a bare open/close pair with every
  // attribute dropped.
  private static readonly PLAIN_TAGS: ReadonlySet<string> = new Set([
    'p',
    'ol',
    'ul',
    'li',
    'strong',
    'em',
    'code',
    'pre',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
  ]);

  // Already-safe output fragments, in emission order.
  safeString: string[] = [];

  /**
   * Concatenates every fragment collected so far.
   *
   * @returns The full safe string.
   */
  getSafeString(): string {
    return this.safeString.join('');
  }

  /**
   * Appends a leading space followed by `key="value"`, with the value
   * escaped. The key is written as given.
   *
   * @param key The attribute key.
   * @param value The attribute value.
   */
  printAttr(key: string, value: string): void {
    const escapedValue = this.escape(value);
    this.safeString.push(' ' + key + '="' + escapedValue + '"');
  }

  /**
   * Sanitizes a DOM node and, where applicable, its descendants.
   *
   * @param n The DOM node to sanitize.
   */
  visit(n: HTMLElement): void {
    if (n.nodeType === 3 /* TEXT_NODE */) {
      // Text is emitted escaped.
      this.safeString.push(this.escape(n.textContent!));
      return;
    }
    if (n.nodeType !== 1 /* ELEMENT_NODE */) {
      // Any other node type produces no output of its own; only its
      // children (if any) are visited.
      this.visitChildren(n);
      return;
    }

    // From here on n is an element; the checks below define which HTML
    // elements are allowed.
    const tag = n.nodeName.toLowerCase();

    if (tag === 'script' || tag === 'style') {
      // Dropped entirely: children are not visited, so no inner text leaks.
      return;
    }

    if (tag === 'br' || tag === 'hr') {
      // Allowed, emitted self-closed with no children.
      this.safeString.push(`<${tag}/>`);
      return;
    }

    if (tag === 'a') {
      this.safeString.push('<a rel="noopener" target="_blank"');
      for (const {name, value} of n.attributes) {
        const attrName = name.toLowerCase();
        if (attrName === 'href') {
          this.printAttr('href', sanitizeURL(value));
        } else if (attrName === 'alt') {
          this.printAttr('alt', value);
        }
      }
      this.safeString.push('>');
      this.visitChildren(n);
      this.safeString.push('</a>');
      return;
    }

    if (Sanitizer.PLAIN_TAGS.has(tag)) {
      // Allowed, but printed without attributes.
      this.safeString.push(`<${tag}>`);
      this.visitChildren(n);
      this.safeString.push(`</${tag}>`);
      return;
    }

    // Unknown element: skip the element itself but keep its children.
    this.visitChildren(n);
  }

  /**
   * Sanitizes each child node of the given node, in order.
   *
   * @param n The DOM node whose children are sanitized.
   */
  visitChildren(n: HTMLElement): void {
    n.childNodes.forEach((child) => {
      this.visit(child as HTMLElement);
    });
  }

  /**
   * Replaces every character matched by ESCAPE_REGEX with its ESCAPES entry.
   *
   * @param s The HTML string to escape.
   * @returns The escaped HTML string.
   */
  escape(s: string): string {
    // ESCAPE_REGEX is global, so replace() substitutes every match.
    return s.replace(ESCAPE_REGEX, (ch: string) => ESCAPES[ch]);
  }
}
/**
 * Returns a sanitized URL. Anything that does not parse as an absolute URL,
 * or whose scheme is not http/https, is replaced by an innocuous
 * `about:invalid#sanitized&reason=...` URL. Used to sanitize URLs in <a>
 * elements.
 *
 * @param s URL to sanitize.
 * @returns The sanitized URL.
 */
function sanitizeURL(s: string): string {
  // Builds the placeholder URL that records why the input was rejected.
  const rejected = (reason: string): string =>
    `about:invalid#sanitized&reason=${reason}`;

  let parsed: URL;
  try {
    parsed = new URL(s);
  } catch (err) {
    // Only a TypeError is treated as a parse failure; anything else is
    // rethrown to the caller.
    if (!(err instanceof TypeError)) {
      throw err;
    }
    return rejected('malformed-url');
  }

  const {protocol} = parsed;
  if (protocol !== 'http:' && protocol !== 'https:') {
    return rejected('disallowed-scheme');
  }

  // Re-serialize the URL so that what we return is exactly what we parsed.
  return parsed.href;
}
/**
 * Strips all HTML nodes except an allowlist of elements.
 *
 * Attributes are stripped unless explicitly listed below.
 * Allowed elements:
 *  - p, br, hr
 *  - h1, h2, h3, h4, h5, h6
 *  - strong, em
 *  - a
 *    - if the href attribute is not a valid absolute HTTP(S) link, it is
 *      replaced with an innocuous one.
 *    - the alt attribute is allowed
 *  - ul, ol, li
 *  - code, pre
 *
 * <script> and <style> elements are dropped entirely, including their
 * contents. Every other HTML node is itself ignored, but its children are
 * still visited.
 *
 * @param s The HTML string to sanitize.
 * @returns The sanitized HTML string.
 */
export function sanitize(s: string): string {
  const walker = new Sanitizer();
  // Parse as a full HTML document; the walker starts at the document node
  // and descends from there.
  const parsedDocument = new DOMParser().parseFromString(s, 'text/html');
  walker.visit(parsedDocument as unknown as HTMLElement);
  return walker.getSafeString();
}