| // Copyright 2021 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| /** |
| * Implements a limited HTML sanitizer based on the sanitizehtml package at |
| * https://pkg.go.dev/github.com/luci/luci-go/common/data/text/sanitizehtml |
| * |
| * See sanitize comment. |
| */ |
| |
// Maps each character that is unsafe in HTML text or in a double-quoted
// attribute value to the character reference that replaces it. The numeric
// forms for the quote characters match what Go's html.EscapeString emits.
const ESCAPES: {[key: string]: string} = {
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
  '"': '&#34;',
  "'": '&#39;',
};

// Global character-class regular expression matching every key of ESCAPES.
// None of the keys is special inside a character class, so they can be joined
// without additional escaping.
const ESCAPE_REGEX = new RegExp(`[${Object.keys(ESCAPES).join('')}]`, 'g');
| |
/**
 * Walks a DOM tree and accumulates a sanitized HTML serialization of it.
 *
 * Text nodes are emitted escaped. Elements are handled according to an
 * allowlist: a fixed set of tags is re-emitted (without attributes, except
 * for href/alt on <a>), <script> and <style> are dropped together with their
 * contents, and any other element is dropped while its children are still
 * visited.
 */
class Sanitizer {
  /** Fragments of sanitized output, in the order they were produced. */
  safeString: string[];

  constructor() {
    this.safeString = [];
  }

  /**
   * Returns a single string, joined from the elements in this.safeString.
   *
   * @returns The full safe string.
   */
  getSafeString(): string {
    return this.safeString.join('');
  }

  /**
   * Prints a space and then an HTML attribute node. The value is escaped and
   * wrapped in double quotes; the key is emitted as given.
   *
   * @param key The attribute key.
   * @param value The attribute value.
   */
  printAttr(key: string, value: string): void {
    this.safeString.push(` ${key}="${this.escape(value)}"`);
  }

  /**
   * Sanitizes a DOM node and its child nodes.
   *
   * @param n The DOM node to sanitize. Any node kind is accepted: text nodes
   *     are printed escaped, elements go through the allowlist, and every
   *     other kind of node is skipped while its children are visited.
   */
  visit(n: Node): void {
    switch (n.nodeType) {
      case 3: // TEXT_NODE
        // Print it escaped.
        this.safeString.push(this.escape(n.textContent ?? ''));
        break;

      case 1: // ELEMENT_NODE
        this.visitElement(n as Element);
        break;

      default:
        // Ignore the node itself, but visit children.
        this.visitChildren(n);
    }
  }

  /**
   * Sanitizes a DOM's child nodes.
   *
   * @param n The DOM node whose children are sanitized, in document order.
   */
  visitChildren(n: Node): void {
    for (const c of Array.from(n.childNodes)) {
      this.visit(c);
    }
  }

  /**
   * Returns an escaped string.
   *
   * @param s The HTML string to escape.
   * @returns The escaped HTML string.
   */
  escape(s: string): string {
    // ESCAPE_REGEX carries the global flag, so replace() substitutes every
    // match; this avoids depending on ES2021's String.prototype.replaceAll.
    return s.replace(ESCAPE_REGEX, (m: string) => ESCAPES[m] ?? m);
  }

  /**
   * Applies the element allowlist to a single element node.
   *
   * @param el The element to sanitize.
   */
  private visitElement(el: Element): void {
    // This switch statement defines what HTML elements we allow.
    const tag = el.nodeName.toLowerCase();
    switch (tag) {
      case 'br':
      case 'hr':
        // br, hr are allowed and are emitted self-closed, without children.
        this.safeString.push(`<${tag}/>`);
        break;

      case 'script':
      case 'style':
        // Ignore entirely.
        // Do not visit children so we don't print inner text.
        break;

      case 'a':
        // rel and target are always forced; only href and alt are copied
        // from the source element, and href goes through sanitizeURL.
        this.safeString.push('<a rel="noopener" target="_blank"');

        for (const attr of Array.from(el.attributes)) {
          switch (attr.name.toLowerCase()) {
            case 'href':
              this.printAttr('href', sanitizeURL(attr.value));
              break;

            case 'alt':
              this.printAttr('alt', attr.value);
              break;
          }
        }

        this.safeString.push('>');
        this.visitChildren(el);
        this.safeString.push('</a>');
        break;

      case 'p':
      case 'ol':
      case 'ul':
      case 'li':
      case 'strong':
      case 'em':
      case 'code':
      case 'pre':
      case 'h1':
      case 'h2':
      case 'h3':
      case 'h4':
      case 'h5':
      case 'h6':
        // Print without attributes.
        this.safeString.push(`<${tag}>`);
        this.visitChildren(el);
        this.safeString.push(`</${tag}>`);
        break;

      default:
        // Ignore the element, but visit children.
        this.visitChildren(el);
    }
  }
}
| |
/**
 * Validates a URL for use in the href of an <a> element.
 *
 * The input must parse as an absolute URL with the http: or https: scheme.
 * Anything else is replaced with an inert about:invalid URL whose fragment
 * names the reason for the rejection.
 *
 * @param s URL to sanitize.
 * @returns The re-serialized URL when it is acceptable, otherwise an
 *     about:invalid placeholder.
 */
function sanitizeURL(s: string): string {
  const rejected = (reason: string): string =>
    'about:invalid#sanitized&reason=' + reason;

  let parsed: URL;
  try {
    parsed = new URL(s);
  } catch (err) {
    // The URL constructor reports unparsable input with a TypeError; anything
    // else is unexpected and is propagated to the caller.
    if (!(err instanceof TypeError)) {
      throw err;
    }
    return rejected('malformed-url');
  }

  const isHttp = parsed.protocol === 'http:' || parsed.protocol === 'https:';
  if (!isHttp) {
    return rejected('disallowed-scheme');
  }

  // Return the parser's own serialization rather than the input, so that what
  // we return is exactly what we validated.
  return parsed.href;
}
| |
/**
 * Sanitizes an HTML string, keeping only an allowlisted subset of markup.
 *
 * The input is parsed as an HTML document and re-serialized by a Sanitizer.
 * Attributes are stripped unless noted below.
 *
 * Allowed elements:
 * - p, br, hr
 * - h1, h2, h3, h4, h5, h6
 * - strong, em
 * - a
 *   - an href that is not a valid absolute HTTP(S) link is replaced with an
 *     innocuous one.
 *   - the alt attribute is kept.
 * - ul, ol, li
 * - code, pre
 *
 * <script> and <style> elements are dropped together with their contents.
 * Every other node is dropped, but its children are still visited.
 *
 * @param s The HTML string to sanitize.
 * @returns The sanitized HTML string.
 */
export function sanitize(s: string): string {
  const doc = new DOMParser().parseFromString(s, 'text/html');
  const walker = new Sanitizer();
  // Start at the <html> element: the document's other children (doctype,
  // comments) never contribute any output.
  walker.visit(doc.documentElement);
  return walker.getSafeString();
}