| // Copyright 2014 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| package org.chromium.distiller; |
| |
| import com.google.gwt.core.client.JsArray; |
| import com.google.gwt.core.client.JsArrayString; |
| import com.google.gwt.dom.client.AnchorElement; |
| import com.google.gwt.dom.client.Document; |
| import com.google.gwt.dom.client.Element; |
| import com.google.gwt.dom.client.ImageElement; |
| import com.google.gwt.dom.client.Node; |
| import com.google.gwt.dom.client.NodeList; |
| import com.google.gwt.dom.client.Style; |
| import com.google.gwt.dom.client.VideoElement; |
| import com.google.gwt.http.client.URL; |
| |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.List; |
| import java.util.Map; |
| |
| public class DomUtil { |
| /** |
| * GWT does not provide a way to get a list of all attributes that have been explicitly set on a |
| * DOM element (only a way to query the value of a particular attribute). In javascript, this |
| * list is accessible as elem.attributes. |
| * |
| * @Return The element's attribute list from javascript. |
| */ |
| public static native JsArray<Node> getAttributes(Element elem) /*-{ |
| return elem.attributes; |
| }-*/; |
| |
| // Returns the first element with |className| in the tree rooted at |root|, null if none is |
| // found. |
| public static native Element getFirstElementWithClassName(Element root, String className) /*-{ |
| return root.querySelector("." + className); |
| }-*/; |
| |
| public static native boolean hasClassName(Element elem, String className) /*-{ |
| return elem.classList.contains(className); |
| }-*/; |
| |
| public static native JsArrayString getClassList(Element elem) /*-{ |
| return elem.classList; |
| }-*/; |
| |
| /** |
| * Check to see if a provided URL has the specified root domain (ex. http://a.b.c/foo/bar has |
| * root domain of b.c). |
| * @param url The URL to test. |
| * @param root The root domain to test against. |
| * @return True if url has the specified root domain. |
| */ |
| public static boolean hasRootDomain(String url, String root) { |
| if (url == null || root == null) { |
| return false; |
| } |
| AnchorElement anchor = Document.get().createAnchorElement(); |
| anchor.setHref(url); |
| String host = anchor.getPropertyString("host"); |
| return ("." + host).endsWith("." + root); |
| } |
| |
| /** |
| * Split URL parameters into key/value pairs and return them in a map. |
| * @param query The query string after the "?". |
| * @return Map of all query parameters or an empty map. |
| */ |
| public static Map<String, String> splitUrlParams(String query) { |
| if (query == null || query.isEmpty()) { |
| return new HashMap<>(); |
| } |
| Map<String, String> paramMap = new HashMap<>(); |
| String[] params = query.split("&"); |
| for (int i = 0; i < params.length; i++) {} |
| for (String currentParam : params) { |
| String[] paramSplit = currentParam.split("="); |
| if (paramSplit.length > 1) { |
| paramMap.put(paramSplit[0], URL.decode(paramSplit[1])); |
| } |
| } |
| return paramMap; |
| } |
| |
| /** |
| * @Return The CSS style of an element after applying the active stylesheets and resolving any |
| * basic computation the style's value(s) may contain. |
| * @param el - DOM element |
| */ |
| public static native Style getComputedStyle(Element el) /*-{ |
| return getComputedStyle(el, null); |
| }-*/; |
| |
| public static boolean isVisible(Element e) { |
| Style style = getComputedStyle(e); |
| double opacity = JavaScript.parseFloat(style.getOpacity()); |
| return !(style.getDisplay().equals("none") || |
| style.getVisibility().equals("hidden") || |
| opacity == 0.0F); |
| } |
| |
| /** |
| * Verifies if a given element is visible by checking its offset. |
| */ |
| public static boolean isVisibleByOffset(Element e) { |
| // Detect whether any of the ancestors has "display: none". |
| // Using offsetParent alone wouldn't work because it's also null when position is fixed. |
| // Using offsetHeight/Width alone makes sense in production, but we have too many |
| // zero-sized elements in our tests. |
| return e.getOffsetParent() != null || e.getOffsetHeight() != 0 || e.getOffsetWidth() != 0; |
| } |
| |
| /** |
| * Get the element of the main article, if any. |
| * @return An element of article (not necessarily the html5 article element). |
| */ |
| public static Element getArticleElement(Element root) { |
| NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE"); |
| List<Element> visibleElements = getVisibleElements(allArticles); |
| // Having multiple article elements usually indicates a bad case for this shortcut. |
| // TODO(wychen): some sites exclude things like title and author in article element. |
| if (visibleElements.size() == 1) { |
| return visibleElements.get(0); |
| } |
| // Note that the CSS property matching is case sensitive, and "Article" is the correct |
| // capitalization. |
| String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*=\"Posting\"]"; |
| allArticles = DomUtil.querySelectorAll(root, query); |
| visibleElements = getVisibleElements(allArticles); |
| // It is commonly seen that the article is wrapped separately or in multiple layers. |
| if (visibleElements.size() > 0) { |
| return Element.as(DomUtil.getNearestCommonAncestor(visibleElements)); |
| } |
| return null; |
| } |
| |
| /** |
| * Get a list of visible elements. |
| * @return A list of visible elements. |
| */ |
| public static List<Element> getVisibleElements(NodeList<Element> nodeList) { |
| List<Element> visibleElements = new ArrayList<>(); |
| for (int i = 0; i < nodeList.getLength(); i++) { |
| Element element = nodeList.getItem(i); |
| if (DomUtil.isVisible(element) && |
| DomUtil.isVisibleByOffset(element)) { |
| visibleElements.add(element); |
| } |
| } |
| return visibleElements; |
| } |
| |
| /* |
| * We want to use jsni for direct access to javascript's innerText. This avoids GWT's |
| * implementation of Element::getInnerText(), which is intentionally different to mimic an old |
| * IE behaviour, which returns text within <script> tags. |
| */ |
| public static native String getInnerText(Node node) /*-{ |
| return node.innerText; |
| }-*/; |
| |
| public static native double getTime() /*-{ |
| // window.performance is unavailable in Gwt's dev environment and even referencing it on iOS |
| // causes a crash. |
| if ((typeof distiller_on_ios === 'undefined' || !distiller_on_ios) && window.performance) { |
| return window.performance.now(); |
| } |
| return Date.now(); |
| }-*/; |
| |
| /** |
| * Use jsni for direct access to javascript's textContent. textContent is different from |
| * innerText (see http://www.kellegous.com/j/2013/02/27/innertext-vs-textcontent): |
| * - textContent is the raw textual content, doesn't require layout, and is basically a |
| * concatenation of the values of all text nodes within a subtree. |
| * - innerText is what is presented to the user, requires layout, and excludes text in invisible |
| * elements, e.g. <title> tags. |
| */ |
| public static native String javascriptTextContent(Node node) /*-{ |
| return node.textContent; |
| }-*/; |
| |
| /** |
| * Get a list of all the parents of this node starting with the node itself. |
| * @param n The node to get the parents of. |
| * @return A list of the provided node's parents. |
| */ |
| public static List<Node> getParentNodes(Node n) { |
| ArrayList<Node> result = new ArrayList<Node>(); |
| Node curr = n; |
| while (curr != null) { |
| result.add(curr); |
| curr = curr.getParentNode(); |
| } |
| return result; |
| } |
| |
| /** |
| * Get the depth of the given node in the DOM tree (only counting elements). |
| * @param n The node to find the depth of. |
| * @return The depth of the provided node; -1 if n is null. |
| */ |
| public static int getNodeDepth(final Node n) { |
| return getParentNodes(n).size()-1; |
| } |
| |
| /** |
| * Get the nearest common ancestor of two nodes. |
| * @param n1 First node. |
| * @param n2 Second node. |
| * @return The nearest common ancestor node of n1 and n2. |
| */ |
| public static Node getNearestCommonAncestor(final Node n1, final Node n2) { |
| Node parent = n1; |
| while (parent != null && !JavaScript.contains(parent, n2)) parent = parent.getParentNode(); |
| return parent; |
| } |
| |
| /** |
| * Get the nearest common ancestor of nodes. |
| */ |
| public static Node getNearestCommonAncestor(final List<? extends Node> ns) { |
| if (ns.size() == 0) return null; |
| Node parent = ns.get(0); |
| for (int i = 1; i < ns.size(); i++) { |
| parent = getNearestCommonAncestor(parent, ns.get(i)); |
| } |
| return parent; |
| } |
| |
| /** |
| * Get all text from a tree/sub-tree. |
| * @param node The root of the tree. |
| * @return The text contained in this tree. |
| */ |
| public static String getTextFromTree(Node node) { |
| // Temporarily add the node to the DOM so that style is calculated. |
| Document.get().getBody().appendChild(node); |
| String output = DomUtil.getInnerText(node); |
| |
| // And remove it again. |
| Document.get().getBody().removeChild(node); |
| return output; |
| } |
| |
| /** |
| * Generate the HTML output for a list of relevant nodes. |
| * @param outputNodes The list of nodes in a subtree that are considered relevant. |
| * @param textOnly If this function should return text only instead of HTML. |
| * @return Displayable HTML content representing this WebElement. |
| */ |
| public static String generateOutputFromList(List<Node> outputNodes, boolean textOnly) { |
| if (outputNodes.size() == 0) { |
| return ""; |
| } |
| |
| NodeTree expanded = NodeListExpander.expand(outputNodes); |
| Node clonedSubtree = expanded.cloneSubtreeRetainDirection(); |
| |
| if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) return ""; |
| |
| stripIds(clonedSubtree); |
| makeAllLinksAbsolute(clonedSubtree); |
| stripTargetAttributes(clonedSubtree); |
| stripFontColorAttributes(clonedSubtree); |
| stripTableBackgroundColorAttributes(clonedSubtree); |
| stripStyleAttributes(clonedSubtree); |
| stripImageElements(clonedSubtree); |
| |
| if (textOnly) { |
| return DomUtil.getTextFromTree(clonedSubtree); |
| } |
| return Element.as(clonedSubtree).getString(); |
| } |
| |
| /** |
| * Makes all anchors and video posters absolute. This calls "makeAllSrcAttributesAbsolute". |
| * @param rootNode The root Node to look through. |
| */ |
| public static void makeAllLinksAbsolute(Node rootNode) { |
| Element root = Element.as(rootNode); |
| |
| // AnchorElement.getHref() and ImageElement.getSrc() both return the |
| // absolute URI, so simply set them as the respective attributes. |
| |
| if ("A".equals(root.getTagName())) { |
| AnchorElement link = AnchorElement.as(root); |
| if (!link.getHref().isEmpty()) { |
| link.setHref(link.getHref()); |
| } |
| } |
| NodeList<Element> allLinks = root.getElementsByTagName("A"); |
| for (int i = 0; i < allLinks.getLength(); i++) { |
| AnchorElement link = AnchorElement.as(allLinks.getItem(i)); |
| if (!link.getHref().isEmpty()) { |
| link.setHref(link.getHref()); |
| } |
| } |
| if (root.getTagName().equals("VIDEO")) { |
| VideoElement video = (VideoElement) root; |
| if (!video.getPoster().isEmpty()) { |
| video.setPoster(video.getPoster()); |
| } |
| } |
| NodeList<Element> videoTags = root.getElementsByTagName("VIDEO"); |
| for (int i = 0; i < videoTags.getLength(); i++) { |
| VideoElement video = (VideoElement) videoTags.getItem(i); |
| if (!video.getPoster().isEmpty()) { |
| video.setPoster(video.getPoster()); |
| } |
| } |
| makeAllSrcAttributesAbsolute(root); |
| |
| makeSrcSetAbsolute(root); |
| } |
| |
| private static void makeSrcSetAbsolute(Element root) { |
| if (root.getTagName().equals("IMG")) { |
| makeSrcSetAbsolute(ImageElement.as(root)); |
| } |
| NodeList<Element> imgs = DomUtil.querySelectorAll(root, "IMG[SRCSET]"); |
| for (int i = 0; i < imgs.getLength(); i++) { |
| makeSrcSetAbsolute(ImageElement.as(imgs.getItem(i))); |
| } |
| } |
| |
| public static void makeSrcSetAbsolute(ImageElement ie) { |
| String srcset = ie.getAttribute("srcset"); |
| if (srcset == "") { |
| ie.removeAttribute("srcset"); |
| return; |
| } |
| |
| String oldsrc = ie.getSrc(); |
| String[] sizes = StringUtil.jsSplit(srcset, ","); |
| for(int i = 0; i < sizes.length; i++) { |
| String size = StringUtil.jsTrim(sizes[i]); |
| if (size.isEmpty()) continue; |
| String[] comp = size.split(" "); |
| ie.setSrc(comp[0]); |
| comp[0] = ie.getSrc(); |
| sizes[i] = StringUtil.join(comp, " "); |
| } |
| ie.setAttribute("srcset", StringUtil.join(sizes, ", ")); |
| ie.setSrc(oldsrc); |
| } |
| |
| public static void stripImageElements(Node root) { |
| if (root.getNodeType() == Node.ELEMENT_NODE) { |
| Element element = Element.as(root); |
| if (element.getTagName().equals("IMG")) { |
| stripImageElement(ImageElement.as(element)); |
| } |
| } |
| NodeList<Element> imgs = DomUtil.querySelectorAll(root, "IMG"); |
| for (int i = 0; i < imgs.getLength(); i++) { |
| stripImageElement(ImageElement.as(imgs.getItem(i))); |
| } |
| } |
| |
| /** |
| * Only keep some attributes for image elements. |
| * @param imgElement The image element to strip in-place. |
| */ |
| public static void stripImageElement(ImageElement imgElement) { |
| JsArray<Node> attrs = getAttributes(imgElement); |
| for (int i = 0; i < attrs.length(); ) { |
| String name = attrs.get(i).getNodeName(); |
| if (!"src".equals(name) && |
| !"alt".equals(name) && |
| !"srcset".equals(name) && |
| !"dir".equals(name) && |
| !"width".equals(name) && |
| !"height".equals(name) && |
| !"title".equals(name)) { |
| imgElement.removeAttribute(name); |
| } else { |
| i++; |
| } |
| } |
| } |
| |
| /** |
| * Makes all "img", "source", "track", and "video" tags have an absolute "src" attribute. |
| * @param root The root element to look through. |
| */ |
| public static native void makeAllSrcAttributesAbsolute(Element root) /*-{ |
| if (root.tagName == "IMG" || root.tagName == "SOURCE" || root.tagName == "TRACK" || |
| root.tagName == "VIDEO") { |
| if (root.src) { |
| root.src = root.src; |
| } |
| } |
| var elementsWithSrc = root.querySelectorAll('img,source,track,video'); |
| for (var key in elementsWithSrc) { |
| if (elementsWithSrc[key].src) { |
| elementsWithSrc[key].src = elementsWithSrc[key].src; |
| } |
| } |
| }-*/; |
| |
| /** |
| * Strips some attribute from certain tags in the tree rooted at |rootNode|, including root. |
| * @param tagNames The tag names to be processed. ["*"] means all. |
| */ |
| @SuppressWarnings("unused") |
| public static void stripAttributeFromTags(Node rootNode, String attribute, String[] tagNames) { |
| Element root = Element.as(rootNode); |
| for (String tag: tagNames) { |
| if (root.getTagName().equals(tag) || tag.equals("*")) { |
| root.removeAttribute(attribute); |
| } |
| } |
| |
| for (String tag: tagNames) { |
| tag += "[" + attribute + "]"; |
| } |
| String query = StringUtil.join(tagNames, ", "); |
| NodeList<Element> tags = DomUtil.querySelectorAll(root, query); |
| for (int i = 0; i < tags.getLength(); i++) { |
| tags.getItem(i).removeAttribute(attribute); |
| } |
| } |
| |
| /** |
| * Strips all "id" attributes from all nodes in the tree rooted at |node| |
| */ |
| public static void stripIds(Node node) { |
| stripAttributeFromTags(node, "ID", new String[]{"*"}); |
| } |
| |
| /** |
| * Strips all "color" attributes from "font" nodes in the tree rooted at |rootNode| |
| */ |
| public static void stripFontColorAttributes(Node rootNode) { |
| stripAttributeFromTags(rootNode, "COLOR", new String[]{"FONT"}); |
| } |
| |
| /** |
| * Strips all "bgcolor" attributes from table nodes in the tree rooted at |rootNode| |
| */ |
| public static void stripTableBackgroundColorAttributes(Node rootNode) { |
| stripAttributeFromTags(rootNode, "BGCOLOR", new String[]{"TABLE", "TR", "TD", "TH"}); |
| } |
| |
| /** |
| * Strips all "style" attributes from all nodes in the tree rooted at |rootNode| |
| */ |
| public static void stripStyleAttributes(Node rootNode) { |
| stripAttributeFromTags(rootNode, "STYLE", new String[]{"*"}); |
| } |
| |
| /** |
| * Strips all "target" attributes from anchor nodes in the tree rooted at |rootNode| |
| */ |
| public static void stripTargetAttributes(Node rootNode) { |
| stripAttributeFromTags(rootNode, "TARGET", new String[]{"A"}); |
| } |
| |
| /** |
| * Get a list of relevant nodes from a subtree. |
| * @param root The root of the subtree. |
| * @return A list of relevant nodes. |
| */ |
| public static List<Node> getOutputNodes(Node root) { |
| final List<Node> nodes = new ArrayList<>(); |
| new DomWalker(new DomWalker.Visitor() { |
| @Override |
| public boolean visit(Node n) { |
| switch (n.getNodeType()) { |
| case Node.TEXT_NODE: |
| nodes.add(n); |
| return false; |
| case Node.ELEMENT_NODE: |
| if (!DomUtil.isVisible(Element.as(n))) return false; |
| nodes.add(n); |
| return true; |
| case Node.DOCUMENT_NODE: |
| default: |
| return false; |
| } |
| } |
| |
| @Override |
| public void exit(Node n) { |
| } |
| |
| @Override |
| public void skip(Element e) { |
| } |
| }).walk(root); |
| return nodes; |
| } |
| |
| public static int getArea(Element e) { |
| if (e != null) { |
| return e.getOffsetHeight() * e.getOffsetWidth(); |
| } |
| return 0; |
| } |
| |
| /** |
| * Generate HTML/text output for a given node tree/subtree. This will ignore hidden |
| * elements. |
| * @param subtree The root of the subtree. |
| * @param textOnly If this function should return text only and not HTML. |
| * @return The output for the provided subtree. |
| */ |
| public static String generateOutputFromTree(Node subtree, boolean textOnly) { |
| return generateOutputFromList(getOutputNodes(subtree), textOnly); |
| } |
| |
| // Returns whether querySelectorAll is available |
| public static native boolean supportQuerySelectorAll(Element root) /*-{ |
| return (typeof(root.querySelectorAll) == 'function'); |
| }-*/; |
| |
| // GWT doesn't support querySelectorAll, so testing the caller could be harder. |
| public static native NodeList<Element> querySelectorAll(Node l, String selectors) /*-{ |
| return l.querySelectorAll(selectors); |
| }-*/; |
| |
| public static native Document createHTMLDocument(Document doc) /*-{ |
| return doc.implementation.createHTMLDocument(); |
| }-*/; |
| |
| public static native Element getFirstElementChild(Document document) /*-{ |
| return document.firstElementChild; |
| }-*/; |
| } |