// blob: 31f17a342dd8804bf68a041adb8f54b27698ac7b [file] [log] [blame]
// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
package org.chromium.distiller.webdocument;
import org.chromium.distiller.DomUtil;
import org.chromium.distiller.DomWalker;
import org.chromium.distiller.LogUtil;
import org.chromium.distiller.TableClassifier;
import com.google.gwt.dom.client.Element;
import com.google.gwt.dom.client.Node;
import com.google.gwt.dom.client.Style;
import com.google.gwt.dom.client.TableElement;
import com.google.gwt.dom.client.Text;
import org.chromium.distiller.extractors.embeds.EmbedExtractor;
import org.chromium.distiller.extractors.embeds.ImageExtractor;
import org.chromium.distiller.extractors.embeds.TwitterExtractor;
import org.chromium.distiller.extractors.embeds.VimeoExtractor;
import org.chromium.distiller.extractors.embeds.YouTubeExtractor;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * This DomWalker.Visitor creates a WebDocument from the walked DOM. It skips hidden and other
 * elements that should not be in the created document. Hidden elements are recorded and can be
 * retrieved through {@link #getHiddenElements()} after processing; data tables are handed to the
 * builder via {@code dataTable()} instead of being walked.
 */
public class DomConverter implements DomWalker.Visitor {
    // Receives every document-construction event produced while walking.
    private final WebDocumentBuilderInterface builder;
    // Elements that DomUtil.isVisible() reported as not visible during the walk.
    private final Set<Node> hiddenElements;
    // Extractors are consulted in insertion order; the first non-null result wins.
    private final List<EmbedExtractor> extractors;
    // For quick lookup of tags that could possibly be embeds.
    private final Set<String> embedTagNames;

    /**
     * Creates a converter that reports everything it finds to {@code builder}.
     *
     * <p>Registers the image, Twitter, Vimeo and YouTube extractors (in that order) and caches
     * the union of the tag names they declare as relevant.
     *
     * @param builder the sink for the document-construction events.
     */
    public DomConverter(WebDocumentBuilderInterface builder) {
        hiddenElements = new HashSet<Node>();
        this.builder = builder;

        extractors = new ArrayList<EmbedExtractor>();
        extractors.add(new ImageExtractor());
        extractors.add(new TwitterExtractor());
        extractors.add(new VimeoExtractor());
        extractors.add(new YouTubeExtractor());

        embedTagNames = new HashSet<>();
        for (EmbedExtractor extractor : extractors) {
            embedTagNames.addAll(extractor.getRelevantTagNames());
        }
    }

    /**
     * Returns the elements that were found to be invisible while walking.
     *
     * @return the live internal set (not a copy); it keeps growing while the walk continues.
     */
    public final Set<Node> getHiddenElements() {
        return hiddenElements;
    }

    /**
     * Reports {@code e} to the builder as a skipped element.
     *
     * @param e the element being skipped.
     */
    @Override
    public void skip(Element e) {
        builder.skipElement(e);
    }

    /**
     * Dispatches on the node type: text nodes are forwarded to the builder, elements are
     * handled by {@link #visitElement(Element)}, and every other node type is ignored.
     *
     * @param n the node being visited.
     * @return {@code true} only when an element was started on the builder; {@code false} for
     *     text nodes, ignored node types and elements that were skipped or fully consumed.
     *     NOTE(review): presumably DomWalker uses this to decide whether to descend into the
     *     node's children -- confirm against DomWalker.
     */
    @Override
    public boolean visit(Node n) {
        switch (n.getNodeType()) {
            case Node.TEXT_NODE:
                builder.textNode(Text.as(n));
                return false;
            case Node.ELEMENT_NODE:
                return visitElement(Element.as(n));
            default:
                return false;
        }
    }

    /**
     * Handles a single element: records it if hidden, tries the embed extractors, and otherwise
     * applies the tag-specific rules below.
     *
     * @param e the element being visited.
     * @return {@code true} if {@code builder.startElement(e)} was called for this element,
     *     {@code false} if the element was hidden, extracted as an embed, or otherwise skipped.
     */
    private boolean visitElement(Element e) {
        // Skip invisible or uninteresting elements.
        boolean visible = DomUtil.isVisible(e);
        logVisibilityInfo(e, visible);
        if (!visible) {
            hiddenElements.add(e);
            return false;
        }

        // Node-type specific extractors check for elements they are interested in here.
        // Everything else will be filtered through the switch below.

        // Check for embedded elements that might be extracted.
        if (embedTagNames.contains(e.getTagName())) {
            // If the tag is marked as interesting, check the extractors.
            for (EmbedExtractor extractor : extractors) {
                WebElement embed = extractor.extract(e);
                if (embed != null) {
                    builder.embed(embed);
                    return false;
                }
            }
        }

        // Create a placeholder for the elements we want to preserve. The matching END tag is
        // emitted from exit().
        if (WebTag.canBeNested(e.getTagName())) {
            builder.tag(new WebTag(e.getTagName(), WebTag.TagType.START));
        }

        switch (e.getTagName()) {
            case "BR":
                builder.lineBreak(e);
                return false;

            // Skip data tables, keep track of them to be extracted by RelevantElementsFinder
            // later.
            case "TABLE":
                TableClassifier.Type type = TableClassifier.table(TableElement.as(e));
                logTableInfo(e, type);
                if (type == TableClassifier.Type.DATA) {
                    builder.dataTable(e);
                    return false;
                }
                // Non-data tables fall out of the switch and are walked like any other element.
                break;

            // Some components are revisited later in context as they break text-flow of a
            // document. e.g. <video> can contain text if format is unsupported.
            case "VIDEO":
                // Pass the client width and then the client height; the original passed the
                // height for both dimensions.
                // NOTE(review): assumes WebVideo's constructor is (element, width, height).
                builder.embed(new WebVideo(e, e.getClientWidth(), e.getClientHeight()));
                return false;

            // These element types are all skipped (but may affect document construction).
            case "OPTION":
            case "OBJECT":
            case "EMBED":
            case "APPLET":
                skip(e);
                return false;

            // These types are skipped and don't affect document construction.
            case "HEAD":
            case "STYLE":
            case "SCRIPT":
            case "LINK":
            case "NOSCRIPT":
                return false;
        }

        builder.startElement(e);
        return true;
    }

    /**
     * Closes a node on the builder. For elements whose tag can be nested, the END tag that
     * pairs with the START tag emitted in {@link #visitElement(Element)} is emitted first.
     *
     * @param n the node being left.
     */
    @Override
    public void exit(Node n) {
        if (n.getNodeType() == Node.ELEMENT_NODE) {
            Element e = Element.as(n);
            if (WebTag.canBeNested(e.getTagName())) {
                builder.tag(new WebTag(e.getTagName(), WebTag.TagType.END));
            }
        }
        // Called for every exited node, whatever its type.
        builder.endElement();
    }

    /**
     * Logs the keep/skip decision for {@code e} together with the computed display, visibility
     * and opacity styles. Does nothing unless visibility-info logging is enabled.
     *
     * @param e the element that was checked.
     * @param visible the visibility verdict for {@code e}.
     */
    private static void logVisibilityInfo(Element e, boolean visible) {
        if (!LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) return;
        Style style = DomUtil.getComputedStyle(e);
        LogUtil.logToConsole((visible ? "KEEP " : "SKIP ") + e.getTagName() +
                ": id=" + e.getId() +
                ", dsp=" + style.getDisplay() +
                ", vis=" + style.getVisibility() +
                ", opaq=" + style.getOpacity());
    }

    /**
     * Logs the classification of a table along with the id and class of the table and of its
     * parent element. Does nothing unless visibility-info logging is enabled.
     *
     * @param e the table element.
     * @param type the classification computed for {@code e}.
     */
    private static void logTableInfo(Element e, TableClassifier.Type type) {
        if (!LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) return;
        Element parent = e.getParentElement();
        // A table without a parent element must not crash debug logging.
        String parentInfo = (parent == null)
                ? "none"
                : parent.getTagName() +
                        ", id=" + parent.getId() +
                        ", class=" + parent.getClassName();
        LogUtil.logToConsole("TABLE: " + type +
                ", id=" + e.getId() +
                ", class=" + e.getClassName() +
                ", parent=[" + parentInfo +
                "]");
    }
}