Discard hidden articles when using fast path
Dom Distiller returns an empty content string for certain webpages.
The shortcut logic that Dom Distiller uses when some pages have one or
more <article> or <element itemscope itemtype="Article"> can lead to
erroneous content.
Some web pages have hidden articles, and these are being considered as
valid elements.
In order to fix this, when Dom Distiller is searching for article tags,
it will now discard hidden article elements.
Contributors=marcelorcorrea@hp.com, mlongaray@hp.com
BUG=544962, 616954
R=mdjones@chromium.org, wychen@chromium.org
Review URL: https://codereview.chromium.org/1411603004 .
Patch from marcelorcorrea <marcelorcorrea@hp.com>.
diff --git a/java/org/chromium/distiller/ContentExtractor.java b/java/org/chromium/distiller/ContentExtractor.java
index 4a8f8bd..7d16a77 100644
--- a/java/org/chromium/distiller/ContentExtractor.java
+++ b/java/org/chromium/distiller/ContentExtractor.java
@@ -162,35 +162,13 @@
}
/**
- * Get the element of the main article, if any.
- * @return An element of article (not necessarily the html5 article element).
- */
- private Element getArticleElement(Element root) {
- NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE");
- // Having multiple article elements usually indicates a bad case for this shortcut.
- // TODO(wychen): some sites exclude things like title and author in article element.
- if (allArticles.getLength() == 1) {
- return allArticles.getItem(0);
- }
- // Note that the CSS property matching is case sensitive, and "Article" is the correct
- // capitalization.
- String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*=\"Post\"]";
- allArticles = DomUtil.querySelectorAll(root, query);
- // It is commonly seen that the article is wrapped separately or in multiple layers.
- if (allArticles.getLength() > 0) {
- return Element.as(DomUtil.getNearestCommonAncestor(allArticles));
- }
- return null;
- }
-
- /**
* Converts the original HTML page into a WebDocument for analysis.
*/
private WebDocumentInfo createWebDocumentInfoFromPage() {
WebDocumentInfo info = new WebDocumentInfo();
WebDocumentBuilder documentBuilder = new WebDocumentBuilder();
DomConverter converter = new DomConverter(documentBuilder);
- Element walkerRoot = getArticleElement(documentElement);
+ Element walkerRoot = DomUtil.getArticleElement(documentElement);
if (walkerRoot == null) {
walkerRoot = documentElement;
}
diff --git a/java/org/chromium/distiller/DomUtil.java b/java/org/chromium/distiller/DomUtil.java
index 28ec2ac..3617c2b 100644
--- a/java/org/chromium/distiller/DomUtil.java
+++ b/java/org/chromium/distiller/DomUtil.java
@@ -102,6 +102,57 @@
opacity == 0.0F);
}
+ /**
+ * Verifies if a given element is visible by checking its offset.
+ */
+ public static boolean isVisibleByOffset(Element e) {
+ // Detect whether any of the ancestors has "display: none".
+ // Using offsetParent alone wouldn't work because it's also null when position is fixed.
+ // Using offsetHeight/Width alone makes sense in production, but we have too many
+ // zero-sized elements in our tests.
+ return e.getOffsetParent() != null || e.getOffsetHeight() != 0 || e.getOffsetWidth() != 0;
+ }
+
+ /**
+ * Get the element of the main article, if any.
+ * @return An element of article (not necessarily the html5 article element).
+ */
+ public static Element getArticleElement(Element root) {
+ NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE");
+ List<Element> visibleElements = getVisibleElements(allArticles);
+ // Having multiple article elements usually indicates a bad case for this shortcut.
+ // TODO(wychen): some sites exclude things like title and author in article element.
+ if (visibleElements.size() == 1) {
+ return visibleElements.get(0);
+ }
+ // Note that the CSS property matching is case sensitive, and "Article" is the correct
+ // capitalization.
+ String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*=\"Post\"]";
+ allArticles = DomUtil.querySelectorAll(root, query);
+ visibleElements = getVisibleElements(allArticles);
+ // It is commonly seen that the article is wrapped separately or in multiple layers.
+ if (visibleElements.size() > 0) {
+ return Element.as(DomUtil.getNearestCommonAncestor(visibleElements));
+ }
+ return null;
+ }
+
+ /**
+ * Get a list of visible elements.
+ * @return A list of visible elements.
+ */
+ public static List<Element> getVisibleElements(NodeList<Element> nodeList) {
+ List<Element> visibleElements = new ArrayList<>();
+ for (int i = 0; i < nodeList.getLength(); i++) {
+ Element element = nodeList.getItem(i);
+ if (DomUtil.isVisible(element) &&
+ DomUtil.isVisibleByOffset(element)) {
+ visibleElements.add(element);
+ }
+ }
+ return visibleElements;
+ }
+
/*
* We want to use jsni for direct access to javascript's innerText. This avoids GWT's
* implementation of Element::getInnerText(), which is intentionally different to mimic an old
@@ -171,11 +222,11 @@
/**
* Get the nearest common ancestor of nodes.
*/
- public static Node getNearestCommonAncestor(final NodeList ns) {
- if (ns.getLength() == 0) return null;
- Node parent = ns.getItem(0);
- for (int i = 1; i < ns.getLength(); i++) {
- parent = getNearestCommonAncestor(parent, ns.getItem(i));
+ public static Node getNearestCommonAncestor(final List<Element> ns) {
+ if (ns.size() == 0) return null;
+ Node parent = ns.get(0);
+ for (int i = 1; i < ns.size(); i++) {
+ parent = getNearestCommonAncestor(parent, ns.get(i));
}
return parent;
}
diff --git a/javatests/org/chromium/distiller/ContentExtractorTest.java b/javatests/org/chromium/distiller/ContentExtractorTest.java
index c1457c6..4b6d424 100644
--- a/javatests/org/chromium/distiller/ContentExtractorTest.java
+++ b/javatests/org/chromium/distiller/ContentExtractorTest.java
@@ -556,91 +556,6 @@
assertEquals(expected, TestUtil.removeAllDirAttributes(extractedContent));
}
- public void testOnlyProcessArticleElement() {
- final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";
-
- final String html = "<h1>" + CONTENT_TEXT + "</h1><div>" + article + "</div>";
- final String expected = "<h1>" + CONTENT_TEXT + "</h1>" + article;
-
- // Make sure everything is there before using the fast path.
- assertExtractor(expected, html);
-
- final String htmlArticle =
- "<h1>" + CONTENT_TEXT + "</h1>" +
- "<article>" + article + "</article>";
-
- assertExtractor(article, htmlArticle);
- }
-
- public void testOnlyProcessArticleElementMultiple() {
- final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";
-
- final String htmlArticle =
- "<h1>" + CONTENT_TEXT + "</h1>" +
- "<article>" + article + "</article>" +
- "<article>" + article + "</article>";
- final String expected = "<h1>" + CONTENT_TEXT + "</h1>" + article + article;
-
- // The existence of multiple articles disables the fast path.
- assertExtractor(expected, htmlArticle);
- }
-
- public void testOnlyProcessOGArticle() {
- final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";
-
- final String htmlArticle =
- "<h1>" + CONTENT_TEXT + "</h1>" +
- "<div itemscope itemtype=\"http://schema.org/Article\">" + article + "</div>";
-
- assertExtractor(article, htmlArticle);
- }
-
- public void testOnlyProcessOGArticleNews() {
- final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";
-
- final String htmlArticle =
- "<h1>" + CONTENT_TEXT + "</h1>" +
- "<div itemscope itemtype=\"http://schema.org/NewsArticle\">" + article + "</div>";
-
- assertExtractor(article, htmlArticle);
- }
-
- public void testOnlyProcessOGArticleBlog() {
- final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";
-
- final String htmlArticle =
- "<h1>" + CONTENT_TEXT + "</h1>" +
- "<div itemscope itemtype=\"http://schema.org/BlogPosting\">" + article + "</div>";
-
- assertExtractor(article, htmlArticle);
- }
-
- public void testOnlyProcessOGArticleNested() {
- final String paragraph = "<p>" + CONTENT_TEXT + "</p>";
- final String article = paragraph + paragraph;
-
- final String htmlArticle =
- "<h1>" + CONTENT_TEXT + "</h1>" +
- "<div itemscope itemtype=\"http://schema.org/Article\">" +
- paragraph +
- "<div itemscope itemtype=\"http://schema.org/Article\">" + paragraph + "</div>" +
- "</div>";
-
- assertExtractor(article, htmlArticle);
- }
-
- public void testOnlyProcessOGNonArticleMovie() {
- final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";
-
- final String htmlArticle =
- "<h1>" + CONTENT_TEXT + "</h1>" +
- "<div itemscope itemtype=\"http://schema.org/Movie\">" + article + "</div>";
- final String expected = "<h1>" + CONTENT_TEXT + "</h1>" + article;
-
- // Non-article schema.org types should not use the fast path.
- assertExtractor(expected, htmlArticle);
- }
-
public void testDropCap() {
String html =
"<h1>" +
diff --git a/javatests/org/chromium/distiller/DomUtilTest.java b/javatests/org/chromium/distiller/DomUtilTest.java
index 2336b36..d8d9f4b 100644
--- a/javatests/org/chromium/distiller/DomUtilTest.java
+++ b/javatests/org/chromium/distiller/DomUtilTest.java
@@ -10,6 +10,7 @@
import com.google.gwt.dom.client.Document;
import com.google.gwt.dom.client.Element;
import com.google.gwt.dom.client.Node;
+import com.google.gwt.dom.client.NodeList;
import java.util.Map;
import java.util.List;
@@ -110,8 +111,8 @@
currDiv.appendChild(TestUtil.createDiv(5));
assertEquals(div2, DomUtil.getNearestCommonAncestor(finalDiv1, currDiv.getChild(0)));
- assertEquals(div2, DomUtil.getNearestCommonAncestor(
- DomUtil.querySelectorAll(mRoot, "[id=\"3\"],[id=\"5\"]")));
+ NodeList<Element> nodeList = DomUtil.querySelectorAll(mRoot, "[id=\"3\"],[id=\"5\"]");
+ assertEquals(div2, DomUtil.getNearestCommonAncestor(TestUtil.nodeListToList(nodeList)));
}
/**
@@ -129,8 +130,8 @@
div2.appendChild(div3);
assertEquals(div, DomUtil.getNearestCommonAncestor(div, div3));
- assertEquals(div, DomUtil.getNearestCommonAncestor(
- DomUtil.querySelectorAll(mRoot, "[id=\"1\"],[id=\"3\"]")));
+ NodeList<Element> nodeList = DomUtil.querySelectorAll(mRoot, "[id=\"1\"],[id=\"3\"]");
+ assertEquals(div, DomUtil.getNearestCommonAncestor(TestUtil.nodeListToList(nodeList)));
}
public void testNodeDepth() {
@@ -378,6 +379,212 @@
assertEquals(expected, mBody.getInnerHTML());
}
+ public void testIsVisibleByOffsetParentDisplayNone() {
+ String html =
+ "<div style=\"display: none;\">" +
+ "<div></div>" +
+ "</div>";
+ mBody.setInnerHTML(html);
+ Element child = mBody.getFirstChildElement().getFirstChildElement();
+ assertFalse(DomUtil.isVisibleByOffset(child));
+ }
+
+ public void testIsVisibleByOffsetChildDisplayNone() {
+ String html =
+ "<div>" +
+ "<div style=\"display: none;\"></div>" +
+ "</div>";
+ mBody.setInnerHTML(html);
+ Element child = mBody.getFirstChildElement().getFirstChildElement();
+ assertFalse(DomUtil.isVisibleByOffset(child));
+ }
+
+ public void testIsVisibleByOffsetDisplayBlock() {
+ String html =
+ "<div>" +
+ "<div></div>" +
+ "</div>";
+ mBody.setInnerHTML(html);
+ Element child = mBody.getFirstChildElement().getFirstChildElement();
+ assertTrue(DomUtil.isVisibleByOffset(child));
+ }
+
+ public void testOnlyProcessArticleElement() {
+ final String htmlArticle =
+ "<h1></h1>" +
+ "<article></article>";
+
+ String expected = "<article></article>";
+
+ Element result = getArticleElement(htmlArticle);
+ assertEquals(expected, result.getString());
+ }
+
+ public void testOnlyProcessArticleElementWithHiddenArticleElement() {
+ final String htmlArticle =
+ "<h1></h1>" +
+ "<article></article>" +
+ "<article style=\"display:none\"></article>";
+
+ String expected = "<article></article>";
+
+ Element result = getArticleElement(htmlArticle);
+ assertEquals(expected, result.getString());
+ }
+
+ public void testOnlyProcessArticleElementMultiple() {
+ final String htmlArticle =
+ "<h1></h1>" +
+ "<article></article>" +
+ "<article></article>";
+
+ // The existence of multiple articles disables the fast path.
+ assertNull(getArticleElement(htmlArticle));
+ }
+
+ public void testOnlyProcessSchemaOrgArticle() {
+ final String htmlArticle =
+ "<h1></h1>" +
+ "<div itemscope itemtype=\"http://schema.org/Article\">" +
+ "</div>";
+
+ final String expected =
+ "<div itemscope=\"\" " +
+ "itemtype=\"http://schema.org/Article\">" +
+ "</div>";
+
+ Element result = getArticleElement(htmlArticle);
+ assertEquals(expected, result.getString());
+ }
+
+ public void testOnlyProcessSchemaOrgArticleWithHiddenArticleElement() {
+ final String htmlArticle =
+ "<h1></h1>" +
+ "<div itemscope itemtype=\"http://schema.org/Article\">" +
+ "</div>" +
+ "<div itemscope itemtype=\"http://schema.org/Article\" " +
+ "style=\"display:none\">" +
+ "</div>";
+
+ String expected =
+ "<div itemscope=\"\" itemtype=\"http://schema.org/Article\">" +
+ "</div>";
+
+ Element result = getArticleElement(htmlArticle);
+ assertEquals(expected, result.getString());
+ }
+
+ public void testOnlyProcessSchemaOrgArticleNews() {
+ final String htmlArticle =
+ "<h1></h1>" +
+ "<div itemscope itemtype=\"http://schema.org/NewsArticle\">" +
+ "</div>";
+
+ final String expected =
+ "<div itemscope=\"\" " +
+ "itemtype=\"http://schema.org/NewsArticle\">" +
+ "</div>";
+
+ Element result = getArticleElement(htmlArticle);
+ assertEquals(expected, result.getString());
+ }
+
+ public void testOnlyProcessSchemaOrgArticleBlog() {
+ final String htmlArticle =
+ "<h1></h1>" +
+ "<div itemscope itemtype=\"http://schema.org/BlogPosting\">" +
+ "</div>";
+
+ final String expected =
+ "<div itemscope=\"\" " +
+ "itemtype=\"http://schema.org/BlogPosting\">" +
+ "</div>";
+
+ Element result = getArticleElement(htmlArticle);
+ assertEquals(expected, result.getString());
+ }
+
+ public void testOnlyProcessSchemaOrgArticleNested() {
+ final String htmlArticle =
+ "<h1></h1>" +
+ "<div itemscope itemtype=\"http://schema.org/Article\">" +
+ "<div itemscope itemtype=\"http://schema.org/Article\">" +
+ "</div>" +
+ "</div>";
+
+ final String expected =
+ "<div itemscope=\"\" itemtype=\"http://schema.org/Article\">" +
+ "<div itemscope=\"\" itemtype=\"http://schema.org/Article\">" +
+ "</div>" +
+ "</div>";
+
+ Element result = getArticleElement(htmlArticle);
+ assertEquals(expected, result.getString());
+ }
+
+ public void testOnlyProcessSchemaOrgArticleNestedWithNestedHiddenArticleElement() {
+ final String htmlArticle =
+ "<h1></h1>" +
+ "<div itemscope itemtype=\"http://schema.org/Article\">" +
+ "<div itemscope itemtype=\"http://schema.org/Article\">" +
+ "</div>" +
+ "<div itemscope itemtype=\"http://schema.org/Article\" " +
+ "style=\"display:none\">" +
+ "</div>" +
+ "</div>";
+
+ final String expected =
+ "<div itemscope=\"\" itemtype=\"http://schema.org/Article\">" +
+ "<div itemscope=\"\" itemtype=\"http://schema.org/Article\">" +
+ "</div>" +
+ "<div itemscope=\"\" itemtype=\"http://schema.org/Article\" " +
+ "style=\"display:none\">" +
+ "</div>" +
+ "</div>";
+
+ Element result = getArticleElement(htmlArticle);
+ assertEquals(expected, result.getString());
+ }
+
+ public void testOnlyProcessSchemaOrgArticleNestedWithHiddenArticleElement() {
+ final String paragraph = "<p></p>";
+
+ final String htmlArticle =
+ "<h1></h1>" +
+ "<div itemscope itemtype=\"http://schema.org/Article\">" +
+ "<div itemscope itemtype=\"http://schema.org/Article\">" +
+ "</div>" +
+ "</div>" +
+ "<div itemscope itemtype=\"http://schema.org/Article\" " +
+ "style=\"display:none\">" +
+ "</div>";
+
+ final String expected =
+ "<div itemscope=\"\" itemtype=\"http://schema.org/Article\">" +
+ "<div itemscope=\"\" itemtype=\"http://schema.org/Article\">" +
+ "</div>" +
+ "</div>";
+
+ Element result = getArticleElement(htmlArticle);
+ assertEquals(expected, result.getString());
+ }
+
+ public void testOnlyProcessSchemaOrgNonArticleMovie() {
+ final String htmlArticle =
+ "<h1></h1>" +
+ "<div itemscope itemtype=\"http://schema.org/Movie\">" +
+ "</div>";
+
+ // Non-article schema.org types should not use the fast path.
+ Element result = getArticleElement(htmlArticle);
+ assertNull(result);
+ }
+
+ private Element getArticleElement(String html) {
+ mBody.setInnerHTML(html);
+ return DomUtil.getArticleElement(mRoot);
+ }
+
public void testGetArea() {
String elements =
"<div style=\"width: 200px; height: 100px\">w</div>" +
diff --git a/javatests/org/chromium/distiller/TestUtil.java b/javatests/org/chromium/distiller/TestUtil.java
index 6db5ad1..107eaa9 100644
--- a/javatests/org/chromium/distiller/TestUtil.java
+++ b/javatests/org/chromium/distiller/TestUtil.java
@@ -13,6 +13,7 @@
import com.google.gwt.dom.client.MetaElement;
import com.google.gwt.dom.client.Text;
import com.google.gwt.dom.client.TitleElement;
+import com.google.gwt.dom.client.NodeList;
import com.google.gwt.user.client.Random;
import com.google.gwt.user.client.Window;
@@ -161,6 +162,14 @@
return originalHtml.replaceAll(" dir=\\\"(ltr|rtl|inherit|auto)\\\"","");
}
+ public static List<Element> nodeListToList(NodeList<Element> nodeList) {
+ List<Element> elements = new ArrayList<>();
+ for (int i = 0; i < nodeList.getLength(); i++) {
+ elements.add(nodeList.getItem(i));
+ }
+ return elements;
+ }
+
/**
* Randomly shuffle the list in-place.
*/