Leverage semantic elements to find the main article
Semantic web elements or metadata can tell us where the main article
is in a structured way. Only processing that element is a fast path
to boost the speed of distillation.
In a well-formed HTML5 page of an article, the <article> element can
be used directly for distillation. The processing is much faster this
way.
Similarly, schema.org metadata can give a strong hint where the main
article is. This is often less precise than the <article> element,
so instead of using a single matching element directly, the nearest
common ancestor of all matching elements is used as the root.
** Score changes:
- Content:
reader-mode-golden-data:
Average precision +0.003, recall -0.001.
- Images:
reader-images-golden-data:
Average precision +0.012, recall +0.005.
- Next page:
page-links-golden-data:
No change.
** Performance improvement:
- reader-mode-golden-data:
Spent 15% less time on average.
BUG=431067
R=cjhopman@chromium.org
Review URL: https://codereview.chromium.org/1131793009.
diff --git a/java/org/chromium/distiller/ContentExtractor.java b/java/org/chromium/distiller/ContentExtractor.java
index 42e2766..b9f0b2d 100644
--- a/java/org/chromium/distiller/ContentExtractor.java
+++ b/java/org/chromium/distiller/ContentExtractor.java
@@ -20,6 +20,7 @@
import com.google.gwt.dom.client.Document;
import com.google.gwt.dom.client.Element;
import com.google.gwt.dom.client.Node;
+import com.google.gwt.dom.client.NodeList;
import java.util.ArrayList;
import java.util.LinkedList;
@@ -159,13 +160,39 @@
}
/**
+ * Get the element of the main article, if any.
+ * A lone html5 article element under root is preferred. Otherwise, elements that have an
+ * "itemscope" attribute and an "itemtype" attribute containing "Article" or "Post" are
+ * looked up, and the nearest common ancestor of all matches is used.
+ * @param root The element whose subtree is searched.
+ * @return An element of article (not necessarily the html5 article element), or null if
+ * neither signal is found.
+ */
+ private Element getArticleElement(Element root) {
+ NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE");
+ // Having multiple article elements usually indicates a bad case for this shortcut.
+ // TODO(wychen): some sites exclude things like title and author in article element.
+ if (allArticles.getLength() == 1) {
+ return allArticles.getItem(0);
+ }
+ // Note that the CSS attribute selector value matching is case sensitive, and "Article"
+ // is the correct capitalization. "*=" is a substring match on the attribute value.
+ String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*=\"Post\"]";
+ allArticles = DomUtil.querySelectorAll(root, query);
+ // It is commonly seen that the article is wrapped separately or in multiple layers.
+ if (allArticles.getLength() > 0) {
+ return Element.as(DomUtil.getNearestCommonAncestor(allArticles));
+ }
+ return null;
+ }
+
+ /**
* Converts the original HTML page into a WebDocument for analysis.
*/
private WebDocumentInfo createWebDocumentInfoFromPage() {
WebDocumentInfo info = new WebDocumentInfo();
WebDocumentBuilder documentBuilder = new WebDocumentBuilder();
DomConverter converter = new DomConverter(documentBuilder);
- new DomWalker(converter).walk(documentElement);
+ Element walkerRoot = getArticleElement(documentElement);
+ if (walkerRoot == null) {
+ walkerRoot = documentElement;
+ }
+ new DomWalker(converter).walk(walkerRoot);
info.document = documentBuilder.toWebDocument();
ensureTitleInitialized();
info.hiddenElements = converter.getHiddenElements();
diff --git a/java/org/chromium/distiller/DomUtil.java b/java/org/chromium/distiller/DomUtil.java
index 374bdbb..27686c7 100644
--- a/java/org/chromium/distiller/DomUtil.java
+++ b/java/org/chromium/distiller/DomUtil.java
@@ -168,6 +168,18 @@
}
/**
+ * Get the nearest common ancestor of nodes.
+ * The result is computed by applying the two-node getNearestCommonAncestor(Node, Node)
+ * successively over the list, starting from the first node.
+ * @param ns The nodes to find the common ancestor of.
+ * @return The nearest common ancestor of all nodes in ns (the node itself if ns holds a
+ * single node), or null if ns is empty.
+ */
+ public static Node getNearestCommonAncestor(final NodeList<? extends Node> ns) {
+ if (ns.getLength() == 0) return null;
+ Node parent = ns.getItem(0);
+ for (int i = 1; i < ns.getLength(); i++) {
+ parent = getNearestCommonAncestor(parent, ns.getItem(i));
+ }
+ return parent;
+ }
+
+ /**
* Get all text from a tree/sub-tree.
* @param node The root of the tree.
* @return The text contained in this tree.
diff --git a/javatests/org/chromium/distiller/ContentExtractorTest.java b/javatests/org/chromium/distiller/ContentExtractorTest.java
index 34ec3ef..d248e6f 100644
--- a/javatests/org/chromium/distiller/ContentExtractorTest.java
+++ b/javatests/org/chromium/distiller/ContentExtractorTest.java
@@ -138,4 +138,100 @@
"<span><font>" + CONTENT_TEXT + "</font></span> </font>",
TestUtil.removeAllDirAttributes(extractedContent));
}
+
+ /**
+ * Replaces the content of mBody with a single div holding the given html, runs a
+ * ContentExtractor created on mRoot, and compares the extracted content, after passing it
+ * through TestUtil.removeAllDirAttributes, with the expected string.
+ * @param expected The expected extracted html.
+ * @param html The html placed inside the wrapper div.
+ */
+ private void assertExtractor(String expected, String html) {
+ mBody.setInnerHTML("");
+ Element div = TestUtil.createDiv(0);
+ mBody.appendChild(div);
+
+ div.setInnerHTML(html);
+ ContentExtractor extractor = new ContentExtractor(mRoot);
+ String extractedContent = extractor.extractContent();
+ assertEquals(expected, TestUtil.removeAllDirAttributes(extractedContent));
+ }
+
+ /**
+ * First checks that, without an article element, the h1 and both paragraphs are
+ * extracted; then that wrapping the paragraphs in a single article element limits the
+ * output to those paragraphs.
+ */
+ public void testOnlyProcessArticleElement() {
+ final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";
+
+ final String html = "<h1>" + CONTENT_TEXT + "</h1><div>" + article + "</div>";
+ final String expected = "<h1>" + CONTENT_TEXT + "</h1>" + article;
+
+ // Make sure everything is there before using the fast path.
+ assertExtractor(expected, html);
+
+ final String htmlArticle =
+ "<h1>" + CONTENT_TEXT + "</h1>" +
+ "<article>" + article + "</article>";
+
+ assertExtractor(article, htmlArticle);
+ }
+
+ /**
+ * A page with two article elements: the h1 outside of them and the content of both
+ * articles are expected in the output.
+ */
+ public void testOnlyProcessArticleElementMultiple() {
+ final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";
+
+ final String htmlArticle =
+ "<h1>" + CONTENT_TEXT + "</h1>" +
+ "<article>" + article + "</article>" +
+ "<article>" + article + "</article>";
+ final String expected = "<h1>" + CONTENT_TEXT + "</h1>" + article + article;
+
+ // The existence of multiple articles disables the fast path.
+ assertExtractor(expected, htmlArticle);
+ }
+
+ /**
+ * A div annotated with itemscope and a schema.org "Article" itemtype: only the content of
+ * that div is expected in the output, and the h1 outside of it is dropped.
+ * NOTE(review): despite "OG" in the name, the markup used here is schema.org microdata
+ * (itemscope/itemtype attributes), not Open Graph meta tags.
+ */
+ public void testOnlyProcessOGArticle() {
+ final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";
+
+ final String htmlArticle =
+ "<h1>" + CONTENT_TEXT + "</h1>" +
+ "<div itemscope itemtype=\"http://schema.org/Article\">" + article + "</div>";
+
+ assertExtractor(article, htmlArticle);
+ }
+
+ /**
+ * Same as the plain "Article" case, but with a "NewsArticle" itemtype, which contains
+ * the substring "Article": only the content of the annotated div is expected.
+ */
+ public void testOnlyProcessOGArticleNews() {
+ final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";
+
+ final String htmlArticle =
+ "<h1>" + CONTENT_TEXT + "</h1>" +
+ "<div itemscope itemtype=\"http://schema.org/NewsArticle\">" + article + "</div>";
+
+ assertExtractor(article, htmlArticle);
+ }
+
+ /**
+ * A div with a "BlogPosting" itemtype, which contains the substring "Post" rather than
+ * "Article": only the content of the annotated div is expected.
+ */
+ public void testOnlyProcessOGArticleBlog() {
+ final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";
+
+ final String htmlArticle =
+ "<h1>" + CONTENT_TEXT + "</h1>" +
+ "<div itemscope itemtype=\"http://schema.org/BlogPosting\">" + article + "</div>";
+
+ assertExtractor(article, htmlArticle);
+ }
+
+ /**
+ * An "Article"-typed div nested inside another "Article"-typed div: the paragraphs of both
+ * the outer and the inner div are expected in the output, and the h1 outside is dropped.
+ */
+ public void testOnlyProcessOGArticleNested() {
+ final String paragraph = "<p>" + CONTENT_TEXT + "</p>";
+ final String article = paragraph + paragraph;
+
+ final String htmlArticle =
+ "<h1>" + CONTENT_TEXT + "</h1>" +
+ "<div itemscope itemtype=\"http://schema.org/Article\">" +
+ paragraph +
+ "<div itemscope itemtype=\"http://schema.org/Article\">" + paragraph + "</div>" +
+ "</div>";
+
+ assertExtractor(article, htmlArticle);
+ }
+
+ /**
+ * A div with a "Movie" itemtype, which contains neither "Article" nor "Post": the h1
+ * outside the annotated div is expected in the output along with the paragraphs.
+ */
+ public void testOnlyProcessOGNonArticleMovie() {
+ final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";
+
+ final String htmlArticle =
+ "<h1>" + CONTENT_TEXT + "</h1>" +
+ "<div itemscope itemtype=\"http://schema.org/Movie\">" + article + "</div>";
+ final String expected = "<h1>" + CONTENT_TEXT + "</h1>" + article;
+
+ // Non-article schema.org types should not use the fast path.
+ assertExtractor(expected, htmlArticle);
+ }
}
diff --git a/javatests/org/chromium/distiller/DomUtilTest.java b/javatests/org/chromium/distiller/DomUtilTest.java
index fee4fc5..01e4422 100644
--- a/javatests/org/chromium/distiller/DomUtilTest.java
+++ b/javatests/org/chromium/distiller/DomUtilTest.java
@@ -89,8 +89,14 @@
assertEquals(0, result.size());
}
+ /**
+ * The tree graph is:
+ * 1 - 2 - 3
+ * \ 4 - 5
+ */
public void testNearestCommonAncestor() {
Element div = TestUtil.createDiv(1);
+ mBody.appendChild(div);
Element div2 = TestUtil.createDiv(2);
div.appendChild(div2);
@@ -104,10 +110,17 @@
currDiv.appendChild(TestUtil.createDiv(5));
assertEquals(div2, DomUtil.getNearestCommonAncestor(finalDiv1, currDiv.getChild(0)));
+ assertEquals(div2, DomUtil.getNearestCommonAncestor(
+ DomUtil.querySelectorAll(mRoot, "[id=\"3\"],[id=\"5\"]")));
}
+ /**
+ * The tree graph is:
+ * 1 - 2 - 3
+ */
public void testNearestCommonAncestorIsRoot() {
Element div = TestUtil.createDiv(1);
+ mBody.appendChild(div);
Element div2 = TestUtil.createDiv(2);
div.appendChild(div2);
@@ -116,6 +129,8 @@
div2.appendChild(div3);
assertEquals(div, DomUtil.getNearestCommonAncestor(div, div3));
+ assertEquals(div, DomUtil.getNearestCommonAncestor(
+ DomUtil.querySelectorAll(mRoot, "[id=\"1\"],[id=\"3\"]")));
}
public void testNodeDepth() {