Leverage semantic elements to find the main article

Semantic web elements or metadata can tell us, in a structured way,
where the main article is. Processing only that element is a fast path
that speeds up distillation.

In a well-formed HTML5 page of an article, the <article> element can be
used directly as the root for distillation, provided the page contains
exactly one such element. Processing is much faster this way.

Similarly, schema.org metadata can give a strong hint about where the
main article is. This is often less precise than the <article> element,
so the nearest common ancestor of all matching elements is used instead.

** Score changes:
- Content:
    reader-mode-golden-data:
      Average precision +0.003, recall -0.001.
- Images:
    reader-images-golden-data:
      Average precision +0.012, recall +0.005.
- Next page:
    page-links-golden-data:
      No change.

** Performance improvement:
- reader-mode-golden-data:
    Spent 15% less time on average.

BUG=431067
R=cjhopman@chromium.org

Review URL: https://codereview.chromium.org/1131793009.
diff --git a/java/org/chromium/distiller/ContentExtractor.java b/java/org/chromium/distiller/ContentExtractor.java
index 42e2766..b9f0b2d 100644
--- a/java/org/chromium/distiller/ContentExtractor.java
+++ b/java/org/chromium/distiller/ContentExtractor.java
@@ -20,6 +20,7 @@
 import com.google.gwt.dom.client.Document;
 import com.google.gwt.dom.client.Element;
 import com.google.gwt.dom.client.Node;
+import com.google.gwt.dom.client.NodeList;
 
 import java.util.ArrayList;
 import java.util.LinkedList;
@@ -159,13 +160,39 @@
     }
 
     /**
+     * Get the element of the main article, if any.
+     * @return An element of article (not necessarily the html5 article element), or null.
+     */
+    private Element getArticleElement(Element root) {
+        NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE");
+        // Having multiple article elements usually indicates a bad case for this shortcut.
+        // TODO(wychen): some sites exclude things like title and author in article element.
+        if (allArticles.getLength() == 1) {
+            return allArticles.getItem(0);
+        }
+        // The attribute value matching is case sensitive, and "Article" is the correct
+        // capitalization. "Posting" (not "Post") avoids matching types like "PostalAddress".
+        String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*=\"Posting\"]";
+        allArticles = DomUtil.querySelectorAll(root, query);
+        // It is commonly seen that the article is wrapped separately or in multiple layers.
+        if (allArticles.getLength() > 0) {
+            return Element.as(DomUtil.getNearestCommonAncestor(allArticles));
+        }
+        return null;
+    }
+
+    /**
      * Converts the original HTML page into a WebDocument for analysis.
      */
     private WebDocumentInfo createWebDocumentInfoFromPage() {
         WebDocumentInfo info = new WebDocumentInfo();
         WebDocumentBuilder documentBuilder = new WebDocumentBuilder();
         DomConverter converter = new DomConverter(documentBuilder);
-        new DomWalker(converter).walk(documentElement);
+        Element walkerRoot = getArticleElement(documentElement);
+        if (walkerRoot == null) {
+            walkerRoot = documentElement;
+        }
+        new DomWalker(converter).walk(walkerRoot);
         info.document = documentBuilder.toWebDocument();
         ensureTitleInitialized();
         info.hiddenElements = converter.getHiddenElements();
diff --git a/java/org/chromium/distiller/DomUtil.java b/java/org/chromium/distiller/DomUtil.java
index 374bdbb..27686c7 100644
--- a/java/org/chromium/distiller/DomUtil.java
+++ b/java/org/chromium/distiller/DomUtil.java
@@ -168,6 +168,18 @@
     }
 
     /**
+     * Get the nearest common ancestor of all nodes in ns, or null if ns is empty.
+     */
+    public static Node getNearestCommonAncestor(final NodeList<? extends Node> ns) {
+        if (ns.getLength() == 0) return null;
+        Node parent = ns.getItem(0);  // Running common ancestor of the nodes seen so far.
+        for (int i = 1; i < ns.getLength(); i++) {
+            parent = getNearestCommonAncestor(parent, ns.getItem(i));
+        }
+        return parent;
+    }
+
+    /**
      * Get all text from a tree/sub-tree.
      * @param node The root of the tree.
      * @return The text contained in this tree.
diff --git a/javatests/org/chromium/distiller/ContentExtractorTest.java b/javatests/org/chromium/distiller/ContentExtractorTest.java
index 34ec3ef..d248e6f 100644
--- a/javatests/org/chromium/distiller/ContentExtractorTest.java
+++ b/javatests/org/chromium/distiller/ContentExtractorTest.java
@@ -138,4 +138,100 @@
                      "<span><font>" + CONTENT_TEXT + "</font></span> </font>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }
+
+    private void assertExtractor(String expected, String html) {
+        mBody.setInnerHTML("");  // Discard whatever an earlier call left in the body.
+        Element div = TestUtil.createDiv(0);
+        mBody.appendChild(div);
+
+        div.setInnerHTML(html);  // The markup under test lives inside the fresh div.
+        ContentExtractor extractor = new ContentExtractor(mRoot);
+        String extractedContent = extractor.extractContent();
+        assertEquals(expected, TestUtil.removeAllDirAttributes(extractedContent));
+    }
+
+    public void testOnlyProcessArticleElement() {
+        final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";
+
+        final String html = "<h1>" + CONTENT_TEXT + "</h1><div>" + article + "</div>";
+        final String expected = "<h1>" + CONTENT_TEXT + "</h1>" + article;
+
+        // Without an article element, the heading outside the div is expected in the output.
+        assertExtractor(expected, html);
+
+        final String htmlArticle =
+            "<h1>" + CONTENT_TEXT + "</h1>" +
+            "<article>" + article + "</article>";
+
+        assertExtractor(article, htmlArticle);  // Only the article element's content is kept.
+    }
+
+    public void testOnlyProcessArticleElementMultiple() {
+        final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";
+
+        final String htmlArticle =
+            "<h1>" + CONTENT_TEXT + "</h1>" +
+            "<article>" + article + "</article>" +
+            "<article>" + article + "</article>";
+        final String expected = "<h1>" + CONTENT_TEXT + "</h1>" + article + article;
+
+        // More than one article element disables the fast path, so the heading is kept.
+        assertExtractor(expected, htmlArticle);
+    }
+
+    public void testOnlyProcessOGArticle() {
+        final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";
+
+        final String htmlArticle =
+            "<h1>" + CONTENT_TEXT + "</h1>" +
+            "<div itemscope itemtype=\"http://schema.org/Article\">" + article + "</div>";
+
+        assertExtractor(article, htmlArticle);  // Heading outside the item scope is dropped.
+    }
+
+    public void testOnlyProcessOGArticleNews() {
+        final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";
+
+        final String htmlArticle =
+            "<h1>" + CONTENT_TEXT + "</h1>" +
+            "<div itemscope itemtype=\"http://schema.org/NewsArticle\">" + article + "</div>";
+
+        assertExtractor(article, htmlArticle);  // "NewsArticle" contains "Article".
+    }
+
+    public void testOnlyProcessOGArticleBlog() {
+        final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";
+
+        final String htmlArticle =
+            "<h1>" + CONTENT_TEXT + "</h1>" +
+            "<div itemscope itemtype=\"http://schema.org/BlogPosting\">" + article + "</div>";
+
+        assertExtractor(article, htmlArticle);  // BlogPosting also takes the fast path.
+    }
+
+    public void testOnlyProcessOGArticleNested() {
+        final String paragraph = "<p>" + CONTENT_TEXT + "</p>";
+        final String article = paragraph + paragraph;
+
+        final String htmlArticle =
+            "<h1>" + CONTENT_TEXT + "</h1>" +
+            "<div itemscope itemtype=\"http://schema.org/Article\">" +
+                paragraph +
+                "<div itemscope itemtype=\"http://schema.org/Article\">" + paragraph + "</div>" +
+            "</div>";
+
+        assertExtractor(article, htmlArticle);  // Both paragraphs are kept for nested scopes.
+    }
+
+    public void testOnlyProcessOGNonArticleMovie() {
+        final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";
+
+        final String htmlArticle =
+            "<h1>" + CONTENT_TEXT + "</h1>" +
+            "<div itemscope itemtype=\"http://schema.org/Movie\">" + article + "</div>";
+        final String expected = "<h1>" + CONTENT_TEXT + "</h1>" + article;
+
+        // "Movie" is not an article type: no fast path is taken, so the heading is kept.
+        assertExtractor(expected, htmlArticle);
+    }
 }
diff --git a/javatests/org/chromium/distiller/DomUtilTest.java b/javatests/org/chromium/distiller/DomUtilTest.java
index fee4fc5..01e4422 100644
--- a/javatests/org/chromium/distiller/DomUtilTest.java
+++ b/javatests/org/chromium/distiller/DomUtilTest.java
@@ -89,8 +89,14 @@
         assertEquals(0, result.size());
     }
 
+    /**
+     * The tree graph is:
+     * 1 - 2 - 3
+     *       \ 4 - 5
+     */
     public void testNearestCommonAncestor() {
         Element div = TestUtil.createDiv(1);
+        mBody.appendChild(div);
 
         Element div2 = TestUtil.createDiv(2);
         div.appendChild(div2);
@@ -104,10 +110,17 @@
         currDiv.appendChild(TestUtil.createDiv(5));
 
         assertEquals(div2, DomUtil.getNearestCommonAncestor(finalDiv1, currDiv.getChild(0)));
+        assertEquals(div2, DomUtil.getNearestCommonAncestor(
+                DomUtil.querySelectorAll(mRoot, "[id=\"3\"],[id=\"5\"]")));
     }
 
+    /**
+     * The tree graph is:
+     * 1 - 2 - 3
+     */
     public void testNearestCommonAncestorIsRoot() {
         Element div = TestUtil.createDiv(1);
+        mBody.appendChild(div);
 
         Element div2 = TestUtil.createDiv(2);
         div.appendChild(div2);
@@ -116,6 +129,8 @@
         div2.appendChild(div3);
 
         assertEquals(div, DomUtil.getNearestCommonAncestor(div, div3));
+        assertEquals(div, DomUtil.getNearestCommonAncestor(
+                DomUtil.querySelectorAll(mRoot, "[id=\"1\"],[id=\"3\"]")));
     }
 
     public void testNodeDepth() {