The display style of WebText root element should never be inline

Some webpages have the following structure:

<p>
  <span>text</span>
</p>
<p>
  <span>text2</span>
</p>

The <p> element should be kept; otherwise these paragraphs would be
concatenated as one after distillation.

BUG=531545, 539851
R=mdjones@chromium.org

Review URL: https://codereview.chromium.org/1518973002 .
diff --git a/java/org/chromium/distiller/DomUtil.java b/java/org/chromium/distiller/DomUtil.java
index 45d78a2..3cca24b 100644
--- a/java/org/chromium/distiller/DomUtil.java
+++ b/java/org/chromium/distiller/DomUtil.java
@@ -222,7 +222,7 @@
     /**
      * Get the nearest common ancestor of nodes.
      */
-    public static Node getNearestCommonAncestor(final List<Element> ns) {
+    public static Node getNearestCommonAncestor(final List<? extends Node> ns) {
         if (ns.size() == 0) return null;
         Node parent = ns.get(0);
         for (int i = 1; i < ns.size(); i++) {
diff --git a/java/org/chromium/distiller/webdocument/WebText.java b/java/org/chromium/distiller/webdocument/WebText.java
index 47e97c0..e41927d 100644
--- a/java/org/chromium/distiller/webdocument/WebText.java
+++ b/java/org/chromium/distiller/webdocument/WebText.java
@@ -4,6 +4,7 @@
 
 package org.chromium.distiller.webdocument;
 
+import com.google.gwt.dom.client.Document;
 import com.google.gwt.dom.client.Element;
 import org.chromium.distiller.DomUtil;
 import org.chromium.distiller.TreeCloneBuilder;
@@ -26,6 +27,7 @@
     // If this text needs to be split to place an image properly its group will signify how they
     // should be joined again (same group numbers get merged).
     private int groupNumber;
+    private static Set<String> sInlineTags = null;
 
     public WebText(String text, List<Node> allTextNodes, int start, int end, int firstWordNode,
             int lastWordNode, int numWords, int numLinkedWords, int tagLevel, int offsetBlock) {
@@ -48,6 +50,45 @@
         this.offsetBlock = offsetBlock;
     }
 
+    private Set<String> getInlineTags() {
+        if (sInlineTags == null) {
+            sInlineTags = new HashSet<String>();
+            // All inline elements except for impossible tags: BR, OBJECT, and SCRIPT.
+            // Please refer to DomConverter.visitElement() for skipped tags.
+            // Reference: https://developer.mozilla.org/en-US/docs/HTML/Inline_elements
+            sInlineTags.add("B");
+            sInlineTags.add("BIG");
+            sInlineTags.add("I");
+            sInlineTags.add("SMALL");
+            sInlineTags.add("TT");
+            sInlineTags.add("ABBR");
+            sInlineTags.add("ACRONYM");
+            sInlineTags.add("CITE");
+            sInlineTags.add("CODE");
+            sInlineTags.add("DFN");
+            sInlineTags.add("EM");
+            sInlineTags.add("KBD");
+            sInlineTags.add("STRONG");
+            sInlineTags.add("SAMP");
+            sInlineTags.add("TIME");
+            sInlineTags.add("VAR");
+            sInlineTags.add("A");
+            sInlineTags.add("BDO");
+            sInlineTags.add("IMG");
+            sInlineTags.add("MAP");
+            sInlineTags.add("Q");
+            sInlineTags.add("SPAN");
+            sInlineTags.add("SUB");
+            sInlineTags.add("SUP");
+            sInlineTags.add("BUTTON");
+            sInlineTags.add("INPUT");
+            sInlineTags.add("LABEL");
+            sInlineTags.add("SELECT");
+            sInlineTags.add("TEXTAREA");
+        }
+        return sInlineTags;
+    }
+
     @Override
     public String generateOutput(boolean textOnly) {
         if (hasLabel(DefaultLabels.TITLE)) return "";
@@ -57,13 +98,37 @@
         Node clonedRoot = TreeCloneBuilder.buildTreeClone(getTextNodes());
 
         // To keep formatting/structure, at least one parent element should be in the output. This
-        // is necessary because many times a WebText is only a single node.
+        // is necessary because many times a WebText is only a single text node.
         if (clonedRoot.getNodeType() != Node.ELEMENT_NODE) {
             Node parentClone = getTextNodes().get(0).getParentElement().cloneNode(false);
             parentClone.appendChild(clonedRoot);
             clonedRoot = parentClone;
         }
 
+        // The BODY element should not be used in the output.
+        if ("BODY".equals(Element.as(clonedRoot).getTagName())) {
+            Element div = Document.get().createDivElement();
+            div.setInnerHTML(Element.as(clonedRoot).getInnerHTML());
+            clonedRoot = div;
+        }
+
+        // Retain parent tags until the root is not an inline element, to make sure the style is
+        // display:block.
+        Node srcRoot = null;
+        while (getInlineTags().contains(Element.as(clonedRoot).getTagName())) {
+            if (srcRoot == null) {
+                srcRoot = DomUtil.getNearestCommonAncestor(getTextNodes());
+                if (srcRoot.getNodeType() != Node.ELEMENT_NODE) {
+                    srcRoot = srcRoot.getParentElement();
+                }
+            }
+            srcRoot = srcRoot.getParentElement();
+            if ("BODY".equals(Element.as(srcRoot).getTagName())) break;
+            Node parentClone = srcRoot.cloneNode(false);
+            parentClone.appendChild(clonedRoot);
+            clonedRoot = parentClone;
+        }
+
         // Make sure links are absolute and IDs are gone.
         DomUtil.makeAllLinksAbsolute(clonedRoot);
         DomUtil.stripTargetAttributes(clonedRoot);
diff --git a/javatests/org/chromium/distiller/ContentExtractorTest.java b/javatests/org/chromium/distiller/ContentExtractorTest.java
index 4b6d424..00995a3 100644
--- a/javatests/org/chromium/distiller/ContentExtractorTest.java
+++ b/javatests/org/chromium/distiller/ContentExtractorTest.java
@@ -259,6 +259,22 @@
                 TestUtil.removeAllDirAttributes(extractedContent));
     }
 
+    public void testPreserveOrderedListWithSpan() {
+        String html =
+            "<OL>" +
+                "<LI><span>" + CONTENT_TEXT + "</span></LI>" +
+                "<LI>" + CONTENT_TEXT + "</LI>" +
+                "<LI>" + CONTENT_TEXT + "</LI>" +
+                "<LI>" + CONTENT_TEXT + "</LI>" +
+            "</OL>";
+        mBody.setInnerHTML(html);
+
+        ContentExtractor extractor = new ContentExtractor(mRoot);
+        String extractedContent = extractor.extractContent();
+        assertEquals(html,
+                TestUtil.removeAllDirAttributes(extractedContent));
+    }
+
     public void testPreserveNestedOrderedList() {
         Element outerListTag = Document.get().createElement("OL");
         Element outerListItem = Document.get().createElement("LI");
@@ -546,11 +562,8 @@
     }
 
     private void assertExtractor(String expected, String html) {
-        mBody.setInnerHTML("");
-        Element div = TestUtil.createDiv(0);
-        mBody.appendChild(div);
+        mBody.setInnerHTML(html);
 
-        div.setInnerHTML(html);
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals(expected, TestUtil.removeAllDirAttributes(extractedContent));
@@ -582,4 +595,41 @@
         assertEquals(expected,
                 TestUtil.removeAllDirAttributes(extractedContent));
     }
+
+    public void testBlockyArticle() {
+        final String htmlArticle =
+            "<h1>" + CONTENT_TEXT + "</h1>" +
+            "<span>" + CONTENT_TEXT + "</span>" +
+            "<div><span>" + CONTENT_TEXT + "</span></div>" +
+            "<p><em>" + CONTENT_TEXT + "</em></p>" +
+            "<div><cite><span><span>" + CONTENT_TEXT + "</span></span></cite></div>" +
+            "<div><span>" + CONTENT_TEXT + "</span><span>" + CONTENT_TEXT + "</span></div>" +
+            "<main><span><blockquote><cite>" +
+                "<span><span>" + CONTENT_TEXT + "</span></span><span>" + CONTENT_TEXT + "</span>" +
+            "</cite></blockquote></span></main>";
+
+        final String expected =
+            "<h1>" + CONTENT_TEXT + "</h1>" +
+            "<span>" + CONTENT_TEXT + "</span>" +
+            "<div><span>" + CONTENT_TEXT + "</span></div>" +
+            "<p><em>" + CONTENT_TEXT + "</em></p>" +
+            "<div><cite><span><span>" + CONTENT_TEXT + "</span></span></cite></div>" +
+            "<div><span>" + CONTENT_TEXT + "</span><span>" + CONTENT_TEXT + "</span></div>" +
+            "<BLOCKQUOTE><cite>" +
+                "<span><span>" + CONTENT_TEXT + "</span></span><span>" + CONTENT_TEXT + "</span>" +
+            "</cite></BLOCKQUOTE>";
+
+        assertExtractor(expected, htmlArticle);
+    }
+
+    public void testSpanArticle() {
+        final String htmlArticle =
+            "<span>" + CONTENT_TEXT + "</span>" +
+            "<span>" + CONTENT_TEXT + "</span>" +
+            "<span>" + CONTENT_TEXT + "</span>";
+
+        final String expected = "<div>" + htmlArticle + "</div>";
+
+        assertExtractor(expected, htmlArticle);
+    }
 }
diff --git a/javatests/org/chromium/distiller/DomUtilTest.java b/javatests/org/chromium/distiller/DomUtilTest.java
index 5038442..85e61cf 100644
--- a/javatests/org/chromium/distiller/DomUtilTest.java
+++ b/javatests/org/chromium/distiller/DomUtilTest.java
@@ -111,7 +111,7 @@
         currDiv.appendChild(TestUtil.createDiv(5));
 
         assertEquals(div2, DomUtil.getNearestCommonAncestor(finalDiv1, currDiv.getChild(0)));
-        NodeList<Element> nodeList = DomUtil.querySelectorAll(mRoot, "[id=\"3\"],[id=\"5\"]");
+        NodeList nodeList = DomUtil.querySelectorAll(mRoot, "[id=\"3\"],[id=\"5\"]");
         assertEquals(div2, DomUtil.getNearestCommonAncestor(TestUtil.nodeListToList(nodeList)));
     }
 
@@ -130,7 +130,7 @@
         div2.appendChild(div3);
 
         assertEquals(div, DomUtil.getNearestCommonAncestor(div, div3));
-        NodeList<Element> nodeList = DomUtil.querySelectorAll(mRoot, "[id=\"1\"],[id=\"3\"]");
+        NodeList nodeList = DomUtil.querySelectorAll(mRoot, "[id=\"1\"],[id=\"3\"]");
         assertEquals(div, DomUtil.getNearestCommonAncestor(TestUtil.nodeListToList(nodeList)));
     }
 
diff --git a/javatests/org/chromium/distiller/TestUtil.java b/javatests/org/chromium/distiller/TestUtil.java
index 107eaa9..9b63336 100644
--- a/javatests/org/chromium/distiller/TestUtil.java
+++ b/javatests/org/chromium/distiller/TestUtil.java
@@ -11,6 +11,7 @@
 import com.google.gwt.dom.client.IFrameElement;
 import com.google.gwt.dom.client.ImageElement;
 import com.google.gwt.dom.client.MetaElement;
+import com.google.gwt.dom.client.Node;
 import com.google.gwt.dom.client.Text;
 import com.google.gwt.dom.client.TitleElement;
 import com.google.gwt.dom.client.NodeList;
@@ -162,12 +163,12 @@
         return originalHtml.replaceAll(" dir=\\\"(ltr|rtl|inherit|auto)\\\"","");
     }
 
-    public static List<Element> nodeListToList(NodeList<Element> nodeList) {
-        List<Element> elements = new ArrayList<>();
+    public static List<Node> nodeListToList(NodeList nodeList) {
+        List<Node> nodes = new ArrayList<>();
         for (int i = 0; i < nodeList.getLength(); i++) {
-            elements.add(nodeList.getItem(i));
+            nodes.add(nodeList.getItem(i));
         }
-        return elements;
+        return nodes;
     }
 
     /**