The display style of WebText root element should never be inline
Some webpages have the following structure:
<p>
<span>text</span>
</p>
<p>
<span>text2</span>
</p>
The <p> element should be kept; otherwise these paragraphs would be
concatenated as one after distillation.
BUG=531545, 539851
R=mdjones@chromium.org
Review URL: https://codereview.chromium.org/1518973002 .
diff --git a/java/org/chromium/distiller/DomUtil.java b/java/org/chromium/distiller/DomUtil.java
index 45d78a2..3cca24b 100644
--- a/java/org/chromium/distiller/DomUtil.java
+++ b/java/org/chromium/distiller/DomUtil.java
@@ -222,7 +222,7 @@
/**
* Get the nearest common ancestor of nodes.
*/
- public static Node getNearestCommonAncestor(final List<Element> ns) {
+ public static Node getNearestCommonAncestor(final List<? extends Node> ns) {
if (ns.size() == 0) return null;
Node parent = ns.get(0);
for (int i = 1; i < ns.size(); i++) {
diff --git a/java/org/chromium/distiller/webdocument/WebText.java b/java/org/chromium/distiller/webdocument/WebText.java
index 47e97c0..e41927d 100644
--- a/java/org/chromium/distiller/webdocument/WebText.java
+++ b/java/org/chromium/distiller/webdocument/WebText.java
@@ -4,6 +4,7 @@
package org.chromium.distiller.webdocument;
+import com.google.gwt.dom.client.Document;
import com.google.gwt.dom.client.Element;
import org.chromium.distiller.DomUtil;
import org.chromium.distiller.TreeCloneBuilder;
@@ -26,6 +27,7 @@
// If this text needs to be split to place an image properly its group will signify how they
// should be joined again (same group numbers get merged).
private int groupNumber;
+ private static Set<String> sInlineTags = null;
public WebText(String text, List<Node> allTextNodes, int start, int end, int firstWordNode,
int lastWordNode, int numWords, int numLinkedWords, int tagLevel, int offsetBlock) {
@@ -48,6 +50,45 @@
this.offsetBlock = offsetBlock;
}
+ private Set<String> getInlineTags() {
+ if (sInlineTags == null) {
+ sInlineTags = new HashSet<String>();
+ // All inline elements except for impossible tags: BR, OBJECT, and SCRIPT.
+ // Please refer to DomConverter.visitElement() for skipped tags.
+ // Reference: https://developer.mozilla.org/en-US/docs/HTML/Inline_elements
+ sInlineTags.add("B");
+ sInlineTags.add("BIG");
+ sInlineTags.add("I");
+ sInlineTags.add("SMALL");
+ sInlineTags.add("TT");
+ sInlineTags.add("ABBR");
+ sInlineTags.add("ACRONYM");
+ sInlineTags.add("CITE");
+ sInlineTags.add("CODE");
+ sInlineTags.add("DFN");
+ sInlineTags.add("EM");
+ sInlineTags.add("KBD");
+ sInlineTags.add("STRONG");
+ sInlineTags.add("SAMP");
+ sInlineTags.add("TIME");
+ sInlineTags.add("VAR");
+ sInlineTags.add("A");
+ sInlineTags.add("BDO");
+ sInlineTags.add("IMG");
+ sInlineTags.add("MAP");
+ sInlineTags.add("Q");
+ sInlineTags.add("SPAN");
+ sInlineTags.add("SUB");
+ sInlineTags.add("SUP");
+ sInlineTags.add("BUTTON");
+ sInlineTags.add("INPUT");
+ sInlineTags.add("LABEL");
+ sInlineTags.add("SELECT");
+ sInlineTags.add("TEXTAREA");
+ }
+ return sInlineTags;
+ }
+
@Override
public String generateOutput(boolean textOnly) {
if (hasLabel(DefaultLabels.TITLE)) return "";
@@ -57,13 +98,37 @@
Node clonedRoot = TreeCloneBuilder.buildTreeClone(getTextNodes());
// To keep formatting/structure, at least one parent element should be in the output. This
- // is necessary because many times a WebText is only a single node.
+ // is necessary because many times a WebText is only a single text node.
if (clonedRoot.getNodeType() != Node.ELEMENT_NODE) {
Node parentClone = getTextNodes().get(0).getParentElement().cloneNode(false);
parentClone.appendChild(clonedRoot);
clonedRoot = parentClone;
}
+ // The BODY element should not be used in the output.
+ if ("BODY".equals(Element.as(clonedRoot).getTagName())) {
+ Element div = Document.get().createDivElement();
+ div.setInnerHTML(Element.as(clonedRoot).getInnerHTML());
+ clonedRoot = div;
+ }
+
+ // Retain parent tags until the root is not an inline element, to make sure the style is
+ // display:block.
+ Node srcRoot = null;
+ while (getInlineTags().contains(Element.as(clonedRoot).getTagName())) {
+ if (srcRoot == null) {
+ srcRoot = DomUtil.getNearestCommonAncestor(getTextNodes());
+ if (srcRoot.getNodeType() != Node.ELEMENT_NODE) {
+ srcRoot = srcRoot.getParentElement();
+ }
+ }
+ srcRoot = srcRoot.getParentElement();
+ if ("BODY".equals(Element.as(srcRoot).getTagName())) break;
+ Node parentClone = srcRoot.cloneNode(false);
+ parentClone.appendChild(clonedRoot);
+ clonedRoot = parentClone;
+ }
+
// Make sure links are absolute and IDs are gone.
DomUtil.makeAllLinksAbsolute(clonedRoot);
DomUtil.stripTargetAttributes(clonedRoot);
diff --git a/javatests/org/chromium/distiller/ContentExtractorTest.java b/javatests/org/chromium/distiller/ContentExtractorTest.java
index 4b6d424..00995a3 100644
--- a/javatests/org/chromium/distiller/ContentExtractorTest.java
+++ b/javatests/org/chromium/distiller/ContentExtractorTest.java
@@ -259,6 +259,22 @@
TestUtil.removeAllDirAttributes(extractedContent));
}
+ public void testPreserveOrderedListWithSpan() {
+ String html =
+ "<OL>" +
+ "<LI><span>" + CONTENT_TEXT + "</span></LI>" +
+ "<LI>" + CONTENT_TEXT + "</LI>" +
+ "<LI>" + CONTENT_TEXT + "</LI>" +
+ "<LI>" + CONTENT_TEXT + "</LI>" +
+ "</OL>";
+ mBody.setInnerHTML(html);
+
+ ContentExtractor extractor = new ContentExtractor(mRoot);
+ String extractedContent = extractor.extractContent();
+ assertEquals(html,
+ TestUtil.removeAllDirAttributes(extractedContent));
+ }
+
public void testPreserveNestedOrderedList() {
Element outerListTag = Document.get().createElement("OL");
Element outerListItem = Document.get().createElement("LI");
@@ -546,11 +562,8 @@
}
private void assertExtractor(String expected, String html) {
- mBody.setInnerHTML("");
- Element div = TestUtil.createDiv(0);
- mBody.appendChild(div);
+ mBody.setInnerHTML(html);
- div.setInnerHTML(html);
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals(expected, TestUtil.removeAllDirAttributes(extractedContent));
@@ -582,4 +595,41 @@
assertEquals(expected,
TestUtil.removeAllDirAttributes(extractedContent));
}
+
+ public void testBlockyArticle() {
+ final String htmlArticle =
+ "<h1>" + CONTENT_TEXT + "</h1>" +
+ "<span>" + CONTENT_TEXT + "</span>" +
+ "<div><span>" + CONTENT_TEXT + "</span></div>" +
+ "<p><em>" + CONTENT_TEXT + "</em></p>" +
+ "<div><cite><span><span>" + CONTENT_TEXT + "</span></span></cite></div>" +
+ "<div><span>" + CONTENT_TEXT + "</span><span>" + CONTENT_TEXT + "</span></div>" +
+ "<main><span><blockquote><cite>" +
+ "<span><span>" + CONTENT_TEXT + "</span></span><span>" + CONTENT_TEXT + "</span>" +
+ "</cite></blockquote></span></main>";
+
+ final String expected =
+ "<h1>" + CONTENT_TEXT + "</h1>" +
+ "<span>" + CONTENT_TEXT + "</span>" +
+ "<div><span>" + CONTENT_TEXT + "</span></div>" +
+ "<p><em>" + CONTENT_TEXT + "</em></p>" +
+ "<div><cite><span><span>" + CONTENT_TEXT + "</span></span></cite></div>" +
+ "<div><span>" + CONTENT_TEXT + "</span><span>" + CONTENT_TEXT + "</span></div>" +
+ "<BLOCKQUOTE><cite>" +
+ "<span><span>" + CONTENT_TEXT + "</span></span><span>" + CONTENT_TEXT + "</span>" +
+ "</cite></BLOCKQUOTE>";
+
+ assertExtractor(expected, htmlArticle);
+ }
+
+ public void testSpanArticle() {
+ final String htmlArticle =
+ "<span>" + CONTENT_TEXT + "</span>" +
+ "<span>" + CONTENT_TEXT + "</span>" +
+ "<span>" + CONTENT_TEXT + "</span>";
+
+ final String expected = "<div>" + htmlArticle + "</div>";
+
+ assertExtractor(expected, htmlArticle);
+ }
}
diff --git a/javatests/org/chromium/distiller/DomUtilTest.java b/javatests/org/chromium/distiller/DomUtilTest.java
index 5038442..85e61cf 100644
--- a/javatests/org/chromium/distiller/DomUtilTest.java
+++ b/javatests/org/chromium/distiller/DomUtilTest.java
@@ -111,7 +111,7 @@
currDiv.appendChild(TestUtil.createDiv(5));
assertEquals(div2, DomUtil.getNearestCommonAncestor(finalDiv1, currDiv.getChild(0)));
- NodeList<Element> nodeList = DomUtil.querySelectorAll(mRoot, "[id=\"3\"],[id=\"5\"]");
+ NodeList nodeList = DomUtil.querySelectorAll(mRoot, "[id=\"3\"],[id=\"5\"]");
assertEquals(div2, DomUtil.getNearestCommonAncestor(TestUtil.nodeListToList(nodeList)));
}
@@ -130,7 +130,7 @@
div2.appendChild(div3);
assertEquals(div, DomUtil.getNearestCommonAncestor(div, div3));
- NodeList<Element> nodeList = DomUtil.querySelectorAll(mRoot, "[id=\"1\"],[id=\"3\"]");
+ NodeList nodeList = DomUtil.querySelectorAll(mRoot, "[id=\"1\"],[id=\"3\"]");
assertEquals(div, DomUtil.getNearestCommonAncestor(TestUtil.nodeListToList(nodeList)));
}
diff --git a/javatests/org/chromium/distiller/TestUtil.java b/javatests/org/chromium/distiller/TestUtil.java
index 107eaa9..9b63336 100644
--- a/javatests/org/chromium/distiller/TestUtil.java
+++ b/javatests/org/chromium/distiller/TestUtil.java
@@ -11,6 +11,7 @@
import com.google.gwt.dom.client.IFrameElement;
import com.google.gwt.dom.client.ImageElement;
import com.google.gwt.dom.client.MetaElement;
+import com.google.gwt.dom.client.Node;
import com.google.gwt.dom.client.Text;
import com.google.gwt.dom.client.TitleElement;
import com.google.gwt.dom.client.NodeList;
@@ -162,12 +163,12 @@
return originalHtml.replaceAll(" dir=\\\"(ltr|rtl|inherit|auto)\\\"","");
}
- public static List<Element> nodeListToList(NodeList<Element> nodeList) {
- List<Element> elements = new ArrayList<>();
+ public static List<Node> nodeListToList(NodeList nodeList) {
+ List<Node> nodes = new ArrayList<>();
for (int i = 0; i < nodeList.getLength(); i++) {
- elements.add(nodeList.getItem(i));
+ nodes.add(nodeList.getItem(i));
}
- return elements;
+ return nodes;
}
/**