Fix for keeping list structure

Created the WebTag class to represent the tags we want to preserve
through the distillation process. A WebTag instance acts as a
placeholder which is injected into the WebDocument when walking the DOM
and is used to track the UL, OL and LI element positions.

This mechanism could be used in the future for all the tags that we want
to preserve.

When generateOutput() is called on a WebTag, it returns the HTML tag
that the instance represents. A new filter, NestedElementRetainer, was
created to process these WebTags and mark each one as content when
anything nested inside it is content. This filter runs last since it
relies on content found by all the other filters.

BUG=502524
R=mdjones@chromium.org, wychen@chromium.org

Review URL: https://codereview.chromium.org/1230583006 .

Patch from marcelorcorrea <marcelorcorrea@gmail.com>.
diff --git a/java/org/chromium/distiller/ContentExtractor.java b/java/org/chromium/distiller/ContentExtractor.java
index b9f0b2d..4a8f8bd 100644
--- a/java/org/chromium/distiller/ContentExtractor.java
+++ b/java/org/chromium/distiller/ContentExtractor.java
@@ -16,6 +16,7 @@
 import org.chromium.distiller.webdocument.WebImage;
 import org.chromium.distiller.webdocument.filters.RelevantElements;
 import org.chromium.distiller.webdocument.filters.LeadImageFinder;
+import org.chromium.distiller.webdocument.filters.NestedElementRetainer;
 
 import com.google.gwt.dom.client.Document;
 import com.google.gwt.dom.client.Element;
@@ -93,6 +94,7 @@
         processDocument(documentInfo.document);
         RelevantElements.process(documentInfo.document);
         LeadImageFinder.process(documentInfo.document);
+        NestedElementRetainer.process(documentInfo.document);
 
         List<WebImage> images = documentInfo.document.getContentImages();
         for (WebImage wi : images) {
diff --git a/java/org/chromium/distiller/webdocument/DomConverter.java b/java/org/chromium/distiller/webdocument/DomConverter.java
index 96c9c4b..31f17a3 100644
--- a/java/org/chromium/distiller/webdocument/DomConverter.java
+++ b/java/org/chromium/distiller/webdocument/DomConverter.java
@@ -99,6 +99,11 @@
             }
         }
 
+        // Create a placeholder for the elements we want to preserve.
+        if (WebTag.canBeNested(e.getTagName())) {
+            builder.tag(new WebTag(e.getTagName(), WebTag.TagType.START));
+        }
+
         switch (e.getTagName()) {
             case "BR":
                 builder.lineBreak(e);
@@ -142,6 +147,12 @@
 
     @Override
     public void exit(Node n) {
+        if (n.getNodeType() == Node.ELEMENT_NODE) {
+            Element e = Element.as(n);
+            if (WebTag.canBeNested(e.getTagName())) {
+                builder.tag(new WebTag(e.getTagName(), WebTag.TagType.END));
+            }
+        }
         builder.endElement();
     }
 
diff --git a/java/org/chromium/distiller/webdocument/WebDocument.java b/java/org/chromium/distiller/webdocument/WebDocument.java
index e8c1880..021851b 100644
--- a/java/org/chromium/distiller/webdocument/WebDocument.java
+++ b/java/org/chromium/distiller/webdocument/WebDocument.java
@@ -29,6 +29,10 @@
         elements.add(table);
     }
 
+    public void addTag(WebTag tag) {
+        elements.add(tag);
+    }
+
     public void addEmbed(WebElement embed) {
         elements.add(embed);
     }
diff --git a/java/org/chromium/distiller/webdocument/WebDocumentBuilder.java b/java/org/chromium/distiller/webdocument/WebDocumentBuilder.java
index 36d384f..d403086 100644
--- a/java/org/chromium/distiller/webdocument/WebDocumentBuilder.java
+++ b/java/org/chromium/distiller/webdocument/WebDocumentBuilder.java
@@ -115,6 +115,12 @@
     }
 
     @Override
+    public void tag(WebTag tag) {
+        flushBlock(groupNumber);
+        document.addTag(tag);
+    }
+
+    @Override
     public void embed(WebElement embedNode) {
         flushBlock(groupNumber);
         document.addEmbed(embedNode);
diff --git a/java/org/chromium/distiller/webdocument/WebDocumentBuilderInterface.java b/java/org/chromium/distiller/webdocument/WebDocumentBuilderInterface.java
index 340d9cb..91b9845 100644
--- a/java/org/chromium/distiller/webdocument/WebDocumentBuilderInterface.java
+++ b/java/org/chromium/distiller/webdocument/WebDocumentBuilderInterface.java
@@ -15,5 +15,6 @@
     void textNode(Text textNode);
     void lineBreak(Node node);
     void dataTable(Element e);
+    void tag(WebTag tag);
     void embed(WebElement embedNode);
 }
diff --git a/java/org/chromium/distiller/webdocument/WebTag.java b/java/org/chromium/distiller/webdocument/WebTag.java
new file mode 100644
index 0000000..ecf6e70
--- /dev/null
+++ b/java/org/chromium/distiller/webdocument/WebTag.java
@@ -0,0 +1,50 @@
+package org.chromium.distiller.webdocument;
+
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * This class represents HTML tags that need to be preserved over
+ * the distillation process.
+ */
+public class WebTag extends WebElement {
+    private String tagName;
+    private TagType tagType;
+
+    public enum TagType {
+        START, END
+    }
+
+    private static Set<String> nestingTags;
+    static {
+        nestingTags = new HashSet<String>();
+        nestingTags.add("UL");
+        nestingTags.add("OL");
+        nestingTags.add("LI");
+    }
+
+    public WebTag(String tagName, TagType tagType) {
+        this.tagName = tagName;
+        this.tagType = tagType;
+    }
+
+    public boolean isStartTag() {
+        return tagType == TagType.START;
+    }
+
+    public String getTagName() {
+        return tagName;
+    }
+
+    @Override
+    public String generateOutput(boolean textOnly) {
+        if (textOnly) {
+            return "";
+        }
+        return "<" + (isStartTag() ? "" : "/") + tagName + ">";
+    }
+
+    public static boolean canBeNested(String tagName) {
+        return nestingTags.contains(tagName);
+    }
+}
diff --git a/java/org/chromium/distiller/webdocument/WebText.java b/java/org/chromium/distiller/webdocument/WebText.java
index ae81ee6..203f051 100644
--- a/java/org/chromium/distiller/webdocument/WebText.java
+++ b/java/org/chromium/distiller/webdocument/WebText.java
@@ -69,10 +69,16 @@
         DomUtil.stripIds(clonedRoot);
         DomUtil.stripFontColorAttributes(clonedRoot);
 
+        // Since there are tag elements that are being wrapped
+        // by a pair of {@link WebTag}s, we only need to
+        // get the innerHTML, otherwise these tags would be duplicated.
+        Element elementClonedRoot = Element.as(clonedRoot);
         if (textOnly) {
-            return Element.as(clonedRoot).getInnerText();
+            return elementClonedRoot.getInnerText();
+        } else if (WebTag.canBeNested(elementClonedRoot.getTagName())) {
+            return elementClonedRoot.getInnerHTML();
         }
-        return Element.as(clonedRoot).getString();
+        return elementClonedRoot.getString();
     }
 
     public List<Node> getTextNodes() {
diff --git a/java/org/chromium/distiller/webdocument/filters/NestedElementRetainer.java b/java/org/chromium/distiller/webdocument/filters/NestedElementRetainer.java
new file mode 100644
index 0000000..47b590f
--- /dev/null
+++ b/java/org/chromium/distiller/webdocument/filters/NestedElementRetainer.java
@@ -0,0 +1,49 @@
+package org.chromium.distiller.webdocument.filters;
+
+import org.chromium.distiller.webdocument.WebDocument;
+import org.chromium.distiller.webdocument.WebElement;
+import org.chromium.distiller.webdocument.WebTag;
+
+import java.util.Stack;
+
+/**
+ * This class is used to identify what WebTag should be
+ * marked as <i>isContent</i> based on its {@link WebElement}s inside.
+ * A {@link WebTag} is content when:
+ * <ul>
+ *    <li>Has any {@link WebElement} which is content.</li>
+ *    <li>Has at least one nested {@link WebTag} which is content.</li>
+ * </ul>
+ */
+public class NestedElementRetainer {
+    public static void process(WebDocument document) {
+        boolean isContent = false;
+        int stackMark = -1;
+        Stack<WebTag> stack = new Stack<>();
+
+        for (WebElement e : document.getElements()) {
+            if (!(e instanceof WebTag)) {
+                if (!isContent) {
+                    isContent = e.getIsContent();
+                }
+            } else {
+                WebTag webTag = (WebTag) e;
+                if (webTag.isStartTag()) {
+                    webTag.setIsContent(isContent);
+                    stack.push(webTag);
+                    isContent = false;
+                } else {
+                    WebTag startWebTag = stack.pop();
+                    isContent |= stackMark >= stack.size();
+                    if (isContent) {
+                        stackMark = stack.size() - 1;
+                    }
+                    boolean wasContent = startWebTag.getIsContent();
+                    startWebTag.setIsContent(isContent);
+                    webTag.setIsContent(isContent);
+                    isContent = wasContent;
+                }
+            }
+        }
+    }
+}
diff --git a/javatests/org/chromium/distiller/ContentExtractorTest.java b/javatests/org/chromium/distiller/ContentExtractorTest.java
index 63d349c..87f502f 100644
--- a/javatests/org/chromium/distiller/ContentExtractorTest.java
+++ b/javatests/org/chromium/distiller/ContentExtractorTest.java
@@ -143,6 +143,267 @@
                 TestUtil.removeAllDirAttributes(extractedContent));
     }
 
+    public void testPreserveOrderedList() {
+        Element outerListTag = Document.get().createElement("OL");
+        mBody.appendChild(outerListTag);
+
+        outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+
+        ContentExtractor extractor = new ContentExtractor(mRoot);
+        String extractedContent = extractor.extractContent();
+        assertEquals("<OL>" +
+                        "<LI>" + CONTENT_TEXT + "</LI>" +
+                        "<LI>" + CONTENT_TEXT + "</LI>" +
+                        "<LI>" + CONTENT_TEXT + "</LI>" +
+                        "<LI>" + CONTENT_TEXT + "</LI>" +
+                     "</OL>",
+                TestUtil.removeAllDirAttributes(extractedContent));
+    }
+
+    public void testPreserveNestedOrderedList() {
+        Element outerListTag = Document.get().createElement("OL");
+        Element outerListItem = Document.get().createElement("LI");
+
+        Element innerListTag = Document.get().createElement("OL");
+        innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+
+        outerListItem.appendChild(innerListTag);
+        outerListTag.appendChild(outerListItem);
+        outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+
+        mBody.appendChild(outerListTag);
+        ContentExtractor extractor = new ContentExtractor(mRoot);
+        String extractedContent = extractor.extractContent();
+        assertEquals("<OL>" +
+                        "<LI>" +
+                          "<OL>" +
+                            "<LI>" + CONTENT_TEXT + "</LI>" +
+                            "<LI>" + CONTENT_TEXT + "</LI>" +
+                            "<LI>" + CONTENT_TEXT + "</LI>" +
+                            "<LI>" + CONTENT_TEXT + "</LI>" +
+                          "</OL>" +
+                        "</LI>" +
+                        "<LI>" + CONTENT_TEXT + "</LI>" +
+                     "</OL>",
+                TestUtil.removeAllDirAttributes(extractedContent));
+    }
+
+    public void testPreserveNestedOrderedListWithOtherElementsInside() {
+        Element outerListTag = Document.get().createElement("OL");
+        Element outerListItem = Document.get().createElement("LI");
+        outerListItem.appendChild(TestUtil.createText(CONTENT_TEXT));
+        outerListItem.appendChild(TestUtil.createParagraph(CONTENT_TEXT));
+
+        Element innerListTag = Document.get().createElement("OL");
+        innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        innerListTag.appendChild(TestUtil.createParagraph(""));
+
+        outerListItem.appendChild(innerListTag);
+        outerListTag.appendChild(outerListItem);
+        outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        outerListTag.appendChild(TestUtil.createParagraph(CONTENT_TEXT));
+
+        mBody.appendChild(outerListTag);
+        ContentExtractor extractor = new ContentExtractor(mRoot);
+        String extractedContent = extractor.extractContent();
+        assertEquals("<OL>" +
+                        "<LI>" + CONTENT_TEXT +
+                          "<p>" + CONTENT_TEXT + "</p>" +
+                          "<OL>" +
+                            "<LI>" + CONTENT_TEXT + "</LI>" +
+                            "<LI>" + CONTENT_TEXT + "</LI>" +
+                            "<LI>" + CONTENT_TEXT + "</LI>" +
+                            "<LI>" + CONTENT_TEXT + "</LI>" +
+                          "</OL>" +
+                        "</LI>" +
+                        "<LI>" + CONTENT_TEXT + "</LI>" +
+                        "<p>" + CONTENT_TEXT + "</p>" +
+                     "</OL>",
+                TestUtil.removeAllDirAttributes(extractedContent));
+    }
+
+    public void testPreserveUnorderedList() {
+        Element outerListTag = Document.get().createElement("UL");
+        mBody.appendChild(outerListTag);
+
+        outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+
+        ContentExtractor extractor = new ContentExtractor(mRoot);
+        String extractedContent = extractor.extractContent();
+        assertEquals("<UL>" +
+                        "<LI>" + CONTENT_TEXT + "</LI>" +
+                        "<LI>" + CONTENT_TEXT + "</LI>" +
+                        "<LI>" + CONTENT_TEXT + "</LI>" +
+                        "<LI>" + CONTENT_TEXT + "</LI>" +
+                     "</UL>",
+                TestUtil.removeAllDirAttributes(extractedContent));
+    }
+
+    public void testPreserveNestedUnorderedList() {
+        Element outerListTag = Document.get().createElement("UL");
+        Element outerListItem = Document.get().createElement("LI");
+
+        Element innerListTag = Document.get().createElement("UL");
+        innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+
+        outerListItem.appendChild(innerListTag);
+        outerListTag.appendChild(outerListItem);
+        outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+
+        mBody.appendChild(outerListTag);
+        ContentExtractor extractor = new ContentExtractor(mRoot);
+        String extractedContent = extractor.extractContent();
+        assertEquals("<UL>" +
+                        "<LI>" +
+                          "<UL>" +
+                            "<LI>" + CONTENT_TEXT + "</LI>" +
+                            "<LI>" + CONTENT_TEXT + "</LI>" +
+                            "<LI>" + CONTENT_TEXT + "</LI>" +
+                            "<LI>" + CONTENT_TEXT + "</LI>" +
+                          "</UL>" +
+                        "</LI>" +
+                        "<LI>" + CONTENT_TEXT + "</LI>" +
+                     "</UL>",
+                TestUtil.removeAllDirAttributes(extractedContent));
+    }
+
+    public void testPreserveNestedUnorderedListWithOtherElementsInside() {
+        Element outerListTag = Document.get().createElement("UL");
+        Element outerListItem = Document.get().createElement("LI");
+        outerListItem.appendChild(TestUtil.createText(CONTENT_TEXT));
+        outerListItem.appendChild(TestUtil.createParagraph(CONTENT_TEXT));
+
+        Element innerListTag = Document.get().createElement("UL");
+        innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        innerListTag.appendChild(TestUtil.createParagraph(""));
+
+        outerListItem.appendChild(innerListTag);
+        outerListTag.appendChild(outerListItem);
+        outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        outerListTag.appendChild(TestUtil.createParagraph(CONTENT_TEXT));
+
+        mBody.appendChild(outerListTag);
+        ContentExtractor extractor = new ContentExtractor(mRoot);
+        String extractedContent = extractor.extractContent();
+        assertEquals("<UL>" +
+                        "<LI>" + CONTENT_TEXT +
+                          "<p>" + CONTENT_TEXT + "</p>" +
+                          "<UL>" +
+                            "<LI>" + CONTENT_TEXT + "</LI>" +
+                            "<LI>" + CONTENT_TEXT + "</LI>" +
+                            "<LI>" + CONTENT_TEXT + "</LI>" +
+                            "<LI>" + CONTENT_TEXT + "</LI>" +
+                          "</UL>" +
+                        "</LI>" +
+                        "<LI>" + CONTENT_TEXT + "</LI>" +
+                        "<p>" + CONTENT_TEXT + "</p>" +
+                     "</UL>",
+                TestUtil.removeAllDirAttributes(extractedContent));
+    }
+
+    public void testPreserveUnorderedListWithNestedOrderedList() {
+        Element unorderedListTag = Document.get().createElement("UL");
+        Element li = Document.get().createElement("LI");
+        Element orderedList = Document.get().createElement("OL");
+        orderedList.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        orderedList.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        li.appendChild(orderedList);
+        unorderedListTag.appendChild(li);
+        unorderedListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
+        mBody.appendChild(unorderedListTag);
+        ContentExtractor extractor = new ContentExtractor(mRoot);
+        String extractedContent = extractor.extractContent();
+        assertEquals("<UL>" +
+                        "<LI>" +
+                          "<OL>" +
+                            "<LI>" + CONTENT_TEXT + "</LI>" +
+                            "<LI>" + CONTENT_TEXT + "</LI>" +
+                          "</OL>" +
+                        "</LI>" +
+                        "<LI>" + CONTENT_TEXT + "</LI>" +
+                     "</UL>",
+                TestUtil.removeAllDirAttributes(extractedContent));
+    }
+
+    public void testMalformedListStructureWithExtraLITagEnd() {
+        Element unorderedListTag = Document.get().createElement("UL");
+        String html = "<LI>" +  CONTENT_TEXT + "</LI></LI><LI>" + CONTENT_TEXT + "</LI>";
+        unorderedListTag.setInnerHTML(html);
+        mBody.appendChild(unorderedListTag);
+        ContentExtractor extractor = new ContentExtractor(mRoot);
+        String extractedContent = extractor.extractContent();
+        assertEquals("<UL>" +
+                        "<LI>" + CONTENT_TEXT + "</LI>" +
+                        "<LI>" + CONTENT_TEXT + "</LI>" +
+                     "</UL>",
+                TestUtil.removeAllDirAttributes(extractedContent));
+    }
+
+    public void testMalformedListStructureWithExtraLITagStart() {
+        Element unorderedListTag = Document.get().createElement("OL");
+        String html = "<LI><LI>" + CONTENT_TEXT + "</LI><LI>" + CONTENT_TEXT + "</LI>";
+        unorderedListTag.setInnerHTML(html);
+        mBody.appendChild(unorderedListTag);
+        ContentExtractor extractor = new ContentExtractor(mRoot);
+        String extractedContent = extractor.extractContent();
+        assertEquals("<OL>" +
+                        "<LI>" + CONTENT_TEXT + "</LI>" +
+                        "<LI>" + CONTENT_TEXT + "</LI>" +
+                     "</OL>",
+                TestUtil.removeAllDirAttributes(extractedContent));
+    }
+
+    public void testMalformedListStructureWithExtraOLTagStart() {
+        Element unorderedListTag = Document.get().createElement("OL");
+        String html = "<OL><LI>" + CONTENT_TEXT + "</LI><LI>" + CONTENT_TEXT + "</LI>";
+        unorderedListTag.setInnerHTML(html);
+        mBody.appendChild(unorderedListTag);
+        ContentExtractor extractor = new ContentExtractor(mRoot);
+        String extractedContent = extractor.extractContent();
+        assertEquals("<OL>" +
+                        "<OL>" +
+                          "<LI>" + CONTENT_TEXT + "</LI>" +
+                          "<LI>" + CONTENT_TEXT + "</LI>" +
+                        "</OL>" +
+                     "</OL>",
+                TestUtil.removeAllDirAttributes(extractedContent));
+    }
+
+    public void testMalformedListStructureWithoutLITag(){
+        Element orderedListTag = Document.get().createElement("OL");
+        String html = "<LI>" + CONTENT_TEXT + "</LI>" +
+                       CONTENT_TEXT +
+                      "<LI>" + CONTENT_TEXT + "</LI>";
+        orderedListTag.setInnerHTML(html);
+        mBody.appendChild(orderedListTag);
+        ContentExtractor extractor = new ContentExtractor(mRoot);
+        String extractedContent = extractor.extractContent();
+        assertEquals("<OL>" +
+                        "<LI>" + CONTENT_TEXT + "</LI>" +
+                         CONTENT_TEXT +
+                        "<LI>" + CONTENT_TEXT + "</LI>" +
+                     "</OL>" ,
+                TestUtil.removeAllDirAttributes(extractedContent));
+    }
+
     private void assertExtractor(String expected, String html) {
         mBody.setInnerHTML("");
         Element div = TestUtil.createDiv(0);
diff --git a/javatests/org/chromium/distiller/TestUtil.java b/javatests/org/chromium/distiller/TestUtil.java
index 8092aff..6db5ad1 100644
--- a/javatests/org/chromium/distiller/TestUtil.java
+++ b/javatests/org/chromium/distiller/TestUtil.java
@@ -120,6 +120,18 @@
         return s;
     }
 
+    public static Element createParagraph(String value) {
+        Element s = Document.get().createElement("P");
+        s.setInnerHTML(value);
+        return s;
+    }
+
+    public static Element createListItem(String value) {
+        Element s = Document.get().createElement("LI");
+        s.setInnerText(value);
+        return s;
+    }
+
     private static void createDivTreeImpl(Element e, int depth, List<Element> divs) {
         if (depth > 2) return;
         for (int i = 0; i < 2; i++) {
diff --git a/javatests/org/chromium/distiller/webdocument/DomConverterTest.java b/javatests/org/chromium/distiller/webdocument/DomConverterTest.java
index 4f88602..6d813cc 100644
--- a/javatests/org/chromium/distiller/webdocument/DomConverterTest.java
+++ b/javatests/org/chromium/distiller/webdocument/DomConverterTest.java
@@ -152,4 +152,39 @@
         String html = "text<br>split<br/>with<br/>lines";
         runTest(html, "text\nsplit\nwith\nlines");
     }
+
+    public void testList() throws Throwable {
+        Element container = Document.get().createDivElement();
+        container.setInnerHTML("<ol><li>some text1</li><li>some text2</li></ol>");
+
+        WebDocumentBuilder builder = new WebDocumentBuilder();
+        DomConverter converter = new DomConverter(builder);
+        new DomWalker(converter).walk(container);
+
+        WebDocument doc = builder.toWebDocument();
+        List<WebElement> elements = doc.getElements();
+
+        assertEquals(8, elements.size());
+        assertTrue(elements.get(0) instanceof WebTag);
+        assertTrue(((WebTag) elements.get(0)).isStartTag());
+
+        assertTrue(elements.get(1) instanceof WebTag);
+        assertTrue(((WebTag) elements.get(1)).isStartTag());
+
+        assertTrue(elements.get(2) instanceof WebText);
+
+        assertTrue(elements.get(3) instanceof WebTag);
+        assertFalse(((WebTag) elements.get(3)).isStartTag());
+
+        assertTrue(elements.get(4) instanceof WebTag);
+        assertTrue(((WebTag) elements.get(4)).isStartTag());
+
+        assertTrue(elements.get(5) instanceof WebText);
+
+        assertTrue(elements.get(6) instanceof WebTag);
+        assertFalse(((WebTag) elements.get(6)).isStartTag());
+
+        assertTrue(elements.get(7) instanceof WebTag);
+        assertFalse(((WebTag) elements.get(7)).isStartTag());
+    }
 }
diff --git a/javatests/org/chromium/distiller/webdocument/FakeWebDocumentBuilder.java b/javatests/org/chromium/distiller/webdocument/FakeWebDocumentBuilder.java
index 8ef739d..9d4690c 100644
--- a/javatests/org/chromium/distiller/webdocument/FakeWebDocumentBuilder.java
+++ b/javatests/org/chromium/distiller/webdocument/FakeWebDocumentBuilder.java
@@ -73,4 +73,7 @@
 
     @Override
     public void embed(WebElement embed) {}
+
+    @Override
+    public void tag(WebTag tag) {}
 }
diff --git a/javatests/org/chromium/distiller/webdocument/TestWebDocumentBuilder.java b/javatests/org/chromium/distiller/webdocument/TestWebDocumentBuilder.java
index 1258953..93de213 100644
--- a/javatests/org/chromium/distiller/webdocument/TestWebDocumentBuilder.java
+++ b/javatests/org/chromium/distiller/webdocument/TestWebDocumentBuilder.java
@@ -56,6 +56,18 @@
         return wi;
     }
 
+    public WebTag addTagStart() {
+        WebTag webTag = new WebTag("OL", WebTag.TagType.START);
+        document.addTag(webTag);
+        return webTag;
+    }
+
+    public WebTag addTagEnd() {
+        WebTag webTag = new WebTag("OL", WebTag.TagType.END);
+        document.addTag(webTag);
+        return webTag;
+    }
+
     public WebDocument build() {
         return document;
     }
diff --git a/javatests/org/chromium/distiller/webdocument/WebTagTest.java b/javatests/org/chromium/distiller/webdocument/WebTagTest.java
new file mode 100644
index 0000000..9f1a959
--- /dev/null
+++ b/javatests/org/chromium/distiller/webdocument/WebTagTest.java
@@ -0,0 +1,40 @@
+package org.chromium.distiller.webdocument;
+
+import org.chromium.distiller.DomDistillerJsTestCase;
+
+public class WebTagTest extends DomDistillerJsTestCase {
+
+    public void testOLGenerateOutput() {
+        WebTag olStartWebTag = new WebTag("ol", WebTag.TagType.START);
+        WebTag olEndWebTag = new WebTag("ol", WebTag.TagType.END);
+        String startResult = olStartWebTag.generateOutput(false);
+        String endResult = olEndWebTag.generateOutput(false);
+        assertEquals(startResult, "<ol>");
+        assertEquals(endResult, "</ol>");
+    }
+
+    public void testULGenerateOutput() {
+        WebTag ulStartWebTag = new WebTag("ul", WebTag.TagType.START);
+        WebTag u = new WebTag("ul", WebTag.TagType.END);
+        String startResult = ulStartWebTag.generateOutput(false);
+        String endResult = u.generateOutput(false);
+        assertEquals(startResult, "<ul>");
+        assertEquals(endResult, "</ul>");
+    }
+
+    public void testLIGenerateOutput() {
+        WebTag liStartWebTag = new WebTag("li", WebTag.TagType.START);
+        WebTag liEndWebTag = new WebTag("li", WebTag.TagType.END);
+        String startResult = liStartWebTag.generateOutput(false);
+        String endResult = liEndWebTag.generateOutput(false);
+        assertEquals(startResult, "<li>");
+        assertEquals(endResult, "</li>");
+    }
+
+    public void testShouldGetInnerHTML() {
+        assertTrue(WebTag.canBeNested("LI"));
+        assertTrue(WebTag.canBeNested("UL"));
+        assertTrue(WebTag.canBeNested("OL"));
+        assertFalse(WebTag.canBeNested("SPAN"));
+    }
+}
diff --git a/javatests/org/chromium/distiller/webdocument/WebTextTest.java b/javatests/org/chromium/distiller/webdocument/WebTextTest.java
index e506491..d0b7f6c 100644
--- a/javatests/org/chromium/distiller/webdocument/WebTextTest.java
+++ b/javatests/org/chromium/distiller/webdocument/WebTextTest.java
@@ -91,4 +91,19 @@
         String want = "<p>Words<br>split<br>with<br>lines</p>";
         assertEquals(want, TestUtil.removeAllDirAttributes(got));
     }
+
+    public void testGenerateOutputLIElements() {
+        Element container = Document.get().createLIElement();
+        mBody.appendChild(container);
+
+        container.appendChild(TestUtil.createText("Some text content 1."));
+
+        WebTextBuilder builder = new WebTextBuilder();
+        builder.textNode(Text.as(container.getChild(0)), 0);
+
+        WebText text = builder.build(0);
+        String got = text.generateOutput(false);
+        String want = "Some text content 1.";
+        assertEquals(want, TestUtil.removeAllDirAttributes(got));
+    }
 }
diff --git a/javatests/org/chromium/distiller/webdocument/filters/NestedElementRetainerTest.java b/javatests/org/chromium/distiller/webdocument/filters/NestedElementRetainerTest.java
new file mode 100644
index 0000000..5627555
--- /dev/null
+++ b/javatests/org/chromium/distiller/webdocument/filters/NestedElementRetainerTest.java
@@ -0,0 +1,185 @@
+package org.chromium.distiller.webdocument.filters;
+
+import org.chromium.distiller.DomDistillerJsTestCase;
+import org.chromium.distiller.webdocument.TestWebDocumentBuilder;
+import org.chromium.distiller.webdocument.WebDocument;
+import org.chromium.distiller.webdocument.WebTag;
+
+public class NestedElementRetainerTest extends DomDistillerJsTestCase {
+    public void testOrderedListStructure() {
+        TestWebDocumentBuilder builder = new TestWebDocumentBuilder(); // sketch tags follow locals
+        WebTag olStart = builder.addTagStart();         // <ol>
+        WebTag liStart = builder.addTagStart();         //   <li>
+        builder.addText("text 1").setIsContent(false);  //     text 1 (not content)
+        WebTag liEnd = builder.addTagEnd();             //   </li>
+        WebTag liStart2 = builder.addTagStart();        //   <li>
+        builder.addText("text 2").setIsContent(false);  //     text 2 (not content)
+        WebTag liEnd2 = builder.addTagEnd();            //   </li>
+        WebTag liStart3 = builder.addTagStart();        //   <li>
+        builder.addText("text 3").setIsContent(true);   //     text 3 (content)
+        WebTag liEnd3 = builder.addTagEnd();            //   </li>
+        WebTag olEnd = builder.addTagEnd();             // </ol>
+        WebDocument document = builder.build();
+        NestedElementRetainer.process(document); // expect: only <ol> + 3rd <li> are content
+        assertTrue(olStart.getIsContent());
+        assertFalse(liStart.getIsContent());
+        assertFalse(liEnd.getIsContent());
+        assertFalse(liStart2.getIsContent());
+        assertFalse(liEnd2.getIsContent());
+        assertTrue(liStart3.getIsContent());
+        assertTrue(liEnd3.getIsContent());
+        assertTrue(olEnd.getIsContent());
+    }
+
+    public void testUnorderedListStructure() {
+        TestWebDocumentBuilder builder = new TestWebDocumentBuilder(); // sketch tags follow locals
+        WebTag ulStart = builder.addTagStart();         // <ul>
+        WebTag liStart = builder.addTagStart();         //   <li>
+        builder.addText("text 1").setIsContent(true);   //     text 1 (content)
+        WebTag ulStart2 = builder.addTagStart();        //     <ul>
+        WebTag liStart2 = builder.addTagStart();        //       <li>
+        builder.addText("text 2").setIsContent(false);  //         text 2 (not content)
+        WebTag liEnd2 = builder.addTagEnd();            //       </li>
+        WebTag ulEnd2 = builder.addTagEnd();            //     </ul>
+        WebTag liEnd = builder.addTagEnd();             //   </li>
+        WebTag ulEnd = builder.addTagEnd();             // </ul>
+        WebDocument document = builder.build();
+
+        NestedElementRetainer.process(document); // expect: only outer <ul>/<li> are content
+        assertTrue(ulStart.getIsContent());
+        assertTrue(liStart.getIsContent());
+        assertFalse(ulStart2.getIsContent());
+        assertFalse(liStart2.getIsContent());
+        assertFalse(liEnd2.getIsContent());
+        assertFalse(ulEnd2.getIsContent());
+        assertTrue(liEnd.getIsContent());
+        assertTrue(ulEnd.getIsContent());
+
+    }
+
+    public void testContentFromListStrcture() { // NOTE(review): "Strcture" typo in name
+        TestWebDocumentBuilder builder = new TestWebDocumentBuilder(); // sketch tags follow locals
+        WebTag olStartLevel1 = builder.addTagStart();   // <ol>
+        WebTag olStartLevel2 = builder.addTagStart();   //   <ol>
+        WebTag liStart1 = builder.addTagStart();        //     <li>
+        builder.addText("text 1").setIsContent(false);  //       text 1 (not content)
+        WebTag liEnd1 = builder.addTagEnd();            //     </li>
+        WebTag olStartLevel3 = builder.addTagStart();   //     <ol>
+        WebTag liStart2 = builder.addTagStart();        //       <li>
+        builder.addText("text 2").setIsContent(true);   //         text 2 (content)
+        WebTag liEnd2 = builder.addTagEnd();            //       </li>
+        WebTag olStartLevel4 = builder.addTagStart();   //       <ol>
+        WebTag liStart3 = builder.addTagStart();        //         <li>
+        builder.addText("text 3").setIsContent(false);  //           text 3 (not content)
+        WebTag liEnd3 = builder.addTagEnd();            //         </li>
+        WebTag liStart4 = builder.addTagStart();        //         <li>
+        builder.addText("text 4").setIsContent(false);  //           text 4 (not content)
+        WebTag liEnd4 = builder.addTagEnd();            //         </li>
+        WebTag liStart5 = builder.addTagStart();        //         <li>
+        builder.addText("text 5").setIsContent(false);  //           text 5 (not content)
+        WebTag liEnd5 = builder.addTagEnd();            //         </li>
+        WebTag liStart6 = builder.addTagStart();        //         <li>
+        builder.addText("text 6").setIsContent(false);  //           text 6 (not content)
+        WebTag liEnd6 = builder.addTagEnd();            //         </li>
+        WebTag olEndLevel4 = builder.addTagEnd();       //       </ol>
+        WebTag olEndLevel3 = builder.addTagEnd();       //     </ol>
+        WebTag liStart7 = builder.addTagStart();        //     <li>
+        builder.addText("text 7").setIsContent(true);   //       text 7 (content)
+        WebTag liEnd7 = builder.addTagEnd();            //     </li>
+        WebTag olEndLevel2 = builder.addTagEnd();       //   </ol>
+        WebTag olEndLevel1 = builder.addTagEnd();       // </ol>
+        WebDocument document = builder.build();
+
+        NestedElementRetainer.process(document); // expect: tags enclosing content text only
+        assertTrue(olStartLevel1.getIsContent());
+        assertTrue(olStartLevel2.getIsContent());
+        assertFalse(liStart1.getIsContent());
+        assertFalse(liEnd1.getIsContent());
+        assertTrue(olStartLevel3.getIsContent());
+        assertTrue(liStart2.getIsContent());
+        assertTrue(liEnd2.getIsContent());
+        assertFalse(olStartLevel4.getIsContent());
+        assertFalse(liStart3.getIsContent());
+        assertFalse(liEnd3.getIsContent());
+        assertFalse(liStart4.getIsContent());
+        assertFalse(liEnd4.getIsContent());
+        assertFalse(liStart5.getIsContent());
+        assertFalse(liEnd5.getIsContent());
+        assertFalse(liStart6.getIsContent());
+        assertFalse(liEnd6.getIsContent());
+        assertFalse(olEndLevel4.getIsContent());
+        assertTrue(olEndLevel3.getIsContent());
+        assertTrue(liStart7.getIsContent());
+        assertTrue(liEnd7.getIsContent());
+        assertTrue(olEndLevel2.getIsContent());
+        assertTrue(olEndLevel1.getIsContent());
+    }
+
+    public void testNoContentFromListStructure() {
+        TestWebDocumentBuilder builder = new TestWebDocumentBuilder(); // sketch tags follow locals
+        WebTag olStartLevel1 = builder.addTagStart();   // <ol>
+        WebTag olStartLevel2 = builder.addTagStart();   //   <ol>
+        WebTag liStart1 = builder.addTagStart();        //     <li>
+        builder.addText("text 1").setIsContent(false);  //       text 1 (not content)
+        WebTag liEnd1 = builder.addTagEnd();            //     </li>
+        WebTag olStartLevel4 = builder.addTagStart();   //     <ol>
+        WebTag liStart3 = builder.addTagStart();        //       <li>
+        builder.addText("text 3").setIsContent(false);  //         text 3 (not content)
+        WebTag liEnd3 = builder.addTagEnd();            //       </li>
+        WebTag liStart4 = builder.addTagStart();        //       <li>
+        builder.addText("text 4").setIsContent(false);  //         text 4 (not content)
+        WebTag liEnd4 = builder.addTagEnd();            //       </li>
+        WebTag liStart5 = builder.addTagStart();        //       <li>
+        builder.addText("text 5").setIsContent(false);  //         text 5 (not content)
+        WebTag liEnd5 = builder.addTagEnd();            //       </li>
+        WebTag liStart6 = builder.addTagStart();        //       <li>
+        builder.addText("text 6").setIsContent(false);  //         text 6 (not content)
+        WebTag liEnd6 = builder.addTagEnd();            //       </li>
+        WebTag olEndLevel4 = builder.addTagEnd();       //     </ol>
+        WebTag olEndLevel2 = builder.addTagEnd();       //   </ol>
+        WebTag olEndLevel1 = builder.addTagEnd();       // </ol>
+        WebDocument document = builder.build();
+
+        NestedElementRetainer.process(document); // expect: no tag is content
+        assertFalse(olStartLevel1.getIsContent());
+        assertFalse(olStartLevel2.getIsContent());
+        assertFalse(liStart1.getIsContent());
+        assertFalse(liEnd1.getIsContent());
+        assertFalse(olStartLevel4.getIsContent());
+        assertFalse(liStart3.getIsContent());
+        assertFalse(liEnd3.getIsContent());
+        assertFalse(liStart4.getIsContent());
+        assertFalse(liEnd4.getIsContent());
+        assertFalse(liStart5.getIsContent());
+        assertFalse(liEnd5.getIsContent());
+        assertFalse(liStart6.getIsContent());
+        assertFalse(liEnd6.getIsContent());
+        assertFalse(olEndLevel4.getIsContent());
+        assertFalse(olEndLevel2.getIsContent());
+        assertFalse(olEndLevel1.getIsContent());
+    }
+
+    public void testNestedListStructure() {
+        TestWebDocumentBuilder builder = new TestWebDocumentBuilder(); // sketch tags follow locals
+        WebTag ulStart = builder.addTagStart();         // <ul>
+        WebTag liStart = builder.addTagStart();         //   <li>
+        builder.addText("text 1").setIsContent(true);   //     text 1 (content)
+        WebTag liEnd = builder.addTagEnd();             //   </li>
+        WebTag olStart = builder.addTagStart();         //   <ol>
+        WebTag liOLStart = builder.addTagStart();       //     <li>
+        builder.addText("text 2").setIsContent(true);   //       text 2 (content)
+        WebTag liOLEnd = builder.addTagEnd();           //     </li>
+        WebTag olEnd = builder.addTagEnd();             //   </ol>
+        WebTag ulEnd = builder.addTagEnd();             // </ul>
+        WebDocument document = builder.build();
+        NestedElementRetainer.process(document); // expect: every tag is content
+        assertTrue(ulStart.getIsContent());
+        assertTrue(liStart.getIsContent());
+        assertTrue(olStart.getIsContent());
+        assertTrue(liOLStart.getIsContent());
+        assertTrue(liOLEnd.getIsContent());
+        assertTrue(olEnd.getIsContent());
+        assertTrue(liEnd.getIsContent()); // NOTE(review): fixture closes this <li> before <ol>
+        assertTrue(ulEnd.getIsContent());
+    }
+}
diff --git a/javatests/org/chromium/distiller/webdocument/filters/RelevantElementsTest.java b/javatests/org/chromium/distiller/webdocument/filters/RelevantElementsTest.java
index cb92a4c..c3832a2 100644
--- a/javatests/org/chromium/distiller/webdocument/filters/RelevantElementsTest.java
+++ b/javatests/org/chromium/distiller/webdocument/filters/RelevantElementsTest.java
@@ -5,12 +5,10 @@
 package org.chromium.distiller.webdocument.filters;
 
 import org.chromium.distiller.DomDistillerJsTestCase;
-import org.chromium.distiller.webdocument.TestWebTextBuilder;
 import org.chromium.distiller.webdocument.TestWebDocumentBuilder;
 import org.chromium.distiller.webdocument.WebDocument;
 import org.chromium.distiller.webdocument.WebElement;
 import org.chromium.distiller.webdocument.WebImage;
-import org.chromium.distiller.webdocument.WebText;
 import org.chromium.distiller.webdocument.WebTable;