// javatests/org/chromium/distiller/ContentExtractorTest.java - chromium/dom-distiller - Git at Google

 // Copyright 2014 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 package org.chromium.distiller;

 import com.google.gwt.dom.client.Document;
 import com.google.gwt.dom.client.Element;

 public class ContentExtractorTest extends DomDistillerJsTestCase {
     private static final String CONTENT_TEXT = "Lorem Ipsum Lorem Ipsum Lorem Ipsum.";
     private static final String TITLE_TEXT = "I am the document title";

     public void testDoesNotExtractTitleInContent() {
         Element titleDiv = TestUtil.createDiv(0);
         titleDiv.appendChild(TestUtil.createText(TITLE_TEXT));
         mBody.appendChild(titleDiv);
         Element contentDiv = TestUtil.createDiv(1);
         contentDiv.appendChild(TestUtil.createText(CONTENT_TEXT));
         mBody.appendChild(contentDiv);

         contentDiv = TestUtil.createDiv(2);
         contentDiv.appendChild(TestUtil.createText(CONTENT_TEXT));
         mBody.appendChild(contentDiv);

         contentDiv = TestUtil.createDiv(3);
         contentDiv.appendChild(TestUtil.createText(CONTENT_TEXT));

         mBody.appendChild(contentDiv);

         // Title hasn't been set yet, everything should be content.
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertTrue(extractedContent + " must contain 'content':" + CONTENT_TEXT,
                 extractedContent.contains(DomUtil.getInnerText(contentDiv)));
         assertTrue(
                 extractedContent + " must contain 'title':" + TITLE_TEXT,
                 extractedContent.contains(DomUtil.getInnerText(titleDiv)));

         // Now set the title and it should excluded from the content.
         mHead.appendChild(TestUtil.createTitle(TITLE_TEXT));
         extractor = new ContentExtractor(mRoot);
         extractedContent = extractor.extractContent();
         assertTrue(extractedContent + " must contain 'content':" + CONTENT_TEXT,
                 extractedContent.contains(DomUtil.getInnerText(contentDiv)));
         assertFalse(
                 extractedContent + " must not contain 'title':" + TITLE_TEXT,
                 extractedContent.contains(DomUtil.getInnerText(titleDiv)));
     }

     public void testExtractsEssentialWhitespace() {
         Element div = TestUtil.createDiv(0);
         mBody.appendChild(div);

         div.appendChild(TestUtil.createSpan(CONTENT_TEXT));
         div.appendChild(TestUtil.createText(" "));
         div.appendChild(TestUtil.createSpan(CONTENT_TEXT));
         div.appendChild(TestUtil.createText("\n"));
         div.appendChild(TestUtil.createSpan(CONTENT_TEXT));
         div.appendChild(TestUtil.createText(" "));

         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<div><span>" + CONTENT_TEXT + "</span> " +
                      "<span>" + CONTENT_TEXT + "</span>\n" +
                      "<span>" + CONTENT_TEXT + "</span> </div>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testPrefersMarkupParserOverDocumentTitle() {
         // Minimum fields for open-graph parser.
         final String MARKUP_PARSER_TITLE = "title from markup parser";
         createMeta("og:title", MARKUP_PARSER_TITLE);
         createMeta("og:type", "video.movie");
         createMeta("og:image", "http://test/image.jpeg");
         createMeta("og:url", "http://test/test.html");

         OpenGraphProtocolParserAccessor parser = new OpenGraphProtocolParserAccessor(mRoot);
         assertTrue(parser != null);
         assertEquals(MARKUP_PARSER_TITLE, parser.getTitle());

         Document.get().setTitle(TITLE_TEXT);

         ContentExtractor extractor = new ContentExtractor(mRoot);
         assertEquals("OpenGraph title should be picked over document.title",
                 MARKUP_PARSER_TITLE, extractor.extractTitle());
     }

     public void testImage() {
         // Test the absolute and different kinds of relative URLs for image sources,
         // and also add an extra comma (,) as malformed srcset syntax for robustness.
         // Also test images in WebImage and WebTable.
         // TODO(wychen): add images in WebText when it is supported.
         final String html =
             "<h1>" + CONTENT_TEXT + "</h1>" +
             "<img id=\"a\" style=\"typo\" align=\"left\" src=\"image\" srcset=\"image200 200w, //example.org/image400 400w\">" +
             "<img id=\"b\" style=\"align: left\" alt=\"b\" data-dummy=\"c\" data-src=\"image2\">" +
             "<table role=\"grid\"><tbody><tr><td>" +
                 "<img id=\"c\" style=\"a\" alt=\"b\" src=\"/image\" srcset=\"https://example.com/image2x 2x, /image4x 4x,\">" +
                 "<img id=\"d\" style=\"a\" align=\"left\" src=\"/image2\">" +
                 "</td></tr></tbody></table>" +
             "<p>" + CONTENT_TEXT + "</p>";

         final String expected =
             "<h1>" + CONTENT_TEXT + "</h1>" +
             "<img src=\"http://example.com/path/image\" " +
                  "srcset=\"http://example.com/path/image200 200w, http://example.org/image400 400w\">" +
             "<img alt=\"b\" src=\"http://example.com/path/image2\">" +
             "<table role=\"grid\"><tbody><tr><td>" +
                 "<img alt=\"b\" src=\"http://example.com/image\" " +
                      "srcset=\"https://example.com/image2x 2x, http://example.com/image4x 4x, \">" +
                 "<img src=\"http://example.com/image2\">" +
             "</td></tr></tbody></table>" +
             "<p>" + CONTENT_TEXT + "</p>";

         mHead.setInnerHTML("<base href=\"http://example.com/path/\">");
         mBody.setInnerHTML(html);

         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();

         assertEquals(expected,
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     private void createMeta(String property, String content) {
         mHead.appendChild(TestUtil.createMetaProperty(property, content));
     }

     public void testRemoveFontColorAttributes() {
         Element outerFontTag = Document.get().createElement("FONT");
         outerFontTag.setAttribute("COLOR", "blue");
         mBody.appendChild(outerFontTag);

         String text = "<font color=\"red\">" + CONTENT_TEXT + "</font>";

         outerFontTag.appendChild(TestUtil.createSpan(text));
         outerFontTag.appendChild(TestUtil.createText(" "));
         outerFontTag.appendChild(TestUtil.createSpan(text));
         outerFontTag.appendChild(TestUtil.createText("\n"));
         outerFontTag.appendChild(TestUtil.createSpan(text));
         outerFontTag.appendChild(TestUtil.createText(" "));

         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<font><span><font>" + CONTENT_TEXT + "</font></span> " +
                      "<span><font>" + CONTENT_TEXT + "</font></span>\n" +
                      "<span><font>" + CONTENT_TEXT + "</font></span> </font>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testRemoveStyleAttributes() {
         String html =
             "<h1 style=\"font-weight: folder\">" +
                 CONTENT_TEXT +
             "</h1>" +
             "<p style=\"\">" +
                 CONTENT_TEXT +
             "</p>" +
             "<img style=\"align: left\" data-src=\"/test.png\">" +
             "<table style=\"position: absolute\">" +
                 "<tbody style=\"font-size: 2\">" +
                     "<tr style=\"z-index: 0\">" +
                         "<th style=\"top: 0px\">" + CONTENT_TEXT +
                             "<img style=\"align: left\" src=\"/test.png\">" +
                         "</th>" +
                         "<th style=\"width: 20px\">" + CONTENT_TEXT + "</th>" +
                     "</tr><tr style=\"left: 0\">" +
                         "<td style=\"display: block\">" + CONTENT_TEXT + "</td>" +
                         "<td style=\"color: #123\">" + CONTENT_TEXT + "</td>" +
                     "</tr>" +
                 "</tbody>" +
             "</table>";

         final String expected =
             "<h1>" +
                 CONTENT_TEXT +
             "</h1>" +
             "<p>" +
                 CONTENT_TEXT +
             "</p>" +
             "<img src=\"http://example.com/test.png\">" +
             "<table>" +
                 "<tbody>" +
                     "<tr>" +
                         "<th>" + CONTENT_TEXT +
                             "<img src=\"http://example.com/test.png\">" +
                         "</th>" +
                         "<th>" + CONTENT_TEXT + "</th>" +
                     "</tr><tr>" +
                         "<td>" + CONTENT_TEXT + "</td>" +
                         "<td>" + CONTENT_TEXT + "</td>" +
                     "</tr>" +
                 "</tbody>" +
             "</table>";

         mHead.setInnerHTML("<base href=\"http://example.com/\">");
         mBody.setInnerHTML(html);

         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals(expected,
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testKeepingWidthAndHeightAttributes() {
         String html =
             "<h1>" +
                 CONTENT_TEXT +
             "</h1>" +
             "<p>" +
                 CONTENT_TEXT +
             "</p>" +
             "<img style=\"align: left\" src=\"/test.png\" " +
                     "width=\"200\" height=\"300\">" +
             "<img style=\"align: left\" src=\"/test.png\" " +
                     "width=\"200\">" +
             "<img style=\"align: left\" src=\"/test.png\">";

         final String expected =
             "<h1>" +
                 CONTENT_TEXT +
             "</h1>" +
             "<p>" +
                 CONTENT_TEXT +
             "</p>" +
             "<img src=\"http://example.com/test.png\" " +
                     "width=\"200\" height=\"300\">" +
             "<img src=\"http://example.com/test.png\" " +
                     "width=\"200\">" +
             "<img src=\"http://example.com/test.png\">";

         mHead.setInnerHTML("<base href=\"http://example.com/\">");
         mBody.setInnerHTML(html);

         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals(expected,
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testPreserveOrderedList() {
         Element outerListTag = Document.get().createElement("OL");
         mBody.appendChild(outerListTag);

         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));

         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<OL>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                      "</OL>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testPreserveOrderedListWithSpan() {
         String html =
             "<OL>" +
                 "<LI><span>" + CONTENT_TEXT + "</span></LI>" +
                 "<LI>" + CONTENT_TEXT + "</LI>" +
                 "<LI>" + CONTENT_TEXT + "</LI>" +
                 "<LI>" + CONTENT_TEXT + "</LI>" +
             "</OL>";
         mBody.setInnerHTML(html);

         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals(html,
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testPreserveNestedOrderedList() {
         Element outerListTag = Document.get().createElement("OL");
         Element outerListItem = Document.get().createElement("LI");

         Element innerListTag = Document.get().createElement("OL");
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));

         outerListItem.appendChild(innerListTag);
         outerListTag.appendChild(outerListItem);
         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));

         mBody.appendChild(outerListTag);
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<OL>" +
                         "<LI>" +
                           "<OL>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                           "</OL>" +
                         "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                      "</OL>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testPreserveNestedOrderedListWithOtherElementsInside() {
         Element outerListTag = Document.get().createElement("OL");
         Element outerListItem = Document.get().createElement("LI");
         outerListItem.appendChild(TestUtil.createText(CONTENT_TEXT));
         outerListItem.appendChild(TestUtil.createParagraph(CONTENT_TEXT));

         Element innerListTag = Document.get().createElement("OL");
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createParagraph(""));

         outerListItem.appendChild(innerListTag);
         outerListTag.appendChild(outerListItem);
         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         outerListTag.appendChild(TestUtil.createParagraph(CONTENT_TEXT));

         mBody.appendChild(outerListTag);
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<OL>" +
                         "<LI>" + CONTENT_TEXT +
                           "<p>" + CONTENT_TEXT + "</p>" +
                           "<OL>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                           "</OL>" +
                         "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                         "<p>" + CONTENT_TEXT + "</p>" +
                      "</OL>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testPreserveUnorderedList() {
         Element outerListTag = Document.get().createElement("UL");
         mBody.appendChild(outerListTag);

         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));

         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<UL>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                      "</UL>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testPreserveNestedUnorderedList() {
         Element outerListTag = Document.get().createElement("UL");
         Element outerListItem = Document.get().createElement("LI");

         Element innerListTag = Document.get().createElement("UL");
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));

         outerListItem.appendChild(innerListTag);
         outerListTag.appendChild(outerListItem);
         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));

         mBody.appendChild(outerListTag);
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<UL>" +
                         "<LI>" +
                           "<UL>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                           "</UL>" +
                         "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                      "</UL>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testPreserveNestedUnorderedListWithOtherElementsInside() {
         Element outerListTag = Document.get().createElement("UL");
         Element outerListItem = Document.get().createElement("LI");
         outerListItem.appendChild(TestUtil.createText(CONTENT_TEXT));
         outerListItem.appendChild(TestUtil.createParagraph(CONTENT_TEXT));

         Element innerListTag = Document.get().createElement("UL");
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createParagraph(""));

         outerListItem.appendChild(innerListTag);
         outerListTag.appendChild(outerListItem);
         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         outerListTag.appendChild(TestUtil.createParagraph(CONTENT_TEXT));

         mBody.appendChild(outerListTag);
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<UL>" +
                         "<LI>" + CONTENT_TEXT +
                           "<p>" + CONTENT_TEXT + "</p>" +
                           "<UL>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                           "</UL>" +
                         "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                         "<p>" + CONTENT_TEXT + "</p>" +
                      "</UL>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testPreserveUnorderedListWithNestedOrderedList() {
         Element unorderedListTag = Document.get().createElement("UL");
         Element li = Document.get().createElement("LI");
         Element orderedList = Document.get().createElement("OL");
         orderedList.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         orderedList.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         li.appendChild(orderedList);
         unorderedListTag.appendChild(li);
         unorderedListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         mBody.appendChild(unorderedListTag);
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<UL>" +
                         "<LI>" +
                           "<OL>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                           "</OL>" +
                         "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                      "</UL>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testMalformedListStructureWithExtraLITagEnd() {
         Element unorderedListTag = Document.get().createElement("UL");
         String html = "<LI>" +  CONTENT_TEXT + "</LI></LI><LI>" + CONTENT_TEXT + "</LI>";
         unorderedListTag.setInnerHTML(html);
         mBody.appendChild(unorderedListTag);
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<UL>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                      "</UL>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testMalformedListStructureWithExtraLITagStart() {
         Element unorderedListTag = Document.get().createElement("OL");
         String html = "<LI><LI>" + CONTENT_TEXT + "</LI><LI>" + CONTENT_TEXT + "</LI>";
         unorderedListTag.setInnerHTML(html);
         mBody.appendChild(unorderedListTag);
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<OL>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                      "</OL>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testMalformedListStructureWithExtraOLTagStart() {
         Element unorderedListTag = Document.get().createElement("OL");
         String html = "<OL><LI>" + CONTENT_TEXT + "</LI><LI>" + CONTENT_TEXT + "</LI>";
         unorderedListTag.setInnerHTML(html);
         mBody.appendChild(unorderedListTag);
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<OL>" +
                         "<OL>" +
                           "<LI>" + CONTENT_TEXT + "</LI>" +
                           "<LI>" + CONTENT_TEXT + "</LI>" +
                         "</OL>" +
                      "</OL>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testMalformedListStructureWithoutLITag(){
         Element orderedListTag = Document.get().createElement("OL");
         String html = "<LI>" + CONTENT_TEXT + "</LI>" +
                        CONTENT_TEXT +
                       "<LI>" + CONTENT_TEXT + "</LI>";
         orderedListTag.setInnerHTML(html);
         mBody.appendChild(orderedListTag);
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<OL>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                          CONTENT_TEXT +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                      "</OL>" ,
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testPreserveChildElementWithinBlockquote() {
         Element blockquote = Document.get().createElement("BLOCKQUOTE");
         mBody.appendChild(blockquote);

         blockquote.appendChild(TestUtil.createParagraph(CONTENT_TEXT
                 + CONTENT_TEXT + CONTENT_TEXT + CONTENT_TEXT));

         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<BLOCKQUOTE>" +
                        "<p>" + CONTENT_TEXT + CONTENT_TEXT
                         + CONTENT_TEXT + CONTENT_TEXT + "</p>" +
                      "</BLOCKQUOTE>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testPreserveChildrenElementsWithinBlockquote() {
         Element blockquote = Document.get().createElement("BLOCKQUOTE");
         mBody.appendChild(blockquote);

         blockquote.appendChild(TestUtil.createParagraph(CONTENT_TEXT));
         blockquote.appendChild(TestUtil.createParagraph(CONTENT_TEXT));
         blockquote.appendChild(TestUtil.createParagraph(CONTENT_TEXT));

         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<BLOCKQUOTE>" +
                         "<p>" + CONTENT_TEXT + "</p>" +
                         "<p>" + CONTENT_TEXT + "</p>" +
                         "<p>" + CONTENT_TEXT + "</p>" +
                      "</BLOCKQUOTE>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testDiscardBlockquoteWithoutContent() {
         assertExtractor("", "<BLOCKQUOTE></BLOCKQUOTE>");
     }

     public void testPreservePre() {
         final String article = CONTENT_TEXT + CONTENT_TEXT + CONTENT_TEXT;
         final String html = "<h1>" + CONTENT_TEXT + "</h1><PRE><kbd>" + article + "</kbd></PRE>";

         assertExtractor(html, html);
     }

     private void assertExtractor(String expected, String html) {
         mBody.setInnerHTML(html);

         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals(expected, TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testDropCap() {
         String html =
             "<h1>" +
                 CONTENT_TEXT +
             "</h1>" +
             "<p>" +
                 "<strong><span style=\"float: left\">T</span>est</strong>" +
                 CONTENT_TEXT +
             "</p>";

         final String expected =
             "<h1>" +
                 CONTENT_TEXT +
             "</h1>" +
             "<p>" +
                 "<strong><span>T</span>est</strong>" +
                 CONTENT_TEXT +
             "</p>";

         mBody.setInnerHTML(html);

         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals(expected,
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testBlockyArticle() {
         final String htmlArticle =
             "<h1>" + CONTENT_TEXT + "</h1>" +
             "<span>" + CONTENT_TEXT + "</span>" +
             "<div><span>" + CONTENT_TEXT + "</span></div>" +
             "<p><em>" + CONTENT_TEXT + "</em></p>" +
             "<div><cite><span><span>" + CONTENT_TEXT + "</span></span></cite></div>" +
             "<div><span>" + CONTENT_TEXT + "</span><span>" + CONTENT_TEXT + "</span></div>" +
             "<main><span><blockquote><cite>" +
                 "<span><span>" + CONTENT_TEXT + "</span></span><span>" + CONTENT_TEXT + "</span>" +
             "</cite></blockquote></span></main>";

         final String expected =
             "<h1>" + CONTENT_TEXT + "</h1>" +
             "<span>" + CONTENT_TEXT + "</span>" +
             "<div><span>" + CONTENT_TEXT + "</span></div>" +
             "<p><em>" + CONTENT_TEXT + "</em></p>" +
             "<div><cite><span><span>" + CONTENT_TEXT + "</span></span></cite></div>" +
             "<div><span>" + CONTENT_TEXT + "</span><span>" + CONTENT_TEXT + "</span></div>" +
             "<BLOCKQUOTE><cite>" +
                 "<span><span>" + CONTENT_TEXT + "</span></span><span>" + CONTENT_TEXT + "</span>" +
             "</cite></BLOCKQUOTE>";

         assertExtractor(expected, htmlArticle);
     }

     public void testSpanArticle() {
         final String htmlArticle =
             "<span>" + CONTENT_TEXT + "</span>" +
             "<span>" + CONTENT_TEXT + "</span>" +
             "<span>" + CONTENT_TEXT + "</span>";

         final String expected = "<div>" + htmlArticle + "</div>";

         assertExtractor(expected, htmlArticle);
     }
 }