javatests/org/chromium/distiller/ContentExtractorTest.java - chromium/dom-distiller - Git at Google

 // Copyright 2014 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 package org.chromium.distiller;

 import com.google.gwt.dom.client.Document;
 import com.google.gwt.dom.client.Element;

 public class ContentExtractorTest extends DomDistillerJsTestCase {
     private static final String CONTENT_TEXT = "Lorem Ipsum Lorem Ipsum Lorem Ipsum.";
     private static final String TITLE_TEXT = "I am the document title";

     public void testDoesNotExtractTitleInContent() {
         Element titleDiv = TestUtil.createDiv(0);
         titleDiv.appendChild(TestUtil.createText(TITLE_TEXT));
         mBody.appendChild(titleDiv);
         Element contentDiv = TestUtil.createDiv(1);
         contentDiv.appendChild(TestUtil.createText(CONTENT_TEXT));
         mBody.appendChild(contentDiv);

         contentDiv = TestUtil.createDiv(2);
         contentDiv.appendChild(TestUtil.createText(CONTENT_TEXT));
         mBody.appendChild(contentDiv);

         contentDiv = TestUtil.createDiv(3);
         contentDiv.appendChild(TestUtil.createText(CONTENT_TEXT));

         mBody.appendChild(contentDiv);

         // Title hasn't been set yet, everything should be content.
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertTrue(extractedContent + " must contain 'content':" + CONTENT_TEXT,
                 extractedContent.contains(DomUtil.getInnerText(contentDiv)));
         assertTrue(
                 extractedContent + " must contain 'title':" + TITLE_TEXT,
                 extractedContent.contains(DomUtil.getInnerText(titleDiv)));

         // Now set the title and it should excluded from the content.
         mHead.appendChild(TestUtil.createTitle(TITLE_TEXT));
         extractor = new ContentExtractor(mRoot);
         extractedContent = extractor.extractContent();
         assertTrue(extractedContent + " must contain 'content':" + CONTENT_TEXT,
                 extractedContent.contains(DomUtil.getInnerText(contentDiv)));
         assertFalse(
                 extractedContent + " must not contain 'title':" + TITLE_TEXT,
                 extractedContent.contains(DomUtil.getInnerText(titleDiv)));
     }

     public void testExtractsEssentialWhitespace() {
         Element div = TestUtil.createDiv(0);
         mBody.appendChild(div);

         div.appendChild(TestUtil.createSpan(CONTENT_TEXT));
         div.appendChild(TestUtil.createText(" "));
         div.appendChild(TestUtil.createSpan(CONTENT_TEXT));
         div.appendChild(TestUtil.createText("\n"));
         div.appendChild(TestUtil.createSpan(CONTENT_TEXT));
         div.appendChild(TestUtil.createText(" "));

         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<div><span>" + CONTENT_TEXT + "</span> " +
                      "<span>" + CONTENT_TEXT + "</span>\n" +
                      "<span>" + CONTENT_TEXT + "</span> </div>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testPrefersMarkupParserOverDocumentTitle() {
         // Minimum fields for open-graph parser.
         final String MARKUP_PARSER_TITLE = "title from markup parser";
         createMeta("og:title", MARKUP_PARSER_TITLE);
         createMeta("og:type", "video.movie");
         createMeta("og:image", "http://test/image.jpeg");
         createMeta("og:url", "http://test/test.html");

         OpenGraphProtocolParser parser = OpenGraphProtocolParser.parse(mRoot);
         assertTrue(parser != null);
         assertEquals(MARKUP_PARSER_TITLE, parser.getTitle());

         Document.get().setTitle(TITLE_TEXT);

         ContentExtractor extractor = new ContentExtractor(mRoot);
         assertEquals("OpenGraph title should be picked over document.title",
                 MARKUP_PARSER_TITLE, extractor.extractTitle());
     }

     public void testImageWithSrcset() {
         // Test the absolute and different kinds of relative URLs for image sources,
         // and also add an extra comma (,) as malformed srcset syntax for robustness.
         final String html =
             "<h1>" + CONTENT_TEXT + "</h1>" +
             "<img src=\"image\" srcset=\"image200 200w, //example.org/image400 400w\">" +
             "<table role=\"grid\"><tbody><tr><td>" +
                 "<img src=\"/image\" srcset=\"https://example.com/image2x 2x, /image4x 4x,\">" +
                 "</td></tr></tbody></table>" +
             "<p>" + CONTENT_TEXT + "</p>";

         final String expected =
             "<h1>" + CONTENT_TEXT + "</h1>" +
             "<img src=\"http://example.com/path/image\" " +
                  "srcset=\"http://example.com/path/image200 200w, http://example.org/image400 400w\">" +
             "<table role=\"grid\"><tbody><tr><td>" +
                 "<img src=\"http://example.com/image\" " +
                      "srcset=\"https://example.com/image2x 2x, http://example.com/image4x 4x, \">" +
             "</td></tr></tbody></table>" +
             "<p>" + CONTENT_TEXT + "</p>";

         mHead.setInnerHTML("<base href=\"http://example.com/path/\">");
         mBody.setInnerHTML(html);

         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();

         assertEquals(expected,
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     private void createMeta(String property, String content) {
         mHead.appendChild(TestUtil.createMetaProperty(property, content));
     }

     public void testRemoveFontColorAttributes() {
         Element outerFontTag = Document.get().createElement("FONT");
         outerFontTag.setAttribute("COLOR", "blue");
         mBody.appendChild(outerFontTag);

         String text = "<font color=\"red\">" + CONTENT_TEXT + "</font>";

         outerFontTag.appendChild(TestUtil.createSpan(text));
         outerFontTag.appendChild(TestUtil.createText(" "));
         outerFontTag.appendChild(TestUtil.createSpan(text));
         outerFontTag.appendChild(TestUtil.createText("\n"));
         outerFontTag.appendChild(TestUtil.createSpan(text));
         outerFontTag.appendChild(TestUtil.createText(" "));

         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<font><span><font>" + CONTENT_TEXT + "</font></span> " +
                      "<span><font>" + CONTENT_TEXT + "</font></span>\n" +
                      "<span><font>" + CONTENT_TEXT + "</font></span> </font>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testPreserveOrderedList() {
         Element outerListTag = Document.get().createElement("OL");
         mBody.appendChild(outerListTag);

         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));

         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<OL>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                      "</OL>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testPreserveNestedOrderedList() {
         Element outerListTag = Document.get().createElement("OL");
         Element outerListItem = Document.get().createElement("LI");

         Element innerListTag = Document.get().createElement("OL");
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));

         outerListItem.appendChild(innerListTag);
         outerListTag.appendChild(outerListItem);
         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));

         mBody.appendChild(outerListTag);
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<OL>" +
                         "<LI>" +
                           "<OL>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                           "</OL>" +
                         "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                      "</OL>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testPreserveNestedOrderedListWithOtherElementsInside() {
         Element outerListTag = Document.get().createElement("OL");
         Element outerListItem = Document.get().createElement("LI");
         outerListItem.appendChild(TestUtil.createText(CONTENT_TEXT));
         outerListItem.appendChild(TestUtil.createParagraph(CONTENT_TEXT));

         Element innerListTag = Document.get().createElement("OL");
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createParagraph(""));

         outerListItem.appendChild(innerListTag);
         outerListTag.appendChild(outerListItem);
         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         outerListTag.appendChild(TestUtil.createParagraph(CONTENT_TEXT));

         mBody.appendChild(outerListTag);
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<OL>" +
                         "<LI>" + CONTENT_TEXT +
                           "<p>" + CONTENT_TEXT + "</p>" +
                           "<OL>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                           "</OL>" +
                         "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                         "<p>" + CONTENT_TEXT + "</p>" +
                      "</OL>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testPreserveUnorderedList() {
         Element outerListTag = Document.get().createElement("UL");
         mBody.appendChild(outerListTag);

         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));

         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<UL>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                      "</UL>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testPreserveNestedUnorderedList() {
         Element outerListTag = Document.get().createElement("UL");
         Element outerListItem = Document.get().createElement("LI");

         Element innerListTag = Document.get().createElement("UL");
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));

         outerListItem.appendChild(innerListTag);
         outerListTag.appendChild(outerListItem);
         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));

         mBody.appendChild(outerListTag);
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<UL>" +
                         "<LI>" +
                           "<UL>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                           "</UL>" +
                         "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                      "</UL>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testPreserveNestedUnorderedListWithOtherElementsInside() {
         Element outerListTag = Document.get().createElement("UL");
         Element outerListItem = Document.get().createElement("LI");
         outerListItem.appendChild(TestUtil.createText(CONTENT_TEXT));
         outerListItem.appendChild(TestUtil.createParagraph(CONTENT_TEXT));

         Element innerListTag = Document.get().createElement("UL");
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         innerListTag.appendChild(TestUtil.createParagraph(""));

         outerListItem.appendChild(innerListTag);
         outerListTag.appendChild(outerListItem);
         outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         outerListTag.appendChild(TestUtil.createParagraph(CONTENT_TEXT));

         mBody.appendChild(outerListTag);
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<UL>" +
                         "<LI>" + CONTENT_TEXT +
                           "<p>" + CONTENT_TEXT + "</p>" +
                           "<UL>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                           "</UL>" +
                         "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                         "<p>" + CONTENT_TEXT + "</p>" +
                      "</UL>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testPreserveUnorderedListWithNestedOrderedList() {
         Element unorderedListTag = Document.get().createElement("UL");
         Element li = Document.get().createElement("LI");
         Element orderedList = Document.get().createElement("OL");
         orderedList.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         orderedList.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         li.appendChild(orderedList);
         unorderedListTag.appendChild(li);
         unorderedListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
         mBody.appendChild(unorderedListTag);
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<UL>" +
                         "<LI>" +
                           "<OL>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                             "<LI>" + CONTENT_TEXT + "</LI>" +
                           "</OL>" +
                         "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                      "</UL>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testMalformedListStructureWithExtraLITagEnd() {
         Element unorderedListTag = Document.get().createElement("UL");
         String html = "<LI>" +  CONTENT_TEXT + "</LI></LI><LI>" + CONTENT_TEXT + "</LI>";
         unorderedListTag.setInnerHTML(html);
         mBody.appendChild(unorderedListTag);
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<UL>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                      "</UL>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testMalformedListStructureWithExtraLITagStart() {
         Element unorderedListTag = Document.get().createElement("OL");
         String html = "<LI><LI>" + CONTENT_TEXT + "</LI><LI>" + CONTENT_TEXT + "</LI>";
         unorderedListTag.setInnerHTML(html);
         mBody.appendChild(unorderedListTag);
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<OL>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                      "</OL>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testMalformedListStructureWithExtraOLTagStart() {
         Element unorderedListTag = Document.get().createElement("OL");
         String html = "<OL><LI>" + CONTENT_TEXT + "</LI><LI>" + CONTENT_TEXT + "</LI>";
         unorderedListTag.setInnerHTML(html);
         mBody.appendChild(unorderedListTag);
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<OL>" +
                         "<OL>" +
                           "<LI>" + CONTENT_TEXT + "</LI>" +
                           "<LI>" + CONTENT_TEXT + "</LI>" +
                         "</OL>" +
                      "</OL>",
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testMalformedListStructureWithoutLITag(){
         Element orderedListTag = Document.get().createElement("OL");
         String html = "<LI>" + CONTENT_TEXT + "</LI>" +
                        CONTENT_TEXT +
                       "<LI>" + CONTENT_TEXT + "</LI>";
         orderedListTag.setInnerHTML(html);
         mBody.appendChild(orderedListTag);
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals("<OL>" +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                          CONTENT_TEXT +
                         "<LI>" + CONTENT_TEXT + "</LI>" +
                      "</OL>" ,
                 TestUtil.removeAllDirAttributes(extractedContent));
     }

     private void assertExtractor(String expected, String html) {
         mBody.setInnerHTML("");
         Element div = TestUtil.createDiv(0);
         mBody.appendChild(div);

         div.setInnerHTML(html);
         ContentExtractor extractor = new ContentExtractor(mRoot);
         String extractedContent = extractor.extractContent();
         assertEquals(expected, TestUtil.removeAllDirAttributes(extractedContent));
     }

     public void testOnlyProcessArticleElement() {
         final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";

         final String html = "<h1>" + CONTENT_TEXT + "</h1><div>" + article + "</div>";
         final String expected = "<h1>" + CONTENT_TEXT + "</h1>" + article;

         // Make sure everything is there before using the fast path.
         assertExtractor(expected, html);

         final String htmlArticle =
             "<h1>" + CONTENT_TEXT + "</h1>" +
             "<article>" + article + "</article>";

         assertExtractor(article, htmlArticle);
     }

     public void testOnlyProcessArticleElementMultiple() {
         final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";

         final String htmlArticle =
             "<h1>" + CONTENT_TEXT + "</h1>" +
             "<article>" + article + "</article>" +
             "<article>" + article + "</article>";
         final String expected = "<h1>" + CONTENT_TEXT + "</h1>" + article + article;

         // The existence of multiple articles disables the fast path.
         assertExtractor(expected, htmlArticle);
     }

     public void testOnlyProcessOGArticle() {
         final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";

         final String htmlArticle =
             "<h1>" + CONTENT_TEXT + "</h1>" +
             "<div itemscope itemtype=\"http://schema.org/Article\">" + article + "</div>";

         assertExtractor(article, htmlArticle);
     }

     public void testOnlyProcessOGArticleNews() {
         final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";

         final String htmlArticle =
             "<h1>" + CONTENT_TEXT + "</h1>" +
             "<div itemscope itemtype=\"http://schema.org/NewsArticle\">" + article + "</div>";

         assertExtractor(article, htmlArticle);
     }

     public void testOnlyProcessOGArticleBlog() {
         final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";

         final String htmlArticle =
             "<h1>" + CONTENT_TEXT + "</h1>" +
             "<div itemscope itemtype=\"http://schema.org/BlogPosting\">" + article + "</div>";

         assertExtractor(article, htmlArticle);
     }

     public void testOnlyProcessOGArticleNested() {
         final String paragraph = "<p>" + CONTENT_TEXT + "</p>";
         final String article = paragraph + paragraph;

         final String htmlArticle =
             "<h1>" + CONTENT_TEXT + "</h1>" +
             "<div itemscope itemtype=\"http://schema.org/Article\">" +
                 paragraph +
                 "<div itemscope itemtype=\"http://schema.org/Article\">" + paragraph + "</div>" +
             "</div>";

         assertExtractor(article, htmlArticle);
     }

     public void testOnlyProcessOGNonArticleMovie() {
         final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";

         final String htmlArticle =
             "<h1>" + CONTENT_TEXT + "</h1>" +
             "<div itemscope itemtype=\"http://schema.org/Movie\">" + article + "</div>";
         final String expected = "<h1>" + CONTENT_TEXT + "</h1>" + article;

         // Non-article schema.org types should not use the fast path.
         assertExtractor(expected, htmlArticle);
     }
 }
	// Copyright 2014 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	package org.chromium.distiller;

	import com.google.gwt.dom.client.Document;
	import com.google.gwt.dom.client.Element;

	public class ContentExtractorTest extends DomDistillerJsTestCase {
	private static final String CONTENT_TEXT = "Lorem Ipsum Lorem Ipsum Lorem Ipsum.";
	private static final String TITLE_TEXT = "I am the document title";

	public void testDoesNotExtractTitleInContent() {
	Element titleDiv = TestUtil.createDiv(0);
	titleDiv.appendChild(TestUtil.createText(TITLE_TEXT));
	mBody.appendChild(titleDiv);
	Element contentDiv = TestUtil.createDiv(1);
	contentDiv.appendChild(TestUtil.createText(CONTENT_TEXT));
	mBody.appendChild(contentDiv);

	contentDiv = TestUtil.createDiv(2);
	contentDiv.appendChild(TestUtil.createText(CONTENT_TEXT));
	mBody.appendChild(contentDiv);

	contentDiv = TestUtil.createDiv(3);
	contentDiv.appendChild(TestUtil.createText(CONTENT_TEXT));

	mBody.appendChild(contentDiv);

	// Title hasn't been set yet, everything should be content.
	ContentExtractor extractor = new ContentExtractor(mRoot);
	String extractedContent = extractor.extractContent();
	assertTrue(extractedContent + " must contain 'content':" + CONTENT_TEXT,
	extractedContent.contains(DomUtil.getInnerText(contentDiv)));
	assertTrue(
	extractedContent + " must contain 'title':" + TITLE_TEXT,
	extractedContent.contains(DomUtil.getInnerText(titleDiv)));

	// Now set the title and it should excluded from the content.
	mHead.appendChild(TestUtil.createTitle(TITLE_TEXT));
	extractor = new ContentExtractor(mRoot);
	extractedContent = extractor.extractContent();
	assertTrue(extractedContent + " must contain 'content':" + CONTENT_TEXT,
	extractedContent.contains(DomUtil.getInnerText(contentDiv)));
	assertFalse(
	extractedContent + " must not contain 'title':" + TITLE_TEXT,
	extractedContent.contains(DomUtil.getInnerText(titleDiv)));
	}

	public void testExtractsEssentialWhitespace() {
	Element div = TestUtil.createDiv(0);
	mBody.appendChild(div);

	div.appendChild(TestUtil.createSpan(CONTENT_TEXT));
	div.appendChild(TestUtil.createText(" "));
	div.appendChild(TestUtil.createSpan(CONTENT_TEXT));
	div.appendChild(TestUtil.createText("\n"));
	div.appendChild(TestUtil.createSpan(CONTENT_TEXT));
	div.appendChild(TestUtil.createText(" "));

	ContentExtractor extractor = new ContentExtractor(mRoot);
	String extractedContent = extractor.extractContent();
	assertEquals("<div><span>" + CONTENT_TEXT + "</span> " +
	"<span>" + CONTENT_TEXT + "</span>\n" +
	"<span>" + CONTENT_TEXT + "</span> </div>",
	TestUtil.removeAllDirAttributes(extractedContent));
	}

	public void testPrefersMarkupParserOverDocumentTitle() {
	// Minimum fields for open-graph parser.
	final String MARKUP_PARSER_TITLE = "title from markup parser";
	createMeta("og:title", MARKUP_PARSER_TITLE);
	createMeta("og:type", "video.movie");
	createMeta("og:image", "http://test/image.jpeg");
	createMeta("og:url", "http://test/test.html");

	OpenGraphProtocolParser parser = OpenGraphProtocolParser.parse(mRoot);
	assertTrue(parser != null);
	assertEquals(MARKUP_PARSER_TITLE, parser.getTitle());

	Document.get().setTitle(TITLE_TEXT);

	ContentExtractor extractor = new ContentExtractor(mRoot);
	assertEquals("OpenGraph title should be picked over document.title",
	MARKUP_PARSER_TITLE, extractor.extractTitle());
	}

	public void testImageWithSrcset() {
	// Test the absolute and different kinds of relative URLs for image sources,
	// and also add an extra comma (,) as malformed srcset syntax for robustness.
	final String html =
	"<h1>" + CONTENT_TEXT + "</h1>" +
	"<img src=\"image\" srcset=\"image200 200w, //example.org/image400 400w\">" +
	"<table role=\"grid\"><tbody><tr><td>" +
	"<img src=\"/image\" srcset=\"https://example.com/image2x 2x, /image4x 4x,\">" +
	"</td></tr></tbody></table>" +
	"<p>" + CONTENT_TEXT + "</p>";

	final String expected =
	"<h1>" + CONTENT_TEXT + "</h1>" +
	"<img src=\"http://example.com/path/image\" " +
	"srcset=\"http://example.com/path/image200 200w, http://example.org/image400 400w\">" +
	"<table role=\"grid\"><tbody><tr><td>" +
	"<img src=\"http://example.com/image\" " +
	"srcset=\"https://example.com/image2x 2x, http://example.com/image4x 4x, \">" +
	"</td></tr></tbody></table>" +
	"<p>" + CONTENT_TEXT + "</p>";

	mHead.setInnerHTML("<base href=\"http://example.com/path/\">");
	mBody.setInnerHTML(html);

	ContentExtractor extractor = new ContentExtractor(mRoot);
	String extractedContent = extractor.extractContent();

	assertEquals(expected,
	TestUtil.removeAllDirAttributes(extractedContent));
	}

	private void createMeta(String property, String content) {
	mHead.appendChild(TestUtil.createMetaProperty(property, content));
	}

	public void testRemoveFontColorAttributes() {
	Element outerFontTag = Document.get().createElement("FONT");
	outerFontTag.setAttribute("COLOR", "blue");
	mBody.appendChild(outerFontTag);

	String text = "<font color=\"red\">" + CONTENT_TEXT + "</font>";

	outerFontTag.appendChild(TestUtil.createSpan(text));
	outerFontTag.appendChild(TestUtil.createText(" "));
	outerFontTag.appendChild(TestUtil.createSpan(text));
	outerFontTag.appendChild(TestUtil.createText("\n"));
	outerFontTag.appendChild(TestUtil.createSpan(text));
	outerFontTag.appendChild(TestUtil.createText(" "));

	ContentExtractor extractor = new ContentExtractor(mRoot);
	String extractedContent = extractor.extractContent();
	assertEquals("<font><span><font>" + CONTENT_TEXT + "</font></span> " +
	"<span><font>" + CONTENT_TEXT + "</font></span>\n" +
	"<span><font>" + CONTENT_TEXT + "</font></span> </font>",
	TestUtil.removeAllDirAttributes(extractedContent));
	}

	public void testPreserveOrderedList() {
	Element outerListTag = Document.get().createElement("OL");
	mBody.appendChild(outerListTag);

	outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));

	ContentExtractor extractor = new ContentExtractor(mRoot);
	String extractedContent = extractor.extractContent();
	assertEquals("<OL>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"</OL>",
	TestUtil.removeAllDirAttributes(extractedContent));
	}

	public void testPreserveNestedOrderedList() {
	Element outerListTag = Document.get().createElement("OL");
	Element outerListItem = Document.get().createElement("LI");

	Element innerListTag = Document.get().createElement("OL");
	innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));

	outerListItem.appendChild(innerListTag);
	outerListTag.appendChild(outerListItem);
	outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));

	mBody.appendChild(outerListTag);
	ContentExtractor extractor = new ContentExtractor(mRoot);
	String extractedContent = extractor.extractContent();
	assertEquals("<OL>" +
	"<LI>" +
	"<OL>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"</OL>" +
	"</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"</OL>",
	TestUtil.removeAllDirAttributes(extractedContent));
	}

	public void testPreserveNestedOrderedListWithOtherElementsInside() {
	Element outerListTag = Document.get().createElement("OL");
	Element outerListItem = Document.get().createElement("LI");
	outerListItem.appendChild(TestUtil.createText(CONTENT_TEXT));
	outerListItem.appendChild(TestUtil.createParagraph(CONTENT_TEXT));

	Element innerListTag = Document.get().createElement("OL");
	innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	innerListTag.appendChild(TestUtil.createParagraph(""));

	outerListItem.appendChild(innerListTag);
	outerListTag.appendChild(outerListItem);
	outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	outerListTag.appendChild(TestUtil.createParagraph(CONTENT_TEXT));

	mBody.appendChild(outerListTag);
	ContentExtractor extractor = new ContentExtractor(mRoot);
	String extractedContent = extractor.extractContent();
	assertEquals("<OL>" +
	"<LI>" + CONTENT_TEXT +
	"<p>" + CONTENT_TEXT + "</p>" +
	"<OL>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"</OL>" +
	"</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<p>" + CONTENT_TEXT + "</p>" +
	"</OL>",
	TestUtil.removeAllDirAttributes(extractedContent));
	}

	public void testPreserveUnorderedList() {
	Element outerListTag = Document.get().createElement("UL");
	mBody.appendChild(outerListTag);

	outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));

	ContentExtractor extractor = new ContentExtractor(mRoot);
	String extractedContent = extractor.extractContent();
	assertEquals("<UL>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"</UL>",
	TestUtil.removeAllDirAttributes(extractedContent));
	}

	public void testPreserveNestedUnorderedList() {
	Element outerListTag = Document.get().createElement("UL");
	Element outerListItem = Document.get().createElement("LI");

	Element innerListTag = Document.get().createElement("UL");
	innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));

	outerListItem.appendChild(innerListTag);
	outerListTag.appendChild(outerListItem);
	outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));

	mBody.appendChild(outerListTag);
	ContentExtractor extractor = new ContentExtractor(mRoot);
	String extractedContent = extractor.extractContent();
	assertEquals("<UL>" +
	"<LI>" +
	"<UL>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"</UL>" +
	"</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"</UL>",
	TestUtil.removeAllDirAttributes(extractedContent));
	}

	public void testPreserveNestedUnorderedListWithOtherElementsInside() {
	Element outerListTag = Document.get().createElement("UL");
	Element outerListItem = Document.get().createElement("LI");
	outerListItem.appendChild(TestUtil.createText(CONTENT_TEXT));
	outerListItem.appendChild(TestUtil.createParagraph(CONTENT_TEXT));

	Element innerListTag = Document.get().createElement("UL");
	innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	innerListTag.appendChild(TestUtil.createParagraph(""));

	outerListItem.appendChild(innerListTag);
	outerListTag.appendChild(outerListItem);
	outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	outerListTag.appendChild(TestUtil.createParagraph(CONTENT_TEXT));

	mBody.appendChild(outerListTag);
	ContentExtractor extractor = new ContentExtractor(mRoot);
	String extractedContent = extractor.extractContent();
	assertEquals("<UL>" +
	"<LI>" + CONTENT_TEXT +
	"<p>" + CONTENT_TEXT + "</p>" +
	"<UL>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"</UL>" +
	"</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<p>" + CONTENT_TEXT + "</p>" +
	"</UL>",
	TestUtil.removeAllDirAttributes(extractedContent));
	}

	public void testPreserveUnorderedListWithNestedOrderedList() {
	Element unorderedListTag = Document.get().createElement("UL");
	Element li = Document.get().createElement("LI");
	Element orderedList = Document.get().createElement("OL");
	orderedList.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	orderedList.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	li.appendChild(orderedList);
	unorderedListTag.appendChild(li);
	unorderedListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
	mBody.appendChild(unorderedListTag);
	ContentExtractor extractor = new ContentExtractor(mRoot);
	String extractedContent = extractor.extractContent();
	assertEquals("<UL>" +
	"<LI>" +
	"<OL>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"</OL>" +
	"</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"</UL>",
	TestUtil.removeAllDirAttributes(extractedContent));
	}

	public void testMalformedListStructureWithExtraLITagEnd() {
	Element unorderedListTag = Document.get().createElement("UL");
	String html = "<LI>" + CONTENT_TEXT + "</LI></LI><LI>" + CONTENT_TEXT + "</LI>";
	unorderedListTag.setInnerHTML(html);
	mBody.appendChild(unorderedListTag);
	ContentExtractor extractor = new ContentExtractor(mRoot);
	String extractedContent = extractor.extractContent();
	assertEquals("<UL>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"</UL>",
	TestUtil.removeAllDirAttributes(extractedContent));
	}

	public void testMalformedListStructureWithExtraLITagStart() {
	Element unorderedListTag = Document.get().createElement("OL");
	String html = "<LI><LI>" + CONTENT_TEXT + "</LI><LI>" + CONTENT_TEXT + "</LI>";
	unorderedListTag.setInnerHTML(html);
	mBody.appendChild(unorderedListTag);
	ContentExtractor extractor = new ContentExtractor(mRoot);
	String extractedContent = extractor.extractContent();
	assertEquals("<OL>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"</OL>",
	TestUtil.removeAllDirAttributes(extractedContent));
	}

	public void testMalformedListStructureWithExtraOLTagStart() {
	Element unorderedListTag = Document.get().createElement("OL");
	String html = "<OL><LI>" + CONTENT_TEXT + "</LI><LI>" + CONTENT_TEXT + "</LI>";
	unorderedListTag.setInnerHTML(html);
	mBody.appendChild(unorderedListTag);
	ContentExtractor extractor = new ContentExtractor(mRoot);
	String extractedContent = extractor.extractContent();
	assertEquals("<OL>" +
	"<OL>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"</OL>" +
	"</OL>",
	TestUtil.removeAllDirAttributes(extractedContent));
	}

	public void testMalformedListStructureWithoutLITag(){
	Element orderedListTag = Document.get().createElement("OL");
	String html = "<LI>" + CONTENT_TEXT + "</LI>" +
	CONTENT_TEXT +
	"<LI>" + CONTENT_TEXT + "</LI>";
	orderedListTag.setInnerHTML(html);
	mBody.appendChild(orderedListTag);
	ContentExtractor extractor = new ContentExtractor(mRoot);
	String extractedContent = extractor.extractContent();
	assertEquals("<OL>" +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	CONTENT_TEXT +
	"<LI>" + CONTENT_TEXT + "</LI>" +
	"</OL>" ,
	TestUtil.removeAllDirAttributes(extractedContent));
	}

	private void assertExtractor(String expected, String html) {
	mBody.setInnerHTML("");
	Element div = TestUtil.createDiv(0);
	mBody.appendChild(div);

	div.setInnerHTML(html);
	ContentExtractor extractor = new ContentExtractor(mRoot);
	String extractedContent = extractor.extractContent();
	assertEquals(expected, TestUtil.removeAllDirAttributes(extractedContent));
	}

	public void testOnlyProcessArticleElement() {
	final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";

	final String html = "<h1>" + CONTENT_TEXT + "</h1><div>" + article + "</div>";
	final String expected = "<h1>" + CONTENT_TEXT + "</h1>" + article;

	// Make sure everything is there before using the fast path.
	assertExtractor(expected, html);

	final String htmlArticle =
	"<h1>" + CONTENT_TEXT + "</h1>" +
	"<article>" + article + "</article>";

	assertExtractor(article, htmlArticle);
	}

	public void testOnlyProcessArticleElementMultiple() {
	final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";

	final String htmlArticle =
	"<h1>" + CONTENT_TEXT + "</h1>" +
	"<article>" + article + "</article>" +
	"<article>" + article + "</article>";
	final String expected = "<h1>" + CONTENT_TEXT + "</h1>" + article + article;

	// The existence of multiple articles disables the fast path.
	assertExtractor(expected, htmlArticle);
	}

	public void testOnlyProcessOGArticle() {
	final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";

	final String htmlArticle =
	"<h1>" + CONTENT_TEXT + "</h1>" +
	"<div itemscope itemtype=\"http://schema.org/Article\">" + article + "</div>";

	assertExtractor(article, htmlArticle);
	}

	public void testOnlyProcessOGArticleNews() {
	final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";

	final String htmlArticle =
	"<h1>" + CONTENT_TEXT + "</h1>" +
	"<div itemscope itemtype=\"http://schema.org/NewsArticle\">" + article + "</div>";

	assertExtractor(article, htmlArticle);
	}

	public void testOnlyProcessOGArticleBlog() {
	final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";

	final String htmlArticle =
	"<h1>" + CONTENT_TEXT + "</h1>" +
	"<div itemscope itemtype=\"http://schema.org/BlogPosting\">" + article + "</div>";

	assertExtractor(article, htmlArticle);
	}

	public void testOnlyProcessOGArticleNested() {
	final String paragraph = "<p>" + CONTENT_TEXT + "</p>";
	final String article = paragraph + paragraph;

	final String htmlArticle =
	"<h1>" + CONTENT_TEXT + "</h1>" +
	"<div itemscope itemtype=\"http://schema.org/Article\">" +
	paragraph +
	"<div itemscope itemtype=\"http://schema.org/Article\">" + paragraph + "</div>" +
	"</div>";

	assertExtractor(article, htmlArticle);
	}

	public void testOnlyProcessOGNonArticleMovie() {
	final String article = "<p>" + CONTENT_TEXT + "</p><p>" + CONTENT_TEXT + "</p>";

	final String htmlArticle =
	"<h1>" + CONTENT_TEXT + "</h1>" +
	"<div itemscope itemtype=\"http://schema.org/Movie\">" + article + "</div>";
	final String expected = "<h1>" + CONTENT_TEXT + "</h1>" + article;

	// Non-article schema.org types should not use the fast path.
	assertExtractor(expected, htmlArticle);
	}
	}