// blob: 00995a33a83ea90e1ac6b281e9f7d68bd82566f7 [file] [log] [blame]
// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
package org.chromium.distiller;
import com.google.gwt.dom.client.Document;
import com.google.gwt.dom.client.Element;
public class ContentExtractorTest extends DomDistillerJsTestCase {
private static final String CONTENT_TEXT = "Lorem Ipsum Lorem Ipsum Lorem Ipsum.";
private static final String TITLE_TEXT = "I am the document title";
public void testDoesNotExtractTitleInContent() {
Element titleDiv = TestUtil.createDiv(0);
titleDiv.appendChild(TestUtil.createText(TITLE_TEXT));
mBody.appendChild(titleDiv);
Element contentDiv = TestUtil.createDiv(1);
contentDiv.appendChild(TestUtil.createText(CONTENT_TEXT));
mBody.appendChild(contentDiv);
contentDiv = TestUtil.createDiv(2);
contentDiv.appendChild(TestUtil.createText(CONTENT_TEXT));
mBody.appendChild(contentDiv);
contentDiv = TestUtil.createDiv(3);
contentDiv.appendChild(TestUtil.createText(CONTENT_TEXT));
mBody.appendChild(contentDiv);
// Title hasn't been set yet, everything should be content.
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertTrue(extractedContent + " must contain 'content':" + CONTENT_TEXT,
extractedContent.contains(DomUtil.getInnerText(contentDiv)));
assertTrue(
extractedContent + " must contain 'title':" + TITLE_TEXT,
extractedContent.contains(DomUtil.getInnerText(titleDiv)));
// Now set the title and it should excluded from the content.
mHead.appendChild(TestUtil.createTitle(TITLE_TEXT));
extractor = new ContentExtractor(mRoot);
extractedContent = extractor.extractContent();
assertTrue(extractedContent + " must contain 'content':" + CONTENT_TEXT,
extractedContent.contains(DomUtil.getInnerText(contentDiv)));
assertFalse(
extractedContent + " must not contain 'title':" + TITLE_TEXT,
extractedContent.contains(DomUtil.getInnerText(titleDiv)));
}
public void testExtractsEssentialWhitespace() {
Element div = TestUtil.createDiv(0);
mBody.appendChild(div);
div.appendChild(TestUtil.createSpan(CONTENT_TEXT));
div.appendChild(TestUtil.createText(" "));
div.appendChild(TestUtil.createSpan(CONTENT_TEXT));
div.appendChild(TestUtil.createText("\n"));
div.appendChild(TestUtil.createSpan(CONTENT_TEXT));
div.appendChild(TestUtil.createText(" "));
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals("<div><span>" + CONTENT_TEXT + "</span> " +
"<span>" + CONTENT_TEXT + "</span>\n" +
"<span>" + CONTENT_TEXT + "</span> </div>",
TestUtil.removeAllDirAttributes(extractedContent));
}
public void testPrefersMarkupParserOverDocumentTitle() {
// Minimum fields for open-graph parser.
final String MARKUP_PARSER_TITLE = "title from markup parser";
createMeta("og:title", MARKUP_PARSER_TITLE);
createMeta("og:type", "video.movie");
createMeta("og:image", "http://test/image.jpeg");
createMeta("og:url", "http://test/test.html");
OpenGraphProtocolParserAccessor parser = new OpenGraphProtocolParserAccessor(mRoot);
assertTrue(parser != null);
assertEquals(MARKUP_PARSER_TITLE, parser.getTitle());
Document.get().setTitle(TITLE_TEXT);
ContentExtractor extractor = new ContentExtractor(mRoot);
assertEquals("OpenGraph title should be picked over document.title",
MARKUP_PARSER_TITLE, extractor.extractTitle());
}
public void testImage() {
// Test the absolute and different kinds of relative URLs for image sources,
// and also add an extra comma (,) as malformed srcset syntax for robustness.
// Also test images in WebImage and WebTable.
// TODO(wychen): add images in WebText when it is supported.
final String html =
"<h1>" + CONTENT_TEXT + "</h1>" +
"<img id=\"a\" style=\"typo\" align=\"left\" src=\"image\" srcset=\"image200 200w, //example.org/image400 400w\">" +
"<img id=\"b\" style=\"align: left\" alt=\"b\" data-dummy=\"c\" data-src=\"image2\">" +
"<table role=\"grid\"><tbody><tr><td>" +
"<img id=\"c\" style=\"a\" alt=\"b\" src=\"/image\" srcset=\"https://example.com/image2x 2x, /image4x 4x,\">" +
"<img id=\"d\" style=\"a\" align=\"left\" src=\"/image2\">" +
"</td></tr></tbody></table>" +
"<p>" + CONTENT_TEXT + "</p>";
final String expected =
"<h1>" + CONTENT_TEXT + "</h1>" +
"<img src=\"http://example.com/path/image\" " +
"srcset=\"http://example.com/path/image200 200w, http://example.org/image400 400w\">" +
"<img alt=\"b\" src=\"http://example.com/path/image2\">" +
"<table role=\"grid\"><tbody><tr><td>" +
"<img alt=\"b\" src=\"http://example.com/image\" " +
"srcset=\"https://example.com/image2x 2x, http://example.com/image4x 4x, \">" +
"<img src=\"http://example.com/image2\">" +
"</td></tr></tbody></table>" +
"<p>" + CONTENT_TEXT + "</p>";
mHead.setInnerHTML("<base href=\"http://example.com/path/\">");
mBody.setInnerHTML(html);
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals(expected,
TestUtil.removeAllDirAttributes(extractedContent));
}
private void createMeta(String property, String content) {
mHead.appendChild(TestUtil.createMetaProperty(property, content));
}
public void testRemoveFontColorAttributes() {
Element outerFontTag = Document.get().createElement("FONT");
outerFontTag.setAttribute("COLOR", "blue");
mBody.appendChild(outerFontTag);
String text = "<font color=\"red\">" + CONTENT_TEXT + "</font>";
outerFontTag.appendChild(TestUtil.createSpan(text));
outerFontTag.appendChild(TestUtil.createText(" "));
outerFontTag.appendChild(TestUtil.createSpan(text));
outerFontTag.appendChild(TestUtil.createText("\n"));
outerFontTag.appendChild(TestUtil.createSpan(text));
outerFontTag.appendChild(TestUtil.createText(" "));
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals("<font><span><font>" + CONTENT_TEXT + "</font></span> " +
"<span><font>" + CONTENT_TEXT + "</font></span>\n" +
"<span><font>" + CONTENT_TEXT + "</font></span> </font>",
TestUtil.removeAllDirAttributes(extractedContent));
}
public void testRemoveStyleAttributes() {
String html =
"<h1 style=\"font-weight: folder\">" +
CONTENT_TEXT +
"</h1>" +
"<p style=\"\">" +
CONTENT_TEXT +
"</p>" +
"<img style=\"align: left\" data-src=\"/test.png\">" +
"<table style=\"position: absolute\">" +
"<tbody style=\"font-size: 2\">" +
"<tr style=\"z-index: 0\">" +
"<th style=\"top: 0px\">" + CONTENT_TEXT +
"<img style=\"align: left\" src=\"/test.png\">" +
"</th>" +
"<th style=\"width: 20px\">" + CONTENT_TEXT + "</th>" +
"</tr><tr style=\"left: 0\">" +
"<td style=\"display: block\">" + CONTENT_TEXT + "</td>" +
"<td style=\"color: #123\">" + CONTENT_TEXT + "</td>" +
"</tr>" +
"</tbody>" +
"</table>";
final String expected =
"<h1>" +
CONTENT_TEXT +
"</h1>" +
"<p>" +
CONTENT_TEXT +
"</p>" +
"<img src=\"http://example.com/test.png\">" +
"<table>" +
"<tbody>" +
"<tr>" +
"<th>" + CONTENT_TEXT +
"<img src=\"http://example.com/test.png\">" +
"</th>" +
"<th>" + CONTENT_TEXT + "</th>" +
"</tr><tr>" +
"<td>" + CONTENT_TEXT + "</td>" +
"<td>" + CONTENT_TEXT + "</td>" +
"</tr>" +
"</tbody>" +
"</table>";
mHead.setInnerHTML("<base href=\"http://example.com/\">");
mBody.setInnerHTML(html);
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals(expected,
TestUtil.removeAllDirAttributes(extractedContent));
}
public void testKeepingWidthAndHeightAttributes() {
String html =
"<h1>" +
CONTENT_TEXT +
"</h1>" +
"<p>" +
CONTENT_TEXT +
"</p>" +
"<img style=\"align: left\" src=\"/test.png\" " +
"width=\"200\" height=\"300\">" +
"<img style=\"align: left\" src=\"/test.png\" " +
"width=\"200\">" +
"<img style=\"align: left\" src=\"/test.png\">";
final String expected =
"<h1>" +
CONTENT_TEXT +
"</h1>" +
"<p>" +
CONTENT_TEXT +
"</p>" +
"<img src=\"http://example.com/test.png\" " +
"width=\"200\" height=\"300\">" +
"<img src=\"http://example.com/test.png\" " +
"width=\"200\">" +
"<img src=\"http://example.com/test.png\">";
mHead.setInnerHTML("<base href=\"http://example.com/\">");
mBody.setInnerHTML(html);
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals(expected,
TestUtil.removeAllDirAttributes(extractedContent));
}
public void testPreserveOrderedList() {
Element outerListTag = Document.get().createElement("OL");
mBody.appendChild(outerListTag);
outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals("<OL>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"</OL>",
TestUtil.removeAllDirAttributes(extractedContent));
}
public void testPreserveOrderedListWithSpan() {
String html =
"<OL>" +
"<LI><span>" + CONTENT_TEXT + "</span></LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"</OL>";
mBody.setInnerHTML(html);
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals(html,
TestUtil.removeAllDirAttributes(extractedContent));
}
public void testPreserveNestedOrderedList() {
Element outerListTag = Document.get().createElement("OL");
Element outerListItem = Document.get().createElement("LI");
Element innerListTag = Document.get().createElement("OL");
innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
outerListItem.appendChild(innerListTag);
outerListTag.appendChild(outerListItem);
outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
mBody.appendChild(outerListTag);
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals("<OL>" +
"<LI>" +
"<OL>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"</OL>" +
"</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"</OL>",
TestUtil.removeAllDirAttributes(extractedContent));
}
public void testPreserveNestedOrderedListWithOtherElementsInside() {
Element outerListTag = Document.get().createElement("OL");
Element outerListItem = Document.get().createElement("LI");
outerListItem.appendChild(TestUtil.createText(CONTENT_TEXT));
outerListItem.appendChild(TestUtil.createParagraph(CONTENT_TEXT));
Element innerListTag = Document.get().createElement("OL");
innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
innerListTag.appendChild(TestUtil.createParagraph(""));
outerListItem.appendChild(innerListTag);
outerListTag.appendChild(outerListItem);
outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
outerListTag.appendChild(TestUtil.createParagraph(CONTENT_TEXT));
mBody.appendChild(outerListTag);
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals("<OL>" +
"<LI>" + CONTENT_TEXT +
"<p>" + CONTENT_TEXT + "</p>" +
"<OL>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"</OL>" +
"</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<p>" + CONTENT_TEXT + "</p>" +
"</OL>",
TestUtil.removeAllDirAttributes(extractedContent));
}
public void testPreserveUnorderedList() {
Element outerListTag = Document.get().createElement("UL");
mBody.appendChild(outerListTag);
outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals("<UL>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"</UL>",
TestUtil.removeAllDirAttributes(extractedContent));
}
public void testPreserveNestedUnorderedList() {
Element outerListTag = Document.get().createElement("UL");
Element outerListItem = Document.get().createElement("LI");
Element innerListTag = Document.get().createElement("UL");
innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
outerListItem.appendChild(innerListTag);
outerListTag.appendChild(outerListItem);
outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
mBody.appendChild(outerListTag);
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals("<UL>" +
"<LI>" +
"<UL>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"</UL>" +
"</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"</UL>",
TestUtil.removeAllDirAttributes(extractedContent));
}
public void testPreserveNestedUnorderedListWithOtherElementsInside() {
Element outerListTag = Document.get().createElement("UL");
Element outerListItem = Document.get().createElement("LI");
outerListItem.appendChild(TestUtil.createText(CONTENT_TEXT));
outerListItem.appendChild(TestUtil.createParagraph(CONTENT_TEXT));
Element innerListTag = Document.get().createElement("UL");
innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
innerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
innerListTag.appendChild(TestUtil.createParagraph(""));
outerListItem.appendChild(innerListTag);
outerListTag.appendChild(outerListItem);
outerListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
outerListTag.appendChild(TestUtil.createParagraph(CONTENT_TEXT));
mBody.appendChild(outerListTag);
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals("<UL>" +
"<LI>" + CONTENT_TEXT +
"<p>" + CONTENT_TEXT + "</p>" +
"<UL>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"</UL>" +
"</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<p>" + CONTENT_TEXT + "</p>" +
"</UL>",
TestUtil.removeAllDirAttributes(extractedContent));
}
public void testPreserveUnorderedListWithNestedOrderedList() {
Element unorderedListTag = Document.get().createElement("UL");
Element li = Document.get().createElement("LI");
Element orderedList = Document.get().createElement("OL");
orderedList.appendChild(TestUtil.createListItem(CONTENT_TEXT));
orderedList.appendChild(TestUtil.createListItem(CONTENT_TEXT));
li.appendChild(orderedList);
unorderedListTag.appendChild(li);
unorderedListTag.appendChild(TestUtil.createListItem(CONTENT_TEXT));
mBody.appendChild(unorderedListTag);
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals("<UL>" +
"<LI>" +
"<OL>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"</OL>" +
"</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"</UL>",
TestUtil.removeAllDirAttributes(extractedContent));
}
public void testMalformedListStructureWithExtraLITagEnd() {
Element unorderedListTag = Document.get().createElement("UL");
String html = "<LI>" + CONTENT_TEXT + "</LI></LI><LI>" + CONTENT_TEXT + "</LI>";
unorderedListTag.setInnerHTML(html);
mBody.appendChild(unorderedListTag);
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals("<UL>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"</UL>",
TestUtil.removeAllDirAttributes(extractedContent));
}
public void testMalformedListStructureWithExtraLITagStart() {
Element unorderedListTag = Document.get().createElement("OL");
String html = "<LI><LI>" + CONTENT_TEXT + "</LI><LI>" + CONTENT_TEXT + "</LI>";
unorderedListTag.setInnerHTML(html);
mBody.appendChild(unorderedListTag);
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals("<OL>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"</OL>",
TestUtil.removeAllDirAttributes(extractedContent));
}
public void testMalformedListStructureWithExtraOLTagStart() {
Element unorderedListTag = Document.get().createElement("OL");
String html = "<OL><LI>" + CONTENT_TEXT + "</LI><LI>" + CONTENT_TEXT + "</LI>";
unorderedListTag.setInnerHTML(html);
mBody.appendChild(unorderedListTag);
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals("<OL>" +
"<OL>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
"</OL>" +
"</OL>",
TestUtil.removeAllDirAttributes(extractedContent));
}
public void testMalformedListStructureWithoutLITag(){
Element orderedListTag = Document.get().createElement("OL");
String html = "<LI>" + CONTENT_TEXT + "</LI>" +
CONTENT_TEXT +
"<LI>" + CONTENT_TEXT + "</LI>";
orderedListTag.setInnerHTML(html);
mBody.appendChild(orderedListTag);
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals("<OL>" +
"<LI>" + CONTENT_TEXT + "</LI>" +
CONTENT_TEXT +
"<LI>" + CONTENT_TEXT + "</LI>" +
"</OL>" ,
TestUtil.removeAllDirAttributes(extractedContent));
}
public void testPreserveChildElementWithinBlockquote() {
Element blockquote = Document.get().createElement("BLOCKQUOTE");
mBody.appendChild(blockquote);
blockquote.appendChild(TestUtil.createParagraph(CONTENT_TEXT
+ CONTENT_TEXT + CONTENT_TEXT + CONTENT_TEXT));
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals("<BLOCKQUOTE>" +
"<p>" + CONTENT_TEXT + CONTENT_TEXT
+ CONTENT_TEXT + CONTENT_TEXT + "</p>" +
"</BLOCKQUOTE>",
TestUtil.removeAllDirAttributes(extractedContent));
}
public void testPreserveChildrenElementsWithinBlockquote() {
Element blockquote = Document.get().createElement("BLOCKQUOTE");
mBody.appendChild(blockquote);
blockquote.appendChild(TestUtil.createParagraph(CONTENT_TEXT));
blockquote.appendChild(TestUtil.createParagraph(CONTENT_TEXT));
blockquote.appendChild(TestUtil.createParagraph(CONTENT_TEXT));
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals("<BLOCKQUOTE>" +
"<p>" + CONTENT_TEXT + "</p>" +
"<p>" + CONTENT_TEXT + "</p>" +
"<p>" + CONTENT_TEXT + "</p>" +
"</BLOCKQUOTE>",
TestUtil.removeAllDirAttributes(extractedContent));
}
public void testDiscardBlockquoteWithoutContent() {
assertExtractor("", "<BLOCKQUOTE></BLOCKQUOTE>");
}
public void testPreservePre() {
final String article = CONTENT_TEXT + CONTENT_TEXT + CONTENT_TEXT;
final String html = "<h1>" + CONTENT_TEXT + "</h1><PRE><kbd>" + article + "</kbd></PRE>";
assertExtractor(html, html);
}
private void assertExtractor(String expected, String html) {
mBody.setInnerHTML(html);
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals(expected, TestUtil.removeAllDirAttributes(extractedContent));
}
public void testDropCap() {
String html =
"<h1>" +
CONTENT_TEXT +
"</h1>" +
"<p>" +
"<strong><span style=\"float: left\">T</span>est</strong>" +
CONTENT_TEXT +
"</p>";
final String expected =
"<h1>" +
CONTENT_TEXT +
"</h1>" +
"<p>" +
"<strong><span>T</span>est</strong>" +
CONTENT_TEXT +
"</p>";
mBody.setInnerHTML(html);
ContentExtractor extractor = new ContentExtractor(mRoot);
String extractedContent = extractor.extractContent();
assertEquals(expected,
TestUtil.removeAllDirAttributes(extractedContent));
}
public void testBlockyArticle() {
final String htmlArticle =
"<h1>" + CONTENT_TEXT + "</h1>" +
"<span>" + CONTENT_TEXT + "</span>" +
"<div><span>" + CONTENT_TEXT + "</span></div>" +
"<p><em>" + CONTENT_TEXT + "</em></p>" +
"<div><cite><span><span>" + CONTENT_TEXT + "</span></span></cite></div>" +
"<div><span>" + CONTENT_TEXT + "</span><span>" + CONTENT_TEXT + "</span></div>" +
"<main><span><blockquote><cite>" +
"<span><span>" + CONTENT_TEXT + "</span></span><span>" + CONTENT_TEXT + "</span>" +
"</cite></blockquote></span></main>";
final String expected =
"<h1>" + CONTENT_TEXT + "</h1>" +
"<span>" + CONTENT_TEXT + "</span>" +
"<div><span>" + CONTENT_TEXT + "</span></div>" +
"<p><em>" + CONTENT_TEXT + "</em></p>" +
"<div><cite><span><span>" + CONTENT_TEXT + "</span></span></cite></div>" +
"<div><span>" + CONTENT_TEXT + "</span><span>" + CONTENT_TEXT + "</span></div>" +
"<BLOCKQUOTE><cite>" +
"<span><span>" + CONTENT_TEXT + "</span></span><span>" + CONTENT_TEXT + "</span>" +
"</cite></BLOCKQUOTE>";
assertExtractor(expected, htmlArticle);
}
public void testSpanArticle() {
final String htmlArticle =
"<span>" + CONTENT_TEXT + "</span>" +
"<span>" + CONTENT_TEXT + "</span>" +
"<span>" + CONTENT_TEXT + "</span>";
final String expected = "<div>" + htmlArticle + "</div>";
assertExtractor(expected, htmlArticle);
}
}