Extract image URLs in srcset as well
BUG=625621
R=mdjones@chromium.org
Review URL: https://codereview.chromium.org/2203563002 .
diff --git a/java/org/chromium/distiller/ContentExtractor.java b/java/org/chromium/distiller/ContentExtractor.java
index 7d16a77..e5858b6 100644
--- a/java/org/chromium/distiller/ContentExtractor.java
+++ b/java/org/chromium/distiller/ContentExtractor.java
@@ -34,7 +34,7 @@
private final TimingInfo mTimingInfo;
private final StatisticsInfo mStatisticsInfo;
private final MarkupParser parser;
- private final List<String> imageUrls;
+ private List<String> imageUrls;
private String textDirection;
private class WebDocumentInfo {
@@ -47,7 +47,6 @@
candidateTitles = new LinkedList<String>();
mTimingInfo = TimingInfo.create();
mStatisticsInfo = StatisticsInfo.create();
- imageUrls = new ArrayList<String>();
double startTime = DomUtil.getTime();
parser = new MarkupParser(root, mTimingInfo);
@@ -96,16 +95,14 @@
LeadImageFinder.process(documentInfo.document);
NestedElementRetainer.process(documentInfo.document);
- List<WebImage> images = documentInfo.document.getContentImages();
- for (WebImage wi : images) {
- imageUrls.add(wi.getSrc());
- }
mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now);
now = DomUtil.getTime();
String html = documentInfo.document.generateOutput(textOnly);
mTimingInfo.setFormattingTime(DomUtil.getTime() - now);
+ imageUrls = documentInfo.document.getImageUrls();
+
if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_TIMING_INFO)) {
for (int i = 0; i < mTimingInfo.getOtherTimesCount(); i++) {
TimingEntry entry = mTimingInfo.getOtherTimes(i);
diff --git a/java/org/chromium/distiller/DocumentTitleGetter.java b/java/org/chromium/distiller/DocumentTitleGetter.java
index edf59f5..cda8d0c 100644
--- a/java/org/chromium/distiller/DocumentTitleGetter.java
+++ b/java/org/chromium/distiller/DocumentTitleGetter.java
@@ -46,7 +46,7 @@
currTitle = origTitle = DomUtil.javascriptTextContent(titles.getItem(0));
}
}
- if (currTitle == "") return "";
+ if (currTitle.isEmpty()) return "";
if (StringUtil.match(currTitle, " [\\|\\-] ")) { // Title has '|' and/or '-'.
// Get part before last '|' or '-'.
diff --git a/java/org/chromium/distiller/DomUtil.java b/java/org/chromium/distiller/DomUtil.java
index 3cca24b..1ff099b 100644
--- a/java/org/chromium/distiller/DomUtil.java
+++ b/java/org/chromium/distiller/DomUtil.java
@@ -189,7 +189,7 @@
* @return A list of the provided node's parents.
*/
public static List<Node> getParentNodes(Node n) {
- ArrayList<Node> result = new ArrayList<Node>();
+ ArrayList<Node> result = new ArrayList<>();
Node curr = n;
while (curr != null) {
result.add(curr);
@@ -329,7 +329,7 @@
public static void makeSrcSetAbsolute(ImageElement ie) {
String srcset = ie.getAttribute("srcset");
- if (srcset == "") {
+ if (srcset.isEmpty()) {
ie.removeAttribute("srcset");
return;
}
@@ -348,6 +348,23 @@
ie.setSrc(oldsrc);
}
+ public static List<String> getSrcSetUrls(ImageElement ie) {
+ List<String> list = new ArrayList<>();
+ String srcset = ie.getAttribute("srcset");
+ if (srcset.isEmpty()) {
+ return list;
+ }
+
+ String[] sizes = StringUtil.jsSplit(srcset, ",");
+ for(int i = 0; i < sizes.length; i++) {
+ String size = StringUtil.jsTrim(sizes[i]);
+ if (size.isEmpty()) continue;
+ String[] comp = size.split(" ");
+ list.add(comp[0]);
+ }
+ return list;
+ }
+
public static void stripImageElements(Node root) {
if (root.getNodeType() == Node.ELEMENT_NODE) {
Element element = Element.as(root);
diff --git a/java/org/chromium/distiller/webdocument/WebDocument.java b/java/org/chromium/distiller/webdocument/WebDocument.java
index 021851b..0b8f482 100644
--- a/java/org/chromium/distiller/webdocument/WebDocument.java
+++ b/java/org/chromium/distiller/webdocument/WebDocument.java
@@ -41,11 +41,12 @@
return elements;
}
- public List<WebImage> getContentImages() {
- List<WebImage> images = new ArrayList<>();
+ public List<String> getImageUrls() {
+ List<String> images = new ArrayList<>();
for (WebElement e : elements) {
- if (e instanceof WebImage && e.getIsContent()) {
- images.add((WebImage) e);
+ if (!e.getIsContent()) continue;
+ if (e instanceof WebImage) {
+ images.addAll(((WebImage) e).getUrlList());
}
}
return images;
diff --git a/java/org/chromium/distiller/webdocument/WebImage.java b/java/org/chromium/distiller/webdocument/WebImage.java
index ceb3a40..d58f4e4 100644
--- a/java/org/chromium/distiller/webdocument/WebImage.java
+++ b/java/org/chromium/distiller/webdocument/WebImage.java
@@ -10,6 +10,9 @@
import org.chromium.distiller.DomUtil;
+import java.util.ArrayList;
+import java.util.List;
+
/**
* WebImage represents an image in the WebDocument potentially needing extraction.
*/
@@ -22,6 +25,8 @@
private int width;
// The original height of the image in pixels.
private int height;
+ // Cloned and processed element.
+ private ImageElement clonedImg;
/**
* Build an image element.
@@ -40,10 +45,7 @@
}
}
- @Override
- public String generateOutput(boolean textOnly) {
- if (textOnly) return "";
-
+ private void cloneAndProcessNode() {
ImageElement ie = ImageElement.as(Element.as(imgElement.cloneNode(false)));
ie.setSrc(srcUrl);
ie.setSrc(ie.getSrc());
@@ -56,8 +58,17 @@
DomUtil.makeSrcSetAbsolute(ie);
DomUtil.stripImageElement(ie);
+ clonedImg = ie;
+ }
+
+ @Override
+ public String generateOutput(boolean textOnly) {
+ if (textOnly) return "";
+ if (clonedImg == null) {
+ cloneAndProcessNode();
+ }
Element container = Document.get().createDivElement();
- container.appendChild(ie);
+ container.appendChild(clonedImg);
return container.getInnerHTML();
}
@@ -86,10 +97,17 @@
}
/**
- * Get the source URL of this image.
- * @return Source URL or an empty string.
+ * Get the list of source URLs of this image.
+ * It's more efficient to call after generateOutput().
+ * @return Source URLs or an empty List.
*/
- public String getSrc() {
- return srcUrl;
+ public List<String> getUrlList() {
+ if (clonedImg == null) {
+ cloneAndProcessNode();
+ }
+ List<String> list = new ArrayList<>();
+ list.add(srcUrl);
+ list.addAll(DomUtil.getSrcSetUrls(clonedImg));
+ return list;
}
}
diff --git a/javatests/org/chromium/distiller/DomUtilTest.java b/javatests/org/chromium/distiller/DomUtilTest.java
index 85e61cf..295e2fe 100644
--- a/javatests/org/chromium/distiller/DomUtilTest.java
+++ b/javatests/org/chromium/distiller/DomUtilTest.java
@@ -9,6 +9,7 @@
import com.google.gwt.core.client.JsArray;
import com.google.gwt.dom.client.Document;
import com.google.gwt.dom.client.Element;
+import com.google.gwt.dom.client.ImageElement;
import com.google.gwt.dom.client.Node;
import com.google.gwt.dom.client.NodeList;
@@ -273,6 +274,18 @@
assertEquals(expected, mBody.getInnerHTML());
}
+ public void testGetSrcSetUrls() {
+ String html =
+ "<img src=\"http://example.com/image\" " +
+ "srcset=\"http://example.com/image200 200w, http://example.com/image400 400w\">";
+
+ mBody.setInnerHTML(html);
+ List<String> list = DomUtil.getSrcSetUrls((ImageElement) mBody.getChild(0));
+ assertEquals(2, list.size());
+ assertEquals("http://example.com/image200", list.get(0));
+ assertEquals("http://example.com/image400", list.get(1));
+ }
+
public void testStripTableBackgroundColorAttributes() {
String tableWithBGColorHTML =
"<table bgcolor=\"red\">" +
diff --git a/javatests/org/chromium/distiller/webdocument/WebImageTest.java b/javatests/org/chromium/distiller/webdocument/WebImageTest.java
new file mode 100644
index 0000000..406db95
--- /dev/null
+++ b/javatests/org/chromium/distiller/webdocument/WebImageTest.java
@@ -0,0 +1,27 @@
+// Copyright 2016 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package org.chromium.distiller.webdocument;
+
+import org.chromium.distiller.DomDistillerJsTestCase;
+
+import java.util.List;
+
+import com.google.gwt.dom.client.Document;
+import com.google.gwt.dom.client.ImageElement;
+
+public class WebImageTest extends DomDistillerJsTestCase {
+ public void testGetSrcList() {
+ ImageElement img = Document.get().createImageElement();
+ img.setSrc("http://example.com/image");
+ img.setAttribute("srcset",
+ "http://example.com/image200 200w, http://example.com/image400 400w");
+ WebImage wi = new WebImage(img, 1, 1, img.getSrc());
+ List<String> urls = wi.getUrlList();
+ assertEquals(3, urls.size());
+ assertEquals("http://example.com/image", urls.get(0));
+ assertEquals("http://example.com/image200", urls.get(1));
+ assertEquals("http://example.com/image400", urls.get(2));
+ }
+}