Extract image URLs in srcset as well

BUG=625621
R=mdjones@chromium.org

Review URL: https://codereview.chromium.org/2203563002 .
diff --git a/java/org/chromium/distiller/ContentExtractor.java b/java/org/chromium/distiller/ContentExtractor.java
index 7d16a77..e5858b6 100644
--- a/java/org/chromium/distiller/ContentExtractor.java
+++ b/java/org/chromium/distiller/ContentExtractor.java
@@ -34,7 +34,7 @@
     private final TimingInfo mTimingInfo;
     private final StatisticsInfo mStatisticsInfo;
     private final MarkupParser parser;
-    private final List<String> imageUrls;
+    private List<String> imageUrls;
     private String textDirection;
 
     private class WebDocumentInfo {
@@ -47,7 +47,6 @@
         candidateTitles = new LinkedList<String>();
         mTimingInfo = TimingInfo.create();
         mStatisticsInfo = StatisticsInfo.create();
-        imageUrls = new ArrayList<String>();
 
         double startTime = DomUtil.getTime();
         parser = new MarkupParser(root, mTimingInfo);
@@ -96,16 +95,14 @@
         LeadImageFinder.process(documentInfo.document);
         NestedElementRetainer.process(documentInfo.document);
 
-        List<WebImage> images = documentInfo.document.getContentImages();
-        for (WebImage wi : images) {
-            imageUrls.add(wi.getSrc());
-        }
         mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now);
 
         now = DomUtil.getTime();
         String html = documentInfo.document.generateOutput(textOnly);
         mTimingInfo.setFormattingTime(DomUtil.getTime() - now);
 
+        imageUrls = documentInfo.document.getImageUrls();
+
         if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_TIMING_INFO)) {
             for (int i = 0; i < mTimingInfo.getOtherTimesCount(); i++) {
                 TimingEntry entry =  mTimingInfo.getOtherTimes(i);
diff --git a/java/org/chromium/distiller/DocumentTitleGetter.java b/java/org/chromium/distiller/DocumentTitleGetter.java
index edf59f5..cda8d0c 100644
--- a/java/org/chromium/distiller/DocumentTitleGetter.java
+++ b/java/org/chromium/distiller/DocumentTitleGetter.java
@@ -46,7 +46,7 @@
               currTitle = origTitle = DomUtil.javascriptTextContent(titles.getItem(0));
             }
         }
-        if (currTitle == "") return "";
+        if (currTitle.isEmpty()) return "";
 
         if (StringUtil.match(currTitle, " [\\|\\-] ")) {  // Title has '|' and/or '-'.
             // Get part before last '|' or '-'.
diff --git a/java/org/chromium/distiller/DomUtil.java b/java/org/chromium/distiller/DomUtil.java
index 3cca24b..1ff099b 100644
--- a/java/org/chromium/distiller/DomUtil.java
+++ b/java/org/chromium/distiller/DomUtil.java
@@ -189,7 +189,7 @@
      * @return A list of the provided node's parents.
      */
     public static List<Node> getParentNodes(Node n) {
-        ArrayList<Node> result = new ArrayList<Node>();
+        ArrayList<Node> result = new ArrayList<>();
         Node curr = n;
         while (curr != null) {
             result.add(curr);
@@ -329,7 +329,7 @@
 
     public static void makeSrcSetAbsolute(ImageElement ie) {
         String srcset = ie.getAttribute("srcset");
-        if (srcset == "") {
+        if (srcset.isEmpty()) {
             ie.removeAttribute("srcset");
             return;
         }
@@ -348,6 +348,23 @@
         ie.setSrc(oldsrc);
     }
 
+    public static List<String> getSrcSetUrls(ImageElement ie) {
+        List<String> list = new ArrayList<>();
+        String srcset = ie.getAttribute("srcset");
+        if (srcset.isEmpty()) {
+            return list;
+        }
+
+        String[] sizes = StringUtil.jsSplit(srcset, ",");
+        for(int i = 0; i < sizes.length; i++) {
+            String size = StringUtil.jsTrim(sizes[i]);
+            if (size.isEmpty()) continue;
+            String[] comp = size.split(" ");
+            list.add(comp[0]);
+        }
+        return list;
+    }
+
     public static void stripImageElements(Node root) {
         if (root.getNodeType() == Node.ELEMENT_NODE) {
             Element element = Element.as(root);
diff --git a/java/org/chromium/distiller/webdocument/WebDocument.java b/java/org/chromium/distiller/webdocument/WebDocument.java
index 021851b..0b8f482 100644
--- a/java/org/chromium/distiller/webdocument/WebDocument.java
+++ b/java/org/chromium/distiller/webdocument/WebDocument.java
@@ -41,11 +41,12 @@
         return elements;
     }
 
-    public List<WebImage> getContentImages() {
-        List<WebImage> images = new ArrayList<>();
+    public List<String> getImageUrls() {
+        List<String> images = new ArrayList<>();
         for (WebElement e : elements) {
-            if (e instanceof WebImage && e.getIsContent()) {
-                images.add((WebImage) e);
+            if (!e.getIsContent()) continue;
+            if (e instanceof WebImage) {
+                images.addAll(((WebImage) e).getUrlList());
             }
         }
         return images;
diff --git a/java/org/chromium/distiller/webdocument/WebImage.java b/java/org/chromium/distiller/webdocument/WebImage.java
index ceb3a40..d58f4e4 100644
--- a/java/org/chromium/distiller/webdocument/WebImage.java
+++ b/java/org/chromium/distiller/webdocument/WebImage.java
@@ -10,6 +10,9 @@
 
 import org.chromium.distiller.DomUtil;
 
+import java.util.ArrayList;
+import java.util.List;
+
 /**
  * WebImage represents an image in the WebDocument potentially needing extraction.
  */
@@ -22,6 +25,8 @@
     private int width;
     // The original height of the image in pixels.
     private int height;
+    // Cloned and processed element.
+    private ImageElement clonedImg;
 
     /**
      * Build an image element.
@@ -40,10 +45,7 @@
         }
     }
 
-    @Override
-    public String generateOutput(boolean textOnly) {
-        if (textOnly) return "";
-
+    private void cloneAndProcessNode() {
         ImageElement ie = ImageElement.as(Element.as(imgElement.cloneNode(false)));
         ie.setSrc(srcUrl);
         ie.setSrc(ie.getSrc());
@@ -56,8 +58,17 @@
         DomUtil.makeSrcSetAbsolute(ie);
         DomUtil.stripImageElement(ie);
 
+        clonedImg = ie;
+    }
+
+    @Override
+    public String generateOutput(boolean textOnly) {
+        if (textOnly) return "";
+        if (clonedImg == null) {
+            cloneAndProcessNode();
+        }
         Element container = Document.get().createDivElement();
-        container.appendChild(ie);
+        container.appendChild(clonedImg);
         return container.getInnerHTML();
     }
 
@@ -86,10 +97,17 @@
     }
 
     /**
-     * Get the source URL of this image.
-     * @return Source URL or an empty string.
+     * Get the list of source URLs of this image.
+     * It's more efficient to call after generateOutput().
+     * @return Source URLs or an empty List.
      */
-    public String getSrc() {
-        return srcUrl;
+    public List<String> getUrlList() {
+        if (clonedImg == null) {
+            cloneAndProcessNode();
+        }
+        List<String> list = new ArrayList<>();
+        list.add(srcUrl);
+        list.addAll(DomUtil.getSrcSetUrls(clonedImg));
+        return list;
     }
 }
diff --git a/javatests/org/chromium/distiller/DomUtilTest.java b/javatests/org/chromium/distiller/DomUtilTest.java
index 85e61cf..295e2fe 100644
--- a/javatests/org/chromium/distiller/DomUtilTest.java
+++ b/javatests/org/chromium/distiller/DomUtilTest.java
@@ -9,6 +9,7 @@
 import com.google.gwt.core.client.JsArray;
 import com.google.gwt.dom.client.Document;
 import com.google.gwt.dom.client.Element;
+import com.google.gwt.dom.client.ImageElement;
 import com.google.gwt.dom.client.Node;
 import com.google.gwt.dom.client.NodeList;
 
@@ -273,6 +274,18 @@
         assertEquals(expected, mBody.getInnerHTML());
     }
 
+    public void testGetSrcSetUrls() {
+        String html =
+            "<img src=\"http://example.com/image\" " +
+              "srcset=\"http://example.com/image200 200w, http://example.com/image400 400w\">";
+
+        mBody.setInnerHTML(html);
+        List<String> list = DomUtil.getSrcSetUrls((ImageElement) mBody.getChild(0));
+        assertEquals(2, list.size());
+        assertEquals("http://example.com/image200", list.get(0));
+        assertEquals("http://example.com/image400", list.get(1));
+    }
+
     public void testStripTableBackgroundColorAttributes() {
         String tableWithBGColorHTML =
             "<table bgcolor=\"red\">" +
diff --git a/javatests/org/chromium/distiller/webdocument/WebImageTest.java b/javatests/org/chromium/distiller/webdocument/WebImageTest.java
new file mode 100644
index 0000000..406db95
--- /dev/null
+++ b/javatests/org/chromium/distiller/webdocument/WebImageTest.java
@@ -0,0 +1,27 @@
+// Copyright 2016 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package org.chromium.distiller.webdocument;
+
+import org.chromium.distiller.DomDistillerJsTestCase;
+
+import java.util.List;
+
+import com.google.gwt.dom.client.Document;
+import com.google.gwt.dom.client.ImageElement;
+
+public class WebImageTest extends DomDistillerJsTestCase {
+    public void testGetSrcList() {
+        ImageElement img = Document.get().createImageElement();
+        img.setSrc("http://example.com/image");
+        img.setAttribute("srcset",
+                "http://example.com/image200 200w, http://example.com/image400 400w");
+        WebImage wi = new WebImage(img, 1, 1, img.getSrc());
+        List<String> urls = wi.getUrlList();
+        assertEquals(3, urls.size());
+        assertEquals("http://example.com/image", urls.get(0));
+        assertEquals("http://example.com/image200", urls.get(1));
+        assertEquals("http://example.com/image400", urls.get(2));
+    }
+}