Support extraction of lazily-loaded images

BUG=481111
R=mdjones@chromium.org

Review URL: https://codereview.chromium.org/2000093005 .
diff --git a/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java b/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java
index c9b527a..5b4eb00 100644
--- a/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java
+++ b/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java
@@ -6,6 +6,7 @@
 
 import com.google.gwt.dom.client.Element;
 import com.google.gwt.dom.client.ImageElement;
+import org.chromium.distiller.LogUtil;
 import org.chromium.distiller.webdocument.WebImage;
 
 import java.util.HashSet;
@@ -21,6 +22,8 @@
         // TODO(mdjones): Add "DIV" to this list for css images and possibly captions.
         relevantTags.add("IMG");
     }
+    private static final String[] LAZY_IMAGE_ATTRIBUTES =
+        {"data-src", "data-original", "datasrc", "data-url"};
 
     @Override
     public Set<String> getRelevantTagNames() {
@@ -41,13 +44,29 @@
             // This will get the absolute URL of the image and
             // the displayed image dimension.
             ImageElement imageElement = ImageElement.as(e);
-            imgSrc = imageElement.getSrc();
-            // As an ImageElement is manipulated here, it is possible
-            // to get the real dimensions.
-            width = imageElement.getWidth();
-            height = imageElement.getHeight();
+            // Try to get lazily-loaded images before falling back to get the src attribute.
+            for(String attr: LAZY_IMAGE_ATTRIBUTES) {
+                imgSrc = imageElement.getAttribute(attr);
+                if (!imgSrc.isEmpty())
+                    break;
+            }
+            if (!imgSrc.isEmpty()) {
+                // We cannot trust the dimension if the image is not loaded yet.
+                // In some cases there are 1x1 placeholder images.
+                width = 0;
+                height = 0;
+            } else {
+                imgSrc = imageElement.getSrc();
+                // As an ImageElement is manipulated here, it is possible
+                // to get the real dimensions.
+                width = imageElement.getWidth();
+                height = imageElement.getHeight();
+            }
         }
 
+        if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_VISIBILITY_INFO)) {
+            LogUtil.logToConsole("Extracted WebImage: " + imgSrc);
+        }
         return new WebImage(e, width, height, imgSrc);
     }
 }
diff --git a/java/org/chromium/distiller/webdocument/WebImage.java b/java/org/chromium/distiller/webdocument/WebImage.java
index 0c14d46..ceb3a40 100644
--- a/java/org/chromium/distiller/webdocument/WebImage.java
+++ b/java/org/chromium/distiller/webdocument/WebImage.java
@@ -45,6 +45,7 @@
         if (textOnly) return "";
 
         ImageElement ie = ImageElement.as(Element.as(imgElement.cloneNode(false)));
+        ie.setSrc(srcUrl);
         ie.setSrc(ie.getSrc());
         // If computed width or height is zero, do not override them
         // to keep them visible.
diff --git a/javatests/org/chromium/distiller/ContentExtractorTest.java b/javatests/org/chromium/distiller/ContentExtractorTest.java
index ddc97e3..c1457c6 100644
--- a/javatests/org/chromium/distiller/ContentExtractorTest.java
+++ b/javatests/org/chromium/distiller/ContentExtractorTest.java
@@ -94,7 +94,7 @@
         final String html =
             "<h1>" + CONTENT_TEXT + "</h1>" +
             "<img id=\"a\" style=\"typo\" align=\"left\" src=\"image\" srcset=\"image200 200w, //example.org/image400 400w\">" +
-            "<img id=\"b\" style=\"align: left\" alt=\"b\" data-dummy=\"c\" src=\"image2\">" +
+            "<img id=\"b\" style=\"align: left\" alt=\"b\" data-dummy=\"c\" data-src=\"image2\">" +
             "<table role=\"grid\"><tbody><tr><td>" +
                 "<img id=\"c\" style=\"a\" alt=\"b\" src=\"/image\" srcset=\"https://example.com/image2x 2x, /image4x 4x,\">" +
                 "<img id=\"d\" style=\"a\" align=\"left\" src=\"/image2\">" +
@@ -157,7 +157,7 @@
             "<p style=\"\">" +
                 CONTENT_TEXT +
             "</p>" +
-            "<img style=\"align: left\" src=\"/test.png\">" +
+            "<img style=\"align: left\" data-src=\"/test.png\">" +
             "<table style=\"position: absolute\">" +
                 "<tbody style=\"font-size: 2\">" +
                     "<tr style=\"z-index: 0\">" +
diff --git a/javatests/org/chromium/distiller/EmbedExtractorTest.java b/javatests/org/chromium/distiller/EmbedExtractorTest.java
index 5c23f0c..2b7a2f8 100644
--- a/javatests/org/chromium/distiller/EmbedExtractorTest.java
+++ b/javatests/org/chromium/distiller/EmbedExtractorTest.java
@@ -380,4 +380,24 @@
         assertEquals(38, result.getHeight());
         assertEquals(38, result.getWidth());
     }
+
+    private void extractLazilyLoadedImage(String attr) {
+        // Put the candidate URL under the lazy-load attribute being exercised.
+        ImageElement lazyImg = TestUtil.createImage();
+        lazyImg.setAttribute(attr, "image.png");
+        mBody.appendChild(lazyImg);
+        // The expected output below assumes relative URLs resolve against this base.
+        mHead.setInnerHTML("<base href=\"http://example.com/\">");
+        EmbedExtractor imageExtractor = new ImageExtractor();
+        WebImage extracted = (WebImage) imageExtractor.extract(lazyImg);
+        assertNotNull(extracted);
+        assertEquals("<img src=\"http://example.com/image.png\">", extracted.generateOutput(false));
+    }
+
+    public void testImageExtractorLazy() {
+        String[] lazyAttrs = {"data-src", "datasrc", "data-original", "data-url"};
+        for (String lazyAttr : lazyAttrs) {  // one extraction per lazy-load attribute
+            extractLazilyLoadedImage(lazyAttr);
+        }
+    }
 }