Handle image lazy loading on Wikipedia
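Wikipedia uses a <span class="lazy-image-placeholder"> as the placeholder for lazily loaded images, so build the image from the span's data-src, data-srcset, data-width and data-height attributes.
Hopefully Wikipedia will eventually use <img> instead of <span> as the placeholder.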
Score changes (as of @150267738):
https://x20web.corp.google.com/~wychen/domdistillerscore/wiki-lazy/top-mobile-mhtml.html
3 image entries changed (all Wikipedia pages):
Precision remains 1.0
Recall 0.077 → 0.538
F1 0.143 → 0.700
Performance changes:
distillable-desktop-mhtml is 3.5% slower.
BUG=647667
R=mdjones@chromium.org
Review-Url: https://codereview.chromium.org/2729143003 .
diff --git a/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java b/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java
index b1d6e5b..a615b73 100644
--- a/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java
+++ b/java/org/chromium/distiller/extractors/embeds/ImageExtractor.java
@@ -9,6 +9,7 @@
import com.google.gwt.dom.client.ImageElement;
import com.google.gwt.dom.client.NodeList;
import org.chromium.distiller.DomUtil;
+import org.chromium.distiller.JavaScript;
import org.chromium.distiller.LogUtil;
import org.chromium.distiller.webdocument.WebFigure;
import org.chromium.distiller.webdocument.WebImage;
@@ -31,6 +32,7 @@
relevantTags.add("IMG");
relevantTags.add("PICTURE");
relevantTags.add("FIGURE");
+ relevantTags.add("SPAN");
}
private static final String[] LAZY_IMAGE_ATTRIBUTES =
@@ -75,6 +77,19 @@
return new WebFigure(img, width, height, imgSrc, figcaption);
}
+ if ("SPAN".equals(e.getTagName())) {
+ if (!e.getAttribute("class").contains("lazy-image-placeholder")) {
+ return null;
+ }
+ // Image lazy loading on Wikipedia.
+ ie = Document.get().createImageElement();
+ imgSrc = e.getAttribute("data-src");
+ width = JavaScript.parseInt(e.getAttribute("data-width"));
+ height = JavaScript.parseInt(e.getAttribute("data-height"));
+ ie.setAttribute("srcset", e.getAttribute("data-srcset"));
+ return new WebImage(ie, width, height, imgSrc);
+ }
+
extractImageAttributes(ie);
return new WebImage(e, width, height, imgSrc);
}
diff --git a/javatests/org/chromium/distiller/ContentExtractorTest.java b/javatests/org/chromium/distiller/ContentExtractorTest.java
index c72e56c..654f6cb 100644
--- a/javatests/org/chromium/distiller/ContentExtractorTest.java
+++ b/javatests/org/chromium/distiller/ContentExtractorTest.java
@@ -99,6 +99,8 @@
"<source srcset=\"image100 100w, //example.org/image300 300w\">" +
"<img>" +
"</picture></figure>" +
+ "<span class=\"lazy-image-placeholder\" data-src=\"/image\" " +
+ "data-srcset=\"/image2x 2x\" data-width=\"20\" data-height=\"10\"></span>" +
"<img id=\"b\" style=\"align: left\" alt=\"b\" data-dummy=\"c\" data-src=\"image2\">" +
"<table role=\"grid\"><tbody><tr><td>" +
"<img id=\"c\" style=\"a\" alt=\"b\" src=\"/image\" srcset=\"https://example.com/image2x 2x, /image4x 4x,\">" +
@@ -115,6 +117,8 @@
"<source srcset=\"http://example.com/path/image100 100w, http://example.org/image300 300w\">" +
"<img>" +
"</picture></figure>" +
+ "<img srcset=\"http://example.com/image2x 2x\" src=\"http://example.com/image\" " +
+ "width=\"20\" height=\"10\">" +
"<img alt=\"b\" src=\"http://example.com/path/image2\">" +
"<table role=\"grid\"><tbody><tr><td>" +
"<img alt=\"b\" src=\"http://example.com/image\" " +