blob: eb6c9da99ec7d3caed96a0d4ed1dd946081ba257 [file] [log] [blame]
// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "content/renderer/savable_resources.h"
#include <set>
#include "base/compiler_specific.h"
#include "base/logging.h"
#include "base/strings/string_util.h"
#include "content/renderer/web_frame_utils.h"
#include "third_party/WebKit/public/platform/WebString.h"
#include "third_party/WebKit/public/platform/WebVector.h"
#include "third_party/WebKit/public/web/WebDocument.h"
#include "third_party/WebKit/public/web/WebElement.h"
#include "third_party/WebKit/public/web/WebElementCollection.h"
#include "third_party/WebKit/public/web/WebInputElement.h"
#include "third_party/WebKit/public/web/WebLocalFrame.h"
#include "third_party/WebKit/public/web/WebNode.h"
#include "third_party/WebKit/public/web/WebView.h"
using blink::WebDocument;
using blink::WebElement;
using blink::WebElementCollection;
using blink::WebFrame;
using blink::WebInputElement;
using blink::WebLocalFrame;
using blink::WebNode;
using blink::WebString;
using blink::WebVector;
using blink::WebView;
namespace content {
namespace {
// Returns |true| if |web_frame| contains (or should be assumed to contain)
// a html document.
bool DoesFrameContainHtmlDocument(const WebFrame& web_frame,
const WebElement& element) {
if (web_frame.isWebLocalFrame()) {
WebDocument doc = web_frame.document();
return doc.isHTMLDocument() || doc.isXHTMLDocument();
// Cannot inspect contents of a remote frame, so we use a heuristic:
// Assume that <iframe> and <frame> elements contain a html document,
// and other elements (i.e. <object>) contain plugins or other resources.
// If the heuristic is wrong (i.e. the remote frame in <object> does
// contain an html document), then things will still work, but with the
// following caveats: 1) original frame content will be saved and 2) links
// in frame's html doc will not be rewritten to point to locally saved
// files.
return element.hasHTMLTagName("iframe") || element.hasHTMLTagName("frame");
// If present and valid, then push the link associated with |element|
// into either SavableResourcesResult::subframes or
// SavableResourcesResult::resources_list.
void GetSavableResourceLinkForElement(
const WebElement& element,
const WebDocument& current_doc,
SavableResourcesResult* result) {
// Check whether the node has sub resource URL or not.
WebString value = GetSubResourceLinkFromElement(element);
if (value.isNull())
// Get absolute URL.
GURL element_url = current_doc.completeURL(value);
// See whether to report this element as a subframe.
WebFrame* web_frame = WebFrame::fromFrameOwnerElement(element);
if (web_frame && DoesFrameContainHtmlDocument(*web_frame, element)) {
SavableSubframe subframe;
subframe.original_url = element_url;
subframe.routing_id = GetRoutingIdForFrameOrProxy(web_frame);
// Ignore invalid URL.
if (!element_url.is_valid())
// Ignore those URLs which are not standard protocols. Because FTP
// protocol does no have cache mechanism, we will skip all
// sub-resources if they use FTP protocol.
if (!element_url.SchemeIsHTTPOrHTTPS() &&
} // namespace
bool GetSavableResourceLinksForFrame(WebFrame* current_frame,
SavableResourcesResult* result,
const char** savable_schemes) {
// Get current frame's URL.
GURL current_frame_url = current_frame->document().url();
// If url of current frame is invalid, ignore it.
if (!current_frame_url.is_valid())
return false;
// If url of current frame is not a savable protocol, ignore it.
bool is_valid_protocol = false;
for (int i = 0; savable_schemes[i] != NULL; ++i) {
if (current_frame_url.SchemeIs(savable_schemes[i])) {
is_valid_protocol = true;
if (!is_valid_protocol)
return false;
// Get current using document.
WebDocument current_doc = current_frame->document();
// Go through all descent nodes.
WebElementCollection all = current_doc.all();
// Go through all elements in this frame.
for (WebElement element = all.firstItem(); !element.isNull();
element = all.nextItem()) {
return true;
WebString GetSubResourceLinkFromElement(const WebElement& element) {
const char* attribute_name = NULL;
if (element.hasHTMLTagName("img") ||
element.hasHTMLTagName("frame") ||
element.hasHTMLTagName("iframe") ||
element.hasHTMLTagName("script")) {
attribute_name = "src";
} else if (element.hasHTMLTagName("input")) {
const WebInputElement input = element.toConst<WebInputElement>();
if (input.isImageButton()) {
attribute_name = "src";
} else if (element.hasHTMLTagName("body") ||
element.hasHTMLTagName("table") ||
element.hasHTMLTagName("tr") ||
element.hasHTMLTagName("td")) {
attribute_name = "background";
} else if (element.hasHTMLTagName("blockquote") ||
element.hasHTMLTagName("q") ||
element.hasHTMLTagName("del") ||
element.hasHTMLTagName("ins")) {
attribute_name = "cite";
} else if (element.hasHTMLTagName("object")) {
attribute_name = "data";
} else if (element.hasHTMLTagName("link")) {
// If the link element is not linked to css, ignore it.
if (base::LowerCaseEqualsASCII(
base::StringPiece16(element.getAttribute("type")), "text/css") ||
base::StringPiece16(element.getAttribute("rel")), "stylesheet")) {
// TODO(jnd): Add support for extracting links of sub-resources which
// are inside style-sheet such as @import, url(), etc.
// See bug: http://b/issue?id=1111667.
attribute_name = "href";
if (!attribute_name)
return WebString();
WebString value = element.getAttribute(WebString::fromUTF8(attribute_name));
// If value has content and not start with "javascript:" then return it,
// otherwise return NULL.
if (!value.isNull() && !value.isEmpty() &&
!base::StartsWith(value.utf8(), "javascript:",
return value;
return WebString();
} // namespace content