// blob: 5ae9b75c24659cbfe022424e8fb66b0041688498 [file] [log] [blame]
/*
* Copyright (C) 2009 Google Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "public/web/WebFrameSerializer.h"
#include "core/HTMLNames.h"
#include "core/InputTypeNames.h"
#include "core/dom/Document.h"
#include "core/dom/Element.h"
#include "core/frame/Frame.h"
#include "core/frame/FrameSerializer.h"
#include "core/frame/LocalFrame.h"
#include "core/frame/RemoteFrame.h"
#include "core/frame/WebLocalFrameBase.h"
#include "core/html/HTMLAllCollection.h"
#include "core/html/HTMLFrameElementBase.h"
#include "core/html/HTMLFrameOwnerElement.h"
#include "core/html/HTMLImageElement.h"
#include "core/html/HTMLInputElement.h"
#include "core/html/HTMLTableElement.h"
#include "core/layout/LayoutBox.h"
#include "core/loader/DocumentLoader.h"
#include "platform/Histogram.h"
#include "platform/SerializedResource.h"
#include "platform/SharedBuffer.h"
#include "platform/instrumentation/tracing/TraceEvent.h"
#include "platform/loader/fetch/ResourceRequest.h"
#include "platform/loader/fetch/ResourceResponse.h"
#include "platform/mhtml/MHTMLArchive.h"
#include "platform/mhtml/MHTMLParser.h"
#include "platform/weborigin/KURL.h"
#include "platform/wtf/Assertions.h"
#include "platform/wtf/Deque.h"
#include "platform/wtf/HashMap.h"
#include "platform/wtf/HashSet.h"
#include "platform/wtf/Noncopyable.h"
#include "platform/wtf/Vector.h"
#include "platform/wtf/text/StringConcatenate.h"
#include "platform/wtf/text/StringView.h"
#include "public/platform/WebString.h"
#include "public/platform/WebURL.h"
#include "public/platform/WebURLResponse.h"
#include "public/platform/WebVector.h"
#include "public/web/WebDataSource.h"
#include "public/web/WebDocument.h"
#include "public/web/WebFrame.h"
#include "public/web/WebFrameSerializerCacheControlPolicy.h"
#include "public/web/WebFrameSerializerClient.h"
#include "web/WebFrameSerializerImpl.h"
#include "web/WebRemoteFrameImpl.h"
namespace blink {
namespace {
// Minimum z-index an element must have before it is considered a candidate
// popup overlay; elements whose z-index is below this value are never skipped
// by the popup overlay heuristic.
constexpr int kPopupOverlayZIndexThreshold = 50;
// FrameSerializer::Delegate implementation used for MHTML generation. It
// answers the serializer's questions (which elements, attributes and resources
// to drop, how to rewrite sub-frame links, which extra attributes to emit) and
// forwards embedder-specific decisions to the wrapped
// WebFrameSerializer::MHTMLPartsGenerationDelegate.
class MHTMLFrameSerializerDelegate final : public FrameSerializer::Delegate {
  WTF_MAKE_NONCOPYABLE(MHTMLFrameSerializerDelegate);

 public:
  // |web_delegate| is held by reference, so it must outlive this object.
  explicit MHTMLFrameSerializerDelegate(
      WebFrameSerializer::MHTMLPartsGenerationDelegate&);
  ~MHTMLFrameSerializerDelegate() override;

  // FrameSerializer::Delegate overrides.
  bool ShouldIgnoreElement(const Element&) override;
  bool ShouldIgnoreAttribute(const Element&, const Attribute&) override;
  bool RewriteLink(const Element&, String& rewritten_link) override;
  bool ShouldSkipResourceWithURL(const KURL&) override;
  bool ShouldSkipResource(
      FrameSerializer::ResourceHasCacheControlNoStoreHeader) override;
  Vector<Attribute> GetCustomAttributes(const Element&) override;

 private:
  // Individual reasons for which ShouldIgnoreElement() drops an element.
  bool ShouldIgnoreHiddenElement(const Element&);
  bool ShouldIgnoreMetaElement(const Element&);
  bool ShouldIgnorePopupOverlayElement(const Element&);

  // Per-element-kind helpers for GetCustomAttributes(); each appends to the
  // supplied vector.
  void GetCustomAttributesForImageElement(const HTMLImageElement&,
                                          Vector<Attribute>*);
  void GetCustomAttributesForFormControlElement(const Element&,
                                                Vector<Attribute>*);

  // Embedder-side delegate consulted for policy decisions and content IDs.
  WebFrameSerializer::MHTMLPartsGenerationDelegate& web_delegate_;
  // Set once any element has been dropped as a popup overlay; reported to UMA
  // on destruction.
  bool popup_overlays_skipped_;
};
// Wraps the embedder's |web_delegate|. No popup overlay has been skipped at
// construction time.
MHTMLFrameSerializerDelegate::MHTMLFrameSerializerDelegate(
    WebFrameSerializer::MHTMLPartsGenerationDelegate& web_delegate)
    : web_delegate_(web_delegate),
      popup_overlays_skipped_(false) {
}
// Records, when popup overlay removal is enabled, whether this serialization
// pass actually skipped at least one popup overlay element.
MHTMLFrameSerializerDelegate::~MHTMLFrameSerializerDelegate() {
  // Nothing to report when the embedder did not ask for overlay removal.
  if (!web_delegate_.RemovePopupOverlay())
    return;

  UMA_HISTOGRAM_BOOLEAN("PageSerialization.MhtmlGeneration.PopupOverlaySkipped",
                        popup_overlays_skipped_);
}
// Returns true when |element| must be left out of the serialized output:
// hidden elements, CSP <meta> elements and, if the embedder enabled popup
// overlay removal, elements that look like a popup overlay. The checks are
// evaluated in that order and stop at the first match.
bool MHTMLFrameSerializerDelegate::ShouldIgnoreElement(const Element& element) {
  return ShouldIgnoreHiddenElement(element) ||
         ShouldIgnoreMetaElement(element) ||
         (web_delegate_.RemovePopupOverlay() &&
          ShouldIgnorePopupOverlayElement(element));
}
// Returns true for elements carrying the |hidden| attribute and for
// <input type="hidden"> form controls.
bool MHTMLFrameSerializerDelegate::ShouldIgnoreHiddenElement(
    const Element& element) {
  // Anything explicitly marked hidden is dropped.
  if (element.FastHasAttribute(HTMLNames::hiddenAttr))
    return true;

  // Otherwise only hidden form inputs are dropped.
  if (!isHTMLInputElement(element))
    return false;
  return toHTMLInputElement(&element)->type() == InputTypeNames::hidden;
}
bool MHTMLFrameSerializerDelegate::ShouldIgnoreMetaElement(
const Element& element) {
// Do not include meta elements that declare Content-Security-Policy
// directives. They should have already been enforced when the original
// document is loaded. Since only the rendered resources are encapsulated in
// the saved MHTML page, there is no need to carry the directives. If they
// are still kept in the MHTML, child frames that are referred to using cid:
// scheme could be prevented from loading.
if (!isHTMLMetaElement(element))
return false;
if (!element.FastHasAttribute(HTMLNames::contentAttr))
return false;
const AtomicString& http_equiv =
element.FastGetAttribute(HTMLNames::http_equivAttr);
return http_equiv == "Content-Security-Policy";
}
bool MHTMLFrameSerializerDelegate::ShouldIgnorePopupOverlayElement(
const Element& element) {
// The element should be visible.
LayoutBox* box = element.GetLayoutBox();
if (!box)
return false;
// The bounding box of the element should contain center point of the
// viewport.
LocalDOMWindow* window = element.GetDocument().domWindow();
DCHECK(window);
LayoutPoint center_point(window->innerWidth() / 2, window->innerHeight() / 2);
if (!box->FrameRect().Contains(center_point))
return false;
// The z-index should be greater than the threshold.
if (box->Style()->ZIndex() < kPopupOverlayZIndexThreshold)
return false;
popup_overlays_skipped_ = true;
return true;
}
bool MHTMLFrameSerializerDelegate::ShouldIgnoreAttribute(
const Element& element,
const Attribute& attribute) {
// TODO(fgorski): Presence of srcset attribute causes MHTML to not display
// images, as only the value of src is pulled into the archive. Discarding
// srcset prevents the problem. Long term we should make sure to MHTML plays
// nicely with srcset.
if (attribute.LocalName() == HTMLNames::srcsetAttr)
return true;
// Do not save ping attribute since anyway the ping will be blocked from
// MHTML.
if (isHTMLAnchorElement(element) &&
attribute.LocalName() == HTMLNames::pingAttr) {
return true;
}
// If srcdoc attribute for frame elements will be rewritten as src attribute
// containing link instead of html contents, don't ignore the attribute.
// Bail out now to avoid the check in Element::isScriptingAttribute.
bool is_src_doc_attribute = IsHTMLFrameElementBase(element) &&
attribute.GetName() == HTMLNames::srcdocAttr;
String new_link_for_the_element;
if (is_src_doc_attribute && RewriteLink(element, new_link_for_the_element))
return false;
// Do not include attributes that contain javascript. This is because the
// script will not be executed when a MHTML page is being loaded.
return element.IsScriptingAttribute(attribute);
}
// Rewrites the link of a frame-owning |element| so that it points at the MHTML
// part holding the sub-frame's contents.
//
// On success stores a cid: URI, derived from the content ID the embedder
// assigned to the content frame, in |rewritten_link| and returns true.
// Returns false, leaving |rewritten_link| untouched, when |element| does not
// own a frame, has no content frame, the embedder supplies no content ID, or
// the element is an <object> whose document is unavailable or is not one of
// the kinds checked below (HTML, XHTML or image documents).
bool MHTMLFrameSerializerDelegate::RewriteLink(const Element& element,
                                               String& rewritten_link) {
  if (!element.IsFrameOwnerElement())
    return false;

  auto* frame_owner_element = ToHTMLFrameOwnerElement(&element);
  Frame* frame = frame_owner_element->ContentFrame();
  if (!frame)
    return false;

  WebString content_id = web_delegate_.GetContentID(WebFrame::FromFrame(frame));
  if (content_id.IsNull())
    return false;

  KURL cid_uri = MHTMLParser::ConvertContentIDToURI(content_id);
  DCHECK(cid_uri.IsValid());

  if (IsHTMLFrameElementBase(&element)) {
    rewritten_link = cid_uri.GetString();
    return true;
  }

  if (isHTMLObjectElement(&element)) {
    Document* doc = frame_owner_element->contentDocument();
    // The content frame is a generic Frame and need not be local; in that
    // case there is no document to inspect here, so do not rewrite rather
    // than dereference a null pointer.
    if (!doc)
      return false;
    bool is_handled_by_serializer = doc->IsHTMLDocument() ||
                                    doc->IsXHTMLDocument() ||
                                    doc->IsImageDocument();
    if (is_handled_by_serializer) {
      rewritten_link = cid_uri.GetString();
      return true;
    }
  }

  return false;
}
// Defers to the embedder's delegate to decide whether the resource at |url|
// should be left out.
bool MHTMLFrameSerializerDelegate::ShouldSkipResourceWithURL(const KURL& url) {
  const bool skip_resource = web_delegate_.ShouldSkipResource(url);
  return skip_resource;
}
// Returns true only when the embedder's cache-control policy is
// kSkipAnyFrameOrResourceMarkedNoStore and the resource is reported as having
// a "Cache-Control: no-store" header.
bool MHTMLFrameSerializerDelegate::ShouldSkipResource(
    FrameSerializer::ResourceHasCacheControlNoStoreHeader
        has_cache_control_no_store_header) {
  // Under any other policy resources are never skipped for this reason.
  if (web_delegate_.CacheControlPolicy() !=
      WebFrameSerializerCacheControlPolicy::
          kSkipAnyFrameOrResourceMarkedNoStore) {
    return false;
  }
  return has_cache_control_no_store_header ==
         FrameSerializer::kHasCacheControlNoStoreHeader;
}
// Returns the extra attributes to emit for |element|: image elements are
// handled by the image helper, all other form control elements by the form
// control helper. The result is empty for every other kind of element.
Vector<Attribute> MHTMLFrameSerializerDelegate::GetCustomAttributes(
    const Element& element) {
  Vector<Attribute> custom_attributes;

  if (isHTMLImageElement(element)) {
    GetCustomAttributesForImageElement(toHTMLImageElement(element),
                                       &custom_attributes);
    return custom_attributes;
  }

  if (element.IsFormControlElement())
    GetCustomAttributesForFormControlElement(element, &custom_attributes);

  return custom_attributes;
}
void MHTMLFrameSerializerDelegate::GetCustomAttributesForImageElement(
const HTMLImageElement& element,
Vector<Attribute>* attributes) {
// Currently only the value of src is pulled into the archive and the srcset
// attribute is ignored (see shouldIgnoreAttribute() above). If the device
// has a higher DPR, a different image from srcset could be loaded instead.
// When this occurs, we should provide the rendering width and height for
// <img> element if not set.
// The image should be loaded and participate the layout.
ImageResourceContent* image = element.CachedImage();
if (!image || !image->HasImage() || image->ErrorOccurred() ||
!element.GetLayoutObject()) {
return;
}
// The width and height attributes should not be set.
if (element.FastHasAttribute(HTMLNames::widthAttr) ||
element.FastHasAttribute(HTMLNames::heightAttr)) {
return;
}
// Check if different image is loaded. naturalWidth/naturalHeight will return
// the image size adjusted with current DPR.
if (((int)element.naturalWidth()) == image->GetImage()->width() &&
((int)element.naturalHeight()) == image->GetImage()->height()) {
return;
}
Attribute width_attribute(HTMLNames::widthAttr,
AtomicString::Number(element.LayoutBoxWidth()));
attributes->push_back(width_attribute);
Attribute height_attribute(HTMLNames::heightAttr,
AtomicString::Number(element.LayoutBoxHeight()));
attributes->push_back(height_attribute);
}
// Appends an empty |disabled| attribute for a form control that does not
// already have one.
//
// All form elements are disabled in MHTML to tell the user that the form
// cannot be worked on: MHTML is loaded in full sandboxing mode, which disables
// form submission and script execution.
void MHTMLFrameSerializerDelegate::GetCustomAttributesForFormControlElement(
    const Element& element,
    Vector<Attribute>* attributes) {
  if (!element.FastHasAttribute(HTMLNames::disabledAttr))
    attributes->push_back(Attribute(HTMLNames::disabledAttr, ""));
}
bool CacheControlNoStoreHeaderPresent(
const WebLocalFrameBase& web_local_frame) {
const ResourceResponse& response =
web_local_frame.DataSource()->GetResponse().ToResourceResponse();
if (response.CacheControlContainsNoStore())
return true;
const ResourceRequest& request =
web_local_frame.DataSource()->GetRequest().ToResourceRequest();
return request.CacheControlContainsNoStore();
}
// Decides whether |frame| may be serialized under |cache_control_policy|.
//
// - kNone: always allowed.
// - kSkipAnyFrameOrResourceMarkedNoStore: allowed unless the frame is marked
//   no-store.
// - kFailForNoStoreMainFrame: the no-store check applies only to a frame
//   without a parent; frames that have a parent are always allowed.
bool FrameShouldBeSerializedAsMHTML(
    WebLocalFrame* frame,
    WebFrameSerializerCacheControlPolicy cache_control_policy) {
  using Policy = WebFrameSerializerCacheControlPolicy;

  WebLocalFrameBase* local_frame_base = ToWebLocalFrameBase(frame);
  DCHECK(local_frame_base);

  if (cache_control_policy == Policy::kNone)
    return true;

  const bool no_store_check_applies =
      cache_control_policy == Policy::kSkipAnyFrameOrResourceMarkedNoStore ||
      (!frame->Parent() &&
       cache_control_policy == Policy::kFailForNoStoreMainFrame);

  // The headers are inspected only when the policy requires it for this frame.
  return !no_store_check_applies ||
         !CacheControlNoStoreHeaderPresent(*local_frame_base);
}
} // namespace
// Produces the MHTML header for |frame|, built from the multipart |boundary|
// and the title and suggested MIME type of the frame's document. Returns an
// empty WebThreadSafeData when the delegate's cache-control policy forbids
// serializing this frame.
WebThreadSafeData WebFrameSerializer::GenerateMHTMLHeader(
    const WebString& boundary,
    WebLocalFrame* frame,
    MHTMLPartsGenerationDelegate* delegate) {
  TRACE_EVENT0("page-serialization", "WebFrameSerializer::generateMHTMLHeader");
  DCHECK(frame);
  DCHECK(delegate);

  const bool may_serialize =
      FrameShouldBeSerializedAsMHTML(frame, delegate->CacheControlPolicy());
  if (!may_serialize)
    return WebThreadSafeData();

  // Go from the public frame type to the internal document.
  WebLocalFrameBase* local_frame_base = ToWebLocalFrameBase(frame);
  DCHECK(local_frame_base);
  Document* frame_document = local_frame_base->GetFrame()->GetDocument();

  RefPtr<RawData> header_data = RawData::Create();
  MHTMLArchive::GenerateMHTMLHeader(boundary, frame_document->title(),
                                    frame_document->SuggestedMIMEType(),
                                    *header_data->MutableData());
  return header_data.Release();
}
// Serializes |web_frame| and encodes the resulting resources as MHTML parts
// separated by |boundary|.
//
// Returns an empty WebThreadSafeData when the delegate's cache-control policy
// forbids serializing this frame, or when serialization produced no resources.
// Otherwise returns the encoded parts: the first part (the frame itself)
// carries the Content-ID supplied by |web_delegate|, the remaining parts are
// generated with an empty content ID.
WebThreadSafeData WebFrameSerializer::GenerateMHTMLParts(
const WebString& boundary,
WebLocalFrame* web_frame,
MHTMLPartsGenerationDelegate* web_delegate) {
TRACE_EVENT0("page-serialization", "WebFrameSerializer::generateMHTMLParts");
DCHECK(web_frame);
DCHECK(web_delegate);
// Bail out early if the cache-control policy excludes this frame.
if (!FrameShouldBeSerializedAsMHTML(web_frame,
web_delegate->CacheControlPolicy()))
return WebThreadSafeData();
// Translate arguments from public to internal blink APIs.
LocalFrame* frame = ToWebLocalFrameBase(web_frame)->GetFrame();
MHTMLArchive::EncodingPolicy encoding_policy =
web_delegate->UseBinaryEncoding()
? MHTMLArchive::EncodingPolicy::kUseBinaryEncoding
: MHTMLArchive::EncodingPolicy::kUseDefaultEncoding;
// Serialize. The BEGIN/END trace pair brackets the whole serialization step;
// the END event also records how many resources were produced.
TRACE_EVENT_BEGIN0("page-serialization",
"WebFrameSerializer::generateMHTMLParts serializing");
Deque<SerializedResource> resources;
{
// This scope bounds both the serialization-time histogram timer and the
// lifetime of |core_delegate|, whose destructor may record a histogram.
SCOPED_BLINK_UMA_HISTOGRAM_TIMER(
"PageSerialization.MhtmlGeneration.SerializationTime.SingleFrame");
MHTMLFrameSerializerDelegate core_delegate(*web_delegate);
FrameSerializer serializer(resources, core_delegate);
serializer.SerializeFrame(*frame);
}
TRACE_EVENT_END1("page-serialization",
"WebFrameSerializer::generateMHTMLParts serializing",
"resource count",
static_cast<unsigned long long>(resources.size()));
// There was an error serializing the frame (e.g. of an image resource).
if (resources.IsEmpty())
return WebThreadSafeData();
// Encode serialized resources as MHTML.
RefPtr<RawData> output = RawData::Create();
{
// This scope bounds the encoding-time histogram timer.
SCOPED_BLINK_UMA_HISTOGRAM_TIMER(
"PageSerialization.MhtmlGeneration.EncodingTime.SingleFrame");
// Frame is the 1st resource (see FrameSerializer::serializeFrame doc
// comment). Frames get a Content-ID header.
MHTMLArchive::GenerateMHTMLPart(
boundary, web_delegate->GetContentID(web_frame), encoding_policy,
resources.TakeFirst(), *output->MutableData());
// Every remaining resource is appended with an empty content ID; the queue
// is drained as the parts are written.
while (!resources.IsEmpty()) {
TRACE_EVENT0("page-serialization",
"WebFrameSerializer::generateMHTMLParts encoding");
MHTMLArchive::GenerateMHTMLPart(boundary, String(), encoding_policy,
resources.TakeFirst(),
*output->MutableData());
}
}
return output.Release();
}
// Serializes |frame| through a WebFrameSerializerImpl built from the given
// client and link-rewriting delegate, and returns that serializer's result.
bool WebFrameSerializer::Serialize(
    WebLocalFrame* frame,
    WebFrameSerializerClient* client,
    WebFrameSerializer::LinkRewritingDelegate* delegate) {
  return WebFrameSerializerImpl(frame, client, delegate).Serialize();
}
// Builds a <meta http-equiv="Content-Type"> tag that declares a text/html
// document encoded with |charset|. The charset is inserted verbatim.
WebString WebFrameSerializer::GenerateMetaCharsetDeclaration(
    const WebString& charset) {
  // TODO(yosin) We should call |FrameSerializer::metaCharsetDeclarationOf()|.
  String meta_tag =
      "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=" +
      static_cast<const String&>(charset) + "\">";
  return meta_tag;
}
// Wraps the mark-of-the-web declaration for |url| in an HTML comment that is
// preceded and followed by a newline.
WebString WebFrameSerializer::GenerateMarkOfTheWebDeclaration(
    const WebURL& url) {
  StringBuilder declaration;
  // Opening comment delimiter on its own line.
  declaration.Append("\n<!-- ");
  declaration.Append(FrameSerializer::MarkOfTheWebDeclaration(url));
  // Closing comment delimiter and trailing newline.
  declaration.Append(" -->\n");
  return declaration.ToString();
}
// Builds a <base href="."> tag. When |base_target| is not empty it is added
// verbatim as the tag's target attribute.
WebString WebFrameSerializer::GenerateBaseTagDeclaration(
    const WebString& base_target) {
  // TODO(yosin) We should call |FrameSerializer::baseTagDeclarationOf()|.
  if (!base_target.IsEmpty()) {
    String tag_with_target = "<base href=\".\" target=\"" +
                             static_cast<const String&>(base_target) + "\">";
    return tag_with_target;
  }
  return String("<base href=\".\">");
}
} // namespace blink