blob: be64cd48aff347065527f5816d68e6191e355f14 [file] [log] [blame]
/*
* Copyright (C) 2009 Google Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "public/web/WebFrameSerializer.h"
#include "core/HTMLNames.h"
#include "core/dom/Document.h"
#include "core/dom/Element.h"
#include "core/frame/Frame.h"
#include "core/frame/FrameSerializer.h"
#include "core/frame/LocalFrame.h"
#include "core/frame/RemoteFrame.h"
#include "core/html/HTMLAllCollection.h"
#include "core/html/HTMLFrameElementBase.h"
#include "core/html/HTMLFrameOwnerElement.h"
#include "core/html/HTMLInputElement.h"
#include "core/html/HTMLTableElement.h"
#include "core/loader/DocumentLoader.h"
#include "platform/SerializedResource.h"
#include "platform/SharedBuffer.h"
#include "platform/mhtml/MHTMLArchive.h"
#include "platform/mhtml/MHTMLParser.h"
#include "platform/network/ResourceRequest.h"
#include "platform/network/ResourceResponse.h"
#include "platform/weborigin/KURL.h"
#include "public/platform/WebString.h"
#include "public/platform/WebURL.h"
#include "public/platform/WebURLResponse.h"
#include "public/platform/WebVector.h"
#include "public/web/WebDataSource.h"
#include "public/web/WebDocument.h"
#include "public/web/WebFrame.h"
#include "public/web/WebFrameSerializerCacheControlPolicy.h"
#include "public/web/WebFrameSerializerClient.h"
#include "web/WebFrameSerializerImpl.h"
#include "web/WebLocalFrameImpl.h"
#include "web/WebRemoteFrameImpl.h"
#include "wtf/Assertions.h"
#include "wtf/HashMap.h"
#include "wtf/HashSet.h"
#include "wtf/Noncopyable.h"
#include "wtf/Vector.h"
#include "wtf/text/StringConcatenate.h"
namespace blink {
namespace {
// Bridges the public WebFrameSerializer::MHTMLPartsGenerationDelegate to the
// core-side FrameSerializer::Delegate interface, so that FrameSerializer can
// consult the embedder-provided delegate while a frame is being serialized.
class MHTMLFrameSerializerDelegate final : public FrameSerializer::Delegate {
    WTF_MAKE_NONCOPYABLE(MHTMLFrameSerializerDelegate);

public:
    explicit MHTMLFrameSerializerDelegate(WebFrameSerializer::MHTMLPartsGenerationDelegate& webDelegate);

    // FrameSerializer::Delegate overrides.
    bool shouldIgnoreAttribute(const Attribute& attribute) override;
    bool rewriteLink(const Element& element, String& rewrittenLink) override;
    bool shouldSkipResourceWithURL(const KURL& url) override;
    bool shouldSkipResource(const Resource& resource) override;

private:
    // Stored as a reference, not owned.
    // NOTE(review): presumably the web delegate must outlive this object --
    // confirm against callers.
    WebFrameSerializer::MHTMLPartsGenerationDelegate& m_webDelegate;
};
// Wraps |webDelegate|; only a reference to it is kept.
MHTMLFrameSerializerDelegate::MHTMLFrameSerializerDelegate(WebFrameSerializer::MHTMLPartsGenerationDelegate& webDelegate)
    : m_webDelegate(webDelegate) {}
// Returns true for attributes that must be dropped from the serialized
// markup. Currently that is only the srcset attribute.
bool MHTMLFrameSerializerDelegate::shouldIgnoreAttribute(const Attribute& attribute)
{
    // TODO(fgorski): The presence of a srcset attribute causes MHTML to not
    // display images, as only the value of src is pulled into the archive.
    // Discarding srcset prevents the problem. Long term we should make sure
    // that MHTML plays nicely with srcset.
    const bool isSrcsetAttribute = attribute.localName() == HTMLNames::srcsetAttr;
    return isSrcsetAttribute;
}
// Rewrites the link of a frame-owning element so that it points at the
// Content-ID (cid:) URI reported by the web delegate for the element's
// content frame.
//
// Returns true and stores the cid: URI in |rewrittenLink| when:
//   - |element| is a frame owner with a content frame,
//   - the web delegate reports a non-null Content-ID for that frame, and
//   - |element| is either an HTMLFrameElementBase, or an <object> whose
//     content document is an HTML, XHTML or image document.
// Returns false (leaving |rewrittenLink| untouched) in every other case.
bool MHTMLFrameSerializerDelegate::rewriteLink(
    const Element& element,
    String& rewrittenLink)
{
    if (!element.isFrameOwnerElement())
        return false;

    auto* frameOwnerElement = toHTMLFrameOwnerElement(&element);
    Frame* frame = frameOwnerElement->contentFrame();
    if (!frame)
        return false;

    WebString contentID = m_webDelegate.getContentID(WebFrame::fromFrame(frame));
    if (contentID.isNull())
        return false;

    KURL cidURI = MHTMLParser::convertContentIDToURI(contentID);
    DCHECK(cidURI.isValid());

    if (isHTMLFrameElementBase(&element)) {
        rewrittenLink = cidURI.getString();
        return true;
    }

    if (isHTMLObjectElement(&element)) {
        Document* doc = frameOwnerElement->contentDocument();
        // Having a content frame does not guarantee a document reachable from
        // this process (NOTE(review): e.g. presumably when the content frame
        // is a RemoteFrame -- confirm). Guard against null instead of
        // dereferencing, and treat the element as not handled in that case.
        bool isHandledBySerializer = doc
            && (doc->isHTMLDocument() || doc->isXHTMLDocument() || doc->isImageDocument());
        if (isHandledBySerializer) {
            rewrittenLink = cidURI.getString();
            return true;
        }
    }

    return false;
}
// Forwards the decision of whether the resource at |url| should be left out
// to the web delegate.
bool MHTMLFrameSerializerDelegate::shouldSkipResourceWithURL(const KURL& url)
{
    const bool skipRequestedByDelegate = m_webDelegate.shouldSkipResource(url);
    return skipRequestedByDelegate;
}
// A resource is skipped only when the delegate's cache-control policy is
// SkipAnyFrameOrResourceMarkedNoStore and the resource itself carries a
// "Cache-Control: no-store" header.
bool MHTMLFrameSerializerDelegate::shouldSkipResource(const Resource& resource)
{
    // Under any other policy the resource's headers are not consulted.
    if (m_webDelegate.cacheControlPolicy() != WebFrameSerializerCacheControlPolicy::SkipAnyFrameOrResourceMarkedNoStore)
        return false;

    return resource.hasCacheControlNoStoreHeader();
}
bool cacheControlNoStoreHeaderPresent(const WebLocalFrameImpl& webLocalFrameImpl)
{
const ResourceResponse& response = webLocalFrameImpl.dataSource()->response().toResourceResponse();
if (response.cacheControlContainsNoStore())
return true;
const ResourceRequest& request = webLocalFrameImpl.dataSource()->request().toResourceRequest();
return request.cacheControlContainsNoStore();
}
// Decides whether |frame| may be serialized under |cacheControlPolicy|.
//
// The no-store check applies to every frame under
// SkipAnyFrameOrResourceMarkedNoStore, and only to a frame without a parent
// under FailForNoStoreMainFrame. When the check applies, the frame is
// serialized only if no no-store header is present; otherwise it is always
// serialized.
bool frameShouldBeSerializedAsMHTML(WebLocalFrame* frame, WebFrameSerializerCacheControlPolicy cacheControlPolicy)
{
    WebLocalFrameImpl* localFrameImpl = toWebLocalFrameImpl(frame);
    DCHECK(localFrameImpl);

    const bool checkEveryFrame =
        cacheControlPolicy == WebFrameSerializerCacheControlPolicy::SkipAnyFrameOrResourceMarkedNoStore;
    // The parent is only looked at when the policy targets the main frame.
    const bool checkBecauseMainFrame = !checkEveryFrame
        && cacheControlPolicy == WebFrameSerializerCacheControlPolicy::FailForNoStoreMainFrame
        && !frame->parent();

    if (checkEveryFrame || checkBecauseMainFrame)
        return !cacheControlNoStoreHeaderPresent(*localFrameImpl);

    return true;
}
} // namespace
// Produces the MHTML header for |frame| using |boundary| as the part
// separator, taking the title and MIME type from the frame's document.
// Returns an empty WebData when the delegate's cache-control policy forbids
// serializing this frame.
WebData WebFrameSerializer::generateMHTMLHeader(
    const WebString& boundary, WebLocalFrame* frame, MHTMLPartsGenerationDelegate* delegate)
{
    DCHECK(frame);
    DCHECK(delegate);

    const bool serializationAllowed = frameShouldBeSerializedAsMHTML(frame, delegate->cacheControlPolicy());
    if (!serializationAllowed)
        return WebData();

    WebLocalFrameImpl* localFrameImpl = toWebLocalFrameImpl(frame);
    DCHECK(localFrameImpl);
    Document* frameDocument = localFrameImpl->frame()->document();

    RefPtr<SharedBuffer> headerData = SharedBuffer::create();
    MHTMLArchive::generateMHTMLHeader(
        boundary,
        frameDocument->title(),
        frameDocument->suggestedMIMEType(),
        *headerData);
    return headerData.release();
}
// Serializes |webFrame| and encodes each resulting resource as an MHTML part
// delimited by |boundary|. Returns an empty WebData when the delegate's
// cache-control policy forbids serializing this frame.
WebData WebFrameSerializer::generateMHTMLParts(
    const WebString& boundary, WebLocalFrame* webFrame, MHTMLPartsGenerationDelegate* webDelegate)
{
    DCHECK(webFrame);
    DCHECK(webDelegate);
    if (!frameShouldBeSerializedAsMHTML(webFrame, webDelegate->cacheControlPolicy()))
        return WebData();

    // Map the public API arguments onto their core counterparts.
    LocalFrame* coreFrame = toWebLocalFrameImpl(webFrame)->frame();
    MHTMLArchive::EncodingPolicy partEncoding = MHTMLArchive::EncodingPolicy::UseDefaultEncoding;
    if (webDelegate->useBinaryEncoding())
        partEncoding = MHTMLArchive::EncodingPolicy::UseBinaryEncoding;

    // Run the core serializer; it fills |serializedResources|.
    Vector<SerializedResource> serializedResources;
    MHTMLFrameSerializerDelegate coreDelegate(*webDelegate);
    FrameSerializer frameSerializer(serializedResources, coreDelegate);
    frameSerializer.serializeFrame(*coreFrame);

    // Content-ID of the frame that was just serialized.
    String frameContentID = webDelegate->getContentID(webFrame);

    // Emit one MHTML part per serialized resource.
    RefPtr<SharedBuffer> partsData = SharedBuffer::create();
    for (size_t index = 0; index < serializedResources.size(); ++index) {
        // The frame itself is the first resource (see the
        // FrameSerializer::serializeFrame doc comment) and is the only part
        // that gets a Content-ID header; the others get a null string.
        String partContentID;
        if (index == 0)
            partContentID = frameContentID;
        MHTMLArchive::generateMHTMLPart(
            boundary, partContentID, partEncoding, serializedResources[index], *partsData);
    }
    return partsData.release();
}
// Produces the MHTML footer for an archive whose parts are separated by
// |boundary|.
WebData WebFrameSerializer::generateMHTMLFooter(const WebString& boundary)
{
    RefPtr<SharedBuffer> footerData = SharedBuffer::create();
    MHTMLArchive::generateMHTMLFooter(boundary, *footerData);
    return footerData.release();
}
// Serializes |frame| through a temporary WebFrameSerializerImpl built from
// |frame|, |client| and |delegate|, and returns what its serialize() returns.
bool WebFrameSerializer::serialize(
    WebLocalFrame* frame,
    WebFrameSerializerClient* client,
    WebFrameSerializer::LinkRewritingDelegate* delegate)
{
    return WebFrameSerializerImpl(frame, client, delegate).serialize();
}
// Builds a <meta http-equiv="Content-Type"> tag declaring |charset|.
// |charset| is inserted verbatim, without escaping.
WebString WebFrameSerializer::generateMetaCharsetDeclaration(const WebString& charset)
{
    // TODO(yosin) We should call |FrameSerializer::metaCharsetDeclarationOf()|.
    const String charsetValue = charset;
    StringBuilder declaration;
    declaration.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=");
    declaration.append(charsetValue);
    declaration.append("\">");
    return declaration.toString();
}
// Wraps the text returned by FrameSerializer::markOfTheWebDeclaration() for
// |url| in an HTML comment, preceded and followed by a newline.
WebString WebFrameSerializer::generateMarkOfTheWebDeclaration(const WebURL& url)
{
    const String markOfTheWeb = FrameSerializer::markOfTheWebDeclaration(url);
    String commentString = "\n<!-- " + markOfTheWeb + " -->\n";
    return commentString;
}
// Builds a <base href="."> tag, adding a target attribute set to
// |baseTarget| when it is non-empty. |baseTarget| is inserted verbatim,
// without escaping.
WebString WebFrameSerializer::generateBaseTagDeclaration(const WebString& baseTarget)
{
    // TODO(yosin) We should call |FrameSerializer::baseTagDeclarationOf()|.
    if (!baseTarget.isEmpty()) {
        const String targetValue = baseTarget;
        StringBuilder declaration;
        declaration.append("<base href=\".\" target=\"");
        declaration.append(targetValue);
        declaration.append("\">");
        return declaration.toString();
    }
    return String("<base href=\".\">");
}
} // namespace blink