| /* |
| * Copyright (C) 2009 Google Inc. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are |
| * met: |
| * |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above |
| * copyright notice, this list of conditions and the following disclaimer |
| * in the documentation and/or other materials provided with the |
| * distribution. |
| * * Neither the name of Google Inc. nor the names of its |
| * contributors may be used to endorse or promote products derived from |
| * this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
// How we handle the base tag better.
// Current status:
// Currently, the normal way we handle the base tag is:
// a) Links which have corresponding locally saved files, such as savable CSS
// and JavaScript files, are rewritten as relative URLs which point to the
// locally saved files. These links cannot be resolved as absolute file URLs
// because, if they were, moving the saved files from one directory to another
// would turn the file URLs into dead links.
// b) Links which do not have corresponding locally saved files, such as
// links in A and AREA tags, are resolved as absolute URLs.
// c) We comment out all base tags when serializing the DOM for the page.
// Firefox also handles the base tag this way.
| // |
// Problem:
// This approach cannot handle the following situation:
// the base tag is written by JavaScript.
// For example, the page "www.yahoo.com" uses
// "document.write('<base href="http://www.yahoo.com/"...');" to set up the
// base URL of the page while it loads. Suppose we save "www.yahoo.com" as
// completed-HTML to "c:\yahoo.htm". When we later load the saved
// completed-HTML page, the JavaScript inserts a base tag
// <base href="http://www.yahoo.com/"...> into the DOM, so all URLs which point
// to locally saved resource files are resolved as
// "http://www.yahoo.com/yahoo_files/...", which means none of the saved
// resource files can be loaded correctly. The page is also rendered badly,
// since none of the saved sub-resource files (such as CSS and JavaScript
// files) or sub-frame files can be fetched.
// Currently Firefox, IE and WebKit-based browsers all have this problem.
| // |
// Solution:
// My solution is to comment out the old base tag and write a new base tag,
// <base href="." ...>, after the commented-out one. WebKit always uses the
// latest "href" attribute of a base tag to set the document's base URL. Based
// on this behavior, when we encounter a base tag, we comment it out and write
// a new base tag <base href="."> after it. The newly added base tag helps the
// engine locate the correct base URL for loading the locally saved resource
// files. Also, I think we need to inherit the base target value from the
// document object when appending the new base tag.
// If there are multiple base tags in the original document, we comment out all
// of the old base tags and append a new base tag after each one, because we do
// not know whether the old base tags are original content or were added by
// JavaScript. If they were added by JavaScript, the script(s) will still
// insert base tag(s) into the DOM when the saved page is loaded, so the newly
// added base tag(s) can override the incorrect base URL and make sure we
// always load the correct locally saved resource files.
| |
| #include "third_party/blink/renderer/core/frame/web_frame_serializer_impl.h" |
| |
| #include "third_party/blink/public/platform/web_vector.h" |
| #include "third_party/blink/renderer/core/dom/document.h" |
| #include "third_party/blink/renderer/core/dom/document_type.h" |
| #include "third_party/blink/renderer/core/dom/element.h" |
| #include "third_party/blink/renderer/core/editing/serializers/serialization.h" |
| #include "third_party/blink/renderer/core/frame/frame_serializer.h" |
| #include "third_party/blink/renderer/core/frame/web_local_frame_impl.h" |
| #include "third_party/blink/renderer/core/html/forms/html_form_element.h" |
| #include "third_party/blink/renderer/core/html/html_all_collection.h" |
| #include "third_party/blink/renderer/core/html/html_element.h" |
| #include "third_party/blink/renderer/core/html/html_frame_element_base.h" |
| #include "third_party/blink/renderer/core/html/html_frame_owner_element.h" |
| #include "third_party/blink/renderer/core/html/html_html_element.h" |
| #include "third_party/blink/renderer/core/html/html_meta_element.h" |
| #include "third_party/blink/renderer/core/html_names.h" |
| #include "third_party/blink/renderer/core/loader/document_loader.h" |
| #include "third_party/blink/renderer/core/loader/frame_loader.h" |
| #include "third_party/blink/renderer/platform/wtf/text/text_encoding.h" |
| |
| namespace blink { |
| |
// Maximum length of the data buffer used to temporarily hold generated HTML
// content. This is a soft limit: it may be exceeded when the HTML document
// contains a very large contiguous string.
static constexpr unsigned kDataBufferCapacity = 64 * 1024;
| |
| WebFrameSerializerImpl::SerializeDomParam::SerializeDomParam( |
| const KURL& url, |
| const WTF::TextEncoding& text_encoding, |
| Document* document) |
| : url(url), |
| text_encoding(text_encoding), |
| document(document), |
| is_html_document(document->IsHTMLDocument()), |
| have_seen_doc_type(false), |
| have_added_charset_declaration(false), |
| skip_meta_element(nullptr), |
| have_added_xml_processing_directive(false), |
| have_added_contents_before_end(false) {} |
| |
| String WebFrameSerializerImpl::PreActionBeforeSerializeOpenTag( |
| const Element* element, |
| SerializeDomParam* param, |
| bool* need_skip) { |
| StringBuilder result; |
| |
| *need_skip = false; |
| if (param->is_html_document) { |
| // Skip the open tag of original META tag which declare charset since we |
| // have overrided the META which have correct charset declaration after |
| // serializing open tag of HEAD element. |
| DCHECK(element); |
| if (IsHTMLMetaElement(element) && |
| ToHTMLMetaElement(element)->ComputeEncoding().IsValid()) { |
| // Found META tag declared charset, we need to skip it when |
| // serializing DOM. |
| param->skip_meta_element = element; |
| *need_skip = true; |
| } else if (IsHTMLHtmlElement(*element)) { |
| // Check something before processing the open tag of HEAD element. |
| // First we add doc type declaration if original document has it. |
| if (!param->have_seen_doc_type) { |
| param->have_seen_doc_type = true; |
| result.Append(CreateMarkup(param->document->doctype())); |
| } |
| |
| // Add MOTW declaration before html tag. |
| // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx. |
| result.Append( |
| WebFrameSerializer::GenerateMarkOfTheWebDeclaration(param->url)); |
| } else if (IsHTMLBaseElement(*element)) { |
| // Comment the BASE tag when serializing dom. |
| result.Append("<!--"); |
| } |
| } else { |
| // Write XML declaration. |
| if (!param->have_added_xml_processing_directive) { |
| param->have_added_xml_processing_directive = true; |
| // Get encoding info. |
| String xml_encoding = param->document->xmlEncoding(); |
| if (xml_encoding.IsEmpty()) |
| xml_encoding = param->document->EncodingName(); |
| if (xml_encoding.IsEmpty()) |
| xml_encoding = UTF8Encoding().GetName(); |
| result.Append("<?xml version=\""); |
| result.Append(param->document->xmlVersion()); |
| result.Append("\" encoding=\""); |
| result.Append(xml_encoding); |
| if (param->document->xmlStandalone()) |
| result.Append("\" standalone=\"yes"); |
| result.Append("\"?>\n"); |
| } |
| // Add doc type declaration if original document has it. |
| if (!param->have_seen_doc_type) { |
| param->have_seen_doc_type = true; |
| result.Append(CreateMarkup(param->document->doctype())); |
| } |
| } |
| return result.ToString(); |
| } |
| |
| String WebFrameSerializerImpl::PostActionAfterSerializeOpenTag( |
| const Element* element, |
| SerializeDomParam* param) { |
| StringBuilder result; |
| |
| param->have_added_contents_before_end = false; |
| if (!param->is_html_document) |
| return result.ToString(); |
| // Check after processing the open tag of HEAD element |
| if (!param->have_added_charset_declaration && IsHTMLHeadElement(*element)) { |
| param->have_added_charset_declaration = true; |
| // Check meta element. WebKit only pre-parse the first 512 bytes of the |
| // document. If the whole <HEAD> is larger and meta is the end of head |
| // part, then this kind of html documents aren't decoded correctly |
| // because of this issue. So when we serialize the DOM, we need to make |
| // sure the meta will in first child of head tag. |
| // See http://bugs.webkit.org/show_bug.cgi?id=16621. |
| // First we generate new content for writing correct META element. |
| result.Append(WebFrameSerializer::GenerateMetaCharsetDeclaration( |
| String(param->text_encoding.GetName()))); |
| |
| param->have_added_contents_before_end = true; |
| // Will search each META which has charset declaration, and skip them all |
| // in PreActionBeforeSerializeOpenTag. |
| } |
| |
| return result.ToString(); |
| } |
| |
| String WebFrameSerializerImpl::PreActionBeforeSerializeEndTag( |
| const Element* element, |
| SerializeDomParam* param, |
| bool* need_skip) { |
| String result; |
| |
| *need_skip = false; |
| if (!param->is_html_document) |
| return result; |
| // Skip the end tag of original META tag which declare charset. |
| // Need not to check whether it's META tag since we guarantee |
| // skipMetaElement is definitely META tag if it's not 0. |
| if (param->skip_meta_element == element) { |
| *need_skip = true; |
| } |
| |
| return result; |
| } |
| |
| // After we finish serializing end tag of a element, we give the target |
| // element a chance to do some post work to add some additional data. |
| String WebFrameSerializerImpl::PostActionAfterSerializeEndTag( |
| const Element* element, |
| SerializeDomParam* param) { |
| StringBuilder result; |
| |
| if (!param->is_html_document) |
| return result.ToString(); |
| // Comment the BASE tag when serializing DOM. |
| if (IsHTMLBaseElement(*element)) { |
| result.Append("-->"); |
| // Append a new base tag declaration. |
| result.Append(WebFrameSerializer::GenerateBaseTagDeclaration( |
| param->document->BaseTarget())); |
| } |
| |
| return result.ToString(); |
| } |
| |
| void WebFrameSerializerImpl::SaveHTMLContentToBuffer(const String& result, |
| SerializeDomParam* param) { |
| data_buffer_.Append(result); |
| EncodeAndFlushBuffer(WebFrameSerializerClient::kCurrentFrameIsNotFinished, |
| param, kDoNotForceFlush); |
| } |
| |
| void WebFrameSerializerImpl::EncodeAndFlushBuffer( |
| WebFrameSerializerClient::FrameSerializationStatus status, |
| SerializeDomParam* param, |
| FlushOption flush_option) { |
| // Data buffer is not full nor do we want to force flush. |
| if (flush_option != kForceFlush && |
| data_buffer_.length() <= kDataBufferCapacity) |
| return; |
| |
| String content = data_buffer_.ToString(); |
| data_buffer_.Clear(); |
| |
| CString encoded_content = |
| param->text_encoding.Encode(content, WTF::kEntitiesForUnencodables); |
| |
| // Send result to the client. |
| client_->DidSerializeDataForFrame( |
| WebVector<char>(encoded_content.data(), encoded_content.length()), |
| status); |
| } |
| |
| // TODO(yosin): We should utilize |MarkupFormatter| here to share code, |
| // especially escaping attribute values, done by |WebEntities| |m_htmlEntities| |
| // and |m_xmlEntities|. |
| void WebFrameSerializerImpl::AppendAttribute(StringBuilder& result, |
| bool is_html_document, |
| const String& attr_name, |
| const String& attr_value) { |
| result.Append(' '); |
| result.Append(attr_name); |
| result.Append("=\""); |
| if (is_html_document) |
| result.Append(html_entities_.ConvertEntitiesInString(attr_value)); |
| else |
| result.Append(xml_entities_.ConvertEntitiesInString(attr_value)); |
| result.Append('\"'); |
| } |
| |
| void WebFrameSerializerImpl::OpenTagToString(Element* element, |
| SerializeDomParam* param) { |
| bool need_skip; |
| StringBuilder result; |
| // Do pre action for open tag. |
| result.Append(PreActionBeforeSerializeOpenTag(element, param, &need_skip)); |
| if (need_skip) |
| return; |
| // Add open tag |
| result.Append('<'); |
| result.Append(element->nodeName().DeprecatedLower()); |
| |
| // Find out if we need to do frame-specific link rewriting. |
| WebFrame* frame = nullptr; |
| if (element->IsFrameOwnerElement()) { |
| frame = |
| WebFrame::FromFrame(ToHTMLFrameOwnerElement(element)->ContentFrame()); |
| } |
| WebString rewritten_frame_link; |
| bool should_rewrite_frame_src = |
| frame && delegate_->RewriteFrameSource(frame, &rewritten_frame_link); |
| bool did_rewrite_frame_src = false; |
| |
| // Go through all attributes and serialize them. |
| for (const auto& it : element->Attributes()) { |
| const QualifiedName& attr_name = it.GetName(); |
| String attr_value = it.Value(); |
| |
| // Skip srcdoc attribute if we will emit src attribute (for frames). |
| if (should_rewrite_frame_src && attr_name == html_names::kSrcdocAttr) |
| continue; |
| |
| // Rewrite the attribute value if requested. |
| if (element->HasLegalLinkAttribute(attr_name)) { |
| // For links start with "javascript:", we do not change it. |
| if (!attr_value.StartsWithIgnoringASCIICase("javascript:")) { |
| // Get the absolute link. |
| KURL complete_url = param->document->CompleteURL(attr_value); |
| |
| // Check whether we have a local file to link to. |
| WebString rewritten_url; |
| if (should_rewrite_frame_src) { |
| attr_value = rewritten_frame_link; |
| did_rewrite_frame_src = true; |
| } else if (delegate_->RewriteLink(complete_url, &rewritten_url)) { |
| attr_value = rewritten_url; |
| } else { |
| attr_value = complete_url; |
| } |
| } |
| } |
| |
| AppendAttribute(result, param->is_html_document, attr_name.ToString(), |
| attr_value); |
| } |
| |
| // For frames where link rewriting was requested, ensure that src attribute |
| // is written even if the original document didn't have that attribute |
| // (mainly needed for iframes with srcdoc, but with no src attribute). |
| if (should_rewrite_frame_src && !did_rewrite_frame_src && |
| IsHTMLIFrameElement(element)) { |
| AppendAttribute(result, param->is_html_document, |
| html_names::kSrcAttr.ToString(), rewritten_frame_link); |
| } |
| |
| // Do post action for open tag. |
| String added_contents = PostActionAfterSerializeOpenTag(element, param); |
| // Complete the open tag for element when it has child/children. |
| if (element->HasChildren() || param->have_added_contents_before_end) |
| result.Append('>'); |
| // Append the added contents generate in post action of open tag. |
| result.Append(added_contents); |
| // Save the result to data buffer. |
| SaveHTMLContentToBuffer(result.ToString(), param); |
| } |
| |
| // Serialize end tag of an specified element. |
| void WebFrameSerializerImpl::EndTagToString(Element* element, |
| SerializeDomParam* param) { |
| bool need_skip; |
| StringBuilder result; |
| // Do pre action for end tag. |
| result.Append(PreActionBeforeSerializeEndTag(element, param, &need_skip)); |
| if (need_skip) |
| return; |
| // Write end tag when element has child/children. |
| if (element->HasChildren() || param->have_added_contents_before_end) { |
| result.Append("</"); |
| result.Append(element->nodeName().DeprecatedLower()); |
| result.Append('>'); |
| } else { |
| // Check whether we have to write end tag for empty element. |
| if (param->is_html_document) { |
| result.Append('>'); |
| // FIXME: This code is horribly wrong. WebFrameSerializerImpl must die. |
| if (!element->IsHTMLElement() || |
| ToHTMLElement(element)->ShouldSerializeEndTag()) { |
| // We need to write end tag when it is required. |
| result.Append("</"); |
| result.Append(element->nodeName().DeprecatedLower()); |
| result.Append('>'); |
| } |
| } else { |
| // For xml base document. |
| result.Append(" />"); |
| } |
| } |
| // Do post action for end tag. |
| result.Append(PostActionAfterSerializeEndTag(element, param)); |
| // Save the result to data buffer. |
| SaveHTMLContentToBuffer(result.ToString(), param); |
| } |
| |
| void WebFrameSerializerImpl::BuildContentForNode(Node* node, |
| SerializeDomParam* param) { |
| switch (node->getNodeType()) { |
| case Node::kElementNode: |
| // Process open tag of element. |
| OpenTagToString(ToElement(node), param); |
| // Walk through the children nodes and process it. |
| for (Node* child = node->firstChild(); child; |
| child = child->nextSibling()) |
| BuildContentForNode(child, param); |
| // Process end tag of element. |
| EndTagToString(ToElement(node), param); |
| break; |
| case Node::kTextNode: |
| SaveHTMLContentToBuffer(CreateMarkup(node), param); |
| break; |
| case Node::kAttributeNode: |
| case Node::kDocumentNode: |
| case Node::kDocumentFragmentNode: |
| // Should not exist. |
| NOTREACHED(); |
| break; |
| // Document type node can be in DOM? |
| case Node::kDocumentTypeNode: |
| param->have_seen_doc_type = true; |
| FALLTHROUGH; |
| default: |
| // For other type node, call default action. |
| SaveHTMLContentToBuffer(CreateMarkup(node), param); |
| break; |
| } |
| } |
| |
| WebFrameSerializerImpl::WebFrameSerializerImpl( |
| WebLocalFrame* frame, |
| WebFrameSerializerClient* client, |
| WebFrameSerializer::LinkRewritingDelegate* delegate) |
| : client_(client), |
| delegate_(delegate), |
| html_entities_(false), |
| xml_entities_(true) { |
| // Must specify available webframe. |
| DCHECK(frame); |
| specified_web_local_frame_impl_ = ToWebLocalFrameImpl(frame); |
| // Make sure we have non null client and delegate. |
| DCHECK(client); |
| DCHECK(delegate); |
| |
| DCHECK(data_buffer_.IsEmpty()); |
| } |
| |
| bool WebFrameSerializerImpl::Serialize() { |
| bool did_serialization = false; |
| |
| Document* document = |
| specified_web_local_frame_impl_->GetFrame()->GetDocument(); |
| const KURL& url = document->Url(); |
| |
| if (url.IsValid()) { |
| did_serialization = true; |
| |
| const WTF::TextEncoding& text_encoding = |
| document->Encoding().IsValid() ? document->Encoding() : UTF8Encoding(); |
| if (text_encoding.IsNonByteBasedEncoding()) { |
| const UChar kByteOrderMark = 0xFEFF; |
| data_buffer_.Append(kByteOrderMark); |
| } |
| |
| SerializeDomParam param(url, text_encoding, document); |
| |
| Element* document_element = document->documentElement(); |
| if (document_element) |
| BuildContentForNode(document_element, ¶m); |
| |
| EncodeAndFlushBuffer(WebFrameSerializerClient::kCurrentFrameIsFinished, |
| ¶m, kForceFlush); |
| } else { |
| // Report empty contents for invalid URLs. |
| client_->DidSerializeDataForFrame( |
| WebVector<char>(), WebFrameSerializerClient::kCurrentFrameIsFinished); |
| } |
| |
| DCHECK(data_buffer_.IsEmpty()); |
| return did_serialization; |
| } |
| |
| } // namespace blink |