blob: 4628ae13178e977d98b9183107db4af096a0d896 [file] [log] [blame]
/*
* Copyright (C) 2011 Google Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google Inc. nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "third_party/blink/renderer/platform/mhtml/mhtml_archive.h"
#include <stddef.h>
#include "base/metrics/histogram_macros.h"
#include "build/build_config.h"
#include "third_party/blink/renderer/platform/date_components.h"
#include "third_party/blink/renderer/platform/mhtml/archive_resource.h"
#include "third_party/blink/renderer/platform/mhtml/mhtml_parser.h"
#include "third_party/blink/renderer/platform/mhtml/serialized_resource.h"
#include "third_party/blink/renderer/platform/network/mime/mime_type_registry.h"
#include "third_party/blink/renderer/platform/shared_buffer.h"
#include "third_party/blink/renderer/platform/weborigin/scheme_registry.h"
#include "third_party/blink/renderer/platform/wtf/ascii_ctype.h"
#include "third_party/blink/renderer/platform/wtf/assertions.h"
#include "third_party/blink/renderer/platform/wtf/date_math.h"
#include "third_party/blink/renderer/platform/wtf/text/base64.h"
#include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
#include "third_party/blink/renderer/platform/wtf/time.h"
#include "third_party/blink/renderer/platform/wtf/vector.h"
namespace blink {
namespace {
using blink::mojom::MHTMLLoadResult;
// Upper bound, in characters and not counting the terminating CR-LF, on the
// lines produced by the quoted-printable and base64 output code in this file.
const wtf_size_t kMaximumLineLength = 76;
// Delimiters that open and close an RFC 2047 encoded-word using the UTF-8
// charset and the "Q" encoding. Each length is taken from its literal (minus
// the terminating NUL) so the two can never drift apart.
const char kRFC2047EncodingPrefix[] = "=?utf-8?Q?";
const size_t kRFC2047EncodingPrefixLength = sizeof(kRFC2047EncodingPrefix) - 1;
const char kRFC2047EncodingSuffix[] = "?=";
const size_t kRFC2047EncodingSuffixLength = sizeof(kRFC2047EncodingSuffix) - 1;

// Values written after "Content-Transfer-Encoding: " in an MHTML part header.
const char kQuotedPrintable[] = "quoted-printable";
const char kBase64[] = "base64";
const char kBinary[] = "binary";
// Reports how many characters the line terminator that begins at
// |input[index]| occupies: 1 for a lone LF or a lone CR, 2 for a CR-LF pair,
// and 0 when |input[index]| does not begin a line terminator. |index| must be
// smaller than |input_length|.
size_t LengthOfLineEndingAtIndex(const char* input,
                                 size_t input_length,
                                 size_t index) {
  SECURITY_DCHECK(index < input_length);
  switch (input[index]) {
    case '\n':
      return 1;  // Bare LF.
    case '\r': {
      // A CR only forms a two-character ending when an LF directly follows it
      // inside the buffer; otherwise it stands alone (Classic Mac OS).
      const size_t next = index + 1;
      const bool followed_by_lf = next != input_length && input[next] == '\n';
      return followed_by_lf ? 2 : 1;
    }
    default:
      return 0;
  }
}
// Quoted-printable encodes the |input_length| bytes at |input| into |out|.
// Whatever |out| held before the call is discarded.
//
// With |is_header| false the output is a message body (RFC 2045 style):
//  - line endings in the input (LF, CR or CR-LF) are rewritten as CR-LF,
//    except for a line-ending character in the very last position, which is
//    hex-encoded like any other control character;
//  - space and tab are kept literal unless they end a line or the input;
//  - lines are kept within |kMaximumLineLength| by inserting "=" CR-LF soft
//    line breaks.
//
// With |is_header| true the output is one or more RFC 2047 "Q" encoded-words
// (=?utf-8?Q?...?=), joined by CR-LF followed by a space when the text has to
// be folded. Inside an encoded-word the following are always written as =XX:
//  - space and tab;
//  - '?' and '_', which RFC 2047 reserves ('?' delimits the encoded-word and
//    '_' decodes as a space);
//  - CR and LF, because a raw line break would terminate the header line and
//    corrupt the headers that follow.
void QuotedPrintableEncode(const char* input,
                           wtf_size_t input_length,
                           bool is_header,
                           Vector<char>& out) {
  out.clear();
  out.ReserveCapacity(input_length);
  if (is_header)
    out.Append(kRFC2047EncodingPrefix, kRFC2047EncodingPrefixLength);

  // Room available for encoded content on one line. In header mode the
  // encoded-word delimiters share the line, so they are deducted. This does
  // not depend on the character being processed, hence it is computed once.
  size_t max_line_length_for_encoded_content = kMaximumLineLength;
  if (is_header) {
    max_line_length_for_encoded_content -= kRFC2047EncodingPrefixLength;
    max_line_length_for_encoded_content -= kRFC2047EncodingSuffixLength;
  }

  size_t current_line_length = 0;
  for (size_t i = 0; i < input_length; ++i) {
    bool is_last_character = (i == input_length - 1);
    char current_character = input[i];
    bool requires_encoding = false;
    // All non-printable ASCII characters and = require encoding. Tab is
    // decided separately below.
    if ((current_character < ' ' || current_character > '~' ||
         current_character == '=') &&
        current_character != '\t')
      requires_encoding = true;

    // RFC 2047 section 4.2: '?' and '_' may not appear literally in Q-encoded
    // text.
    if (is_header && (current_character == '?' || current_character == '_'))
      requires_encoding = true;

    // Decide if space and tab characters need to be encoded.
    if (!requires_encoding &&
        (current_character == '\t' || current_character == ' ')) {
      if (is_header) {
        // White space characters should always be encoded if they appear
        // anywhere in the header.
        requires_encoding = true;
      } else {
        // In a body only trailing white space has to be protected.
        bool end_of_line =
            is_last_character ||
            LengthOfLineEndingAtIndex(input, input_length, i + 1) != 0;
        requires_encoding = end_of_line;
      }
    }

    // In a body, end of line should be converted to CR-LF sequences. Headers
    // skip this step so that CR and LF fall through to the =XX path instead
    // of being emitted raw.
    if (!is_header && !is_last_character) {
      size_t length_of_line_ending =
          LengthOfLineEndingAtIndex(input, input_length, i);
      if (length_of_line_ending) {
        out.Append("\r\n", 2);
        current_line_length = 0;
        i += (length_of_line_ending -
              1);  // -1 because we'll ++ in the for() above.
        continue;
      }
    }

    size_t length_of_encoded_character = 1;
    if (requires_encoding)
      length_of_encoded_character += 2;
    if (!is_last_character)
      length_of_encoded_character += 1;  // + 1 for the = (soft line break).

    // Insert a soft line break if necessary.
    if (current_line_length + length_of_encoded_character >
        max_line_length_for_encoded_content) {
      if (is_header) {
        // Close the current encoded-word, fold the header line, and open a
        // new encoded-word on the continuation line.
        out.Append(kRFC2047EncodingSuffix, kRFC2047EncodingSuffixLength);
        out.Append("\r\n", 2);
        out.push_back(' ');
        out.Append(kRFC2047EncodingPrefix, kRFC2047EncodingPrefixLength);
      } else {
        out.push_back('=');
        out.Append("\r\n", 2);
      }
      current_line_length = 0;
    }

    // Finally, insert the actual character(s).
    if (requires_encoding) {
      out.push_back('=');
      out.push_back(UpperNibbleToASCIIHexDigit(current_character));
      out.push_back(LowerNibbleToASCIIHexDigit(current_character));
      current_line_length += 3;
    } else {
      out.push_back(current_character);
      current_line_length++;
    }
  }
  if (is_header)
    out.Append(kRFC2047EncodingSuffix, kRFC2047EncodingSuffixLength);
}
// Returns |text| in a form made only of printable ASCII characters. Text that
// already consists solely of printable ASCII is handed back untouched;
// anything else is converted to UTF-8 and passed through
// QuotedPrintableEncode() in header mode, which wraps it as
// =?utf-8?Q?encoded_text?= per RFC 2047 (https://tools.ietf.org/html/rfc2047).
String ConvertToPrintableCharacters(const String& text) {
  // Walk past the leading run of printable characters; reaching the end means
  // nothing needs encoding.
  const wtf_size_t length = text.length();
  wtf_size_t printable_prefix_length = 0;
  while (printable_prefix_length < length &&
         IsASCIIPrintable(text[printable_prefix_length])) {
    ++printable_prefix_length;
  }
  if (printable_prefix_length == length)
    return text;

  // "utf-8" is the charset chosen to represent the text and "Q" selects the
  // quoted-printable flavour that yields 7-bit printable ASCII.
  const CString utf8_text = text.Utf8();
  Vector<char> encoded_text;
  QuotedPrintableEncode(utf8_text.data(), utf8_text.length(),
                        true /* is_header */, encoded_text);
  return String(encoded_text.data(), encoded_text.size());
}
} // namespace
// A freshly constructed archive reports kInvalidArchive as its load result
// until some other value is assigned to |load_result_|.
MHTMLArchive::MHTMLArchive() : load_result_(MHTMLLoadResult::kInvalidArchive) {}
// static
// Records |result| in the "PageSerialization.MhtmlLoading.LoadResult" UMA
// enumeration histogram.
void MHTMLArchive::ReportLoadResult(MHTMLLoadResult result) {
UMA_HISTOGRAM_ENUMERATION("PageSerialization.MhtmlLoading.LoadResult",
result);
}
// static
// Builds an archive from |data| (as loaded from |url|) via CreateArchive() and
// records the resulting load result in UMA before returning the archive. The
// returned archive may describe a failed load; check LoadResult().
MHTMLArchive* MHTMLArchive::Create(const KURL& url,
                                   scoped_refptr<const SharedBuffer> data) {
  // |data| is not used again here, so hand the reference over instead of
  // copying it (a copy costs an extra ref-count increment and decrement).
  MHTMLArchive* archive = CreateArchive(url, std::move(data));
  ReportLoadResult(archive->LoadResult());
  return archive;
}
// static
// Parses |data| as MHTML and returns a new archive whose |load_result_| says
// how that went:
//  - kEmptyFile when |data| is null or empty,
//  - kUrlSchemeNotAllowed when CanLoadArchive() rejects |url|,
//  - kInvalidArchive when the parser yields no resources,
//  - kMissingMainResource when no resource qualifies as the main resource,
//  - kSuccess otherwise.
// The first qualifying resource becomes the main resource; every other
// resource is registered as a subresource.
MHTMLArchive* MHTMLArchive::CreateArchive(
    const KURL& url,
    scoped_refptr<const SharedBuffer> data) {
  MHTMLArchive* archive = MakeGarbageCollected<MHTMLArchive>();

  // |data| may be null if archive file is empty.
  const bool has_content = data && !data->IsEmpty();
  if (!has_content) {
    archive->load_result_ = MHTMLLoadResult::kEmptyFile;
    return archive;
  }

  // MHTML pages can only be loaded from local URLs, http/https URLs, and
  // content URLs(Android specific). The latter is now allowed due to full
  // sandboxing enforcement on MHTML pages.
  if (!CanLoadArchive(url)) {
    archive->load_result_ = MHTMLLoadResult::kUrlSchemeNotAllowed;
    return archive;
  }

  MHTMLParser parser(std::move(data));
  HeapVector<Member<ArchiveResource>> resources = parser.ParseArchive();
  if (resources.IsEmpty()) {
    archive->load_result_ = MHTMLLoadResult::kInvalidArchive;
    return archive;
  }
  archive->date_ = parser.CreationDate();

  const bool is_single_resource_archive = resources.size() == 1;
  // Decides whether a resource of |mime_type| may serve as the main resource
  // of the top frame.
  const auto can_be_main_resource =
      [is_single_resource_archive](const AtomicString& mime_type) {
        bool suitable =
            MIMETypeRegistry::IsSupportedNonImageMIMEType(mime_type);
        // Want to allow image-only MHTML archives, but retain behavior for
        // other documents that have already been created expecting the first
        // HTML page to be considered the main resource.
        if (is_single_resource_archive &&
            MIMETypeRegistry::IsSupportedImageResourceMIMEType(mime_type)) {
          suitable = true;
        }
        // Explicitly disallow JS and CSS as the main resource.
        if (MIMETypeRegistry::IsSupportedJavaScriptMIMEType(mime_type) ||
            MIMETypeRegistry::IsSupportedStyleSheetMIMEType(mime_type)) {
          suitable = false;
        }
        return suitable;
      };

  // The first document suitable resource is the main resource of the top
  // frame; once it is found everything else is a subresource.
  for (ArchiveResource* resource : resources) {
    if (!archive->MainResource() &&
        can_be_main_resource(resource->MimeType())) {
      archive->SetMainResource(resource);
    } else {
      archive->AddSubresource(resource);
    }
  }

  archive->load_result_ = archive->MainResource()
                              ? MHTMLLoadResult::kSuccess
                              : MHTMLLoadResult::kMissingMainResource;
  return archive;
}
// Returns true when an MHTML archive may be loaded from |url|: its scheme is
// treated as local by SchemeRegistry, it belongs to the HTTP family, or (on
// Android builds only) it is a "content" URL. The latter is now allowed due to
// full sandboxing enforcement on MHTML pages.
bool MHTMLArchive::CanLoadArchive(const KURL& url) {
  bool allowed =
      SchemeRegistry::ShouldTreatURLSchemeAsLocal(url.Protocol()) ||
      url.ProtocolIsInHTTPFamily();
#if defined(OS_ANDROID)
  allowed = allowed || url.ProtocolIs("content");
#endif
  return allowed;
}
void MHTMLArchive::GenerateMHTMLHeader(const String& boundary,
const KURL& url,
const String& title,
const String& mime_type,
WTF::Time date,
Vector<char>& output_buffer) {
DCHECK(!boundary.IsEmpty());
DCHECK(!mime_type.IsEmpty());
String date_string = MakeRFC2822DateString(date, 0);
StringBuilder string_builder;
string_builder.Append("From: <Saved by Blink>\r\n");
// Add the document URL in the MHTML headers in order to avoid complicated
// parsing to locate it in the multipart body headers.
string_builder.Append("Snapshot-Content-Location: ");
string_builder.Append(url.GetString());
string_builder.Append("\r\nSubject: ");
string_builder.Append(ConvertToPrintableCharacters(title));
string_builder.Append("\r\nDate: ");
string_builder.Append(date_string);
string_builder.Append("\r\nMIME-Version: 1.0\r\n");
string_builder.Append("Content-Type: multipart/related;\r\n");
string_builder.Append("\ttype=\"");
string_builder.Append(mime_type);
string_builder.Append("\";\r\n");
string_builder.Append("\tboundary=\"");
string_builder.Append(boundary);
string_builder.Append("\"\r\n\r\n");
// We use utf8() below instead of ascii() as ascii() replaces CRLFs with ??
// (we still only have put ASCII characters in it).
DCHECK(string_builder.ToString().ContainsOnlyASCIIOrEmpty());
CString ascii_string = string_builder.ToString().Utf8();
output_buffer.Append(ascii_string.data(), ascii_string.length());
}
void MHTMLArchive::GenerateMHTMLPart(const String& boundary,
const String& content_id,
EncodingPolicy encoding_policy,
const SerializedResource& resource,
Vector<char>& output_buffer) {
DCHECK(!boundary.IsEmpty());
DCHECK(content_id.IsEmpty() || content_id[0] == '<');
StringBuilder string_builder;
// Per the spec, the boundary must occur at the beginning of a line.
string_builder.Append("\r\n--");
string_builder.Append(boundary);
string_builder.Append("\r\n");
string_builder.Append("Content-Type: ");
string_builder.Append(resource.mime_type);
string_builder.Append("\r\n");
if (!content_id.IsEmpty()) {
string_builder.Append("Content-ID: ");
string_builder.Append(content_id);
string_builder.Append("\r\n");
}
const char* content_encoding = nullptr;
if (encoding_policy == kUseBinaryEncoding)
content_encoding = kBinary;
else if (MIMETypeRegistry::IsSupportedJavaScriptMIMEType(
resource.mime_type) ||
MIMETypeRegistry::IsSupportedNonImageMIMEType(resource.mime_type))
content_encoding = kQuotedPrintable;
else
content_encoding = kBase64;
string_builder.Append("Content-Transfer-Encoding: ");
string_builder.Append(content_encoding);
string_builder.Append("\r\n");
if (!resource.url.ProtocolIsAbout()) {
string_builder.Append("Content-Location: ");
string_builder.Append(resource.url.GetString());
string_builder.Append("\r\n");
}
string_builder.Append("\r\n");
CString ascii_string = string_builder.ToString().Utf8();
output_buffer.Append(ascii_string.data(), ascii_string.length());
if (!strcmp(content_encoding, kBinary)) {
for (const auto& span : *resource.data)
output_buffer.Append(span.data(), SafeCast<wtf_size_t>(span.size()));
} else {
// FIXME: ideally we would encode the content as a stream without having to
// fetch it all.
const SharedBuffer::DeprecatedFlatData flat_data(resource.data);
const char* data = flat_data.Data();
size_t data_length = flat_data.size();
Vector<char> encoded_data;
if (!strcmp(content_encoding, kQuotedPrintable)) {
QuotedPrintableEncode(data, SafeCast<wtf_size_t>(data_length),
false /* is_header */, encoded_data);
output_buffer.Append(encoded_data.data(), encoded_data.size());
} else {
DCHECK(!strcmp(content_encoding, kBase64));
// We are not specifying insertLFs = true below as it would cut the lines
// with LFs and MHTML requires CRLFs.
Base64Encode(data, SafeCast<wtf_size_t>(data_length), encoded_data);
wtf_size_t index = 0;
wtf_size_t encoded_data_length = encoded_data.size();
do {
wtf_size_t line_length =
std::min(encoded_data_length - index, kMaximumLineLength);
output_buffer.Append(encoded_data.data() + index, line_length);
output_buffer.Append("\r\n", 2u);
index += kMaximumLineLength;
} while (index < encoded_data_length);
}
}
}
void MHTMLArchive::GenerateMHTMLFooterForTesting(const String& boundary,
Vector<char>& output_buffer) {
DCHECK(!boundary.IsEmpty());
CString ascii_string = String("\r\n--" + boundary + "--\r\n").Utf8();
output_buffer.Append(ascii_string.data(), ascii_string.length());
}
// Stores |main_resource| as this archive's main resource, replacing whatever
// was stored before.
void MHTMLArchive::SetMainResource(ArchiveResource* main_resource) {
main_resource_ = main_resource;
}
// Registers |resource| in |subresources_| under its own URL and, when its
// Content-ID converts to a valid URI, under that URI as well, so the resource
// can be found through either key.
void MHTMLArchive::AddSubresource(ArchiveResource* resource) {
  subresources_.Set(resource->Url(), resource);

  const KURL content_id_url =
      MHTMLParser::ConvertContentIDToURI(resource->ContentID());
  if (!content_id_url.IsValid())
    return;
  subresources_.Set(content_id_url, resource);
}
// Looks up the subresource stored under the string form of |url|.
// NOTE(review): the result for a URL that was never registered depends on
// what |subresources_.at()| returns for a missing key (presumably null) --
// confirm against the map type.
ArchiveResource* MHTMLArchive::SubresourceForURL(const KURL& url) const {
return subresources_.at(url.GetString());
}
// Reports the members that refer to archive resources (|main_resource_| and
// |subresources_|) to |visitor|.
void MHTMLArchive::Trace(blink::Visitor* visitor) {
visitor->Trace(main_resource_);
visitor->Trace(subresources_);
}
} // namespace blink