// blob: d07cabf28fb883e989d1f75a7d16f5af1d9696d1 [file] [log] [blame]
// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// This is a port of ManifestParser.cc from WebKit/WebCore/loader/appcache.
/*
* Copyright (C) 2008 Apple Inc. All Rights Reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "content/browser/appcache/appcache_manifest_parser.h"

#include <stddef.h>

#include <algorithm>
#include <string>
#include <tuple>
#include <utility>

#include "base/logging.h"
#include "base/strings/string_piece.h"
#include "base/strings/utf_string_conversions.h"
#include "url/gurl.h"
namespace content {
namespace {
// Values for the mode in the AppCache manifest parsing algorithm specification.
//
// The mode records which manifest section the line being processed belongs to.
// ParseManifest() starts in kExplicit and only changes the mode when it sees a
// mode-setting line (a line that ends with ':').
enum class Mode {
kExplicit, // In the CACHE: section. Also the mode parsing starts in.
kIntercept, // In the CHROMIUM-INTERCEPT: section. (non-standard)
kFallback, // In the FALLBACK: section.
kOnlineSafelist, // In the NETWORK: section.
kUnknown, // Sections that are not covered by the spec. Entries are skipped.
};
// True for the characters that AppCache treats as whitespace: space (0x20),
// tab (0x09), LF and CR.
constexpr bool IsWhiteSpace(char character) {
  switch (character) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}
// True for the characters that AppCache treats as line terminators: CR and LF.
constexpr bool IsNewLine(char character) {
  if (character == '\r')
    return true;
  return character == '\n';
}
// True for the characters that separate tokens within a manifest line: space
// (0x20) and tab (0x09). Newline characters are not token separators.
constexpr bool IsTokenSeparator(char character) {
  return character == '\t' ? true : character == ' ';
}
// Drops everything that precedes the first newline character.
//
// The result starts at the first CR or LF in |data|. It is empty if |data|
// contains no newline character.
base::StringPiece TrimToFirstNewLine(base::StringPiece data) {
  size_t newline_index = 0;
  for (const char character : data) {
    if (IsNewLine(character))
      break;
    ++newline_index;
  }
  return data.substr(newline_index);
}
// Drops the leading run of whitespace characters (as defined by
// IsWhiteSpace()) from the string.
base::StringPiece TrimStartingWhiteSpace(base::StringPiece data) {
  size_t first_kept = 0;
  for (; first_kept != data.length(); ++first_kept) {
    if (!IsWhiteSpace(data[first_kept]))
      break;
  }
  return data.substr(first_kept);
}
// Drops the trailing run of whitespace characters (as defined by
// IsWhiteSpace()) from the string.
base::StringPiece TrimTrailingWhiteSpace(base::StringPiece data) {
  // Number of leading characters to keep. Shrinks while the last kept
  // character is whitespace.
  size_t kept = data.size();
  while (kept > 0 && IsWhiteSpace(data[kept - 1]))
    --kept;
  return data.substr(0, kept);
}
// Cuts a string in two at its first newline character.
//
// The first element of the result is the first line, which never contains a
// newline. The second element is the remainder of the string, starting at the
// newline character that ended the first line; it is empty if |data| has no
// newline.
std::pair<base::StringPiece, base::StringPiece> SplitOnNewLine(
    base::StringPiece data) {
  size_t line_length = 0;
  for (const char character : data) {
    if (IsNewLine(character))
      break;
    ++line_length;
  }
  const base::StringPiece first_line = data.substr(0, line_length);
  const base::StringPiece remainder = data.substr(line_length);
  return std::make_pair(first_line, remainder);
}
// True if the string does not contain any newline (CR or LF) character.
//
// An empty string counts as a single line.
bool IsSingleLine(base::StringPiece maybe_line) {
  return std::none_of(maybe_line.begin(), maybe_line.end(), &IsNewLine);
}
// Extracts the first token from a manifest line.
//
// Tokens are delimited by space (0x20) or tab (0x09) characters.
//
// |line| must be a single line and must not begin with a whitespace character.
//
// The first element of the result is the token. The second element is what
// remains of the line once the token and the separators right after it have
// been consumed, so it never begins with a token separator.
std::pair<base::StringPiece, base::StringPiece> SplitLineToken(
    base::StringPiece line) {
  DCHECK(IsSingleLine(line));
  DCHECK(line.empty() || !IsWhiteSpace(line[0]));

  const size_t line_length = line.length();

  // The token runs up to the first separator, or to the end of the line.
  size_t token_length = 0;
  while (token_length != line_length && !IsTokenSeparator(line[token_length]))
    ++token_length;

  // The rest of the line begins after the separators that follow the token.
  size_t rest_start = token_length;
  for (; rest_start != line_length; ++rest_start) {
    if (!IsTokenSeparator(line[rest_start]))
      break;
  }

  return std::make_pair(line.substr(0, token_length), line.substr(rest_start));
}
// True if the given line switches the parser to a different mode.
//
// The AppCache parsing algorithm changes modes only on lines whose last
// character is ':' (colon) once surrounding whitespace has been removed.
//
// |line| must be a single line with whitespace already stripped from both
// ends. An empty line is never a mode-setting line.
bool IsModeSettingLine(base::StringPiece line) {
  DCHECK(IsSingleLine(line));

  const size_t line_length = line.length();
  if (line_length == 0)
    return false;

  DCHECK(!IsWhiteSpace(line[0])) << "line starts with whitespace";
  const char last_character = line[line_length - 1];
  DCHECK(!IsWhiteSpace(last_character)) << "line ends with whitespace";
  return last_character == ':';
}
// Maps a mode-setting line to the parser mode it selects.
//
// |line| must be a mode-setting line (see IsModeSettingLine()). Section
// headers are matched exactly and case-sensitively; any header that is not
// listed below selects Mode::kUnknown.
Mode ParseModeSettingLine(base::StringPiece line) {
  DCHECK(IsModeSettingLine(line));

  struct SectionHeader {
    base::StringPiece text;
    Mode mode;
  };
  static constexpr SectionHeader kSectionHeaders[] = {
      {base::StringPiece("CACHE:"), Mode::kExplicit},
      {base::StringPiece("FALLBACK:"), Mode::kFallback},
      {base::StringPiece("NETWORK:"), Mode::kOnlineSafelist},
      {base::StringPiece("CHROMIUM-INTERCEPT:"), Mode::kIntercept},
  };

  for (const SectionHeader& header : kSectionHeaders) {
    if (line == header.text)
      return header.mode;
  }
  return Mode::kUnknown;
}
// True if the first token of |line| is exactly the "isPattern" flag.
//
// Pattern URLs are a non-standard feature. |line| must satisfy the
// preconditions of SplitLineToken().
bool NextTokenIsPatternMatchingFlag(base::StringPiece line) {
  static constexpr base::StringPiece kPatternFlag("isPattern");
  const base::StringPiece next_token = SplitLineToken(line).first;
  return next_token == kPatternFlag;
}
// Converts a URL token from an AppCache manifest into a GURL.
//
// Per the AppCache specification, the token is resolved relative to
// |manifest_url| and any fragment (ref) is removed from the result.
//
// Callers must check the validity of the returned URL: if the token does not
// resolve to a valid URL, the invalid result is returned as-is.
GURL ParseUrlToken(base::StringPiece url_token, const GURL& manifest_url) {
  GURL resolved_url = manifest_url.Resolve(url_token);
  if (resolved_url.is_valid() && resolved_url.has_ref()) {
    GURL::Replacements strip_ref;
    strip_ref.ClearRef();
    return resolved_url.ReplaceComponents(strip_ref);
  }
  return resolved_url;
}
bool ScopeMatches(const GURL& manifest_url, const GURL& namespace_url) {
return base::StartsWith(namespace_url.spec(),
manifest_url.GetWithoutFilename().spec(),
base::CompareCase::SENSITIVE);
}
} // namespace
// The constructor and destructor are defaulted here, out of line, rather than
// in the class definition.
AppCacheManifest::AppCacheManifest() = default;
AppCacheManifest::~AppCacheManifest() = default;
// Parses an AppCache manifest and stores the parsed entries in |manifest|.
//
// |manifest_url| is the URL that the manifest's URL tokens are resolved
// against. It is also the reference for the scheme, origin and scope checks
// applied to entries. |manifest_bytes| and |manifest_size| describe the raw
// manifest contents; |manifest_size| must not be negative. |parse_mode|
// decides whether CHROMIUM-INTERCEPT entries and out-of-scope FALLBACK
// namespaces are accepted or ignored. |manifest| is expected to be in its
// freshly-constructed state (see the DCHECKs below).
//
// Returns false if the manifest does not start with a recognized signature
// ("CACHE MANIFEST" or "CHROMIUM CACHE MANIFEST", optionally preceded by a
// UTF-8 BOM) that is followed by whitespace or by the end of the input.
// Returns true otherwise. Entries that are malformed or not allowed are
// skipped instead of failing the whole parse.
bool ParseManifest(const GURL& manifest_url,
                   const char* manifest_bytes,
                   int manifest_size,
                   ParseMode parse_mode,
                   AppCacheManifest& manifest) {
  // The parsing algorithm is specified at
  // https://html.spec.whatwg.org/multipage/offline.html
  DCHECK(manifest.explicit_urls.empty());
  DCHECK(manifest.fallback_namespaces.empty());
  DCHECK(manifest.online_whitelist_namespaces.empty());
  DCHECK(!manifest.online_whitelist_all);
  DCHECK(!manifest.did_ignore_intercept_namespaces);
  DCHECK(!manifest.did_ignore_fallback_namespaces);

  // Entries that precede any section header belong to the explicit section.
  Mode mode = Mode::kExplicit;

  // The specification requires UTF-8-decoding the manifest, which replaces
  // invalid UTF-8 characters with placeholders. It would be nice if
  // utf_string_conversions included a UTF-8 to UTF-8 conversion for this
  // purpose, but AppCache isn't important enough to add conversion code just
  // to accelerate manifest decoding.
  DCHECK_GE(manifest_size, 0);
  base::string16 wide_manifest_bytes =
      base::UTF8ToUTF16(base::StringPiece(manifest_bytes, manifest_size));
  std::string decoded_manifest_bytes = base::UTF16ToUTF8(wide_manifest_bytes);

  // The bytes of the manifest that haven't been consumed yet.
  base::StringPiece data(decoded_manifest_bytes);

  // Discard a leading UTF-8 Byte-Order-Mark (BOM) (0xEF, 0xBB, 0xBF);
  static constexpr base::StringPiece kUtf8Bom("\xEF\xBB\xBF");
  if (data.starts_with(kUtf8Bom))
    data = data.substr(kUtf8Bom.length());

  // The manifest has to start with a well-defined signature.
  static constexpr base::StringPiece kSignature("CACHE MANIFEST");
  static constexpr base::StringPiece kChromiumSignature(
      "CHROMIUM CACHE MANIFEST");
  if (data.starts_with(kSignature)) {
    data = data.substr(kSignature.length());
  } else if (data.starts_with(kChromiumSignature)) {
    // Chrome recognizes a separate signature, CHROMIUM CACHE MANIFEST. This was
    // built so that manifests that use the Chrome-only feature
    // CHROMIUM-INTERCEPT will be ignored by other browsers.
    // See https://crbug.com/101565
    // TODO(pwnall): Add a UMA metric to see if we can remove support for this
    // non-standard signature.
    data = data.substr(kChromiumSignature.length());
  } else {
    return false;
  }

  // The character after "CACHE MANIFEST" must be a whitespace character.
  if (!data.empty() && !IsWhiteSpace(data[0]))
    return false;

  // The spec requires ignoring any characters on the first line after the
  // signature and its following whitespace.
  data = TrimToFirstNewLine(data);

  while (true) {
    // Skips blank lines as well as the indentation of the next line, because
    // newline characters count as whitespace.
    data = TrimStartingWhiteSpace(data);
    if (data.empty())
      break;

    base::StringPiece line;
    std::tie(line, data) = SplitOnNewLine(data);

    // The checks above guarantee that the input to SplitOnNewLine() starts with
    // a non-whitespace character.
    DCHECK(!line.empty());

    if (line[0] == '#')  // Lines starting with # are comments.
      continue;

    line = TrimTrailingWhiteSpace(line);

    // Handle all the steps checking for lines that end with ":".
    if (IsModeSettingLine(line)) {
      mode = ParseModeSettingLine(line);
      continue;
    }

    if (mode == Mode::kUnknown)
      continue;

    static constexpr base::StringPiece kOnlineSafelistWildcard("*");
    if (mode == Mode::kOnlineSafelist && line == kOnlineSafelistWildcard) {
      manifest.online_whitelist_all = true;
      continue;
    }

    // Chrome does not implement the SETTINGS: section. If we ever decided to do
    // so, the implementation would go here.

    // Common code for the following sections: explicit (CACHE:),
    // fallback (FALLBACK:), online safelist (NETWORK:) and intercept
    // (CHROMIUM-INTERCEPT:). All these sections start by parsing a URL token.
    base::StringPiece namespace_url_token;
    std::tie(namespace_url_token, line) = SplitLineToken(line);

    GURL namespace_url = ParseUrlToken(namespace_url_token, manifest_url);
    if (!namespace_url.is_valid())
      continue;

    if (mode == Mode::kExplicit || mode == Mode::kOnlineSafelist) {
      // Scheme component must be the same as the manifest URL's.
      if (namespace_url.scheme() != manifest_url.scheme()) {
        continue;
      }

      // Deviate from the HTML5 spec by supporting the caching of cross-origin
      // HTTPS resources. See https://crbug.com/69594
      //
      // Per the spec, explicit (CACHE:) cross-origin HTTPS resources should be
      // ignored here. We've opted for a milder constraint and allow caching
      // unless the resource has a "no-store" header. That condition is enforced
      // in AppCacheUpdateJob.

      if (mode == Mode::kExplicit) {
        manifest.explicit_urls.insert(namespace_url.spec());
      } else {
        // Chrome supports URL patterns in manifests. This is not standardized.
        // An URL record followed by the "isPattern" token is considered a
        // pattern.
        // TODO(pwnall): Add a UMA metric to see if we can remove this feature.
        bool is_pattern = NextTokenIsPatternMatchingFlag(line);
        // Construct the namespace in place, as the intercept section does,
        // instead of building a temporary and moving it into the vector.
        manifest.online_whitelist_namespaces.emplace_back(
            APPCACHE_NETWORK_NAMESPACE, namespace_url, GURL(), is_pattern);
      }
      continue;
    }

    if (mode == Mode::kIntercept) {
      // Chrome supports a CHROMIUM-INTERCEPT section. https://crbug.com/101565
      //
      // This section consists of entries of the form:
      // namespace_url verb url_target
      if (parse_mode != PARSE_MANIFEST_ALLOWING_DANGEROUS_FEATURES) {
        manifest.did_ignore_intercept_namespaces = true;
        continue;
      }

      if (manifest_url.GetOrigin() != namespace_url.GetOrigin())
        continue;

      // The only supported verb is "return".
      base::StringPiece verb_token;
      std::tie(verb_token, line) = SplitLineToken(line);
      static constexpr base::StringPiece kReturnVerb("return");
      if (verb_token != kReturnVerb)
        continue;

      base::StringPiece target_url_token;
      std::tie(target_url_token, line) = SplitLineToken(line);
      if (target_url_token.empty())
        continue;
      GURL target_url = ParseUrlToken(target_url_token, manifest_url);
      if (!target_url.is_valid())
        continue;

      if (manifest_url.GetOrigin() != target_url.GetOrigin())
        continue;

      // TODO(pwnall): Add a UMA metric to see if we can remove this feature.
      bool is_pattern = NextTokenIsPatternMatchingFlag(line);

      manifest.intercept_namespaces.emplace_back(
          APPCACHE_INTERCEPT_NAMESPACE, namespace_url, target_url, is_pattern);
      continue;
    }

    if (mode == Mode::kFallback) {
      if (manifest_url.GetOrigin() != namespace_url.GetOrigin())
        continue;

      // Unless dangerous features are allowed, fallback namespaces outside the
      // manifest's scope are dropped, and the drop is recorded in |manifest|.
      if (parse_mode != PARSE_MANIFEST_ALLOWING_DANGEROUS_FEATURES) {
        if (!ScopeMatches(manifest_url, namespace_url)) {
          manifest.did_ignore_fallback_namespaces = true;
          continue;
        }
      }

      base::StringPiece fallback_url_token;
      std::tie(fallback_url_token, line) = SplitLineToken(line);
      if (fallback_url_token.empty())
        continue;
      GURL fallback_url = ParseUrlToken(fallback_url_token, manifest_url);
      if (!fallback_url.is_valid())
        continue;

      if (manifest_url.GetOrigin() != fallback_url.GetOrigin())
        continue;

      // TODO(pwnall): Add a UMA metric to see if we can remove this feature.
      bool is_pattern = NextTokenIsPatternMatchingFlag(line);

      // Store regardless of duplicate namespace URL. Only the first match will
      // ever be used.
      manifest.fallback_namespaces.emplace_back(
          APPCACHE_FALLBACK_NAMESPACE, namespace_url, fallback_url, is_pattern);
      continue;
    }

    NOTREACHED() << "Unimplemented AppCache manifest parser mode";
  }

  return true;
}
} // namespace content