content/browser/appcache/appcache_manifest_parser.cc - chromium/src - Git at Google

 // Copyright 2014 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 //
 // This is a port of ManifestParser.cc from WebKit/WebCore/loader/appcache.

 /*
  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #include "content/browser/appcache/appcache_manifest_parser.h"

 #include <stddef.h>

 #include <tuple>
 #include <utility>

 #include "base/logging.h"
 #include "base/strings/string_piece.h"
 #include "base/strings/utf_string_conversions.h"
 #include "url/gurl.h"

 namespace content {

 namespace {

 // Parser modes from the AppCache manifest parsing algorithm specification.
 // Each mode names the manifest section whose entries are being read.
 enum class Mode {
   // In the CACHE: section.
   kExplicit,
   // In the CHROMIUM-INTERCEPT: section. (non-standard)
   kIntercept,
   // In the FALLBACK: section.
   kFallback,
   // In the NETWORK: section.
   kOnlineSafelist,
   // In a section that is not covered by the spec.
   kUnknown,
 };

 // True for the characters AppCache treats as whitespace: space (0x20),
 // tab (0x09), LF and CR.
 constexpr bool IsWhiteSpace(char character) {
   switch (character) {
     case ' ':
     case '\t':
     case '\n':
     case '\r':
       return true;
     default:
       return false;
   }
 }

 // True for the characters AppCache treats as line terminators: LF and CR.
 constexpr bool IsNewLine(char character) {
   if (character == '\r')
     return true;
   return character == '\n';
 }

 // True for the characters that separate tokens on an AppCache manifest line:
 // space (0x20) and tab (0x09). Newline characters are not token separators.
 constexpr bool IsTokenSeparator(char character) {
   const bool is_space = character == ' ';
   const bool is_tab = character == '\t';
   return is_space || is_tab;
 }

 // Drops every character that precedes the first newline in |data|.
 //
 // The result starts at the first newline character, or is empty when |data|
 // does not contain a newline.
 constexpr base::StringPiece TrimToFirstNewLine(base::StringPiece data) {
   size_t newline_offset = 0;
   for (; newline_offset < data.length(); ++newline_offset) {
     if (IsNewLine(data[newline_offset]))
       break;
   }
   return data.substr(newline_offset);
 }

 // Drops the leading run of whitespace characters from |data|.
 //
 // The result is empty when |data| consists only of whitespace.
 constexpr base::StringPiece TrimStartingWhiteSpace(base::StringPiece data) {
   size_t first_kept = 0;
   for (; first_kept < data.length(); ++first_kept) {
     if (!IsWhiteSpace(data[first_kept]))
       break;
   }
   return data.substr(first_kept);
 }

 // Drops the trailing run of whitespace characters from |data|.
 //
 // The result is empty when |data| consists only of whitespace.
 constexpr base::StringPiece TrimTrailingWhiteSpace(base::StringPiece data) {
   // Shrink the kept prefix while its last character is whitespace.
   size_t kept_length = data.size();
   while (kept_length > 0 && IsWhiteSpace(data[kept_length - 1]))
     --kept_length;
   return data.substr(0, kept_length);
 }

 // Cuts |data| in two at its first newline character.
 //
 // The first element of the result is the text before the newline, so it never
 // contains a newline. The second element is everything from the newline
 // onwards, and is empty when |data| has no newline.
 constexpr std::pair<base::StringPiece, base::StringPiece> SplitOnNewLine(
     base::StringPiece data) {
   size_t line_length = 0;
   for (; line_length < data.length(); ++line_length) {
     if (IsNewLine(data[line_length]))
       break;
   }
   base::StringPiece first_line = data.substr(0, line_length);
   base::StringPiece remainder = data.substr(line_length);
   return {first_line, remainder};
 }

 // True if |maybe_line| contains no newline (CR or LF) character. An empty
 // string counts as a single line.
 bool IsSingleLine(base::StringPiece maybe_line) {
   for (const char character : maybe_line) {
     if (IsNewLine(character))
       return false;
   }
   return true;
 }

 // Extracts the first token of a manifest line.
 //
 // Tokens are delimited by space (0x20) or tab (0x09) characters.
 //
 // |line| must be a single line and must not begin with whitespace.
 //
 // The first element of the result is the token. The second element is the
 // remainder of the line with the separators that followed the token already
 // consumed, so it does not start with whitespace.
 std::pair<base::StringPiece, base::StringPiece> SplitLineToken(
     base::StringPiece line) {
   DCHECK(IsSingleLine(line));
   DCHECK(line.empty() || !IsWhiteSpace(line[0]));

   const size_t line_length = line.length();

   // The token extends up to, but does not include, the first separator.
   size_t token_length = 0;
   for (; token_length < line_length; ++token_length) {
     if (IsTokenSeparator(line[token_length]))
       break;
   }

   // The remainder begins after the run of separators following the token.
   size_t rest_start = token_length;
   for (; rest_start < line_length; ++rest_start) {
     if (!IsTokenSeparator(line[rest_start]))
       break;
   }

   return {line.substr(0, token_length), line.substr(rest_start)};
 }

 // True if |line| switches the parser to a different mode.
 //
 // The AppCache parsing algorithm only changes mode on lines whose last
 // character, once surrounding whitespace is removed, is ':' (colon).
 //
 // |line| must be a single line with whitespace already stripped at both ends.
 // An empty line is never a mode-setting line.
 bool IsModeSettingLine(base::StringPiece line) {
   DCHECK(IsSingleLine(line));

   const size_t line_length = line.length();
   if (line_length == 0)
     return false;

   DCHECK(!IsWhiteSpace(line[0])) << "line starts with whitespace";

   const char final_character = line[line_length - 1];
   DCHECK(!IsWhiteSpace(final_character)) << "line ends with whitespace";

   return final_character == ':';
 }

 // Maps a mode-setting line to the mode that the parser switches to.
 //
 // |line| must be a mode-setting line. Lines that do not exactly match one of
 // the recognized section headers yield Mode::kUnknown.
 Mode ParseModeSettingLine(base::StringPiece line) {
   DCHECK(IsModeSettingLine(line));

   // Recognized section headers, and the mode each one selects.
   struct SectionHeader {
     base::StringPiece text;
     Mode mode;
   };
   static constexpr SectionHeader kSectionHeaders[] = {
       {"CACHE:", Mode::kExplicit},
       {"FALLBACK:", Mode::kFallback},
       {"NETWORK:", Mode::kOnlineSafelist},
       {"CHROMIUM-INTERCEPT:", Mode::kIntercept},
   };

   for (const SectionHeader& header : kSectionHeaders) {
     if (line == header.text)
       return header.mode;
   }
   return Mode::kUnknown;
 }

 // True if the first token in |line| is the "isPattern" flag.
 //
 // The flag marks the preceding URL as a pattern. Pattern URLs are a
 // non-standard feature.
 bool NextTokenIsPatternMatchingFlag(base::StringPiece line) {
   static constexpr base::StringPiece kPatternFlag("isPattern");

   // Only the token matters here; the remainder of the line is discarded.
   const base::StringPiece next_token = SplitLineToken(line).first;
   return next_token == kPatternFlag;
 }

 // Turns a URL token from an AppCache manifest into a GURL.
 //
 // Per the AppCache specification, |url_token| is resolved relative to
 // |manifest_url|, and any fragment is removed from the result.
 //
 // The returned URL is invalid when the token does not resolve to a valid URL,
 // so callers must check is_valid().
 GURL ParseUrlToken(base::StringPiece url_token, const GURL& manifest_url) {
   GURL resolved_url = manifest_url.Resolve(url_token);

   // Invalid URLs and URLs without a fragment are returned untouched.
   if (!resolved_url.is_valid() || !resolved_url.has_ref())
     return resolved_url;

   GURL::Replacements strip_fragment;
   strip_fragment.ClearRef();
   return resolved_url.ReplaceComponents(strip_fragment);
 }

 bool ScopeMatches(const GURL& manifest_url, const GURL& namespace_url) {
   return base::StartsWith(namespace_url.spec(),
                           manifest_url.GetWithoutFilename().spec(),
                           base::CompareCase::SENSITIVE);
 }

 }  // namespace

 // The constructor and destructor are explicitly defaulted in this
 // implementation file; neither adds any hand-written logic.
 AppCacheManifest::AppCacheManifest() = default;

 AppCacheManifest::~AppCacheManifest() = default;

 bool ParseManifest(const GURL& manifest_url,
                    const char* manifest_bytes,
                    int manifest_size,
                    ParseMode parse_mode,
                    AppCacheManifest& manifest) {
   // The parsing algorithm is specified at
   //   https://html.spec.whatwg.org/multipage/offline.html

   DCHECK(manifest.explicit_urls.empty());
   DCHECK(manifest.fallback_namespaces.empty());
   DCHECK(manifest.online_whitelist_namespaces.empty());
   DCHECK(!manifest.online_whitelist_all);
   DCHECK(!manifest.did_ignore_intercept_namespaces);
   DCHECK(!manifest.did_ignore_fallback_namespaces);

   Mode mode = Mode::kExplicit;

   // The specification requires UTF-8-decoding the manifest, which replaces
   // invalid UTF-8 characters with placeholders. It would be nice if
   // utf_string_conversions included a UTF-8 to UTF-8 conversion for this
   // purpose, but AppCache isn't important enough to add conversion code just
   // to accelerate manifest decoding.
   DCHECK_GE(manifest_size, 0);
   base::string16 wide_manifest_bytes =
       base::UTF8ToUTF16(base::StringPiece(manifest_bytes, manifest_size));
   std::string decoded_manifest_bytes = base::UTF16ToUTF8(wide_manifest_bytes);

   // The bytes of the manifest that haven't been consumed yet.
   base::StringPiece data(decoded_manifest_bytes);

   // Discard a leading UTF-8 Byte-Order-Mark (BOM) (0xEF, 0xBB, 0xBF);
   static constexpr base::StringPiece kUtf8Bom("\xEF\xBB\xBF");
   if (data.starts_with(kUtf8Bom))
     data = data.substr(kUtf8Bom.length());

   // The manifest has to start with a well-defined signature.
   static constexpr base::StringPiece kSignature("CACHE MANIFEST");
   static constexpr base::StringPiece kChromiumSignature(
       "CHROMIUM CACHE MANIFEST");
   if (data.starts_with(kSignature)) {
     data = data.substr(kSignature.length());
   } else if (data.starts_with(kChromiumSignature)) {
     // Chrome recognizes a separate signature, CHROMIUM CACHE MANIFEST. This was
     // built so that manifests that use the Chrome-only feature
     // CHROMIUM-INTERCEPT will be ignored by other browsers.
     // See https://crbug.com/101565

     // TODO(pwnall): Add a UMA metric to see if we can remove support for this
     //               non-standard signature.
     data = data.substr(kChromiumSignature.length());
   } else {
     return false;
   }

   // The character after "CACHE MANIFEST" must be a whitespace character.
   if (!data.empty() && !IsWhiteSpace(data[0]))
     return false;

   // The spec requires ignoring any characters on the first line after the
   // signature and its following whitespace.
   data = TrimToFirstNewLine(data);

   while (true) {
     data = TrimStartingWhiteSpace(data);
     if (data.empty())
       break;

     base::StringPiece line;
     std::tie(line, data) = SplitOnNewLine(data);

     // The checks above guarantee that the input to SplitOnNewLine() starts with
     // a non-whitespace character.
     DCHECK(!line.empty());

     if (line[0] == '#')  // Lines starting with # are comments.
       continue;

     line = TrimTrailingWhiteSpace(line);

     // Handle all the steps checking for lines that end with ":".
     if (IsModeSettingLine(line)) {
       mode = ParseModeSettingLine(line);
       continue;
     }

     if (mode == Mode::kUnknown)
       continue;

     static constexpr base::StringPiece kOnlineSafelistWildcard("*");
     if (mode == Mode::kOnlineSafelist && line == kOnlineSafelistWildcard) {
       manifest.online_whitelist_all = true;
       continue;
     }

     // Chrome does not implement the SETTINGS: section. If we ever decided to do
     // so, the implementation would go here.

     // Common code for the following sections: explicit (CACHE:),
     // fallback (FALLBACK:), online safelist (NETWORK:) and intercept
     // (CHROMIUM-INTERCEPT:). All these sections start by parsing a URL token.
     base::StringPiece namespace_url_token;
     std::tie(namespace_url_token, line) = SplitLineToken(line);
     GURL namespace_url = ParseUrlToken(namespace_url_token, manifest_url);
     if (!namespace_url.is_valid())
       continue;

     if (mode == Mode::kExplicit || mode == Mode::kOnlineSafelist) {
       // Scheme component must be the same as the manifest URL's.
       if (namespace_url.scheme() != manifest_url.scheme()) {
         continue;
       }

       // Deviate from the HTML5 spec by supporting the caching of cross-origin
       // HTTPS resources. See https://crbug.com/69594
       //
       // Per the spec, explicit (CACHE:) cross-origin HTTPS resources should be
       // ignored here. We've opted for a milder constraint and allow caching
       // unless the resource has a "no-store" header. That condition is enforced
       // in AppCacheUpdateJob.

       if (mode == Mode::kExplicit) {
         manifest.explicit_urls.insert(namespace_url.spec());
       } else {
         // Chrome supports URL patterns in manifests. This is not standardized.
         // An URL record followed by the "isPattern" token is considered a
         // pattern.

         // TODO(pwnall): Add a UMA metric to see if we can remove this feature.
         bool is_pattern = NextTokenIsPatternMatchingFlag(line);
         manifest.online_whitelist_namespaces.emplace_back(AppCacheNamespace(
             APPCACHE_NETWORK_NAMESPACE, namespace_url, GURL(), is_pattern));
       }
       continue;
     }

     if (mode == Mode::kIntercept) {
       // Chrome supports a CHROMIUM-INTERCEPT section.  https://crbug.com/101565
       //
       // This section consists of entries of the form:
       // namespace_url verb url_target

       if (parse_mode != PARSE_MANIFEST_ALLOWING_DANGEROUS_FEATURES) {
         manifest.did_ignore_intercept_namespaces = true;
         continue;
       }

       if (manifest_url.GetOrigin() != namespace_url.GetOrigin())
         continue;

       // The only supported verb is "return".
       base::StringPiece verb_token;
       std::tie(verb_token, line) = SplitLineToken(line);
       static constexpr base::StringPiece kReturnVerb("return");
       if (verb_token != kReturnVerb)
         continue;

       base::StringPiece target_url_token;
       std::tie(target_url_token, line) = SplitLineToken(line);
       if (target_url_token.empty())
         continue;
       GURL target_url = ParseUrlToken(target_url_token, manifest_url);
       if (!target_url.is_valid())
         continue;

       if (manifest_url.GetOrigin() != target_url.GetOrigin())
         continue;

       // TODO(pwnall): Add a UMA metric to see if we can remove this feature.
       bool is_pattern = NextTokenIsPatternMatchingFlag(line);
       manifest.intercept_namespaces.emplace_back(
           APPCACHE_INTERCEPT_NAMESPACE, namespace_url, target_url, is_pattern);
       continue;
     }

     if (mode == Mode::kFallback) {
       if (manifest_url.GetOrigin() != namespace_url.GetOrigin())
         continue;

       if (parse_mode != PARSE_MANIFEST_ALLOWING_DANGEROUS_FEATURES) {
         if (!ScopeMatches(manifest_url, namespace_url)) {
           manifest.did_ignore_fallback_namespaces = true;
           continue;
         }
       }

       base::StringPiece fallback_url_token;
       std::tie(fallback_url_token, line) = SplitLineToken(line);
       if (fallback_url_token.empty())
         continue;
       GURL fallback_url = ParseUrlToken(fallback_url_token, manifest_url);
       if (!fallback_url.is_valid())
         continue;

       if (manifest_url.GetOrigin() != fallback_url.GetOrigin())
         continue;

       // TODO(pwnall): Add a UMA metric to see if we can remove this feature.
       bool is_pattern = NextTokenIsPatternMatchingFlag(line);

       // Store regardless of duplicate namespace URL. Only the first match will
       // ever be used.
       manifest.fallback_namespaces.emplace_back(
           AppCacheNamespace(APPCACHE_FALLBACK_NAMESPACE, namespace_url,
                             fallback_url, is_pattern));
       continue;
     }

     NOTREACHED() << "Unimplemented AppCache manifest parser mode";
   }

   return true;
 }

 }  // namespace content
	// Copyright 2014 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.
	//
	// This is a port of ManifestParser.cc from WebKit/WebCore/loader/appcache.

	/*
	* Copyright (C) 2008 Apple Inc. All Rights Reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	#include "content/browser/appcache/appcache_manifest_parser.h"

	#include <stddef.h>

	#include <tuple>
	#include <utility>

	#include "base/logging.h"
	#include "base/strings/string_piece.h"
	#include "base/strings/utf_string_conversions.h"
	#include "url/gurl.h"

	namespace content {

	namespace {

	// Parser modes from the AppCache manifest parsing algorithm specification.
	// Each mode names the manifest section whose entries are being read.
	enum class Mode {
	  // In the CACHE: section.
	  kExplicit,
	  // In the CHROMIUM-INTERCEPT: section. (non-standard)
	  kIntercept,
	  // In the FALLBACK: section.
	  kFallback,
	  // In the NETWORK: section.
	  kOnlineSafelist,
	  // In a section that is not covered by the spec.
	  kUnknown,
	};

	// True for the characters AppCache treats as whitespace: space (0x20),
	// tab (0x09), LF and CR.
	constexpr bool IsWhiteSpace(char character) {
	  return (character == ' ') || (character == '\t') || (character == '\n') ||
	         (character == '\r');
	}

	// True for the characters AppCache treats as line terminators: LF and CR.
	constexpr bool IsNewLine(char character) {
	  return (character == '\n') || (character == '\r');
	}

	// True for the characters that separate tokens on an AppCache manifest line:
	// space (0x20) and tab (0x09). Newline characters are not token separators.
	constexpr bool IsTokenSeparator(char character) {
	  return (character == ' ') || (character == '\t');
	}

	// Drops every character that precedes the first newline in |data|.
	//
	// The result starts at the first newline character, or is empty when |data|
	// does not contain a newline.
	constexpr base::StringPiece TrimToFirstNewLine(base::StringPiece data) {
	  size_t newline_offset = 0;
	  for (; newline_offset < data.length(); ++newline_offset) {
	    if (IsNewLine(data[newline_offset]))
	      break;
	  }
	  return data.substr(newline_offset);
	}

	// Drops the leading run of whitespace characters from |data|.
	//
	// The result is empty when |data| consists only of whitespace.
	constexpr base::StringPiece TrimStartingWhiteSpace(base::StringPiece data) {
	  size_t first_kept = 0;
	  for (; first_kept < data.length(); ++first_kept) {
	    if (!IsWhiteSpace(data[first_kept]))
	      break;
	  }
	  return data.substr(first_kept);
	}

	// Drops the trailing run of whitespace characters from |data|.
	//
	// The result is empty when |data| consists only of whitespace.
	constexpr base::StringPiece TrimTrailingWhiteSpace(base::StringPiece data) {
	  // Shrink the kept prefix while its last character is whitespace.
	  size_t kept_length = data.size();
	  while (kept_length > 0 && IsWhiteSpace(data[kept_length - 1]))
	    --kept_length;
	  return data.substr(0, kept_length);
	}

	// Cuts |data| in two at its first newline character.
	//
	// The first element of the result is the text before the newline, so it never
	// contains a newline. The second element is everything from the newline
	// onwards, and is empty when |data| has no newline.
	constexpr std::pair<base::StringPiece, base::StringPiece> SplitOnNewLine(
	    base::StringPiece data) {
	  size_t line_length = 0;
	  for (; line_length < data.length(); ++line_length) {
	    if (IsNewLine(data[line_length]))
	      break;
	  }
	  base::StringPiece first_line = data.substr(0, line_length);
	  base::StringPiece remainder = data.substr(line_length);
	  return {first_line, remainder};
	}

	// True if |maybe_line| contains no newline (CR or LF) character. An empty
	// string counts as a single line.
	bool IsSingleLine(base::StringPiece maybe_line) {
	  for (const char character : maybe_line) {
	    if (IsNewLine(character))
	      return false;
	  }
	  return true;
	}

	// Extracts the first token of a manifest line.
	//
	// Tokens are delimited by space (0x20) or tab (0x09) characters.
	//
	// |line| must be a single line and must not begin with whitespace.
	//
	// The first element of the result is the token. The second element is the
	// remainder of the line with the separators that followed the token already
	// consumed, so it does not start with whitespace.
	std::pair<base::StringPiece, base::StringPiece> SplitLineToken(
	    base::StringPiece line) {
	  DCHECK(IsSingleLine(line));
	  DCHECK(line.empty() || !IsWhiteSpace(line[0]));

	  // The token extends up to, but does not include, the first separator.
	  size_t token_end = 0;
	  while (token_end < line.length() && !IsTokenSeparator(line[token_end]))
	    ++token_end;

	  // The remainder begins after the run of separators following the token.
	  size_t split = token_end;
	  while (split < line.length() && IsTokenSeparator(line[split]))
	    ++split;

	  return {line.substr(0, token_end), line.substr(split)};
	}

	// True if |line| switches the parser to a different mode.
	//
	// The AppCache parsing algorithm only changes mode on lines whose last
	// character, once surrounding whitespace is removed, is ':' (colon).
	//
	// |line| must be a single line with whitespace already stripped at both ends.
	// An empty line is never a mode-setting line.
	bool IsModeSettingLine(base::StringPiece line) {
	  DCHECK(IsSingleLine(line));

	  const size_t line_length = line.length();
	  if (line_length == 0)
	    return false;

	  DCHECK(!IsWhiteSpace(line[0])) << "line starts with whitespace";

	  const char final_character = line[line_length - 1];
	  DCHECK(!IsWhiteSpace(final_character)) << "line ends with whitespace";

	  return final_character == ':';
	}

	// Maps a mode-setting line to the mode that the parser switches to.
	//
	// |line| must be a mode-setting line. Lines that do not exactly match one of
	// the recognized section headers yield Mode::kUnknown.
	Mode ParseModeSettingLine(base::StringPiece line) {
	  DCHECK(IsModeSettingLine(line));

	  // Recognized section headers, and the mode each one selects.
	  struct SectionHeader {
	    base::StringPiece text;
	    Mode mode;
	  };
	  static constexpr SectionHeader kSectionHeaders[] = {
	      {"CACHE:", Mode::kExplicit},
	      {"FALLBACK:", Mode::kFallback},
	      {"NETWORK:", Mode::kOnlineSafelist},
	      {"CHROMIUM-INTERCEPT:", Mode::kIntercept},
	  };

	  for (const SectionHeader& header : kSectionHeaders) {
	    if (line == header.text)
	      return header.mode;
	  }
	  return Mode::kUnknown;
	}

	// True if the first token in |line| is the "isPattern" flag.
	//
	// The flag marks the preceding URL as a pattern. Pattern URLs are a
	// non-standard feature.
	bool NextTokenIsPatternMatchingFlag(base::StringPiece line) {
	  static constexpr base::StringPiece kPatternFlag("isPattern");

	  // Only the token matters here; the remainder of the line is discarded.
	  const base::StringPiece next_token = SplitLineToken(line).first;
	  return next_token == kPatternFlag;
	}

	// Turns a URL token from an AppCache manifest into a GURL.
	//
	// Per the AppCache specification, |url_token| is resolved relative to
	// |manifest_url|, and any fragment is removed from the result.
	//
	// The returned URL is invalid when the token does not resolve to a valid URL,
	// so callers must check is_valid().
	GURL ParseUrlToken(base::StringPiece url_token, const GURL& manifest_url) {
	  GURL resolved_url = manifest_url.Resolve(url_token);

	  // Invalid URLs and URLs without a fragment are returned untouched.
	  if (!resolved_url.is_valid() || !resolved_url.has_ref())
	    return resolved_url;

	  GURL::Replacements strip_fragment;
	  strip_fragment.ClearRef();
	  return resolved_url.ReplaceComponents(strip_fragment);
	}

	bool ScopeMatches(const GURL& manifest_url, const GURL& namespace_url) {
	return base::StartsWith(namespace_url.spec(),
	manifest_url.GetWithoutFilename().spec(),
	base::CompareCase::SENSITIVE);
	}

	} // namespace

	// The constructor and destructor are explicitly defaulted in this
	// implementation file; neither adds any hand-written logic.
	AppCacheManifest::AppCacheManifest() = default;

	AppCacheManifest::~AppCacheManifest() = default;

	bool ParseManifest(const GURL& manifest_url,
	const char* manifest_bytes,
	int manifest_size,
	ParseMode parse_mode,
	AppCacheManifest& manifest) {
	// The parsing algorithm is specified at
	// https://html.spec.whatwg.org/multipage/offline.html

	DCHECK(manifest.explicit_urls.empty());
	DCHECK(manifest.fallback_namespaces.empty());
	DCHECK(manifest.online_whitelist_namespaces.empty());
	DCHECK(!manifest.online_whitelist_all);
	DCHECK(!manifest.did_ignore_intercept_namespaces);
	DCHECK(!manifest.did_ignore_fallback_namespaces);

	Mode mode = Mode::kExplicit;

	// The specification requires UTF-8-decoding the manifest, which replaces
	// invalid UTF-8 characters with placeholders. It would be nice if
	// utf_string_conversions included a UTF-8 to UTF-8 conversion for this
	// purpose, but AppCache isn't important enough to add conversion code just
	// to accelerate manifest decoding.
	DCHECK_GE(manifest_size, 0);
	base::string16 wide_manifest_bytes =
	base::UTF8ToUTF16(base::StringPiece(manifest_bytes, manifest_size));
	std::string decoded_manifest_bytes = base::UTF16ToUTF8(wide_manifest_bytes);

	// The bytes of the manifest that haven't been consumed yet.
	base::StringPiece data(decoded_manifest_bytes);

	// Discard a leading UTF-8 Byte-Order-Mark (BOM) (0xEF, 0xBB, 0xBF);
	static constexpr base::StringPiece kUtf8Bom("\xEF\xBB\xBF");
	if (data.starts_with(kUtf8Bom))
	data = data.substr(kUtf8Bom.length());

	// The manifest has to start with a well-defined signature.
	static constexpr base::StringPiece kSignature("CACHE MANIFEST");
	static constexpr base::StringPiece kChromiumSignature(
	"CHROMIUM CACHE MANIFEST");
	if (data.starts_with(kSignature)) {
	data = data.substr(kSignature.length());
	} else if (data.starts_with(kChromiumSignature)) {
	// Chrome recognizes a separate signature, CHROMIUM CACHE MANIFEST. This was
	// built so that manifests that use the Chrome-only feature
	// CHROMIUM-INTERCEPT will be ignored by other browsers.
	// See https://crbug.com/101565

	// TODO(pwnall): Add a UMA metric to see if we can remove support for this
	// non-standard signature.
	data = data.substr(kChromiumSignature.length());
	} else {
	return false;
	}

	// The character after "CACHE MANIFEST" must be a whitespace character.
	if (!data.empty() && !IsWhiteSpace(data[0]))
	return false;

	// The spec requires ignoring any characters on the first line after the
	// signature and its following whitespace.
	data = TrimToFirstNewLine(data);

	while (true) {
	data = TrimStartingWhiteSpace(data);
	if (data.empty())
	break;

	base::StringPiece line;
	std::tie(line, data) = SplitOnNewLine(data);

	// The checks above guarantee that the input to SplitOnNewLine() starts with
	// a non-whitespace character.
	DCHECK(!line.empty());

	if (line[0] == '#') // Lines starting with # are comments.
	continue;

	line = TrimTrailingWhiteSpace(line);

	// Handle all the steps checking for lines that end with ":".
	if (IsModeSettingLine(line)) {
	mode = ParseModeSettingLine(line);
	continue;
	}

	if (mode == Mode::kUnknown)
	continue;

	static constexpr base::StringPiece kOnlineSafelistWildcard("*");
	if (mode == Mode::kOnlineSafelist && line == kOnlineSafelistWildcard) {
	manifest.online_whitelist_all = true;
	continue;
	}

	// Chrome does not implement the SETTINGS: section. If we ever decided to do
	// so, the implementation would go here.

	// Common code for the following sections: explicit (CACHE:),
	// fallback (FALLBACK:), online safelist (NETWORK:) and intercept
	// (CHROMIUM-INTERCEPT:). All these sections start by parsing a URL token.
	base::StringPiece namespace_url_token;
	std::tie(namespace_url_token, line) = SplitLineToken(line);
	GURL namespace_url = ParseUrlToken(namespace_url_token, manifest_url);
	if (!namespace_url.is_valid())
	continue;

	if (mode == Mode::kExplicit \|\| mode == Mode::kOnlineSafelist) {
	// Scheme component must be the same as the manifest URL's.
	if (namespace_url.scheme() != manifest_url.scheme()) {
	continue;
	}

	// Deviate from the HTML5 spec by supporting the caching of cross-origin
	// HTTPS resources. See https://crbug.com/69594
	//
	// Per the spec, explicit (CACHE:) cross-origin HTTPS resources should be
	// ignored here. We've opted for a milder constraint and allow caching
	// unless the resource has a "no-store" header. That condition is enforced
	// in AppCacheUpdateJob.

	if (mode == Mode::kExplicit) {
	manifest.explicit_urls.insert(namespace_url.spec());
	} else {
	// Chrome supports URL patterns in manifests. This is not standardized.
	// An URL record followed by the "isPattern" token is considered a
	// pattern.

	// TODO(pwnall): Add a UMA metric to see if we can remove this feature.
	bool is_pattern = NextTokenIsPatternMatchingFlag(line);
	manifest.online_whitelist_namespaces.emplace_back(AppCacheNamespace(
	APPCACHE_NETWORK_NAMESPACE, namespace_url, GURL(), is_pattern));
	}
	continue;
	}

	if (mode == Mode::kIntercept) {
	// Chrome supports a CHROMIUM-INTERCEPT section. https://crbug.com/101565
	//
	// This section consists of entries of the form:
	// namespace_url verb url_target

	if (parse_mode != PARSE_MANIFEST_ALLOWING_DANGEROUS_FEATURES) {
	manifest.did_ignore_intercept_namespaces = true;
	continue;
	}

	if (manifest_url.GetOrigin() != namespace_url.GetOrigin())
	continue;

	// The only supported verb is "return".
	base::StringPiece verb_token;
	std::tie(verb_token, line) = SplitLineToken(line);
	static constexpr base::StringPiece kReturnVerb("return");
	if (verb_token != kReturnVerb)
	continue;

	base::StringPiece target_url_token;
	std::tie(target_url_token, line) = SplitLineToken(line);
	if (target_url_token.empty())
	continue;
	GURL target_url = ParseUrlToken(target_url_token, manifest_url);
	if (!target_url.is_valid())
	continue;

	if (manifest_url.GetOrigin() != target_url.GetOrigin())
	continue;

	// TODO(pwnall): Add a UMA metric to see if we can remove this feature.
	bool is_pattern = NextTokenIsPatternMatchingFlag(line);
	manifest.intercept_namespaces.emplace_back(
	APPCACHE_INTERCEPT_NAMESPACE, namespace_url, target_url, is_pattern);
	continue;
	}

	if (mode == Mode::kFallback) {
	if (manifest_url.GetOrigin() != namespace_url.GetOrigin())
	continue;

	if (parse_mode != PARSE_MANIFEST_ALLOWING_DANGEROUS_FEATURES) {
	if (!ScopeMatches(manifest_url, namespace_url)) {
	manifest.did_ignore_fallback_namespaces = true;
	continue;
	}
	}

	base::StringPiece fallback_url_token;
	std::tie(fallback_url_token, line) = SplitLineToken(line);
	if (fallback_url_token.empty())
	continue;
	GURL fallback_url = ParseUrlToken(fallback_url_token, manifest_url);
	if (!fallback_url.is_valid())
	continue;

	if (manifest_url.GetOrigin() != fallback_url.GetOrigin())
	continue;

	// TODO(pwnall): Add a UMA metric to see if we can remove this feature.
	bool is_pattern = NextTokenIsPatternMatchingFlag(line);

	// Store regardless of duplicate namespace URL. Only the first match will
	// ever be used.
	manifest.fallback_namespaces.emplace_back(
	AppCacheNamespace(APPCACHE_FALLBACK_NAMESPACE, namespace_url,
	fallback_url, is_pattern));
	continue;
	}

	NOTREACHED() << "Unimplemented AppCache manifest parser mode";
	}

	return true;
	}

	} // namespace content