extensions/common/url_pattern.cc - chromium/src - Git at Google

 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "extensions/common/url_pattern.h"

 #include <stddef.h>

 #include <ostream>

 #include "base/macros.h"
 #include "base/strings/pattern.h"
 #include "base/strings/string_number_conversions.h"
 #include "base/strings/string_split.h"
 #include "base/strings/string_util.h"
 #include "base/strings/stringprintf.h"
 #include "content/public/common/url_constants.h"
 #include "extensions/common/constants.h"
 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
 #include "net/base/url_util.h"
 #include "url/gurl.h"
 #include "url/url_util.h"

 // Special pattern string that Parse() treats as "match every valid URL".
 const char URLPattern::kAllUrlsPattern[] = "<all_urls>";

 namespace {

 // TODO(aa): What about more obscure schemes like javascript: ?
 // Schemes that a URLPattern may name explicitly. This table is parallel to
 // kValidSchemeMasks: entry i here is enabled by the mask bit stored at entry i
 // there (see URLPattern::IsValidScheme), so both arrays must stay in the same
 // order.
 const char* const kValidSchemes[] = {
     url::kHttpScheme,         url::kHttpsScheme,
     url::kFileScheme,         url::kFtpScheme,
     content::kChromeUIScheme, extensions::kExtensionScheme,
     url::kFileSystemScheme,   url::kWsScheme,
     url::kWssScheme,          url::kDataScheme,
 };

 // Bit flags matching, index for index, the entries of kValidSchemes.
 const int kValidSchemeMasks[] = {
     URLPattern::SCHEME_HTTP,       URLPattern::SCHEME_HTTPS,
     URLPattern::SCHEME_FILE,       URLPattern::SCHEME_FTP,
     URLPattern::SCHEME_CHROMEUI,   URLPattern::SCHEME_EXTENSION,
     URLPattern::SCHEME_FILESYSTEM, URLPattern::SCHEME_WS,
     URLPattern::SCHEME_WSS,        URLPattern::SCHEME_DATA,
 };

 // Compile-time guard that the two parallel tables have the same length.
 static_assert(arraysize(kValidSchemes) == arraysize(kValidSchemeMasks),
               "must keep these arrays in sync");

 // Human-readable text for each parse outcome. These are collected, in order,
 // into kParseResultMessages below.
 const char kParseSuccess[] = "Success.";
 const char kParseErrorMissingSchemeSeparator[] = "Missing scheme separator.";
 const char kParseErrorInvalidScheme[] = "Invalid scheme.";
 // NOTE(review): presumably this is the text for
 // PARSE_ERROR_WRONG_SCHEME_SEPARATOR -- confirm against the enum order in
 // url_pattern.h.
 const char kParseErrorWrongSchemeType[] = "Wrong scheme type.";
 const char kParseErrorEmptyHost[] = "Host can not be empty.";
 const char kParseErrorInvalidHostWildcard[] = "Invalid host wildcard.";
 const char kParseErrorEmptyPath[] = "Empty path.";
 const char kParseErrorInvalidPort[] = "Invalid port.";
 const char kParseErrorInvalidHost[] = "Invalid host.";

 // Message explaining each URLPattern::ParseResult. The array is indexed
 // directly by the ParseResult value (see URLPattern::GetParseResultString),
 // so the order of the entries must follow the order of the enum.
 const char* const kParseResultMessages[] = {
   kParseSuccess,
   kParseErrorMissingSchemeSeparator,
   kParseErrorInvalidScheme,
   kParseErrorWrongSchemeType,
   kParseErrorEmptyHost,
   kParseErrorInvalidHostWildcard,
   kParseErrorEmptyPath,
   kParseErrorInvalidPort,
   kParseErrorInvalidHost,
 };

 // Compile-time guard that every ParseResult value has a message.
 static_assert(URLPattern::NUM_PARSE_RESULTS == arraysize(kParseResultMessages),
               "must add message for each parse result");

 // Separator that marks the end of the host and the start of the path when
 // Parse() scans a pattern.
 const char kPathSeparator[] = "/";

 // Reports whether |scheme| should be handled as a standard scheme. The "*"
 // wildcard counts as standard; every other value is decided by
 // url::IsStandard().
 bool IsStandardScheme(base::StringPiece scheme) {
   const bool is_wildcard = (scheme == "*");
   if (is_wildcard)
     return true;

   // Describe the whole of |scheme| as a single URL component.
   const url::Component whole_scheme(0, static_cast<int>(scheme.length()));
   return url::IsStandard(scheme.data(), whole_scheme);
 }

 // Reports whether |port| is acceptable in a pattern that uses |scheme|.
 //
 // The "*" wildcard is accepted for every scheme. A concrete port is accepted
 // only when url::DefaultPortForScheme() knows a default port for |scheme| and
 // |port| parses as an integer in the range [0, 65535].
 bool IsValidPortForScheme(base::StringPiece scheme, base::StringPiece port) {
   if (port == "*")
     return true;

   // Schemes without a default port do not accept explicit port numbers.
   const bool scheme_uses_ports =
       url::DefaultPortForScheme(scheme.data(), scheme.length()) !=
       url::PORT_UNSPECIFIED;
   if (!scheme_uses_ports)
     return false;

   int port_number = url::PORT_UNSPECIFIED;
   return base::StringToInt(port, &port_number) && (port_number >= 0) &&
          (port_number < 65536);
 }

 // Returns |path| without its final character when that character is '*';
 // otherwise returns |path| unchanged.
 //
 // OverlapsWith() and Contains() depend on this helper. They are only called
 // for the patterns inside URLPatternSet, where the path is known to carry at
 // most one wildcard, located at the very end. That restriction keeps the
 // overlap computation simple; the general case is not needed yet.
 base::StringPiece StripTrailingWildcard(base::StringPiece path) {
   if (!path.ends_with("*"))
     return path;

   path.remove_suffix(1);
   return path;
 }

 // Returns |host_piece| with a single trailing '.' dropped, if it has one, so
 // that hosts written with and without the final dot compare equal.
 base::StringPiece CanonicalizeHostForMatching(base::StringPiece host_piece) {
   if (!host_piece.ends_with("."))
     return host_piece;

   host_piece.remove_suffix(1);
   return host_piece;
 }

 }  // namespace

 // static
 // Returns true if |scheme| is one of the entries of kValidSchemes.
 bool URLPattern::IsValidSchemeForExtensions(base::StringPiece scheme) {
   for (const char* known_scheme : kValidSchemes) {
     if (scheme == known_scheme)
       return true;
   }
   return false;
 }

 // static
 // Returns the bitwise OR of every entry of kValidSchemeMasks.
 int URLPattern::GetValidSchemeMaskForExtensions() {
   int combined_mask = 0;
   for (int scheme_mask : kValidSchemeMasks)
     combined_mask |= scheme_mask;
   return combined_mask;
 }

 // Creates an empty pattern that allows no schemes. All remaining fields get
 // the same defaults as the URLPattern(int) constructor.
 URLPattern::URLPattern() : URLPattern(SCHEME_NONE) {}

 // Creates an empty pattern restricted to |valid_schemes|, a bitmask of
 // SCHEME_* values. The pattern starts with no host or path, does not match
 // all URLs or subdomains, requires the effective TLD to match, and has the
 // wildcard port "*".
 URLPattern::URLPattern(int valid_schemes)
     : valid_schemes_(valid_schemes),
       match_all_urls_(false),
       match_subdomains_(false),
       match_effective_tld_(true),
       port_("*") {}

 // Creates a pattern restricted to |valid_schemes| and immediately parses
 // |pattern| into it. Error checking is strict because this constructor is
 // only appropriate when |pattern| is known to be valid: a parse failure is
 // logged and reported through NOTREACHED().
 URLPattern::URLPattern(int valid_schemes, base::StringPiece pattern)
     : URLPattern(valid_schemes) {
   const ParseResult parse_outcome = Parse(pattern);
   if (parse_outcome == PARSE_SUCCESS)
     return;

   // Temporary extra logging to investigate why this code path is reached.
   // For http://crbug.com/856948
   LOG(ERROR) << "Invalid pattern was given " << pattern << " result "
              << parse_outcome;
   NOTREACHED() << "URLPattern invalid: " << pattern << " result "
                << parse_outcome;
 }

 // Copy constructor: member-wise copy of |other|.
 URLPattern::URLPattern(const URLPattern& other) = default;

 // Nothing to release beyond what the members clean up themselves.
 URLPattern::~URLPattern() = default;

 // Orders patterns by comparing their string forms.
 bool URLPattern::operator<(const URLPattern& other) const {
   const std::string& this_spec = GetAsString();
   const std::string& other_spec = other.GetAsString();
   return this_spec < other_spec;
 }

 // Orders patterns by comparing their string forms.
 bool URLPattern::operator>(const URLPattern& other) const {
   const std::string& this_spec = GetAsString();
   const std::string& other_spec = other.GetAsString();
   return this_spec > other_spec;
 }

 // Two patterns are equal when their string forms are identical.
 bool URLPattern::operator==(const URLPattern& other) const {
   const std::string& this_spec = GetAsString();
   const std::string& other_spec = other.GetAsString();
   return this_spec == other_spec;
 }

 // Streams the string form of |url_pattern| wrapped in double quotes.
 std::ostream& operator<<(std::ostream& out, const URLPattern& url_pattern) {
   const std::string& spec = url_pattern.GetAsString();
   out << '"' << spec << '"';
   return out;
 }

 // Parses |pattern| with the default option, which rejects a wildcard in
 // place of the effective TLD.
 URLPattern::ParseResult URLPattern::Parse(base::StringPiece pattern) {
   const ParseOptions default_options = DENY_WILDCARD_FOR_EFFECTIVE_TLD;
   return Parse(pattern, default_options);
 }

 // Parses |pattern| into this object's scheme, host, port and path fields.
 //
 // |parse_options| decides whether a trailing "*" host component (a wildcard
 // in place of the effective TLD) is accepted. Returns PARSE_SUCCESS, or the
 // ParseResult describing the first problem that was found. On failure the
 // fields that were assigned before the error was detected keep their new
 // values.
 //
 // NOTE(review): |host_| is only assigned in the generic standard-scheme
 // branch below, so parsing a file: or non-standard pattern into an object
 // that already had a host appears to keep the old host -- confirm whether
 // Parse() is ever called on a reused URLPattern.
 URLPattern::ParseResult URLPattern::Parse(base::StringPiece pattern,
                                           ParseOptions parse_options) {
   // Reset the cached string form and the match flags to their defaults. The
   // "*" port is accepted by SetPort() regardless of the current scheme.
   spec_.clear();
   SetMatchAllURLs(false);
   SetMatchSubdomains(false);
   SetMatchEffectiveTld(true);
   SetPort("*");

   // Special case pattern to match every valid URL.
   if (pattern == kAllUrlsPattern) {
     SetMatchAllURLs(true);
     return PARSE_SUCCESS;
   }

   // Parse out the scheme.
   size_t scheme_end_pos = pattern.find(url::kStandardSchemeSeparator);
   bool has_standard_scheme_separator = true;

   // Some urls also use ':' alone as the scheme separator.
   if (scheme_end_pos == base::StringPiece::npos) {
     scheme_end_pos = pattern.find(':');
     has_standard_scheme_separator = false;
   }

   if (scheme_end_pos == base::StringPiece::npos)
     return PARSE_ERROR_MISSING_SCHEME_SEPARATOR;

   // SetScheme() stores the scheme even when it then rejects it.
   if (!SetScheme(pattern.substr(0, scheme_end_pos)))
     return PARSE_ERROR_INVALID_SCHEME;

   // A standard scheme must use the standard separator and a non-standard
   // scheme must use the bare ':'; any other combination is an error.
   bool standard_scheme = IsStandardScheme(scheme_);
   if (standard_scheme != has_standard_scheme_separator)
     return PARSE_ERROR_WRONG_SCHEME_SEPARATOR;

   // Advance past the scheme separator.
   scheme_end_pos +=
       (standard_scheme ? strlen(url::kStandardSchemeSeparator) : 1);
   // Nothing follows the separator, so there is neither a host nor a path.
   if (scheme_end_pos >= pattern.size())
     return PARSE_ERROR_EMPTY_HOST;

   // Parse out the host and path.
   size_t host_start_pos = scheme_end_pos;
   size_t path_start_pos = 0;

   if (!standard_scheme) {
     // Non-standard schemes have no host: everything after ':' is the path.
     path_start_pos = host_start_pos;
   } else if (scheme_ == url::kFileScheme) {
     size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
     if (host_end_pos == base::StringPiece::npos) {
       // Allow hostname omission.
       // e.g. file://* is interpreted as file:///*,
       // file://foo* is interpreted as file:///foo*.
       // Stepping back one character reuses the last '/' of the scheme
       // separator as the leading '/' of the path.
       path_start_pos = host_start_pos - 1;
     } else {
       // Ignore hostname if scheme is file://.
       // e.g. file://localhost/foo is equal to file:///foo.
       path_start_pos = host_end_pos;
     }
   } else {
     size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);

     // Host is required.
     if (host_start_pos == host_end_pos)
       return PARSE_ERROR_EMPTY_HOST;

     if (host_end_pos == base::StringPiece::npos)
       return PARSE_ERROR_EMPTY_PATH;

     // Non-empty here: the two checks above guarantee at least one character
     // between the separator and the first '/'.
     base::StringPiece host_and_port =
         pattern.substr(host_start_pos, host_end_pos - host_start_pos);

     size_t port_separator_pos = base::StringPiece::npos;
     if (host_and_port[0] != '[') {
       // Not IPv6 (either IPv4 or just a normal address).
       port_separator_pos = host_and_port.find(':');
     } else {  // IPv6.
       // Note: this |host_end_pos| shadows the outer one and is an index into
       // |host_and_port|, not into |pattern|.
       size_t host_end_pos = host_and_port.find(']');
       if (host_end_pos == base::StringPiece::npos)
         return PARSE_ERROR_INVALID_HOST;
       // "[]" -- brackets with nothing between them.
       if (host_end_pos == 1)
         return PARSE_ERROR_EMPTY_HOST;

       if (host_end_pos < host_and_port.length() - 1) {
         // The host isn't the only component. Check for a port. This would
         // require a ':' to follow the closing ']' from the host.
         if (host_and_port[host_end_pos + 1] != ':')
           return PARSE_ERROR_INVALID_HOST;

         port_separator_pos = host_end_pos + 1;
       }
     }

     if (port_separator_pos != base::StringPiece::npos &&
         !SetPort(host_and_port.substr(port_separator_pos + 1))) {
       return PARSE_ERROR_INVALID_PORT;
     }

     // Note: this substr() will be the entire string if the port position
     // wasn't found.
     base::StringPiece host_piece = host_and_port.substr(0, port_separator_pos);

     // The first component can optionally be '*' to match all subdomains.
     std::vector<base::StringPiece> host_components = base::SplitStringPiece(
         host_piece, ".", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);

     // Could be empty if the host only consists of whitespace characters.
     if (host_components.empty() ||
         (host_components.size() == 1 && host_components[0].empty()))
       return PARSE_ERROR_EMPTY_HOST;

     if (host_components[0] == "*") {
       match_subdomains_ = true;
       host_components.erase(host_components.begin());
     }

     // If explicitly allowed, the last component can optionally be '*' to
     // match all effective TLDs.
     if (parse_options == ALLOW_WILDCARD_FOR_EFFECTIVE_TLD &&
         host_components.size() > 1 && host_components.back() == "*") {
       match_effective_tld_ = false;
       host_components.pop_back();
     }
     // Reassemble the host without the wildcard components removed above.
     host_ = base::JoinString(host_components, ".");

     path_start_pos = host_end_pos;
   }

   SetPath(pattern.substr(path_start_pos));

   // No other '*' can occur in the host, though. This isn't necessary, but is
   // done as a convenience to developers who might otherwise be confused and
   // think '*' works as a glob in the host.
   if (host_.find('*') != std::string::npos)
     return PARSE_ERROR_INVALID_HOST_WILDCARD;

   if (!host_.empty()) {
     // If |host_| is present (i.e., isn't a wildcard), we need to canonicalize
     // it.
     url::CanonHostInfo host_info;
     host_ = net::CanonicalizeHost(host_, &host_info);
     // net::CanonicalizeHost() returns an empty string on failure.
     if (host_.empty())
       return PARSE_ERROR_INVALID_HOST;
   }

   // Null characters are not allowed in hosts.
   if (host_.find('\0') != std::string::npos)
     return PARSE_ERROR_INVALID_HOST;

   return PARSE_SUCCESS;
 }

 // Replaces the bitmask of schemes this pattern is allowed to match.
 void URLPattern::SetValidSchemes(int valid_schemes) {
   valid_schemes_ = valid_schemes;
   // The cached string form is dropped on every mutation.
   spec_.clear();
 }

 // Stores a copy of |host| as the pattern's host. The value is stored as
 // given; this setter performs no validation.
 void URLPattern::SetHost(base::StringPiece host) {
   host.CopyToString(&host_);
   // The cached string form is dropped on every mutation.
   spec_.clear();
 }

 // Sets whether this pattern matches every URL. Turning the flag on also
 // rewrites the other fields to the equivalent wildcard form: subdomain
 // matching on, scheme "*", empty host and path "/*". Turning it off leaves
 // the other fields alone.
 void URLPattern::SetMatchAllURLs(bool val) {
   spec_.clear();
   match_all_urls_ = val;
   if (!val)
     return;

   match_subdomains_ = true;
   scheme_ = "*";
   host_.clear();
   SetPath("/*");
 }

 // Sets whether subdomains of the pattern's host also match.
 void URLPattern::SetMatchSubdomains(bool val) {
   match_subdomains_ = val;
   // The cached string form is dropped on every mutation.
   spec_.clear();
 }

 // Sets whether the effective TLD of a tested host has to match the pattern.
 void URLPattern::SetMatchEffectiveTld(bool val) {
   match_effective_tld_ = val;
   // The cached string form is dropped on every mutation.
   spec_.clear();
 }

 // Stores |scheme| as the pattern's scheme and reports whether it is allowed.
 //
 // The value is stored even when false is returned. The "*" wildcard is
 // always accepted and narrows |valid_schemes_| to at most http and https.
 bool URLPattern::SetScheme(base::StringPiece scheme) {
   spec_.clear();
   scheme.CopyToString(&scheme_);

   if (scheme_ != "*")
     return IsValidScheme(scheme_);

   valid_schemes_ &= (SCHEME_HTTP | SCHEME_HTTPS);
   return true;
 }

 // Returns true if |scheme| is permitted by |valid_schemes_|. SCHEME_ALL
 // permits any string; otherwise |scheme| must be listed in kValidSchemes and
 // its corresponding mask bit must be set.
 bool URLPattern::IsValidScheme(base::StringPiece scheme) const {
   if (valid_schemes_ == SCHEME_ALL)
     return true;

   for (size_t index = 0; index < arraysize(kValidSchemes); ++index) {
     const bool mask_enabled = (valid_schemes_ & kValidSchemeMasks[index]) != 0;
     if (mask_enabled && scheme == kValidSchemes[index])
       return true;
   }

   return false;
 }

 // Stores |path| and derives |path_escaped_|, the form that MatchesPath()
 // hands to base::MatchPattern(): every '\' is doubled and every '?' is
 // prefixed with '\'.
 void URLPattern::SetPath(base::StringPiece path) {
   spec_.clear();
   path.CopyToString(&path_);

   // Backslashes are escaped first so that the ones inserted in front of '?'
   // by the second pass are not doubled again.
   std::string escaped_path = path_;
   base::ReplaceSubstringsAfterOffset(&escaped_path, 0, "\\", "\\\\");
   base::ReplaceSubstringsAfterOffset(&escaped_path, 0, "?", "\\?");
   path_escaped_.swap(escaped_path);
 }

 // Stores |port| if it is valid for the current scheme and returns true;
 // otherwise leaves the stored port untouched and returns false. The cached
 // string form is dropped in both cases.
 bool URLPattern::SetPort(base::StringPiece port) {
   spec_.clear();
   if (!IsValidPortForScheme(scheme_, port))
     return false;

   port.CopyToString(&port_);
   return true;
 }

 // Returns true if |test| is matched by this pattern.
 //
 // For a filesystem: URL the scheme, host and port checks are made against the
 // inner URL, and the path that is checked is the inner URL's path followed by
 // the outer URL's path-for-request. Any other URL that has an inner URL never
 // matches. A match-all pattern accepts every URL whose scheme is allowed.
 bool URLPattern::MatchesURL(const GURL& test) const {
   const GURL* test_url = &test;
   const bool has_inner_url = test.inner_url() != nullptr;

   if (has_inner_url) {
     if (!test.SchemeIsFileSystem())
       return false;  // The only nested URLs we handle are filesystem URLs.
     test_url = test.inner_url();
   }

   if (!MatchesScheme(test_url->scheme_piece()))
     return false;

   if (match_all_urls_)
     return true;

   std::string path_for_request = test.PathForRequest();
   if (has_inner_url) {
     // path_piece() is a non-owning view that is not guaranteed to be
     // NUL-terminated at the end of the path, so it must not be passed to a
     // "%s" format. Concatenate through an owned string instead.
     path_for_request = test_url->path() + path_for_request;
   }

   return MatchesSecurityOriginHelper(*test_url) &&
          MatchesPath(path_for_request);
 }

 // Returns true if the scheme, host and port of |test| are matched by this
 // pattern; the path is not considered.
 //
 // For a filesystem: URL the checks are made against the inner URL. Any other
 // URL that has an inner URL never matches. A match-all pattern accepts every
 // URL whose scheme is allowed.
 bool URLPattern::MatchesSecurityOrigin(const GURL& test) const {
   const GURL* test_url = &test;
   const bool has_inner_url = test.inner_url() != nullptr;

   if (has_inner_url) {
     if (!test.SchemeIsFileSystem())
       return false;  // The only nested URLs we handle are filesystem URLs.
     test_url = test.inner_url();
   }

   // scheme_piece() avoids building a temporary std::string and mirrors
   // MatchesURL().
   if (!MatchesScheme(test_url->scheme_piece()))
     return false;

   if (match_all_urls_)
     return true;

   return MatchesSecurityOriginHelper(*test_url);
 }

 // Returns true if |test| is an allowed scheme and the pattern's scheme is
 // either the "*" wildcard or exactly |test|.
 bool URLPattern::MatchesScheme(base::StringPiece test) const {
   return IsValidScheme(test) && (scheme_ == "*" || test == scheme_);
 }

 // Returns true if |host| is matched by this pattern's host. |host| is placed
 // into a synthetic http URL so that it is canonicalized before comparison.
 bool URLPattern::MatchesHost(base::StringPiece host) const {
   // TODO(devlin): This is a bit sad. Parsing urls is expensive. However, it's
   // important that we do this conversion to a GURL in order to canonicalize the
   // host (the pattern's host_ already is canonicalized from Parse()). We can't
   // just do string comparison.
   //
   // |host| is a view that need not be NUL-terminated, so its length is passed
   // explicitly through "%.*s" instead of relying on a terminator.
   return MatchesHost(
       GURL(base::StringPrintf("%s%s%.*s/", url::kHttpScheme,
                               url::kStandardSchemeSeparator,
                               static_cast<int>(host.length()), host.data())));
 }

 // Returns true if the host of |test| is matched by this pattern's host.
 //
 // A single trailing dot is ignored on both hosts. When the effective TLD is
 // not being matched, the registry part of the test host is cut off before
 // the comparison. An exact match always succeeds; beyond that the pattern
 // must allow subdomains, in which case an empty pattern host matches
 // everything and a non-empty one matches any non-IP host that ends in
 // "." followed by the pattern host.
 bool URLPattern::MatchesHost(const GURL& test) const {
   base::StringPiece test_host(CanonicalizeHostForMatching(test.host_piece()));
   const base::StringPiece pattern_host(CanonicalizeHostForMatching(host_));

   if (!match_effective_tld_) {
     // The effective TLD is irrelevant: drop it, together with the dot that
     // precedes it, from the host being tested.
     int reg_length = net::registry_controlled_domains::GetRegistryLength(
         test, net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
         net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
     if (reg_length > 0)
       test_host = test_host.substr(0, test_host.size() - reg_length - 1);
   }

   // Identical hosts always match.
   if (test_host == pattern_host)
     return true;

   // Anything else needs subdomain matching to be enabled.
   if (!match_subdomains_)
     return false;

   // Subdomain matching with no pattern host means "all hosts".
   if (pattern_host.empty())
     return true;

   // Subdomain matching is never applied to IP addresses.
   if (test.HostIsIPAddress())
     return false;

   // The test host must be "<something>.<pattern_host>": strictly longer than
   // the pattern host plus a dot, ending in the pattern host, with a '.'
   // immediately in front of that suffix.
   const bool long_enough = test_host.length() > (pattern_host.length() + 1);
   if (!long_enough || !test_host.ends_with(pattern_host))
     return false;

   const size_t dot_index = test_host.length() - pattern_host.length() - 1;
   return test_host[dot_index] == '.';
 }

 // Returns true if this pattern covers every host under some effective TLD
 // (for example "*.com"), or covers all hosts outright. |private_filter| and
 // |unknown_filter| are forwarded to the registry-controlled-domain lookups.
 bool URLPattern::MatchesEffectiveTld(
     net::registry_controlled_domains::PrivateRegistryFilter private_filter,
     net::registry_controlled_domains::UnknownRegistryFilter unknown_filter)
     const {
   // <all_urls> trivially covers every TLD.
   if (match_all_urls_)
     return true;

   // Without subdomain matching the pattern cannot be a TLD wildcard.
   if (!match_subdomains_)
     return false;

   // A pattern like http://*/* has no host at all and matches everything.
   if (host_.empty())
     return true;

   // A host that contains more than a TLD (e.g., *.foobar.com) does not span
   // a whole effective TLD.
   const bool host_has_domain =
       net::registry_controlled_domains::HostHasRegistryControlledDomain(
           host_, unknown_filter, private_filter);
   if (host_has_domain)
     return false;

   // What remains is either a real TLD ("com") or an unknown TLD-like string
   // ("notatld"). Prepend a fake label and ask the registry again: if the TLD
   // is recognized, this is a pattern like *.com and it matches an effective
   // TLD.
   return net::registry_controlled_domains::HostHasRegistryControlledDomain(
       "notatld." + host_, unknown_filter, private_filter);
 }

 // Returns true if the pattern is limited to one origin: it is not a TLD
 // wildcard, names a concrete scheme and does not match subdomains.
 //
 // Strictly speaking the port belongs to the origin as well, but in
 // URLPattern it defaults to "*" and is deliberately left out of this check.
 bool URLPattern::MatchesSingleOrigin() const {
   if (MatchesEffectiveTld())
     return false;
   if (scheme_ == "*")
     return false;
   return !match_subdomains_;
 }

 // Returns true if |test| is matched by the pattern's escaped path.
 bool URLPattern::MatchesPath(base::StringPiece test) const {
   // Keep OverlapsWith consistent with MatchesURL, which is needed so that
   // hosted apps on e.g. 'google.com' also run on 'google.com/'. The nested
   // checks are a no-copy way of testing (test + "/*" == path_escaped_).
   if (path_escaped_.length() == test.length() + 2) {
     const bool starts_with_test = base::StartsWith(
         path_escaped_.c_str(), test, base::CompareCase::SENSITIVE);
     if (starts_with_test &&
         base::EndsWith(path_escaped_, "/*", base::CompareCase::SENSITIVE)) {
       return true;
     }
   }

   // General case: wildcard match against the escaped pattern path.
   return base::MatchPattern(test, path_escaped_);
 }

 // Returns the string form of the pattern. The result is built on first use
 // and cached in |spec_|; every setter clears that cache.
 const std::string& URLPattern::GetAsString() const {
   if (!spec_.empty())
     return spec_;

   if (match_all_urls_) {
     spec_ = kAllUrlsPattern;
     return spec_;
   }

   const bool standard_scheme = IsStandardScheme(scheme_);

   // "<scheme>://" for standard schemes, "<scheme>:" otherwise.
   std::string built_spec = scheme_;
   built_spec += standard_scheme ? url::kStandardSchemeSeparator : ":";

   // Only standard schemes other than file: carry a host and port section.
   if (standard_scheme && scheme_ != url::kFileScheme) {
     const bool has_host = !host_.empty();

     // Leading subdomain wildcard: "*." in front of a host, bare "*" alone.
     if (match_subdomains_)
       built_spec += has_host ? "*." : "*";

     built_spec += host_;

     // Trailing effective-TLD wildcard: ".*" after a host, bare "*" alone.
     if (!match_effective_tld_)
       built_spec += has_host ? ".*" : "*";

     // The wildcard port is implied and therefore not written out.
     if (port_ != "*") {
       built_spec += ":";
       built_spec += port_;
     }
   }

   built_spec += path_;

   spec_ = std::move(built_spec);
   return spec_;
 }

 // Returns true if this pattern and |other| overlap. A match-all pattern on
 // either side overlaps with everything; otherwise the schemes, hosts, ports
 // and paths must each match in at least one direction.
 bool URLPattern::OverlapsWith(const URLPattern& other) const {
   if (match_all_urls() || other.match_all_urls())
     return true;

   if (!MatchesAnyScheme(other.GetExplicitSchemes()) &&
       !other.MatchesAnyScheme(GetExplicitSchemes())) {
     return false;
   }

   if (!MatchesHost(other.host()) && !other.MatchesHost(host()))
     return false;

   if (!MatchesPortPattern(other.port()) && !other.MatchesPortPattern(port()))
     return false;

   // Paths are compared with their trailing wildcard removed.
   return MatchesPath(StripTrailingWildcard(other.path())) ||
          other.MatchesPath(StripTrailingWildcard(path()));
 }

 // Returns true if this pattern contains |other|: this pattern is match-all,
 // or it matches all of |other|'s explicit schemes, its host, its port and its
 // path (trailing wildcard removed), and |other| does not match subdomains
 // unless this pattern does too.
 bool URLPattern::Contains(const URLPattern& other) const {
   if (match_all_urls())
     return true;

   if (!MatchesAllSchemes(other.GetExplicitSchemes()))
     return false;

   if (!MatchesHost(other.host()))
     return false;

   // A pattern without subdomain matching cannot contain one that has it.
   if (other.match_subdomains_ && !match_subdomains_)
     return false;

   if (!MatchesPortPattern(other.port()))
     return false;

   return MatchesPath(StripTrailingWildcard(other.path()));
 }

 // Returns true if at least one entry of |schemes| is matched by this
 // pattern; false for an empty list.
 bool URLPattern::MatchesAnyScheme(
     const std::vector<std::string>& schemes) const {
   for (const std::string& candidate : schemes) {
     if (MatchesScheme(candidate))
       return true;
   }

   return false;
 }

 // Returns true if every entry of |schemes| is matched by this pattern; true
 // for an empty list.
 bool URLPattern::MatchesAllSchemes(
     const std::vector<std::string>& schemes) const {
   for (const std::string& candidate : schemes) {
     if (!MatchesScheme(candidate))
       return false;
   }

   return true;
 }

 // Checks the host and port of |test| against this pattern. The host check is
 // skipped when the pattern's scheme is file:.
 bool URLPattern::MatchesSecurityOriginHelper(const GURL& test) const {
   // Ignore hostname if scheme is file://.
   const bool host_matches = scheme_ == url::kFileScheme || MatchesHost(test);
   if (!host_matches)
     return false;

   return MatchesPortPattern(base::IntToString(test.EffectiveIntPort()));
 }

 // Returns true if the pattern's port is the "*" wildcard or equals |port|.
 bool URLPattern::MatchesPortPattern(base::StringPiece port) const {
   if (port_ == "*")
     return true;
   return port_ == port;
 }

 // Returns the concrete schemes this pattern can match. A pattern with a
 // specific, allowed scheme yields just that scheme; otherwise the result is
 // every entry of kValidSchemes that MatchesScheme() accepts.
 std::vector<std::string> URLPattern::GetExplicitSchemes() const {
   std::vector<std::string> result;

   const bool is_wildcard = scheme_ == "*" || match_all_urls_;
   if (!is_wildcard && IsValidScheme(scheme_)) {
     result.push_back(scheme_);
     return result;
   }

   for (const char* known_scheme : kValidSchemes) {
     if (MatchesScheme(known_scheme))
       result.push_back(known_scheme);
   }

   return result;
 }

 std::vector<URLPattern> URLPattern::ConvertToExplicitSchemes() const {
   std::vector<std::string> explicit_schemes = GetExplicitSchemes();
   std::vector<URLPattern> result;

   for (std::vector<std::string>::const_iterator i = explicit_schemes.begin();
        i != explicit_schemes.end(); ++i) {
     URLPattern temp = *this;
     temp.SetScheme(*i);
     temp.SetMatchAllURLs(false);
     result.push_back(temp);
   }

   return result;
 }

 // static
 // Returns the message stored for |parse_result| in kParseResultMessages,
 // which is indexed directly by the ParseResult value.
 const char* URLPattern::GetParseResultString(
     URLPattern::ParseResult parse_result) {
   const char* const message = kParseResultMessages[parse_result];
   return message;
 }
	// Copyright (c) 2012 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "extensions/common/url_pattern.h"

	#include <stddef.h>

	#include <ostream>

	#include "base/macros.h"
	#include "base/strings/pattern.h"
	#include "base/strings/string_number_conversions.h"
	#include "base/strings/string_split.h"
	#include "base/strings/string_util.h"
	#include "base/strings/stringprintf.h"
	#include "content/public/common/url_constants.h"
	#include "extensions/common/constants.h"
	#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
	#include "net/base/url_util.h"
	#include "url/gurl.h"
	#include "url/url_util.h"

	// Special pattern string that Parse() treats as "match every valid URL".
	const char URLPattern::kAllUrlsPattern[] = "<all_urls>";

	namespace {

	// TODO(aa): What about more obscure schemes like javascript: ?
	// Schemes that a URLPattern may name explicitly. This table is parallel to
	// kValidSchemeMasks, so both arrays must stay in the same order.
	const char* const kValidSchemes[] = {
	url::kHttpScheme, url::kHttpsScheme,
	url::kFileScheme, url::kFtpScheme,
	content::kChromeUIScheme, extensions::kExtensionScheme,
	url::kFileSystemScheme, url::kWsScheme,
	url::kWssScheme, url::kDataScheme,
	};

	// Bit flags matching, index for index, the entries of kValidSchemes.
	const int kValidSchemeMasks[] = {
	URLPattern::SCHEME_HTTP, URLPattern::SCHEME_HTTPS,
	URLPattern::SCHEME_FILE, URLPattern::SCHEME_FTP,
	URLPattern::SCHEME_CHROMEUI, URLPattern::SCHEME_EXTENSION,
	URLPattern::SCHEME_FILESYSTEM, URLPattern::SCHEME_WS,
	URLPattern::SCHEME_WSS, URLPattern::SCHEME_DATA,
	};

	// Compile-time guard that the two parallel tables have the same length.
	static_assert(arraysize(kValidSchemes) == arraysize(kValidSchemeMasks),
	"must keep these arrays in sync");

	// Human-readable text for each parse outcome. These are collected, in order,
	// into kParseResultMessages below.
	const char kParseSuccess[] = "Success.";
	const char kParseErrorMissingSchemeSeparator[] = "Missing scheme separator.";
	const char kParseErrorInvalidScheme[] = "Invalid scheme.";
	const char kParseErrorWrongSchemeType[] = "Wrong scheme type.";
	const char kParseErrorEmptyHost[] = "Host can not be empty.";
	const char kParseErrorInvalidHostWildcard[] = "Invalid host wildcard.";
	const char kParseErrorEmptyPath[] = "Empty path.";
	const char kParseErrorInvalidPort[] = "Invalid port.";
	const char kParseErrorInvalidHost[] = "Invalid host.";

	// Message explaining each URLPattern::ParseResult. The static_assert below
	// requires one entry per ParseResult value.
	const char* const kParseResultMessages[] = {
	kParseSuccess,
	kParseErrorMissingSchemeSeparator,
	kParseErrorInvalidScheme,
	kParseErrorWrongSchemeType,
	kParseErrorEmptyHost,
	kParseErrorInvalidHostWildcard,
	kParseErrorEmptyPath,
	kParseErrorInvalidPort,
	kParseErrorInvalidHost,
	};

	// Compile-time guard that every ParseResult value has a message.
	static_assert(URLPattern::NUM_PARSE_RESULTS == arraysize(kParseResultMessages),
	"must add message for each parse result");

	// Separator that marks the end of the host and the start of the path when
	// Parse() scans a pattern.
	const char kPathSeparator[] = "/";

	// Reports whether |scheme| should be handled as a standard scheme. The "*"
	// wildcard counts as standard; every other value is decided by
	// url::IsStandard().
	bool IsStandardScheme(base::StringPiece scheme) {
	  const bool is_wildcard = (scheme == "*");
	  if (is_wildcard)
	    return true;

	  // Describe the whole of |scheme| as a single URL component.
	  const url::Component whole_scheme(0, static_cast<int>(scheme.length()));
	  return url::IsStandard(scheme.data(), whole_scheme);
	}

	// Reports whether |port| is acceptable in a pattern that uses |scheme|.
	//
	// The "*" wildcard is accepted for every scheme. A concrete port is accepted
	// only when url::DefaultPortForScheme() knows a default port for |scheme| and
	// |port| parses as an integer in the range [0, 65535].
	bool IsValidPortForScheme(base::StringPiece scheme, base::StringPiece port) {
	  if (port == "*")
	    return true;

	  // Schemes without a default port do not accept explicit port numbers.
	  const bool scheme_uses_ports =
	      url::DefaultPortForScheme(scheme.data(), scheme.length()) !=
	      url::PORT_UNSPECIFIED;
	  if (!scheme_uses_ports)
	    return false;

	  int port_number = url::PORT_UNSPECIFIED;
	  return base::StringToInt(port, &port_number) && (port_number >= 0) &&
	         (port_number < 65536);
	}

	// Returns |path| without its final character when that character is '*';
	// otherwise returns |path| unchanged.
	//
	// OverlapsWith and Contains depend on this helper. They are only called for
	// the patterns inside URLPatternSet, where the path is known to carry at
	// most one wildcard, located at the very end. That restriction keeps the
	// overlap computation simple; the general case is not needed yet.
	base::StringPiece StripTrailingWildcard(base::StringPiece path) {
	  if (!path.ends_with("*"))
	    return path;

	  path.remove_suffix(1);
	  return path;
	}

	// Returns |host_piece| with a single trailing '.' dropped, if it has one.
	base::StringPiece CanonicalizeHostForMatching(base::StringPiece host_piece) {
	  if (!host_piece.ends_with("."))
	    return host_piece;

	  host_piece.remove_suffix(1);
	  return host_piece;
	}

	} // namespace

	// static
	// Returns true if |scheme| is one of the entries of kValidSchemes.
	bool URLPattern::IsValidSchemeForExtensions(base::StringPiece scheme) {
	  for (const char* known_scheme : kValidSchemes) {
	    if (scheme == known_scheme)
	      return true;
	  }
	  return false;
	}

	// static
	// Returns the bitwise OR of every entry of kValidSchemeMasks.
	int URLPattern::GetValidSchemeMaskForExtensions() {
	  int combined_mask = 0;
	  // The accumulation operator is a plain "|="; the backslash that preceded
	  // it was a markdown-escape artifact and did not compile.
	  for (int scheme_mask : kValidSchemeMasks)
	    combined_mask |= scheme_mask;
	  return combined_mask;
	}

	// Creates an empty pattern that allows no schemes. All remaining fields get
	// the same defaults as the URLPattern(int) constructor.
	URLPattern::URLPattern() : URLPattern(SCHEME_NONE) {}

	// Creates an empty pattern restricted to |valid_schemes|. The pattern does
	// not match all URLs or subdomains, requires the effective TLD to match, and
	// has the wildcard port "*".
	URLPattern::URLPattern(int valid_schemes)
	: valid_schemes_(valid_schemes),
	match_all_urls_(false),
	match_subdomains_(false),
	match_effective_tld_(true),
	port_("*") {}

	// Creates a pattern restricted to |valid_schemes| and immediately parses
	// |pattern| into it. Error checking is strict because this constructor is
	// only appropriate when |pattern| is known to be valid: a parse failure is
	// logged and reported through NOTREACHED().
	URLPattern::URLPattern(int valid_schemes, base::StringPiece pattern)
	    : URLPattern(valid_schemes) {
	  const ParseResult parse_outcome = Parse(pattern);
	  if (parse_outcome == PARSE_SUCCESS)
	    return;

	  // Temporary extra logging to investigate why this code path is reached.
	  // For http://crbug.com/856948
	  LOG(ERROR) << "Invalid pattern was given " << pattern << " result "
	             << parse_outcome;
	  NOTREACHED() << "URLPattern invalid: " << pattern << " result "
	               << parse_outcome;
	}

	// Copy constructor: member-wise copy of |other|.
	URLPattern::URLPattern(const URLPattern& other) = default;

	// Nothing to release beyond what the members clean up themselves.
	URLPattern::~URLPattern() = default;

	// Orders patterns by comparing their string forms.
	bool URLPattern::operator<(const URLPattern& other) const {
	  const std::string& this_spec = GetAsString();
	  const std::string& other_spec = other.GetAsString();
	  return this_spec < other_spec;
	}

	// Orders patterns by comparing their string forms.
	bool URLPattern::operator>(const URLPattern& other) const {
	  const std::string& this_spec = GetAsString();
	  const std::string& other_spec = other.GetAsString();
	  return this_spec > other_spec;
	}

	// Two patterns are equal when their string forms are identical.
	bool URLPattern::operator==(const URLPattern& other) const {
	  const std::string& this_spec = GetAsString();
	  const std::string& other_spec = other.GetAsString();
	  return this_spec == other_spec;
	}

	// Streams the string form of |url_pattern| wrapped in double quotes.
	std::ostream& operator<<(std::ostream& out, const URLPattern& url_pattern) {
	  const std::string& spec = url_pattern.GetAsString();
	  out << '"' << spec << '"';
	  return out;
	}

	// Parses |pattern| with the default option, which rejects a wildcard in
	// place of the effective TLD.
	URLPattern::ParseResult URLPattern::Parse(base::StringPiece pattern) {
	  const ParseOptions default_options = DENY_WILDCARD_FOR_EFFECTIVE_TLD;
	  return Parse(pattern, default_options);
	}

	URLPattern::ParseResult URLPattern::Parse(base::StringPiece pattern,
	ParseOptions parse_options) {
	spec_.clear();
	SetMatchAllURLs(false);
	SetMatchSubdomains(false);
	SetMatchEffectiveTld(true);
	SetPort("*");

	// Special case pattern to match every valid URL.
	if (pattern == kAllUrlsPattern) {
	SetMatchAllURLs(true);
	return PARSE_SUCCESS;
	}

	// Parse out the scheme.
	size_t scheme_end_pos = pattern.find(url::kStandardSchemeSeparator);
	bool has_standard_scheme_separator = true;

	// Some urls also use ':' alone as the scheme separator.
	if (scheme_end_pos == base::StringPiece::npos) {
	scheme_end_pos = pattern.find(':');
	has_standard_scheme_separator = false;
	}

	if (scheme_end_pos == base::StringPiece::npos)
	return PARSE_ERROR_MISSING_SCHEME_SEPARATOR;

	if (!SetScheme(pattern.substr(0, scheme_end_pos)))
	return PARSE_ERROR_INVALID_SCHEME;

	bool standard_scheme = IsStandardScheme(scheme_);
	if (standard_scheme != has_standard_scheme_separator)
	return PARSE_ERROR_WRONG_SCHEME_SEPARATOR;

	// Advance past the scheme separator.
	scheme_end_pos +=
	(standard_scheme ? strlen(url::kStandardSchemeSeparator) : 1);
	if (scheme_end_pos >= pattern.size())
	return PARSE_ERROR_EMPTY_HOST;

	// Parse out the host and path.
	size_t host_start_pos = scheme_end_pos;
	size_t path_start_pos = 0;

	if (!standard_scheme) {
	path_start_pos = host_start_pos;
	} else if (scheme_ == url::kFileScheme) {
	size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
	if (host_end_pos == base::StringPiece::npos) {
	// Allow hostname omission.
	// e.g. file://* is interpreted as file:///*,
	// file://foo* is interpreted as file:///foo*.
	path_start_pos = host_start_pos - 1;
	} else {
	// Ignore hostname if scheme is file://.
	// e.g. file://localhost/foo is equal to file:///foo.
	path_start_pos = host_end_pos;
	}
	} else {
	size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);

	// Host is required.
	if (host_start_pos == host_end_pos)
	return PARSE_ERROR_EMPTY_HOST;

	if (host_end_pos == base::StringPiece::npos)
	return PARSE_ERROR_EMPTY_PATH;

	base::StringPiece host_and_port =
	pattern.substr(host_start_pos, host_end_pos - host_start_pos);

	size_t port_separator_pos = base::StringPiece::npos;
	if (host_and_port[0] != '[') {
	// Not IPv6 (either IPv4 or just a normal address).
	port_separator_pos = host_and_port.find(':');
	} else { // IPv6.
	size_t host_end_pos = host_and_port.find(']');
	if (host_end_pos == base::StringPiece::npos)
	return PARSE_ERROR_INVALID_HOST;
	if (host_end_pos == 1)
	return PARSE_ERROR_EMPTY_HOST;

	if (host_end_pos < host_and_port.length() - 1) {
	// The host isn't the only component. Check for a port. This would
	// require a ':' to follow the closing ']' from the host.
	if (host_and_port[host_end_pos + 1] != ':')
	return PARSE_ERROR_INVALID_HOST;

	port_separator_pos = host_end_pos + 1;
	}
	}

	if (port_separator_pos != base::StringPiece::npos &&
	!SetPort(host_and_port.substr(port_separator_pos + 1))) {
	return PARSE_ERROR_INVALID_PORT;
	}

	// Note: this substr() will be the entire string if the port position
	// wasn't found.
	base::StringPiece host_piece = host_and_port.substr(0, port_separator_pos);

	// The first component can optionally be '*' to match all subdomains.
	std::vector<base::StringPiece> host_components = base::SplitStringPiece(
	host_piece, ".", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);

	// Could be empty if the host only consists of whitespace characters.
	if (host_components.empty() \|\|
	(host_components.size() == 1 && host_components[0].empty()))
	return PARSE_ERROR_EMPTY_HOST;

	if (host_components[0] == "*") {
	match_subdomains_ = true;
	host_components.erase(host_components.begin());
	}

	// If explicitly allowed, the last component can optionally be '*' to
	// match all effective TLDs.
	if (parse_options == ALLOW_WILDCARD_FOR_EFFECTIVE_TLD &&
	host_components.size() > 1 && host_components.back() == "*") {
	match_effective_tld_ = false;
	host_components.pop_back();
	}
	host_ = base::JoinString(host_components, ".");

	path_start_pos = host_end_pos;
	}

	SetPath(pattern.substr(path_start_pos));

	// No other '*' can occur in the host, though. This isn't necessary, but is
	// done as a convenience to developers who might otherwise be confused and
	// think '*' works as a glob in the host.
	if (host_.find('*') != std::string::npos)
	return PARSE_ERROR_INVALID_HOST_WILDCARD;

	if (!host_.empty()) {
	// If \|host_\| is present (i.e., isn't a wildcard), we need to canonicalize
	// it.
	url::CanonHostInfo host_info;
	host_ = net::CanonicalizeHost(host_, &host_info);
	// net::CanonicalizeHost() returns an empty string on failure.
	if (host_.empty())
	return PARSE_ERROR_INVALID_HOST;
	}

	// Null characters are not allowed in hosts.
	if (host_.find('\0') != std::string::npos)
	return PARSE_ERROR_INVALID_HOST;

	return PARSE_SUCCESS;
	}

	// Replaces the bitmask of schemes this pattern may match with
	// |valid_schemes|. The cached string in |spec_| is dropped as well, so
	// GetAsString() rebuilds it on its next call.
	void URLPattern::SetValidSchemes(int valid_schemes) {
	  valid_schemes_ = valid_schemes;
	  spec_.clear();
	}

	// Stores a copy of |host| in |host_| verbatim (no validation happens here)
	// and drops the cached string in |spec_|.
	void URLPattern::SetHost(base::StringPiece host) {
	  host.CopyToString(&host_);
	  spec_.clear();
	}

	// Records whether this pattern matches every URL. Turning the flag on also
	// resets the scheme, host, subdomain flag and path to their wildcard forms;
	// turning it off only changes the flag and leaves those fields alone.
	void URLPattern::SetMatchAllURLs(bool val) {
	  spec_.clear();
	  match_all_urls_ = val;
	  if (!val)
	    return;

	  // Wildcard everything: any scheme, any host (empty host with subdomain
	  // matching enabled) and any path.
	  match_subdomains_ = true;
	  scheme_ = "*";
	  host_.clear();
	  SetPath("/*");
	}

	// Sets whether subdomains of |host_| are matched and drops the cached string
	// in |spec_|.
	void URLPattern::SetMatchSubdomains(bool val) {
	  match_subdomains_ = val;
	  spec_.clear();
	}

	// Sets whether the effective TLD takes part in host matching and drops the
	// cached string in |spec_|.
	void URLPattern::SetMatchEffectiveTld(bool val) {
	  match_effective_tld_ = val;
	  spec_.clear();
	}

	// Stores |scheme| as this pattern's scheme and drops the cached string in
	// |spec_|. Returns false when the scheme is not "*" and IsValidScheme()
	// rejects it; note that |scheme_| has already been overwritten by then.
	bool URLPattern::SetScheme(base::StringPiece scheme) {
	  spec_.clear();
	  scheme.CopyToString(&scheme_);

	  // Any concrete scheme is accepted only if the valid-scheme mask allows it.
	  if (scheme_ != "*")
	    return IsValidScheme(scheme_);

	  // A wildcard scheme narrows the valid set to (at most) http and https.
	  valid_schemes_ &= (SCHEME_HTTP | SCHEME_HTTPS);
	  return true;
	}

	// Returns true if |scheme| is permitted by |valid_schemes_|. With SCHEME_ALL
	// every scheme is accepted, including ones not listed in kValidSchemes.
	bool URLPattern::IsValidScheme(base::StringPiece scheme) const {
	  if (valid_schemes_ == SCHEME_ALL)
	    return true;

	  // kValidSchemes and kValidSchemeMasks are parallel arrays: entry |index| of
	  // one describes the same scheme as entry |index| of the other.
	  for (size_t index = 0; index < arraysize(kValidSchemes); ++index) {
	    const bool mask_enabled = (valid_schemes_ & kValidSchemeMasks[index]) != 0;
	    if (mask_enabled && scheme == kValidSchemes[index])
	      return true;
	  }

	  return false;
	}

	// Stores |path| in |path_| and derives |path_escaped_| from it by
	// backslash-escaping every '\' and '?'. Also drops the cached string in
	// |spec_|.
	void URLPattern::SetPath(base::StringPiece path) {
	  spec_.clear();
	  path.CopyToString(&path_);

	  path_escaped_.assign(path_);
	  // Backslashes must be doubled first; otherwise the backslashes inserted in
	  // front of '?' below would themselves get doubled.
	  base::ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\");
	  base::ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?");
	}

	// Stores |port| in |port_| if IsValidPortForScheme() accepts it for the
	// current |scheme_|. Returns false and leaves |port_| untouched otherwise.
	// The cached string in |spec_| is dropped in either case.
	bool URLPattern::SetPort(base::StringPiece port) {
	  spec_.clear();

	  if (!IsValidPortForScheme(scheme_, port))
	    return false;

	  port.CopyToString(&port_);
	  return true;
	}

	bool URLPattern::MatchesURL(const GURL& test) const {
	const GURL* test_url = &test;
	bool has_inner_url = test.inner_url() != NULL;

	if (has_inner_url) {
	if (!test.SchemeIsFileSystem())
	return false; // The only nested URLs we handle are filesystem URLs.
	test_url = test.inner_url();
	}

	if (!MatchesScheme(test_url->scheme_piece()))
	return false;

	if (match_all_urls_)
	return true;

	std::string path_for_request = test.PathForRequest();
	if (has_inner_url) {
	path_for_request = base::StringPrintf("%s%s", test_url->path_piece().data(),
	path_for_request.c_str());
	}

	return MatchesSecurityOriginHelper(*test_url) &&
	MatchesPath(path_for_request);
	}

	bool URLPattern::MatchesSecurityOrigin(const GURL& test) const {
	const GURL* test_url = &test;
	bool has_inner_url = test.inner_url() != NULL;

	if (has_inner_url) {
	if (!test.SchemeIsFileSystem())
	return false; // The only nested URLs we handle are filesystem URLs.
	test_url = test.inner_url();
	}

	if (!MatchesScheme(test_url->scheme()))
	return false;

	if (match_all_urls_)
	return true;

	return MatchesSecurityOriginHelper(*test_url);
	}

	// Returns true if |test| is an allowed scheme (see IsValidScheme()) and this
	// pattern's scheme is either the "*" wildcard or exactly |test|.
	bool URLPattern::MatchesScheme(base::StringPiece test) const {
	  return IsValidScheme(test) && (scheme_ == "*" || test == scheme_);
	}

	// Returns true if |host| is matched by this pattern's host. The host is
	// wrapped in a synthetic "http://<host>/" URL and handed to the GURL
	// overload.
	bool URLPattern::MatchesHost(base::StringPiece host) const {
	  // TODO(devlin): This is a bit sad. Parsing urls is expensive. However, it's
	  // important that we do this conversion to a GURL in order to canonicalize the
	  // host (the pattern's host_ already is canonicalized from Parse()). We can't
	  // just do string comparison.
	  //
	  // The spec is assembled with explicit lengths: a StringPiece is not
	  // guaranteed to be NUL-terminated, so passing host.data() to a "%s" format
	  // could read past the end of the piece.
	  std::string url_spec(url::kHttpScheme);
	  url_spec += url::kStandardSchemeSeparator;
	  if (!host.empty())
	    url_spec.append(host.data(), host.size());
	  url_spec += '/';
	  return MatchesHost(GURL(url_spec));
	}

	bool URLPattern::MatchesHost(const GURL& test) const {
	base::StringPiece test_host(CanonicalizeHostForMatching(test.host_piece()));
	const base::StringPiece pattern_host(CanonicalizeHostForMatching(host_));

	// If we don't care about matching the effective TLD, remove it.
	if (!match_effective_tld_) {
	int reg_length = net::registry_controlled_domains::GetRegistryLength(
	test, net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
	net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
	if (reg_length > 0) {
	test_host = test_host.substr(0, test_host.size() - reg_length - 1);
	}
	}

	// If the hosts are exactly equal, we have a match.
	if (test_host == pattern_host)
	return true;

	// If we're matching subdomains, and we have no host in the match pattern,
	// that means that we're matching all hosts, which means we have a match no
	// matter what the test host is.
	if (match_subdomains_ && pattern_host.empty())
	return true;

	// Otherwise, we can only match if our match pattern matches subdomains.
	if (!match_subdomains_)
	return false;

	// We don't do subdomain matching against IP addresses, so we can give up now
	// if the test host is an IP address.
	if (test.HostIsIPAddress())
	return false;

	// Check if the test host is a subdomain of our host.
	if (test_host.length() <= (pattern_host.length() + 1))
	return false;

	if (!test_host.ends_with(pattern_host))
	return false;

	return test_host[test_host.length() - pattern_host.length() - 1] == '.';
	}

	// Returns true if this pattern matches every host under some effective TLD
	// (or every host outright). |private_filter| and |unknown_filter| are
	// forwarded to the registry-controlled-domain lookups.
	bool URLPattern::MatchesEffectiveTld(
	    net::registry_controlled_domains::PrivateRegistryFilter private_filter,
	    net::registry_controlled_domains::UnknownRegistryFilter unknown_filter)
	    const {
	  namespace rcd = net::registry_controlled_domains;

	  // <all_urls> trivially covers every TLD.
	  if (match_all_urls_)
	    return true;

	  // If this doesn't even match subdomains, it can't possibly be a TLD wildcard.
	  if (!match_subdomains_)
	    return false;

	  // A wildcard-only host (a pattern like http://*/*) matches all hosts.
	  if (host_.empty())
	    return true;

	  // If there was more than just a TLD in the host (e.g., *.foobar.com), it
	  // doesn't match all hosts in an effective TLD.
	  if (rcd::HostHasRegistryControlledDomain(host_, unknown_filter,
	                                           private_filter)) {
	    return false;
	  }

	  // At this point the host could either be just a TLD ("com") or some unknown
	  // TLD-like string ("notatld"). To disambiguate between them, prepend a fake
	  // label and check the registry again. If the TLD is recognized, this is a
	  // pattern like *.com, and it matches an effective TLD.
	  const std::string probe_host = "notatld." + host_;
	  return rcd::HostHasRegistryControlledDomain(probe_host, unknown_filter,
	                                              private_filter);
	}

	// Returns true if this pattern is limited to one scheme and one host, i.e.
	// it is not a TLD-wide pattern, has no wildcard scheme and does not match
	// subdomains.
	bool URLPattern::MatchesSingleOrigin() const {
	  // Strictly speaking, the port is part of the origin, but in URLPattern it
	  // defaults to *. It's not very interesting anyway, so leave it out.
	  if (MatchesEffectiveTld())
	    return false;
	  if (scheme_ == "*")
	    return false;
	  return !match_subdomains_;
	}

	// Returns true if |test| is matched by this pattern's escaped path
	// (|path_escaped_|), either through the special case below or through
	// base::MatchPattern().
	bool URLPattern::MatchesPath(base::StringPiece test) const {
	  // Make the behaviour of OverlapsWith consistent with MatchesURL, which is
	  // needed so that hosted apps on e.g. 'google.com' also run on 'google.com/'.
	  // This is a no-copy way of checking (test + "/*" == path_escaped_).
	  const bool is_test_plus_wildcard =
	      path_escaped_.length() == test.length() + 2 &&
	      base::StartsWith(path_escaped_.c_str(), test,
	                       base::CompareCase::SENSITIVE) &&
	      base::EndsWith(path_escaped_, "/*", base::CompareCase::SENSITIVE);

	  return is_test_plus_wildcard || base::MatchPattern(test, path_escaped_);
	}

	const std::string& URLPattern::GetAsString() const {
	if (!spec_.empty())
	return spec_;

	if (match_all_urls_) {
	spec_ = kAllUrlsPattern;
	return spec_;
	}

	bool standard_scheme = IsStandardScheme(scheme_);

	std::string spec = scheme_ +
	(standard_scheme ? url::kStandardSchemeSeparator : ":");

	if (scheme_ != url::kFileScheme && standard_scheme) {
	if (match_subdomains_) {
	spec += "*";
	if (!host_.empty())
	spec += ".";
	}

	if (!host_.empty())
	spec += host_;

	if (!match_effective_tld_) {
	if (!host_.empty())
	spec += ".";
	spec += "*";
	}

	if (port_ != "*") {
	spec += ":";
	spec += port_;
	}
	}

	if (!path_.empty())
	spec += path_;

	spec_ = std::move(spec);
	return spec_;
	}

	// Returns true if this pattern and |other| may overlap. Each component
	// (scheme, host, port, path) must be matched in at least one direction:
	// this pattern matching |other|'s component, or the other way around.
	bool URLPattern::OverlapsWith(const URLPattern& other) const {
	  if (match_all_urls() || other.match_all_urls())
	    return true;

	  // Schemes.
	  if (!MatchesAnyScheme(other.GetExplicitSchemes()) &&
	      !other.MatchesAnyScheme(GetExplicitSchemes())) {
	    return false;
	  }

	  // Hosts.
	  if (!MatchesHost(other.host()) && !other.MatchesHost(host()))
	    return false;

	  // Ports.
	  if (!MatchesPortPattern(other.port()) && !other.MatchesPortPattern(port()))
	    return false;

	  // Paths, compared with any trailing wildcard stripped from the tested side.
	  return MatchesPath(StripTrailingWildcard(other.path())) ||
	         other.MatchesPath(StripTrailingWildcard(path()));
	}

	// Returns true if this pattern covers |other|: it matches all of |other|'s
	// explicit schemes, its host, its port and its path (trailing wildcard
	// stripped), and |other| does not match subdomains unless this pattern
	// does too.
	bool URLPattern::Contains(const URLPattern& other) const {
	  if (match_all_urls())
	    return true;

	  if (!MatchesAllSchemes(other.GetExplicitSchemes()))
	    return false;

	  if (!MatchesHost(other.host()))
	    return false;

	  // A pattern without subdomain matching cannot contain one that has it.
	  if (other.match_subdomains_ && !match_subdomains_)
	    return false;

	  if (!MatchesPortPattern(other.port()))
	    return false;

	  return MatchesPath(StripTrailingWildcard(other.path()));
	}

	// Returns true if at least one entry of |schemes| passes MatchesScheme().
	// An empty list yields false.
	bool URLPattern::MatchesAnyScheme(
	    const std::vector<std::string>& schemes) const {
	  for (const std::string& candidate : schemes) {
	    if (MatchesScheme(candidate))
	      return true;
	  }
	  return false;
	}

	// Returns true if every entry of |schemes| passes MatchesScheme(). An empty
	// list yields true.
	bool URLPattern::MatchesAllSchemes(
	    const std::vector<std::string>& schemes) const {
	  for (const std::string& candidate : schemes) {
	    if (!MatchesScheme(candidate))
	      return false;
	  }
	  return true;
	}

	// Returns true if the host and effective port of |test| are matched by this
	// pattern. The scheme is not checked here.
	bool URLPattern::MatchesSecurityOriginHelper(const GURL& test) const {
	  // Ignore hostname if scheme is file://.
	  const bool host_matches = scheme_ == url::kFileScheme || MatchesHost(test);

	  return host_matches &&
	         MatchesPortPattern(base::IntToString(test.EffectiveIntPort()));
	}

	// Returns true if this pattern's port is the "*" wildcard or is exactly
	// |port|.
	bool URLPattern::MatchesPortPattern(base::StringPiece port) const {
	  if (port_ == "*")
	    return true;
	  return port_ == port;
	}

	// Returns the concrete schemes this pattern stands for. A pattern with a
	// single valid, non-wildcard scheme yields just that scheme; otherwise the
	// result lists every entry of kValidSchemes that passes MatchesScheme().
	std::vector<std::string> URLPattern::GetExplicitSchemes() const {
	  std::vector<std::string> explicit_schemes;

	  const bool has_single_scheme =
	      scheme_ != "*" && !match_all_urls_ && IsValidScheme(scheme_);
	  if (has_single_scheme) {
	    explicit_schemes.push_back(scheme_);
	    return explicit_schemes;
	  }

	  // Wildcard scheme or <all_urls>: expand into every known scheme we match.
	  for (const char* known_scheme : kValidSchemes) {
	    if (MatchesScheme(known_scheme))
	      explicit_schemes.push_back(known_scheme);
	  }

	  return explicit_schemes;
	}

	std::vector<URLPattern> URLPattern::ConvertToExplicitSchemes() const {
	std::vector<std::string> explicit_schemes = GetExplicitSchemes();
	std::vector<URLPattern> result;

	for (std::vector<std::string>::const_iterator i = explicit_schemes.begin();
	i != explicit_schemes.end(); ++i) {
	URLPattern temp = *this;
	temp.SetScheme(*i);
	temp.SetMatchAllURLs(false);
	result.push_back(temp);
	}

	return result;
	}

	// static
	// Looks up the message for |parse_result| in kParseResultMessages. The value
	// is used directly as the array index; no range check is performed here.
	const char* URLPattern::GetParseResultString(
	    URLPattern::ParseResult parse_result) {
	  const char* const message = kParseResultMessages[parse_result];
	  return message;
	}