// net/base/url_util.cc - chromium/src - Git at Google

 // Copyright 2013 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "net/base/url_util.h"

 #include "base/logging.h"
 #include "base/strings/string_util.h"
 #include "base/strings/stringprintf.h"
 #include "net/base/escape.h"
 #include "url/gurl.h"
 #include "url/third_party/mozilla/url_parse.h"
 #include "url/url_canon.h"
 #include "url/url_canon_ip.h"

 namespace net {

 namespace {

 // Returns true when |c| is an ASCII lowercase letter or a decimal digit.
 // Uppercase letters are intentionally not accepted: uppercase characters
 // have already been normalized away by the time this is called.
 bool IsHostCharAlphanumeric(char c) {
   const bool is_lower_alpha = ('a' <= c) && (c <= 'z');
   const bool is_digit = ('0' <= c) && (c <= '9');
   return is_lower_alpha || is_digit;
 }

 }  // namespace

 // Returns a copy of |url| whose query has "name=value" appended. Both |name|
 // and |value| are passed through EscapeQueryParamValue() first, and an "&"
 // separator is inserted when |url| already has a non-empty query.
 GURL AppendQueryParameter(const GURL& url,
                           const std::string& name,
                           const std::string& value) {
   std::string new_query(url.query());
   if (!new_query.empty())
     new_query += "&";

   const std::string escaped_name = EscapeQueryParamValue(name, true);
   const std::string escaped_value = EscapeQueryParamValue(value, true);
   new_query += (escaped_name + "=" + escaped_value);

   GURL::Replacements query_replacement;
   query_replacement.SetQueryStr(new_query);
   return url.ReplaceComponents(query_replacement);
 }

 // Returns a copy of |url| in which the first query pair whose key equals the
 // escaped |name| is rewritten as "name=value" (both escaped with
 // EscapeQueryParamValue()). If no pair matches, the escaped "name=value" is
 // appended instead. All other pairs are copied through verbatim, in their
 // original order, joined with "&".
 GURL AppendOrReplaceQueryParameter(const GURL& url,
                                    const std::string& name,
                                    const std::string& value) {
   bool replaced = false;
   const std::string param_name = EscapeQueryParamValue(name, true);
   const std::string param_value = EscapeQueryParamValue(value, true);

   const std::string input = url.query();
   url::Component cursor(0, static_cast<int>(input.size()));
   std::string output;
   url::Component key_range, value_range;
   while (url::ExtractQueryKeyValue(input.data(), &cursor, &key_range,
                                    &value_range)) {
     const base::StringPiece key(
         input.data() + key_range.begin, key_range.len);
     if (!output.empty())
       output += "&";

     // Check |replaced| as only the first pair should be replaced.
     if (!replaced && key == param_name) {
       replaced = true;
       output += param_name;
       output += "=";
       output += param_value;
     } else {
       // Copy the "key=value" span directly out of |input|. Passing
       // input.data() here would select the (const std::string&, pos, len)
       // overload and build a temporary copy of the whole query for every
       // pair.
       output.append(input, key_range.begin,
                     value_range.end() - key_range.begin);
     }
   }
   if (!replaced) {
     if (!output.empty())
       output += "&";

     output += param_name;
     output += "=";
     output += param_value;
   }
   GURL::Replacements replacements;
   replacements.SetQueryStr(output);
   return url.ReplaceComponents(replacements);
 }

 // Binds the iterator to |url|. An invalid |url| produces an iterator that is
 // already at its end. Otherwise the URL's parsed query component is captured
 // and Advance() is called once to load the first key/value pair.
 QueryIterator::QueryIterator(const GURL& url)
     : url_(url),
       at_end_(!url.is_valid()) {
   if (at_end_)
     return;

   query_ = url.parsed_for_possibly_invalid_spec().query;
   Advance();
 }

 // Nothing to release explicitly; members clean up after themselves.
 QueryIterator::~QueryIterator() = default;

 // Returns the current pair's key exactly as it appears in the URL spec (no
 // unescaping is applied), or an empty string when the key component is
 // empty. Must not be called once the iterator is at its end.
 std::string QueryIterator::GetKey() const {
   DCHECK(!at_end_);
   if (!key_.is_nonempty())
     return std::string();
   return url_.spec().substr(key_.begin, key_.len);
 }

 // Returns the current pair's value exactly as it appears in the URL spec (no
 // unescaping is applied), or an empty string when the value component is
 // empty. Must not be called once the iterator is at its end.
 std::string QueryIterator::GetValue() const {
   DCHECK(!at_end_);
   if (!value_.is_nonempty())
     return std::string();
   return url_.spec().substr(value_.begin, value_.len);
 }

 // Returns the current pair's value after unescaping it with the SPACES,
 // URL_SPECIAL_CHARS and REPLACE_PLUS_WITH_SPACE rules. The result is stored
 // in |unescaped_value_| and reused while that member is non-empty; Advance()
 // clears it. Must not be called once the iterator is at its end.
 const std::string& QueryIterator::GetUnescapedValue() {
   DCHECK(!at_end_);
   // Nothing to compute when a result is already stored or the raw value is
   // empty.
   if (!unescaped_value_.empty() || !value_.is_nonempty())
     return unescaped_value_;

   unescaped_value_ = UnescapeURLComponent(
       GetValue(),
       UnescapeRule::SPACES |
       UnescapeRule::URL_SPECIAL_CHARS |
       UnescapeRule::REPLACE_PLUS_WITH_SPACE);
   return unescaped_value_;
 }

 // Returns true once there is no current key/value pair: either the URL was
 // invalid at construction or the last Advance() found no further pair.
 bool QueryIterator::IsAtEnd() const {
   return at_end_;
 }

 // Moves to the next key/value pair in the query. State held for the previous
 // pair (key, value and the stored unescaped value) is discarded first;
 // |at_end_| becomes true when no further pair can be extracted.
 void QueryIterator::Advance() {
   DCHECK(!at_end_);
   unescaped_value_.clear();
   value_.reset();
   key_.reset();

   const bool found_pair = url::ExtractQueryKeyValue(
       url_.spec().c_str(), &query_, &key_, &value_);
   at_end_ = !found_pair;
 }

 bool GetValueForKeyInQuery(const GURL& url,
                            const std::string& search_key,
                            std::string* out_value) {
   for (QueryIterator it(url); !it.IsAtEnd(); it.Advance()) {
     if (it.GetKey() == search_key) {
       *out_value = it.GetUnescapedValue();
       return true;
     }
   }
   return false;
 }

 bool ParseHostAndPort(std::string::const_iterator host_and_port_begin,
                       std::string::const_iterator host_and_port_end,
                       std::string* host,
                       int* port) {
   if (host_and_port_begin >= host_and_port_end)
     return false;

   // When using url, we use char*.
   const char* auth_begin = &(*host_and_port_begin);
   int auth_len = host_and_port_end - host_and_port_begin;

   url::Component auth_component(0, auth_len);
   url::Component username_component;
   url::Component password_component;
   url::Component hostname_component;
   url::Component port_component;

   url::ParseAuthority(auth_begin, auth_component, &username_component,
       &password_component, &hostname_component, &port_component);

   // There shouldn't be a username/password.
   if (username_component.is_valid() || password_component.is_valid())
     return false;

   if (!hostname_component.is_nonempty())
     return false;  // Failed parsing.

   int parsed_port_number = -1;
   if (port_component.is_nonempty()) {
     parsed_port_number = url::ParsePort(auth_begin, port_component);

     // If parsing failed, port_number will be either PORT_INVALID or
     // PORT_UNSPECIFIED, both of which are negative.
     if (parsed_port_number < 0)
       return false;  // Failed parsing the port number.
   }

   if (port_component.len == 0)
     return false;  // Reject inputs like "foo:"

   unsigned char tmp_ipv6_addr[16];

   // If the hostname starts with a bracket, it is either an IPv6 literal or
   // invalid. If it is an IPv6 literal then strip the brackets.
   if (hostname_component.len > 0 &&
       auth_begin[hostname_component.begin] == '[') {
     if (auth_begin[hostname_component.end() - 1] == ']' &&
         url::IPv6AddressToNumber(
             auth_begin, hostname_component, tmp_ipv6_addr)) {
       // Strip the brackets.
       hostname_component.begin++;
       hostname_component.len -= 2;
     } else {
       return false;
     }
   }

   // Pass results back to caller.
   host->assign(auth_begin + hostname_component.begin, hostname_component.len);
   *port = parsed_port_number;

   return true;  // Success.
 }

 bool ParseHostAndPort(const std::string& host_and_port,
                       std::string* host,
                       int* port) {
   return ParseHostAndPort(
       host_and_port.begin(), host_and_port.end(), host, port);
 }


 // Formats |url| as "host:port" using the URL's effective integer port.
 // GURL::host() already includes the brackets of an IPv6 literal, so simply
 // appending ":port" is safe.
 std::string GetHostAndPort(const GURL& url) {
   const std::string host = url.host();
   const int effective_port = url.EffectiveIntPort();
   return base::StringPrintf("%s:%d", host.c_str(), effective_port);
 }

 // Returns "host:port" when |url| has a port, and just the host otherwise.
 // GURL::host() already includes the brackets of an IPv6 literal, so simply
 // appending ":port" is safe.
 std::string GetHostAndOptionalPort(const GURL& url) {
   if (!url.has_port())
     return url.host();

   return base::StringPrintf("%s:%s", url.host().c_str(), url.port().c_str());
 }

 // Returns |host| as a std::string with one trailing '.' removed. Only a
 // single dot is stripped, and a host that consists of just "." (length 1)
 // is returned unchanged.
 std::string TrimEndingDot(base::StringPiece host) {
   const size_t length = host.length();
   const bool has_trailing_dot = length > 1 && host[length - 1] == '.';
   if (has_trailing_dot)
     host.remove_suffix(1);
   return host.as_string();
 }

 // Returns the host of |url| with one trailing dot trimmed, or the full spec
 // when |url| has no host.
 std::string GetHostOrSpecFromURL(const GURL& url) {
   if (url.has_host())
     return TrimEndingDot(url.host_piece());
   return url.spec();
 }

 // Canonicalizes |host| with url::CanonicalizeHostVerbose(), filling in
 // |*host_info|. Returns the canonical host, or an empty string when the
 // canonical output is empty or |host_info->family| is BROKEN.
 std::string CanonicalizeHost(base::StringPiece host,
                              url::CanonHostInfo* host_info) {
   std::string canonical;
   url::StdStringCanonOutput output(&canonical);
   const url::Component whole_host(0, static_cast<int>(host.length()));
   url::CanonicalizeHostVerbose(host.data(), whole_host, &output, host_info);

   const bool failed = !host_info->out_host.is_nonempty() ||
                       host_info->family == url::CanonHostInfo::BROKEN;
   if (failed) {
     // Empty host, or canonicalization failed: report an empty result.
     canonical.clear();
   } else {
     // Success. Finalize the output and check there is no extra garbage.
     output.Complete();
     DCHECK_EQ(host_info->out_host.len, static_cast<int>(canonical.length()));
   }

   return canonical;
 }

 // Checks the shape of an already-canonicalized |host|. Every dot-separated
 // component must be non-empty and consist only of lowercase letters, digits,
 // '-' and '_'. In addition, the most recently started component must begin
 // with a letter or digit. An empty |host| is not compliant.
 bool IsCanonicalizedHostCompliant(const std::string& host) {
   if (host.empty())
     return false;

   bool inside_component = false;
   bool last_component_starts_alnum = false;

   for (const char c : host) {
     if (!inside_component) {
       // First character of a component.
       last_component_starts_alnum = IsHostCharAlphanumeric(c);
       const bool valid_start =
           last_component_starts_alnum || (c == '-') || (c == '_');
       if (!valid_start)
         return false;
       inside_component = true;
       continue;
     }

     if (c == '.') {
       // End of the current component.
       inside_component = false;
       continue;
     }

     const bool valid_char =
         IsHostCharAlphanumeric(c) || (c == '-') || (c == '_');
     if (!valid_char)
       return false;
   }

   return last_component_starts_alnum;
 }

 // Returns a copy of |url| with its username, password and ref components
 // cleared. |url| is expected to be valid.
 GURL SimplifyUrlForRequest(const GURL& url) {
   DCHECK(url.is_valid());
   GURL::Replacements stripped;
   stripped.ClearRef();
   stripped.ClearUsername();
   stripped.ClearPassword();
   return url.ReplaceComponents(stripped);
 }

 void GetIdentityFromURL(const GURL& url,
                         base::string16* username,
                         base::string16* password) {
   UnescapeRule::Type flags =
       UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS;
   *username = UnescapeAndDecodeUTF8URLComponent(url.username(), flags);
   *password = UnescapeAndDecodeUTF8URLComponent(url.password(), flags);
 }

 bool HasGoogleHost(const GURL& url) {
   static const char* kGoogleHostSuffixes[] = {
       ".google.com",
       ".youtube.com",
       ".gmail.com",
       ".doubleclick.net",
       ".gstatic.com",
       ".googlevideo.com",
       ".googleusercontent.com",
       ".googlesyndication.com",
       ".google-analytics.com",
       ".googleadservices.com",
       ".googleapis.com",
       ".ytimg.com",
   };
   base::StringPiece host = url.host_piece();
   for (const char* suffix : kGoogleHostSuffixes) {
     // Here it's possible to get away with faster case-sensitive comparisons
     // because the list above is all lowercase, and a GURL's host name will
     // always be canonicalized to lowercase as well.
     if (base::EndsWith(host, suffix, base::CompareCase::SENSITIVE))
       return true;
   }
   return false;
 }

 }  // namespace net
	// Copyright 2013 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "net/base/url_util.h"

	#include "base/logging.h"
	#include "base/strings/string_util.h"
	#include "base/strings/stringprintf.h"
	#include "net/base/escape.h"
	#include "url/gurl.h"
	#include "url/third_party/mozilla/url_parse.h"
	#include "url/url_canon.h"
	#include "url/url_canon_ip.h"

	namespace net {

	namespace {

	// Returns true when |c| is an ASCII lowercase letter or a decimal digit.
	// Uppercase letters are intentionally not accepted: uppercase characters
	// have already been normalized away by the time this is called.
	bool IsHostCharAlphanumeric(char c) {
	  const bool is_lower_alpha = ('a' <= c) && (c <= 'z');
	  const bool is_digit = ('0' <= c) && (c <= '9');
	  return is_lower_alpha || is_digit;
	}

	} // namespace

	// Returns a copy of |url| whose query has "name=value" appended. Both |name|
	// and |value| are passed through EscapeQueryParamValue() first, and an "&"
	// separator is inserted when |url| already has a non-empty query.
	GURL AppendQueryParameter(const GURL& url,
	                          const std::string& name,
	                          const std::string& value) {
	  std::string new_query(url.query());
	  if (!new_query.empty())
	    new_query += "&";

	  const std::string escaped_name = EscapeQueryParamValue(name, true);
	  const std::string escaped_value = EscapeQueryParamValue(value, true);
	  new_query += (escaped_name + "=" + escaped_value);

	  GURL::Replacements query_replacement;
	  query_replacement.SetQueryStr(new_query);
	  return url.ReplaceComponents(query_replacement);
	}

	// Returns a copy of |url| in which the first query pair whose key equals the
	// escaped |name| is rewritten as "name=value" (both escaped with
	// EscapeQueryParamValue()). If no pair matches, the escaped "name=value" is
	// appended instead. All other pairs are copied through verbatim, in their
	// original order, joined with "&".
	GURL AppendOrReplaceQueryParameter(const GURL& url,
	                                   const std::string& name,
	                                   const std::string& value) {
	  bool replaced = false;
	  const std::string param_name = EscapeQueryParamValue(name, true);
	  const std::string param_value = EscapeQueryParamValue(value, true);

	  const std::string input = url.query();
	  url::Component cursor(0, static_cast<int>(input.size()));
	  std::string output;
	  url::Component key_range, value_range;
	  while (url::ExtractQueryKeyValue(input.data(), &cursor, &key_range,
	                                   &value_range)) {
	    const base::StringPiece key(
	        input.data() + key_range.begin, key_range.len);
	    if (!output.empty())
	      output += "&";

	    // Check |replaced| as only the first pair should be replaced.
	    if (!replaced && key == param_name) {
	      replaced = true;
	      output += param_name;
	      output += "=";
	      output += param_value;
	    } else {
	      // Copy the "key=value" span directly out of |input|. Passing
	      // input.data() here would select the (const std::string&, pos, len)
	      // overload and build a temporary copy of the whole query for every
	      // pair.
	      output.append(input, key_range.begin,
	                    value_range.end() - key_range.begin);
	    }
	  }
	  if (!replaced) {
	    if (!output.empty())
	      output += "&";

	    output += param_name;
	    output += "=";
	    output += param_value;
	  }
	  GURL::Replacements replacements;
	  replacements.SetQueryStr(output);
	  return url.ReplaceComponents(replacements);
	}

	// Binds the iterator to |url|. An invalid |url| produces an iterator that is
	// already at its end. Otherwise the URL's parsed query component is captured
	// and Advance() is called once to load the first key/value pair.
	QueryIterator::QueryIterator(const GURL& url)
	    : url_(url),
	      at_end_(!url.is_valid()) {
	  if (at_end_)
	    return;

	  query_ = url.parsed_for_possibly_invalid_spec().query;
	  Advance();
	}

	// Nothing to release explicitly; members clean up after themselves.
	QueryIterator::~QueryIterator() = default;

	// Returns the current pair's key exactly as it appears in the URL spec (no
	// unescaping is applied), or an empty string when the key component is
	// empty. Must not be called once the iterator is at its end.
	std::string QueryIterator::GetKey() const {
	  DCHECK(!at_end_);
	  if (!key_.is_nonempty())
	    return std::string();
	  return url_.spec().substr(key_.begin, key_.len);
	}

	// Returns the current pair's value exactly as it appears in the URL spec (no
	// unescaping is applied), or an empty string when the value component is
	// empty. Must not be called once the iterator is at its end.
	std::string QueryIterator::GetValue() const {
	  DCHECK(!at_end_);
	  if (!value_.is_nonempty())
	    return std::string();
	  return url_.spec().substr(value_.begin, value_.len);
	}

	// Returns the current pair's value after unescaping it with the SPACES,
	// URL_SPECIAL_CHARS and REPLACE_PLUS_WITH_SPACE rules. The result is stored
	// in |unescaped_value_| and reused while that member is non-empty; Advance()
	// clears it. Must not be called once the iterator is at its end.
	const std::string& QueryIterator::GetUnescapedValue() {
	  DCHECK(!at_end_);
	  // Nothing to compute when a result is already stored or the raw value is
	  // empty.
	  if (!unescaped_value_.empty() || !value_.is_nonempty())
	    return unescaped_value_;

	  unescaped_value_ = UnescapeURLComponent(
	      GetValue(),
	      UnescapeRule::SPACES |
	      UnescapeRule::URL_SPECIAL_CHARS |
	      UnescapeRule::REPLACE_PLUS_WITH_SPACE);
	  return unescaped_value_;
	}

	// Returns true once there is no current key/value pair: either the URL was
	// invalid at construction or the last Advance() found no further pair.
	bool QueryIterator::IsAtEnd() const {
	return at_end_;
	}

	// Moves to the next key/value pair in the query. State held for the previous
	// pair (key, value and the stored unescaped value) is discarded first;
	// |at_end_| becomes true when no further pair can be extracted.
	void QueryIterator::Advance() {
	  DCHECK(!at_end_);
	  unescaped_value_.clear();
	  value_.reset();
	  key_.reset();

	  const bool found_pair = url::ExtractQueryKeyValue(
	      url_.spec().c_str(), &query_, &key_, &value_);
	  at_end_ = !found_pair;
	}

	bool GetValueForKeyInQuery(const GURL& url,
	const std::string& search_key,
	std::string* out_value) {
	for (QueryIterator it(url); !it.IsAtEnd(); it.Advance()) {
	if (it.GetKey() == search_key) {
	*out_value = it.GetUnescapedValue();
	return true;
	}
	}
	return false;
	}

	bool ParseHostAndPort(std::string::const_iterator host_and_port_begin,
	std::string::const_iterator host_and_port_end,
	std::string* host,
	int* port) {
	if (host_and_port_begin >= host_and_port_end)
	return false;

	// When using url, we use char*.
	const char* auth_begin = &(*host_and_port_begin);
	int auth_len = host_and_port_end - host_and_port_begin;

	url::Component auth_component(0, auth_len);
	url::Component username_component;
	url::Component password_component;
	url::Component hostname_component;
	url::Component port_component;

	url::ParseAuthority(auth_begin, auth_component, &username_component,
	&password_component, &hostname_component, &port_component);

	// There shouldn't be a username/password.
	if (username_component.is_valid() \|\| password_component.is_valid())
	return false;

	if (!hostname_component.is_nonempty())
	return false; // Failed parsing.

	int parsed_port_number = -1;
	if (port_component.is_nonempty()) {
	parsed_port_number = url::ParsePort(auth_begin, port_component);

	// If parsing failed, port_number will be either PORT_INVALID or
	// PORT_UNSPECIFIED, both of which are negative.
	if (parsed_port_number < 0)
	return false; // Failed parsing the port number.
	}

	if (port_component.len == 0)
	return false; // Reject inputs like "foo:"

	unsigned char tmp_ipv6_addr[16];

	// If the hostname starts with a bracket, it is either an IPv6 literal or
	// invalid. If it is an IPv6 literal then strip the brackets.
	if (hostname_component.len > 0 &&
	auth_begin[hostname_component.begin] == '[') {
	if (auth_begin[hostname_component.end() - 1] == ']' &&
	url::IPv6AddressToNumber(
	auth_begin, hostname_component, tmp_ipv6_addr)) {
	// Strip the brackets.
	hostname_component.begin++;
	hostname_component.len -= 2;
	} else {
	return false;
	}
	}

	// Pass results back to caller.
	host->assign(auth_begin + hostname_component.begin, hostname_component.len);
	*port = parsed_port_number;

	return true; // Success.
	}

	bool ParseHostAndPort(const std::string& host_and_port,
	std::string* host,
	int* port) {
	return ParseHostAndPort(
	host_and_port.begin(), host_and_port.end(), host, port);
	}


	// Formats |url| as "host:port" using the URL's effective integer port.
	// GURL::host() already includes the brackets of an IPv6 literal, so simply
	// appending ":port" is safe.
	std::string GetHostAndPort(const GURL& url) {
	  const std::string host = url.host();
	  const int effective_port = url.EffectiveIntPort();
	  return base::StringPrintf("%s:%d", host.c_str(), effective_port);
	}

	// Returns "host:port" when |url| has a port, and just the host otherwise.
	// GURL::host() already includes the brackets of an IPv6 literal, so simply
	// appending ":port" is safe.
	std::string GetHostAndOptionalPort(const GURL& url) {
	  if (!url.has_port())
	    return url.host();

	  return base::StringPrintf("%s:%s", url.host().c_str(), url.port().c_str());
	}

	// Returns |host| as a std::string with one trailing '.' removed. Only a
	// single dot is stripped, and a host that consists of just "." (length 1)
	// is returned unchanged.
	std::string TrimEndingDot(base::StringPiece host) {
	  const size_t length = host.length();
	  const bool has_trailing_dot = length > 1 && host[length - 1] == '.';
	  if (has_trailing_dot)
	    host.remove_suffix(1);
	  return host.as_string();
	}

	// Returns the host of |url| with one trailing dot trimmed, or the full spec
	// when |url| has no host.
	std::string GetHostOrSpecFromURL(const GURL& url) {
	  if (url.has_host())
	    return TrimEndingDot(url.host_piece());
	  return url.spec();
	}

	// Canonicalizes |host| with url::CanonicalizeHostVerbose(), filling in
	// |*host_info|. Returns the canonical host, or an empty string when the
	// canonical output is empty or |host_info->family| is BROKEN.
	std::string CanonicalizeHost(base::StringPiece host,
	                             url::CanonHostInfo* host_info) {
	  std::string canonical;
	  url::StdStringCanonOutput output(&canonical);
	  const url::Component whole_host(0, static_cast<int>(host.length()));
	  url::CanonicalizeHostVerbose(host.data(), whole_host, &output, host_info);

	  const bool failed = !host_info->out_host.is_nonempty() ||
	                      host_info->family == url::CanonHostInfo::BROKEN;
	  if (failed) {
	    // Empty host, or canonicalization failed: report an empty result.
	    canonical.clear();
	  } else {
	    // Success. Finalize the output and check there is no extra garbage.
	    output.Complete();
	    DCHECK_EQ(host_info->out_host.len, static_cast<int>(canonical.length()));
	  }

	  return canonical;
	}

	// Checks the shape of an already-canonicalized |host|. Every dot-separated
	// component must be non-empty and consist only of lowercase letters, digits,
	// '-' and '_'. In addition, the most recently started component must begin
	// with a letter or digit. An empty |host| is not compliant.
	bool IsCanonicalizedHostCompliant(const std::string& host) {
	  if (host.empty())
	    return false;

	  bool inside_component = false;
	  bool last_component_starts_alnum = false;

	  for (const char c : host) {
	    if (!inside_component) {
	      // First character of a component.
	      last_component_starts_alnum = IsHostCharAlphanumeric(c);
	      const bool valid_start =
	          last_component_starts_alnum || (c == '-') || (c == '_');
	      if (!valid_start)
	        return false;
	      inside_component = true;
	      continue;
	    }

	    if (c == '.') {
	      // End of the current component.
	      inside_component = false;
	      continue;
	    }

	    const bool valid_char =
	        IsHostCharAlphanumeric(c) || (c == '-') || (c == '_');
	    if (!valid_char)
	      return false;
	  }

	  return last_component_starts_alnum;
	}

	// Returns a copy of |url| with its username, password and ref components
	// cleared. |url| is expected to be valid.
	GURL SimplifyUrlForRequest(const GURL& url) {
	  DCHECK(url.is_valid());
	  GURL::Replacements stripped;
	  stripped.ClearRef();
	  stripped.ClearUsername();
	  stripped.ClearPassword();
	  return url.ReplaceComponents(stripped);
	}

	void GetIdentityFromURL(const GURL& url,
	base::string16* username,
	base::string16* password) {
	UnescapeRule::Type flags =
	UnescapeRule::SPACES \| UnescapeRule::URL_SPECIAL_CHARS;
	*username = UnescapeAndDecodeUTF8URLComponent(url.username(), flags);
	*password = UnescapeAndDecodeUTF8URLComponent(url.password(), flags);
	}

	bool HasGoogleHost(const GURL& url) {
	static const char* kGoogleHostSuffixes[] = {
	".google.com",
	".youtube.com",
	".gmail.com",
	".doubleclick.net",
	".gstatic.com",
	".googlevideo.com",
	".googleusercontent.com",
	".googlesyndication.com",
	".google-analytics.com",
	".googleadservices.com",
	".googleapis.com",
	".ytimg.com",
	};
	base::StringPiece host = url.host_piece();
	for (const char* suffix : kGoogleHostSuffixes) {
	// Here it's possible to get away with faster case-sensitive comparisons
	// because the list above is all lowercase, and a GURL's host name will
	// always be canonicalized to lowercase as well.
	if (base::EndsWith(host, suffix, base::CompareCase::SENSITIVE))
	return true;
	}
	return false;
	}

	} // namespace net