// components/link_header_util/link_header_util.cc - codesearch/chromium/src - Git at Google

 // Copyright 2016 The Chromium Authors
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "components/link_header_util/link_header_util.h"

 #include <algorithm>
 #include <string>
 #include <string_view>
 #include <unordered_map>

 #include "base/strings/string_util.h"
 #include "net/http/http_util.h"

 namespace link_header_util {

 namespace {

 // Splits a header value into comma-separated pieces, in the spirit of
 // base::StringTokenizer and net::HttpUtil::ValuesIterator.
 //
 // The only separator is ','. A separator is ignored while inside a quoted run:
 //   * "..." - closed by the next '"' that is not preceded by a '\' escape.
 //   * <...> - closed by the next '>'; no escaping is recognised here.
 // A quoted run that is never closed extends to the end of the input.
 //
 // Like ValuesIterator, every piece has surrounding whitespace stripped and
 // pieces that end up empty are never reported.
 class ValueTokenizer {
  public:
   ValueTokenizer(std::string::const_iterator begin,
                  std::string::const_iterator end)
       : token_begin_(begin), token_end_(begin), end_(end) {}

   // Bounds of the most recently reported value; meaningful only after
   // GetNext() has returned true.
   std::string::const_iterator token_begin() const { return token_begin_; }
   std::string::const_iterator token_end() const { return token_end_; }

   // Moves on to the next non-empty value. Returns false once the input has
   // been exhausted.
   bool GetNext() {
     for (;;) {
       if (!GetNextInternal()) {
         return false;
       }
       net::HttpUtil::TrimLWS(&token_begin_, &token_end_);
       // Values that are empty after trimming are skipped.
       if (token_begin_ != token_end_) {
         return true;
       }
     }
   }

  private:
   // Scans the next raw (untrimmed, possibly empty) token and stores its bounds
   // in token_begin_ / token_end_. Returns false if the end of the input had
   // already been reached when called.
   bool GetNextInternal() {
     // token_end_ is wherever the previous call left it: the first input
     // character on the first call, otherwise the end of the last token
     // (a delimiter, trimmed whitespace, or the end of the string).
     if (token_end_ == end_) {
       return false;
     }

     // Step over the delimiter that terminated the previous token.
     if (*token_end_ == ',') {
       ++token_end_;
     }
     token_begin_ = token_end_;

     // Scanner position relative to quoting.
     enum class Scan {
       kBare,          // Not inside any quoted run.
       kAngle,         // Inside <...>; '\' has no special meaning.
       kQuoted,        // Inside "...".
       kQuotedEscape,  // Inside "...", previous character was an escaping '\'.
     };

     Scan state = Scan::kBare;
     for (; token_end_ != end_; ++token_end_) {
       const char ch = *token_end_;
       // An unquoted comma ends the token; token_end_ stays on the comma.
       if (state == Scan::kBare && ch == ',') {
         break;
       }
       switch (state) {
         case Scan::kBare:
           if (ch == '"') {
             state = Scan::kQuoted;
           } else if (ch == '<') {
             state = Scan::kAngle;
           }
           break;
         case Scan::kAngle:
           if (ch == '>') {
             state = Scan::kBare;
           }
           break;
         case Scan::kQuoted:
           if (ch == '\\') {
             state = Scan::kQuotedEscape;
           } else if (ch == '"') {
             state = Scan::kBare;
           }
           break;
         case Scan::kQuotedEscape:
           // Whatever follows the '\' is consumed literally.
           state = Scan::kQuoted;
           break;
       }
     }
     return true;
   }

   std::string::const_iterator token_begin_;
   std::string::const_iterator token_end_;
   std::string::const_iterator end_;
 };

 // Extracts the URL portion of a single Link header value.
 //
 // `header` must be non-empty and is expected to start with '<'. The URL is the
 // text between that '<' and the first '>' that follows, with surrounding
 // whitespace trimmed. On success the URL is returned and `params_string` is
 // set to everything after that '>'. On failure std::nullopt is returned and
 // `params_string` is left untouched.
 std::optional<std::string> ExtractURL(std::string_view header,
                                       std::string_view& params_string) {
   // ParseLinkHeaderValue() rejects empty input before calling this function,
   // so reading the first character needs no extra emptiness check.
   if (header[0] != '<') {
     return std::nullopt;
   }

   // Without a closing '>' there is no URL.
   const size_t close_pos = header.find('>');
   if (close_pos == std::string_view::npos) {
     return std::nullopt;
   }

   // Parameters start right after the '>'.
   params_string = header.substr(close_pos + 1);

   // The URL sits strictly between the '<' at index 0 and the '>' found above.
   const std::string_view raw_url = header.substr(1, close_pos - 1);
   return std::string(net::HttpUtil::TrimLWS(raw_url));
 }

 }  // namespace

 std::vector<StringIteratorPair> SplitLinkHeader(const std::string& header) {
   std::vector<StringIteratorPair> values;
   ValueTokenizer tokenizer(header.begin(), header.end());
   while (tokenizer.GetNext()) {
     values.push_back(
         StringIteratorPair(tokenizer.token_begin(), tokenizer.token_end()));
   }
   return values;
 }

 // Parses one link in a link header into its url and parameters.
 // A link is of the form "<some-url>; param1=value1; param2=value2".
 // Returns nullopt if parsing the link failed, returns the URL as a string on
 // success. This method is more lenient than the RFC. It doesn't fail on things
 // like invalid characters in the URL, and also doesn't verify that certain
 // parameters should or shouldn't be quoted strings.
 //
 // If a parameter occurs more than once in the link, only the first value is
 // returned in params as this is the required behavior for all attributes chrome
 // currently cares about in link headers.
 std::optional<std::string> ParseLinkHeaderValue(
     std::string_view header,
     std::unordered_map<std::string, std::optional<std::string>>& params) {
   // Can't parse an empty string.
   if (header.empty()) {
     return std::nullopt;
   }

   // Extract the URL part (everything between '<' and first '>' character).
   std::string_view params_string;
   auto url = ExtractURL(header, params_string);
   if (!url) {
     return std::nullopt;
   }

   // Trim any remaining whitespace, and make sure there is a ';' separating
   // parameters from the URL.
   params_string = net::HttpUtil::TrimLWS(params_string);
   if (!params_string.empty() && params_string.front() != ';') {
     return std::nullopt;
   }

   // Parse all the parameters.
   net::HttpUtil::NameValuePairsIterator params_iterator(
       params_string, /*delimiter=*/';',
       net::HttpUtil::NameValuePairsIterator::Values::NOT_REQUIRED,
       net::HttpUtil::NameValuePairsIterator::Quotes::STRICT_QUOTES);
   while (params_iterator.GetNext()) {
     if (!net::HttpUtil::IsParmName(params_iterator.name())) {
       return std::nullopt;
     }
     std::string name = base::ToLowerASCII(params_iterator.name());
     if (!params_iterator.value_is_quoted() && params_iterator.value().empty()) {
       params.emplace(std::move(name), std::nullopt);
     } else {
       params.emplace(std::move(name), params_iterator.value());
     }
   }
   if (!params_iterator.valid()) {
     return std::nullopt;
   }
   return url;
 }

 std::optional<std::string> ParseLinkHeaderValue(
     const StringIteratorPair& string_iterator_pair,
     std::unordered_map<std::string, std::optional<std::string>>& params) {
   return ParseLinkHeaderValue(
       std::string_view(string_iterator_pair.first, string_iterator_pair.second),
       params);
 }

 }  // namespace link_header_util
	// Copyright 2016 The Chromium Authors
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "components/link_header_util/link_header_util.h"

	#include <algorithm>
	#include <string>
	#include <string_view>
	#include <unordered_map>

	#include "base/strings/string_util.h"
	#include "net/http/http_util.h"

	namespace link_header_util {

	namespace {

	// A variation of base::StringTokenizer and net::HttpUtil::ValuesIterator.
	// Takes the parsing of StringTokenizer and adds support for quoted strings
	// that are quoted by matching <> (and does not support escaping in those
	// strings). Also has the behavior of ValuesIterator where it strips
	// whitespace from all values and only outputs non-empty values.
	// Only supports ',' as separator and supports "" and <> as quote chars.
	class ValueTokenizer {
	 public:
	  ValueTokenizer(std::string::const_iterator begin,
	                 std::string::const_iterator end)
	      : token_begin_(begin), token_end_(begin), end_(end) {}

	  // Bounds of the value most recently produced by GetNext().
	  std::string::const_iterator token_begin() const { return token_begin_; }
	  std::string::const_iterator token_end() const { return token_end_; }

	  // Advances to the next non-empty, whitespace-trimmed value. Returns false
	  // when no further value exists.
	  bool GetNext() {
	    while (GetNextInternal()) {
	      net::HttpUtil::TrimLWS(&token_begin_, &token_end_);

	      // Only return non-empty values.
	      if (token_begin_ != token_end_)
	        return true;
	    }
	    return false;
	  }

	 private:
	  // Updates token_begin_ and token_end_ to point to the (possibly empty) next
	  // token. Returns false if end-of-string was reached first.
	  bool GetNextInternal() {
	    // First time this is called token_end_ points to the first character in
	    // the input. Every other time token_end_ points to where the previous
	    // token ended (a delimiter, trimmed whitespace, or the end of the
	    // string).

	    // End of string, return false.
	    if (token_end_ == end_)
	      return false;

	    // Skip past the delimiter.
	    if (*token_end_ == ',')
	      ++token_end_;

	    // Make token_begin_ point to the beginning of the next token, and search
	    // for the end of the token in token_end_.
	    token_begin_ = token_end_;

	    // Set to true if we're currently inside a quoted string.
	    bool in_quote = false;
	    // Set to true if we're currently inside a quoted string, and have just
	    // encountered an escape character. In this case a closing quote will be
	    // ignored.
	    bool in_escape = false;
	    // If currently in a quoted string, this is the character that (when not
	    // escaped) indicates the end of the string.
	    char quote_close_char = '\0';
	    // If currently in a quoted string, this is set to true if it is possible
	    // to escape the closing quote using '\'.
	    bool quote_allows_escape = false;

	    while (token_end_ != end_) {
	      char c = *token_end_;
	      if (in_quote) {
	        if (in_escape) {
	          in_escape = false;
	        } else if (quote_allows_escape && c == '\\') {
	          in_escape = true;
	        } else if (c == quote_close_char) {
	          in_quote = false;
	        }
	      } else {
	        // An unquoted ',' terminates the token; token_end_ stays on it.
	        if (c == ',')
	          break;
	        // Either quote character opens a quoted run. '<' is closed by '>'
	        // and has no escape mechanism; '"' is closed by '"' and honours '\'.
	        if (c == '"' || c == '<') {
	          in_quote = true;
	          quote_close_char = (c == '<' ? '>' : c);
	          quote_allows_escape = (c != '<');
	        }
	      }
	      ++token_end_;
	    }
	    return true;
	  }

	  std::string::const_iterator token_begin_;
	  std::string::const_iterator token_end_;
	  std::string::const_iterator end_;
	};

	// Parses the URL part of a Link header. When successful, returns the URL and
	// sets `params_string` to include the portion of the header after the
	// '>' character at the end of the URL.
	std::optional<std::string> ExtractURL(std::string_view header,
	std::string_view& params_string) {
	// Extract the URL part (everything between '<' and first '>' character).
	// ParseLinkHeaderValue() ensures `header` is non-empty, so no need to check
	// for that.
	if (header.front() != '<') {
	return std::nullopt;
	}

	size_t url_begin = 1;
	size_t url_end = header.find('>');

	// Fail if we did not find a '>'.
	if (url_end == std::string_view::npos) {
	return std::nullopt;
	}

	// Skip the '>' at the end of the URL.
	params_string = header.substr(url_end + 1);

	// Trim whitespace around the URL, and copy to a string.
	return std::string(
	net::HttpUtil::TrimLWS(header.substr(url_begin, url_end - url_begin)));
	}

	} // namespace

	std::vector<StringIteratorPair> SplitLinkHeader(const std::string& header) {
	std::vector<StringIteratorPair> values;
	ValueTokenizer tokenizer(header.begin(), header.end());
	while (tokenizer.GetNext()) {
	values.push_back(
	StringIteratorPair(tokenizer.token_begin(), tokenizer.token_end()));
	}
	return values;
	}

	// Parses one link in a link header into its url and parameters.
	// A link is of the form "<some-url>; param1=value1; param2=value2".
	// Returns nullopt if parsing the link failed, returns the URL as a string on
	// success. This method is more lenient than the RFC. It doesn't fail on
	// things like invalid characters in the URL, and also doesn't verify that
	// certain parameters should or shouldn't be quoted strings.
	//
	// Parameter names are stored lower-cased. A parameter that has neither
	// quotes nor a value is stored with a nullopt value.
	//
	// If a parameter occurs more than once in the link, only the first value is
	// returned in params as this is the required behavior for all attributes
	// chrome currently cares about in link headers.
	std::optional<std::string> ParseLinkHeaderValue(
	    std::string_view header,
	    std::unordered_map<std::string, std::optional<std::string>>& params) {
	  // Can't parse an empty string.
	  if (header.empty()) {
	    return std::nullopt;
	  }

	  // Extract the URL part (everything between '<' and first '>' character).
	  std::string_view params_string;
	  auto url = ExtractURL(header, params_string);
	  if (!url) {
	    return std::nullopt;
	  }

	  // Trim any remaining whitespace, and make sure there is a ';' separating
	  // parameters from the URL.
	  params_string = net::HttpUtil::TrimLWS(params_string);
	  if (!params_string.empty() && params_string.front() != ';') {
	    return std::nullopt;
	  }

	  // Parse all the parameters.
	  net::HttpUtil::NameValuePairsIterator params_iterator(
	      params_string, /*delimiter=*/';',
	      net::HttpUtil::NameValuePairsIterator::Values::NOT_REQUIRED,
	      net::HttpUtil::NameValuePairsIterator::Quotes::STRICT_QUOTES);
	  while (params_iterator.GetNext()) {
	    if (!net::HttpUtil::IsParmName(params_iterator.name())) {
	      return std::nullopt;
	    }
	    std::string name = base::ToLowerASCII(params_iterator.name());
	    if (!params_iterator.value_is_quoted() &&
	        params_iterator.value().empty()) {
	      params.emplace(std::move(name), std::nullopt);
	    } else {
	      // emplace() does not overwrite, so the first occurrence is kept.
	      params.emplace(std::move(name), params_iterator.value());
	    }
	  }
	  // The iterator flags malformed parameter text through valid().
	  if (!params_iterator.valid()) {
	    return std::nullopt;
	  }
	  return url;
	}

	// Overload taking an iterator pair: builds a std::string_view over
	// [first, second) and delegates to the std::string_view overload.
	std::optional<std::string> ParseLinkHeaderValue(
	    const StringIteratorPair& string_iterator_pair,
	    std::unordered_map<std::string, std::optional<std::string>>& params) {
	  const auto first = string_iterator_pair.first;
	  const auto last = string_iterator_pair.second;
	  return ParseLinkHeaderValue(std::string_view(first, last), params);
	}

	} // namespace link_header_util