third_party/liburlpattern/tokenize.cc - chromium/src - Git at Google

 // Copyright 2020 The Chromium Authors
 // Copyright 2014 Blake Embrey (hello@blakeembrey.com)
 // Use of this source code is governed by an MIT-style license that can be
 // found in the LICENSE file or at https://opensource.org/licenses/MIT.

 #include "third_party/liburlpattern/tokenize.h"

 #include "third_party/abseil-cpp/absl/strings/str_format.h"
 #include "third_party/icu/source/common/unicode/uchar.h"
 #include "third_party/icu/source/common/unicode/utf8.h"
 #include "third_party/liburlpattern/utils.h"

 // The following code is a translation from the path-to-regexp typescript at:
 //
 //  https://github.com/pillarjs/path-to-regexp/blob/125c43e6481f68cc771a5af22b914acdb8c5ba1f/src/index.ts#L4-L124

 namespace liburlpattern {

 namespace {

 bool IsASCII(UChar32 c) {
   // Characters should be valid ASCII code points:
   // https://infra.spec.whatwg.org/#ascii-code-point
   return c >= 0x00 && c <= 0x7f;
 }

 class Tokenizer {
  public:
   Tokenizer(absl::string_view pattern, TokenizePolicy policy)
       : pattern_(std::move(pattern)), policy_(policy) {
     token_list_.reserve(pattern_.size());
   }

   absl::StatusOr<std::vector<Token>> Tokenize() {
     while (index_ < pattern_.size()) {
       if (!status_.ok())
         return std::move(status_);

       if (!NextAt(index_)) {
         Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", index_));
         continue;
       }
       if (codepoint_ == '*') {
         AddToken(TokenType::kAsterisk);
         continue;
       }

       if (codepoint_ == '+' || codepoint_ == '?') {
         AddToken(TokenType::kOtherModifier);
         continue;
       }

       // Escape sequences always escape a single following character at the
       // level of the pattern.
       if (codepoint_ == '\\') {
         if (index_ == (pattern_.size() - 1)) {
           Error(absl::StrFormat("Trailing escape character at index %d.",
                                 index_));
           continue;
         }
         size_t escaped_i = next_index_;
         if (!Next()) {
           Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.",
                                 next_index_));
           continue;
         }

         AddToken(TokenType::kEscapedChar, next_index_, escaped_i);
         continue;
       }

       if (codepoint_ == '{') {
         AddToken(TokenType::kOpen);
         continue;
       }

       if (codepoint_ == '}') {
         AddToken(TokenType::kClose);
         continue;
       }

       if (codepoint_ == ':') {
         size_t pos = next_index_;
         size_t name_start = pos;

         // Iterate over codepoints until we find the first non-name codepoint.
         while (pos < pattern_.size()) {
           if (!status_.ok())
             return std::move(status_);
           if (!NextAt(pos)) {
             Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", pos));
             continue;
           }
           if (!IsNameCodepoint(codepoint_, pos == name_start))
             break;
           pos = next_index_;
         }

         if (pos <= name_start) {
           Error(absl::StrFormat("Missing parameter name at index %d.", index_),
                 name_start, index_);
           continue;
         }

         AddToken(TokenType::kName, pos, name_start);
         continue;
       }

       if (codepoint_ == '(') {
         size_t paren_nesting = 1;
         size_t j = next_index_;
         const size_t regex_start = next_index_;
         bool error = false;

         while (j < pattern_.size()) {
           if (!NextAt(j)) {
             Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", j));
             error = true;
             break;
           }

           if (!IsASCII(codepoint_)) {
             Error(absl::StrFormat(
                       "Invalid non-ASCII character 0x%02x at index %d.",
                       codepoint_, j),
                   regex_start, index_);
             error = true;
             break;
           }
           if (j == regex_start && codepoint_ == '?') {
             Error(absl::StrFormat("Regex cannot start with '?' at index %d", j),
                   regex_start, index_);
             error = true;
             break;
           }

           // This escape processing only handles single character escapes since
           // we only need to understand escaped paren characters for our state
           // processing.  The escape `\` character is propagated to the embedded
           // regex expression.  This permits authors to write longer escape
           // sequences such as `\x22` since the later characters will be
           // propagated on subsequent loop iterations.
           if (codepoint_ == '\\') {
             if (j == (pattern_.size() - 1)) {
               Error(
                   absl::StrFormat("Trailing escape character at index %d.", j),
                   regex_start, index_);
               error = true;
               break;
             }
             size_t escaped_j = next_index_;
             if (!Next()) {
               Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.",
                                     next_index_));
               error = true;
               break;
             }
             if (!IsASCII(codepoint_)) {
               Error(absl::StrFormat(
                         "Invalid non-ASCII character 0x%02x at index %d.",
                         codepoint_, escaped_j),
                     regex_start, index_);
               error = true;
               break;
             }
             j = next_index_;
             continue;
           }

           if (codepoint_ == ')') {
             paren_nesting -= 1;
             if (paren_nesting == 0) {
               j = next_index_;
               break;
             }
           } else if (codepoint_ == '(') {
             paren_nesting += 1;
             if (j == (pattern_.size() - 1)) {
               Error(absl::StrFormat("Unbalanced regex at index %d.", j),
                     regex_start, index_);
               error = true;
               break;
             }
             size_t tmp_j = next_index_;
             if (!Next()) {
               Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.",
                                     next_index_));
               error = true;
               break;
             }
             // Require the the first character after an open paren is `?`.  This
             // permits assertions, named capture groups, and non-capturing
             // groups. It blocks, however, unnamed capture groups.
             if (codepoint_ != '?') {
               Error(absl::StrFormat(
                         "Unnamed capturing groups are not allowed at index %d.",
                         tmp_j),
                     regex_start, index_);
               error = true;
               break;
             }
             next_index_ = tmp_j;
           }

           j = next_index_;
         }

         if (error)
           continue;

         if (paren_nesting) {
           Error(absl::StrFormat("Unbalanced regex at index %d.", index_),
                 regex_start, index_);
           continue;
         }

         const size_t regex_length = j - regex_start - 1;
         if (regex_length == 0) {
           Error(absl::StrFormat("Missing regex at index %d.", index_),
                 regex_start, index_);
           continue;
         }

         AddToken(TokenType::kRegex, j, regex_start, regex_length);
         continue;
       }

       AddToken(TokenType::kChar);
     }

     if (!status_.ok())
       return std::move(status_);

     AddToken(TokenType::kEnd, index_, index_);

     return std::move(token_list_);
   }

  private:
   // Read the codepoint at `next_index_` in `pattern_` and store it in
   // `codepoint_`.  In addition, `next_index_` is updated to the codepoint to be
   // read next.  Returns true iff the codepoint was read successfully. On
   // success, `codepoint_` is non-negative.
   [[nodiscard]] bool Next() {
     U8_NEXT(pattern_.data(), next_index_, pattern_.size(), codepoint_);
     return codepoint_ >= 0;
   }

   // Read the codepoint at the specified `index` in `pattern_` and store it in
   // `codepoint_`.  In addition, `next_index_` is updated to the codepoint to be
   // read next.  Returns true iff the codepoint was read successfully. On
   // success, `codepoint_` is non-negative.
   [[nodiscard]] bool NextAt(size_t index) {
     next_index_ = index;
     return Next();
   }

   // Append a Token to our list of the given `type` and with a value consisting
   // of the codepoints in `pattern_` starting at `value_pos` with
   // `value_length`. Update `index_` to `next_pos` automatically.
   void AddToken(TokenType type,
                 size_t next_pos,
                 size_t value_pos,
                 size_t value_length) {
     token_list_.emplace_back(type, index_,
                              pattern_.substr(value_pos, value_length));
     index_ = next_pos;
   }

   // Append a Token to our list of the given `type` and with a value consisting
   // of the codepoints in `pattern_` starting at `value_pos`.  The value length
   // is automatically computed as the difference between `next_pos` and
   // `value_pos`. Update `index_` to `next_pos` automatically.
   void AddToken(TokenType type, size_t next_pos, size_t value_pos) {
     AddToken(type, next_pos, value_pos, next_pos - value_pos);
   }

   // Append a Token to our list of the given `type` and with a value consisting
   // of the codepoints in `pattern_` starting at `index_`.  The value length
   // is automatically computed as the difference between `next_index_` and
   // `index_`. Update `index_` to `next_index_` automatically.
   void AddToken(TokenType type) { AddToken(type, next_index_, index_); }

   void Error(absl::string_view message, size_t next_pos, size_t value_pos) {
     if (policy_ == TokenizePolicy::kLenient)
       AddToken(TokenType::kInvalidChar, next_pos, value_pos);
     else
       status_ = absl::InvalidArgumentError(message);
   }

   void Error(absl::string_view message) { Error(message, next_index_, index_); }

   const absl::string_view pattern_;
   const TokenizePolicy policy_;
   std::vector<Token> token_list_;
   absl::Status status_;

   // `index_` tracks our "current" byte index in the input string.  Typically
   // this will be updated every time we commit a token to `token_list_`.  It may
   // stay frozen in place if we have a sub-loop consuming a larger token like
   // a named group or regex group.
   size_t index_ = 0;

   // The `next_index_` member is used to find the next UTF8 codepoint in the
   // string.  This is used as both an input and output from the U8_NEXT()
   // function.  We keep this separate from `index_` because there are many
   // cases where we need to read ahead of the last token consumed.
   size_t next_index_ = 0;

   UChar32 codepoint_ = U_SENTINEL;
 };

 }  // namespace

 const char* TokenTypeToString(TokenType type) {
   switch (type) {
     case TokenType::kOpen:
       return "'{'";
     case TokenType::kClose:
       return "'}'";
     case TokenType::kRegex:
       return "regex group";
     case TokenType::kName:
       return "named group";
     case TokenType::kChar:
       return "character";
     case TokenType::kEscapedChar:
       return "escaped character";
     case TokenType::kOtherModifier:
       return "modifier";
     case TokenType::kAsterisk:
       return "asterisk";
     case TokenType::kEnd:
       return "end of pattern";
     case TokenType::kInvalidChar:
       return "invalid character";
   }
 }

 std::ostream& operator<<(std::ostream& o, Token token) {
   o << "{ type:" << static_cast<int>(token.type) << ", index:" << token.index
     << ", value:" << token.value << " }";
   return o;
 }

 // Split the input pattern into a list of tokens.
 absl::StatusOr<std::vector<Token>> Tokenize(absl::string_view pattern,
                                             TokenizePolicy policy) {
   Tokenizer tokenizer(std::move(pattern), policy);
   return tokenizer.Tokenize();
 }

 }  // namespace liburlpattern
	// Copyright 2020 The Chromium Authors
	// Copyright 2014 Blake Embrey (hello@blakeembrey.com)
	// Use of this source code is governed by an MIT-style license that can be
	// found in the LICENSE file or at https://opensource.org/licenses/MIT.

	#include "third_party/liburlpattern/tokenize.h"

	#include "third_party/abseil-cpp/absl/strings/str_format.h"
	#include "third_party/icu/source/common/unicode/uchar.h"
	#include "third_party/icu/source/common/unicode/utf8.h"
	#include "third_party/liburlpattern/utils.h"

	// The following code is a translation from the path-to-regexp typescript at:
	//
	// https://github.com/pillarjs/path-to-regexp/blob/125c43e6481f68cc771a5af22b914acdb8c5ba1f/src/index.ts#L4-L124

	namespace liburlpattern {

	namespace {

	bool IsASCII(UChar32 c) {
	// Characters should be valid ASCII code points:
	// https://infra.spec.whatwg.org/#ascii-code-point
	return c >= 0x00 && c <= 0x7f;
	}

	class Tokenizer {
	public:
	Tokenizer(absl::string_view pattern, TokenizePolicy policy)
	: pattern_(std::move(pattern)), policy_(policy) {
	token_list_.reserve(pattern_.size());
	}

	absl::StatusOr<std::vector<Token>> Tokenize() {
	while (index_ < pattern_.size()) {
	if (!status_.ok())
	return std::move(status_);

	if (!NextAt(index_)) {
	Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", index_));
	continue;
	}
	if (codepoint_ == '*') {
	AddToken(TokenType::kAsterisk);
	continue;
	}

	if (codepoint_ == '+' \|\| codepoint_ == '?') {
	AddToken(TokenType::kOtherModifier);
	continue;
	}

	// Escape sequences always escape a single following character at the
	// level of the pattern.
	if (codepoint_ == '\\') {
	if (index_ == (pattern_.size() - 1)) {
	Error(absl::StrFormat("Trailing escape character at index %d.",
	index_));
	continue;
	}
	size_t escaped_i = next_index_;
	if (!Next()) {
	Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.",
	next_index_));
	continue;
	}

	AddToken(TokenType::kEscapedChar, next_index_, escaped_i);
	continue;
	}

	if (codepoint_ == '{') {
	AddToken(TokenType::kOpen);
	continue;
	}

	if (codepoint_ == '}') {
	AddToken(TokenType::kClose);
	continue;
	}

	if (codepoint_ == ':') {
	size_t pos = next_index_;
	size_t name_start = pos;

	// Iterate over codepoints until we find the first non-name codepoint.
	while (pos < pattern_.size()) {
	if (!status_.ok())
	return std::move(status_);
	if (!NextAt(pos)) {
	Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", pos));
	continue;
	}
	if (!IsNameCodepoint(codepoint_, pos == name_start))
	break;
	pos = next_index_;
	}

	if (pos <= name_start) {
	Error(absl::StrFormat("Missing parameter name at index %d.", index_),
	name_start, index_);
	continue;
	}

	AddToken(TokenType::kName, pos, name_start);
	continue;
	}

	if (codepoint_ == '(') {
	size_t paren_nesting = 1;
	size_t j = next_index_;
	const size_t regex_start = next_index_;
	bool error = false;

	while (j < pattern_.size()) {
	if (!NextAt(j)) {
	Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", j));
	error = true;
	break;
	}

	if (!IsASCII(codepoint_)) {
	Error(absl::StrFormat(
	"Invalid non-ASCII character 0x%02x at index %d.",
	codepoint_, j),
	regex_start, index_);
	error = true;
	break;
	}
	if (j == regex_start && codepoint_ == '?') {
	Error(absl::StrFormat("Regex cannot start with '?' at index %d", j),
	regex_start, index_);
	error = true;
	break;
	}

	// This escape processing only handles single character escapes since
	// we only need to understand escaped paren characters for our state
	// processing. The escape `\` character is propagated to the embedded
	// regex expression. This permits authors to write longer escape
	// sequences such as `\x22` since the later characters will be
	// propagated on subsequent loop iterations.
	if (codepoint_ == '\\') {
	if (j == (pattern_.size() - 1)) {
	Error(
	absl::StrFormat("Trailing escape character at index %d.", j),
	regex_start, index_);
	error = true;
	break;
	}
	size_t escaped_j = next_index_;
	if (!Next()) {
	Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.",
	next_index_));
	error = true;
	break;
	}
	if (!IsASCII(codepoint_)) {
	Error(absl::StrFormat(
	"Invalid non-ASCII character 0x%02x at index %d.",
	codepoint_, escaped_j),
	regex_start, index_);
	error = true;
	break;
	}
	j = next_index_;
	continue;
	}

	if (codepoint_ == ')') {
	paren_nesting -= 1;
	if (paren_nesting == 0) {
	j = next_index_;
	break;
	}
	} else if (codepoint_ == '(') {
	paren_nesting += 1;
	if (j == (pattern_.size() - 1)) {
	Error(absl::StrFormat("Unbalanced regex at index %d.", j),
	regex_start, index_);
	error = true;
	break;
	}
	size_t tmp_j = next_index_;
	if (!Next()) {
	Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.",
	next_index_));
	error = true;
	break;
	}
	// Require the the first character after an open paren is `?`. This
	// permits assertions, named capture groups, and non-capturing
	// groups. It blocks, however, unnamed capture groups.
	if (codepoint_ != '?') {
	Error(absl::StrFormat(
	"Unnamed capturing groups are not allowed at index %d.",
	tmp_j),
	regex_start, index_);
	error = true;
	break;
	}
	next_index_ = tmp_j;
	}

	j = next_index_;
	}

	if (error)
	continue;

	if (paren_nesting) {
	Error(absl::StrFormat("Unbalanced regex at index %d.", index_),
	regex_start, index_);
	continue;
	}

	const size_t regex_length = j - regex_start - 1;
	if (regex_length == 0) {
	Error(absl::StrFormat("Missing regex at index %d.", index_),
	regex_start, index_);
	continue;
	}

	AddToken(TokenType::kRegex, j, regex_start, regex_length);
	continue;
	}

	AddToken(TokenType::kChar);
	}

	if (!status_.ok())
	return std::move(status_);

	AddToken(TokenType::kEnd, index_, index_);

	return std::move(token_list_);
	}

	private:
	// Read the codepoint at `next_index_` in `pattern_` and store it in
	// `codepoint_`. In addition, `next_index_` is updated to the codepoint to be
	// read next. Returns true iff the codepoint was read successfully. On
	// success, `codepoint_` is non-negative.
	[[nodiscard]] bool Next() {
	U8_NEXT(pattern_.data(), next_index_, pattern_.size(), codepoint_);
	return codepoint_ >= 0;
	}

	// Read the codepoint at the specified `index` in `pattern_` and store it in
	// `codepoint_`. In addition, `next_index_` is updated to the codepoint to be
	// read next. Returns true iff the codepoint was read successfully. On
	// success, `codepoint_` is non-negative.
	[[nodiscard]] bool NextAt(size_t index) {
	next_index_ = index;
	return Next();
	}

	// Append a Token to our list of the given `type` and with a value consisting
	// of the codepoints in `pattern_` starting at `value_pos` with
	// `value_length`. Update `index_` to `next_pos` automatically.
	void AddToken(TokenType type,
	size_t next_pos,
	size_t value_pos,
	size_t value_length) {
	token_list_.emplace_back(type, index_,
	pattern_.substr(value_pos, value_length));
	index_ = next_pos;
	}

	// Append a Token to our list of the given `type` and with a value consisting
	// of the codepoints in `pattern_` starting at `value_pos`. The value length
	// is automatically computed as the difference between `next_pos` and
	// `value_pos`. Update `index_` to `next_pos` automatically.
	void AddToken(TokenType type, size_t next_pos, size_t value_pos) {
	AddToken(type, next_pos, value_pos, next_pos - value_pos);
	}

	// Append a Token to our list of the given `type` and with a value consisting
	// of the codepoints in `pattern_` starting at `index_`. The value length
	// is automatically computed as the difference between `next_index_` and
	// `index_`. Update `index_` to `next_index_` automatically.
	void AddToken(TokenType type) { AddToken(type, next_index_, index_); }

	void Error(absl::string_view message, size_t next_pos, size_t value_pos) {
	if (policy_ == TokenizePolicy::kLenient)
	AddToken(TokenType::kInvalidChar, next_pos, value_pos);
	else
	status_ = absl::InvalidArgumentError(message);
	}

	void Error(absl::string_view message) { Error(message, next_index_, index_); }

	const absl::string_view pattern_;
	const TokenizePolicy policy_;
	std::vector<Token> token_list_;
	absl::Status status_;

	// `index_` tracks our "current" byte index in the input string. Typically
	// this will be updated every time we commit a token to `token_list_`. It may
	// stay frozen in place if we have a sub-loop consuming a larger token like
	// a named group or regex group.
	size_t index_ = 0;

	// The `next_index_` member is used to find the next UTF8 codepoint in the
	// string. This is used as both an input and output from the U8_NEXT()
	// function. We keep this separate from `index_` because there are many
	// cases where we need to read ahead of the last token consumed.
	size_t next_index_ = 0;

	UChar32 codepoint_ = U_SENTINEL;
	};

	} // namespace

	const char* TokenTypeToString(TokenType type) {
	switch (type) {
	case TokenType::kOpen:
	return "'{'";
	case TokenType::kClose:
	return "'}'";
	case TokenType::kRegex:
	return "regex group";
	case TokenType::kName:
	return "named group";
	case TokenType::kChar:
	return "character";
	case TokenType::kEscapedChar:
	return "escaped character";
	case TokenType::kOtherModifier:
	return "modifier";
	case TokenType::kAsterisk:
	return "asterisk";
	case TokenType::kEnd:
	return "end of pattern";
	case TokenType::kInvalidChar:
	return "invalid character";
	}
	}

	std::ostream& operator<<(std::ostream& o, Token token) {
	o << "{ type:" << static_cast<int>(token.type) << ", index:" << token.index
	<< ", value:" << token.value << " }";
	return o;
	}

	// Split the input pattern into a list of tokens.
	absl::StatusOr<std::vector<Token>> Tokenize(absl::string_view pattern,
	TokenizePolicy policy) {
	Tokenizer tokenizer(std::move(pattern), policy);
	return tokenizer.Tokenize();
	}

	} // namespace liburlpattern