third_party/liburlpattern/tokenize.cc - chromium/src.git - Git at Google

 // Copyright 2020 The Chromium Authors. All rights reserved.
 // Copyright 2014 Blake Embrey (hello@blakeembrey.com)
 // Use of this source code is governed by an MIT-style license that can be
 // found in the LICENSE file or at https://opensource.org/licenses/MIT.

 #include "third_party/liburlpattern/tokenize.h"

 #include "third_party/abseil-cpp/absl/strings/str_format.h"

 // The following code is a translation from the path-to-regexp typescript at:
 //
 //  https://github.com/pillarjs/path-to-regexp/blob/125c43e6481f68cc771a5af22b914acdb8c5ba1f/src/index.ts#L4-L124

 namespace liburlpattern {

 namespace {

 bool IsValidChar(char c) {
   // Characters should be valid ASCII code points:
   // https://infra.spec.whatwg.org/#ascii-code-point
   return c >= 0x00 && c <= 0x7f;
 }

 bool IsNameChar(char c) {
   return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') ||
          (c >= 'a' && c <= 'z') || c == '_';
 }

 }  // namespace

 const char* TokenTypeToString(TokenType type) {
   switch (type) {
     case TokenType::kOpen:
       return "OPEN";
     case TokenType::kClose:
       return "CLOSE";
     case TokenType::kRegex:
       return "REGEX";
     case TokenType::kName:
       return "NAME";
     case TokenType::kChar:
       return "CHAR";
     case TokenType::kEscapedChar:
       return "ESCAPED_CHAR";
     case TokenType::kModifier:
       return "MODIFIER";
     case TokenType::kEnd:
       return "END";
   }
 }

 std::ostream& operator<<(std::ostream& o, Token token) {
   o << "{ type:" << static_cast<int>(token.type) << ", index:" << token.index
     << ", value:" << token.value << " }";
   return o;
 }

 // Split the input pattern into a list of tokens.
 absl::StatusOr<std::vector<Token>> Tokenize(absl::string_view pattern) {
   // Verify that all characters are valid before parsing.  This simplifies the
   // following logic.
   for (size_t i = 0; i < pattern.size(); ++i) {
     if (!IsValidChar(pattern[i])) {
       return absl::InvalidArgumentError(
           absl::StrFormat("Invalid character 0x%02x at %d.", pattern[i], i));
     }
   }

   std::vector<Token> token_list;
   token_list.reserve(pattern.size());

   size_t i = 0;
   while (i < pattern.size()) {
     char c = pattern[i];
     if (c == '*' || c == '+' || c == '?') {
       token_list.emplace_back(TokenType::kModifier, i, pattern.substr(i, 1));
       i += 1;
       continue;
     }

     // Escape sequences always escape a single following character at the
     // level of the pattern.
     if (c == '\\') {
       if (i == (pattern.size() - 1)) {
         return absl::InvalidArgumentError(
             absl::StrFormat("Trailing escape character at %d.", i));
       }
       token_list.emplace_back(TokenType::kEscapedChar, i,
                               pattern.substr(i + 1, 1));
       i += 2;
       continue;
     }

     if (c == '{') {
       token_list.emplace_back(TokenType::kOpen, i, pattern.substr(i, 1));
       i += 1;
       continue;
     }

     if (c == '}') {
       token_list.emplace_back(TokenType::kClose, i, pattern.substr(i, 1));
       i += 1;
       continue;
     }

     if (c == ':') {
       size_t j = i + 1;
       size_t name_start = j;
       while (j < pattern.size() && IsNameChar(pattern[j]))
         j += 1;

       if (j <= name_start) {
         return absl::InvalidArgumentError(
             absl::StrFormat("Missing parameter name at %d.", i));
       }
       size_t name_length = j - name_start;

       token_list.emplace_back(TokenType::kName, i,
                               pattern.substr(name_start, name_length));
       i = j;
       continue;
     }

     if (c == '(') {
       size_t paren_nesting = 1;
       size_t j = i + 1;
       const size_t regex_start = j;

       while (j < pattern.size()) {
         if (j == regex_start && pattern[j] == '?') {
           return absl::InvalidArgumentError(
               absl::StrFormat("Regex cannot start with '?' at %d", j));
         }

         // This escape processing only handles single character escapes since
         // we only need to understand escaped paren characters for our state
         // processing.  The escape `\` character is propagated to the embedded
         // regex expression.  This permits authors to write longer escape
         // sequences such as `\x22` since the later characters will be
         // propagated on subsequent loop iterations.
         if (pattern[j] == '\\') {
           if (j == (pattern.size() - 1)) {
             return absl::InvalidArgumentError(
                 absl::StrFormat("Trailing escape character at %d.", j));
           }
           j += 2;
           continue;
         }

         if (pattern[j] == ')') {
           paren_nesting -= 1;
           if (paren_nesting == 0) {
             j += 1;
             break;
           }
         } else if (pattern[j] == '(') {
           paren_nesting += 1;
           if (j == (pattern.size() - 1)) {
             return absl::InvalidArgumentError(
                 absl::StrFormat("Unbalanced regex at %d.", i));
           }
           // Require the the first character after an open paren is `?`.  This
           // permits assertions, named capture groups, and non-capturing groups.
           // It blocks, however, unnamed capture groups.
           if (pattern[j + 1] != '?') {
             return absl::InvalidArgumentError(absl::StrFormat(
                 "Unnamed capturing groups are not allowed at %d.", j));
           }
         }

         j += 1;
       }

       if (paren_nesting) {
         return absl::InvalidArgumentError(
             absl::StrFormat("Unbalanced regex at %d.", i));
       }

       const size_t regex_length = j - regex_start - 1;
       if (regex_length == 0) {
         return absl::InvalidArgumentError(
             absl::StrFormat("Missing regex at %d.", i));
       }

       token_list.emplace_back(TokenType::kRegex, i,
                               pattern.substr(regex_start, regex_length));
       i = j;
       continue;
     }

     token_list.emplace_back(TokenType::kChar, i, pattern.substr(i, 1));
     i += 1;
   }

   token_list.emplace_back(TokenType::kEnd, i, absl::string_view());

   return token_list;
 }

 }  // namespace liburlpattern
	// Copyright 2020 The Chromium Authors. All rights reserved.
	// Copyright 2014 Blake Embrey (hello@blakeembrey.com)
	// Use of this source code is governed by an MIT-style license that can be
	// found in the LICENSE file or at https://opensource.org/licenses/MIT.

	#include "third_party/liburlpattern/tokenize.h"

	#include "third_party/abseil-cpp/absl/strings/str_format.h"

	// The following code is a translation from the path-to-regexp typescript at:
	//
	// https://github.com/pillarjs/path-to-regexp/blob/125c43e6481f68cc771a5af22b914acdb8c5ba1f/src/index.ts#L4-L124

	namespace liburlpattern {

	namespace {

	bool IsValidChar(char c) {
	// Characters should be valid ASCII code points:
	// https://infra.spec.whatwg.org/#ascii-code-point
	return c >= 0x00 && c <= 0x7f;
	}

	bool IsNameChar(char c) {
	return (c >= '0' && c <= '9') \|\| (c >= 'A' && c <= 'Z') \|\|
	(c >= 'a' && c <= 'z') \|\| c == '_';
	}

	} // namespace

	const char* TokenTypeToString(TokenType type) {
	switch (type) {
	case TokenType::kOpen:
	return "OPEN";
	case TokenType::kClose:
	return "CLOSE";
	case TokenType::kRegex:
	return "REGEX";
	case TokenType::kName:
	return "NAME";
	case TokenType::kChar:
	return "CHAR";
	case TokenType::kEscapedChar:
	return "ESCAPED_CHAR";
	case TokenType::kModifier:
	return "MODIFIER";
	case TokenType::kEnd:
	return "END";
	}
	}

	std::ostream& operator<<(std::ostream& o, Token token) {
	o << "{ type:" << static_cast<int>(token.type) << ", index:" << token.index
	<< ", value:" << token.value << " }";
	return o;
	}

	// Split the input pattern into a list of tokens.
	absl::StatusOr<std::vector<Token>> Tokenize(absl::string_view pattern) {
	// Verify that all characters are valid before parsing. This simplifies the
	// following logic.
	for (size_t i = 0; i < pattern.size(); ++i) {
	if (!IsValidChar(pattern[i])) {
	return absl::InvalidArgumentError(
	absl::StrFormat("Invalid character 0x%02x at %d.", pattern[i], i));
	}
	}

	std::vector<Token> token_list;
	token_list.reserve(pattern.size());

	size_t i = 0;
	while (i < pattern.size()) {
	char c = pattern[i];
	if (c == '*' \|\| c == '+' \|\| c == '?') {
	token_list.emplace_back(TokenType::kModifier, i, pattern.substr(i, 1));
	i += 1;
	continue;
	}

	// Escape sequences always escape a single following character at the
	// level of the pattern.
	if (c == '\\') {
	if (i == (pattern.size() - 1)) {
	return absl::InvalidArgumentError(
	absl::StrFormat("Trailing escape character at %d.", i));
	}
	token_list.emplace_back(TokenType::kEscapedChar, i,
	pattern.substr(i + 1, 1));
	i += 2;
	continue;
	}

	if (c == '{') {
	token_list.emplace_back(TokenType::kOpen, i, pattern.substr(i, 1));
	i += 1;
	continue;
	}

	if (c == '}') {
	token_list.emplace_back(TokenType::kClose, i, pattern.substr(i, 1));
	i += 1;
	continue;
	}

	if (c == ':') {
	size_t j = i + 1;
	size_t name_start = j;
	while (j < pattern.size() && IsNameChar(pattern[j]))
	j += 1;

	if (j <= name_start) {
	return absl::InvalidArgumentError(
	absl::StrFormat("Missing parameter name at %d.", i));
	}
	size_t name_length = j - name_start;

	token_list.emplace_back(TokenType::kName, i,
	pattern.substr(name_start, name_length));
	i = j;
	continue;
	}

	if (c == '(') {
	size_t paren_nesting = 1;
	size_t j = i + 1;
	const size_t regex_start = j;

	while (j < pattern.size()) {
	if (j == regex_start && pattern[j] == '?') {
	return absl::InvalidArgumentError(
	absl::StrFormat("Regex cannot start with '?' at %d", j));
	}

	// This escape processing only handles single character escapes since
	// we only need to understand escaped paren characters for our state
	// processing. The escape `\` character is propagated to the embedded
	// regex expression. This permits authors to write longer escape
	// sequences such as `\x22` since the later characters will be
	// propagated on subsequent loop iterations.
	if (pattern[j] == '\\') {
	if (j == (pattern.size() - 1)) {
	return absl::InvalidArgumentError(
	absl::StrFormat("Trailing escape character at %d.", j));
	}
	j += 2;
	continue;
	}

	if (pattern[j] == ')') {
	paren_nesting -= 1;
	if (paren_nesting == 0) {
	j += 1;
	break;
	}
	} else if (pattern[j] == '(') {
	paren_nesting += 1;
	if (j == (pattern.size() - 1)) {
	return absl::InvalidArgumentError(
	absl::StrFormat("Unbalanced regex at %d.", i));
	}
	// Require the the first character after an open paren is `?`. This
	// permits assertions, named capture groups, and non-capturing groups.
	// It blocks, however, unnamed capture groups.
	if (pattern[j + 1] != '?') {
	return absl::InvalidArgumentError(absl::StrFormat(
	"Unnamed capturing groups are not allowed at %d.", j));
	}
	}

	j += 1;
	}

	if (paren_nesting) {
	return absl::InvalidArgumentError(
	absl::StrFormat("Unbalanced regex at %d.", i));
	}

	const size_t regex_length = j - regex_start - 1;
	if (regex_length == 0) {
	return absl::InvalidArgumentError(
	absl::StrFormat("Missing regex at %d.", i));
	}

	token_list.emplace_back(TokenType::kRegex, i,
	pattern.substr(regex_start, regex_length));
	i = j;
	continue;
	}

	token_list.emplace_back(TokenType::kChar, i, pattern.substr(i, 1));
	i += 1;
	}

	token_list.emplace_back(TokenType::kEnd, i, absl::string_view());

	return token_list;
	}

	} // namespace liburlpattern