| // Copyright 2020 The Chromium Authors. All rights reserved. |
| // Copyright 2014 Blake Embrey (hello@blakeembrey.com) |
| // Use of this source code is governed by an MIT-style license that can be |
| // found in the LICENSE file or at https://opensource.org/licenses/MIT. |
| |
| #include "third_party/liburlpattern/tokenize.h" |
| |
| #include "third_party/abseil-cpp/absl/strings/str_format.h" |
| |
| // The following code is a translation from the path-to-regexp typescript at: |
| // |
| // https://github.com/pillarjs/path-to-regexp/blob/125c43e6481f68cc771a5af22b914acdb8c5ba1f/src/index.ts#L4-L124 |
| |
| namespace liburlpattern { |
| |
| namespace { |
| |
| bool IsValidChar(char c) { |
| // Characters should be valid ASCII code points: |
| // https://infra.spec.whatwg.org/#ascii-code-point |
| return c >= 0x00 && c <= 0x7f; |
| } |
| |
| bool IsNameChar(char c) { |
| return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || |
| (c >= 'a' && c <= 'z') || c == '_'; |
| } |
| |
| } // namespace |
| |
| const char* TokenTypeToString(TokenType type) { |
| switch (type) { |
| case TokenType::kOpen: |
| return "OPEN"; |
| case TokenType::kClose: |
| return "CLOSE"; |
| case TokenType::kRegex: |
| return "REGEX"; |
| case TokenType::kName: |
| return "NAME"; |
| case TokenType::kChar: |
| return "CHAR"; |
| case TokenType::kEscapedChar: |
| return "ESCAPED_CHAR"; |
| case TokenType::kModifier: |
| return "MODIFIER"; |
| case TokenType::kEnd: |
| return "END"; |
| } |
| } |
| |
| std::ostream& operator<<(std::ostream& o, Token token) { |
| o << "{ type:" << static_cast<int>(token.type) << ", index:" << token.index |
| << ", value:" << token.value << " }"; |
| return o; |
| } |
| |
| // Split the input pattern into a list of tokens. |
| absl::StatusOr<std::vector<Token>> Tokenize(absl::string_view pattern) { |
| // Verify that all characters are valid before parsing. This simplifies the |
| // following logic. |
| for (size_t i = 0; i < pattern.size(); ++i) { |
| if (!IsValidChar(pattern[i])) { |
| return absl::InvalidArgumentError( |
| absl::StrFormat("Invalid character 0x%02x at %d.", pattern[i], i)); |
| } |
| } |
| |
| std::vector<Token> token_list; |
| token_list.reserve(pattern.size()); |
| |
| size_t i = 0; |
| while (i < pattern.size()) { |
| char c = pattern[i]; |
| if (c == '*' || c == '+' || c == '?') { |
| token_list.emplace_back(TokenType::kModifier, i, pattern.substr(i, 1)); |
| i += 1; |
| continue; |
| } |
| |
| // Escape sequences always escape a single following character at the |
| // level of the pattern. |
| if (c == '\\') { |
| if (i == (pattern.size() - 1)) { |
| return absl::InvalidArgumentError( |
| absl::StrFormat("Trailing escape character at %d.", i)); |
| } |
| token_list.emplace_back(TokenType::kEscapedChar, i, |
| pattern.substr(i + 1, 1)); |
| i += 2; |
| continue; |
| } |
| |
| if (c == '{') { |
| token_list.emplace_back(TokenType::kOpen, i, pattern.substr(i, 1)); |
| i += 1; |
| continue; |
| } |
| |
| if (c == '}') { |
| token_list.emplace_back(TokenType::kClose, i, pattern.substr(i, 1)); |
| i += 1; |
| continue; |
| } |
| |
| if (c == ':') { |
| size_t j = i + 1; |
| size_t name_start = j; |
| while (j < pattern.size() && IsNameChar(pattern[j])) |
| j += 1; |
| |
| if (j <= name_start) { |
| return absl::InvalidArgumentError( |
| absl::StrFormat("Missing parameter name at %d.", i)); |
| } |
| size_t name_length = j - name_start; |
| |
| token_list.emplace_back(TokenType::kName, i, |
| pattern.substr(name_start, name_length)); |
| i = j; |
| continue; |
| } |
| |
| if (c == '(') { |
| size_t paren_nesting = 1; |
| size_t j = i + 1; |
| const size_t regex_start = j; |
| |
| while (j < pattern.size()) { |
| if (j == regex_start && pattern[j] == '?') { |
| return absl::InvalidArgumentError( |
| absl::StrFormat("Regex cannot start with '?' at %d", j)); |
| } |
| |
| // This escape processing only handles single character escapes since |
| // we only need to understand escaped paren characters for our state |
| // processing. The escape `\` character is propagated to the embedded |
| // regex expression. This permits authors to write longer escape |
| // sequences such as `\x22` since the later characters will be |
| // propagated on subsequent loop iterations. |
| if (pattern[j] == '\\') { |
| if (j == (pattern.size() - 1)) { |
| return absl::InvalidArgumentError( |
| absl::StrFormat("Trailing escape character at %d.", j)); |
| } |
| j += 2; |
| continue; |
| } |
| |
| if (pattern[j] == ')') { |
| paren_nesting -= 1; |
| if (paren_nesting == 0) { |
| j += 1; |
| break; |
| } |
| } else if (pattern[j] == '(') { |
| paren_nesting += 1; |
| if (j == (pattern.size() - 1)) { |
| return absl::InvalidArgumentError( |
| absl::StrFormat("Unbalanced regex at %d.", i)); |
| } |
| // Require the the first character after an open paren is `?`. This |
| // permits assertions, named capture groups, and non-capturing groups. |
| // It blocks, however, unnamed capture groups. |
| if (pattern[j + 1] != '?') { |
| return absl::InvalidArgumentError(absl::StrFormat( |
| "Unnamed capturing groups are not allowed at %d.", j)); |
| } |
| } |
| |
| j += 1; |
| } |
| |
| if (paren_nesting) { |
| return absl::InvalidArgumentError( |
| absl::StrFormat("Unbalanced regex at %d.", i)); |
| } |
| |
| const size_t regex_length = j - regex_start - 1; |
| if (regex_length == 0) { |
| return absl::InvalidArgumentError( |
| absl::StrFormat("Missing regex at %d.", i)); |
| } |
| |
| token_list.emplace_back(TokenType::kRegex, i, |
| pattern.substr(regex_start, regex_length)); |
| i = j; |
| continue; |
| } |
| |
| token_list.emplace_back(TokenType::kChar, i, pattern.substr(i, 1)); |
| i += 1; |
| } |
| |
| token_list.emplace_back(TokenType::kEnd, i, absl::string_view()); |
| |
| return token_list; |
| } |
| |
| } // namespace liburlpattern |