| // Copyright 2020 The Chromium Authors. All rights reserved. |
| // Copyright 2014 Blake Embrey (hello@blakeembrey.com) |
| // Use of this source code is governed by an MIT-style license that can be |
| // found in the LICENSE file or at https://opensource.org/licenses/MIT. |
| |
| #include "third_party/liburlpattern/tokenize.h" |
| |
| #include "third_party/abseil-cpp/absl/strings/str_format.h" |
| #include "third_party/icu/source/common/unicode/uchar.h" |
| #include "third_party/icu/source/common/unicode/utf8.h" |
| |
| // The following code is a translation from the path-to-regexp typescript at: |
| // |
| // https://github.com/pillarjs/path-to-regexp/blob/125c43e6481f68cc771a5af22b914acdb8c5ba1f/src/index.ts#L4-L124 |
| |
| namespace liburlpattern { |
| |
| namespace { |
| |
| bool IsASCII(UChar32 c) { |
| // Characters should be valid ASCII code points: |
| // https://infra.spec.whatwg.org/#ascii-code-point |
| return c >= 0x00 && c <= 0x7f; |
| } |
| |
| bool IsNameCodepoint(UChar32 c, bool first_codepoint) { |
| // Require group names to follow the same character restrictions as |
| // javascript identifiers. This code originates from v8 at: |
| // |
| // https://source.chromium.org/chromium/chromium/src/+/master:v8/src/strings/char-predicates.cc;l=17-34;drc=be014256adea1552d4a044ef80616cdab6a7d549 |
| // |
| // We deviate from js identifiers, however, in not support the backslash |
| // character. This is mainly used in js identifiers to allow escaped |
| // unicode sequences to be written in ascii. The js engine, however, |
| // should take care of this long before we reach this level of code. So |
| // we don't need to handle it here. |
| if (first_codepoint) { |
| return u_hasBinaryProperty(c, UCHAR_ID_START) || c == '$' || c == '_'; |
| } |
| return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE) || c == '$' || c == '_' || |
| c == 0x200c || c == 0x200d; |
| } |
| |
| } // namespace |
| |
| const char* TokenTypeToString(TokenType type) { |
| switch (type) { |
| case TokenType::kOpen: |
| return "OPEN"; |
| case TokenType::kClose: |
| return "CLOSE"; |
| case TokenType::kRegex: |
| return "REGEX"; |
| case TokenType::kName: |
| return "NAME"; |
| case TokenType::kChar: |
| return "CHAR"; |
| case TokenType::kEscapedChar: |
| return "ESCAPED_CHAR"; |
| case TokenType::kOtherModifier: |
| return "OTHER_MODIFIER"; |
| case TokenType::kAsterisk: |
| return "ASTERISK"; |
| case TokenType::kEnd: |
| return "END"; |
| } |
| } |
| |
| std::ostream& operator<<(std::ostream& o, Token token) { |
| o << "{ type:" << static_cast<int>(token.type) << ", index:" << token.index |
| << ", value:" << token.value << " }"; |
| return o; |
| } |
| |
| // Split the input pattern into a list of tokens. |
| absl::StatusOr<std::vector<Token>> Tokenize(absl::string_view pattern) { |
| // Verify that all characters are valid before parsing. This simplifies the |
| // following logic. |
| for (size_t i = 0; i < pattern.size(); ++i) { |
| } |
| |
| std::vector<Token> token_list; |
| token_list.reserve(pattern.size()); |
| |
| size_t i = 0; |
| while (i < pattern.size()) { |
| size_t next_i = i; |
| UChar32 c = U_SENTINEL; |
| U8_NEXT(pattern.data(), next_i, pattern.size(), c); |
| if (c == '*') { |
| token_list.emplace_back(TokenType::kAsterisk, i, |
| pattern.substr(i, next_i - i)); |
| i = next_i; |
| continue; |
| } |
| |
| if (c == '+' || c == '?') { |
| token_list.emplace_back(TokenType::kOtherModifier, i, |
| pattern.substr(i, next_i - i)); |
| i = next_i; |
| continue; |
| } |
| |
| // Escape sequences always escape a single following character at the |
| // level of the pattern. |
| if (c == '\\') { |
| if (i == (pattern.size() - 1)) { |
| return absl::InvalidArgumentError( |
| absl::StrFormat("Trailing escape character at %d.", i)); |
| } |
| size_t escaped_i = next_i; |
| U8_NEXT(pattern.data(), next_i, pattern.size(), c); |
| token_list.emplace_back(TokenType::kEscapedChar, i, |
| pattern.substr(escaped_i, next_i - escaped_i)); |
| i = next_i; |
| continue; |
| } |
| |
| if (c == '{') { |
| token_list.emplace_back(TokenType::kOpen, i, |
| pattern.substr(i, next_i - i)); |
| i = next_i; |
| continue; |
| } |
| |
| if (c == '}') { |
| token_list.emplace_back(TokenType::kClose, i, |
| pattern.substr(i, next_i - i)); |
| i = next_i; |
| continue; |
| } |
| |
| if (c == ':') { |
| size_t pos = next_i; |
| size_t name_start = pos; |
| while (pos < pattern.size()) { |
| // Reads next unicode codepoint from utf8 sequence and automatically |
| // updates the position index. Since we're not sure this is a valid |
| // codepoint yet, use |tmp_pos| for the position and only commit to |
| // it once we check validity. |
| size_t tmp_pos = pos; |
| UChar32 codepoint; |
| U8_NEXT(pattern.data(), tmp_pos, pattern.size(), codepoint); |
| |
| if (!IsNameCodepoint(codepoint, pos == name_start)) |
| break; |
| |
| pos = tmp_pos; |
| } |
| |
| if (pos <= name_start) { |
| return absl::InvalidArgumentError( |
| absl::StrFormat("Missing parameter name at %d.", i)); |
| } |
| size_t name_length = pos - name_start; |
| |
| token_list.emplace_back(TokenType::kName, i, |
| pattern.substr(name_start, name_length)); |
| i = pos; |
| continue; |
| } |
| |
| if (c == '(') { |
| size_t paren_nesting = 1; |
| size_t j = next_i; |
| const size_t regex_start = j; |
| |
| while (j < pattern.size()) { |
| size_t next_j = j; |
| U8_NEXT(pattern.data(), next_j, pattern.size(), c); |
| |
| if (!IsASCII(c)) { |
| return absl::InvalidArgumentError( |
| absl::StrFormat("Invalid character 0x%02x at %d.", c, j)); |
| } |
| if (j == regex_start && c == '?') { |
| return absl::InvalidArgumentError( |
| absl::StrFormat("Regex cannot start with '?' at %d", j)); |
| } |
| |
| // This escape processing only handles single character escapes since |
| // we only need to understand escaped paren characters for our state |
| // processing. The escape `\` character is propagated to the embedded |
| // regex expression. This permits authors to write longer escape |
| // sequences such as `\x22` since the later characters will be |
| // propagated on subsequent loop iterations. |
| if (c == '\\') { |
| if (j == (pattern.size() - 1)) { |
| return absl::InvalidArgumentError( |
| absl::StrFormat("Trailing escape character at %d.", j)); |
| } |
| size_t escaped_j = next_j; |
| U8_NEXT(pattern.data(), next_j, pattern.size(), c); |
| if (!IsASCII(c)) { |
| return absl::InvalidArgumentError(absl::StrFormat( |
| "Invalid character 0x%02x at %d.", c, escaped_j)); |
| } |
| j = next_j; |
| continue; |
| } |
| |
| if (c == ')') { |
| paren_nesting -= 1; |
| if (paren_nesting == 0) { |
| j = next_j; |
| break; |
| } |
| } else if (c == '(') { |
| paren_nesting += 1; |
| if (j == (pattern.size() - 1)) { |
| return absl::InvalidArgumentError( |
| absl::StrFormat("Unbalanced regex at %d.", j)); |
| } |
| size_t tmp_j = next_j; |
| U8_NEXT(pattern.data(), tmp_j, pattern.size(), c); |
| // Require the the first character after an open paren is `?`. This |
| // permits assertions, named capture groups, and non-capturing groups. |
| // It blocks, however, unnamed capture groups. |
| if (c != '?') { |
| return absl::InvalidArgumentError(absl::StrFormat( |
| "Unnamed capturing groups are not allowed at %d.", next_j)); |
| } |
| } |
| |
| j = next_j; |
| } |
| |
| if (paren_nesting) { |
| return absl::InvalidArgumentError( |
| absl::StrFormat("Unbalanced regex at %d.", i)); |
| } |
| |
| const size_t regex_length = j - regex_start - 1; |
| if (regex_length == 0) { |
| return absl::InvalidArgumentError( |
| absl::StrFormat("Missing regex at %d.", i)); |
| } |
| |
| token_list.emplace_back(TokenType::kRegex, i, |
| pattern.substr(regex_start, regex_length)); |
| i = j; |
| continue; |
| } |
| |
| token_list.emplace_back(TokenType::kChar, i, pattern.substr(i, next_i - i)); |
| i = next_i; |
| } |
| |
| token_list.emplace_back(TokenType::kEnd, i, absl::string_view()); |
| |
| return token_list; |
| } |
| |
| } // namespace liburlpattern |