blob: 65e99eaa5110026ead525a2dc0208a57723c479e [file] [log] [blame]
// Copyright 2020 The Chromium Authors
// Copyright 2014 Blake Embrey (hello@blakeembrey.com)
// Use of this source code is governed by an MIT-style license that can be
// found in the LICENSE file or at https://opensource.org/licenses/MIT.
#include "third_party/liburlpattern/tokenize.h"
#include "third_party/abseil-cpp/absl/strings/str_format.h"
#include "third_party/icu/source/common/unicode/uchar.h"
#include "third_party/icu/source/common/unicode/utf8.h"
#include "third_party/liburlpattern/utils.h"
// The following code is a translation from the path-to-regexp typescript at:
//
// https://github.com/pillarjs/path-to-regexp/blob/125c43e6481f68cc771a5af22b914acdb8c5ba1f/src/index.ts#L4-L124
namespace liburlpattern {
namespace {
bool IsASCII(UChar32 c) {
// Characters should be valid ASCII code points:
// https://infra.spec.whatwg.org/#ascii-code-point
return c >= 0x00 && c <= 0x7f;
}
class Tokenizer {
public:
Tokenizer(absl::string_view pattern, TokenizePolicy policy)
: pattern_(std::move(pattern)), policy_(policy) {
token_list_.reserve(pattern_.size());
}
absl::StatusOr<std::vector<Token>> Tokenize() {
while (index_ < pattern_.size()) {
if (!status_.ok())
return std::move(status_);
if (!NextAt(index_)) {
Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", index_));
continue;
}
if (codepoint_ == '*') {
AddToken(TokenType::kAsterisk);
continue;
}
if (codepoint_ == '+' || codepoint_ == '?') {
AddToken(TokenType::kOtherModifier);
continue;
}
// Escape sequences always escape a single following character at the
// level of the pattern.
if (codepoint_ == '\\') {
if (index_ == (pattern_.size() - 1)) {
Error(absl::StrFormat("Trailing escape character at index %d.",
index_));
continue;
}
size_t escaped_i = next_index_;
if (!Next()) {
Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.",
next_index_));
continue;
}
AddToken(TokenType::kEscapedChar, next_index_, escaped_i);
continue;
}
if (codepoint_ == '{') {
AddToken(TokenType::kOpen);
continue;
}
if (codepoint_ == '}') {
AddToken(TokenType::kClose);
continue;
}
if (codepoint_ == ':') {
size_t pos = next_index_;
size_t name_start = pos;
// Iterate over codepoints until we find the first non-name codepoint.
while (pos < pattern_.size()) {
if (!status_.ok())
return std::move(status_);
if (!NextAt(pos)) {
Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", pos));
continue;
}
if (!IsNameCodepoint(codepoint_, pos == name_start))
break;
pos = next_index_;
}
if (pos <= name_start) {
Error(absl::StrFormat("Missing parameter name at index %d.", index_),
name_start, index_);
continue;
}
AddToken(TokenType::kName, pos, name_start);
continue;
}
if (codepoint_ == '(') {
size_t paren_nesting = 1;
size_t j = next_index_;
const size_t regex_start = next_index_;
bool error = false;
while (j < pattern_.size()) {
if (!NextAt(j)) {
Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.", j));
error = true;
break;
}
if (!IsASCII(codepoint_)) {
Error(absl::StrFormat(
"Invalid non-ASCII character 0x%02x at index %d.",
codepoint_, j),
regex_start, index_);
error = true;
break;
}
if (j == regex_start && codepoint_ == '?') {
Error(absl::StrFormat("Regex cannot start with '?' at index %d", j),
regex_start, index_);
error = true;
break;
}
// This escape processing only handles single character escapes since
// we only need to understand escaped paren characters for our state
// processing. The escape `\` character is propagated to the embedded
// regex expression. This permits authors to write longer escape
// sequences such as `\x22` since the later characters will be
// propagated on subsequent loop iterations.
if (codepoint_ == '\\') {
if (j == (pattern_.size() - 1)) {
Error(
absl::StrFormat("Trailing escape character at index %d.", j),
regex_start, index_);
error = true;
break;
}
size_t escaped_j = next_index_;
if (!Next()) {
Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.",
next_index_));
error = true;
break;
}
if (!IsASCII(codepoint_)) {
Error(absl::StrFormat(
"Invalid non-ASCII character 0x%02x at index %d.",
codepoint_, escaped_j),
regex_start, index_);
error = true;
break;
}
j = next_index_;
continue;
}
if (codepoint_ == ')') {
paren_nesting -= 1;
if (paren_nesting == 0) {
j = next_index_;
break;
}
} else if (codepoint_ == '(') {
paren_nesting += 1;
if (j == (pattern_.size() - 1)) {
Error(absl::StrFormat("Unbalanced regex at index %d.", j),
regex_start, index_);
error = true;
break;
}
size_t tmp_j = next_index_;
if (!Next()) {
Error(absl::StrFormat("Invalid UTF-8 codepoint at index %d.",
next_index_));
error = true;
break;
}
// Require the the first character after an open paren is `?`. This
// permits assertions, named capture groups, and non-capturing
// groups. It blocks, however, unnamed capture groups.
if (codepoint_ != '?') {
Error(absl::StrFormat(
"Unnamed capturing groups are not allowed at index %d.",
tmp_j),
regex_start, index_);
error = true;
break;
}
next_index_ = tmp_j;
}
j = next_index_;
}
if (error)
continue;
if (paren_nesting) {
Error(absl::StrFormat("Unbalanced regex at index %d.", index_),
regex_start, index_);
continue;
}
const size_t regex_length = j - regex_start - 1;
if (regex_length == 0) {
Error(absl::StrFormat("Missing regex at index %d.", index_),
regex_start, index_);
continue;
}
AddToken(TokenType::kRegex, j, regex_start, regex_length);
continue;
}
AddToken(TokenType::kChar);
}
if (!status_.ok())
return std::move(status_);
AddToken(TokenType::kEnd, index_, index_);
return std::move(token_list_);
}
private:
// Read the codepoint at `next_index_` in `pattern_` and store it in
// `codepoint_`. In addition, `next_index_` is updated to the codepoint to be
// read next. Returns true iff the codepoint was read successfully. On
// success, `codepoint_` is non-negative.
[[nodiscard]] bool Next() {
U8_NEXT(pattern_.data(), next_index_, pattern_.size(), codepoint_);
return codepoint_ >= 0;
}
// Read the codepoint at the specified `index` in `pattern_` and store it in
// `codepoint_`. In addition, `next_index_` is updated to the codepoint to be
// read next. Returns true iff the codepoint was read successfully. On
// success, `codepoint_` is non-negative.
[[nodiscard]] bool NextAt(size_t index) {
next_index_ = index;
return Next();
}
// Append a Token to our list of the given `type` and with a value consisting
// of the codepoints in `pattern_` starting at `value_pos` with
// `value_length`. Update `index_` to `next_pos` automatically.
void AddToken(TokenType type,
size_t next_pos,
size_t value_pos,
size_t value_length) {
token_list_.emplace_back(type, index_,
pattern_.substr(value_pos, value_length));
index_ = next_pos;
}
// Append a Token to our list of the given `type` and with a value consisting
// of the codepoints in `pattern_` starting at `value_pos`. The value length
// is automatically computed as the difference between `next_pos` and
// `value_pos`. Update `index_` to `next_pos` automatically.
void AddToken(TokenType type, size_t next_pos, size_t value_pos) {
AddToken(type, next_pos, value_pos, next_pos - value_pos);
}
// Append a Token to our list of the given `type` and with a value consisting
// of the codepoints in `pattern_` starting at `index_`. The value length
// is automatically computed as the difference between `next_index_` and
// `index_`. Update `index_` to `next_index_` automatically.
void AddToken(TokenType type) { AddToken(type, next_index_, index_); }
void Error(absl::string_view message, size_t next_pos, size_t value_pos) {
if (policy_ == TokenizePolicy::kLenient)
AddToken(TokenType::kInvalidChar, next_pos, value_pos);
else
status_ = absl::InvalidArgumentError(message);
}
void Error(absl::string_view message) { Error(message, next_index_, index_); }
const absl::string_view pattern_;
const TokenizePolicy policy_;
std::vector<Token> token_list_;
absl::Status status_;
// `index_` tracks our "current" byte index in the input string. Typically
// this will be updated every time we commit a token to `token_list_`. It may
// stay frozen in place if we have a sub-loop consuming a larger token like
// a named group or regex group.
size_t index_ = 0;
// The `next_index_` member is used to find the next UTF8 codepoint in the
// string. This is used as both an input and output from the U8_NEXT()
// function. We keep this separate from `index_` because there are many
// cases where we need to read ahead of the last token consumed.
size_t next_index_ = 0;
UChar32 codepoint_ = U_SENTINEL;
};
} // namespace
const char* TokenTypeToString(TokenType type) {
switch (type) {
case TokenType::kOpen:
return "'{'";
case TokenType::kClose:
return "'}'";
case TokenType::kRegex:
return "regex group";
case TokenType::kName:
return "named group";
case TokenType::kChar:
return "character";
case TokenType::kEscapedChar:
return "escaped character";
case TokenType::kOtherModifier:
return "modifier";
case TokenType::kAsterisk:
return "asterisk";
case TokenType::kEnd:
return "end of pattern";
case TokenType::kInvalidChar:
return "invalid character";
}
}
std::ostream& operator<<(std::ostream& o, Token token) {
o << "{ type:" << static_cast<int>(token.type) << ", index:" << token.index
<< ", value:" << token.value << " }";
return o;
}
// Split the input pattern into a list of tokens.
absl::StatusOr<std::vector<Token>> Tokenize(absl::string_view pattern,
TokenizePolicy policy) {
Tokenizer tokenizer(std::move(pattern), policy);
return tokenizer.Tokenize();
}
} // namespace liburlpattern