blob: ec08e6a18322d67ba8c32393e074819a9e469db6 [file] [log] [blame]
// Copyright 2020 The Chromium Authors
// Copyright 2014 Blake Embrey (hello@blakeembrey.com)
// Use of this source code is governed by an MIT-style license that can be
// found in the LICENSE file or at https://opensource.org/licenses/MIT.
#include "third_party/liburlpattern/pattern.h"
#include "third_party/abseil-cpp/absl/base/macros.h"
#include "third_party/abseil-cpp/absl/strings/str_format.h"
#include "third_party/icu/source/common/unicode/utf8.h"
#include "third_party/liburlpattern/utils.h"
namespace liburlpattern {
namespace {
void AppendModifier(Modifier modifier, std::string& append_target) {
switch (modifier) {
case Modifier::kZeroOrMore:
append_target += '*';
break;
case Modifier::kOptional:
append_target += '?';
break;
case Modifier::kOneOrMore:
append_target += '+';
break;
case Modifier::kNone:
break;
}
}
size_t ModifierLength(Modifier modifier) {
switch (modifier) {
case Modifier::kZeroOrMore:
case Modifier::kOptional:
case Modifier::kOneOrMore:
return 1;
case Modifier::kNone:
return 0;
}
}
} // namespace
std::ostream& operator<<(std::ostream& o, Part part) {
o << "{ type:" << static_cast<int>(part.type) << ", name:" << part.name
<< ", prefix:" << part.prefix << ", value:" << part.value
<< ", suffix:" << part.suffix
<< ", modifier:" << static_cast<int>(part.modifier) << " }";
return o;
}
Part::Part(PartType t, std::string v, Modifier m)
: type(t), value(std::move(v)), modifier(m) {
ABSL_ASSERT(type == PartType::kFixed);
}
Part::Part(PartType t,
std::string n,
std::string p,
std::string v,
std::string s,
Modifier m)
: type(t),
name(std::move(n)),
prefix(std::move(p)),
value(std::move(v)),
suffix(std::move(s)),
modifier(m) {
ABSL_ASSERT(type != PartType::kFixed);
ABSL_ASSERT(!name.empty());
if (type == PartType::kFullWildcard || type == PartType::kSegmentWildcard)
ABSL_ASSERT(value.empty());
}
bool Part::HasCustomName() const {
// Determine if the part name was custom, like `:foo`, or an
// automatically assigned numeric value. Since custom group
// names follow javascript identifier rules the first character
// cannot be a digit, so that is all we need to check here.
return !name.empty() && !std::isdigit(name[0]);
}
Pattern::Pattern(std::vector<Part> part_list,
Options options,
std::string segment_wildcard_regex)
: part_list_(std::move(part_list)),
options_(std::move(options)),
segment_wildcard_regex_(std::move(segment_wildcard_regex)) {}
std::string Pattern::GeneratePatternString() const {
std::string result;
// Estimate the final length and reserve a reasonable sized string
// buffer to avoid reallocations.
size_t estimated_length = 0;
for (const Part& part : part_list_) {
// Add an arbitrary extra 3 per Part to account for braces, modifier, etc.
estimated_length +=
part.prefix.size() + part.value.size() + part.suffix.size() + 3;
}
result.reserve(estimated_length);
for (size_t i = 0; i < part_list_.size(); ++i) {
const Part& part = part_list_[i];
if (part.type == PartType::kFixed) {
// A simple fixed string part.
if (part.modifier == Modifier::kNone) {
EscapePatternStringAndAppend(part.value, result);
continue;
}
// A fixed string, but with a modifier which requires a grouping.
// For example, `{foo}?`.
result += "{";
EscapePatternStringAndAppend(part.value, result);
result += "}";
AppendModifier(part.modifier, result);
continue;
}
bool custom_name = part.HasCustomName();
// Determine if the part needs a grouping like `{ ... }`. This is
// necessary when the group:
//
// 1. is using a non-automatic prefix or any suffix.
bool needs_grouping =
!part.suffix.empty() ||
(!part.prefix.empty() &&
(part.prefix.size() != 1 ||
options_.prefix_list.find(part.prefix[0]) == std::string::npos));
// 2. followed by a matching group that may be expressed in a way that can
// be mistakenly interpreted as part of this matching group. For
// example:
//
// a. An `(...)` expression following a `:foo` group. We want to
// output `{:foo}(...)` and not `:foo(...)`.
// b. A plaint text expression following a `:foo` group where the text
// could be mistakenly interpreted as part of the name. We want to
// output `{:foo}bar` and not `:foobar`.
const Part* next_part =
(i + 1) < part_list_.size() ? &part_list_[i + 1] : nullptr;
if (!needs_grouping && custom_name &&
part.type == PartType::kSegmentWildcard &&
part.modifier == Modifier::kNone && next_part &&
next_part->prefix.empty() && next_part->suffix.empty()) {
if (next_part->type == PartType::kFixed) {
UChar32 codepoint = -1;
U8_GET(reinterpret_cast<const uint8_t*>(next_part->value.data()), 0, 0,
static_cast<int>(next_part->value.size()), codepoint);
needs_grouping = IsNameCodepoint(codepoint, /*first_codepoint=*/false);
} else {
needs_grouping = !next_part->HasCustomName();
}
}
// 3. preceded by a fixed text part that ends with an implicit prefix
// character (like `/`). This occurs when the original pattern used
// an escape or grouping to prevent the implicit prefix; e.g.
// `\\/*` or `/{*}`. In these cases we use a grouping to prevent the
// implicit prefix in the generated string.
const Part* last_part = i > 0 ? &part_list_[i - 1] : nullptr;
if (!needs_grouping && part.prefix.empty() && last_part &&
last_part->type == PartType::kFixed) {
needs_grouping = options_.prefix_list.find(last_part->value.back()) !=
std::string::npos;
}
// This is a full featured part. We must generate a string that looks
// like:
//
// { <prefix> <value> <suffix> } <modifier>
//
// Where the { and } may not be needed. The <value> will be a regexp,
// named group, or wildcard.
if (needs_grouping)
result += "{";
EscapePatternStringAndAppend(part.prefix, result);
if (custom_name) {
result += ":";
result += part.name;
}
if (part.type == PartType::kRegex) {
result += "(";
result += part.value;
result += ")";
} else if (part.type == PartType::kSegmentWildcard) {
// We only need to emit a regexp if a custom name was
// not specified. A custom name like `:foo` gets the
// kSegmentWildcard type automatically.
if (!custom_name) {
result += "(";
result += segment_wildcard_regex_;
result += ")";
}
} else if (part.type == PartType::kFullWildcard) {
// We can only use the `*` wildcard card if we meet a number
// of conditions. We must use an explicit `(.*)` group if:
//
// 1. A custom name was used; e.g. `:foo(.*)`.
// 2. If the preceding group is a matching group without a modifier; e.g.
// `(foo)(.*)`. In that case we cannot emit the `*` shorthand without
// it being mistakenly interpreted as the modifier for the previous
// group.
// 3. The current group is not enclosed in a `{ }` grouping.
// 4. The current group does not have an implicit prefix like `/`.
if (!custom_name && (!last_part || last_part->type == PartType::kFixed ||
last_part->modifier != Modifier::kNone ||
needs_grouping || !part.prefix.empty())) {
result += "*";
} else {
result += "(";
result += kFullWildcardRegex;
result += ")";
}
}
// If the matching group is a simple `:foo` custom name with the default
// segment wildcard, then we must check for a trailing suffix that could
// be interpreted as a trailing part of the name itself. In these cases
// we must escape the beginning of the suffix in order to separate it
// from the end of the custom name; e.g. `:foo\\bar` instead of `:foobar`.
if (part.type == PartType::kSegmentWildcard && custom_name &&
!part.suffix.empty()) {
UChar32 codepoint = -1;
U8_GET(reinterpret_cast<const uint8_t*>(part.suffix.data()), 0, 0,
static_cast<int>(part.suffix.size()), codepoint);
if (IsNameCodepoint(codepoint, /*first_codepoint=*/false)) {
result += "\\";
}
}
EscapePatternStringAndAppend(part.suffix, result);
if (needs_grouping)
result += "}";
if (part.modifier != Modifier::kNone)
AppendModifier(part.modifier, result);
}
return result;
}
// The following code is a translation from the path-to-regexp typescript at:
//
// https://github.com/pillarjs/path-to-regexp/blob/125c43e6481f68cc771a5af22b914acdb8c5ba1f/src/index.ts#L532-L596
std::string Pattern::GenerateRegexString(
std::vector<std::string>* name_list_out) const {
std::string result;
// This method mirrors the logic and structure of RegexStringLength(). If
// one changes, so should the other.
// Perform a full pass of the |part_list| to compute the length of the regex
// string to avoid additional allocations.
size_t expected_length = RegexStringLength();
result.reserve(RegexStringLength());
// Anchor to the start of the string if configured to in the options.
if (options_.start)
result += "^";
// Iterate over each Part and append its equivalent value to the expression
// string.
for (const Part& part : part_list_) {
// Handle kFixed Parts. If there is a modifier we must wrap the escaped
// value in a non-capturing group. Otherwise we just append the escaped
// value. For example:
//
// <escaped-fixed-value>
//
// Or:
//
// (?:<escaped-fixed-value>)<modifier>
//
if (part.type == PartType::kFixed) {
if (part.modifier == Modifier::kNone) {
EscapeRegexpStringAndAppend(part.value, result);
} else {
result += "(?:";
EscapeRegexpStringAndAppend(part.value, result);
result += ")";
AppendModifier(part.modifier, result);
}
continue;
}
// All remaining Part types must have a name. Append it to the output
// list if provided.
ABSL_ASSERT(!part.name.empty());
if (name_list_out)
name_list_out->push_back(part.name);
// Compute the Part regex value. For kSegmentWildcard and kFullWildcard
// types we must convert the type enum back to the defined regex value.
absl::string_view regex_value = part.value;
if (part.type == PartType::kSegmentWildcard)
regex_value = segment_wildcard_regex_;
else if (part.type == PartType::kFullWildcard)
regex_value = kFullWildcardRegex;
// Handle the case where there is no prefix or suffix value. This varies a
// bit depending on the modifier.
//
// If there is no modifier or an optional modifier, then we simply wrap the
// regex value in a capturing group:
//
// (<regex-value>)<modifier>
//
// If there is a modifier, then we need to use a non-capturing group for the
// regex value and an outer capturing group that includes the modifier as
// well. Like:
//
// ((?:<regex-value>)<modifier>)
if (part.prefix.empty() && part.suffix.empty()) {
if (part.modifier == Modifier::kNone ||
part.modifier == Modifier::kOptional) {
absl::StrAppendFormat(&result, "(%s)", regex_value);
AppendModifier(part.modifier, result);
} else {
absl::StrAppendFormat(&result, "((?:%s)", regex_value);
AppendModifier(part.modifier, result);
result += ")";
}
continue;
}
// Handle non-repeating regex Parts with a prefix and/or suffix. The
// capturing group again only contains the regex value. This inner group
// is compined with the prefix and/or suffix in an outer non-capturing
// group. Finally the modifier is applied to the entire outer group.
// For example:
//
// (?:<prefix>(<regex-value>)<suffix>)<modifier>
//
if (part.modifier == Modifier::kNone ||
part.modifier == Modifier::kOptional) {
result += "(?:";
EscapeRegexpStringAndAppend(part.prefix, result);
absl::StrAppendFormat(&result, "(%s)", regex_value);
EscapeRegexpStringAndAppend(part.suffix, result);
result += ")";
AppendModifier(part.modifier, result);
continue;
}
// Repeating Parts are dramatically more complicated. We want to exclude
// the initial prefix and the final suffix, but include them between any
// repeated elements. To achieve this we provide a separate initial
// part that excludes the prefix. Then the part is duplicated with the
// prefix/suffix values included in an optional repeating element. If
// zero values are permitted then a final optional modifier may be added.
// For example:
//
// (?:<prefix>((?:<regex-value>)(?:<suffix><prefix>(?:<regex-value>))*)<suffix>)?
//
result += "(?:";
EscapeRegexpStringAndAppend(part.prefix, result);
absl::StrAppendFormat(&result, "((?:%s)(?:", regex_value);
EscapeRegexpStringAndAppend(part.suffix, result);
EscapeRegexpStringAndAppend(part.prefix, result);
absl::StrAppendFormat(&result, "(?:%s))*)", regex_value);
EscapeRegexpStringAndAppend(part.suffix, result);
result += ")";
if (part.modifier == Modifier::kZeroOrMore)
result += "?";
}
// Should we anchor the pattern to the end of the input string?
if (options_.end) {
// In non-strict mode an optional delimiter character is always
// permitted at the end of the string. For example, if the pattern
// is "/foo/bar" then it would match "/foo/bar/".
//
// [<delimiter chars>]?
//
if (!options_.strict) {
AppendDelimiterList(result);
result += "?";
}
// The options ends_with value contains a list of characters that
// may also signal the end of the pattern match.
if (options_.ends_with.empty()) {
// Simply anchor to the end of the input string.
result += "$";
} else {
// Anchor to either a ends_with character or the end of the input
// string. This uses a lookahead assertion.
//
// (?=[<ends_with chars>]|$)
//
result += "(?=";
AppendEndsWith(result);
result += ")";
}
return result;
}
// We are not anchored to the end of the input string.
// Again, if not in strict mode we permit an optional trailing delimiter
// character before anchoring to any ends_with characters with a lookahead
// assertion.
//
// (?:[<delimiter chars>](?=[<ends_with chars>]|$))?
//
if (!options_.strict) {
result += "(?:";
AppendDelimiterList(result);
result += "(?=";
AppendEndsWith(result);
result += "))?";
}
// Further, if the pattern does not end with a trailing delimiter character
// we also anchor to a delimiter character in our lookahead assertion. So
// a pattern "/foo/bar" would match "/foo/bar/baz", but not "/foo/barbaz".
//
// (?=[<delimiter chars>]|[<ends_with chars>]|$)
//
bool end_delimited = false;
if (!part_list_.empty()) {
auto& last_part = part_list_.back();
if (last_part.type == PartType::kFixed &&
last_part.modifier == Modifier::kNone) {
ABSL_ASSERT(!last_part.value.empty());
end_delimited = options_.delimiter_list.find(last_part.value.back()) !=
std::string::npos;
}
}
if (!end_delimited) {
result += "(?=";
AppendDelimiterList(result);
result += "|";
AppendEndsWith(result);
result += ")";
}
ABSL_ASSERT(result.size() == expected_length);
return result;
}
bool Pattern::HasRegexGroups() const {
for (const Part& part : part_list_) {
if (part.type == PartType::kRegex) {
return true;
}
}
return false;
}
bool Pattern::CanDirectMatch() const {
// We currently only support direct matching with the options used by
// URLPattern.
if (!options_.start || !options_.end || !options_.strict ||
!options_.sensitive) {
return false;
}
return part_list_.empty() || IsOnlyFullWildcard() || IsOnlyFixedText();
}
bool Pattern::DirectMatch(
absl::string_view input,
std::vector<
std::pair<absl::string_view, absl::optional<absl::string_view>>>*
group_list_out) const {
ABSL_ASSERT(CanDirectMatch());
if (part_list_.empty())
return input.empty();
if (IsOnlyFullWildcard()) {
if (group_list_out)
group_list_out->emplace_back(part_list_[0].name, input);
return true;
}
if (IsOnlyFixedText()) {
return part_list_[0].value == input;
}
return false;
}
size_t Pattern::RegexStringLength() const {
size_t result = 0;
// This method mirrors the logic and structure of GenerateRegexString(). If
// one changes, so should the other. See GenerateRegexString() for an
// explanation of the logic.
if (options_.start) {
// ^
result += 1;
}
for (const Part& part : part_list_) {
if (part.type == PartType::kFixed) {
if (part.modifier == Modifier::kNone) {
// <escaped-fixed-value>
result += EscapedRegexpStringLength(part.value);
} else {
// (?:<escaped-fixed-value>)<modifier>
result += EscapedRegexpStringLength(part.value) + 4 +
ModifierLength(part.modifier);
}
continue;
}
absl::string_view regex_value = part.value;
if (part.type == PartType::kSegmentWildcard)
regex_value = segment_wildcard_regex_;
else if (part.type == PartType::kFullWildcard)
regex_value = kFullWildcardRegex;
if (part.prefix.empty() && part.suffix.empty()) {
// (<regex-value>)<modifier>
result += regex_value.size() + ModifierLength(part.modifier) + 2;
continue;
}
size_t prefix_length = EscapedRegexpStringLength(part.prefix);
size_t suffix_length = EscapedRegexpStringLength(part.suffix);
if (part.modifier == Modifier::kNone ||
part.modifier == Modifier::kOptional) {
// (?:<prefix>(<regex-value>)<suffix>)<modifier>
result += prefix_length + regex_value.size() + suffix_length +
ModifierLength(part.modifier) + 6;
continue;
}
// (?:<prefix>((?:<regex-value>)(?:<suffix><prefix>(?:<regex-value>))*)<suffix>)?
result += prefix_length + regex_value.size() + suffix_length +
prefix_length + regex_value.size() + suffix_length + 19;
if (part.modifier == Modifier::kZeroOrMore)
result += 1;
}
if (options_.end) {
if (!options_.strict) {
// [<delimiter chars>]?
result += DelimiterListLength() + 1;
}
if (options_.ends_with.empty()) {
// $
result += 1;
} else {
// (?=[<ends_with chars>]|$)
result += EndsWithLength() + 4;
}
} else {
bool end_delimited = false;
if (!part_list_.empty()) {
auto& last_part = part_list_.back();
if (last_part.type == PartType::kFixed &&
last_part.modifier == Modifier::kNone) {
ABSL_ASSERT(!last_part.value.empty());
end_delimited = options_.delimiter_list.find(last_part.value.back()) !=
std::string::npos;
}
}
if (!options_.strict) {
// (?:[<delimiter chars>](?=[<ends_with chars>]|$))?
result += DelimiterListLength() + EndsWithLength() + 9;
}
if (!end_delimited) {
// (?=[<delimiter chars>]|[<ends_with chars>]|$)
result += DelimiterListLength() + EndsWithLength() + 5;
}
}
return result;
}
void Pattern::AppendDelimiterList(std::string& append_target) const {
append_target += "[";
EscapeRegexpStringAndAppend(options_.delimiter_list, append_target);
append_target += "]";
}
size_t Pattern::DelimiterListLength() const {
return EscapedRegexpStringLength(options_.delimiter_list) + 2;
}
void Pattern::AppendEndsWith(std::string& append_target) const {
append_target += "[";
EscapeRegexpStringAndAppend(options_.ends_with, append_target);
append_target += "]|$";
}
size_t Pattern::EndsWithLength() const {
return EscapedRegexpStringLength(options_.ends_with) + 4;
}
bool Pattern::IsOnlyFullWildcard() const {
if (part_list_.size() != 1)
return false;
auto& part = part_list_[0];
// The modifier does not matter as an optional or repeated full wildcard
// is functionally equivalent.
return part.type == PartType::kFullWildcard && part.prefix.empty() &&
part.suffix.empty();
}
bool Pattern::IsOnlyFixedText() const {
if (part_list_.size() != 1)
return false;
auto& part = part_list_[0];
bool result =
part.type == PartType::kFixed && part.modifier == Modifier::kNone;
if (result) {
ABSL_ASSERT(part.prefix.empty());
ABSL_ASSERT(part.suffix.empty());
}
return result;
}
} // namespace liburlpattern