| // Copyright 2020 The Chromium Authors. All rights reserved. |
| // Copyright 2014 Blake Embrey (hello@blakeembrey.com) |
| // Use of this source code is governed by an MIT-style license that can be |
| // found in the LICENSE file or at https://opensource.org/licenses/MIT. |
| |
| #include "third_party/liburlpattern/parse.h" |
| |
| #include "third_party/abseil-cpp/absl/base/macros.h" |
| #include "third_party/abseil-cpp/absl/strings/str_format.h" |
| #include "third_party/liburlpattern/pattern.h" |
| #include "third_party/liburlpattern/tokenize.h" |
| #include "third_party/liburlpattern/utils.h" |
| |
| // The following code is a translation from the path-to-regexp typescript at: |
| // |
| // https://github.com/pillarjs/path-to-regexp/blob/125c43e6481f68cc771a5af22b914acdb8c5ba1f/src/index.ts#L126-L232 |
| |
| namespace liburlpattern { |
| |
| namespace { |
| |
| // The "full wildcard" regex pattern. This regex value is treated specially |
| // resulting in a kFullWildcard Part instead of a kRegex Part. |
| static const char* kWildcardRegex = ".*"; |
| |
| // Helper class that tracks the parser state. |
| class State { |
| public: |
| State(std::vector<Token> token_list, absl::string_view delimiter_list) |
| : token_list_(std::move(token_list)), |
| segment_wildcard_regex_( |
| absl::StrFormat("[^%s]+?", EscapeString(delimiter_list))) {} |
| |
| // Return true if there are more tokens to process. |
| bool HasMoreTokens() const { return index_ < token_list_.size(); } |
| |
| // Attempt to consume the next Token, but only if it matches the given |
| // |type|. Returns a pointer to the Token on success or nullptr on failure. |
| const Token* TryConsume(TokenType type) { |
| ABSL_ASSERT(index_ < token_list_.size()); |
| TokenType next_type = token_list_[index_].type; |
| if (next_type != type) |
| return nullptr; |
| |
| // The last token should always be kEnd. |
| if ((index_ + 1) == token_list_.size()) |
| ABSL_ASSERT(token_list_[index_].type == TokenType::kEnd); |
| |
| return &(token_list_[index_++]); |
| } |
| |
| // Consume the next Token requiring it to be the given |type|. If this |
| // is not possible then return an error. |
| absl::StatusOr<const Token*> MustConsume(TokenType type) { |
| ABSL_ASSERT(index_ < token_list_.size()); |
| if (const Token* token = TryConsume(type)) |
| return token; |
| return absl::InvalidArgumentError( |
| absl::StrFormat("Unexpected %s at %d, expected %s", |
| TokenTypeToString(token_list_[index_].type), index_, |
| TokenTypeToString(type))); |
| } |
| |
| // Consume as many sequential kChar and kEscapedChar Tokens as possible |
| // appending them together into a single string value. |
| std::string ConsumeText() { |
| // Unfortunately we cannot use a view here and must copy into a new |
| // string. This is necessary to flatten escape sequences into |
| // a single value with other characters. |
| std::string result; |
| const Token* token = nullptr; |
| do { |
| token = TryConsume(TokenType::kChar); |
| if (!token) |
| token = TryConsume(TokenType::kEscapedChar); |
| if (token) |
| result.append(token->value.data(), token->value.size()); |
| } while (token); |
| return result; |
| } |
| |
| // Append the given Token value to the pending fixed value. This will |
| // be converted to a kFixed Part when we reach the end of a run of |
| // kChar and kEscapedChar tokens. |
| void AppendToPendingFixedValue(absl::string_view token_value) { |
| pending_fixed_value_.append(token_value.data(), token_value.size()); |
| } |
| |
| // Convert the pending fixed value, if any, to a kFixed Part. Has no effect |
| // if there is no pending value. |
| void MaybeAddPartFromPendingFixedValue() { |
| if (pending_fixed_value_.empty()) |
| return; |
| part_list_.emplace_back(PartType::kFixed, std::move(pending_fixed_value_), |
| Modifier::kNone); |
| pending_fixed_value_ = ""; |
| } |
| |
| // Add a Part for the given set of tokens. |
| void AddPart(std::string prefix, |
| const Token* name_token, |
| const Token* regex_token, |
| std::string suffix, |
| const Token* modifier_token) { |
| // Convert the kModifier Token into a Modifier enum value. |
| Modifier modifier = Modifier::kNone; |
| if (modifier_token) { |
| ABSL_ASSERT(!modifier_token->value.empty()); |
| switch (modifier_token->value[0]) { |
| case '?': |
| modifier = Modifier::kOptional; |
| break; |
| case '*': |
| modifier = Modifier::kZeroOrMore; |
| break; |
| case '+': |
| modifier = Modifier::kOneOrMore; |
| break; |
| default: |
| ABSL_ASSERT(false); |
| break; |
| } |
| } |
| |
| // If there is no name or regex tokens then this is just a fixed string |
| // grouping; e.g. "{foo}?". The fixed string ends up in the prefix value |
| // since it consumed the entire text of the grouping. If the prefix value |
| // is empty then its an empty "{}" group and we return without adding any |
| // Part. |
| if (!name_token && !regex_token) { |
| ABSL_ASSERT(suffix.empty()); |
| if (!prefix.empty()) |
| part_list_.emplace_back(PartType::kFixed, std::move(prefix), modifier); |
| return; |
| } |
| |
| // Determine the regex value. If there is a kRegex Token, then this is |
| // explicitly set by that Token. Otherwise a kName Token by itself gets |
| // an implicit regex value that matches through to the end of the segment. |
| // This is represented by the |segment_wildcard_regex_| value. |
| std::string regex_value; |
| if (regex_token) |
| regex_value = std::string(regex_token->value); |
| else |
| regex_value = segment_wildcard_regex_; |
| |
| // Next determine the type of the Part. This depends on the regex value |
| // since we give certain values special treatment with their own type. |
| // A |segment_wildcard_regex_| is mapped to the kSegmentWildcard type. A |
| // |kWildcardRegex| is mapped to the kFullWildcard type. Otherwise |
| // the Part gets the kRegex type. |
| PartType type = PartType::kRegex; |
| if (regex_value == segment_wildcard_regex_) { |
| type = PartType::kSegmentWildcard; |
| regex_value = ""; |
| } else if (regex_value == kWildcardRegex) { |
| type = PartType::kFullWildcard; |
| regex_value = ""; |
| } |
| |
| // Every kRegex, kSegmentWildcard, and kFullWildcard Part must have a |
| // group name. If there was a kName Token, then use the explicitly |
| // set name. Otherwise we generate a numeric based key for the name. |
| std::string name; |
| if (name_token) |
| name = std::string(name_token->value); |
| else if (regex_token) |
| name = GenerateKey(); |
| |
| // Finally add the part to the list. |
| part_list_.emplace_back(type, std::move(name), std::move(prefix), |
| std::move(regex_value), std::move(suffix), |
| modifier); |
| } |
| |
| Pattern TakeAsPattern() { return Pattern(std::move(part_list_)); } |
| |
| private: |
| // Generate a numeric key string to be used for groups that do not |
| // have an explicit kName Token. |
| std::string GenerateKey() { return absl::StrFormat("%d", next_key_++); } |
| |
| // The input list of Token objects to process. |
| const std::vector<Token> token_list_; |
| |
| // The special regex value corresponding to the default regex value |
| // given to a lone kName Token. This is a variable since its value |
| // is dependent on the |delimiter_list| passed to the constructor. |
| const std::string segment_wildcard_regex_; |
| |
| // The output list of Pattern Part objects. |
| std::vector<Part> part_list_; |
| |
| // A buffer of kChar and kEscapedChar values that are pending the creation |
| // of a kFixed Part. |
| std::string pending_fixed_value_; |
| |
| // The index of the next Token in |token_list_|. |
| size_t index_ = 0; |
| |
| // The next value to use when generating a numeric based name for Parts |
| // without explicit kName Tokens. |
| int next_key_ = 0; |
| }; |
| |
| } // namespace |
| |
| absl::StatusOr<Pattern> Parse(absl::string_view pattern, |
| absl::string_view delimiter_list, |
| absl::string_view prefix_list) { |
| auto result = Tokenize(pattern); |
| if (!result.ok()) |
| return result.status(); |
| |
| State state(std::move(result.value()), delimiter_list); |
| |
| while (state.HasMoreTokens()) { |
| // Look for the sequence: <prefix char><name><regex><modifier> |
| // There could be from zero to all through of these tokens. For |
| // example: |
| // * "/:foo(bar)?" - all four tokens |
| // * "/" - just a char token |
| // * ":foo" - just a name token |
| // * "(bar)" - just a regex token |
| // * "/:foo" - char and name tokens |
| // * "/(bar)" - char and regex tokens |
| // * "/:foo?" - char, name, and modifier tokens |
| // * "/(bar)?" - char, regex, and modifier tokens |
| const Token* char_token = state.TryConsume(TokenType::kChar); |
| const Token* name_token = state.TryConsume(TokenType::kName); |
| const Token* regex_token = state.TryConsume(TokenType::kRegex); |
| |
| // If there is a name or regex token then we need to add a Pattern Part |
| // immediately. |
| if (name_token || regex_token) { |
| // Determine if the char token is a valid prefix. Only characters in the |
| // configured prefix_list are automatically treated as prefixes. A |
| // kEscapedChar Token is never treated as a prefix. |
| absl::string_view prefix = char_token ? char_token->value : ""; |
| if (prefix_list.find(prefix) == std::string::npos) { |
| // This is not a prefix character. Add it to the buffered characters |
| // to be added as a kFixed Part later. |
| state.AppendToPendingFixedValue(prefix); |
| prefix = absl::string_view(); |
| } |
| |
| // If we have any buffered characters in a pending fixed value, then |
| // convert them into a kFixed Part now. |
| state.MaybeAddPartFromPendingFixedValue(); |
| |
| // kName and kRegex tokens can optionally be followed by a modifier. |
| const Token* modifier_token = state.TryConsume(TokenType::kModifier); |
| |
| // Add the Part for the name and regex tokens. |
| state.AddPart(std::string(prefix), name_token, regex_token, /*suffix=*/"", |
| modifier_token); |
| continue; |
| } |
| |
| // There was neither a kRegex or kName token, so consider if we just have a |
| // fixed string part. A fixed string can consist of kChar or kEscapedChar |
| // tokens. These just get added to the buffered pending fixed value for |
| // now. It will get converted to a kFixed Part later. |
| const Token* fixed_token = char_token; |
| if (!fixed_token) |
| fixed_token = state.TryConsume(TokenType::kEscapedChar); |
| if (fixed_token) { |
| state.AppendToPendingFixedValue(fixed_token->value); |
| continue; |
| } |
| |
| // There was not a kChar or kEscapedChar token, so we no we are at the end |
| // of any fixed string. Therefore convert the pending fixed value into a |
| // kFixed Part now. |
| state.MaybeAddPartFromPendingFixedValue(); |
| |
| // Look for the sequence: |
| // |
| // <open><char prefix><name><regex><char suffix><close><modifier> |
| // |
| // The open and close are required, but the other tokens are optional. |
| // For example: |
| // * "{a:foo(.*)b}?" - all tokens present |
| // * "{:foo}?" - just name and modifier tokens |
| // * "{(.*)}?" - just regex and modifier tokens |
| // * "{ab}?" - just char and modifier tokens |
| const Token* open_token = state.TryConsume(TokenType::kOpen); |
| if (open_token) { |
| std::string prefix = state.ConsumeText(); |
| const Token* name_token = state.TryConsume(TokenType::kName); |
| const Token* regex_token = state.TryConsume(TokenType::kRegex); |
| std::string suffix = state.ConsumeText(); |
| |
| auto result = state.MustConsume(TokenType::kClose); |
| if (!result.ok()) |
| return result.status(); |
| |
| const Token* modifier_token = state.TryConsume(TokenType::kModifier); |
| |
| state.AddPart(std::move(prefix), name_token, regex_token, |
| std::move(suffix), modifier_token); |
| continue; |
| } |
| |
| // We didn't find any tokens allowed by the syntax, so we should be |
| // at the end of the token list. If there is a syntax error, this |
| // is where it will typically be caught. |
| auto result = state.MustConsume(TokenType::kEnd); |
| if (!result.ok()) |
| return result.status(); |
| } |
| |
| return state.TakeAsPattern(); |
| } |
| |
| } // namespace liburlpattern |