| // Copyright 2018 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "components/subresource_filter/tools/rule_parser/rule_parser.h" |
| |
| #include <map> |
| #include <utility> |
| #include <vector> |
| |
| #include "base/logging.h" |
| #include "base/strings/string_split.h" |
| #include "base/strings/string_util.h" |
| #include "components/subresource_filter/tools/rule_parser/rule_options.h" |
| #include "components/url_pattern_index/proto/rules.pb.h" |
| |
| namespace subresource_filter { |
| |
| namespace { |
| |
| // Encapsulates meta-information of URL rule options identified by keywords. |
| class KeywordMap { |
| public: |
| // Types of rule options that can be represented by keywords. |
| enum OptionType { |
| OPTION_UNDEFINED, |
| OPTION_ELEMENT_TYPE, |
| OPTION_ACTIVATION_TYPE, |
| OPTION_THIRD_PARTY, |
| OPTION_DOMAIN, |
| OPTION_SITEKEY, |
| OPTION_MATCH_CASE, |
| OPTION_COLLAPSE, |
| OPTION_DO_NOT_TRACK, |
| }; |
| |
| enum OptionFlag : int { |
| FLAG_NONE = 0, |
| // The option requires a value, e.g. 'domain=example.org'. |
| FLAG_REQUIRES_VALUE = 1, |
| // The option allows invertion, e.g. 'image' and '~image'. |
| FLAG_IS_TRISTATE = 2, |
| // The option can be used with whitelist rules only. |
| FLAG_IS_WHITELIST_ONLY = 4, |
| // The option is not supposed to be used any more. |
| FLAG_IS_DEPRECATED = 8, |
| // The option is not supported yet. |
| FLAG_IS_NOT_SUPPORTED = 16, |
| }; |
| |
| // Meta-information about an option represented by a certain keyword. |
| struct OptionDetails { |
| // Creates an option that defines a filter for the specified |element_type|. |
| // In addition to the provided |flags|, FLAG_IS_TRISTATE will always be set |
| // by default. |
| OptionDetails(url_pattern_index::proto::ElementType element_type, int flags) |
| : type(OPTION_ELEMENT_TYPE), |
| flags(FLAG_IS_TRISTATE | flags), |
| element_type(element_type) {} |
| |
| // Creates an ActivationType option. |
| explicit OptionDetails( |
| url_pattern_index::proto::ActivationType activation_type) |
| : type(OPTION_ACTIVATION_TYPE), |
| flags(FLAG_IS_WHITELIST_ONLY), |
| activation_type(activation_type) {} |
| |
| // Creates a generic option. |
| OptionDetails(OptionType type, int flags) : type(type), flags(flags) { |
| DCHECK_NE(type, OPTION_ELEMENT_TYPE); |
| DCHECK_NE(type, OPTION_ACTIVATION_TYPE); |
| } |
| |
| bool requires_value() const { return flags & FLAG_REQUIRES_VALUE; } |
| bool is_tristate() const { return flags & FLAG_IS_TRISTATE; } |
| bool is_whitelist_only() const { return flags & FLAG_IS_WHITELIST_ONLY; } |
| bool is_deprecated() const { return flags & FLAG_IS_DEPRECATED; } |
| bool is_not_supported() const { return flags & FLAG_IS_NOT_SUPPORTED; } |
| |
| OptionType type = OPTION_UNDEFINED; |
| |
| // Stores various OptionFlag's combined using bitwise OR. |
| int flags = FLAG_NONE; |
| |
| // The element type that this option defines a filter for, if any. Set to |
| // ELEMENT_TYPE_UNSPECIFIED for non-ElementType options. |
| url_pattern_index::proto::ElementType element_type = |
| url_pattern_index::proto::ELEMENT_TYPE_UNSPECIFIED; |
| |
| // The activation type that this option includes to the rule. Set to |
| // ACTIVATION_TYPE_UNSPECIFIED for non-ActivationType options. |
| url_pattern_index::proto::ActivationType activation_type = |
| url_pattern_index::proto::ACTIVATION_TYPE_UNSPECIFIED; |
| }; |
| |
| // Initializes the map with default keywords. |
| KeywordMap(); |
| ~KeywordMap(); |
| |
| // Returns detailed information associated with the provided |name| option. |
| // Returns nullptr on unknown options. |
| const OptionDetails* Lookup(base::StringPiece name) const; |
| |
| private: |
| // Associates |details| with a specified option |name|. |
| void AddOption(base::StringPiece name, const OptionDetails& details); |
| |
| std::map<std::string, OptionDetails> options_; |
| |
| DISALLOW_COPY_AND_ASSIGN(KeywordMap); |
| }; |
| |
| KeywordMap::KeywordMap() { |
| // ElementType options. |
| for (const auto& element_type : kElementTypes) { |
| OptionDetails details(element_type.type, FLAG_NONE); |
| AddOption(element_type.name, details); |
| } |
| // Deprecated ElementType options. |
| for (const auto& element_type : kDeprecatedElementTypes) { |
| OptionDetails details(element_type.maps_to_type, FLAG_IS_DEPRECATED); |
| AddOption(element_type.name, details); |
| } |
| |
| // ActivationType options. |
| for (const auto& activation_type : kActivationTypes) { |
| OptionDetails details(activation_type.type); |
| AddOption(activation_type.name, details); |
| } |
| |
| // TODO(pkalinnikov): Consider moving options metadata to a header. |
| struct { |
| const char* name; |
| OptionType type; |
| int flags; |
| } const options[] = { |
| // Tristate options. |
| {"third-party", OPTION_THIRD_PARTY, FLAG_IS_TRISTATE}, |
| {"collapse", OPTION_COLLAPSE, FLAG_IS_TRISTATE | FLAG_IS_NOT_SUPPORTED}, |
| // Flag options. |
| {"match-case", OPTION_MATCH_CASE, FLAG_NONE}, |
| {"donottrack", OPTION_DO_NOT_TRACK, FLAG_IS_NOT_SUPPORTED}, |
| // Value options. |
| {"sitekey", OPTION_SITEKEY, FLAG_REQUIRES_VALUE | FLAG_IS_NOT_SUPPORTED}, |
| {"domain", OPTION_DOMAIN, FLAG_REQUIRES_VALUE}, |
| }; |
| |
| for (const auto& option : options) { |
| AddOption(option.name, OptionDetails(option.type, option.flags)); |
| } |
| } |
| |
| KeywordMap::~KeywordMap() = default; |
| |
| const KeywordMap::OptionDetails* KeywordMap::Lookup( |
| base::StringPiece name) const { |
| // TODO(pkalinnikov): Avoid std::string allocation. |
| auto iterator = options_.find(std::string(name)); |
| return iterator != options_.end() ? &iterator->second : nullptr; |
| } |
| |
| void KeywordMap::AddOption(base::StringPiece name, |
| const OptionDetails& details) { |
| auto inserted = options_.insert(std::make_pair(std::string(name), details)); |
| DCHECK(inserted.second); |
| } |
| |
| KeywordMap* GetKeywordsMapSingleton() { |
| // TODO(melandory): Get rid of this singleton. |
| static auto* shared_keywords = new KeywordMap; |
| return shared_keywords; |
| } |
| |
| } // namespace |
| |
| // RuleParser ------------------------------------------------------------------ |
| |
| RuleParser::ParseError::ParseError() = default; |
| RuleParser::ParseError::~ParseError() = default; |
| |
| RuleParser::RuleParser() = default; |
| RuleParser::~RuleParser() = default; |
| |
| const char* RuleParser::GetParseErrorCodeDescription( |
| ParseError::ErrorCode code) { |
| switch (code) { |
| case ParseError::NONE: |
| return "Ok"; |
| case ParseError::EMPTY_RULE: |
| return "The rule is empty"; |
| case ParseError::BAD_WHITELIST_SYNTAX: |
| return "Wrong whitelist rule syntax"; |
| case ParseError::UNKNOWN_OPTION: |
| return "Unknown URL rule option"; |
| case ParseError::NOT_A_TRISTATE_OPTION: |
| return "Unexpected '~', the option is not invertable"; |
| case ParseError::DEPRECATED_OPTION: |
| return "The option is deprecated"; |
| case ParseError::WHITELIST_ONLY_OPTION: |
| return "The option can be used with whitelist rules only"; |
| case ParseError::NO_VALUE_PROVIDED: |
| return "Expected '=', the option requires a value"; |
| case ParseError::WRONG_CSS_RULE_DELIM: |
| return "Wrong CSS rule delimiter"; |
| case ParseError::EMPTY_CSS_SELECTOR: |
| return "Expected non-empty CSS selector"; |
| case ParseError::UNSUPPORTED_FEATURE: |
| return "The feature is not currently supported"; |
| default: |
| return "Unknown error"; |
| } |
| } |
| |
| // TODO(pkalinnikov): Refactor parsing approach to use a FSM. |
| RuleType RuleParser::Parse(base::StringPiece line) { |
| rule_type_ = url_pattern_index::proto::RULE_TYPE_UNSPECIFIED; |
| parse_error_ = ParseError(); |
| |
| // Strip all leading and trailing whitespaces. |
| base::StringPiece part = line; |
| part = base::TrimWhitespaceASCII(part, base::TRIM_ALL); |
| // Check whether it's a trivial rule. |
| if (part.empty()) { |
| // Note: cannot use part.data() here because it is flaky to rely on *which* |
| // empty StringPiece StripWhitespace will return. |
| SetParseError(ParseError::EMPTY_RULE, line, line.data()); |
| return url_pattern_index::proto::RULE_TYPE_UNSPECIFIED; |
| } |
| |
| // Check whether it's a comment. |
| // TODO(pkalinnikov): Handle special comments (e.g. 'Title', 'Expires' etc.). |
| if (part[0] == '!' || part[0] == '[') { |
| return rule_type_ = url_pattern_index::proto::RULE_TYPE_COMMENT; |
| } |
| |
| // Suppose it is a CSS rule if a CSS-selector separator character ('#') is |
| // present, followed by '#' or '@'. |
| size_t css_separator_pos = part.find('#'); |
| for (; css_separator_pos != base::StringPiece::npos; |
| css_separator_pos = part.find('#', css_separator_pos + 1)) { |
| if (css_separator_pos + 1 == part.size()) { |
| css_separator_pos = base::StringPiece::npos; |
| break; |
| } |
| const char next_char = part[css_separator_pos + 1]; |
| if (next_char == '#' || next_char == '@') // CSS rule starter. |
| break; |
| } |
| |
| if (css_separator_pos != base::StringPiece::npos) { |
| return rule_type_ = ParseCssRule(line, part, css_separator_pos); |
| } |
| // Else assume we read a URL filtering rule. |
| return rule_type_ = ParseUrlRule(line, part); |
| } |
| |
| RuleType RuleParser::ParseUrlRule(base::StringPiece origin, |
| base::StringPiece part) { |
| CHECK(!part.empty() && part.data() >= origin.data()); |
| url_rule_ = UrlRule(); |
| |
| // Check whether it's a whitelist rule. |
| if (part[0] == '@') { |
| part.remove_prefix(1); |
| if (part.empty() || part[0] != '@') { |
| SetParseError(ParseError::BAD_WHITELIST_SYNTAX, origin, part.data()); |
| return url_pattern_index::proto::RULE_TYPE_UNSPECIFIED; |
| } |
| part.remove_prefix(1); |
| url_rule_.is_whitelist = true; |
| } |
| |
| size_t options_start = part.rfind('$'); |
| // If the URL pattern is a regular expression, |options_start| might be |
| // pointing to a character inside the pattern. This can happen for those rules |
| // which don't have options at all, e.g., "/.*substring$/". All such rules end |
| // with '/', therefore the following code can detect them to work around. |
| if (options_start != base::StringPiece::npos && part.back() == '/') |
| options_start = base::StringPiece::npos; |
| |
| if (options_start != base::StringPiece::npos) { |
| const base::StringPiece options = part.substr(options_start + 1); |
| if (!ParseUrlRuleOptions(origin, options)) |
| return url_pattern_index::proto::RULE_TYPE_UNSPECIFIED; |
| part.remove_suffix(part.size() - options_start); |
| } |
| |
| // Check for a left anchor. |
| if (!part.empty() && part[0] == '|') { |
| part.remove_prefix(1); |
| if (!part.empty() && part[0] == '|') { |
| part.remove_prefix(1); |
| url_rule_.anchor_left = url_pattern_index::proto::ANCHOR_TYPE_SUBDOMAIN; |
| } else { |
| url_rule_.anchor_left = url_pattern_index::proto::ANCHOR_TYPE_BOUNDARY; |
| } |
| } |
| |
| // Check for a right anchor. |
| if (!part.empty()) { |
| if (part[part.size() - 1] == '|') { |
| part.remove_suffix(1); |
| url_rule_.anchor_right = url_pattern_index::proto::ANCHOR_TYPE_BOUNDARY; |
| } |
| } |
| |
| url_rule_.url_pattern = std::string(part); |
| url_rule_.Canonicalize(); |
| |
| return url_pattern_index::proto::RULE_TYPE_URL; |
| } |
| |
| bool RuleParser::ParseUrlRuleOptions(base::StringPiece origin, |
| base::StringPiece options) { |
| CHECK_GE(options.data(), origin.data()); |
| |
| bool has_seen_element_or_activation_type = false; |
| for (base::StringPiece piece : base::SplitStringPiece( |
| options, ",", base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY)) { |
| DCHECK(!piece.empty()); |
| |
| TriState tri_state = TriState::YES; |
| if (base::StartsWith(piece, "~", base::CompareCase::SENSITIVE)) { |
| piece.remove_prefix(1); |
| tri_state = TriState::NO; |
| } |
| |
| size_t option_name_end = piece.find('='); |
| base::StringPiece option_name = piece.substr(0, option_name_end); |
| |
| const auto* option_details = GetKeywordsMapSingleton()->Lookup(option_name); |
| if (!option_details) { |
| // TODO(pkalinnikov): Add a flag to RuleParser allowing unknown options. |
| SetParseError(ParseError::UNKNOWN_OPTION, origin, option_name.data()); |
| return false; |
| } |
| |
| if (tri_state == TriState::NO && !option_details->is_tristate()) { |
| SetParseError(ParseError::NOT_A_TRISTATE_OPTION, origin, |
| option_name.data()); |
| return false; |
| } |
| |
| if (option_details->requires_value() && |
| option_name_end == base::StringPiece::npos) { |
| SetParseError(ParseError::NO_VALUE_PROVIDED, origin, option_name.data()); |
| return false; |
| } |
| |
| if (option_details->is_whitelist_only() && !url_rule_.is_whitelist) { |
| SetParseError(ParseError::WHITELIST_ONLY_OPTION, origin, |
| option_name.data()); |
| return false; |
| } |
| |
| if (option_details->is_deprecated()) { |
| // TODO(pkalinnikov): Add a flag to RuleParser allowing deprecated |
| // options (and issuing kind of a warning). |
| SetParseError(ParseError::DEPRECATED_OPTION, origin, option_name.data()); |
| return false; |
| } |
| |
| if (option_details->is_not_supported()) { |
| // TODO(pkalinnikov): Add a flag to RuleParser allowing unsupported |
| // features. |
| SetParseError(ParseError::UNSUPPORTED_FEATURE, origin, |
| option_name.data()); |
| return false; |
| } |
| |
| switch (option_details->type) { |
| case KeywordMap::OPTION_ELEMENT_TYPE: { |
| // The sign of the first element type option encountered determines |
| // whether the unspecified element types will be included (if the first |
| // option is negated) or excluded (otherwise). |
| if (tri_state == TriState::YES) { |
| // TODO(pkalinnikov): How about not resetting ActivationType options? |
| if (!has_seen_element_or_activation_type) |
| url_rule_.type_mask = 0; |
| url_rule_.type_mask |= type_mask_for(option_details->element_type); |
| } else { |
| DCHECK(tri_state == TriState::NO); |
| url_rule_.type_mask &= ~type_mask_for(option_details->element_type); |
| } |
| has_seen_element_or_activation_type = true; |
| break; |
| } |
| case KeywordMap::OPTION_ACTIVATION_TYPE: |
| if (!has_seen_element_or_activation_type) |
| url_rule_.type_mask = 0; |
| url_rule_.type_mask |= type_mask_for(option_details->activation_type); |
| has_seen_element_or_activation_type = true; |
| break; |
| case KeywordMap::OPTION_THIRD_PARTY: |
| url_rule_.is_third_party = tri_state; |
| break; |
| case KeywordMap::OPTION_MATCH_CASE: |
| url_rule_.match_case = true; |
| break; |
| case KeywordMap::OPTION_DOMAIN: |
| url_rule_.domains = |
| base::SplitString(piece.substr(option_name_end + 1), "|", |
| base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY); |
| break; |
| default: |
| LOG(FATAL); |
| } |
| } |
| |
| return true; |
| } |
| |
| RuleType RuleParser::ParseCssRule(base::StringPiece origin, |
| base::StringPiece part, |
| size_t css_section_start) { |
| CHECK(part.data() >= origin.data()); |
| css_rule_ = CssRule(); |
| |
| // Check for a list of domains. |
| if (css_section_start) { |
| DCHECK(css_section_start != base::StringPiece::npos); |
| auto pieces = base::SplitStringPiece(part.substr(0, css_section_start), ",", |
| base::TRIM_WHITESPACE, |
| base::SPLIT_WANT_NONEMPTY); |
| for (base::StringPiece domain : pieces) { |
| DCHECK(!domain.empty()); |
| css_rule_.domains.push_back(std::string(domain)); |
| } |
| } |
| |
| part.remove_prefix(css_section_start + 1); |
| if (part.empty()) { |
| SetParseError(ParseError::WRONG_CSS_RULE_DELIM, origin, part.data()); |
| return url_pattern_index::proto::RULE_TYPE_UNSPECIFIED; |
| } |
| if (part[0] == '@') { |
| css_rule_.is_whitelist = true; |
| part.remove_prefix(1); |
| } |
| if (part.empty() || part[0] != '#') { |
| SetParseError(ParseError::WRONG_CSS_RULE_DELIM, origin, part.data()); |
| return url_pattern_index::proto::RULE_TYPE_UNSPECIFIED; |
| } |
| part.remove_prefix(1); |
| |
| if (part.empty()) { |
| SetParseError(ParseError::EMPTY_CSS_SELECTOR, origin, part.data()); |
| return url_pattern_index::proto::RULE_TYPE_UNSPECIFIED; |
| } |
| |
| css_rule_.css_selector = std::string(part); |
| css_rule_.Canonicalize(); |
| return url_pattern_index::proto::RULE_TYPE_CSS; |
| } |
| |
| void RuleParser::SetParseError(ParseError::ErrorCode code, |
| base::StringPiece origin, |
| const char* error_begin) { |
| DCHECK(code != ParseError::NONE); |
| DCHECK(error_begin >= origin.data()); |
| |
| parse_error_.error_code = code; |
| parse_error_.line = std::string(origin); |
| parse_error_.error_index = error_begin - origin.data(); |
| } |
| |
| std::ostream& operator<<(std::ostream& out, |
| const RuleParser::ParseError& error) { |
| if (error.error_code != RuleParser::ParseError::NONE) { |
| out << "(error:" << error.error_index + 1 << ") " |
| << RuleParser::GetParseErrorCodeDescription(error.error_code) << ":\n" |
| << error.line << '\n' |
| << std::string(error.error_index, ' ') << "^\n"; |
| } |
| return out; |
| } |
| |
| } // namespace subresource_filter |