| // Copyright 2016 the V8 project authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "src/regexp/regexp-parser.h" |
| |
| #include "src/base/small-vector.h" |
| #include "src/execution/isolate.h" |
| #include "src/objects/string-inl.h" |
| #include "src/regexp/property-sequences.h" |
| #include "src/regexp/regexp-ast.h" |
| #include "src/regexp/regexp-macro-assembler.h" |
| #include "src/regexp/regexp.h" |
| #include "src/strings/char-predicates-inl.h" |
| #include "src/utils/ostreams.h" |
| #include "src/utils/utils.h" |
| #include "src/zone/zone-allocator.h" |
| #include "src/zone/zone-list-inl.h" |
| |
| #ifdef V8_INTL_SUPPORT |
| #include "unicode/uniset.h" |
| #endif // V8_INTL_SUPPORT |
| |
| namespace v8 { |
| namespace internal { |
| |
| namespace { |
| |
| // Whether we're currently inside the ClassEscape production |
| // (tc39.es/ecma262/#prod-annexB-CharacterEscape). |
| enum class InClassEscapeState { |
| kInClass, |
| kNotInClass, |
| }; |
| |
| // Accumulates RegExp atoms and assertions into lists of terms and alternatives. |
| class RegExpBuilder { |
| public: |
| RegExpBuilder(Zone* zone, RegExpFlags flags) |
| : zone_(zone), |
| flags_(flags), |
| terms_(ZoneAllocator<RegExpTree*>{zone}), |
| text_(ZoneAllocator<RegExpTree*>{zone}), |
| alternatives_(ZoneAllocator<RegExpTree*>{zone}) {} |
| void AddCharacter(base::uc16 character); |
| void AddUnicodeCharacter(base::uc32 character); |
| void AddEscapedUnicodeCharacter(base::uc32 character); |
| // "Adds" an empty expression. Does nothing except consume a |
| // following quantifier |
| void AddEmpty(); |
| void AddCharacterClass(RegExpCharacterClass* cc); |
| void AddCharacterClassForDesugaring(base::uc32 c); |
| void AddAtom(RegExpTree* tree); |
| void AddTerm(RegExpTree* tree); |
| void AddAssertion(RegExpTree* tree); |
| void NewAlternative(); // '|' |
| bool AddQuantifierToAtom(int min, int max, |
| RegExpQuantifier::QuantifierType type); |
| void FlushText(); |
| RegExpTree* ToRegExp(); |
| RegExpFlags flags() const { return flags_; } |
| |
| bool ignore_case() const { return IsIgnoreCase(flags_); } |
| bool multiline() const { return IsMultiline(flags_); } |
| bool dotall() const { return IsDotAll(flags_); } |
| |
| private: |
| static const base::uc16 kNoPendingSurrogate = 0; |
| void AddLeadSurrogate(base::uc16 lead_surrogate); |
| void AddTrailSurrogate(base::uc16 trail_surrogate); |
| void FlushPendingSurrogate(); |
| void FlushCharacters(); |
| void FlushTerms(); |
| bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc); |
| bool NeedsDesugaringForIgnoreCase(base::uc32 c); |
| Zone* zone() const { return zone_; } |
| bool unicode() const { return IsUnicode(flags_); } |
| |
| Zone* const zone_; |
| bool pending_empty_ = false; |
| const RegExpFlags flags_; |
| ZoneList<base::uc16>* characters_ = nullptr; |
| base::uc16 pending_surrogate_ = kNoPendingSurrogate; |
| |
| using SmallRegExpTreeVector = |
| base::SmallVector<RegExpTree*, 8, ZoneAllocator<RegExpTree*>>; |
| SmallRegExpTreeVector terms_; |
| SmallRegExpTreeVector text_; |
| SmallRegExpTreeVector alternatives_; |
| #ifdef DEBUG |
| enum { |
| ADD_NONE, |
| ADD_CHAR, |
| ADD_TERM, |
| ADD_ASSERT, |
| ADD_ATOM |
| } last_added_ = ADD_NONE; |
| #define LAST(x) last_added_ = x; |
| #else |
| #define LAST(x) |
| #endif |
| }; |
| |
| enum SubexpressionType { |
| INITIAL, |
| CAPTURE, // All positive values represent captures. |
| POSITIVE_LOOKAROUND, |
| NEGATIVE_LOOKAROUND, |
| GROUPING |
| }; |
| |
| class RegExpParserState : public ZoneObject { |
| public: |
| // Push a state on the stack. |
| RegExpParserState(RegExpParserState* previous_state, |
| SubexpressionType group_type, |
| RegExpLookaround::Type lookaround_type, |
| int disjunction_capture_index, |
| const ZoneVector<base::uc16>* capture_name, |
| RegExpFlags flags, Zone* zone) |
| : previous_state_(previous_state), |
| builder_(zone, flags), |
| group_type_(group_type), |
| lookaround_type_(lookaround_type), |
| disjunction_capture_index_(disjunction_capture_index), |
| capture_name_(capture_name) {} |
| // Parser state of containing expression, if any. |
| RegExpParserState* previous_state() const { return previous_state_; } |
| bool IsSubexpression() { return previous_state_ != nullptr; } |
| // RegExpBuilder building this regexp's AST. |
| RegExpBuilder* builder() { return &builder_; } |
| // Type of regexp being parsed (parenthesized group or entire regexp). |
| SubexpressionType group_type() const { return group_type_; } |
| // Lookahead or Lookbehind. |
| RegExpLookaround::Type lookaround_type() const { return lookaround_type_; } |
| // Index in captures array of first capture in this sub-expression, if any. |
| // Also the capture index of this sub-expression itself, if group_type |
| // is CAPTURE. |
| int capture_index() const { return disjunction_capture_index_; } |
| // The name of the current sub-expression, if group_type is CAPTURE. Only |
| // used for named captures. |
| const ZoneVector<base::uc16>* capture_name() const { return capture_name_; } |
| |
| bool IsNamedCapture() const { return capture_name_ != nullptr; } |
| |
| // Check whether the parser is inside a capture group with the given index. |
| bool IsInsideCaptureGroup(int index) const { |
| for (const RegExpParserState* s = this; s != nullptr; |
| s = s->previous_state()) { |
| if (s->group_type() != CAPTURE) continue; |
| // Return true if we found the matching capture index. |
| if (index == s->capture_index()) return true; |
| // Abort if index is larger than what has been parsed up till this state. |
| if (index > s->capture_index()) return false; |
| } |
| return false; |
| } |
| |
| // Check whether the parser is inside a capture group with the given name. |
| bool IsInsideCaptureGroup(const ZoneVector<base::uc16>* name) const { |
| DCHECK_NOT_NULL(name); |
| for (const RegExpParserState* s = this; s != nullptr; |
| s = s->previous_state()) { |
| if (s->capture_name() == nullptr) continue; |
| if (*s->capture_name() == *name) return true; |
| } |
| return false; |
| } |
| |
| private: |
| // Linked list implementation of stack of states. |
| RegExpParserState* const previous_state_; |
| // Builder for the stored disjunction. |
| RegExpBuilder builder_; |
| // Stored disjunction type (capture, look-ahead or grouping), if any. |
| const SubexpressionType group_type_; |
| // Stored read direction. |
| const RegExpLookaround::Type lookaround_type_; |
| // Stored disjunction's capture index (if any). |
| const int disjunction_capture_index_; |
| // Stored capture name (if any). |
| const ZoneVector<base::uc16>* const capture_name_; |
| }; |
| |
| template <class CharT> |
| class RegExpParserImpl final { |
| private: |
| RegExpParserImpl(const CharT* input, int input_length, RegExpFlags flags, |
| uintptr_t stack_limit, Zone* zone, |
| const DisallowGarbageCollection& no_gc); |
| |
| bool Parse(RegExpCompileData* result); |
| |
| RegExpTree* ParsePattern(); |
| RegExpTree* ParseDisjunction(); |
| RegExpTree* ParseGroup(); |
| |
| // Parses a {...,...} quantifier and stores the range in the given |
| // out parameters. |
| bool ParseIntervalQuantifier(int* min_out, int* max_out); |
| |
| // Checks whether the following is a length-digit hexadecimal number, |
| // and sets the value if it is. |
| bool ParseHexEscape(int length, base::uc32* value); |
| bool ParseUnicodeEscape(base::uc32* value); |
| bool ParseUnlimitedLengthHexNumber(int max_value, base::uc32* value); |
| |
| bool ParsePropertyClassName(ZoneVector<char>* name_1, |
| ZoneVector<char>* name_2); |
| bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate, |
| const ZoneVector<char>& name_1, |
| const ZoneVector<char>& name_2); |
| |
| RegExpTree* ParseCharacterClass(const RegExpBuilder* state); |
| |
| base::uc32 ParseOctalLiteral(); |
| |
| // Tries to parse the input as a back reference. If successful it |
| // stores the result in the output parameter and returns true. If |
| // it fails it will push back the characters read so the same characters |
| // can be reparsed. |
| bool ParseBackReferenceIndex(int* index_out); |
| |
| // Parse inside a class. Either add escaped class to the range, or return |
| // false and pass parsed single character through |char_out|. |
| void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone, |
| bool add_unicode_case_equivalents, base::uc32* char_out, |
| bool* is_class_escape); |
| // Returns true iff parsing was successful. |
| bool TryParseCharacterClassEscape(base::uc32 next, |
| InClassEscapeState in_class_escape_state, |
| ZoneList<CharacterRange>* ranges, |
| Zone* zone, |
| bool add_unicode_case_equivalents); |
| // Parses and returns a single escaped character. |
| base::uc32 ParseCharacterEscape(InClassEscapeState in_class_escape_state, |
| bool* is_escaped_unicode_character); |
| |
| RegExpTree* ReportError(RegExpError error); |
| void Advance(); |
| void Advance(int dist); |
| void RewindByOneCodepoint(); // Rewinds to before the previous Advance(). |
| void Reset(int pos); |
| |
| // Reports whether the pattern might be used as a literal search string. |
| // Only use if the result of the parse is a single atom node. |
| bool simple() const { return simple_; } |
| bool contains_anchor() const { return contains_anchor_; } |
| void set_contains_anchor() { contains_anchor_ = true; } |
| int captures_started() const { return captures_started_; } |
| int position() const { return next_pos_ - 1; } |
| bool failed() const { return failed_; } |
| bool unicode() const { return IsUnicode(top_level_flags_) || force_unicode_; } |
| |
| static bool IsSyntaxCharacterOrSlash(base::uc32 c); |
| |
| static const base::uc32 kEndMarker = (1 << 21); |
| |
| private: |
| // Return the 1-indexed RegExpCapture object, allocate if necessary. |
| RegExpCapture* GetCapture(int index); |
| |
| // Creates a new named capture at the specified index. Must be called exactly |
| // once for each named capture. Fails if a capture with the same name is |
| // encountered. |
| bool CreateNamedCaptureAtIndex(const ZoneVector<base::uc16>* name, int index); |
| |
| // Parses the name of a capture group (?<name>pattern). The name must adhere |
| // to IdentifierName in the ECMAScript standard. |
| const ZoneVector<base::uc16>* ParseCaptureGroupName(); |
| |
| bool ParseNamedBackReference(RegExpBuilder* builder, |
| RegExpParserState* state); |
| RegExpParserState* ParseOpenParenthesis(RegExpParserState* state); |
| |
| // After the initial parsing pass, patch corresponding RegExpCapture objects |
| // into all RegExpBackReferences. This is done after initial parsing in order |
| // to avoid complicating cases in which references comes before the capture. |
| void PatchNamedBackReferences(); |
| |
| ZoneVector<RegExpCapture*>* GetNamedCaptures() const; |
| |
| // Returns true iff the pattern contains named captures. May call |
| // ScanForCaptures to look ahead at the remaining pattern. |
| bool HasNamedCaptures(InClassEscapeState in_class_escape_state); |
| |
| Zone* zone() const { return zone_; } |
| |
| base::uc32 current() const { return current_; } |
| bool has_more() const { return has_more_; } |
| bool has_next() const { return next_pos_ < input_length(); } |
| base::uc32 Next(); |
| template <bool update_position> |
| base::uc32 ReadNext(); |
| CharT InputAt(int index) const { |
| DCHECK(0 <= index && index < input_length()); |
| return input_[index]; |
| } |
| int input_length() const { return input_length_; } |
| void ScanForCaptures(InClassEscapeState in_class_escape_state); |
| |
| struct RegExpCaptureNameLess { |
| bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const { |
| DCHECK_NOT_NULL(lhs); |
| DCHECK_NOT_NULL(rhs); |
| return *lhs->name() < *rhs->name(); |
| } |
| }; |
| |
| class ForceUnicodeScope final { |
| public: |
| explicit ForceUnicodeScope(RegExpParserImpl<CharT>* parser) |
| : parser_(parser) { |
| DCHECK(!parser_->force_unicode_); |
| parser_->force_unicode_ = true; |
| } |
| ~ForceUnicodeScope() { |
| DCHECK(parser_->force_unicode_); |
| parser_->force_unicode_ = false; |
| } |
| |
| private: |
| RegExpParserImpl<CharT>* const parser_; |
| }; |
| |
| const DisallowGarbageCollection no_gc_; |
| Zone* const zone_; |
| RegExpError error_ = RegExpError::kNone; |
| int error_pos_ = 0; |
| ZoneList<RegExpCapture*>* captures_; |
| ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_; |
| ZoneList<RegExpBackReference*>* named_back_references_; |
| const CharT* const input_; |
| const int input_length_; |
| base::uc32 current_; |
| const RegExpFlags top_level_flags_; |
| bool force_unicode_ = false; // Force parser to act as if unicode were set. |
| int next_pos_; |
| int captures_started_; |
| int capture_count_; // Only valid after we have scanned for captures. |
| bool has_more_; |
| bool simple_; |
| bool contains_anchor_; |
| bool is_scanned_for_captures_; |
| bool has_named_captures_; // Only valid after we have scanned for captures. |
| bool failed_; |
| const uintptr_t stack_limit_; |
| |
| friend bool RegExpParser::ParseRegExpFromHeapString(Isolate*, Zone*, |
| Handle<String>, |
| RegExpFlags, |
| RegExpCompileData*); |
| friend bool RegExpParser::VerifyRegExpSyntax<CharT>( |
| Zone*, uintptr_t, const CharT*, int, RegExpFlags, RegExpCompileData*, |
| const DisallowGarbageCollection&); |
| }; |
| |
| template <class CharT> |
| RegExpParserImpl<CharT>::RegExpParserImpl( |
| const CharT* input, int input_length, RegExpFlags flags, |
| uintptr_t stack_limit, Zone* zone, const DisallowGarbageCollection& no_gc) |
| : zone_(zone), |
| captures_(nullptr), |
| named_captures_(nullptr), |
| named_back_references_(nullptr), |
| input_(input), |
| input_length_(input_length), |
| current_(kEndMarker), |
| top_level_flags_(flags), |
| next_pos_(0), |
| captures_started_(0), |
| capture_count_(0), |
| has_more_(true), |
| simple_(false), |
| contains_anchor_(false), |
| is_scanned_for_captures_(false), |
| has_named_captures_(false), |
| failed_(false), |
| stack_limit_(stack_limit) { |
| Advance(); |
| } |
| |
| template <> |
| template <bool update_position> |
| inline base::uc32 RegExpParserImpl<uint8_t>::ReadNext() { |
| int position = next_pos_; |
| base::uc16 c0 = InputAt(position); |
| position++; |
| DCHECK(!unibrow::Utf16::IsLeadSurrogate(c0)); |
| if (update_position) next_pos_ = position; |
| return c0; |
| } |
| |
| template <> |
| template <bool update_position> |
| inline base::uc32 RegExpParserImpl<base::uc16>::ReadNext() { |
| int position = next_pos_; |
| base::uc16 c0 = InputAt(position); |
| base::uc32 result = c0; |
| position++; |
| // Read the whole surrogate pair in case of unicode flag, if possible. |
| if (unicode() && position < input_length() && |
| unibrow::Utf16::IsLeadSurrogate(c0)) { |
| base::uc16 c1 = InputAt(position); |
| if (unibrow::Utf16::IsTrailSurrogate(c1)) { |
| result = unibrow::Utf16::CombineSurrogatePair(c0, c1); |
| position++; |
| } |
| } |
| if (update_position) next_pos_ = position; |
| return result; |
| } |
| |
| template <class CharT> |
| base::uc32 RegExpParserImpl<CharT>::Next() { |
| if (has_next()) { |
| return ReadNext<false>(); |
| } else { |
| return kEndMarker; |
| } |
| } |
| |
| template <class CharT> |
| void RegExpParserImpl<CharT>::Advance() { |
| if (has_next()) { |
| if (GetCurrentStackPosition() < stack_limit_) { |
| if (FLAG_correctness_fuzzer_suppressions) { |
| FATAL("Aborting on stack overflow"); |
| } |
| ReportError(RegExpError::kStackOverflow); |
| } else { |
| current_ = ReadNext<true>(); |
| } |
| } else { |
| current_ = kEndMarker; |
| // Advance so that position() points to 1-after-the-last-character. This is |
| // important so that Reset() to this position works correctly. |
| next_pos_ = input_length() + 1; |
| has_more_ = false; |
| } |
| } |
| |
| template <class CharT> |
| void RegExpParserImpl<CharT>::RewindByOneCodepoint() { |
| if (current() == kEndMarker) return; |
| // Rewinds by one code point, i.e.: two code units if `current` is outside |
| // the basic multilingual plane (= composed of a lead and trail surrogate), |
| // or one code unit otherwise. |
| const int rewind_by = |
| current() > unibrow::Utf16::kMaxNonSurrogateCharCode ? -2 : -1; |
| Advance(rewind_by); // Undo the last Advance. |
| } |
| |
| template <class CharT> |
| void RegExpParserImpl<CharT>::Reset(int pos) { |
| next_pos_ = pos; |
| has_more_ = (pos < input_length()); |
| Advance(); |
| } |
| |
| template <class CharT> |
| void RegExpParserImpl<CharT>::Advance(int dist) { |
| next_pos_ += dist - 1; |
| Advance(); |
| } |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::IsSyntaxCharacterOrSlash(base::uc32 c) { |
| switch (c) { |
| case '^': |
| case '$': |
| case '\\': |
| case '.': |
| case '*': |
| case '+': |
| case '?': |
| case '(': |
| case ')': |
| case '[': |
| case ']': |
| case '{': |
| case '}': |
| case '|': |
| case '/': |
| return true; |
| default: |
| break; |
| } |
| return false; |
| } |
| |
| template <class CharT> |
| RegExpTree* RegExpParserImpl<CharT>::ReportError(RegExpError error) { |
| if (failed_) return nullptr; // Do not overwrite any existing error. |
| failed_ = true; |
| error_ = error; |
| error_pos_ = position(); |
| // Zip to the end to make sure no more input is read. |
| current_ = kEndMarker; |
| next_pos_ = input_length(); |
| return nullptr; |
| } |
| |
| #define CHECK_FAILED /**/); \ |
| if (failed_) return nullptr; \ |
| ((void)0 |
| |
| // Pattern :: |
| // Disjunction |
| template <class CharT> |
| RegExpTree* RegExpParserImpl<CharT>::ParsePattern() { |
| RegExpTree* result = ParseDisjunction(CHECK_FAILED); |
| PatchNamedBackReferences(CHECK_FAILED); |
| DCHECK(!has_more()); |
| // If the result of parsing is a literal string atom, and it has the |
| // same length as the input, then the atom is identical to the input. |
| if (result->IsAtom() && result->AsAtom()->length() == input_length()) { |
| simple_ = true; |
| } |
| return result; |
| } |
| |
| // Disjunction :: |
| // Alternative |
| // Alternative | Disjunction |
| // Alternative :: |
| // [empty] |
| // Term Alternative |
| // Term :: |
| // Assertion |
| // Atom |
| // Atom Quantifier |
| template <class CharT> |
| RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() { |
| // Used to store current state while parsing subexpressions. |
| RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, |
| 0, nullptr, top_level_flags_, zone()); |
| RegExpParserState* state = &initial_state; |
| // Cache the builder in a local variable for quick access. |
| RegExpBuilder* builder = initial_state.builder(); |
| while (true) { |
| switch (current()) { |
| case kEndMarker: |
| if (failed()) return nullptr; // E.g. the initial Advance failed. |
| if (state->IsSubexpression()) { |
| // Inside a parenthesized group when hitting end of input. |
| return ReportError(RegExpError::kUnterminatedGroup); |
| } |
| DCHECK_EQ(INITIAL, state->group_type()); |
| // Parsing completed successfully. |
| return builder->ToRegExp(); |
| case ')': { |
| if (!state->IsSubexpression()) { |
| return ReportError(RegExpError::kUnmatchedParen); |
| } |
| DCHECK_NE(INITIAL, state->group_type()); |
| |
| Advance(); |
| // End disjunction parsing and convert builder content to new single |
| // regexp atom. |
| RegExpTree* body = builder->ToRegExp(); |
| |
| int end_capture_index = captures_started(); |
| |
| int capture_index = state->capture_index(); |
| SubexpressionType group_type = state->group_type(); |
| |
| // Build result of subexpression. |
| if (group_type == CAPTURE) { |
| if (state->IsNamedCapture()) { |
| CreateNamedCaptureAtIndex(state->capture_name(), |
| capture_index CHECK_FAILED); |
| } |
| RegExpCapture* capture = GetCapture(capture_index); |
| capture->set_body(body); |
| body = capture; |
| } else if (group_type == GROUPING) { |
| body = zone()->template New<RegExpGroup>(body); |
| } else { |
| DCHECK(group_type == POSITIVE_LOOKAROUND || |
| group_type == NEGATIVE_LOOKAROUND); |
| bool is_positive = (group_type == POSITIVE_LOOKAROUND); |
| body = zone()->template New<RegExpLookaround>( |
| body, is_positive, end_capture_index - capture_index, |
| capture_index, state->lookaround_type()); |
| } |
| |
| // Restore previous state. |
| state = state->previous_state(); |
| builder = state->builder(); |
| |
| builder->AddAtom(body); |
| // For compatibility with JSC and ES3, we allow quantifiers after |
| // lookaheads, and break in all cases. |
| break; |
| } |
| case '|': { |
| Advance(); |
| builder->NewAlternative(); |
| continue; |
| } |
| case '*': |
| case '+': |
| case '?': |
| return ReportError(RegExpError::kNothingToRepeat); |
| case '^': { |
| Advance(); |
| builder->AddAssertion(zone()->template New<RegExpAssertion>( |
| builder->multiline() ? RegExpAssertion::Type::START_OF_LINE |
| : RegExpAssertion::Type::START_OF_INPUT)); |
| set_contains_anchor(); |
| continue; |
| } |
| case '$': { |
| Advance(); |
| RegExpAssertion::Type assertion_type = |
| builder->multiline() ? RegExpAssertion::Type::END_OF_LINE |
| : RegExpAssertion::Type::END_OF_INPUT; |
| builder->AddAssertion( |
| zone()->template New<RegExpAssertion>(assertion_type)); |
| continue; |
| } |
| case '.': { |
| Advance(); |
| ZoneList<CharacterRange>* ranges = |
| zone()->template New<ZoneList<CharacterRange>>(2, zone()); |
| |
| if (builder->dotall()) { |
| // Everything. |
| CharacterRange::AddClassEscape(StandardCharacterSet::kEverything, |
| ranges, false, zone()); |
| } else { |
| // Everything except \x0A, \x0D, \u2028 and \u2029. |
| CharacterRange::AddClassEscape( |
| StandardCharacterSet::kNotLineTerminator, ranges, false, zone()); |
| } |
| |
| RegExpCharacterClass* cc = |
| zone()->template New<RegExpCharacterClass>(zone(), ranges); |
| builder->AddCharacterClass(cc); |
| break; |
| } |
| case '(': { |
| state = ParseOpenParenthesis(state CHECK_FAILED); |
| builder = state->builder(); |
| continue; |
| } |
| case '[': { |
| RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED); |
| builder->AddCharacterClass(cc->AsCharacterClass()); |
| break; |
| } |
| // Atom :: |
| // \ AtomEscape |
| case '\\': |
| switch (Next()) { |
| case kEndMarker: |
| return ReportError(RegExpError::kEscapeAtEndOfPattern); |
| // AtomEscape :: |
| // [+UnicodeMode] DecimalEscape |
| // [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber |
| // of DecimalEscape is ≤ NcapturingParens |
| // CharacterEscape (some cases of this mixed in too) |
| // |
| // TODO(jgruber): It may make sense to disentangle all the different |
| // cases and make the structure mirror the spec, e.g. for AtomEscape: |
| // |
| // if (TryParseDecimalEscape(...)) return; |
| // if (TryParseCharacterClassEscape(...)) return; |
| // if (TryParseCharacterEscape(...)) return; |
| // if (TryParseGroupName(...)) return; |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': { |
| int index = 0; |
| const bool is_backref = |
| ParseBackReferenceIndex(&index CHECK_FAILED); |
| if (is_backref) { |
| if (state->IsInsideCaptureGroup(index)) { |
| // The back reference is inside the capture group it refers to. |
| // Nothing can possibly have been captured yet, so we use empty |
| // instead. This ensures that, when checking a back reference, |
| // the capture registers of the referenced capture are either |
| // both set or both cleared. |
| builder->AddEmpty(); |
| } else { |
| RegExpCapture* capture = GetCapture(index); |
| RegExpTree* atom = zone()->template New<RegExpBackReference>( |
| capture, builder->flags()); |
| builder->AddAtom(atom); |
| } |
| break; |
| } |
| // With /u, no identity escapes except for syntax characters |
| // are allowed. Otherwise, all identity escapes are allowed. |
| if (unicode()) { |
| return ReportError(RegExpError::kInvalidEscape); |
| } |
| base::uc32 first_digit = Next(); |
| if (first_digit == '8' || first_digit == '9') { |
| builder->AddCharacter(first_digit); |
| Advance(2); |
| break; |
| } |
| V8_FALLTHROUGH; |
| } |
| case '0': { |
| Advance(); |
| if (unicode() && Next() >= '0' && Next() <= '9') { |
| // With /u, decimal escape with leading 0 are not parsed as octal. |
| return ReportError(RegExpError::kInvalidDecimalEscape); |
| } |
| base::uc32 octal = ParseOctalLiteral(); |
| builder->AddCharacter(octal); |
| break; |
| } |
| case 'b': |
| Advance(2); |
| builder->AddAssertion(zone()->template New<RegExpAssertion>( |
| RegExpAssertion::Type::BOUNDARY)); |
| continue; |
| case 'B': |
| Advance(2); |
| builder->AddAssertion(zone()->template New<RegExpAssertion>( |
| RegExpAssertion::Type::NON_BOUNDARY)); |
| continue; |
| // AtomEscape :: |
| // CharacterClassEscape |
| case 'd': |
| case 'D': |
| case 's': |
| case 'S': |
| case 'w': |
| case 'W': |
| case 'p': |
| case 'P': { |
| base::uc32 next = Next(); |
| ZoneList<CharacterRange>* ranges = |
| zone()->template New<ZoneList<CharacterRange>>(2, zone()); |
| bool add_unicode_case_equivalents = |
| unicode() && builder->ignore_case(); |
| bool parsed_character_class_escape = TryParseCharacterClassEscape( |
| next, InClassEscapeState::kNotInClass, ranges, zone(), |
| add_unicode_case_equivalents CHECK_FAILED); |
| |
| if (parsed_character_class_escape) { |
| RegExpCharacterClass* cc = |
| zone()->template New<RegExpCharacterClass>(zone(), ranges); |
| builder->AddCharacterClass(cc); |
| } else { |
| CHECK(!unicode()); |
| Advance(2); |
| builder->AddCharacter(next); // IdentityEscape. |
| } |
| break; |
| } |
| // AtomEscape :: |
| // k GroupName |
| case 'k': { |
| // Either an identity escape or a named back-reference. The two |
| // interpretations are mutually exclusive: '\k' is interpreted as |
| // an identity escape for non-Unicode patterns without named |
| // capture groups, and as the beginning of a named back-reference |
| // in all other cases. |
| const bool has_named_captures = |
| HasNamedCaptures(InClassEscapeState::kNotInClass CHECK_FAILED); |
| if (unicode() || has_named_captures) { |
| Advance(2); |
| ParseNamedBackReference(builder, state CHECK_FAILED); |
| break; |
| } |
| } |
| V8_FALLTHROUGH; |
| // AtomEscape :: |
| // CharacterEscape |
| default: { |
| bool is_escaped_unicode_character = false; |
| base::uc32 c = ParseCharacterEscape( |
| InClassEscapeState::kNotInClass, |
| &is_escaped_unicode_character CHECK_FAILED); |
| if (is_escaped_unicode_character) { |
| builder->AddEscapedUnicodeCharacter(c); |
| } else { |
| builder->AddCharacter(c); |
| } |
| break; |
| } |
| } |
| break; |
| case '{': { |
| int dummy; |
| bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED); |
| if (parsed) return ReportError(RegExpError::kNothingToRepeat); |
| V8_FALLTHROUGH; |
| } |
| case '}': |
| case ']': |
| if (unicode()) { |
| return ReportError(RegExpError::kLoneQuantifierBrackets); |
| } |
| V8_FALLTHROUGH; |
| default: |
| builder->AddUnicodeCharacter(current()); |
| Advance(); |
| break; |
| } // end switch(current()) |
| |
| int min; |
| int max; |
| switch (current()) { |
| // QuantifierPrefix :: |
| // * |
| // + |
| // ? |
| // { |
| case '*': |
| min = 0; |
| max = RegExpTree::kInfinity; |
| Advance(); |
| break; |
| case '+': |
| min = 1; |
| max = RegExpTree::kInfinity; |
| Advance(); |
| break; |
| case '?': |
| min = 0; |
| max = 1; |
| Advance(); |
| break; |
| case '{': |
| if (ParseIntervalQuantifier(&min, &max)) { |
| if (max < min) { |
| return ReportError(RegExpError::kRangeOutOfOrder); |
| } |
| break; |
| } else if (unicode()) { |
| // With /u, incomplete quantifiers are not allowed. |
| return ReportError(RegExpError::kIncompleteQuantifier); |
| } |
| continue; |
| default: |
| continue; |
| } |
| RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; |
| if (current() == '?') { |
| quantifier_type = RegExpQuantifier::NON_GREEDY; |
| Advance(); |
| } else if (FLAG_regexp_possessive_quantifier && current() == '+') { |
| // FLAG_regexp_possessive_quantifier is a debug-only flag. |
| quantifier_type = RegExpQuantifier::POSSESSIVE; |
| Advance(); |
| } |
| if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) { |
| return ReportError(RegExpError::kInvalidQuantifier); |
| } |
| } |
| } |
| |
| template <class CharT> |
| RegExpParserState* RegExpParserImpl<CharT>::ParseOpenParenthesis( |
| RegExpParserState* state) { |
| RegExpLookaround::Type lookaround_type = state->lookaround_type(); |
| bool is_named_capture = false; |
| const ZoneVector<base::uc16>* capture_name = nullptr; |
| SubexpressionType subexpr_type = CAPTURE; |
| Advance(); |
| if (current() == '?') { |
| switch (Next()) { |
| case ':': |
| Advance(2); |
| subexpr_type = GROUPING; |
| break; |
| case '=': |
| Advance(2); |
| lookaround_type = RegExpLookaround::LOOKAHEAD; |
| subexpr_type = POSITIVE_LOOKAROUND; |
| break; |
| case '!': |
| Advance(2); |
| lookaround_type = RegExpLookaround::LOOKAHEAD; |
| subexpr_type = NEGATIVE_LOOKAROUND; |
| break; |
| case '<': |
| Advance(); |
| if (Next() == '=') { |
| Advance(2); |
| lookaround_type = RegExpLookaround::LOOKBEHIND; |
| subexpr_type = POSITIVE_LOOKAROUND; |
| break; |
| } else if (Next() == '!') { |
| Advance(2); |
| lookaround_type = RegExpLookaround::LOOKBEHIND; |
| subexpr_type = NEGATIVE_LOOKAROUND; |
| break; |
| } |
| is_named_capture = true; |
| has_named_captures_ = true; |
| Advance(); |
| break; |
| default: |
| ReportError(RegExpError::kInvalidGroup); |
| return nullptr; |
| } |
| } |
| if (subexpr_type == CAPTURE) { |
| if (captures_started_ >= RegExpMacroAssembler::kMaxRegisterCount) { |
| ReportError(RegExpError::kTooManyCaptures); |
| return nullptr; |
| } |
| captures_started_++; |
| |
| if (is_named_capture) { |
| capture_name = ParseCaptureGroupName(CHECK_FAILED); |
| } |
| } |
| // Store current state and begin new disjunction parsing. |
| return zone()->template New<RegExpParserState>( |
| state, subexpr_type, lookaround_type, captures_started_, capture_name, |
| state->builder()->flags(), zone()); |
| } |
| |
| #ifdef DEBUG |
| namespace { |
| |
| bool IsSpecialClassEscape(base::uc32 c) { |
| switch (c) { |
| case 'd': |
| case 'D': |
| case 's': |
| case 'S': |
| case 'w': |
| case 'W': |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| } // namespace |
| #endif |
| |
| // In order to know whether an escape is a backreference or not we have to scan |
| // the entire regexp and find the number of capturing parentheses. However we |
| // don't want to scan the regexp twice unless it is necessary. This mini-parser |
| // is called when needed. It can see the difference between capturing and |
| // noncapturing parentheses and can skip character classes and backslash-escaped |
| // characters. |
| // |
| // Important: The scanner has to be in a consistent state when calling |
| // ScanForCaptures, e.g. not in the middle of an escape sequence '\['. |
| template <class CharT> |
| void RegExpParserImpl<CharT>::ScanForCaptures( |
| InClassEscapeState in_class_escape_state) { |
| DCHECK(!is_scanned_for_captures_); |
| const int saved_position = position(); |
| // Start with captures started previous to current position |
| int capture_count = captures_started(); |
| // When we start inside a character class, skip everything inside the class. |
| if (in_class_escape_state == InClassEscapeState::kInClass) { |
| int c; |
| while ((c = current()) != kEndMarker) { |
| Advance(); |
| if (c == '\\') { |
| Advance(); |
| } else { |
| if (c == ']') break; |
| } |
| } |
| } |
| // Add count of captures after this position. |
| int n; |
| while ((n = current()) != kEndMarker) { |
| Advance(); |
| switch (n) { |
| case '\\': |
| Advance(); |
| break; |
| case '[': { |
| int c; |
| while ((c = current()) != kEndMarker) { |
| Advance(); |
| if (c == '\\') { |
| Advance(); |
| } else { |
| if (c == ']') break; |
| } |
| } |
| break; |
| } |
| case '(': |
| if (current() == '?') { |
| // At this point we could be in |
| // * a non-capturing group '(:', |
| // * a lookbehind assertion '(?<=' '(?<!' |
| // * or a named capture '(?<'. |
| // |
| // Of these, only named captures are capturing groups. |
| |
| Advance(); |
| if (current() != '<') break; |
| |
| Advance(); |
| if (current() == '=' || current() == '!') break; |
| |
| // Found a possible named capture. It could turn out to be a syntax |
| // error (e.g. an unterminated or invalid name), but that distinction |
| // does not matter for our purposes. |
| has_named_captures_ = true; |
| } |
| capture_count++; |
| break; |
| } |
| } |
| capture_count_ = capture_count; |
| is_scanned_for_captures_ = true; |
| Reset(saved_position); |
| } |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::ParseBackReferenceIndex(int* index_out) { |
| DCHECK_EQ('\\', current()); |
| DCHECK('1' <= Next() && Next() <= '9'); |
| // Try to parse a decimal literal that is no greater than the total number |
| // of left capturing parentheses in the input. |
| int start = position(); |
| int value = Next() - '0'; |
| Advance(2); |
| while (true) { |
| base::uc32 c = current(); |
| if (IsDecimalDigit(c)) { |
| value = 10 * value + (c - '0'); |
| if (value > RegExpMacroAssembler::kMaxRegisterCount) { |
| Reset(start); |
| return false; |
| } |
| Advance(); |
| } else { |
| break; |
| } |
| } |
| if (value > captures_started()) { |
| if (!is_scanned_for_captures_) |
| ScanForCaptures(InClassEscapeState::kNotInClass); |
| if (value > capture_count_) { |
| Reset(start); |
| return false; |
| } |
| } |
| *index_out = value; |
| return true; |
| } |
| |
| namespace { |
| |
| void push_code_unit(ZoneVector<base::uc16>* v, uint32_t code_unit) { |
| if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { |
| v->push_back(code_unit); |
| } else { |
| v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); |
| v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); |
| } |
| } |
| |
| } // namespace |
| |
| template <class CharT> |
| const ZoneVector<base::uc16>* RegExpParserImpl<CharT>::ParseCaptureGroupName() { |
| // Due to special Advance requirements (see the next comment), rewind by one |
| // such that names starting with a surrogate pair are parsed correctly for |
| // patterns where the unicode flag is unset. |
| // |
| // Note that we use this odd pattern of rewinding the last advance in order |
| // to adhere to the common parser behavior of expecting `current` to point at |
| // the first candidate character for a function (e.g. when entering ParseFoo, |
| // `current` should point at the first character of Foo). |
| RewindByOneCodepoint(); |
| |
| ZoneVector<base::uc16>* name = |
| zone()->template New<ZoneVector<base::uc16>>(zone()); |
| |
| { |
| // Advance behavior inside this function is tricky since |
| // RegExpIdentifierName explicitly enables unicode (in spec terms, sets +U) |
| // and thus allows surrogate pairs and \u{}-style escapes even in |
| // non-unicode patterns. Therefore Advance within the capture group name |
| // has to force-enable unicode, and outside the name revert to default |
| // behavior. |
| ForceUnicodeScope force_unicode(this); |
| |
| bool at_start = true; |
| while (true) { |
| Advance(); |
| base::uc32 c = current(); |
| |
| // Convert unicode escapes. |
| if (c == '\\' && Next() == 'u') { |
| Advance(2); |
| if (!ParseUnicodeEscape(&c)) { |
| ReportError(RegExpError::kInvalidUnicodeEscape); |
| return nullptr; |
| } |
| RewindByOneCodepoint(); |
| } |
| |
| // The backslash char is misclassified as both ID_Start and ID_Continue. |
| if (c == '\\') { |
| ReportError(RegExpError::kInvalidCaptureGroupName); |
| return nullptr; |
| } |
| |
| if (at_start) { |
| if (!IsIdentifierStart(c)) { |
| ReportError(RegExpError::kInvalidCaptureGroupName); |
| return nullptr; |
| } |
| push_code_unit(name, c); |
| at_start = false; |
| } else { |
| if (c == '>') { |
| break; |
| } else if (IsIdentifierPart(c)) { |
| push_code_unit(name, c); |
| } else { |
| ReportError(RegExpError::kInvalidCaptureGroupName); |
| return nullptr; |
| } |
| } |
| } |
| } |
| |
| // This final advance goes back into the state of pointing at the next |
| // relevant char, which the rest of the parser expects. See also the previous |
| // comments in this function. |
| Advance(); |
| return name; |
| } |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::CreateNamedCaptureAtIndex( |
| const ZoneVector<base::uc16>* name, int index) { |
| DCHECK(0 < index && index <= captures_started_); |
| DCHECK_NOT_NULL(name); |
| |
| RegExpCapture* capture = GetCapture(index); |
| DCHECK_NULL(capture->name()); |
| |
| capture->set_name(name); |
| |
| if (named_captures_ == nullptr) { |
| named_captures_ = |
| zone_->template New<ZoneSet<RegExpCapture*, RegExpCaptureNameLess>>( |
| zone()); |
| } else { |
| // Check for duplicates and bail if we find any. |
| |
| const auto& named_capture_it = named_captures_->find(capture); |
| if (named_capture_it != named_captures_->end()) { |
| ReportError(RegExpError::kDuplicateCaptureGroupName); |
| return false; |
| } |
| } |
| |
| named_captures_->emplace(capture); |
| |
| return true; |
| } |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::ParseNamedBackReference( |
| RegExpBuilder* builder, RegExpParserState* state) { |
| // The parser is assumed to be on the '<' in \k<name>. |
| if (current() != '<') { |
| ReportError(RegExpError::kInvalidNamedReference); |
| return false; |
| } |
| |
| Advance(); |
| const ZoneVector<base::uc16>* name = ParseCaptureGroupName(); |
| if (name == nullptr) { |
| return false; |
| } |
| |
| if (state->IsInsideCaptureGroup(name)) { |
| builder->AddEmpty(); |
| } else { |
| RegExpBackReference* atom = |
| zone()->template New<RegExpBackReference>(builder->flags()); |
| atom->set_name(name); |
| |
| builder->AddAtom(atom); |
| |
| if (named_back_references_ == nullptr) { |
| named_back_references_ = |
| zone()->template New<ZoneList<RegExpBackReference*>>(1, zone()); |
| } |
| named_back_references_->Add(atom, zone()); |
| } |
| |
| return true; |
| } |
| |
| template <class CharT> |
| void RegExpParserImpl<CharT>::PatchNamedBackReferences() { |
| if (named_back_references_ == nullptr) return; |
| |
| if (named_captures_ == nullptr) { |
| ReportError(RegExpError::kInvalidNamedCaptureReference); |
| return; |
| } |
| |
| // Look up and patch the actual capture for each named back reference. |
| |
| for (int i = 0; i < named_back_references_->length(); i++) { |
| RegExpBackReference* ref = named_back_references_->at(i); |
| |
| // Capture used to search the named_captures_ by name, index of the |
| // capture is never used. |
| static const int kInvalidIndex = 0; |
| RegExpCapture* search_capture = |
| zone()->template New<RegExpCapture>(kInvalidIndex); |
| DCHECK_NULL(search_capture->name()); |
| search_capture->set_name(ref->name()); |
| |
| int index = -1; |
| const auto& capture_it = named_captures_->find(search_capture); |
| if (capture_it != named_captures_->end()) { |
| index = (*capture_it)->index(); |
| } else { |
| ReportError(RegExpError::kInvalidNamedCaptureReference); |
| return; |
| } |
| |
| ref->set_capture(GetCapture(index)); |
| } |
| } |
| |
| template <class CharT> |
| RegExpCapture* RegExpParserImpl<CharT>::GetCapture(int index) { |
| // The index for the capture groups are one-based. Its index in the list is |
| // zero-based. |
| const int known_captures = |
| is_scanned_for_captures_ ? capture_count_ : captures_started_; |
| DCHECK(index <= known_captures); |
| if (captures_ == nullptr) { |
| captures_ = |
| zone()->template New<ZoneList<RegExpCapture*>>(known_captures, zone()); |
| } |
| while (captures_->length() < known_captures) { |
| captures_->Add(zone()->template New<RegExpCapture>(captures_->length() + 1), |
| zone()); |
| } |
| return captures_->at(index - 1); |
| } |
| |
| template <class CharT> |
| ZoneVector<RegExpCapture*>* RegExpParserImpl<CharT>::GetNamedCaptures() const { |
| if (named_captures_ == nullptr || named_captures_->empty()) { |
| return nullptr; |
| } |
| |
| return zone()->template New<ZoneVector<RegExpCapture*>>( |
| named_captures_->begin(), named_captures_->end(), zone()); |
| } |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::HasNamedCaptures( |
| InClassEscapeState in_class_escape_state) { |
| if (has_named_captures_ || is_scanned_for_captures_) { |
| return has_named_captures_; |
| } |
| |
| ScanForCaptures(in_class_escape_state); |
| DCHECK(is_scanned_for_captures_); |
| return has_named_captures_; |
| } |
| |
| // QuantifierPrefix :: |
| // { DecimalDigits } |
| // { DecimalDigits , } |
| // { DecimalDigits , DecimalDigits } |
| // |
| // Returns true if parsing succeeds, and set the min_out and max_out |
| // values. Values are truncated to RegExpTree::kInfinity if they overflow. |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::ParseIntervalQuantifier(int* min_out, |
| int* max_out) { |
| DCHECK_EQ(current(), '{'); |
| int start = position(); |
| Advance(); |
| int min = 0; |
| if (!IsDecimalDigit(current())) { |
| Reset(start); |
| return false; |
| } |
| while (IsDecimalDigit(current())) { |
| int next = current() - '0'; |
| if (min > (RegExpTree::kInfinity - next) / 10) { |
| // Overflow. Skip past remaining decimal digits and return -1. |
| do { |
| Advance(); |
| } while (IsDecimalDigit(current())); |
| min = RegExpTree::kInfinity; |
| break; |
| } |
| min = 10 * min + next; |
| Advance(); |
| } |
| int max = 0; |
| if (current() == '}') { |
| max = min; |
| Advance(); |
| } else if (current() == ',') { |
| Advance(); |
| if (current() == '}') { |
| max = RegExpTree::kInfinity; |
| Advance(); |
| } else { |
| while (IsDecimalDigit(current())) { |
| int next = current() - '0'; |
| if (max > (RegExpTree::kInfinity - next) / 10) { |
| do { |
| Advance(); |
| } while (IsDecimalDigit(current())); |
| max = RegExpTree::kInfinity; |
| break; |
| } |
| max = 10 * max + next; |
| Advance(); |
| } |
| if (current() != '}') { |
| Reset(start); |
| return false; |
| } |
| Advance(); |
| } |
| } else { |
| Reset(start); |
| return false; |
| } |
| *min_out = min; |
| *max_out = max; |
| return true; |
| } |
| |
| template <class CharT> |
| base::uc32 RegExpParserImpl<CharT>::ParseOctalLiteral() { |
| DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker); |
| // For compatibility with some other browsers (not all), we parse |
| // up to three octal digits with a value below 256. |
| // ES#prod-annexB-LegacyOctalEscapeSequence |
| base::uc32 value = current() - '0'; |
| Advance(); |
| if ('0' <= current() && current() <= '7') { |
| value = value * 8 + current() - '0'; |
| Advance(); |
| if (value < 32 && '0' <= current() && current() <= '7') { |
| value = value * 8 + current() - '0'; |
| Advance(); |
| } |
| } |
| return value; |
| } |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::ParseHexEscape(int length, base::uc32* value) { |
| int start = position(); |
| base::uc32 val = 0; |
| for (int i = 0; i < length; ++i) { |
| base::uc32 c = current(); |
| int d = base::HexValue(c); |
| if (d < 0) { |
| Reset(start); |
| return false; |
| } |
| val = val * 16 + d; |
| Advance(); |
| } |
| *value = val; |
| return true; |
| } |
| |
| // This parses RegExpUnicodeEscapeSequence as described in ECMA262. |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::ParseUnicodeEscape(base::uc32* value) { |
| // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are |
| // allowed). In the latter case, the number of hex digits between { } is |
| // arbitrary. \ and u have already been read. |
| if (current() == '{' && unicode()) { |
| int start = position(); |
| Advance(); |
| if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) { |
| if (current() == '}') { |
| Advance(); |
| return true; |
| } |
| } |
| Reset(start); |
| return false; |
| } |
| // \u but no {, or \u{...} escapes not allowed. |
| bool result = ParseHexEscape(4, value); |
| if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) && |
| current() == '\\') { |
| // Attempt to read trail surrogate. |
| int start = position(); |
| if (Next() == 'u') { |
| Advance(2); |
| base::uc32 trail; |
| if (ParseHexEscape(4, &trail) && |
| unibrow::Utf16::IsTrailSurrogate(trail)) { |
| *value = unibrow::Utf16::CombineSurrogatePair( |
| static_cast<base::uc16>(*value), static_cast<base::uc16>(trail)); |
| return true; |
| } |
| } |
| Reset(start); |
| } |
| return result; |
| } |
| |
| #ifdef V8_INTL_SUPPORT |
| |
| namespace { |
| |
| bool IsExactPropertyAlias(const char* property_name, UProperty property) { |
| const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME); |
| if (short_name != nullptr && strcmp(property_name, short_name) == 0) |
| return true; |
| for (int i = 0;; i++) { |
| const char* long_name = u_getPropertyName( |
| property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); |
| if (long_name == nullptr) break; |
| if (strcmp(property_name, long_name) == 0) return true; |
| } |
| return false; |
| } |
| |
| bool IsExactPropertyValueAlias(const char* property_value_name, |
| UProperty property, int32_t property_value) { |
| const char* short_name = |
| u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME); |
| if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) { |
| return true; |
| } |
| for (int i = 0;; i++) { |
| const char* long_name = u_getPropertyValueName( |
| property, property_value, |
| static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); |
| if (long_name == nullptr) break; |
| if (strcmp(property_value_name, long_name) == 0) return true; |
| } |
| return false; |
| } |
| |
| bool LookupPropertyValueName(UProperty property, |
| const char* property_value_name, bool negate, |
| ZoneList<CharacterRange>* result, Zone* zone) { |
| UProperty property_for_lookup = property; |
| if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) { |
| // For the property Script_Extensions, we have to do the property value |
| // name lookup as if the property is Script. |
| property_for_lookup = UCHAR_SCRIPT; |
| } |
| int32_t property_value = |
| u_getPropertyValueEnum(property_for_lookup, property_value_name); |
| if (property_value == UCHAR_INVALID_CODE) return false; |
| |
| // We require the property name to match exactly to one of the property value |
| // aliases. However, u_getPropertyValueEnum uses loose matching. |
| if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup, |
| property_value)) { |
| return false; |
| } |
| |
| UErrorCode ec = U_ZERO_ERROR; |
| icu::UnicodeSet set; |
| set.applyIntPropertyValue(property, property_value, ec); |
| bool success = ec == U_ZERO_ERROR && !set.isEmpty(); |
| |
| if (success) { |
| set.removeAllStrings(); |
| if (negate) set.complement(); |
| for (int i = 0; i < set.getRangeCount(); i++) { |
| result->Add( |
| CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), |
| zone); |
| } |
| } |
| return success; |
| } |
| |
| template <size_t N> |
| inline bool NameEquals(const char* name, const char (&literal)[N]) { |
| return strncmp(name, literal, N + 1) == 0; |
| } |
| |
| bool LookupSpecialPropertyValueName(const char* name, |
| ZoneList<CharacterRange>* result, |
| bool negate, Zone* zone) { |
| if (NameEquals(name, "Any")) { |
| if (negate) { |
| // Leave the list of character ranges empty, since the negation of 'Any' |
| // is the empty set. |
| } else { |
| result->Add(CharacterRange::Everything(), zone); |
| } |
| } else if (NameEquals(name, "ASCII")) { |
| result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint) |
| : CharacterRange::Range(0x0, 0x7F), |
| zone); |
| } else if (NameEquals(name, "Assigned")) { |
| return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned", |
| !negate, result, zone); |
| } else { |
| return false; |
| } |
| return true; |
| } |
| |
| // Explicitly allowlist supported binary properties. The spec forbids supporting |
| // properties outside of this set to ensure interoperability. |
| bool IsSupportedBinaryProperty(UProperty property) { |
| switch (property) { |
| case UCHAR_ALPHABETIC: |
| // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName. |
| // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName. |
| case UCHAR_ASCII_HEX_DIGIT: |
| // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName. |
| case UCHAR_BIDI_CONTROL: |
| case UCHAR_BIDI_MIRRORED: |
| case UCHAR_CASE_IGNORABLE: |
| case UCHAR_CASED: |
| case UCHAR_CHANGES_WHEN_CASEFOLDED: |
| case UCHAR_CHANGES_WHEN_CASEMAPPED: |
| case UCHAR_CHANGES_WHEN_LOWERCASED: |
| case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED: |
| case UCHAR_CHANGES_WHEN_TITLECASED: |
| case UCHAR_CHANGES_WHEN_UPPERCASED: |
| case UCHAR_DASH: |
| case UCHAR_DEFAULT_IGNORABLE_CODE_POINT: |
| case UCHAR_DEPRECATED: |
| case UCHAR_DIACRITIC: |
| case UCHAR_EMOJI: |
| case UCHAR_EMOJI_COMPONENT: |
| case UCHAR_EMOJI_MODIFIER_BASE: |
| case UCHAR_EMOJI_MODIFIER: |
| case UCHAR_EMOJI_PRESENTATION: |
| case UCHAR_EXTENDED_PICTOGRAPHIC: |
| case UCHAR_EXTENDER: |
| case UCHAR_GRAPHEME_BASE: |
| case UCHAR_GRAPHEME_EXTEND: |
| case UCHAR_HEX_DIGIT: |
| case UCHAR_ID_CONTINUE: |
| case UCHAR_ID_START: |
| case UCHAR_IDEOGRAPHIC: |
| case UCHAR_IDS_BINARY_OPERATOR: |
| case UCHAR_IDS_TRINARY_OPERATOR: |
| case UCHAR_JOIN_CONTROL: |
| case UCHAR_LOGICAL_ORDER_EXCEPTION: |
| case UCHAR_LOWERCASE: |
| case UCHAR_MATH: |
| case UCHAR_NONCHARACTER_CODE_POINT: |
| case UCHAR_PATTERN_SYNTAX: |
| case UCHAR_PATTERN_WHITE_SPACE: |
| case UCHAR_QUOTATION_MARK: |
| case UCHAR_RADICAL: |
| case UCHAR_REGIONAL_INDICATOR: |
| case UCHAR_S_TERM: |
| case UCHAR_SOFT_DOTTED: |
| case UCHAR_TERMINAL_PUNCTUATION: |
| case UCHAR_UNIFIED_IDEOGRAPH: |
| case UCHAR_UPPERCASE: |
| case UCHAR_VARIATION_SELECTOR: |
| case UCHAR_WHITE_SPACE: |
| case UCHAR_XID_CONTINUE: |
| case UCHAR_XID_START: |
| return true; |
| default: |
| break; |
| } |
| return false; |
| } |
| |
| bool IsUnicodePropertyValueCharacter(char c) { |
| // https://tc39.github.io/proposal-regexp-unicode-property-escapes/ |
| // |
| // Note that using this to validate each parsed char is quite conservative. |
| // A possible alternative solution would be to only ensure the parsed |
| // property name/value candidate string does not contain '\0' characters and |
| // let ICU lookups trigger the final failure. |
| if ('a' <= c && c <= 'z') return true; |
| if ('A' <= c && c <= 'Z') return true; |
| if ('0' <= c && c <= '9') return true; |
| return (c == '_'); |
| } |
| |
| } // namespace |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1, |
| ZoneVector<char>* name_2) { |
| DCHECK(name_1->empty()); |
| DCHECK(name_2->empty()); |
| // Parse the property class as follows: |
| // - In \p{name}, 'name' is interpreted |
| // - either as a general category property value name. |
| // - or as a binary property name. |
| // - In \p{name=value}, 'name' is interpreted as an enumerated property name, |
| // and 'value' is interpreted as one of the available property value names. |
| // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used. |
| // - Loose matching is not applied. |
| if (current() == '{') { |
| // Parse \p{[PropertyName=]PropertyNameValue} |
| for (Advance(); current() != '}' && current() != '='; Advance()) { |
| if (!IsUnicodePropertyValueCharacter(current())) return false; |
| if (!has_next()) return false; |
| name_1->push_back(static_cast<char>(current())); |
| } |
| if (current() == '=') { |
| for (Advance(); current() != '}'; Advance()) { |
| if (!IsUnicodePropertyValueCharacter(current())) return false; |
| if (!has_next()) return false; |
| name_2->push_back(static_cast<char>(current())); |
| } |
| name_2->push_back(0); // null-terminate string. |
| } |
| } else { |
| return false; |
| } |
| Advance(); |
| name_1->push_back(0); // null-terminate string. |
| |
| DCHECK(name_1->size() - 1 == std::strlen(name_1->data())); |
| DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data())); |
| return true; |
| } |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::AddPropertyClassRange( |
| ZoneList<CharacterRange>* add_to, bool negate, |
| const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) { |
| if (name_2.empty()) { |
| // First attempt to interpret as general category property value name. |
| const char* name = name_1.data(); |
| if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate, |
| add_to, zone())) { |
| return true; |
| } |
| // Interpret "Any", "ASCII", and "Assigned". |
| if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) { |
| return true; |
| } |
| // Then attempt to interpret as binary property name with value name 'Y'. |
| UProperty property = u_getPropertyEnum(name); |
| if (!IsSupportedBinaryProperty(property)) return false; |
| if (!IsExactPropertyAlias(name, property)) return false; |
| return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to, |
| zone()); |
| } else { |
| // Both property name and value name are specified. Attempt to interpret |
| // the property name as enumerated property. |
| const char* property_name = name_1.data(); |
| const char* value_name = name_2.data(); |
| UProperty property = u_getPropertyEnum(property_name); |
| if (!IsExactPropertyAlias(property_name, property)) return false; |
| if (property == UCHAR_GENERAL_CATEGORY) { |
| // We want to allow aggregate value names such as "Letter". |
| property = UCHAR_GENERAL_CATEGORY_MASK; |
| } else if (property != UCHAR_SCRIPT && |
| property != UCHAR_SCRIPT_EXTENSIONS) { |
| return false; |
| } |
| return LookupPropertyValueName(property, value_name, negate, add_to, |
| zone()); |
| } |
| } |
| |
| #else // V8_INTL_SUPPORT |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1, |
| ZoneVector<char>* name_2) { |
| return false; |
| } |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::AddPropertyClassRange( |
| ZoneList<CharacterRange>* add_to, bool negate, |
| const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) { |
| return false; |
| } |
| |
| #endif // V8_INTL_SUPPORT |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::ParseUnlimitedLengthHexNumber(int max_value, |
| base::uc32* value) { |
| base::uc32 x = 0; |
| int d = base::HexValue(current()); |
| if (d < 0) { |
| return false; |
| } |
| while (d >= 0) { |
| x = x * 16 + d; |
| if (x > static_cast<base::uc32>(max_value)) { |
| return false; |
| } |
| Advance(); |
| d = base::HexValue(current()); |
| } |
| *value = x; |
| return true; |
| } |
| |
| // https://tc39.es/ecma262/#prod-CharacterEscape |
| template <class CharT> |
| base::uc32 RegExpParserImpl<CharT>::ParseCharacterEscape( |
| InClassEscapeState in_class_escape_state, |
| bool* is_escaped_unicode_character) { |
| DCHECK_EQ('\\', current()); |
| DCHECK(has_next() && !IsSpecialClassEscape(Next())); |
| |
| Advance(); |
| |
| const base::uc32 c = current(); |
| switch (c) { |
| // CharacterEscape :: |
| // ControlEscape :: one of |
| // f n r t v |
| case 'f': |
| Advance(); |
| return '\f'; |
| case 'n': |
| Advance(); |
| return '\n'; |
| case 'r': |
| Advance(); |
| return '\r'; |
| case 't': |
| Advance(); |
| return '\t'; |
| case 'v': |
| Advance(); |
| return '\v'; |
| // CharacterEscape :: |
| // c ControlLetter |
| case 'c': { |
| base::uc32 controlLetter = Next(); |
| base::uc32 letter = controlLetter & ~('A' ^ 'a'); |
| if (letter >= 'A' && letter <= 'Z') { |
| Advance(2); |
| // Control letters mapped to ASCII control characters in the range |
| // 0x00-0x1F. |
| return controlLetter & 0x1F; |
| } |
| if (unicode()) { |
| // With /u, invalid escapes are not treated as identity escapes. |
| ReportError(RegExpError::kInvalidUnicodeEscape); |
| return 0; |
| } |
| if (in_class_escape_state == InClassEscapeState::kInClass) { |
| // Inside a character class, we also accept digits and underscore as |
| // control characters, unless with /u. See Annex B: |
| // ES#prod-annexB-ClassControlLetter |
| if ((controlLetter >= '0' && controlLetter <= '9') || |
| controlLetter == '_') { |
| Advance(2); |
| return controlLetter & 0x1F; |
| } |
| } |
| // We match JSC in reading the backslash as a literal |
| // character instead of as starting an escape. |
| return '\\'; |
| } |
| // CharacterEscape :: |
| // 0 [lookahead ∉ DecimalDigit] |
| // [~UnicodeMode] LegacyOctalEscapeSequence |
| case '0': |
| // \0 is interpreted as NUL if not followed by another digit. |
| if (Next() < '0' || Next() > '9') { |
| Advance(); |
| return 0; |
| } |
| V8_FALLTHROUGH; |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| // For compatibility, we interpret a decimal escape that isn't |
| // a back reference (and therefore either \0 or not valid according |
| // to the specification) as a 1..3 digit octal character code. |
| // ES#prod-annexB-LegacyOctalEscapeSequence |
| if (unicode()) { |
| // With /u, decimal escape is not interpreted as octal character code. |
| ReportError(RegExpError::kInvalidClassEscape); |
| return 0; |
| } |
| return ParseOctalLiteral(); |
| // CharacterEscape :: |
| // HexEscapeSequence |
| case 'x': { |
| Advance(); |
| base::uc32 value; |
| if (ParseHexEscape(2, &value)) return value; |
| if (unicode()) { |
| // With /u, invalid escapes are not treated as identity escapes. |
| ReportError(RegExpError::kInvalidEscape); |
| return 0; |
| } |
| // If \x is not followed by a two-digit hexadecimal, treat it |
| // as an identity escape. |
| return 'x'; |
| } |
| // CharacterEscape :: |
| // RegExpUnicodeEscapeSequence [?UnicodeMode] |
| case 'u': { |
| Advance(); |
| base::uc32 value; |
| if (ParseUnicodeEscape(&value)) { |
| *is_escaped_unicode_character = true; |
| return value; |
| } |
| if (unicode()) { |
| // With /u, invalid escapes are not treated as identity escapes. |
| ReportError(RegExpError::kInvalidUnicodeEscape); |
| return 0; |
| } |
| // If \u is not followed by a two-digit hexadecimal, treat it |
| // as an identity escape. |
| return 'u'; |
| } |
| default: |
| break; |
| } |
| |
| // CharacterEscape :: |
| // IdentityEscape[?UnicodeMode, ?N] |
| // |
| // * With /u, no identity escapes except for syntax characters are |
| // allowed. |
| // * Without /u: |
| // * '\c' is not an IdentityEscape. |
| // * '\k' is not an IdentityEscape when named captures exist. |
| // * Otherwise, all identity escapes are allowed. |
| if (unicode()) { |
| if (!IsSyntaxCharacterOrSlash(c)) { |
| ReportError(RegExpError::kInvalidEscape); |
| return 0; |
| } |
| Advance(); |
| return c; |
| } |
| DCHECK(!unicode()); |
| if (c == 'c') { |
| ReportError(RegExpError::kInvalidEscape); |
| return 0; |
| } |
| Advance(); |
| // Note: It's important to Advance before the HasNamedCaptures call s.t. we |
| // don't start scanning in the middle of an escape. |
| if (c == 'k' && HasNamedCaptures(in_class_escape_state)) { |
| ReportError(RegExpError::kInvalidEscape); |
| return 0; |
| } |
| return c; |
| } |
| |
| // https://tc39.es/ecma262/#prod-ClassEscape |
| template <class CharT> |
| void RegExpParserImpl<CharT>::ParseClassEscape( |
| ZoneList<CharacterRange>* ranges, Zone* zone, |
| bool add_unicode_case_equivalents, base::uc32* char_out, |
| bool* is_class_escape) { |
| *is_class_escape = false; |
| |
| if (current() != '\\') { |
| // Not a ClassEscape. |
| *char_out = current(); |
| Advance(); |
| return; |
| } |
| |
| const base::uc32 next = Next(); |
| switch (next) { |
| case 'b': |
| *char_out = '\b'; |
| Advance(2); |
| return; |
| case '-': |
| if (unicode()) { |
| *char_out = next; |
| Advance(2); |
| return; |
| } |
| break; |
| case kEndMarker: |
| ReportError(RegExpError::kEscapeAtEndOfPattern); |
| return; |
| default: |
| break; |
| } |
| |
| static constexpr InClassEscapeState kInClassEscape = |
| InClassEscapeState::kInClass; |
| *is_class_escape = TryParseCharacterClassEscape( |
| next, kInClassEscape, ranges, zone, add_unicode_case_equivalents); |
| if (*is_class_escape) return; |
| |
| bool dummy = false; // Unused. |
| *char_out = ParseCharacterEscape(kInClassEscape, &dummy); |
| } |
| |
| // https://tc39.es/ecma262/#prod-CharacterClassEscape |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::TryParseCharacterClassEscape( |
| base::uc32 next, InClassEscapeState in_class_escape_state, |
| ZoneList<CharacterRange>* ranges, Zone* zone, |
| bool add_unicode_case_equivalents) { |
| DCHECK_EQ(current(), '\\'); |
| DCHECK_EQ(Next(), next); |
| |
| switch (next) { |
| case 'd': |
| case 'D': |
| case 's': |
| case 'S': |
| case 'w': |
| case 'W': |
| CharacterRange::AddClassEscape(static_cast<StandardCharacterSet>(next), |
| ranges, add_unicode_case_equivalents, |
| zone); |
| Advance(2); |
| return true; |
| case 'p': |
| case 'P': { |
| if (!unicode()) return false; |
| bool negate = next == 'P'; |
| Advance(2); |
| ZoneVector<char> name_1(zone); |
| ZoneVector<char> name_2(zone); |
| if (!ParsePropertyClassName(&name_1, &name_2) || |
| !AddPropertyClassRange(ranges, negate, name_1, name_2)) { |
| ReportError(in_class_escape_state == InClassEscapeState::kInClass |
| ? RegExpError::kInvalidClassPropertyName |
| : RegExpError::kInvalidPropertyName); |
| } |
| return true; |
| } |
| default: |
| return false; |
| } |
| } |
| |
| template <class CharT> |
| RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass( |
| const RegExpBuilder* builder) { |
| DCHECK_EQ(current(), '['); |
| Advance(); |
| bool is_negated = false; |
| if (current() == '^') { |
| is_negated = true; |
| Advance(); |
| } |
| ZoneList<CharacterRange>* ranges = |
| zone()->template New<ZoneList<CharacterRange>>(2, zone()); |
| bool add_unicode_case_equivalents = unicode() && builder->ignore_case(); |
| while (has_more() && current() != ']') { |
| base::uc32 char_1, char_2; |
| bool is_class_1, is_class_2; |
| ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1, |
| &is_class_1 CHECK_FAILED); |
| if (current() == '-') { |
| Advance(); |
| if (current() == kEndMarker) { |
| // If we reach the end we break out of the loop and let the |
| // following code report an error. |
| break; |
| } else if (current() == ']') { |
| if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); |
| ranges->Add(CharacterRange::Singleton('-'), zone()); |
| break; |
| } |
| ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2, |
| &is_class_2 CHECK_FAILED); |
| if (is_class_1 || is_class_2) { |
| // Either end is an escaped character class. Treat the '-' verbatim. |
| if (unicode()) { |
| // ES2015 21.2.2.15.1 step 1. |
| return ReportError(RegExpError::kInvalidCharacterClass); |
| } |
| if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); |
| ranges->Add(CharacterRange::Singleton('-'), zone()); |
| if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone()); |
| continue; |
| } |
| // ES2015 21.2.2.15.1 step 6. |
| if (char_1 > char_2) { |
| return ReportError(RegExpError::kOutOfOrderCharacterClass); |
| } |
| ranges->Add(CharacterRange::Range(char_1, char_2), zone()); |
| } else { |
| if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); |
| } |
| } |
| if (!has_more()) { |
| return ReportError(RegExpError::kUnterminatedCharacterClass); |
| } |
| Advance(); |
| RegExpCharacterClass::CharacterClassFlags character_class_flags; |
| if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED; |
| return zone()->template New<RegExpCharacterClass>(zone(), ranges, |
| character_class_flags); |
| } |
| |
| #undef CHECK_FAILED |
| |
| template <class CharT> |
| bool RegExpParserImpl<CharT>::Parse(RegExpCompileData* result) { |
| DCHECK_NOT_NULL(result); |
| RegExpTree* tree = ParsePattern(); |
| |
| if (failed()) { |
| DCHECK_NULL(tree); |
| DCHECK_NE(error_, RegExpError::kNone); |
| result->error = error_; |
| result->error_pos = error_pos_; |
| return false; |
| } |
| |
| DCHECK_NOT_NULL(tree); |
| DCHECK_EQ(error_, RegExpError::kNone); |
| if (FLAG_trace_regexp_parser) { |
| StdoutStream os; |
| tree->Print(os, zone()); |
| os << "\n"; |
| } |
| |
| result->tree = tree; |
| const int capture_count = captures_started(); |
| result->simple = tree->IsAtom() && simple() && capture_count == 0; |
| result->contains_anchor = contains_anchor(); |
| result->capture_count = capture_count; |
| result->named_captures = GetNamedCaptures(); |
| return true; |
| } |
| |
| void RegExpBuilder::AddLeadSurrogate(base::uc16 lead_surrogate) { |
| DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); |
| FlushPendingSurrogate(); |
| // Hold onto the lead surrogate, waiting for a trail surrogate to follow. |
| pending_surrogate_ = lead_surrogate; |
| } |
| |
| void RegExpBuilder::AddTrailSurrogate(base::uc16 trail_surrogate) { |
| DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); |
| if (pending_surrogate_ != kNoPendingSurrogate) { |
| base::uc16 lead_surrogate = pending_surrogate_; |
| pending_surrogate_ = kNoPendingSurrogate; |
| DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); |
| base::uc32 combined = |
| unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate); |
| if (NeedsDesugaringForIgnoreCase(combined)) { |
| AddCharacterClassForDesugaring(combined); |
| } else { |
| ZoneList<base::uc16> surrogate_pair(2, zone()); |
| surrogate_pair.Add(lead_surrogate, zone()); |
| surrogate_pair.Add(trail_surrogate, zone()); |
| RegExpAtom* atom = |
| zone()->New<RegExpAtom>(surrogate_pair.ToConstVector()); |
| AddAtom(atom); |
| } |
| } else { |
| pending_surrogate_ = trail_surrogate; |
| FlushPendingSurrogate(); |
| } |
| } |
| |
| void RegExpBuilder::FlushPendingSurrogate() { |
| if (pending_surrogate_ != kNoPendingSurrogate) { |
| DCHECK(unicode()); |
| base::uc32 c = pending_surrogate_; |
| pending_surrogate_ = kNoPendingSurrogate; |
| AddCharacterClassForDesugaring(c); |
| } |
| } |
| |
| void RegExpBuilder::FlushCharacters() { |
| FlushPendingSurrogate(); |
| pending_empty_ = false; |
| if (characters_ != nullptr) { |
| RegExpTree* atom = zone()->New<RegExpAtom>(characters_->ToConstVector()); |
| characters_ = nullptr; |
| text_.emplace_back(atom); |
| LAST(ADD_ATOM); |
| } |
| } |
| |
| void RegExpBuilder::FlushText() { |
| FlushCharacters(); |
| size_t num_text = text_.size(); |
| if (num_text == 0) { |
| return; |
| } else if (num_text == 1) { |
| terms_.emplace_back(text_.back()); |
| } else { |
| RegExpText* text = zone()->New<RegExpText>(zone()); |
| for (size_t i = 0; i < num_text; i++) { |
| text_[i]->AppendToText(text, zone()); |
| } |
| terms_.emplace_back(text); |
| } |
| text_.clear(); |
| } |
| |
| void RegExpBuilder::AddCharacter(base::uc16 c) { |
| FlushPendingSurrogate(); |
| pending_empty_ = false; |
| if (NeedsDesugaringForIgnoreCase(c)) { |
| AddCharacterClassForDesugaring(c); |
| } else { |
| if (characters_ == nullptr) { |
| characters_ = zone()->New<ZoneList<base::uc16>>(4, zone()); |
| } |
| characters_->Add(c, zone()); |
| LAST(ADD_CHAR); |
| } |
| } |
| |
| void RegExpBuilder::AddUnicodeCharacter(base::uc32 c) { |
| if (c > static_cast<base::uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) { |
| DCHECK(unicode()); |
| AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); |
| AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); |
| } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { |
| AddLeadSurrogate(c); |
| } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { |
| AddTrailSurrogate(c); |
| } else { |
| AddCharacter(static_cast<base::uc16>(c)); |
| } |
| } |
| |
| void RegExpBuilder::AddEscapedUnicodeCharacter(base::uc32 character) { |
| // A lead or trail surrogate parsed via escape sequence will not |
| // pair up with any preceding lead or following trail surrogate. |
| FlushPendingSurrogate(); |
| AddUnicodeCharacter(character); |
| FlushPendingSurrogate(); |
| } |
| |
| void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
| |
| void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
| if (NeedsDesugaringForUnicode(cc)) { |
| // With /u, character class needs to be desugared, so it |
| // must be a standalone term instead of being part of a RegExpText. |
| AddTerm(cc); |
| } else { |
| AddAtom(cc); |
| } |
| } |
| |
| void RegExpBuilder::AddCharacterClassForDesugaring(base::uc32 c) { |
| AddTerm(zone()->New<RegExpCharacterClass>( |
| zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)))); |
| } |
| |
| void RegExpBuilder::AddAtom(RegExpTree* term) { |
| if (term->IsEmpty()) { |
| AddEmpty(); |
| return; |
| } |
| if (term->IsTextElement()) { |
| FlushCharacters(); |
| text_.emplace_back(term); |
| } else { |
| FlushText(); |
| terms_.emplace_back(term); |
| } |
| LAST(ADD_ATOM); |
| } |
| |
| void RegExpBuilder::AddTerm(RegExpTree* term) { |
| FlushText(); |
| terms_.emplace_back(term); |
| LAST(ADD_ATOM); |
| } |
| |
| void RegExpBuilder::AddAssertion(RegExpTree* assert) { |
| FlushText(); |
| terms_.emplace_back(assert); |
| LAST(ADD_ASSERT); |
| } |
| |
| void RegExpBuilder::NewAlternative() { FlushTerms(); } |
| |
| void RegExpBuilder::FlushTerms() { |
| FlushText(); |
| size_t num_terms = terms_.size(); |
| RegExpTree* alternative; |
| if (num_terms == 0) { |
| alternative = zone()->New<RegExpEmpty>(); |
| } else if (num_terms == 1) { |
| alternative = terms_.back(); |
| } else { |
| alternative = |
| zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>( |
| base::VectorOf(terms_.begin(), terms_.size()), zone())); |
| } |
| alternatives_.emplace_back(alternative); |
| terms_.clear(); |
| LAST(ADD_NONE); |
| } |
| |
| bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) { |
| if (!unicode()) return false; |
| // TODO(yangguo): we could be smarter than this. Case-insensitivity does not |
| // necessarily mean that we need to desugar. It's probably nicer to have a |
| // separate pass to figure out unicode desugarings. |
| if (ignore_case()) return true; |
| ZoneList<CharacterRange>* ranges = cc->ranges(zone()); |
| CharacterRange::Canonicalize(ranges); |
| for (int i = ranges->length() - 1; i >= 0; i--) { |
| base::uc32 from = ranges->at(i).from(); |
| base::uc32 to = ranges->at(i).to(); |
| // Check for non-BMP characters. |
| if (to >= kNonBmpStart) return true; |
| // Check for lone surrogates. |
| if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true; |
| } |
| return false; |
| } |
| |
| bool RegExpBuilder::NeedsDesugaringForIgnoreCase(base::uc32 c) { |
| #ifdef V8_INTL_SUPPORT |
| if (unicode() && ignore_case()) { |
| icu::UnicodeSet set(c, c); |
| set.closeOver(USET_CASE_INSENSITIVE); |
| set.removeAllStrings(); |
| return set.size() > 1; |
| } |
| // In the case where ICU is not included, we act as if the unicode flag is |
| // not set, and do not desugar. |
| #endif // V8_INTL_SUPPORT |
| return false; |
| } |
| |
| RegExpTree* RegExpBuilder::ToRegExp() { |
| FlushTerms(); |
| size_t num_alternatives = alternatives_.size(); |
| if (num_alternatives == 0) return zone()->New<RegExpEmpty>(); |
| if (num_alternatives == 1) return alternatives_.back(); |
| return zone()->New<RegExpDisjunction>(zone()->New<ZoneList<RegExpTree*>>( |
| base::VectorOf(alternatives_.begin(), alternatives_.size()), zone())); |
| } |
| |
| bool RegExpBuilder::AddQuantifierToAtom( |
| int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { |
| FlushPendingSurrogate(); |
| if (pending_empty_) { |
| pending_empty_ = false; |
| return true; |
| } |
| RegExpTree* atom; |
| if (characters_ != nullptr) { |
| DCHECK(last_added_ == ADD_CHAR); |
| // Last atom was character. |
| base::Vector<const base::uc16> char_vector = characters_->ToConstVector(); |
| int num_chars = char_vector.length(); |
| if (num_chars > 1) { |
| base::Vector<const base::uc16> prefix = |
| char_vector.SubVector(0, num_chars - 1); |
| text_.emplace_back(zone()->New<RegExpAtom>(prefix)); |
| char_vector = char_vector.SubVector(num_chars - 1, num_chars); |
| } |
| characters_ = nullptr; |
| atom = zone()->New<RegExpAtom>(char_vector); |
| FlushText(); |
| } else if (text_.size() > 0) { |
| DCHECK(last_added_ == ADD_ATOM); |
| atom = text_.back(); |
| text_.pop_back(); |
| FlushText(); |
| } else if (terms_.size() > 0) { |
| DCHECK(last_added_ == ADD_ATOM); |
| atom = terms_.back(); |
| terms_.pop_back(); |
| if (atom->IsLookaround()) { |
| // With /u, lookarounds are not quantifiable. |
| if (unicode()) return false; |
| // Lookbehinds are not quantifiable. |
| if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) { |
| return false; |
| } |
| } |
| if (atom->max_match() == 0) { |
| // Guaranteed to only match an empty string. |
| LAST(ADD_TERM); |
| if (min == 0) { |
| return true; |
| } |
| terms_.emplace_back(atom); |
| return true; |
| } |
| } else { |
| // Only call immediately after adding an atom or character! |
| UNREACHABLE(); |
| } |
| terms_.emplace_back( |
| zone()->New<RegExpQuantifier>(min, max, quantifier_type, atom)); |
| LAST(ADD_TERM); |
| return true; |
| } |
| |
| template class RegExpParserImpl<uint8_t>; |
| template class RegExpParserImpl<base::uc16>; |
| |
| } // namespace |
| |
| // static |
| bool RegExpParser::ParseRegExpFromHeapString(Isolate* isolate, Zone* zone, |
| Handle<String> input, |
| RegExpFlags flags, |
| RegExpCompileData* result) { |
| DisallowGarbageCollection no_gc; |
| uintptr_t stack_limit = isolate->stack_guard()->real_climit(); |
| String::FlatContent content = input->GetFlatContent(no_gc); |
| if (content.IsOneByte()) { |
| base::Vector<const uint8_t> v = content.ToOneByteVector(); |
| return RegExpParserImpl<uint8_t>{v.begin(), v.length(), flags, |
| stack_limit, zone, no_gc} |
| .Parse(result); |
| } else { |
| base::Vector<const base::uc16> v = content.ToUC16Vector(); |
| return RegExpParserImpl<base::uc16>{v.begin(), v.length(), flags, |
| stack_limit, zone, no_gc} |
| .Parse(result); |
| } |
| } |
| |
| // static |
| template <class CharT> |
| bool RegExpParser::VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit, |
| const CharT* input, int input_length, |
| RegExpFlags flags, |
| RegExpCompileData* result, |
| const DisallowGarbageCollection& no_gc) { |
| return RegExpParserImpl<CharT>{input, input_length, flags, |
| stack_limit, zone, no_gc} |
| .Parse(result); |
| } |
| |
| template bool RegExpParser::VerifyRegExpSyntax<uint8_t>( |
| Zone*, uintptr_t, const uint8_t*, int, RegExpFlags, RegExpCompileData*, |
| const DisallowGarbageCollection&); |
| template bool RegExpParser::VerifyRegExpSyntax<base::uc16>( |
| Zone*, uintptr_t, const base::uc16*, int, RegExpFlags, RegExpCompileData*, |
| const DisallowGarbageCollection&); |
| |
| // static |
| bool RegExpParser::VerifyRegExpSyntax(Isolate* isolate, Zone* zone, |
| Handle<String> input, RegExpFlags flags, |
| RegExpCompileData* result, |
| const DisallowGarbageCollection&) { |
| return ParseRegExpFromHeapString(isolate, zone, input, flags, result); |
| } |
| |
| #undef LAST |
| |
| } // namespace internal |
| } // namespace v8 |