src/regexp/regexp-compiler.h - v8/v8 - Git at Google

 // Copyright 2019 the V8 project authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #ifndef V8_REGEXP_REGEXP_COMPILER_H_
 #define V8_REGEXP_REGEXP_COMPILER_H_

 #include <bitset>

 #include "src/base/small-vector.h"
 #include "src/regexp/regexp-nodes.h"

 namespace v8 {
 namespace internal {

 class DynamicBitSet;
 class Isolate;

 namespace regexp_compiler_constants {

 // The '2' variant is has inclusive from and exclusive to.
 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
 constexpr uc32 kRangeEndMarker = 0x110000;
 constexpr int kSpaceRanges[] = {
     '\t',   '\r' + 1, ' ',    ' ' + 1, 0x00A0, 0x00A1, 0x1680,
     0x1681, 0x2000,   0x200B, 0x2028,  0x202A, 0x202F, 0x2030,
     0x205F, 0x2060,   0x3000, 0x3001,  0xFEFF, 0xFF00, kRangeEndMarker};
 constexpr int kSpaceRangeCount = arraysize(kSpaceRanges);

 constexpr int kWordRanges[] = {'0',     '9' + 1, 'A',     'Z' + 1,        '_',
                                '_' + 1, 'a',     'z' + 1, kRangeEndMarker};
 constexpr int kWordRangeCount = arraysize(kWordRanges);
 constexpr int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
 constexpr int kDigitRangeCount = arraysize(kDigitRanges);
 constexpr int kSurrogateRanges[] = {kLeadSurrogateStart,
                                     kLeadSurrogateStart + 1, kRangeEndMarker};
 constexpr int kSurrogateRangeCount = arraysize(kSurrogateRanges);
 constexpr int kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D,         0x000E,
                                          0x2028, 0x202A, kRangeEndMarker};
 constexpr int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);

 // More makes code generation slower, less makes V8 benchmark score lower.
 constexpr int kMaxLookaheadForBoyerMoore = 8;
 // In a 3-character pattern you can maximally step forwards 3 characters
 // at a time, which is not always enough to pay for the extra logic.
 constexpr int kPatternTooShortForBoyerMoore = 2;

 }  // namespace regexp_compiler_constants

 inline bool IgnoreCase(JSRegExp::Flags flags) {
   return (flags & JSRegExp::kIgnoreCase) != 0;
 }

 inline bool IsUnicode(JSRegExp::Flags flags) {
   return (flags & JSRegExp::kUnicode) != 0;
 }

 inline bool IsSticky(JSRegExp::Flags flags) {
   return (flags & JSRegExp::kSticky) != 0;
 }

 inline bool IsGlobal(JSRegExp::Flags flags) {
   return (flags & JSRegExp::kGlobal) != 0;
 }

 inline bool DotAll(JSRegExp::Flags flags) {
   return (flags & JSRegExp::kDotAll) != 0;
 }

 inline bool Multiline(JSRegExp::Flags flags) {
   return (flags & JSRegExp::kMultiline) != 0;
 }

 inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) {
   // Both unicode and ignore_case flags are set. We need to use ICU to find
   // the closure over case equivalents.
   return IsUnicode(flags) && IgnoreCase(flags);
 }

 // Details of a quick mask-compare check that can look ahead in the
 // input stream.
 class QuickCheckDetails {
  public:
   QuickCheckDetails()
       : characters_(0), mask_(0), value_(0), cannot_match_(false) {}
   explicit QuickCheckDetails(int characters)
       : characters_(characters), mask_(0), value_(0), cannot_match_(false) {}
   bool Rationalize(bool one_byte);
   // Merge in the information from another branch of an alternation.
   void Merge(QuickCheckDetails* other, int from_index);
   // Advance the current position by some amount.
   void Advance(int by, bool one_byte);
   void Clear();
   bool cannot_match() { return cannot_match_; }
   void set_cannot_match() { cannot_match_ = true; }
   struct Position {
     Position() : mask(0), value(0), determines_perfectly(false) {}
     uc16 mask;
     uc16 value;
     bool determines_perfectly;
   };
   int characters() { return characters_; }
   void set_characters(int characters) { characters_ = characters; }
   Position* positions(int index) {
     DCHECK_LE(0, index);
     DCHECK_GT(characters_, index);
     return positions_ + index;
   }
   uint32_t mask() { return mask_; }
   uint32_t value() { return value_; }

  private:
   // How many characters do we have quick check information from.  This is
   // the same for all branches of a choice node.
   int characters_;
   Position positions_[4];
   // These values are the condensate of the above array after Rationalize().
   uint32_t mask_;
   uint32_t value_;
   // If set to true, there is no way this quick check can match at all.
   // E.g., if it requires to be at the start of the input, and isn't.
   bool cannot_match_;
 };

 // Improve the speed that we scan for an initial point where a non-anchored
 // regexp can match by using a Boyer-Moore-like table. This is done by
 // identifying non-greedy non-capturing loops in the nodes that eat any
 // character one at a time.  For example in the middle of the regexp
 // /foo[\s\S]*?bar/ we find such a loop.  There is also such a loop implicitly
 // inserted at the start of any non-anchored regexp.
 //
 // When we have found such a loop we look ahead in the nodes to find the set of
 // characters that can come at given distances. For example for the regexp
 // /.?foo/ we know that there are at least 3 characters ahead of us, and the
 // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
 // the lookahead info where the set of characters is reasonably constrained. In
 // our example this is from index 1 to 2 (0 is not constrained). We can now
 // look 3 characters ahead and if we don't find one of [f, o] (the union of
 // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
 //
 // For Unicode input strings we do the same, but modulo 128.
 //
 // We also look at the first string fed to the regexp and use that to get a hint
 // of the character frequencies in the inputs. This affects the assessment of
 // whether the set of characters is 'reasonably constrained'.
 //
 // We also have another lookahead mechanism (called quick check in the code),
 // which uses a wide load of multiple characters followed by a mask and compare
 // to determine whether a match is possible at this point.
 enum ContainedInLattice {
   kNotYet = 0,
   kLatticeIn = 1,
   kLatticeOut = 2,
   kLatticeUnknown = 3  // Can also mean both in and out.
 };

 inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
   return static_cast<ContainedInLattice>(a | b);
 }

 class BoyerMoorePositionInfo : public ZoneObject {
  public:
   bool at(int i) const { return map_[i]; }

   static constexpr int kMapSize = 128;
   static constexpr int kMask = kMapSize - 1;

   int map_count() const { return map_count_; }

   void Set(int character);
   void SetInterval(const Interval& interval);
   void SetAll();

   bool is_non_word() { return w_ == kLatticeOut; }
   bool is_word() { return w_ == kLatticeIn; }

   using Bitset = std::bitset<kMapSize>;
   Bitset raw_bitset() const { return map_; }

  private:
   Bitset map_;
   int map_count_ = 0;               // Number of set bits in the map.
   ContainedInLattice w_ = kNotYet;  // The \w character class.
 };

 class BoyerMooreLookahead : public ZoneObject {
  public:
   BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);

   int length() { return length_; }
   int max_char() { return max_char_; }
   RegExpCompiler* compiler() { return compiler_; }

   int Count(int map_number) { return bitmaps_->at(map_number)->map_count(); }

   BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }

   void Set(int map_number, int character) {
     if (character > max_char_) return;
     BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
     info->Set(character);
   }

   void SetInterval(int map_number, const Interval& interval) {
     if (interval.from() > max_char_) return;
     BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
     if (interval.to() > max_char_) {
       info->SetInterval(Interval(interval.from(), max_char_));
     } else {
       info->SetInterval(interval);
     }
   }

   void SetAll(int map_number) { bitmaps_->at(map_number)->SetAll(); }

   void SetRest(int from_map) {
     for (int i = from_map; i < length_; i++) SetAll(i);
   }
   void EmitSkipInstructions(RegExpMacroAssembler* masm);

  private:
   // This is the value obtained by EatsAtLeast.  If we do not have at least this
   // many characters left in the sample string then the match is bound to fail.
   // Therefore it is OK to read a character this far ahead of the current match
   // point.
   int length_;
   RegExpCompiler* compiler_;
   // 0xff for Latin1, 0xffff for UTF-16.
   int max_char_;
   ZoneList<BoyerMoorePositionInfo*>* bitmaps_;

   int GetSkipTable(int min_lookahead, int max_lookahead,
                    Handle<ByteArray> boolean_skip_table);
   bool FindWorthwhileInterval(int* from, int* to);
   int FindBestInterval(int max_number_of_chars, int old_biggest_points,
                        int* from, int* to);
 };

 // There are many ways to generate code for a node.  This class encapsulates
 // the current way we should be generating.  In other words it encapsulates
 // the current state of the code generator.  The effect of this is that we
 // generate code for paths that the matcher can take through the regular
 // expression.  A given node in the regexp can be code-generated several times
 // as it can be part of several traces.  For example for the regexp:
 // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
 // of the foo-bar-baz trace and once as part of the foo-ip-baz trace.  The code
 // to match foo is generated only once (the traces have a common prefix).  The
 // code to store the capture is deferred and generated (twice) after the places
 // where baz has been matched.
 class Trace {
  public:
   // A value for a property that is either known to be true, know to be false,
   // or not known.
   enum TriBool { UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 };

   class DeferredAction {
    public:
     DeferredAction(ActionNode::ActionType action_type, int reg)
         : action_type_(action_type), reg_(reg), next_(nullptr) {}
     DeferredAction* next() { return next_; }
     bool Mentions(int reg);
     int reg() { return reg_; }
     ActionNode::ActionType action_type() { return action_type_; }

    private:
     ActionNode::ActionType action_type_;
     int reg_;
     DeferredAction* next_;
     friend class Trace;
   };

   class DeferredCapture : public DeferredAction {
    public:
     DeferredCapture(int reg, bool is_capture, Trace* trace)
         : DeferredAction(ActionNode::STORE_POSITION, reg),
           cp_offset_(trace->cp_offset()),
           is_capture_(is_capture) {}
     int cp_offset() { return cp_offset_; }
     bool is_capture() { return is_capture_; }

    private:
     int cp_offset_;
     bool is_capture_;
     void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
   };

   class DeferredSetRegisterForLoop : public DeferredAction {
    public:
     DeferredSetRegisterForLoop(int reg, int value)
         : DeferredAction(ActionNode::SET_REGISTER_FOR_LOOP, reg),
           value_(value) {}
     int value() { return value_; }

    private:
     int value_;
   };

   class DeferredClearCaptures : public DeferredAction {
    public:
     explicit DeferredClearCaptures(Interval range)
         : DeferredAction(ActionNode::CLEAR_CAPTURES, -1), range_(range) {}
     Interval range() { return range_; }

    private:
     Interval range_;
   };

   class DeferredIncrementRegister : public DeferredAction {
    public:
     explicit DeferredIncrementRegister(int reg)
         : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) {}
   };

   Trace()
       : cp_offset_(0),
         actions_(nullptr),
         backtrack_(nullptr),
         stop_node_(nullptr),
         loop_label_(nullptr),
         characters_preloaded_(0),
         bound_checked_up_to_(0),
         flush_budget_(100),
         at_start_(UNKNOWN) {}

   // End the trace.  This involves flushing the deferred actions in the trace
   // and pushing a backtrack location onto the backtrack stack.  Once this is
   // done we can start a new trace or go to one that has already been
   // generated.
   void Flush(RegExpCompiler* compiler, RegExpNode* successor);
   int cp_offset() { return cp_offset_; }
   DeferredAction* actions() { return actions_; }
   // A trivial trace is one that has no deferred actions or other state that
   // affects the assumptions used when generating code.  There is no recorded
   // backtrack location in a trivial trace, so with a trivial trace we will
   // generate code that, on a failure to match, gets the backtrack location
   // from the backtrack stack rather than using a direct jump instruction.  We
   // always start code generation with a trivial trace and non-trivial traces
   // are created as we emit code for nodes or add to the list of deferred
   // actions in the trace.  The location of the code generated for a node using
   // a trivial trace is recorded in a label in the node so that gotos can be
   // generated to that code.
   bool is_trivial() {
     return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
            characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
            quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
   }
   TriBool at_start() { return at_start_; }
   void set_at_start(TriBool at_start) { at_start_ = at_start; }
   Label* backtrack() { return backtrack_; }
   Label* loop_label() { return loop_label_; }
   RegExpNode* stop_node() { return stop_node_; }
   int characters_preloaded() { return characters_preloaded_; }
   int bound_checked_up_to() { return bound_checked_up_to_; }
   int flush_budget() { return flush_budget_; }
   QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
   bool mentions_reg(int reg);
   // Returns true if a deferred position store exists to the specified
   // register and stores the offset in the out-parameter.  Otherwise
   // returns false.
   bool GetStoredPosition(int reg, int* cp_offset);
   // These set methods and AdvanceCurrentPositionInTrace should be used only on
   // new traces - the intention is that traces are immutable after creation.
   void add_action(DeferredAction* new_action) {
     DCHECK(new_action->next_ == nullptr);
     new_action->next_ = actions_;
     actions_ = new_action;
   }
   void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
   void set_stop_node(RegExpNode* node) { stop_node_ = node; }
   void set_loop_label(Label* label) { loop_label_ = label; }
   void set_characters_preloaded(int count) { characters_preloaded_ = count; }
   void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
   void set_flush_budget(int to) { flush_budget_ = to; }
   void set_quick_check_performed(QuickCheckDetails* d) {
     quick_check_performed_ = *d;
   }
   void InvalidateCurrentCharacter();
   void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);

  private:
   int FindAffectedRegisters(DynamicBitSet* affected_registers, Zone* zone);
   void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register,
                               const DynamicBitSet& affected_registers,
                               DynamicBitSet* registers_to_pop,
                               DynamicBitSet* registers_to_clear, Zone* zone);
   void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register,
                                 const DynamicBitSet& registers_to_pop,
                                 const DynamicBitSet& registers_to_clear);
   int cp_offset_;
   DeferredAction* actions_;
   Label* backtrack_;
   RegExpNode* stop_node_;
   Label* loop_label_;
   int characters_preloaded_;
   int bound_checked_up_to_;
   QuickCheckDetails quick_check_performed_;
   int flush_budget_;
   TriBool at_start_;
 };

 class GreedyLoopState {
  public:
   explicit GreedyLoopState(bool not_at_start);

   Label* label() { return &label_; }
   Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }

  private:
   Label label_;
   Trace counter_backtrack_trace_;
 };

 struct PreloadState {
   static const int kEatsAtLeastNotYetInitialized = -1;
   bool preload_is_current_;
   bool preload_has_checked_bounds_;
   int preload_characters_;
   int eats_at_least_;
   void init() { eats_at_least_ = kEatsAtLeastNotYetInitialized; }
 };

 // Analysis performs assertion propagation and computes eats_at_least_ values.
 // See the comments on AssertionPropagator and EatsAtLeastPropagator for more
 // details.
 //
 // This method returns nullptr on success or a null-terminated failure message
 // on failure.
 const char* AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node);

 class FrequencyCollator {
  public:
   FrequencyCollator() : total_samples_(0) {
     for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
       frequencies_[i] = CharacterFrequency(i);
     }
   }

   void CountCharacter(int character) {
     int index = (character & RegExpMacroAssembler::kTableMask);
     frequencies_[index].Increment();
     total_samples_++;
   }

   // Does not measure in percent, but rather per-128 (the table size from the
   // regexp macro assembler).
   int Frequency(int in_character) {
     DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
     if (total_samples_ < 1) return 1;  // Division by zero.
     int freq_in_per128 =
         (frequencies_[in_character].counter() * 128) / total_samples_;
     return freq_in_per128;
   }

  private:
   class CharacterFrequency {
    public:
     CharacterFrequency() : counter_(0), character_(-1) {}
     explicit CharacterFrequency(int character)
         : counter_(0), character_(character) {}

     void Increment() { counter_++; }
     int counter() { return counter_; }
     int character() { return character_; }

    private:
     int counter_;
     int character_;
   };

  private:
   CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
   int total_samples_;
 };

 class RegExpCompiler {
  public:
   RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
                  bool is_one_byte);

   int AllocateRegister() {
     if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
       reg_exp_too_big_ = true;
       return next_register_;
     }
     return next_register_++;
   }

   // Lookarounds to match lone surrogates for unicode character class matches
   // are never nested. We can therefore reuse registers.
   int UnicodeLookaroundStackRegister() {
     if (unicode_lookaround_stack_register_ == kNoRegister) {
       unicode_lookaround_stack_register_ = AllocateRegister();
     }
     return unicode_lookaround_stack_register_;
   }

   int UnicodeLookaroundPositionRegister() {
     if (unicode_lookaround_position_register_ == kNoRegister) {
       unicode_lookaround_position_register_ = AllocateRegister();
     }
     return unicode_lookaround_position_register_;
   }

   struct CompilationResult final {
     explicit CompilationResult(const char* error_message)
         : error_message(error_message) {}
     CompilationResult(Object code, int registers)
         : code(code), num_registers(registers) {}

     static CompilationResult RegExpTooBig() {
       return CompilationResult("RegExp too big");
     }

     bool Succeeded() const { return error_message == nullptr; }

     const char* const error_message = nullptr;
     Object code;
     int num_registers = 0;
   };

   CompilationResult Assemble(Isolate* isolate, RegExpMacroAssembler* assembler,
                              RegExpNode* start, int capture_count,
                              Handle<String> pattern);

   // If the regexp matching starts within a surrogate pair, step back to the
   // lead surrogate and start matching from there.
   static RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
                                                        RegExpNode* on_success,
                                                        JSRegExp::Flags flags);

   inline void AddWork(RegExpNode* node) {
     if (!node->on_work_list() && !node->label()->is_bound()) {
       node->set_on_work_list(true);
       work_list_->push_back(node);
     }
   }

   static const int kImplementationOffset = 0;
   static const int kNumberOfRegistersOffset = 0;
   static const int kCodeOffset = 1;

   RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
   EndNode* accept() { return accept_; }

   static const int kMaxRecursion = 100;
   inline int recursion_depth() { return recursion_depth_; }
   inline void IncrementRecursionDepth() { recursion_depth_++; }
   inline void DecrementRecursionDepth() { recursion_depth_--; }

   void SetRegExpTooBig() { reg_exp_too_big_ = true; }

   inline bool one_byte() { return one_byte_; }
   inline bool optimize() { return optimize_; }
   inline void set_optimize(bool value) { optimize_ = value; }
   inline bool limiting_recursion() { return limiting_recursion_; }
   inline void set_limiting_recursion(bool value) {
     limiting_recursion_ = value;
   }
   bool read_backward() { return read_backward_; }
   void set_read_backward(bool value) { read_backward_ = value; }
   FrequencyCollator* frequency_collator() { return &frequency_collator_; }

   int current_expansion_factor() { return current_expansion_factor_; }
   void set_current_expansion_factor(int value) {
     current_expansion_factor_ = value;
   }

   Isolate* isolate() const { return isolate_; }
   Zone* zone() const { return zone_; }

   static const int kNoRegister = -1;

  private:
   EndNode* accept_;
   int next_register_;
   int unicode_lookaround_stack_register_;
   int unicode_lookaround_position_register_;
   std::vector<RegExpNode*>* work_list_;
   int recursion_depth_;
   RegExpMacroAssembler* macro_assembler_;
   bool one_byte_;
   bool reg_exp_too_big_;
   bool limiting_recursion_;
   bool optimize_;
   bool read_backward_;
   int current_expansion_factor_;
   FrequencyCollator frequency_collator_;
   Isolate* isolate_;
   Zone* zone_;
 };

 // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
 class UnicodeRangeSplitter {
  public:
   V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList<CharacterRange>* base);

   static constexpr int kInitialSize = 8;
   using CharacterRangeVector = base::SmallVector<CharacterRange, kInitialSize>;

   const CharacterRangeVector* bmp() const { return &bmp_; }
   const CharacterRangeVector* lead_surrogates() const {
     return &lead_surrogates_;
   }
   const CharacterRangeVector* trail_surrogates() const {
     return &trail_surrogates_;
   }
   const CharacterRangeVector* non_bmp() const { return &non_bmp_; }

  private:
   void AddRange(CharacterRange range);

   CharacterRangeVector bmp_;
   CharacterRangeVector lead_surrogates_;
   CharacterRangeVector trail_surrogates_;
   CharacterRangeVector non_bmp_;
 };

 // We need to check for the following characters: 0x39C 0x3BC 0x178.
 // TODO(jgruber): Move to CharacterRange.
 bool RangeContainsLatin1Equivalents(CharacterRange range);

 }  // namespace internal
 }  // namespace v8

 #endif  // V8_REGEXP_REGEXP_COMPILER_H_