lib/Parser/RegexParser.h - external/github.com/Microsoft/ChakraCore - Git at Google

 //-------------------------------------------------------------------------------------------------------
 // Copyright (C) Microsoft. All rights reserved.
 // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
 //-------------------------------------------------------------------------------------------------------
 #pragma once

 namespace UnifiedRegex
 {
     // Describes a regex parse failure: an HRESULT error code together with the position it
     // refers to, expressed both in logical (unicode) characters and in encoded units.
     struct ParseError
     {
         // NOTE(review): presumably true when the error refers to the pattern body rather than
         // the options -- confirm against the code that constructs ParseError.
         bool isBody;
         CharCount pos;           // Position in unicode characters
         CharCount encodedPos;    // Position in underlying characters (eg utf-8 bytes)
         HRESULT error;           // Error code describing the failure

         // Declared here, defined elsewhere. The parameters carry the same names as the fields above.
         ParseError(bool isBody, CharCount pos, CharCount encodedPos, HRESULT error);
     };

     // Regular-expression parser, parameterized on the encoding of its input (EncodingPolicy supplies
     // the EncodedChar type) and on IsLiteral.
     // NOTE(review): IsLiteral presumably distinguishes regex literals from dynamically created
     // patterns (cf. ParseLiteral vs ParseDynamic) -- confirm against the implementation file.
     // The grammar methods come in "Pass0"/"Pass1" pairs; per the member comments below, pass 0 builds
     // the surrogate pair list that pass 1 consumes.
     template <typename EncodingPolicy, const bool IsLiteral>
     class Parser : private EncodingPolicy, private Chars<char16>
     {
     private:
         typedef typename EncodingPolicy::EncodedChar EncodedChar;

         // A linked list node to track indices of surrogate pairs.
         struct SurrogatePairTracker
         {
             const EncodedChar* location;
             // If this surrogate pair is inside a range, then rangeLocation isn't null.
             const EncodedChar* rangeLocation;
             codepoint_t value;
             uint32 length;
             size_t multiUnits;
             SurrogatePairTracker* next;

             // Creates a node for a surrogate pair that is not inside a range (rangeLocation is null).
             // Initializers are listed in member declaration order, which is the order in which the
             // members are actually initialized.
             SurrogatePairTracker(const EncodedChar* location, codepoint_t value, uint32 length, size_t multiUnits)
                 : location(location)
                 , rangeLocation(nullptr)
                 , value(value)
                 , length(length)
                 , multiUnits(multiUnits)
                 , next(nullptr)
             {
             }

             // Creates a node that also records rangeLocation. Initializers are listed in member
             // declaration order.
             SurrogatePairTracker(const EncodedChar* location, const EncodedChar* rangeLocation, codepoint_t value, uint32 length, size_t multiUnits)
                 : location(location)
                 , rangeLocation(rangeLocation)
                 , value(value)
                 , length(length)
                 , multiUnits(multiUnits)
                 , next(nullptr)
             {
             }

             // True when rangeLocation is non-null.
             bool IsInsideRange() const
             {
                 return this->rangeLocation != nullptr;
             }
         };

         // NOTE(review): presumably the initial capacity of litbuf -- confirm in EnsureLitbuf.
         static const CharCount initLitbufSize = 16;

         Js::ScriptContext* scriptContext;
         // Arena for nodes and items needed only during compilation
         ArenaAllocator* ctAllocator;
         // Standard characters using raw encoding character representation (eg char for utf-8)
         StandardChars<EncodedChar>* standardEncodedChars;
         // Standard characters using final character representation (eg char16 for Unicode)
         StandardChars<Char>* standardChars;
 #if ENABLE_REGEX_CONFIG_OPTIONS
         DebugWriter* w;
 #endif

         // Input buffer state (see "Input buffer management" below).
         const EncodedChar* input;
         const EncodedChar* inputLim;
         const EncodedChar* next;
         bool inBody;

         // Maximum number of capturing groups allowed, including the entire regexp, which is always
         // considered a capturing group.  Using INT16_MAX allows us to pass one value for each
         // group, plus a few additional values, to a JavaScript function without overflowing the
         // number of arguments.  This is important, for example, in the implementation of
         // String.prototype.replace, where the second argument is a function.
         static const uint16 MAX_NUM_GROUPS = INT16_MAX;

         uint16 numGroups; // determined in first parse

         // Keeps track of how many capturing groups we've seen during parsing.  We use an int, rather than
         // a uint16, to be sure we don't overflow during parsing and only check it against MAX_NUM_GROUPS at
         // the end.  (We know we can't overflow an int because strings and regex literals are limited to
         // 2G characters and therefore to 1G pairs of parentheses, which can fit into an int.  I'd prefer
         // to use size_t here, but making that change would go down a serious rabbit hole changing the
         // interface to UnifiedRegex::Node::AccumDefineGroups.)
         int nextGroupId;

         // Buffer accumulating all literals.
         // In compile-time allocator, must be transferred to runtime allocator when build program
         Char* litbuf;
         CharCount litbufLen;
         CharCount litbufNext;

         // During pass 0, if /u option for regex is provided, a linked list will be built up to
         // track positions of surrogate pairs in the buffer. During pass 1, these linked lists will be used
         // to figure out when to output a surrogate pair node.
         SurrogatePairTracker* surrogatePairList;
         SurrogatePairTracker* currentSurrogatePairNode;
         bool unicodeFlagPresent;
         bool caseInsensitiveFlagPresent;

         // The following two variables (tempLocationOfSurrogatePair and codePointAtTempLocation) are used
         // to determine if the surrogate pair has been encountered.
         // First holds the temporary location, second holds the value of the codepoint
         const EncodedChar* tempLocationOfSurrogatePair;
         // This will be set to a location when we are parsing a range in TermPass0, and cleared when we are out of it.
         const EncodedChar* tempLocationOfRange;
         codepoint_t codePointAtTempLocation;

         // When a surrogate is added for tracking, this will be updated.
         const EncodedChar* positionAfterLastSurrogate;
         codepoint_t valueOfLastSurrogate;

         // deferred error state.
         ParseError* deferredIfNotUnicodeError;
         ParseError* deferredIfUnicodeError;

     private:

         //
         // Input buffer management
         //

         void SetPosition(const EncodedChar* input, const EncodedChar* inputLim, bool inBody);

         // Current position in number of logical characters, regardless of underlying character encoding
         inline CharCount Pos();

         // The EC* helpers operate on encoded characters (EncodedChar).
         inline bool IsEOF();
         inline bool ECCanConsume(CharCount n = 1);
         inline EncodedChar ECLookahead(CharCount n = 0);
         inline EncodedChar ECLookback(CharCount n = 0);
         inline void ECConsume(CharCount n = 1);
         inline void ECConsumeMultiUnit(CharCount n = 1);
         inline void ECRevert(CharCount n = 1);

         //
         // Helpers
         //
         int TryParseExtendedUnicodeEscape(Char& c, bool& previousSurrogatePart, bool trackSurrogatePair = false);
         void TrackIfSurrogatePair(codepoint_t codePoint, const EncodedChar* location, uint32 consumptionLength);
         Node* CreateSurrogatePairAtom(char16 lower, char16 upper);
         AltNode* AppendSurrogateRangeToDisjunction(codepoint_t lowerCodePoint, codepoint_t upperCodePoint, AltNode *lastAlttNode);
         AltNode* AppendSurrogatePairToDisjunction(codepoint_t codePoint, AltNode *lastAlttNode);

         //
         // Errors
         //

         void Fail(HRESULT error);
         // NOTE(review): presumably these record the error in deferredIfUnicodeError /
         // deferredIfNotUnicodeError rather than failing immediately -- confirm in the implementation.
         void DeferredFailIfUnicode(HRESULT error);
         void DeferredFailIfNotUnicode(HRESULT error);
         inline void ECMust(EncodedChar ec, HRESULT error);
         inline Char NextChar();

         //
         // Patterns/Disjunctions/Alternatives
         //

         void PatternPass0();
         Node* PatternPass1();
         Node* UnionNodes(Node* prev, Node* curr);
         void DisjunctionPass0(int depth);
         Node* DisjunctionPass1();
         bool IsEndOfAlternative();
         void EnsureLitbuf(CharCount size);
         void AccumLiteral(MatchLiteralNode* deferredLiteralNode, Node* charOrLiteralNode);
         Node* FinalTerm(Node* node, MatchLiteralNode* deferredLiteralNode);
         void AlternativePass0(int depth);
         Node* AlternativePass1();

         //
         // Terms
         //

         Node* NewLoopNode(CharCount lower, CharCountOrFlag upper, bool isGreedy, Node* body);
         bool AtQuantifier();
         bool OptNonGreedy();
         CharCount RepeatCount();
         void TermPass0(int depth);
         Node* TermPass1(MatchCharNode* deferredCharNode, bool& previousSurrogatePart);
         bool AtomEscapePass0();
         bool AtomEscapePass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart);
         bool SurrogatePairPass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart);

         //
         // Classes
         //

         bool AtSecondSingletonClassAtom();
         void CharacterClassPass0();
         template <bool containsSurrogates>
         Node* CharacterClassPass1();
         bool ClassEscapePass0(Char& singleton, bool& previousSurrogatePart);
         Node* ClassEscapePass1(MatchCharNode* deferredCharNode, MatchSetNode* deferredSetNode, bool& previousSurrogatePart);
         Node* GetNodeWithValidCharacterSet(EncodedChar ch);

         //
         // Options
         //

         void Options(RegexFlags& flags);

     public:

         Parser
             ( Js::ScriptContext* scriptContext
             , ArenaAllocator* ctAllocator
             , StandardChars<EncodedChar>* standardEncodedChars
             , StandardChars<Char>* standardChars
             , bool isUtf8
 #if ENABLE_REGEX_CONFIG_OPTIONS
             , DebugWriter* w
 #endif
             );

         //
         // Entry points
         //


         Node* ParseDynamic
             ( const EncodedChar* body           // non null, null terminated (may contain embedded nulls)
             , const EncodedChar* bodyLim        // points to terminating null of above
             , const EncodedChar* opts           // may be null if no options, otherwise null terminated
             , const EncodedChar* optsLim        // if above non-null, points to terminating null of above
             , RegexFlags& flags );

         // (*) For ParseLiteral:
         //  - input string must be null terminated
         //  - inputLim may point to the terminating null in above or before it
         //     - if the latter, input is known to be syntactically well-formed so that the parser
         //       will find the natural end of the regex literal before passing inputLim
         //  - input may contain nulls before the inputLim

         Node* ParseLiteral
             ( const EncodedChar* input          // non null, null terminated (may contain embedded nulls)
             , const EncodedChar* inputLim       // see (*) above
             , CharCount& outBodyEncodedChars    // in encoded characters, not including trailing '/'
             , CharCount& outTotalEncodedChars   // in encoded characters, including trailing '/' and any options
             , CharCount& outBodyChars           // in unicode characters, not including trailing '/'
             , CharCount& outTotalChars          // in unicode characters, including trailing '/' and any options
             , RegexFlags& flags );

         // Same inputs and length outputs as ParseLiteral, but returns no Node and takes no flags.
         void ParseLiteralNoAST
             ( const EncodedChar* input          // non null, null terminated
             , const EncodedChar* inputLim       // see (*) above
             , CharCount& outBodyEncodedChars
             , CharCount& outTotalEncodedChars
             , CharCount& outBodyChars
             , CharCount& outTotalChars );

         template<const bool buildAST>
         RegexPattern* CompileProgram
             ( Node* root,
               const EncodedChar*& currentCharacter,
               const CharCount totalLen,
               const CharCount bodyChars,
               const CharCount bodyEncodedChars,
               const CharCount totalChars,
               const RegexFlags flags );

         static void CaptureEmptySourceAndNoGroups(Program* program);

         // bodyChars is number of unicode characters in program body, which may be less than the number
         // of underlying UTF-8 characters
         void CaptureSourceAndGroups(Recycler* recycler, Program* program, const EncodedChar* body, CharCount bodyChars, CharCount bodyEncodedChars);

         // Returns the literal buffer pointer (see litbuf above).
         inline const Char* GetLitbuf() { return litbuf; }

         void FreeBody();

         // Returns m_cMultiUnits, which is not declared in this class; it is reached through this->,
         // so it presumably comes from the EncodingPolicy base -- confirm.
         size_t GetMultiUnits() { return this->m_cMultiUnits; }
     };
 }
	//-------------------------------------------------------------------------------------------------------
	// Copyright (C) Microsoft. All rights reserved.
	// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
	//-------------------------------------------------------------------------------------------------------
	#pragma once

	namespace UnifiedRegex
	{
	// Describes a regex parse failure: an HRESULT error code together with the position it
	// refers to, expressed both in logical (unicode) characters and in encoded units.
	struct ParseError
	{
	// NOTE(review): presumably true when the error refers to the pattern body rather than
	// the options -- confirm against the code that constructs ParseError.
	bool isBody;
	CharCount pos; // Position in unicode characters
	CharCount encodedPos; // Position in underlying characters (eg utf-8 bytes)
	HRESULT error; // Error code describing the failure

	// Declared here, defined elsewhere. The parameters carry the same names as the fields above.
	ParseError(bool isBody, CharCount pos, CharCount encodedPos, HRESULT error);
	};

	// Regular-expression parser, parameterized on the encoding of its input (EncodingPolicy supplies
	// the EncodedChar type) and on IsLiteral.
	// NOTE(review): IsLiteral presumably distinguishes regex literals from dynamically created
	// patterns (cf. ParseLiteral vs ParseDynamic) -- confirm against the implementation file.
	// The grammar methods come in "Pass0"/"Pass1" pairs; per the member comments below, pass 0 builds
	// the surrogate pair list that pass 1 consumes.
	template <typename EncodingPolicy, const bool IsLiteral>
	class Parser : private EncodingPolicy, private Chars<char16>
	{
	private:
	    typedef typename EncodingPolicy::EncodedChar EncodedChar;

	    // A linked list node to track indices of surrogate pairs.
	    struct SurrogatePairTracker
	    {
	        const EncodedChar* location;
	        // If this surrogate pair is inside a range, then rangeLocation isn't null.
	        const EncodedChar* rangeLocation;
	        codepoint_t value;
	        uint32 length;
	        size_t multiUnits;
	        SurrogatePairTracker* next;

	        // Creates a node for a surrogate pair that is not inside a range (rangeLocation is null).
	        // Initializers are listed in member declaration order, which is the order in which the
	        // members are actually initialized.
	        SurrogatePairTracker(const EncodedChar* location, codepoint_t value, uint32 length, size_t multiUnits)
	            : location(location)
	            , rangeLocation(nullptr)
	            , value(value)
	            , length(length)
	            , multiUnits(multiUnits)
	            , next(nullptr)
	        {
	        }

	        // Creates a node that also records rangeLocation. Initializers are listed in member
	        // declaration order.
	        SurrogatePairTracker(const EncodedChar* location, const EncodedChar* rangeLocation, codepoint_t value, uint32 length, size_t multiUnits)
	            : location(location)
	            , rangeLocation(rangeLocation)
	            , value(value)
	            , length(length)
	            , multiUnits(multiUnits)
	            , next(nullptr)
	        {
	        }

	        // True when rangeLocation is non-null.
	        bool IsInsideRange() const
	        {
	            return this->rangeLocation != nullptr;
	        }
	    };

	    // NOTE(review): presumably the initial capacity of litbuf -- confirm in EnsureLitbuf.
	    static const CharCount initLitbufSize = 16;

	    Js::ScriptContext* scriptContext;
	    // Arena for nodes and items needed only during compilation
	    ArenaAllocator* ctAllocator;
	    // Standard characters using raw encoding character representation (eg char for utf-8)
	    StandardChars<EncodedChar>* standardEncodedChars;
	    // Standard characters using final character representation (eg char16 for Unicode)
	    StandardChars<Char>* standardChars;
	#if ENABLE_REGEX_CONFIG_OPTIONS
	    DebugWriter* w;
	#endif

	    // Input buffer state (see "Input buffer management" below).
	    const EncodedChar* input;
	    const EncodedChar* inputLim;
	    const EncodedChar* next;
	    bool inBody;

	    // Maximum number of capturing groups allowed, including the entire regexp, which is always
	    // considered a capturing group. Using INT16_MAX allows us to pass one value for each
	    // group, plus a few additional values, to a JavaScript function without overflowing the
	    // number of arguments. This is important, for example, in the implementation of
	    // String.prototype.replace, where the second argument is a function.
	    static const uint16 MAX_NUM_GROUPS = INT16_MAX;

	    uint16 numGroups; // determined in first parse

	    // Keeps track of how many capturing groups we've seen during parsing. We use an int, rather than
	    // a uint16, to be sure we don't overflow during parsing and only check it against MAX_NUM_GROUPS at
	    // the end. (We know we can't overflow an int because strings and regex literals are limited to
	    // 2G characters and therefore to 1G pairs of parentheses, which can fit into an int. I'd prefer
	    // to use size_t here, but making that change would go down a serious rabbit hole changing the
	    // interface to UnifiedRegex::Node::AccumDefineGroups.)
	    int nextGroupId;

	    // Buffer accumulating all literals.
	    // In compile-time allocator, must be transferred to runtime allocator when build program
	    Char* litbuf;
	    CharCount litbufLen;
	    CharCount litbufNext;

	    // During pass 0, if /u option for regex is provided, a linked list will be built up to
	    // track positions of surrogate pairs in the buffer. During pass 1, these linked lists will be used
	    // to figure out when to output a surrogate pair node.
	    SurrogatePairTracker* surrogatePairList;
	    SurrogatePairTracker* currentSurrogatePairNode;
	    bool unicodeFlagPresent;
	    bool caseInsensitiveFlagPresent;

	    // The following two variables (tempLocationOfSurrogatePair and codePointAtTempLocation) are used
	    // to determine if the surrogate pair has been encountered.
	    // First holds the temporary location, second holds the value of the codepoint
	    const EncodedChar* tempLocationOfSurrogatePair;
	    // This will be set to a location when we are parsing a range in TermPass0, and cleared when we are out of it.
	    const EncodedChar* tempLocationOfRange;
	    codepoint_t codePointAtTempLocation;

	    // When a surrogate is added for tracking, this will be updated.
	    const EncodedChar* positionAfterLastSurrogate;
	    codepoint_t valueOfLastSurrogate;

	    // deferred error state.
	    ParseError* deferredIfNotUnicodeError;
	    ParseError* deferredIfUnicodeError;

	private:

	    //
	    // Input buffer management
	    //

	    void SetPosition(const EncodedChar* input, const EncodedChar* inputLim, bool inBody);

	    // Current position in number of logical characters, regardless of underlying character encoding
	    inline CharCount Pos();

	    // The EC* helpers operate on encoded characters (EncodedChar).
	    inline bool IsEOF();
	    inline bool ECCanConsume(CharCount n = 1);
	    inline EncodedChar ECLookahead(CharCount n = 0);
	    inline EncodedChar ECLookback(CharCount n = 0);
	    inline void ECConsume(CharCount n = 1);
	    inline void ECConsumeMultiUnit(CharCount n = 1);
	    inline void ECRevert(CharCount n = 1);

	    //
	    // Helpers
	    //
	    int TryParseExtendedUnicodeEscape(Char& c, bool& previousSurrogatePart, bool trackSurrogatePair = false);
	    void TrackIfSurrogatePair(codepoint_t codePoint, const EncodedChar* location, uint32 consumptionLength);
	    Node* CreateSurrogatePairAtom(char16 lower, char16 upper);
	    AltNode* AppendSurrogateRangeToDisjunction(codepoint_t lowerCodePoint, codepoint_t upperCodePoint, AltNode *lastAlttNode);
	    AltNode* AppendSurrogatePairToDisjunction(codepoint_t codePoint, AltNode *lastAlttNode);

	    //
	    // Errors
	    //

	    void Fail(HRESULT error);
	    // NOTE(review): presumably these record the error in deferredIfUnicodeError /
	    // deferredIfNotUnicodeError rather than failing immediately -- confirm in the implementation.
	    void DeferredFailIfUnicode(HRESULT error);
	    void DeferredFailIfNotUnicode(HRESULT error);
	    inline void ECMust(EncodedChar ec, HRESULT error);
	    inline Char NextChar();

	    //
	    // Patterns/Disjunctions/Alternatives
	    //

	    void PatternPass0();
	    Node* PatternPass1();
	    Node* UnionNodes(Node* prev, Node* curr);
	    void DisjunctionPass0(int depth);
	    Node* DisjunctionPass1();
	    bool IsEndOfAlternative();
	    void EnsureLitbuf(CharCount size);
	    void AccumLiteral(MatchLiteralNode* deferredLiteralNode, Node* charOrLiteralNode);
	    Node* FinalTerm(Node* node, MatchLiteralNode* deferredLiteralNode);
	    void AlternativePass0(int depth);
	    Node* AlternativePass1();

	    //
	    // Terms
	    //

	    Node* NewLoopNode(CharCount lower, CharCountOrFlag upper, bool isGreedy, Node* body);
	    bool AtQuantifier();
	    bool OptNonGreedy();
	    CharCount RepeatCount();
	    void TermPass0(int depth);
	    Node* TermPass1(MatchCharNode* deferredCharNode, bool& previousSurrogatePart);
	    bool AtomEscapePass0();
	    // node is a reference to a Node pointer and deferredCharNode is a pointer, matching the
	    // deferredCharNode parameter of TermPass1 and ClassEscapePass1.
	    bool AtomEscapePass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart);
	    bool SurrogatePairPass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart);

	    //
	    // Classes
	    //

	    bool AtSecondSingletonClassAtom();
	    void CharacterClassPass0();
	    template <bool containsSurrogates>
	    Node* CharacterClassPass1();
	    bool ClassEscapePass0(Char& singleton, bool& previousSurrogatePart);
	    Node* ClassEscapePass1(MatchCharNode* deferredCharNode, MatchSetNode* deferredSetNode, bool& previousSurrogatePart);
	    Node* GetNodeWithValidCharacterSet(EncodedChar ch);

	    //
	    // Options
	    //

	    void Options(RegexFlags& flags);

	public:

	    Parser
	        ( Js::ScriptContext* scriptContext
	        , ArenaAllocator* ctAllocator
	        , StandardChars<EncodedChar>* standardEncodedChars
	        , StandardChars<Char>* standardChars
	        , bool isUtf8
	#if ENABLE_REGEX_CONFIG_OPTIONS
	        , DebugWriter* w
	#endif
	        );

	    //
	    // Entry points
	    //


	    Node* ParseDynamic
	        ( const EncodedChar* body           // non null, null terminated (may contain embedded nulls)
	        , const EncodedChar* bodyLim        // points to terminating null of above
	        , const EncodedChar* opts           // may be null if no options, otherwise null terminated
	        , const EncodedChar* optsLim        // if above non-null, points to terminating null of above
	        , RegexFlags& flags );

	    // (*) For ParseLiteral:
	    //  - input string must be null terminated
	    //  - inputLim may point to the terminating null in above or before it
	    //     - if the latter, input is known to be syntactically well-formed so that the parser
	    //       will find the natural end of the regex literal before passing inputLim
	    //  - input may contain nulls before the inputLim

	    Node* ParseLiteral
	        ( const EncodedChar* input          // non null, null terminated (may contain embedded nulls)
	        , const EncodedChar* inputLim       // see (*) above
	        , CharCount& outBodyEncodedChars    // in encoded characters, not including trailing '/'
	        , CharCount& outTotalEncodedChars   // in encoded characters, including trailing '/' and any options
	        , CharCount& outBodyChars           // in unicode characters, not including trailing '/'
	        , CharCount& outTotalChars          // in unicode characters, including trailing '/' and any options
	        , RegexFlags& flags );

	    // Same inputs and length outputs as ParseLiteral, but returns no Node and takes no flags.
	    void ParseLiteralNoAST
	        ( const EncodedChar* input          // non null, null terminated
	        , const EncodedChar* inputLim       // see (*) above
	        , CharCount& outBodyEncodedChars
	        , CharCount& outTotalEncodedChars
	        , CharCount& outBodyChars
	        , CharCount& outTotalChars );

	    template<const bool buildAST>
	    RegexPattern* CompileProgram
	        ( Node* root,
	          const EncodedChar*& currentCharacter,
	          const CharCount totalLen,
	          const CharCount bodyChars,
	          const CharCount bodyEncodedChars,
	          const CharCount totalChars,
	          const RegexFlags flags );

	    static void CaptureEmptySourceAndNoGroups(Program* program);

	    // bodyChars is number of unicode characters in program body, which may be less than the number
	    // of underlying UTF-8 characters
	    void CaptureSourceAndGroups(Recycler* recycler, Program* program, const EncodedChar* body, CharCount bodyChars, CharCount bodyEncodedChars);

	    // Returns the literal buffer pointer (see litbuf above).
	    inline const Char* GetLitbuf() { return litbuf; }

	    void FreeBody();

	    // Returns m_cMultiUnits, which is not declared in this class; it is reached through this->,
	    // so it presumably comes from the EncodingPolicy base -- confirm.
	    size_t GetMultiUnits() { return this->m_cMultiUnits; }
	};
	}