blob: fa8bdcc3743cf708c5036919a129c78631ec9dc1 [file]
//-------------------------------------------------------------------------------------------------------
// Copyright (C) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
//-------------------------------------------------------------------------------------------------------
/*
--------------------------------------------------------------------------------------------------------------------------------
Original
--------------------------------------------------------------------------------------------------------------------------------
Pattern ::= Disjunction
Disjunction ::= Alternative | Alternative '|' Disjunction
Alternative ::= [empty] | Alternative Term
Term ::= Assertion | Atom | Atom Quantifier
Assertion ::= '^' | '$' | '\' 'b' | '\' 'B' | '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')'
Quantifier ::= QuantifierPrefix | QuantifierPrefix '?'
QuantifierPrefix ::= '*' | '+' | '?' | '{' DecimalDigits '}' | '{' DecimalDigits ',' '}' | '{' DecimalDigits ',' DecimalDigits '}'
Atom ::= PatternCharacter | '.' | '\' AtomEscape | CharacterClass | '(' Disjunction ')' | '(' '?' ':' Disjunction ')'
PatternCharacter ::= SourceCharacter but not any of { '^', '$', '\', '.', '*', '+', '?', '(', ')', '[', ']', '{', '}', '|' }
AtomEscape ::= DecimalEscape | CharacterEscape | CharacterClassEscape
CharacterEscape ::= ControlEscape | 'c' ControlLetter | HexEscapeSequence | UnicodeEscapeSequence | IdentityEscape
ControlEscape ::= one of { 'f', 'n', 'r', 't', 'v' }
ControlLetter ::= one of { 'a'..'z', 'A'..'Z' }
IdentityEscape ::= (SourceCharacter but not IdentifierPart) | <ZWJ> | <ZWNJ>
DecimalEscape ::= DecimalIntegerLiteral [lookahead not in DecimalDigit]
CharacterClassEscape :: one of { 'd', 'D', 's', 'S', 'w', 'W' }
CharacterClass ::= '[' [lookahead not in {'^'}] ClassRanges ']' | '[' '^' ClassRanges ']'
ClassRanges ::= [empty] | NonemptyClassRanges
NonemptyClassRanges ::= ClassAtom | ClassAtom NonemptyClassRangesNoDash | ClassAtom '-' ClassAtom ClassRanges
NonemptyClassRangesNoDash ::= ClassAtom | ClassAtomNoDash NonemptyClassRangesNoDash | ClassAtomNoDash '-' ClassAtom ClassRanges
ClassAtom ::= '-' | ClassAtomNoDash
ClassAtomNoDash ::= SourceCharacter but not one of { '\', ']', '-' } | '\' ClassEscape
ClassEscape ::= DecimalEscape | 'b' | CharacterEscape | CharacterClassEscape
SourceCharacter ::= <unicode character>
HexEscapeSequence ::= 'x' HexDigit HexDigit
UnicodeEscapeSequence ::= 'u' HexDigit HexDigit HexDigit HexDigit | 'u' '{' HexDigits '}'
HexDigit ::= one of { '0'..'9', 'a'..'f', 'A'..'F' }
IdentifierStart ::= UnicodeLetter | '$' | '_' | '\' UnicodeEscapeSequence
IdentifierPart ::= IdentifierStart | UnicodeCombiningMark | UnicodeDigit | UnicodeConnectorPunctuation | <ZWNJ> | <ZWJ>
UnicodeLetter ::= <unicode Uppercase letter> | <unicode Lowercase letter> | <unicode Titlecase letter> | <unicode Modifier letter> | <unicode Other letter> | <unicode Letter number>
UnicodeCombiningMark = <unicode Non-spacing mark> | <unicode combining spacing mark>
UnicodeDigit ::= <unicode Decimal number>
UnicodeConnectorPunctuation ::= <unicode Connector punctuation>
DecimalIntegerLiteral ::= '0' | NonZeroDigit DecimalDigits?
DecimalDigits ::= DecimalDigit | DecimalDigits DecimalDigit
DecimalDigit ::= one of { '0'..'9' }
NonZeroDigit ::= one of { '1'..'9' }
------------------
Annex B Deviations
------------------
1. The assertions (?= ) and (?! ) are treated as though they have a surrounding non-capture group, and hence can be quantified.
Other assertions are not quantifiable.
QuantifiableAssertion [added] ::= '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')'
Assertion [replaced] ::= '^' | '$' | '\' 'b' | '\' 'B' | QuantifiableAssertion
Term ::= ... | QuantifiableAssertion Quantifier [added]
--------------------------------------------------------------------------------------------------------------------------------
Left factored
--------------------------------------------------------------------------------------------------------------------------------
Pattern ::= Disjunction
Disjunction ::= Alternative ('|' Alternative)*
FOLLOW(Disjunction) = { <eof>, ')' }
Alternative ::= Term*
FOLLOW(Alternative) = { <eof>, ')', '|' }
Term ::= ( '^' | '$' | '\' 'b' | '\' 'B' | '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')' | Atom Quantifier? )
| ( PatternCharacter | '.' | '\' AtomEscape | CharacterClass | '(' Disjunction ')' | '(' '?' ':' Disjunction ')' )
( '*' | '+' | '?' | '{' DecimalDigits (',' DecimalDigits?)? '}' ) '?'?
PatternCharacter ::= <unicode character> but not any of { '^', '$', '\', '.', '*', '+', '?', '(', ')', '[', ']', '{', '}', '|' }
AtomEscape ::= DecimalEscape | CharacterEscape | CharacterClassEscape
CharacterEscape ::= ControlEscape | 'c' ControlLetter | HexEscapeSequence | UnicodeEscapeSequence | IdentityEscape
ControlEscape ::= one of { 'f', 'n', 'r', 't', 'v' }
ControlLetter ::= one of { 'a'..'z', 'A'..'Z' }
IdentityEscape ::= <unicode character> but not <unicode Uppercase letter>, <unicode Lowercase letter>, <unicode Titlecase letter>, <unicode Modifier letter>, <unicode Other letter>, <unicode Letter number>, '$', '_', <unicode Non-spacing mark>, <unicode combining spacing mark>, <unicode Decimal number>, <unicode Connector punctuation>
DecimalEscape ::= DecimalIntegerLiteral [lookahead not in DecimalDigit]
CharacterClassEscape :: one of { 'd', 'D', 's', 'S', 'w', 'W' }
CharacterClass ::= '[' [lookahead not in {'^'}] ClassRanges ']' | '[' '^' ClassRanges ']'
ClassRanges ::= [empty] | NonemptyClassRanges
NonemptyClassRanges ::= ClassAtom | ClassAtom NonemptyClassRangesNoDash | ClassAtom '-' ClassAtom ClassRanges
NonemptyClassRangesNoDash ::= ClassAtom | ClassAtomNoDash NonemptyClassRangesNoDash | ClassAtomNoDash '-' ClassAtom ClassRanges
ClassAtom ::= '-' | ClassAtomNoDash
ClassAtomNoDash ::= SourceCharacter but not one of { '\', ']', '-' } | '\' ClassEscape
ClassEscape ::= DecimalEscape | 'b' | CharacterEscape | CharacterClassEscape
HexEscapeSequence ::= 'x' HexDigit{2}
UnicodeEscapeSequence ::= 'u' HexDigit{4} | 'u' '{' HexDigits{4-6} '}'
HexDigit ::= one of { '0'..'9', 'a'..'f', 'A'..'F' }
DecimalIntegerLiteral ::= '0' | NonZeroDigit DecimalDigits?
DecimalDigits ::= DecimalDigit+
DecimalDigit ::= one of { '0'..'9' }
NonZeroDigit ::= one of { '1'..'9' }
------------------
Annex B Deviations
------------------
QuantifiableAssertion [added] ::= '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')'
Term ::= ... | '(' '?' '=' Disjunction ')' [removed] | '(' '?' '!' Disjunction ')' [removed] | QuantifiableAssertion Quantifier? [added]
*/
#include "ParserPch.h"
namespace UnifiedRegex
{
template <typename P, const bool IsLiteral>
const CharCount Parser<P, IsLiteral>::initLitbufSize;
ParseError::ParseError(bool isBody, CharCount pos, CharCount encodedPos, HRESULT error)
: isBody(isBody), pos(pos), encodedPos(encodedPos), error(error)
{
}
template <typename P, const bool IsLiteral>
Parser<P, IsLiteral>::Parser
( Js::ScriptContext* scriptContext
, ArenaAllocator* ctAllocator
, StandardChars<EncodedChar>* standardEncodedChars
, StandardChars<Char>* standardChars
, bool isFromExternalSource
#if ENABLE_REGEX_CONFIG_OPTIONS
, DebugWriter* w
#endif
)
: scriptContext(scriptContext)
, ctAllocator(ctAllocator)
, standardEncodedChars(standardEncodedChars)
, standardChars(standardChars)
#if ENABLE_REGEX_CONFIG_OPTIONS
, w(w)
#endif
, input(0)
, inputLim(0)
, next(0)
, inBody(false)
, numGroups(1)
, nextGroupId(1) // implicit overall group always takes index 0
, litbuf(0)
, litbufLen(0)
, litbufNext(0)
, surrogatePairList(nullptr)
, currentSurrogatePairNode(nullptr)
, tempLocationOfSurrogatePair(nullptr)
, tempLocationOfRange(nullptr)
, codePointAtTempLocation(0)
, unicodeFlagPresent(false)
, caseInsensitiveFlagPresent(false)
, positionAfterLastSurrogate(nullptr)
, valueOfLastSurrogate(INVALID_CODEPOINT)
, deferredIfNotUnicodeError(nullptr)
, deferredIfUnicodeError(nullptr)
{
if (isFromExternalSource)
this->FromExternalSource();
}
//
// Input buffer management
//
template <typename P, const bool IsLiteral>
void Parser<P, IsLiteral>::SetPosition(const EncodedChar* input, const EncodedChar* inputLim, bool inBody)
{
this->input = input;
this->inputLim = inputLim;
next = input;
this->inBody = inBody;
this->RestoreMultiUnits(0);
}
template <typename P, const bool IsLiteral>
inline CharCount Parser<P, IsLiteral>::Pos()
{
CharCount nextOffset = Chars<EncodedChar>::OSB(next, input);
Assert(nextOffset >= this->m_cMultiUnits);
return nextOffset - (CharCount) this->m_cMultiUnits;
}
template <typename P, const bool IsLiteral>
inline bool Parser<P, IsLiteral>::IsEOF()
{
return next >= inputLim;
}
template <typename P, const bool IsLiteral>
inline bool Parser<P, IsLiteral>::ECCanConsume(CharCount n /*= 1*/)
{
return next + n <= inputLim;
}
template <typename P, const bool IsLiteral>
inline typename P::EncodedChar Parser<P, IsLiteral>::ECLookahead(CharCount n /*= 0*/)
{
// Ok to look ahead to terminating 0
Assert(next + n <= inputLim);
return next[n];
}
template <typename P, const bool IsLiteral>
inline typename P::EncodedChar Parser<P, IsLiteral>::ECLookback(CharCount n /*= 0*/)
{
// Ok to look ahead to terminating 0
Assert(n + input <= next);
return *(next - n);
}
template <typename P, const bool IsLiteral>
inline void Parser<P, IsLiteral>::ECConsume(CharCount n /*= 1*/)
{
Assert(next + n <= inputLim);
#if DBG
for (CharCount i = 0; i < n; i++)
Assert(!this->IsMultiUnitChar(next[i]));
#endif
next += n;
}
template <typename P, const bool IsLiteral>
inline void Parser<P, IsLiteral>::ECConsumeMultiUnit(CharCount n /*= 1*/)
{
Assert(next + n <= inputLim);
next += n;
}
template <typename P, const bool IsLiteral>
inline void Parser<P, IsLiteral>::ECRevert(CharCount n /*= 1*/)
{
Assert(n + input <= next);
next -= n;
}
//
// Helpers
//
template <typename P, const bool IsLiteral>
int Parser<P, IsLiteral>::TryParseExtendedUnicodeEscape(Char& c, bool& previousSurrogatePart, bool trackSurrogatePair /* = false */)
{
if (!scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
{
return 0;
}
if (!ECCanConsume(2) || ECLookahead(0) !='{' || !standardEncodedChars->IsHex(ECLookahead(1)))
{
return 0;
}
// The first character is mandatory to consume escape sequence, so we check for it above, at this stage we can set it as we already checked.
codepoint_t codePoint = standardEncodedChars->DigitValue(ECLookahead(1));
int i = 2;
while(ECCanConsume(i + 1) && standardEncodedChars->IsHex(ECLookahead(i)))
{
codePoint <<= 4;
codePoint += standardEncodedChars->DigitValue(ECLookahead(i));
if (codePoint > 0x10FFFF)
{
return 0;
}
i++;
}
if(!ECCanConsume(i + 1) || ECLookahead(i) != '}')
{
return 0;
}
uint consumptionNumber = i + 1;
Assert(consumptionNumber >= 3);
if (!previousSurrogatePart && trackSurrogatePair && this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
{
// Current location
TrackIfSurrogatePair(codePoint, (next - 1), consumptionNumber + 1);
}
char16 other;
// Generally if this code point is a single character, then we take it and return.
// If the character is made up of two characters then we emit the first and backtrack to the start of th escape sequence;
// Following that we check if we have already seen the first character, and if so emit the second and consume the entire escape sequence.
if (codePoint < 0x10000)
{
c = UTC(codePoint);
ECConsumeMultiUnit(consumptionNumber);
}
else if (previousSurrogatePart)
{
previousSurrogatePart = false;
Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &other, &c);
ECConsumeMultiUnit(consumptionNumber);
}
else
{
previousSurrogatePart = true;
Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &c, &other);
Assert(ECLookback(1) == 'u' && ECLookback(2) == '\\');
ECRevert(2);
}
return consumptionNumber;
}
// This function has the following 'knowledge':
// - A codepoint that might be part of a surrogate pair or part of one
// - A location where that codepoint is located
// - Previously tracked part of surrogate pair (tempLocationOfSurrogatePair, and codePointAtTempLocation), which is always a surrogate lower part.
// - A pointer to the current location of the linked list
// - If a previous location is tracked, then it is of a parsed character (not given character) before current, and not same to current
// This can't be asserted directly, and has to be followed by callers. Term pass can reset with each iteration, as well as this method in cases it needs.
template <typename P, const bool IsLiteral>
void Parser<P, IsLiteral>:: TrackIfSurrogatePair(codepoint_t codePoint, const EncodedChar* location, uint32 consumptionLength)
{
Assert(codePoint < 0x110000);
Assert(location != nullptr);
Assert(location != this->tempLocationOfSurrogatePair);
if (Js::NumberUtilities::IsSurrogateLowerPart(codePoint))
{
this->tempLocationOfSurrogatePair = location;
this->codePointAtTempLocation = codePoint;
}
else
{
if(Js::NumberUtilities::IsSurrogateUpperPart(codePoint) && this->tempLocationOfSurrogatePair != nullptr)
{
Assert(Js::NumberUtilities::IsSurrogateLowerPart(codePointAtTempLocation));
consumptionLength = (uint32)(location - this->tempLocationOfSurrogatePair) + consumptionLength;
codePoint = Js::NumberUtilities::SurrogatePairAsCodePoint(codePointAtTempLocation, codePoint);
location = this->tempLocationOfSurrogatePair;
}
// At this point we can clear previous location, and then if codePoint is bigger than 0xFFFF store it, as we either received it or combined it above
this->tempLocationOfSurrogatePair = nullptr;
this->codePointAtTempLocation = 0;
}
if (codePoint > 0xFFFF)
{
this->positionAfterLastSurrogate = location + consumptionLength;
this->valueOfLastSurrogate = codePoint;
// When parsing without AST we aren't given an allocator. In addition, only the 2 lines above are used during Pass 0;
// while the bottom is used during Pass 1 (which isn't done when ParseNoAST)
if(this->ctAllocator != nullptr)
{
SurrogatePairTracker* node = Anew(this->ctAllocator, SurrogatePairTracker, location, this->tempLocationOfRange, codePoint, consumptionLength, this->m_cMultiUnits);
if (surrogatePairList == nullptr)
{
Assert(currentSurrogatePairNode == nullptr);
surrogatePairList = node;
currentSurrogatePairNode = node;
}
else
{
Assert(currentSurrogatePairNode != nullptr);
currentSurrogatePairNode->next = node;
currentSurrogatePairNode = node;
}
}
}
}
template <typename P, const bool IsLiteral>
Node* Parser<P, IsLiteral>::CreateSurrogatePairAtom(char16 lower, char16 upper)
{
MatchLiteralNode * literalNode = Anew(this->ctAllocator, MatchLiteralNode, 0, 0);
MatchCharNode lowerNode(lower);
MatchCharNode upperNode(upper);
AccumLiteral(literalNode, &lowerNode);
AccumLiteral(literalNode, &upperNode);
return literalNode;
}
// This function will create appropriate pairs of ranges and add them to the disjunction node.
// Terms used in comments:
// - A minor codePoint is smaller than major codePoint, and both define a range of codePoints above 0x10000; to avoid confusion between lower/upper denoting codeUnits composing the surrogate pair.
// - A boundary is a mod 0x400 alignment marker due to the nature of surrogate pairs representation. So the codepoint 0x10300 lies between boundaries 0x10000 and 0x10400.
// - A prefix is the range set used to represent the values from minorCodePoint to the first boundary above minorCodePoint if applicable.
// - A suffix is the range set used to represent the values from first boundary below majorCodePoint to the majorCodePoint if applicable.
// - A full range is the range set used to represent the values from first boundary above minorCodePoint to first boundary below majorCodePoint if applicable.
// The algorithm works as follows:
// 1. Determine minorBoundary (minorCodePoint - mod 0x400 +0x400) and majorBoundary (majorCodePoint - mod 0x400). minorBoundary > minorCodePoint and majorBoundary < majorCodePoint
// 2. Based on the codePoints and the boundaries, prefix, suffix, and full range is determined. Here are the rules:
// 2-a. If minorBoundary > majorBoundary, we have an inner boundary range, output just that.
// 2-b. If minorBoundary - 0x400u != minorCodepoint (i.e. codePoint doesn't lie right on a boundary to be part of a full range) we have a prefix.
// 2-c. If majorBoundary + 0x3FFu != majorCodepoint (i.e. codePoint doesn't lie right before a boundary to be part of a full range) we have a suffix.
// 2-d. We have a full range, if the two boundaries don't equal, OR the codePoints lie on the range boundaries opposite to what constitutes a prefix/suffix.
// Visual representation for sample range 0x10300 - 0x10900
// | [ _ | ^ ] |
// 0x10000 0x10800 0x11000
// [ ] - denote the actual range
// _ - minorBoundary
// ^ - majorBoundary
// | - other boundaries
// prefix is between [ and _
// suffix is between ^ and ]
// full range is between _ and ^
template <typename P, const bool IsLiteral>
AltNode* Parser<P, IsLiteral>::AppendSurrogateRangeToDisjunction(codepoint_t minorCodePoint, codepoint_t majorCodePoint, AltNode *lastAltNode)
{
Assert(minorCodePoint < majorCodePoint);
Assert(minorCodePoint >= 0x10000u);
Assert(majorCodePoint >= 0x10000u);
Assert(scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled() && unicodeFlagPresent);
char16 lowerMinorCodeUnit, upperMinorCodeUnit, lowerMajorCodeUnit, upperMajorCodeUnit;
Js::NumberUtilities::CodePointAsSurrogatePair(minorCodePoint, &lowerMinorCodeUnit, &upperMinorCodeUnit);
Js::NumberUtilities::CodePointAsSurrogatePair(majorCodePoint, &lowerMajorCodeUnit, &upperMajorCodeUnit);
// These boundaries represent whole range boundaries, as in 0x10000, 0x10400, 0x10800 etc
// minor boundary is the first boundary strictly above minorCodePoint
// major boundary is the first boundary below or equal to majorCodePoint
codepoint_t minorBoundary = minorCodePoint - (minorCodePoint % 0x400u) + 0x400u;
codepoint_t majorBoundary = majorCodePoint - (majorCodePoint % 0x400u);
Assert(minorBoundary >= 0x10000);
Assert(majorBoundary >= 0x10000);
AltNode* tailToAdd = nullptr;
// If the minor boundary is higher than major boundary, that means we have a range within the boundary and is less than 0x400
// Ex: 0x10430 - 0x10700 will have minor boundary of 0x10800 and major of 0x10400
// This pair will be represented in single range set.
const bool singleRange = minorBoundary > majorBoundary;
if (singleRange)
{
Assert(majorCodePoint - minorCodePoint < 0x400u);
Assert(lowerMinorCodeUnit == lowerMajorCodeUnit);
MatchCharNode* lowerCharNode = Anew(ctAllocator, MatchCharNode, lowerMinorCodeUnit);
MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false, false);
setNode->set.SetRange(ctAllocator, (Char)upperMinorCodeUnit, (Char)upperMajorCodeUnit);
ConcatNode* concatNode = Anew(ctAllocator, ConcatNode, lowerCharNode, Anew(ctAllocator, ConcatNode, setNode, nullptr));
tailToAdd = Anew(ctAllocator, AltNode, concatNode, nullptr);
}
else
{
Node* prefixNode = nullptr, *suffixNode = nullptr;
const bool twoConsecutiveRanges = minorBoundary == majorBoundary;
// For minorBoundary,
if (minorBoundary - minorCodePoint == 1) // Single character in minor range
{
// The prefix is only a surrogate pair atom
prefixNode = CreateSurrogatePairAtom(lowerMinorCodeUnit, upperMinorCodeUnit);
}
else if (minorCodePoint != minorBoundary - 0x400u) // Minor range isn't full
{
Assert(minorBoundary - minorCodePoint < 0x400u);
MatchCharNode* lowerCharNode = Anew(ctAllocator, MatchCharNode, (Char)lowerMinorCodeUnit);
MatchSetNode* upperSetNode = Anew(ctAllocator, MatchSetNode, false);
upperSetNode->set.SetRange(ctAllocator, (Char)upperMinorCodeUnit, (Char)0xDFFFu);
prefixNode = Anew(ctAllocator, ConcatNode, lowerCharNode, Anew(ctAllocator, ConcatNode, upperSetNode, nullptr));
}
else // Full minor range
{
minorBoundary -= 0x400u;
}
if (majorBoundary == majorCodePoint) // Single character in major range
{
// The suffix is only a surrogate pair atom
suffixNode = CreateSurrogatePairAtom(lowerMajorCodeUnit, upperMajorCodeUnit);
majorBoundary -= 0x400u;
}
else if (majorBoundary + 0x3FFu != majorCodePoint) // Major range isn't full
{
Assert(majorCodePoint - majorBoundary < 0x3FFu);
MatchCharNode* lowerCharNode = Anew(ctAllocator, MatchCharNode, (Char)lowerMajorCodeUnit);
MatchSetNode* upperSetNode = Anew(ctAllocator, MatchSetNode, false, false);
upperSetNode->set.SetRange(ctAllocator, (Char)0xDC00u, (Char)upperMajorCodeUnit);
suffixNode = Anew(ctAllocator, ConcatNode, lowerCharNode, Anew(ctAllocator, ConcatNode, upperSetNode, nullptr));
majorBoundary -= 0x400u;
}
const bool nonFullConsecutiveRanges = twoConsecutiveRanges && prefixNode != nullptr && suffixNode != nullptr;
if (nonFullConsecutiveRanges)
{
Assert(suffixNode != nullptr);
Assert(minorCodePoint != minorBoundary - 0x400u);
Assert(majorBoundary + 0x3FFu != majorCodePoint);
// If the minor boundary is equal to major boundary, that means we have a cross boundary range that only needs 2 nodes for prefix/suffix.
// We can only cross one boundary.
Assert(majorCodePoint - minorCodePoint < 0x800u);
tailToAdd = Anew(ctAllocator, AltNode, prefixNode, Anew(ctAllocator, AltNode, suffixNode, nullptr));
}
else
{
// We have 3 sets of ranges, comprising of prefix, full and suffix.
Assert(majorCodePoint - minorCodePoint >= 0x400u);
Assert((prefixNode != nullptr && suffixNode != nullptr) // Spanning more than two ranges
|| (prefixNode == nullptr && minorBoundary == minorCodePoint) // Two consecutive ranges and the minor is full
|| (suffixNode == nullptr && majorBoundary + 0x3FFu == majorCodePoint)); // Two consecutive ranges and the major is full
Node* lowerOfFullRange;
char16 lowerMinorBoundary, lowerMajorBoundary, ignore;
Js::NumberUtilities::CodePointAsSurrogatePair(minorBoundary, &lowerMinorBoundary, &ignore);
bool singleFullRange = majorBoundary == minorBoundary;
if (singleFullRange)
{
// The lower part of the full range is simple a surrogate lower char
lowerOfFullRange = Anew(ctAllocator, MatchCharNode, (Char)lowerMinorBoundary);
}
else
{
Js::NumberUtilities::CodePointAsSurrogatePair(majorBoundary, &lowerMajorBoundary, &ignore);
MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false, false);
setNode->set.SetRange(ctAllocator, (Char)lowerMinorBoundary, (Char)lowerMajorBoundary);
lowerOfFullRange = setNode;
}
MatchSetNode* fullUpperRange = Anew(ctAllocator, MatchSetNode, false, false);
fullUpperRange->set.SetRange(ctAllocator, (Char)0xDC00u, (Char)0xDFFFu);
// These are added in the following order [full] [prefix][suffix]
// This is doing by prepending, so in reverse.
if (suffixNode != nullptr)
{
tailToAdd = Anew(ctAllocator, AltNode, suffixNode, tailToAdd);
}
if (prefixNode != nullptr)
{
tailToAdd = Anew(ctAllocator, AltNode, prefixNode, tailToAdd);
}
tailToAdd = Anew(ctAllocator, AltNode, Anew(ctAllocator, ConcatNode, lowerOfFullRange, Anew(ctAllocator, ConcatNode, fullUpperRange, nullptr)), tailToAdd);
}
}
if (lastAltNode != nullptr)
{
Assert(lastAltNode->tail == nullptr);
lastAltNode->tail = tailToAdd;
}
return tailToAdd;
}
template <typename P, const bool IsLiteral>
AltNode* Parser<P, IsLiteral>::AppendSurrogatePairToDisjunction(codepoint_t codePoint, AltNode *lastAltNode)
{
char16 lower, upper;
Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &lower, &upper);
AltNode* tailNode = Anew(ctAllocator, AltNode, CreateSurrogatePairAtom(lower, upper), nullptr);
if (lastAltNode != nullptr)
{
lastAltNode->tail = tailNode;
}
return tailNode;
}
//
// Errors
//
template <typename P, const bool IsLiteral>
void Parser<P, IsLiteral>::Fail(HRESULT error)
{
throw ParseError(inBody, Pos(), Chars<EncodedChar>::OSB(next, input), error);
}
// This doesn't throw, but stores first error code for throwing later
template <typename P, const bool IsLiteral>
void Parser<P, IsLiteral>::DeferredFailIfUnicode(HRESULT error)
{
Assert(this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
if (this->deferredIfUnicodeError == nullptr)
{
this->deferredIfUnicodeError = Anew(ctAllocator, ParseError, inBody, Pos(), Chars<EncodedChar>::OSB(next, input), error);
}
}
// This doesn't throw, but stores first error code for throwing later
template <typename P, const bool IsLiteral>
void Parser<P, IsLiteral>::DeferredFailIfNotUnicode(HRESULT error)
{
Assert(this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
if (this->deferredIfNotUnicodeError == nullptr)
{
this->deferredIfNotUnicodeError = Anew(ctAllocator, ParseError, inBody, Pos(), Chars<EncodedChar>::OSB(next, input), error);
}
}
template <typename P, const bool IsLiteral>
inline void Parser<P, IsLiteral>::ECMust(EncodedChar ec, HRESULT error)
{
// We never look for 0
Assert(ec != 0);
if (ECLookahead() != ec)
Fail(error);
ECConsume();
}
template <typename P, const bool IsLiteral>
inline char16 Parser<P, IsLiteral>::NextChar()
{
Assert(!IsEOF());
// Could be an embedded 0
Char c = this->template ReadFull<true>(next, inputLim);
// No embedded newlines in literals
if (IsLiteral && standardChars->IsNewline(c))
Fail(ERRnoSlash);
return c;
}
//
// Patterns/Disjunctions/Alternatives
//
template <typename P, const bool IsLiteral>
void Parser<P, IsLiteral>::PatternPass0()
{
this->positionAfterLastSurrogate = nullptr;
this->deferredIfNotUnicodeError = nullptr;
this->deferredIfUnicodeError = nullptr;
DisjunctionPass0(0);
}
template <typename P, const bool IsLiteral>
Node* Parser<P, IsLiteral>::PatternPass1()
{
return DisjunctionPass1();
}
template <typename P, const bool IsLiteral>
Node* Parser<P, IsLiteral>::UnionNodes(Node* prev, Node* curr)
{
if (prev->tag == Node::MatchChar)
{
MatchCharNode* prevChar = (MatchCharNode*)prev;
if (curr->tag == Node::MatchChar)
{
MatchCharNode* currChar = (MatchCharNode*)curr;
if (prevChar->cs[0] == currChar->cs[0])
// Just ignore current node
return prevChar;
else
{
// Union chars into new set
MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false);
setNode->set.Set(ctAllocator, prevChar->cs[0]);
setNode->set.Set(ctAllocator, currChar->cs[0]);
return setNode;
}
}
else if (curr->tag == Node::MatchSet)
{
MatchSetNode* currSet = (MatchSetNode*)curr;
if (currSet->isNegation)
// Can't merge
return 0;
else
{
// Union chars into new set
MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false);
setNode->set.Set(ctAllocator, prevChar->cs[0]) ;
setNode->set.UnionInPlace(ctAllocator, currSet->set);
return setNode;
}
}
else
// Can't merge
return 0;
}
else if (prev->tag == Node::MatchSet)
{
MatchSetNode* prevSet = (MatchSetNode*)prev;
if (prevSet->isNegation)
// Can't merge
return 0;
else if (curr->tag == Node::MatchChar)
{
MatchCharNode* currChar = (MatchCharNode*)curr;
// Include char in prev set
prevSet->set.Set(ctAllocator, currChar->cs[0]);
return prevSet;
}
else if (curr->tag == Node::MatchSet)
{
MatchSetNode* currSet = (MatchSetNode*)curr;
if (currSet->isNegation)
// Can't merge
return 0;
else
{
// Include chars in prev set
prevSet->set.UnionInPlace(ctAllocator, currSet->set);
return prevSet;
}
}
else
// Can't merge
return 0;
}
else
// Can't merge
return 0;
}
template <typename P, const bool IsLiteral>
void Parser<P, IsLiteral>::DisjunctionPass0(int depth)
{
AlternativePass0(depth);
while (true)
{
// Could be terminating 0
if (ECLookahead() != '|')
return;
ECConsume();
AlternativePass0(depth);
}
}
template <typename P, const bool IsLiteral>
Node* Parser<P, IsLiteral>::DisjunctionPass1()
{
// Maintain the invariants:
// - alt lists have two or more items
// - alt list items are never alt lists (so we must inline them)
// - an alt list never contains two consecutive match-character/match-set nodes
// (so we must union consecutive items into a single set)
Node* node = AlternativePass1();
AltNode* last = 0;
// First node may be an alternative
if (node->tag == Node::Alt)
{
last = (AltNode*)node;
while (last->tail != 0)
last = last->tail;
}
while (true)
{
// Could be terminating 0
if (ECLookahead() != '|')
return node;
ECConsume(); // '|'
Node* next = AlternativePass1();
AnalysisAssert(next != nullptr);
Node* revisedPrev = UnionNodes(last == 0 ? node : last->head, next);
if (revisedPrev != 0)
{
// Can merge next into previously seen alternative
if (last == 0)
node = revisedPrev;
else
last->head = revisedPrev;
}
else if (next->tag == Node::Alt)
{
AltNode* nextList = (AltNode*)next;
// Append inner list to current list
revisedPrev = UnionNodes(last == 0 ? node : last->head, nextList->head);
if (revisedPrev != 0)
{
// Can merge head of list into previously seen alternative
if (last ==0)
node = revisedPrev;
else
last->head = revisedPrev;
nextList = nextList->tail;
}
AnalysisAssert(nextList != nullptr);
if (last == 0)
node = Anew(ctAllocator, AltNode, node, nextList);
else
last->tail = nextList;
while (nextList->tail != 0)
nextList = nextList->tail;
last = nextList;
}
else
{
// Append node
AltNode* cons = Anew(ctAllocator, AltNode, next, 0);
if (last == 0)
node = Anew(ctAllocator, AltNode, node, cons);
else
last->tail = cons;
last = cons;
}
}
}
template <typename P, const bool IsLiteral>
inline bool Parser<P, IsLiteral>::IsEndOfAlternative()
{
EncodedChar ec = ECLookahead();
// Could be terminating 0, but embedded 0 is part of alternative
return (ec == 0 && IsEOF()) || ec == ')' || ec == '|' || (IsLiteral && ec == '/');
}
template <typename P, const bool IsLiteral>
void Parser<P, IsLiteral>::EnsureLitbuf(CharCount size)
{
if (litbufLen - litbufNext < size)
{
CharCount newLen = max(litbufLen, initLitbufSize);
while (newLen < litbufNext + size)
newLen *= 2;
litbuf = (Char*)ctAllocator->Realloc(litbuf, litbufLen * sizeof(Char), newLen * sizeof(Char));
litbufLen = newLen;
}
}
template <typename P, const bool IsLiteral>
void Parser<P, IsLiteral>::AccumLiteral(MatchLiteralNode* deferredLiteralNode, Node* charOrLiteralNode)
{
Assert(charOrLiteralNode->tag == Node::MatchChar || charOrLiteralNode->tag == Node::MatchLiteral);
CharCount addLen = charOrLiteralNode->LiteralLength();
Assert(addLen > 0);
if (deferredLiteralNode->length == 0)
{
// Start a new literal
EnsureLitbuf(addLen);
deferredLiteralNode->offset = litbufNext;
deferredLiteralNode->length = addLen;
charOrLiteralNode->AppendLiteral(litbufNext, litbufLen, litbuf);
}
else if (deferredLiteralNode->offset + deferredLiteralNode->length == litbufNext)
{
// Keep growing the current literal
EnsureLitbuf(addLen);
charOrLiteralNode->AppendLiteral(litbufNext, litbufLen, litbuf);
deferredLiteralNode->length += addLen;
}
else if (charOrLiteralNode->tag == Node::MatchLiteral && deferredLiteralNode->offset + deferredLiteralNode->length == ((MatchLiteralNode*)charOrLiteralNode)->offset)
{
// Absorb next literal into current literal since they are adjacent
deferredLiteralNode->length += addLen;
}
else
{
// Abandon current literal and start a fresh one (leaves gap)
EnsureLitbuf(deferredLiteralNode->length + addLen);
js_wmemcpy_s(litbuf + litbufNext, litbufLen - litbufNext, litbuf + deferredLiteralNode->offset, deferredLiteralNode->length);
deferredLiteralNode->offset = litbufNext;
litbufNext += deferredLiteralNode->length;
charOrLiteralNode->AppendLiteral(litbufNext, litbufLen, litbuf);
deferredLiteralNode->length += addLen;
}
}
template <typename P, const bool IsLiteral>
Node* Parser<P, IsLiteral>::FinalTerm(Node* node, MatchLiteralNode* deferredLiteralNode)
{
if (node == deferredLiteralNode)
{
#if DBG
if (deferredLiteralNode->length == 0)
Assert(false);
#endif
Assert(deferredLiteralNode->offset < litbufNext);
Assert(deferredLiteralNode->offset + deferredLiteralNode->length <= litbufNext);
if (deferredLiteralNode->length == 1)
{
node = Anew(ctAllocator, MatchCharNode, litbuf[deferredLiteralNode->offset]);
if (deferredLiteralNode->offset + deferredLiteralNode->length == litbufNext)
// Reclaim last added character
litbufNext--;
// else: leave a gap in the literal buffer
}
else
node = Anew(ctAllocator, MatchLiteralNode, *deferredLiteralNode);
deferredLiteralNode->offset = 0;
deferredLiteralNode->length = 0;
}
return node;
}
template <typename P, const bool IsLiteral>
void Parser<P, IsLiteral>::AlternativePass0(int depth)
{
while (!IsEndOfAlternative())
TermPass0(depth);
}
template <typename P, const bool IsLiteral>
Node* Parser<P, IsLiteral>::AlternativePass1()
{
if (IsEndOfAlternative())
return Anew(ctAllocator, SimpleNode, Node::Empty);
MatchCharNode deferredCharNode(0);
MatchLiteralNode deferredLiteralNode(0, 0);
// Maintain the invariants:
// - concat lists have two or more items
// - concat list items are never concat lists
// - a concat list never contains two consecutive match-character/match-literal nodes
bool previousSurrogatePart = false;
Node* node = TermPass1(&deferredCharNode, previousSurrogatePart);
AnalysisAssert(node != nullptr);
ConcatNode* last = 0;
// First node may be a concat
if (node->tag == Node::Concat)
{
last = (ConcatNode*)node;
while (last->tail != 0)
last = last->tail;
}
if (last == 0)
{
if (node->LiteralLength() > 0)
{
// Begin a new literal
AccumLiteral(&deferredLiteralNode, node);
node = &deferredLiteralNode;
}
}
else
{
if (last->head->LiteralLength() > 0)
{
// Begin a new literal
AccumLiteral(&deferredLiteralNode, last->head);
last->head = &deferredLiteralNode;
}
}
while (!IsEndOfAlternative())
{
Node* next = TermPass1(&deferredCharNode, previousSurrogatePart);
AnalysisAssert(next != nullptr);
if (next->LiteralLength() > 0)
{
// Begin a new literal or grow the existing literal
AccumLiteral(&deferredLiteralNode, next);
if (last == 0)
{
if (node != &deferredLiteralNode)
{
// So far we have first item and the current literal
ConcatNode* cons = Anew(ctAllocator, ConcatNode, &deferredLiteralNode, 0);
node = Anew(ctAllocator, ConcatNode, node, cons);
last = cons;
}
// else: keep growing first literal
}
else
{
if (last->head != &deferredLiteralNode)
{
// Append a new literal node
ConcatNode* cons = Anew(ctAllocator, ConcatNode, &deferredLiteralNode, 0);
last->tail = cons;
last = cons;
}
// else: keep growing current literal
}
}
else if (next->tag == Node::Concat)
{
// Append this list to accumulated list
ConcatNode* nextList = (ConcatNode*)next;
if (nextList->head->LiteralLength() > 0 &&
((last == 0 && node == &deferredLiteralNode) ||
(last != 0 && last->head == &deferredLiteralNode)))
{
// Absorb the next character or literal into the current literal
// (may leave a gab in litbuf)
AccumLiteral(&deferredLiteralNode, nextList->head);
nextList = nextList->tail;
// List has at least two items
AnalysisAssert(nextList != 0);
// List should be in canonical form, so no consecutive chars/literals
Assert(nextList->head->LiteralLength() == 0);
}
if (last == 0)
node = Anew(ctAllocator, ConcatNode, FinalTerm(node, &deferredLiteralNode), nextList);
else
{
last->head = FinalTerm(last->head, &deferredLiteralNode);
last->tail = nextList;
}
while (nextList->tail != 0)
nextList = nextList->tail;
last = nextList;
// No outstanding literals
Assert(deferredLiteralNode.length == 0);
if (last->head->LiteralLength() > 0)
{
// If the list ends with a literal, transfer it into deferredLiteralNode
// so we can continue accumulating (won't leave a gab in litbuf)
AccumLiteral(&deferredLiteralNode, last->head);
// Can discard MatchLiteralNode since it lives in compile-time allocator
last->head = &deferredLiteralNode;
}
}
else
{
// Append this node to accumulated list
ConcatNode* cons = Anew(ctAllocator, ConcatNode, next, 0);
if (last == 0)
node = Anew(ctAllocator, ConcatNode, FinalTerm(node, &deferredLiteralNode), cons);
else
{
last->head = FinalTerm(last->head, &deferredLiteralNode);
last->tail = cons;
}
last = cons;
// No outstanding literals
Assert(deferredLiteralNode.length == 0);
}
}
if (last == 0)
node = FinalTerm(node, &deferredLiteralNode);
else
last->head = FinalTerm(last->head, &deferredLiteralNode);
// No outstanding literals
Assert(deferredLiteralNode.length == 0);
return node;
}
//
// Terms
//
template <typename P, const bool IsLiteral>
Node* Parser<P, IsLiteral>::NewLoopNode(CharCount lower, CharCountOrFlag upper, bool isGreedy, Node* body)
{
//
// NOTE: We'd like to represent r? (i.e. r{0,1}) as r|<empty> since the loop representation has high overhead.
// HOWEVER if r contains a group definition and could match empty then we must execute as a loop
// so that group bindings are correctly reset on no progress (e.g.: /(a*)?/.exec("")). Thus we defer
// this optimization until pass 4 of the optimizer, at which point we know whether r could match empty.
//
if (lower == 1 && upper == 1)
return body;
else if (lower == 0 && upper == 0)
// Loop is equivalent to empty. If the loop body contains group definitions they will have already been
// counted towards the overall number of groups. The matcher will initialize their contents to
// undefined, and since the loop body would never execute the inner groups could never be updated from
// undefined.
return Anew(ctAllocator, SimpleNode, Node::Empty);
else
return Anew(ctAllocator, LoopNode, lower, upper, isGreedy, body);
}
template <typename P, const bool IsLiteral>
bool Parser<P, IsLiteral>::AtQuantifier()
{
// Could be terminating 0
switch (ECLookahead())
{
case '*':
case '+':
case '?':
return true;
case '{':
{
CharCount lookahead = 1;
while (ECCanConsume(lookahead + 1) && standardEncodedChars->IsDigit(ECLookahead(lookahead)))
lookahead++;
if (lookahead == 1 || !ECCanConsume(lookahead + 1))
return false;
switch (ECLookahead(lookahead))
{
case ',':
lookahead++;
if (ECCanConsume(lookahead + 1) && ECLookahead(lookahead) == '}')
return true;
else
{
CharCount saved = lookahead;
while (ECCanConsume(lookahead + 1) && standardEncodedChars->IsDigit(ECLookahead(lookahead)))
lookahead++;
if (lookahead == saved)
return false;
return ECCanConsume(lookahead + 1) && ECLookahead(lookahead) == '}';
}
case '}':
return true;
default:
return false;
}
}
default:
return false;
}
}
template <typename P, const bool IsLiteral>
bool Parser<P, IsLiteral>::OptNonGreedy()
{
// Could be terminating 0
if (ECLookahead() != '?')
return true;
ECConsume();
return false;
}
template <typename P, const bool IsLiteral>
CharCount Parser<P, IsLiteral>::RepeatCount()
{
CharCount n = 0;
int digits = 0;
while (true)
{
// Could be terminating 0
EncodedChar ec = ECLookahead();
if (!standardEncodedChars->IsDigit(ec))
{
if (digits == 0)
Fail(JSERR_RegExpSyntax);
return n;
}
if (n > MaxCharCount / 10)
Fail(JSERR_RegExpSyntax);
n *= 10;
if (n > MaxCharCount - standardEncodedChars->DigitValue(ec))
Fail(JSERR_RegExpSyntax);
n += standardEncodedChars->DigitValue(ec);
digits++;
ECConsume();
}
}
template <typename P, const bool IsLiteral>
void Parser<P, IsLiteral>::TermPass0(int depth)
{
PROBE_STACK_NO_DISPOSE(scriptContext, Js::Constants::MinStackRegex);
// Either we have a location at the start, or the end, never both. As in between it should have been cleared if surrogate pair
// Or must be cleared if we didn't perform the check
bool clearLocationIfPresent = this->tempLocationOfSurrogatePair != nullptr;
switch (ECLookahead())
{
case '^':
case '$':
ECConsume();
return;
case '\\':
ECConsume();
if (AtomEscapePass0())
return;
break;
case '(':
// Can't combine into a single codeunit because of group present
this->tempLocationOfSurrogatePair = nullptr;
this->codePointAtTempLocation = 0;
clearLocationIfPresent = false;
ECConsume();
switch (ECLookahead())
{
case '?':
if (!ECCanConsume(2))
Fail(JSERR_RegExpSyntax);
switch (ECLookahead(1))
{
case '=':
case '!':
case ':':
ECConsume(2);
break;
default:
numGroups++;
break;
}
break;
default:
numGroups++;
break;
}
DisjunctionPass0(depth + 1);
ECMust(')', JSERR_RegExpNoParen);
break;
case '.':
ECConsume();
break;
case '[':
ECConsume();
this->tempLocationOfSurrogatePair = nullptr;
this->codePointAtTempLocation = 0;
this->tempLocationOfRange = next;
CharacterClassPass0();
this->tempLocationOfRange = nullptr;
ECMust(']', JSERR_RegExpNoBracket);
break;
case ')':
case '|':
Fail(JSERR_RegExpSyntax);
break;
case ']':
case '}':
NextChar();
break;
case '*':
case '+':
case '?':
case '{':
if (AtQuantifier())
Fail(JSERR_RegExpBadQuant);
else
ECConsume();
break;
case 0:
if (IsEOF())
// Terminating 0
Fail(JSERR_RegExpSyntax);
// else fall-through for embedded 0
default:
{
const EncodedChar* current = next;
Char c = NextChar();
if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
{
TrackIfSurrogatePair(c, current, (uint32)(next - current));
}
// Closing '/' in literals should be caught explicitly
Assert(!IsLiteral || c != '/');
}
break;
}
if (clearLocationIfPresent && this->tempLocationOfSurrogatePair != nullptr)
{
this->tempLocationOfSurrogatePair = nullptr;
this->codePointAtTempLocation = 0;
}
if (AtQuantifier())
{
switch (ECLookahead())
{
case '*':
case '+':
case '?':
ECConsume();
OptNonGreedy();
break;
case '{':
{
ECConsume();
CharCount lower = RepeatCount();
switch (ECLookahead())
{
case ',':
ECConsume();
if (ECLookahead() == '}')
{
ECConsume();
OptNonGreedy();
}
else
{
CharCount upper = RepeatCount();
if (upper < lower)
Fail(JSERR_RegExpSyntax);
Assert(ECLookahead() == '}');
ECConsume();
OptNonGreedy();
}
break;
case '}':
ECConsume();
OptNonGreedy();
break;
default:
Assert(false);
break;
}
break;
}
default:
Assert(false);
break;
}
}
}
template <typename P, const bool IsLiteral>
Node* Parser<P, IsLiteral>::TermPass1(MatchCharNode* deferredCharNode, bool& previousSurrogatePart)
{
PROBE_STACK_NO_DISPOSE(scriptContext, Js::Constants::MinStackRegex);
Node* node = 0;
bool containsSurrogatePair = false;
switch (ECLookahead())
{
case '^':
ECConsume();
return Anew(ctAllocator, SimpleNode, Node::BOL); // No quantifier allowed
case '$':
ECConsume();
return Anew(ctAllocator, SimpleNode, Node::EOL); // No quantifier allowed
case '\\':
ECConsume();
if(SurrogatePairPass1(node, deferredCharNode, previousSurrogatePart))
{
break; // For quantifier
}
else if (AtomEscapePass1(node, deferredCharNode, previousSurrogatePart))
{
return node; // No quantifier allowed
}
break; // else: fall-through for opt quantifier
case '(':
ECConsume();
switch (ECLookahead())
{
case '?':
switch (ECLookahead(1))
{
case '=':
ECConsume(2); // ?=
node = DisjunctionPass1();
Assert(ECLookahead() == ')');
ECConsume(); // )
node = Anew(ctAllocator, AssertionNode, false, node);
break; // As per Annex B, allow this to be quantifiable
case '!':
ECConsume(2); // ?!
node = DisjunctionPass1();
Assert(ECLookahead() == ')');
ECConsume(); // )
node = Anew(ctAllocator, AssertionNode, true, node);
break; // As per Annex B, allow this to be quantifiable
case ':':
ECConsume(2); // ?:
node = DisjunctionPass1();
Assert(ECLookahead() == ')');
ECConsume(); // )
break; // fall-through for opt quantifier
default:
{
// ? not yet consumed
int thisGroupId = nextGroupId++;
node = DisjunctionPass1();
Assert(ECLookahead() == ')');
ECConsume(); // )
node = Anew(ctAllocator, DefineGroupNode, thisGroupId, node);
break; // fall-through for opt quantifier
}
}
break;
default:
{
// next char not yet consumed
int thisGroupId = nextGroupId++;
node = DisjunctionPass1();
Assert(ECLookahead() == ')');
ECConsume(); // )
node = Anew(ctAllocator, DefineGroupNode, thisGroupId, node);
break; // fall-through for opt quantifier
}
}
break;
case '.':
{
ECConsume();
node = GetNodeWithValidCharacterSet('.');
break; // fall-through for opt quantifier
}
case '[':
ECConsume();
if (unicodeFlagPresent)
{
containsSurrogatePair = this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->rangeLocation == next;
}
node = containsSurrogatePair ? CharacterClassPass1<true>() : CharacterClassPass1<false>();
Assert(ECLookahead() == ']');
ECConsume(); // ]
break; // fall-through for opt quantifier
#if DBG
case ')':
case '|':
Assert(false);
break;
#endif
case ']':
// SPEC DEVIATION: This should be syntax error, instead accept as itself
deferredCharNode->cs[0] = NextChar();
node = deferredCharNode;
break; // fall-through for opt quantifier
#if DBG
case '*':
case '+':
case '?':
AssertMsg(false, "Allowed only in the escaped form. These should be caught by TermPass0.");
break;
#endif
case 0:
if (IsEOF())
// Terminating 0
Fail(JSERR_RegExpSyntax);
// else fall-through for embedded 0
default:
if(SurrogatePairPass1(node, deferredCharNode, previousSurrogatePart))
{
break; //For quantifier
}
else
{
deferredCharNode->cs[0] = NextChar();
node = deferredCharNode;
break; // fall-through for opt quantifier
}
}
Assert(node != 0);
if (AtQuantifier())
{
switch (ECLookahead())
{
case '*':
if (node == deferredCharNode)
node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
ECConsume();
return NewLoopNode(0, CharCountFlag, OptNonGreedy(), node);
case '+':
if (node == deferredCharNode)
node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
ECConsume();
return NewLoopNode(1, CharCountFlag, OptNonGreedy(), node);
case '?':
if (node == deferredCharNode)
node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
ECConsume();
return NewLoopNode(0, 1, OptNonGreedy(), node);
case '{':
{
if (node == deferredCharNode)
node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
ECConsume();
CharCount lower = RepeatCount();
switch (ECLookahead())
{
case ',':
ECConsume();
if (ECLookahead() == '}')
{
ECConsume();
return NewLoopNode(lower, CharCountFlag, OptNonGreedy(), node);
}
else
{
CharCount upper = RepeatCount();
Assert(lower <= upper);
Assert(ECLookahead() == '}');
ECConsume(); // }
return NewLoopNode(lower, upper, OptNonGreedy(), node);
}
case '}':
ECConsume();
return NewLoopNode(lower, lower, OptNonGreedy(), node);
default:
Assert(false);
break;
}
break;
}
default:
Assert(false);
break;
}
}
return node;
}
#pragma warning(push)
#pragma warning(disable:4702) // unreachable code
template <typename P, const bool IsLiteral>
bool Parser<P, IsLiteral>::AtomEscapePass0()
{
EncodedChar ec = ECLookahead();
if (ec == 0 && IsEOF())
{
// Terminating 0
Fail(JSERR_RegExpSyntax);
return false;
}
else if (standardEncodedChars->IsDigit(ec))
{
do
{
ECConsume();
}
while (standardEncodedChars->IsDigit(ECLookahead())); // terminating 0 is not a digit
return false;
}
else if (ECLookahead() == 'c')
{
if (standardEncodedChars->IsLetter(ECLookahead(1))) // terminating 0 is not a letter
ECConsume(2);
return false;
}
else
{
const EncodedChar *current = next;
// An escaped '/' is ok
Char c = NextChar();
switch (c)
{
case 'b':
case 'B':
return true;
// case 'c': handled as special case above
case 'x':
if (ECCanConsume(2) &&
standardEncodedChars->IsHex(ECLookahead(0)) &&
standardEncodedChars->IsHex(ECLookahead(1)))
ECConsume(2);
break;
case 'u':
bool surrogateEncountered = false;
int lengthOfSurrogate = TryParseExtendedUnicodeEscape(c, surrogateEncountered, true);
if (lengthOfSurrogate > 0)
{
if (surrogateEncountered)
{
// If we don't have an allocator, we don't create nodes
// Asserts in place as extra checks for when we do have an allocator
Assert(this->ctAllocator == nullptr || this->currentSurrogatePairNode != nullptr);
Assert(this->ctAllocator == nullptr || current == this->currentSurrogatePairNode->location);
ECConsume(lengthOfSurrogate);
}
//Don't fall through
break;
}
else if (ECCanConsume(4) &&
standardEncodedChars->IsHex(ECLookahead(0)) &&
standardEncodedChars->IsHex(ECLookahead(1)) &&
standardEncodedChars->IsHex(ECLookahead(2)) &&
standardEncodedChars->IsHex(ECLookahead(3)))
{
if (this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
{
codepoint_t value = (standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
(standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
(standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
(standardEncodedChars->DigitValue(ECLookahead(3)));
TrackIfSurrogatePair(value, (next - 1), 5);
}
ECConsume(4);
}
break;
}
// embedded 0 is ok
return false;
}
}
#pragma warning(pop)
template <typename P, const bool IsLiteral>
bool Parser<P, IsLiteral>::AtomEscapePass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart)
{
Assert(!IsEOF()); // checked for terminating 0 in pass 0
if (standardEncodedChars->IsDigit(ECLookahead()))
{
// As per Annex B, allow octal escapes as well as group references, disambiguate based on known
// number of groups.
if (ECLookahead() == '0')
{
// fall through for octal
}
else
{
// Could be a group reference, but only if between 1 and 5 digits which resolve to a valid group number
int n = 0;
CharCount digits = 0;
do
{
n = n * 10 + (int)standardEncodedChars->DigitValue(ECLookahead(digits));
digits++;
}
while (digits < 5 && ECCanConsume(digits + 1) && standardEncodedChars->IsDigit(ECLookahead(digits)));
if (n >= numGroups ||
(ECCanConsume(digits + 1) && standardEncodedChars->IsDigit(ECLookahead(digits))))
{
if (standardEncodedChars->IsOctal(ECLookahead()))
{
// fall through for octal
}
else
{
// \8 and \9 are identity escapes
deferredCharNode->cs[0] = Chars<EncodedChar>::CTW(ECLookahead());
ECConsume();
node = deferredCharNode;
return false; // not an assertion
}
}
else
{
ECConsume(digits);
node = Anew(ctAllocator, MatchGroupNode, n);
return false; // not an assertion
}
}
// Must be between 1 and 3 octal digits
Assert(standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not an octal
uint n = 0;
CharCount digits = 0;
do
{
uint m = n * 8 + standardEncodedChars->DigitValue(ECLookahead());
if (m > Chars<uint8>::MaxUChar) // Regex octal codes only support single byte (ASCII) characters.
break;
n = m;
ECConsume();
digits++;
}
while (digits < 3 && standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not an octal
deferredCharNode->cs[0] = UTC((UChar)n);
node = deferredCharNode;
return false; // not an assertion
}
else if (ECLookahead() == 'c')
{
Char c;
if (standardEncodedChars->IsLetter(ECLookahead(1))) // terminating 0 is not a letter
{
c = UTC(Chars<EncodedChar>::CTU(ECLookahead(1)) % 32);
ECConsume(2);
}
else
{
// SPEC DEVIATION: For non-letters or EOF, take the leading '\' to be itself, and
// don't consume the 'c' or letter.
c = '\\';
}
deferredCharNode->cs[0] = c;
node = deferredCharNode;
return false; // not an assertion
}
else
{
Char c = NextChar();
switch (c)
{
case 'b':
node = Anew(ctAllocator, WordBoundaryNode, false);
return true; // Is an assertion
case 'B':
node = Anew(ctAllocator, WordBoundaryNode, true);
return true; // Is an assertion
case 'f':
c = '\f';
break; // fall-through for identity escape
case 'n':
c = '\n';
break; // fall-through for identity escape
case 'r':
c = '\r';
break; // fall-through for identity escape
case 't':
c = '\t';
break; // fall-through for identity escape
case 'v':
c = '\v';
break; // fall-through for identity escape
case 'd':
{
MatchSetNode *setNode = Anew(ctAllocator, MatchSetNode, false, false);
standardChars->SetDigits(ctAllocator, setNode->set);
node = setNode;
return false; // not an assertion
}
case 'D':
{
node = GetNodeWithValidCharacterSet('D');
return false; // not an assertion
}
case 's':
{
MatchSetNode *setNode = Anew(ctAllocator, MatchSetNode, false, false);
standardChars->SetWhitespace(ctAllocator, setNode->set);
node = setNode;
return false; // not an assertion
}
case 'S':
{
node = GetNodeWithValidCharacterSet('S');
return false; // not an assertion
}
case 'w':
{
MatchSetNode *setNode = Anew(ctAllocator, MatchSetNode, false, false);
if (this->unicodeFlagPresent && this->caseInsensitiveFlagPresent)
{
standardChars->SetWordIUChars(ctAllocator, setNode->set);
}
else
{
standardChars->SetWordChars(ctAllocator, setNode->set);
}
node = setNode;
return false; // not an assertion
}
case 'W':
{
node = GetNodeWithValidCharacterSet('W');
return false; // not an assertion
}
// case 'c': handled as special case above
case 'x':
if (ECCanConsume(2) &&
standardEncodedChars->IsHex(ECLookahead(0)) &&
standardEncodedChars->IsHex(ECLookahead(1)))
{
c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 4) |
(standardEncodedChars->DigitValue(ECLookahead(1))));
ECConsume(2);
// fall-through for identity escape
}
// Take to be identity escape if ill-formed as per Annex B
break;
case 'u':
if (unicodeFlagPresent && TryParseExtendedUnicodeEscape(c, previousSurrogatePart) > 0)
break;
else if (ECCanConsume(4) &&
standardEncodedChars->IsHex(ECLookahead(0)) &&
standardEncodedChars->IsHex(ECLookahead(1)) &&
standardEncodedChars->IsHex(ECLookahead(2)) &&
standardEncodedChars->IsHex(ECLookahead(3)))
{
c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
(standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
(standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
(standardEncodedChars->DigitValue(ECLookahead(3))));
ECConsume(4);
// fall-through for identity escape
}
// Take to be identity escape if ill-formed as per Annex B
break;
default:
// As per Annex B, allow anything other than newlines and above. Embedded 0 is ok
break;
}
// Must be an identity escape
deferredCharNode->cs[0] = c;
node = deferredCharNode;
return false; // not an assertion
}
}
template <typename P, const bool IsLiteral>
bool Parser<P, IsLiteral>::SurrogatePairPass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart)
{
if (!this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled() || !unicodeFlagPresent)
{
return false;
}
if (this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->location == this->next)
{
AssertMsg(!this->currentSurrogatePairNode->IsInsideRange(), "Should not be calling this pass if we are currently inside a range.");
char16 lower, upper;
uint tableIndex = 0, actualHigh = 0;
codepoint_t equivClass[CaseInsensitive::EquivClassSize];
if (caseInsensitiveFlagPresent && CaseInsensitive::RangeToEquivClass(tableIndex, this->currentSurrogatePairNode->value, this->currentSurrogatePairNode->value, actualHigh, equivClass))
{
Node *equivNode[CaseInsensitive::EquivClassSize];
int indexForNextNode = 0;
for (int i = 0; i < CaseInsensitive::EquivClassSize; i++)
{
bool alreadyAdded = false;
for (int j = 0; j < i; j++)
{
if (equivClass[i] == equivClass[j])
{
alreadyAdded = true;
break;
}
}
if (!alreadyAdded)
{
if (Js::NumberUtilities::IsInSupplementaryPlane(equivClass[i]))
{
Js::NumberUtilities::CodePointAsSurrogatePair(equivClass[i], &lower, &upper);
equivNode[indexForNextNode] = CreateSurrogatePairAtom(lower, upper);
}
else
{
equivNode[indexForNextNode] = Anew(ctAllocator, MatchCharNode, (Char)equivClass[i]);
}
indexForNextNode ++;
}
}
Assert(indexForNextNode > 0);
if (indexForNextNode == 1)
{
node = equivNode[0];
}
else
{
AltNode *altNode = Anew(ctAllocator, AltNode, equivNode[0], nullptr);
AltNode *altNodeTail = altNode;
for (int i = 1; i < indexForNextNode; i++)
{
altNodeTail->tail = Anew(ctAllocator, AltNode, equivNode[i], nullptr);
altNodeTail = altNodeTail->tail;
}
node = altNode;
}
}
else
{
Js::NumberUtilities::CodePointAsSurrogatePair(this->currentSurrogatePairNode->value, &lower, &upper);
node = CreateSurrogatePairAtom(lower, upper);
}
previousSurrogatePart = false;
Assert(ECCanConsume(this->currentSurrogatePairNode->length));
ECConsumeMultiUnit(this->currentSurrogatePairNode->length);
this->RestoreMultiUnits(this->currentSurrogatePairNode->multiUnits);
this->currentSurrogatePairNode = this->currentSurrogatePairNode->next;
return true;
}
return false;
}
//
// Classes
//
template <typename P, const bool IsLiteral>
bool Parser<P, IsLiteral>::AtSecondSingletonClassAtom()
{
Assert(ECLookahead() == '-');
if (ECLookahead(1) == '\\')
{
switch (ECLookahead(2))
{
case 'd':
case 'D':
case 's':
case 'S':
case 'w':
case 'W':
// These all denote non-singleton sets
return false;
default:
// fall-through for singleton
break;
}
}
return true;
}
template <typename P, const bool IsLiteral>
void Parser<P, IsLiteral>::CharacterClassPass0()
{
// Could be terminating 0
if (ECLookahead() == '^')
ECConsume();
EncodedChar nextChar = ECLookahead();
const EncodedChar* current;
codepoint_t lastCodepoint = INVALID_CODEPOINT;
codepoint_t pendingRangeStart = INVALID_CODEPOINT;
codepoint_t pendingRangeEnd = INVALID_CODEPOINT;
bool previousSurrogatePart = false;
while(nextChar != ']')
{
current = next;
if (nextChar == '\0' && IsEOF())
{
// Report as unclosed '['
Fail(JSERR_RegExpNoBracket);
return;
} // Otherwise embedded '\0' is ok
else if (nextChar == '\\')
{
// Consume, as classescapepass0 expects for it to be consumed
Char outChar = NextChar();
// If previousSurrogatePart = true upon leaving this method, then we are going to pass through here twice
// This is because \u{} escape sequence was encountered that is actually 2 characters, the second time we will pass consuming entire character
if (ClassEscapePass0(outChar, previousSurrogatePart))
{
lastCodepoint = outChar;
}
else
{
// Last codepoint isn't a singleton, so no codepoint tracking for the sake of ranges is needed.
lastCodepoint = INVALID_CODEPOINT;
// Unless we have a possible range end, cancel our range tracking.
if (pendingRangeEnd == INVALID_CODEPOINT)
{
pendingRangeStart = INVALID_CODEPOINT;
}
}
}
else if (nextChar == '-')
{
if (pendingRangeStart != INVALID_CODEPOINT || lastCodepoint == INVALID_CODEPOINT)
{
// '-' is the upper part of the range, with pendingRangeStart codepoint is the lower. Set lastCodePoint to '-' to check at the end of the while statement
// OR
// '-' is just a char, consume it and set as last char
lastCodepoint = NextChar();
}
else
{
pendingRangeStart = this->next == this->positionAfterLastSurrogate
? this->valueOfLastSurrogate
: lastCodepoint;
lastCodepoint = INVALID_CODEPOINT;
NextChar();
}
// If we have a pattern of the form [\ud800-\udfff], we need this to be interpreted as a range.
// In order to achieve this, the two variables that we use to track surrogate pairs, namely
// tempLocationOfSurrogatePair and previousSurrogatePart, need to be in a certain state.
//
// We need to reset tempLocationOfSurrogatePair as it points to the first Unicode escape (\ud800)
// when we're here. We need to clear it in order not to have a surrogate pair when we process the
// second escape (\udfff).
//
// previousSurrogatePart is used when we have a code point in the \u{...} extended format and the
// character is in a supplementary plane. However, there is no need to change its value here. When
// such an escape sequence is encountered, the first call to ClassEscapePass0() sets the variable
// to true, but it rewinds the input back to the beginning of the escape sequence. The next
// iteration of the loop here will again call ClassEscape0() with the same character and the
// variable will this time be set to false. Therefore, the variable will always be false here.
tempLocationOfSurrogatePair = nullptr;
Assert(!previousSurrogatePart);
}
else
{
lastCodepoint = NextChar();
if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
{
TrackIfSurrogatePair(lastCodepoint, current, (uint32)(next - current));
}
}
if (!scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
{
// This is much easier to handle.
if (pendingRangeStart != INVALID_CODEPOINT && lastCodepoint != INVALID_CODEPOINT)
{
Assert(pendingRangeStart < 0x10000);
Assert(pendingRangeEnd == INVALID_CODEPOINT);
if (pendingRangeStart > lastCodepoint)
{
Fail(JSERR_RegExpBadRange);
}
pendingRangeStart = lastCodepoint = INVALID_CODEPOINT;
}
}
// We have a candidate for range end, and we have a range start.
else if (pendingRangeStart != INVALID_CODEPOINT && (lastCodepoint != INVALID_CODEPOINT || pendingRangeEnd != INVALID_CODEPOINT))
{
// The following will be true at the end of each surrogate pair parse.
// Note, the escape sequence \u{} is a two time parse, so this will be true on the second time around.
if (this->next == this->positionAfterLastSurrogate)
{
lastCodepoint = this->valueOfLastSurrogate;
Assert(!previousSurrogatePart);
pendingRangeEnd = lastCodepoint;
lastCodepoint = INVALID_CODEPOINT;
}
// If we the next character is the end of range ']', then we can't have a surrogate pair.
// The current character is the range end, if we don't already have a candidate.
else if (ECLookahead() == ']' && pendingRangeEnd == INVALID_CODEPOINT)
{
pendingRangeEnd = lastCodepoint;
}
//If we get here, and pendingRangeEnd is set. Then one of the above has caused it to be set, or the previous iteration of the loop.
if (pendingRangeEnd != INVALID_CODEPOINT)
{
char16 leftSingleChar, rightSingleChar, ignore;
if (pendingRangeStart >= 0x10000)
{
Js::NumberUtilities::CodePointAsSurrogatePair(pendingRangeStart, &ignore, &leftSingleChar);
}
else
{
leftSingleChar = (char16)pendingRangeStart;
}
if (pendingRangeEnd >= 0x10000)
{
Js::NumberUtilities::CodePointAsSurrogatePair(pendingRangeEnd, &rightSingleChar, &ignore);
}
else
{
rightSingleChar = (char16)pendingRangeEnd;
}
// Here it is a bit tricky, we don't know if we have a unicode option specified.
// If it is, then \ud800\udc00 - \ud800\udc01 is valid, otherwise invalid.
if (pendingRangeStart < 0x10000 && pendingRangeEnd < 0x10000 && pendingRangeStart > pendingRangeEnd)
{
Fail(JSERR_RegExpBadRange);
}
else
{
if(leftSingleChar > rightSingleChar)
{
DeferredFailIfNotUnicode(JSERR_RegExpBadRange);
}
if (pendingRangeStart > pendingRangeEnd)
{
DeferredFailIfUnicode(JSERR_RegExpBadRange);
}
}
pendingRangeStart = pendingRangeEnd = INVALID_CODEPOINT;
}
// The current char < 0x10000 is a candidate for the range end, but we need to iterate one more time.
else
{
pendingRangeEnd = lastCodepoint;
}
}
nextChar = ECLookahead();
}
// We should never have a pendingRangeEnd set when we exit the loop
Assert(pendingRangeEnd == INVALID_CODEPOINT);
}
template <typename P, const bool IsLiteral>
template <bool containsSurrogates>
Node* Parser<P, IsLiteral>::CharacterClassPass1()
{
Assert(containsSurrogates ? unicodeFlagPresent : true);
CharSet<codepoint_t> codePointSet;
MatchSetNode deferredSetNode(false, false);
MatchCharNode deferredCharNode(0);
bool isNegation = false;
if (ECLookahead() == '^')
{
isNegation = true;
ECConsume();
}
// We aren't expecting any terminating null characters, only embedded ones that should treated as valid characters.
// CharacterClassPass0 should have taken care of terminating null.
codepoint_t pendingCodePoint = INVALID_CODEPOINT;
codepoint_t pendingRangeStart = INVALID_CODEPOINT;
EncodedChar nextChar = ECLookahead();
bool previousWasASurrogate = false;
while(nextChar != ']')
{
codepoint_t codePointToSet = INVALID_CODEPOINT;
// Consume ahead of time if we have two backslashes, both cases below (previously Tracked surrogate pair, and ClassEscapePass1) assume it is.
if (nextChar == '\\')
{
ECConsume();
}
// These if-blocks are the logical ClassAtomPass1, they weren't grouped into a method to simplify dealing with multiple out parameters.
if (containsSurrogates && this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->location == this->next)
{
codePointToSet = pendingCodePoint;
pendingCodePoint = this->currentSurrogatePairNode->value;
Assert(ECCanConsume(this->currentSurrogatePairNode->length));
ECConsumeMultiUnit(this->currentSurrogatePairNode->length);
this->RestoreMultiUnits(this->currentSurrogatePairNode->multiUnits);
this->currentSurrogatePairNode = this->currentSurrogatePairNode->next;
}
else if (nextChar == '\\')
{
Node* returnedNode = ClassEscapePass1(&deferredCharNode, &deferredSetNode, previousWasASurrogate);
if (returnedNode->tag == Node::MatchSet)
{
codePointToSet = pendingCodePoint;
pendingCodePoint = INVALID_CODEPOINT;
if (pendingRangeStart != INVALID_CODEPOINT)
{
codePointSet.Set(ctAllocator, '-');
}
pendingRangeStart = INVALID_CODEPOINT;
codePointSet.UnionInPlace(ctAllocator, deferredSetNode.set);
}
else
{
// Just a character
codePointToSet = pendingCodePoint;
pendingCodePoint = deferredCharNode.cs[0];
}
}
else if (nextChar == '-')
{
if (pendingRangeStart != INVALID_CODEPOINT || pendingCodePoint == INVALID_CODEPOINT || ECLookahead(1) == ']')
{
// - is just a char, or end of a range.
codePointToSet = pendingCodePoint;
pendingCodePoint = '-';
ECConsume();
}
else
{
pendingRangeStart = pendingCodePoint;
ECConsume();
}
}
else
{
// Just a character, consume it
codePointToSet = pendingCodePoint;
pendingCodePoint = NextChar();
}
if (codePointToSet != INVALID_CODEPOINT)
{
if (pendingRangeStart != INVALID_CODEPOINT)
{
if (pendingRangeStart > pendingCodePoint)
{
//We have no unicodeFlag, but current range contains surrogates, thus we may end up having to throw a "Syntax" error here
//This breaks the notion of Pass0 check for valid syntax, because we don't know if we have a unicode option
Assert(!unicodeFlagPresent);
Fail(JSERR_RegExpBadRange);
}
codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
}
else
{
codePointSet.Set(ctAllocator, codePointToSet);
}
}
nextChar = ECLookahead();
}
if (pendingCodePoint != INVALID_CODEPOINT)
{
codePointSet.Set(ctAllocator, pendingCodePoint);
}
// At this point, we have a complete set of codepoints representing the range.
// Before performing translation of any kind, we need to do some case filling.
// At the point of this comment, there are no case mappings going cross-plane between simple
// characters (< 0x10000) and supplementary characters (>= 0x10000)
// However, it might still be the case, and this has to be handled.
// On the other hand, we don't want to prevent optimizations that expect non-casefolded sets from happening.
// At least for simple characters.
// The simple case, is when the unicode flag isn't specified, we can go ahead and return the simple set.
// Negations and case mappings will be handled later.
if (!unicodeFlagPresent)
{
Assert(codePointSet.SimpleCharCount() == codePointSet.Count());
MatchSetNode *simpleToReturn = Anew(ctAllocator, MatchSetNode, isNegation);
codePointSet.CloneSimpleCharsTo(ctAllocator, simpleToReturn->set);
return simpleToReturn;
}
// Everything past here must be under the flag
Assert(scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
if (codePointSet.IsEmpty())
{
return Anew(ctAllocator, MatchSetNode, false, false);
}
Node* prefixNode = nullptr;
Node* suffixNode = nullptr;
CharSet<codepoint_t> *toUseForTranslation = &codePointSet;
// If a singleton, return a simple character
bool isSingleton = !this->caseInsensitiveFlagPresent && !isNegation && codePointSet.IsSingleton();
if (isSingleton)
{
codepoint_t singleton = codePointSet.Singleton();
Node* toReturn = nullptr;
if (singleton < 0x10000)
{
toReturn = Anew(ctAllocator, MatchCharNode, (char16)singleton);
}
else
{
Assert(unicodeFlagPresent);
char16 lowerSurrogate, upperSurrogate;
Js::NumberUtilities::CodePointAsSurrogatePair(singleton, &lowerSurrogate, &upperSurrogate);
toReturn = CreateSurrogatePairAtom(lowerSurrogate, upperSurrogate);
}
codePointSet.Clear(ctAllocator);
return toReturn;
}
// If negation, we want to complement the simple chars.
// When a set is negated, optimizations skip checking if applicable, so we can go ahead and negate it here.
CharSet<codepoint_t> negatedSet;
if (!this->caseInsensitiveFlagPresent)
{
if (isNegation)
{
// Complement all characters, and use it as the set toTranslate
codePointSet.ToComplement(ctAllocator, negatedSet);
}
toUseForTranslation = isNegation ? &negatedSet : &codePointSet;
if (isNegation)
{
// Clear this, as we will no longer need this.
codePointSet.FreeBody(ctAllocator);
}
}
else
{
CharSet<codepoint_t> caseEquivalent;
codePointSet.ToEquivClass(ctAllocator, caseEquivalent);
// Equiv set can't have a reduced count of chars
Assert(caseEquivalent.Count() >= codePointSet.Count());
// Here we have a regex that has both case insensitive and unicode options.
// The range might also be negated. If it is negated, we can go ahead and negate
// the entire set as well as fill in cases, as optimizations wouldn't kick in anyways.
if (isNegation)
{
codePointSet.Clear(ctAllocator);
caseEquivalent.ToComplement(ctAllocator, codePointSet);
caseEquivalent.FreeBody(ctAllocator);
}
else
{
codePointSet.CloneFrom(ctAllocator, caseEquivalent);
}
Assert(toUseForTranslation == &codePointSet);
}
uint totalCodePointsCount = toUseForTranslation->Count();
uint simpleCharsCount = toUseForTranslation->SimpleCharCount();
if (totalCodePointsCount == simpleCharsCount)
{
MatchSetNode *simpleToReturn = Anew(ctAllocator, MatchSetNode, isNegation);
toUseForTranslation->CloneSimpleCharsTo(ctAllocator, simpleToReturn->set);
return simpleToReturn;
}
if (simpleCharsCount > 0)
{
if (!toUseForTranslation->ContainSurrogateCodeUnits())
{
MatchSetNode *node = Anew(ctAllocator, MatchSetNode, false, false);
toUseForTranslation->CloneSimpleCharsTo(ctAllocator, node->set);
prefixNode = node;
}
else
{
MatchSetNode *node = Anew(ctAllocator, MatchSetNode, false, false);
toUseForTranslation->CloneNonSurrogateCodeUnitsTo(ctAllocator, node->set);
prefixNode = node;
node = Anew(ctAllocator, MatchSetNode, false, false);
toUseForTranslation->CloneSurrogateCodeUnitsTo(ctAllocator, node->set);
suffixNode = node;
}
}
Assert(unicodeFlagPresent);
AltNode *headToReturn = prefixNode == nullptr ? nullptr : Anew(ctAllocator, AltNode, prefixNode, nullptr);
AltNode *currentTail = headToReturn;
codepoint_t charRangeSearchIndex = 0x10000, lowerCharOfRange = 0, upperCharOfRange = 0;
while (toUseForTranslation->GetNextRange(charRangeSearchIndex, &lowerCharOfRange, &upperCharOfRange))
{
if (lowerCharOfRange == upperCharOfRange)
{
currentTail = this->AppendSurrogatePairToDisjunction(lowerCharOfRange, currentTail);
}
else
{
currentTail = this->AppendSurrogateRangeToDisjunction(lowerCharOfRange, upperCharOfRange, currentTail);
}
if (headToReturn == nullptr)
{
headToReturn = currentTail;
}
AnalysisAssert(currentTail != nullptr);
while (currentTail->tail != nullptr)
{
currentTail = currentTail->tail;
}
charRangeSearchIndex = upperCharOfRange + 1;
}
if (suffixNode != nullptr)
{
currentTail->tail = Anew(ctAllocator, AltNode, suffixNode, nullptr);
}
toUseForTranslation->Clear(ctAllocator);
if (headToReturn != nullptr && headToReturn->tail == nullptr)
{
return headToReturn->head;
}
return headToReturn;
}
#pragma warning(push)
#pragma warning(disable:4702) // unreachable code
template <typename P, const bool IsLiteral>
bool Parser<P, IsLiteral>::ClassEscapePass0(Char& singleton, bool& previousSurrogatePart)
{
// Could be terminating 0
EncodedChar ec = ECLookahead();
if (ec == 0 && IsEOF())
{
Fail(JSERR_RegExpSyntax);
return false;
}
else if (standardEncodedChars->IsOctal(ec))
{
uint n = 0;
CharCount digits = 0;
do
{
uint m = n * 8 + standardEncodedChars->DigitValue(ECLookahead());
if (m > Chars<uint8>::MaxUChar) //Regex octal codes only support single byte (ASCII) characters.
break;
n = m;
ECConsume();
digits++;
}
while (digits < 3 && standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not octal
singleton = UTC((UChar)n);
// Clear possible pair
this->tempLocationOfSurrogatePair = nullptr;
return true;
}
else
{
const EncodedChar* location = this->tempLocationOfSurrogatePair;
// Clear it for now, otherwise to many branches to clear it on.
this->tempLocationOfSurrogatePair = nullptr;
// An escaped '/' is ok
Char c = NextChar();
switch (c)
{
case 'b':
singleton = '\b';
return true;
case 'f':
singleton = '\f';
return true;
case 'n':
singleton = '\n';
return true;
case 'r':
singleton = '\r';
return true;
case 't':
singleton = '\t';
return true;
case 'v':
singleton = '\v';
return true;
case 'd':
case 'D':
case 's':
case 'S':
case 'w':
case 'W':
return false;
case 'c':
if (standardEncodedChars->IsLetter(ECLookahead())) // terminating 0 is not a letter
{
singleton = UTC(Chars<EncodedChar>::CTU(ECLookahead()) % 32);
ECConsume();
}
else
{
if (!IsEOF())
{
EncodedChar ecLookahead = ECLookahead();
switch (ecLookahead)
{
case '-':
case ']':
singleton = c;
break;
default:
singleton = UTC(Chars<EncodedChar>::CTU(ecLookahead) % 32);
ECConsume();
break;
}
}
else
singleton = c;
}
return true;
case 'x':
if (ECCanConsume(2) &&
standardEncodedChars->IsHex(ECLookahead(0)) &&
standardEncodedChars->IsHex(ECLookahead(1)))
{
singleton = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 4) |
(standardEncodedChars->DigitValue(ECLookahead(1))));
ECConsume(2);
}
else
singleton = c;
return true;
case 'u':
this->tempLocationOfSurrogatePair = location;
if (this->TryParseExtendedUnicodeEscape(singleton, previousSurrogatePart, true) > 0)
return true;
else if (ECCanConsume(4) &&
standardEncodedChars->IsHex(ECLookahead(0)) &&
standardEncodedChars->IsHex(ECLookahead(1)) &&
standardEncodedChars->IsHex(ECLookahead(2)) &&
standardEncodedChars->IsHex(ECLookahead(3)))
{
singleton = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
(standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
(standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
(standardEncodedChars->DigitValue(ECLookahead(3))));
if (this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
{
// Current location
TrackIfSurrogatePair(singleton, (next - 1), 5);
}
// The above if statement, if true, will clear tempLocationOfSurrogatePair if needs to.
ECConsume(4);
}
else
singleton = c;
return true;
default:
// embedded 0 is ok
singleton = c;
return true;
}
}
}
#pragma warning(pop)
template <typename P, const bool IsLiteral>
Node* Parser<P, IsLiteral>::ClassEscapePass1(MatchCharNode* deferredCharNode, MatchSetNode* deferredSetNode, bool& previousSurrogatePart)
{
// Checked for terminating 0 is pass 0
Assert(!IsEOF());
if (standardEncodedChars->IsOctal(ECLookahead()))
{
// As per Annex B, allow octal escapes instead of just \0 (and \8 and \9 are identity escapes).
// Must be between 1 and 3 octal digits.
uint n = 0;
CharCount digits = 0;
do
{
uint m = n * 8 + standardEncodedChars->DigitValue(ECLookahead());
if (m > Chars<uint8>::MaxUChar) //Regex octal codes only support single byte (ASCII) characters.
break;
n = m;
ECConsume();
digits++;
}
while (digits < 3 && standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not octal
deferredCharNode->cs[0] = UTC((UChar)n);
return deferredCharNode;
}
else
{
Char c = NextChar();
switch (c)
{
case 'b':
c = '\b';
break; // fall-through for identity escape
case 'f':
c = '\f';
break; // fall-through for identity escape
case 'n':
c = '\n';
break; // fall-through for identity escape
case 'r':
c = '\r';
break; // fall-through for identity escape
case 't':
c = '\t';
break; // fall-through for identity escape
case 'v':
c = '\v';
break; // fall-through for identity escape
case 'd':
standardChars->SetDigits(ctAllocator, deferredSetNode->set);
return deferredSetNode;
case 'D':
standardChars->SetNonDigits(ctAllocator, deferredSetNode->set);
return deferredSetNode;
case 's':
standardChars->SetWhitespace(ctAllocator, deferredSetNode->set);
return deferredSetNode;
case 'S':
standardChars->SetNonWhitespace(ctAllocator, deferredSetNode->set);
return deferredSetNode;
case 'w':
standardChars->SetWordChars(ctAllocator, deferredSetNode->set);
return deferredSetNode;
case 'W':
standardChars->SetNonWordChars(ctAllocator, deferredSetNode->set);
return deferredSetNode;
case 'c':
if (standardEncodedChars->IsLetter(ECLookahead())) // terminating 0 is not a letter
{
c = UTC(Chars<EncodedChar>::CTU(ECLookahead()) % 32);
ECConsume();
// fall-through for identity escape
}
else
{
// SPEC DEVIATION: For non-letters, still take lower 5 bits, e.g. [\c1] == [\x11].
// However, '-', ']', and EOF make the \c just a 'c'.
if (!IsEOF())
{
EncodedChar ec = ECLookahead();
switch (ec)
{
case '-':
case ']':
// fall-through for identity escape with 'c'
break;
default:
c = UTC(Chars<EncodedChar>::CTU(ec) % 32);
ECConsume();
// fall-through for identity escape
break;
}
}
// else: fall-through for identity escape with 'c'
}
break;
case 'x':
if (ECCanConsume(2) &&
standardEncodedChars->IsHex(ECLookahead(0)) &&
standardEncodedChars->IsHex(ECLookahead(1)))
{
c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 4) |
(standardEncodedChars->DigitValue(ECLookahead(1))));
ECConsume(2);
// fall-through for identity escape
}
// Take to be identity escape if ill-formed as per Annex B
break;
case 'u':
if (unicodeFlagPresent && TryParseExtendedUnicodeEscape(c, previousSurrogatePart) > 0)
break;
else if (ECCanConsume(4) &&
standardEncodedChars->IsHex(ECLookahead(0)) &&
standardEncodedChars->IsHex(ECLookahead(1)) &&
standardEncodedChars->IsHex(ECLookahead(2)) &&
standardEncodedChars->IsHex(ECLookahead(3)))
{
c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
(standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
(standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
(standardEncodedChars->DigitValue(ECLookahead(3))));
ECConsume(4);
// fall-through for identity escape
}
// Take to be identity escape if ill-formed as per Annex B.
break;
default:
// As per Annex B, allow anything other than newlines and above. Embedded 0 is ok.
break;
}
// Must be an identity escape
deferredCharNode->cs[0] = c;
return deferredCharNode;
}
}
//
// Options
//
template <typename P, const bool IsLiteral>
void Parser<P, IsLiteral>::Options(RegexFlags& flags)
{
while (true)
{
// Could be terminating 0
EncodedChar ec = ECLookahead();
CharCount consume;
Char c;
if (ec == 0)
// Embedded 0 not valid
return;
else if (IsLiteral &&
ec == '\\' &&
ECCanConsume(6) &&
ECLookahead(1) == 'u' &&
standardEncodedChars->IsHex(ECLookahead(2)) &&
standardEncodedChars->IsHex(ECLookahead(3)) &&
standardEncodedChars->IsHex(ECLookahead(4)) &&
standardEncodedChars->IsHex(ECLookahead(5)))
{
if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
{
Fail(JSERR_RegExpSyntax);
return;
}
else
{
uint32 n = (standardEncodedChars->DigitValue(ECLookahead(2)) << 12) |
(standardEncodedChars->DigitValue(ECLookahead(3)) << 8) |
(standardEncodedChars->DigitValue(ECLookahead(4)) << 4) |
(standardEncodedChars->DigitValue(ECLookahead(5)));
c = UTC(n);
consume = 6;
}
}
else
{
c = Chars<EncodedChar>::CTW(ec);
consume = 1;
}
switch (c) {
case 'i':
if ((flags & IgnoreCaseRegexFlag) != 0)
{
Fail(JSERR_RegExpSyntax);
}
flags = (RegexFlags)(flags | IgnoreCaseRegexFlag);
break;
case 'g':
if ((flags & GlobalRegexFlag) != 0)
{
Fail(JSERR_RegExpSyntax);
}
flags = (RegexFlags)(flags | GlobalRegexFlag);
break;
case 'm':
if ((flags & MultilineRegexFlag) != 0)
{
Fail(JSERR_RegExpSyntax);
}
flags = (RegexFlags)(flags | MultilineRegexFlag);
break;
case 'u':
// If we don't have unicode enabled, fall through to default
if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
{
if ((flags & UnicodeRegexFlag) != 0)
{
Fail(JSERR_RegExpSyntax);
}
flags = (RegexFlags)(flags | UnicodeRegexFlag);
// For telemetry
CHAKRATEL_LANGSTATS_INC_LANGFEATURECOUNT(UnicodeRegexFlag, scriptContext);
break;
}
case 'y':
if (scriptContext->GetConfig()->IsES6RegExStickyEnabled())
{
if ((flags & StickyRegexFlag) != 0)
{
Fail(JSERR_RegExpSyntax);
}
flags = (RegexFlags)(flags | StickyRegexFlag);
// For telemetry
CHAKRATEL_LANGSTATS_INC_LANGFEATURECOUNT(StickyRegexFlag, scriptContext);
break;
}
default:
if (standardChars->IsWord(c))
{
// Outer context could never parse this character. Signal the syntax error as
// being part of the regex.
Fail(JSERR_RegExpSyntax);
}
return;
}
ECConsume(consume);
}
}
//
// Entry points
//
template <typename P, const bool IsLiteral>
Node* Parser<P, IsLiteral>::ParseDynamic
( const EncodedChar* body
, const EncodedChar* bodyLim
, const EncodedChar* opts
, const EncodedChar* optsLim
, RegexFlags& flags )
{
Assert(!IsLiteral);
Assert(body != 0);
Assert(bodyLim >= body && *bodyLim == 0);
Assert(opts == 0 || (optsLim >= opts && *optsLim == 0));
// Body, pass 0
SetPosition(body, bodyLim, true);
PatternPass0();
if (!IsEOF())
Fail(JSERR_RegExpSyntax);
// Options
if (opts != 0)
{
SetPosition(opts, optsLim, false);
Options(flags);
if (!IsEOF())
Fail(JSERR_RegExpSyntax);
this->unicodeFlagPresent = (flags & UnifiedRegex::UnicodeRegexFlag) == UnifiedRegex::UnicodeRegexFlag;
this->caseInsensitiveFlagPresent = (flags & UnifiedRegex::IgnoreCaseRegexFlag) == UnifiedRegex::IgnoreCaseRegexFlag;
Assert(!this->unicodeFlagPresent || scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
}
else
{
this->unicodeFlagPresent = false;
this->caseInsensitiveFlagPresent = false;
}
// If this HR has been set, that means we have an earlier failure than the one caught above.
if (this->deferredIfNotUnicodeError != nullptr && !this->unicodeFlagPresent)
{
throw ParseError(*deferredIfNotUnicodeError);
}
else if(this->deferredIfUnicodeError != nullptr && this->unicodeFlagPresent)
{
throw ParseError(*deferredIfUnicodeError);
}
this->currentSurrogatePairNode = this->surrogatePairList;
// Body, pass 1
SetPosition(body, bodyLim, true);
Node* root = PatternPass1();
Assert(IsEOF());
return root;
}
template <typename P, const bool IsLiteral>
Node* Parser<P, IsLiteral>::ParseLiteral
( const EncodedChar* input
, const EncodedChar* inputLim
, CharCount& outBodyEncodedChars
, CharCount& outTotalEncodedChars
, CharCount& outBodyChars
, CharCount& outTotalChars
, RegexFlags& flags )
{
Assert(IsLiteral);
Assert(input != 0);
Assert(inputLim >= input); // *inputLim need not be 0 because of deferred parsing
// To handle surrogate pairs properly under unicode option, we will collect information on location of the pairs
// during pass 0, regardless if the option is present. (We aren't able to get it at that time)
// During pass 1, we will use that information to correctly create appropriate nodes.
// Body, pass 0
SetPosition(input, inputLim, true);
PatternPass0();
outBodyEncodedChars = Chars<EncodedChar>::OSB(next, input);
outBodyChars = Pos();
// Options are needed for the next pass
ECMust('/', ERRnoSlash);
Options(flags);
this->unicodeFlagPresent = (flags & UnifiedRegex::UnicodeRegexFlag) == UnifiedRegex::UnicodeRegexFlag;
this->caseInsensitiveFlagPresent = (flags & UnifiedRegex::IgnoreCaseRegexFlag) == UnifiedRegex::IgnoreCaseRegexFlag;
Assert(!this->unicodeFlagPresent || scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
// If this HR has been set, that means we have an earlier failure than the one caught above.
if (this->deferredIfNotUnicodeError != nullptr && !this->unicodeFlagPresent)
{
throw ParseError(*deferredIfNotUnicodeError);
}
else if(this->deferredIfUnicodeError != nullptr && this->unicodeFlagPresent)
{
throw ParseError(*deferredIfUnicodeError);
}
// Used below to proceed to the end of the regex
const EncodedChar *pastOptions = next;
this->currentSurrogatePairNode = this->surrogatePairList;
// Body, pass 1
SetPosition(input, inputLim, true);
Node* root = PatternPass1();
Assert(outBodyEncodedChars == Chars<EncodedChar>::OSB(next, input));
Assert(outBodyChars == Pos());
next = pastOptions;
outTotalEncodedChars = Chars<EncodedChar>::OSB(next, input);
outTotalChars = Pos();
return root;
}
template <typename P, const bool IsLiteral>
void Parser<P, IsLiteral>::ParseLiteralNoAST
( const EncodedChar* input
, const EncodedChar* inputLim
, CharCount& outBodyEncodedChars
, CharCount& outTotalEncodedChars
, CharCount& outBodyChars
, CharCount& outTotalChars )
{
Assert(IsLiteral);
Assert(input != 0);
Assert(inputLim >= input); // *inputLim need not be 0 because of deferred parsing
// Body, pass 0
SetPosition(input, inputLim, true);
PatternPass0();
outBodyEncodedChars = Chars<EncodedChar>::OSB(next, input);
outBodyChars = Pos();
// Options
ECMust('/', ERRnoSlash);
RegexFlags dummyFlags = NoRegexFlags;
Options(dummyFlags);
this->unicodeFlagPresent = (dummyFlags & UnifiedRegex::UnicodeRegexFlag) == UnifiedRegex::UnicodeRegexFlag;
this->caseInsensitiveFlagPresent = (dummyFlags & UnifiedRegex::IgnoreCaseRegexFlag) == UnifiedRegex::IgnoreCaseRegexFlag;
outTotalEncodedChars = Chars<EncodedChar>::OSB(next, input);
outTotalChars = Pos();
// If this HR has been set, that means we have an earlier failure than the one caught above.
if (this->deferredIfNotUnicodeError != nullptr && !this->unicodeFlagPresent)
{
throw ParseError(*deferredIfNotUnicodeError);
}
else if(this->deferredIfUnicodeError != nullptr && this->unicodeFlagPresent)
{
throw ParseError(*deferredIfUnicodeError);
}
}
template <typename P, const bool IsLiteral>
template <const bool buildAST>
RegexPattern * Parser<P, IsLiteral>::CompileProgram
( Node* root,
const EncodedChar*& currentCharacter,
const CharCount totalLen,
const CharCount bodyChars,
const CharCount bodyEncodedChars,
const CharCount totalChars,
const RegexFlags flags )
{
Assert(IsLiteral);
Program* program = nullptr;
if (buildAST)
{
const auto recycler = this->scriptContext->GetRecycler();
program = Program::New(recycler, flags);
this->CaptureSourceAndGroups(recycler, program, currentCharacter, bodyChars, bodyEncodedChars);
}
currentCharacter += totalLen;
Assert(GetMultiUnits() == totalLen - totalChars);
if (!buildAST)
{
return nullptr;
}
RegexPattern* pattern = RegexPattern::New(this->scriptContext, program, true);
#if ENABLE_REGEX_CONFIG_OPTIONS
RegexStats* stats = 0;
if (REGEX_CONFIG_FLAG(RegexProfile))
{
stats = this->scriptContext->GetRegexStatsDatabase()->GetRegexStats(pattern);
this->scriptContext->GetRegexStatsDatabase()->EndProfile(stats, RegexStats::Parse);
}
if (REGEX_CONFIG_FLAG(RegexTracing))
{
DebugWriter* tw = this->scriptContext->GetRegexDebugWriter();
tw->Print(_u("// REGEX COMPILE "));
pattern->Print(tw);
tw->EOL();
}
if (REGEX_CONFIG_FLAG(RegexProfile))
this->scriptContext->GetRegexStatsDatabase()->BeginProfile();
#endif
ArenaAllocator* rtAllocator = this->scriptContext->RegexAllocator();
Compiler::Compile
( this->scriptContext
, ctAllocator
, rtAllocator
, standardChars
, program
, root
, this->GetLitbuf()
, pattern
#if ENABLE_REGEX_CONFIG_OPTIONS
, w
, stats
#endif
);
#if ENABLE_REGEX_CONFIG_OPTIONS
if (REGEX_CONFIG_FLAG(RegexProfile))
this->scriptContext->GetRegexStatsDatabase()->EndProfile(stats, RegexStats::Compile);
#endif
#ifdef PROFILE_EXEC
this->scriptContext->ProfileEnd(Js::RegexCompilePhase);
#endif
return pattern;
}
template <typename P, const bool IsLiteral>
void Parser<P, IsLiteral>::CaptureEmptySourceAndNoGroups(Program* program)
{
Assert(program->source == 0);
program->source = const_cast<Char*>(_u(""));
program->sourceLen = 0;
program->numGroups = 1;
// Remaining to set during compilation: litbuf, litbufLen, numLoops, insts, instsLen, entryPointLabel
}
template <typename P, const bool IsLiteral>
void Parser<P, IsLiteral>::CaptureSourceAndGroups(Recycler* recycler, Program* program, const EncodedChar* body, CharCount bodyChars, CharCount bodyEncodedChars)
{
Assert(program->source == 0);
Assert(body != 0);
// Program will own source string
program->source = RecyclerNewArrayLeaf(recycler, Char, bodyChars + 1);
// Don't need to zero out since we're writing to the buffer right here
this->ConvertToUnicode(program->source, bodyChars, body, body + bodyEncodedChars);
program->source[bodyChars] = 0;
program->sourceLen = bodyChars;
program->numGroups = nextGroupId;
// Remaining to set during compilation: litbuf, litbufLen, numLoops, insts, instsLen, entryPointLabel
}
template <typename P, const bool IsLiteral>
Node* Parser<P, IsLiteral>::GetNodeWithValidCharacterSet(EncodedChar cc)
{
Node* nodeToReturn = nullptr;
if (this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled() && this->unicodeFlagPresent)
{
MatchSetNode *lowerRangeNode = Anew(ctAllocator, MatchSetNode, false, false);
lowerRangeNode->set.SetRange(ctAllocator, (Char)0xD800, (Char)0xDBFF);
MatchSetNode *upperRangeNode = Anew(ctAllocator, MatchSetNode, false, false);
upperRangeNode->set.SetRange(ctAllocator, (Char)0xDC00, (Char)0xDFFF);
ConcatNode* surrogateRangePairNode = Anew(ctAllocator, ConcatNode, lowerRangeNode, Anew(ctAllocator, ConcatNode, upperRangeNode, nullptr));
// The MatchSet node will be split into [0-D7FFDC00-FFFF] (minus special characters like newline, whitespace, etc.) as a prefix, and a suffix of [D800-DBFF]
// i.e. The MatchSet node can be with [0-D7FFDC00-FFFF] (minus special characters like newline, whitespace, etc.) OR [D800-DBFF]
MatchSetNode* partialPrefixSetNode = Anew(ctAllocator, MatchSetNode, false, false);
switch (cc)
{
case '.':
standardChars->SetNonNewline(ctAllocator, partialPrefixSetNode->set);
break;
case 'S':
standardChars->SetNonWhitespace(ctAllocator, partialPrefixSetNode->set);
break;
case 'D':
standardChars->SetNonDigits(ctAllocator, partialPrefixSetNode->set);
break;
case 'W':
if (this->caseInsensitiveFlagPresent)
{
standardChars->SetNonWordIUChars(ctAllocator, partialPrefixSetNode->set);
}
else
{
standardChars->SetNonWordChars(ctAllocator, partialPrefixSetNode->set);
}
break;
default:
AssertMsg(false, "");
}
partialPrefixSetNode->set.SubtractRange(ctAllocator, (Char)0xD800u, (Char)0xDBFFu);
MatchSetNode* partialSuffixSetNode = Anew(ctAllocator, MatchSetNode, false, false);
partialSuffixSetNode->set.SetRange(ctAllocator, (Char)0xD800u, (Char)0xDBFFu);
AltNode* altNode = Anew(ctAllocator, AltNode, partialPrefixSetNode, Anew(ctAllocator, AltNode, surrogateRangePairNode, Anew(ctAllocator, AltNode, partialSuffixSetNode, nullptr)));
nodeToReturn = altNode;
}
else
{
MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false, false);
switch (cc)
{
case '.':
standardChars->SetNonNewline(ctAllocator, setNode->set);
break;
case 'S':
standardChars->SetNonWhitespace(ctAllocator, setNode->set);
break;
case 'D':
standardChars->SetNonDigits(ctAllocator, setNode->set);
break;
case 'W':
standardChars->SetNonWordChars(ctAllocator, setNode->set);
break;
default:
AssertMsg(false, "");
}
nodeToReturn = setNode;
}
return nodeToReturn;
}
template <typename P, const bool IsLiteral>
void Parser<P, IsLiteral>::FreeBody()
{
if (litbuf != 0)
{
ctAllocator->Free(litbuf, litbufLen);
litbuf = nullptr;
litbufLen = 0;
litbufNext = 0;
}
}
// Instantiate all templates
#define INSTANTIATE_REGEX_PARSER_COMPILE(EncodingPolicy, IsLiteral, BuildAST) \
template RegexPattern* Parser<EncodingPolicy, IsLiteral>::CompileProgram<BuildAST>(Node* root, const EncodedChar*& currentCharacter, const CharCount totalLen, const CharCount bodyChars, const CharCount bodyEncodedChars, const CharCount totalChars, const RegexFlags flags );
#define INSTANTIATE_REGEX_PARSER(EncodingPolicy) \
INSTANTIATE_REGEX_PARSER_COMPILE(EncodingPolicy, false, false) \
INSTANTIATE_REGEX_PARSER_COMPILE(EncodingPolicy, false, true) \
INSTANTIATE_REGEX_PARSER_COMPILE(EncodingPolicy, true, false) \
INSTANTIATE_REGEX_PARSER_COMPILE(EncodingPolicy, true, true) \
template class Parser<EncodingPolicy, false>; \
template class Parser<EncodingPolicy, true>;
// Instantiate the Parser
INSTANTIATE_REGEX_PARSER(NullTerminatedUnicodeEncodingPolicy);
INSTANTIATE_REGEX_PARSER(NotNullTerminatedUTF8EncodingPolicy);
}