|  | /* | 
|  | * Copyright (C) 2009-2020 Apple Inc. All rights reserved. | 
|  | * Copyright (C) 2020 Alexey Shvayka <shvaikalesh@gmail.com>. | 
|  | * | 
|  | * Redistribution and use in source and binary forms, with or without | 
|  | * modification, are permitted provided that the following conditions | 
|  | * are met: | 
|  | * 1. Redistributions of source code must retain the above copyright | 
|  | *    notice, this list of conditions and the following disclaimer. | 
|  | * 2. Redistributions in binary form must reproduce the above copyright | 
|  | *    notice, this list of conditions and the following disclaimer in the | 
|  | *    documentation and/or other materials provided with the distribution. | 
|  | * | 
|  | * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' | 
|  | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, | 
|  | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | 
|  | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS | 
|  | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 
|  | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 
|  | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 
|  | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 
|  | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 
|  | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF | 
|  | * THE POSSIBILITY OF SUCH DAMAGE. | 
|  | */ | 
|  |  | 
|  | #pragma once | 
|  |  | 
|  | #include "Yarr.h" | 
|  | #include "YarrPattern.h" | 
|  | #include "YarrUnicodeProperties.h" | 
|  | #include <wtf/ASCIICType.h> | 
|  | #include <wtf/HashSet.h> | 
|  | #include <wtf/Optional.h> | 
|  | #include <wtf/text/StringBuilder.h> | 
|  | #include <wtf/text/WTFString.h> | 
|  |  | 
|  | namespace JSC { namespace Yarr { | 
|  |  | 
|  | // The Parser class should not be used directly - only via the Yarr::parse() method. | 
|  | template<class Delegate, typename CharType> | 
|  | class Parser { | 
|  | private: | 
|  | template<class FriendDelegate> | 
|  | friend ErrorCode parse(FriendDelegate&, const String& pattern, bool isUnicode, unsigned backReferenceLimit, bool isNamedForwardReferenceAllowed); | 
|  |  | 
|  | enum class UnicodeParseContext : uint8_t { PatternCodePoint, GroupName }; | 
|  |  | 
|  | /* | 
|  | * CharacterClassParserDelegate: | 
|  | * | 
|  | * The class CharacterClassParserDelegate is used in the parsing of character | 
|  | * classes.  This class handles detection of character ranges.  This class | 
|  | * implements enough of the delegate interface such that it can be passed to | 
|  | * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused | 
|  | * to perform the parsing of escape characters in character sets. | 
|  | */ | 
|  | class CharacterClassParserDelegate { | 
|  | public: | 
|  | CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err, bool isUnicode) | 
|  | : m_delegate(delegate) | 
|  | , m_errorCode(err) | 
|  | , m_isUnicode(isUnicode) | 
|  | , m_state(Empty) | 
|  | , m_character(0) | 
|  | { | 
|  | } | 
|  |  | 
|  | /* | 
|  | * begin(): | 
|  | * | 
|  | * Called at beginning of construction. | 
|  | */ | 
|  | void begin(bool invert) | 
|  | { | 
|  | m_delegate.atomCharacterClassBegin(invert); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * atomPatternCharacter(): | 
|  | * | 
|  | * This method is called either from parseCharacterClass() (for an unescaped | 
|  | * character in a character class), or from parseEscape(). In the former case | 
|  | * the value true will be passed for the argument 'hyphenIsRange', and in this | 
|  | * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ | 
|  | * is different to /[a\-z]/). | 
|  | */ | 
|  | void atomPatternCharacter(UChar32 ch, bool hyphenIsRange = false) | 
|  | { | 
|  | switch (m_state) { | 
|  | case AfterCharacterClass: | 
|  | // Following a built-in character class we need look out for a hyphen. | 
|  | // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. | 
|  | // If we see a hyphen following a character class then unlike usual | 
|  | // we'll report it to the delegate immediately, and put ourself into | 
|  | // a poisoned state. In a unicode pattern, any following calls to add | 
|  | // another character or character class will result in syntax error. | 
|  | // A hypen following a character class is itself valid, but only at | 
|  | // the end of a regex. | 
|  | if (hyphenIsRange && ch == '-') { | 
|  | m_delegate.atomCharacterClassAtom('-'); | 
|  | m_state = AfterCharacterClassHyphen; | 
|  | return; | 
|  | } | 
|  | // Otherwise just fall through - cached character so treat this as Empty. | 
|  | FALLTHROUGH; | 
|  |  | 
|  | case Empty: | 
|  | m_character = ch; | 
|  | m_state = CachedCharacter; | 
|  | return; | 
|  |  | 
|  | case CachedCharacter: | 
|  | if (hyphenIsRange && ch == '-') | 
|  | m_state = CachedCharacterHyphen; | 
|  | else { | 
|  | m_delegate.atomCharacterClassAtom(m_character); | 
|  | m_character = ch; | 
|  | } | 
|  | return; | 
|  |  | 
|  | case CachedCharacterHyphen: | 
|  | if (ch < m_character) { | 
|  | m_errorCode = ErrorCode::CharacterClassRangeOutOfOrder; | 
|  | return; | 
|  | } | 
|  | m_delegate.atomCharacterClassRange(m_character, ch); | 
|  | m_state = Empty; | 
|  | return; | 
|  |  | 
|  | // If we hit this case, we have an invalid range like /[\d-a]/. | 
|  | // See coment in atomBuiltInCharacterClass() below. | 
|  | case AfterCharacterClassHyphen: | 
|  | if (m_isUnicode) { | 
|  | m_errorCode = ErrorCode::CharacterClassRangeInvalid; | 
|  | return; | 
|  | } | 
|  | m_delegate.atomCharacterClassAtom(ch); | 
|  | m_state = Empty; | 
|  | return; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | * atomBuiltInCharacterClass(): | 
|  | * | 
|  | * Adds a built-in character class, called by parseEscape(). | 
|  | */ | 
|  | void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) | 
|  | { | 
|  | switch (m_state) { | 
|  | case CachedCharacter: | 
|  | // Flush the currently cached character, then fall through. | 
|  | m_delegate.atomCharacterClassAtom(m_character); | 
|  | FALLTHROUGH; | 
|  | case Empty: | 
|  | case AfterCharacterClass: | 
|  | m_delegate.atomCharacterClassBuiltIn(classID, invert); | 
|  | m_state = AfterCharacterClass; | 
|  | return; | 
|  |  | 
|  | // If we hit either of these cases, we have an invalid range that | 
|  | // looks something like /[a-\d]/ or /[\d-\d]/. | 
|  | // Since ES2015, this should be syntax error in a unicode pattern, | 
|  | // yet gracefully handled in a regular regex to avoid breaking the web. | 
|  | // Effectively we handle the hyphen as if it was (implicitly) escaped, | 
|  | // e.g. /[\d-a-z]/ is treated as /[\d\-a\-z]/. | 
|  | // See usages of CharacterRangeOrUnion abstract op in | 
|  | // https://tc39.es/ecma262/#sec-regular-expression-patterns-semantics | 
|  | case CachedCharacterHyphen: | 
|  | m_delegate.atomCharacterClassAtom(m_character); | 
|  | m_delegate.atomCharacterClassAtom('-'); | 
|  | FALLTHROUGH; | 
|  | case AfterCharacterClassHyphen: | 
|  | if (m_isUnicode) { | 
|  | m_errorCode = ErrorCode::CharacterClassRangeInvalid; | 
|  | return; | 
|  | } | 
|  | m_delegate.atomCharacterClassBuiltIn(classID, invert); | 
|  | m_state = Empty; | 
|  | return; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | * end(): | 
|  | * | 
|  | * Called at end of construction. | 
|  | */ | 
|  | void end() | 
|  | { | 
|  | if (m_state == CachedCharacter) | 
|  | m_delegate.atomCharacterClassAtom(m_character); | 
|  | else if (m_state == CachedCharacterHyphen) { | 
|  | m_delegate.atomCharacterClassAtom(m_character); | 
|  | m_delegate.atomCharacterClassAtom('-'); | 
|  | } | 
|  | m_delegate.atomCharacterClassEnd(); | 
|  | } | 
|  |  | 
|  | // parseEscape() should never call these delegate methods when | 
|  | // invoked with inCharacterClass set. | 
|  | NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); } | 
|  | NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); } | 
|  | NO_RETURN_DUE_TO_ASSERT void atomNamedBackReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); } | 
|  | NO_RETURN_DUE_TO_ASSERT void atomNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); } | 
|  |  | 
|  | private: | 
|  | Delegate& m_delegate; | 
|  | ErrorCode& m_errorCode; | 
|  | bool m_isUnicode; | 
|  | enum CharacterClassConstructionState { | 
|  | Empty, | 
|  | CachedCharacter, | 
|  | CachedCharacterHyphen, | 
|  | AfterCharacterClass, | 
|  | AfterCharacterClassHyphen, | 
|  | } m_state; | 
|  | UChar32 m_character; | 
|  | }; | 
|  |  | 
|  | Parser(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit, bool isNamedForwardReferenceAllowed) | 
|  | : m_delegate(delegate) | 
|  | , m_data(pattern.characters<CharType>()) | 
|  | , m_size(pattern.length()) | 
|  | , m_isUnicode(isUnicode) | 
|  | , m_backReferenceLimit(backReferenceLimit) | 
|  | , m_isNamedForwardReferenceAllowed(isNamedForwardReferenceAllowed) | 
|  | { | 
|  | } | 
|  |  | 
|  | // The handling of IdentityEscapes is different depending on the unicode flag. | 
|  | // For Unicode patterns, IdentityEscapes only include SyntaxCharacters or '/'. | 
|  | // For non-unicode patterns, most any character can be escaped. | 
|  | bool isIdentityEscapeAnError(int ch) | 
|  | { | 
|  | if (m_isUnicode && (!strchr("^$\\.*+?()[]{}|/", ch) || !ch)) { | 
|  | m_errorCode = ErrorCode::InvalidIdentityEscape; | 
|  | return true; | 
|  | } | 
|  |  | 
|  | return false; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * parseEscape(): | 
|  | * | 
|  | * Helper for parseTokens() AND parseCharacterClass(). | 
|  | * Unlike the other parser methods, this function does not report tokens | 
|  | * directly to the member delegate (m_delegate), instead tokens are | 
|  | * emitted to the delegate provided as an argument.  In the case of atom | 
|  | * escapes, parseTokens() will call parseEscape() passing m_delegate as | 
|  | * an argument, and as such the escape will be reported to the delegate. | 
|  | * | 
|  | * However this method may also be used by parseCharacterClass(), in which | 
|  | * case a CharacterClassParserDelegate will be passed as the delegate that | 
|  | * tokens should be added to.  A boolean flag is also provided to indicate | 
|  | * whether that an escape in a CharacterClass is being parsed (some parsing | 
|  | * rules change in this context). | 
|  | * | 
|  | * The boolean value returned by this method indicates whether the token | 
|  | * parsed was an atom (outside of a characted class \b and \B will be | 
|  | * interpreted as assertions). | 
|  | */ | 
|  | template<bool inCharacterClass, class EscapeDelegate> | 
|  | bool parseEscape(EscapeDelegate& delegate) | 
|  | { | 
|  | ASSERT(!hasError(m_errorCode)); | 
|  | ASSERT(peek() == '\\'); | 
|  | consume(); | 
|  |  | 
|  | if (atEndOfPattern()) { | 
|  | m_errorCode = ErrorCode::EscapeUnterminated; | 
|  | return false; | 
|  | } | 
|  |  | 
|  | switch (peek()) { | 
|  | // Assertions | 
|  | case 'b': | 
|  | consume(); | 
|  | if (inCharacterClass) | 
|  | delegate.atomPatternCharacter('\b'); | 
|  | else { | 
|  | delegate.assertionWordBoundary(false); | 
|  | return false; | 
|  | } | 
|  | break; | 
|  | case 'B': | 
|  | consume(); | 
|  | if (inCharacterClass) { | 
|  | if (isIdentityEscapeAnError('B')) | 
|  | break; | 
|  |  | 
|  | delegate.atomPatternCharacter('B'); | 
|  | } else { | 
|  | delegate.assertionWordBoundary(true); | 
|  | return false; | 
|  | } | 
|  | break; | 
|  |  | 
|  | // CharacterClassEscape | 
|  | case 'd': | 
|  | consume(); | 
|  | delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, false); | 
|  | break; | 
|  | case 's': | 
|  | consume(); | 
|  | delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, false); | 
|  | break; | 
|  | case 'w': | 
|  | consume(); | 
|  | delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, false); | 
|  | break; | 
|  | case 'D': | 
|  | consume(); | 
|  | delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, true); | 
|  | break; | 
|  | case 'S': | 
|  | consume(); | 
|  | delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, true); | 
|  | break; | 
|  | case 'W': | 
|  | consume(); | 
|  | delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, true); | 
|  | break; | 
|  |  | 
|  | case '0': { | 
|  | consume(); | 
|  |  | 
|  | if (!peekIsDigit()) { | 
|  | delegate.atomPatternCharacter(0); | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (m_isUnicode) { | 
|  | m_errorCode = ErrorCode::InvalidOctalEscape; | 
|  | break; | 
|  | } | 
|  |  | 
|  | delegate.atomPatternCharacter(consumeOctal(2)); | 
|  | break; | 
|  | } | 
|  |  | 
|  | // DecimalEscape | 
|  | case '1': | 
|  | case '2': | 
|  | case '3': | 
|  | case '4': | 
|  | case '5': | 
|  | case '6': | 
|  | case '7': | 
|  | case '8': | 
|  | case '9': { | 
|  | // For non-Unicode patterns, invalid backreferences are parsed as octal or decimal escapes. | 
|  | // First, try to parse this as backreference. | 
|  | if (!inCharacterClass) { | 
|  | ParseState state = saveState(); | 
|  |  | 
|  | unsigned backReference = consumeNumber(); | 
|  | if (backReference <= m_backReferenceLimit) { | 
|  | m_maxSeenBackReference = std::max(m_maxSeenBackReference, backReference); | 
|  | delegate.atomBackReference(backReference); | 
|  | break; | 
|  | } | 
|  |  | 
|  | restoreState(state); | 
|  | if (m_isUnicode) { | 
|  | m_errorCode = ErrorCode::InvalidBackreference; | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (m_isUnicode) { | 
|  | m_errorCode = ErrorCode::InvalidOctalEscape; | 
|  | break; | 
|  | } | 
|  |  | 
|  | delegate.atomPatternCharacter(peek() < '8' ? consumeOctal(3) : consume()); | 
|  | break; | 
|  | } | 
|  |  | 
|  | // ControlEscape | 
|  | case 'f': | 
|  | consume(); | 
|  | delegate.atomPatternCharacter('\f'); | 
|  | break; | 
|  | case 'n': | 
|  | consume(); | 
|  | delegate.atomPatternCharacter('\n'); | 
|  | break; | 
|  | case 'r': | 
|  | consume(); | 
|  | delegate.atomPatternCharacter('\r'); | 
|  | break; | 
|  | case 't': | 
|  | consume(); | 
|  | delegate.atomPatternCharacter('\t'); | 
|  | break; | 
|  | case 'v': | 
|  | consume(); | 
|  | delegate.atomPatternCharacter('\v'); | 
|  | break; | 
|  |  | 
|  | // ControlLetter | 
|  | case 'c': { | 
|  | ParseState state = saveState(); | 
|  | consume(); | 
|  | if (!atEndOfPattern()) { | 
|  | int control = consume(); | 
|  |  | 
|  | if (WTF::isASCIIAlpha(control)) { | 
|  | delegate.atomPatternCharacter(control & 0x1f); | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (m_isUnicode) { | 
|  | m_errorCode = ErrorCode::InvalidControlLetterEscape; | 
|  | break; | 
|  | } | 
|  |  | 
|  | // https://tc39.es/ecma262/#prod-annexB-ClassControlLetter | 
|  | if (inCharacterClass && (WTF::isASCIIDigit(control) || control == '_')) { | 
|  | delegate.atomPatternCharacter(control & 0x1f); | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (m_isUnicode) { | 
|  | m_errorCode = ErrorCode::InvalidIdentityEscape; | 
|  | break; | 
|  | } | 
|  |  | 
|  | restoreState(state); | 
|  | delegate.atomPatternCharacter('\\'); | 
|  | break; | 
|  | } | 
|  |  | 
|  | // HexEscape | 
|  | case 'x': { | 
|  | consume(); | 
|  | int x = tryConsumeHex(2); | 
|  | if (x == -1) { | 
|  | if (isIdentityEscapeAnError('x')) | 
|  | break; | 
|  |  | 
|  | delegate.atomPatternCharacter('x'); | 
|  | } else | 
|  | delegate.atomPatternCharacter(x); | 
|  | break; | 
|  | } | 
|  |  | 
|  | // Named backreference | 
|  | case 'k': { | 
|  | consume(); | 
|  | ParseState state = saveState(); | 
|  | if (!inCharacterClass && tryConsume('<')) { | 
|  | auto groupName = tryConsumeGroupName(); | 
|  | if (hasError(m_errorCode)) | 
|  | break; | 
|  |  | 
|  | if (groupName) { | 
|  | if (m_captureGroupNames.contains(groupName.value())) { | 
|  | delegate.atomNamedBackReference(groupName.value()); | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (m_isNamedForwardReferenceAllowed) { | 
|  | m_forwardReferenceNames.add(groupName.value()); | 
|  | delegate.atomNamedForwardReference(groupName.value()); | 
|  | break; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | restoreState(state); | 
|  | if (!isIdentityEscapeAnError('k')) { | 
|  | delegate.atomPatternCharacter('k'); | 
|  | m_kIdentityEscapeSeen = true; | 
|  | } | 
|  | break; | 
|  | } | 
|  |  | 
|  | // Unicode property escapes | 
|  | case 'p': | 
|  | case 'P': { | 
|  | int escapeChar = consume(); | 
|  |  | 
|  | if (!m_isUnicode) { | 
|  | if (isIdentityEscapeAnError(escapeChar)) | 
|  | break; | 
|  | delegate.atomPatternCharacter(escapeChar); | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (!atEndOfPattern() && peek() == '{') { | 
|  | consume(); | 
|  | auto optClassID = tryConsumeUnicodePropertyExpression(); | 
|  | if (!optClassID) { | 
|  | // tryConsumeUnicodePropertyExpression() will set m_errorCode for a malformed property expression | 
|  | break; | 
|  | } | 
|  | delegate.atomBuiltInCharacterClass(optClassID.value(), escapeChar == 'P'); | 
|  | } else | 
|  | m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; | 
|  | break; | 
|  | } | 
|  |  | 
|  | // UnicodeEscape | 
|  | case 'u': { | 
|  | int codePoint = tryConsumeUnicodeEscape<UnicodeParseContext::PatternCodePoint>(); | 
|  | if (hasError(m_errorCode)) | 
|  | break; | 
|  |  | 
|  | delegate.atomPatternCharacter(codePoint == -1 ? 'u' : codePoint); | 
|  | break; | 
|  | } | 
|  |  | 
|  | // IdentityEscape | 
|  | default: | 
|  | int ch = peek(); | 
|  |  | 
|  | if (ch == '-' && m_isUnicode && inCharacterClass) { | 
|  | // \- is allowed for ClassEscape with unicode flag. | 
|  | delegate.atomPatternCharacter(consume()); | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (isIdentityEscapeAnError(ch)) | 
|  | break; | 
|  |  | 
|  | delegate.atomPatternCharacter(consume()); | 
|  | } | 
|  |  | 
|  | return true; | 
|  | } | 
|  |  | 
|  | template<UnicodeParseContext context> | 
|  | UChar32 consumePossibleSurrogatePair() | 
|  | { | 
|  | bool unicodePatternOrGroupName = m_isUnicode || context == UnicodeParseContext::GroupName; | 
|  |  | 
|  | UChar32 ch = consume(); | 
|  | if (U16_IS_LEAD(ch) && unicodePatternOrGroupName && !atEndOfPattern()) { | 
|  | ParseState state = saveState(); | 
|  |  | 
|  | UChar32 surrogate2 = consume(); | 
|  | if (U16_IS_TRAIL(surrogate2)) | 
|  | ch = U16_GET_SUPPLEMENTARY(ch, surrogate2); | 
|  | else | 
|  | restoreState(state); | 
|  | } | 
|  |  | 
|  | return ch; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * parseAtomEscape(), parseCharacterClassEscape(): | 
|  | * | 
|  | * These methods alias to parseEscape(). | 
|  | */ | 
|  | bool parseAtomEscape() | 
|  | { | 
|  | return parseEscape<false>(m_delegate); | 
|  | } | 
|  | void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) | 
|  | { | 
|  | parseEscape<true>(delegate); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * parseCharacterClass(): | 
|  | * | 
|  | * Helper for parseTokens(); calls directly and indirectly (via parseCharacterClassEscape) | 
|  | * to an instance of CharacterClassParserDelegate, to describe the character class to the | 
|  | * delegate. | 
|  | */ | 
|  | void parseCharacterClass() | 
|  | { | 
|  | ASSERT(!hasError(m_errorCode)); | 
|  | ASSERT(peek() == '['); | 
|  | consume(); | 
|  |  | 
|  | CharacterClassParserDelegate characterClassConstructor(m_delegate, m_errorCode, m_isUnicode); | 
|  |  | 
|  | characterClassConstructor.begin(tryConsume('^')); | 
|  |  | 
|  | while (!atEndOfPattern()) { | 
|  | switch (peek()) { | 
|  | case ']': | 
|  | consume(); | 
|  | characterClassConstructor.end(); | 
|  | return; | 
|  |  | 
|  | case '\\': | 
|  | parseCharacterClassEscape(characterClassConstructor); | 
|  | break; | 
|  |  | 
|  | default: | 
|  | characterClassConstructor.atomPatternCharacter(consumePossibleSurrogatePair<UnicodeParseContext::PatternCodePoint>(), true); | 
|  | } | 
|  |  | 
|  | if (hasError(m_errorCode)) | 
|  | return; | 
|  | } | 
|  |  | 
|  | m_errorCode = ErrorCode::CharacterClassUnmatched; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * parseParenthesesBegin(): | 
|  | * | 
|  | * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. | 
|  | */ | 
|  | void parseParenthesesBegin() | 
|  | { | 
|  | ASSERT(!hasError(m_errorCode)); | 
|  | ASSERT(peek() == '('); | 
|  | consume(); | 
|  |  | 
|  | auto type = ParenthesesType::Subpattern; | 
|  |  | 
|  | if (tryConsume('?')) { | 
|  | if (atEndOfPattern()) { | 
|  | m_errorCode = ErrorCode::ParenthesesTypeInvalid; | 
|  | return; | 
|  | } | 
|  |  | 
|  | switch (consume()) { | 
|  | case ':': | 
|  | m_delegate.atomParenthesesSubpatternBegin(false); | 
|  | break; | 
|  |  | 
|  | case '=': | 
|  | m_delegate.atomParentheticalAssertionBegin(); | 
|  | type = ParenthesesType::Assertion; | 
|  | break; | 
|  |  | 
|  | case '!': | 
|  | m_delegate.atomParentheticalAssertionBegin(true); | 
|  | type = ParenthesesType::Assertion; | 
|  | break; | 
|  |  | 
|  | case '<': { | 
|  | auto groupName = tryConsumeGroupName(); | 
|  | if (hasError(m_errorCode)) | 
|  | break; | 
|  |  | 
|  | if (groupName) { | 
|  | if (m_kIdentityEscapeSeen) { | 
|  | m_errorCode = ErrorCode::InvalidNamedBackReference; | 
|  | break; | 
|  | } | 
|  |  | 
|  | auto setAddResult = m_captureGroupNames.add(groupName.value()); | 
|  | if (setAddResult.isNewEntry) | 
|  | m_delegate.atomParenthesesSubpatternBegin(true, groupName); | 
|  | else | 
|  | m_errorCode = ErrorCode::DuplicateGroupName; | 
|  | } else | 
|  | m_errorCode = ErrorCode::InvalidGroupName; | 
|  |  | 
|  | break; | 
|  | } | 
|  |  | 
|  | default: | 
|  | m_errorCode = ErrorCode::ParenthesesTypeInvalid; | 
|  | } | 
|  | } else | 
|  | m_delegate.atomParenthesesSubpatternBegin(); | 
|  |  | 
|  | if (type == ParenthesesType::Subpattern) | 
|  | ++m_numSubpatterns; | 
|  |  | 
|  | m_parenthesesStack.append(type); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * parseParenthesesEnd(): | 
|  | * | 
|  | * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). | 
|  | * | 
|  | * The boolean value returned by this method indicates whether the token parsed | 
|  | * was either an Atom or, for web compatibility reasons, QuantifiableAssertion | 
|  | * in non-Unicode pattern. | 
|  | */ | 
|  | bool parseParenthesesEnd() | 
|  | { | 
|  | ASSERT(!hasError(m_errorCode)); | 
|  | ASSERT(peek() == ')'); | 
|  | consume(); | 
|  |  | 
|  | if (m_parenthesesStack.isEmpty()) { | 
|  | m_errorCode = ErrorCode::ParenthesesUnmatched; | 
|  | return false; | 
|  | } | 
|  |  | 
|  | m_delegate.atomParenthesesEnd(); | 
|  | auto type = m_parenthesesStack.takeLast(); | 
|  | return type == ParenthesesType::Subpattern || !m_isUnicode; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * parseQuantifier(): | 
|  | * | 
|  | * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. | 
|  | */ | 
|  | void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) | 
|  | { | 
|  | ASSERT(!hasError(m_errorCode)); | 
|  | ASSERT(min <= max); | 
|  |  | 
|  | if (min == UINT_MAX) { | 
|  | m_errorCode = ErrorCode::QuantifierTooLarge; | 
|  | return; | 
|  | } | 
|  |  | 
|  | if (lastTokenWasAnAtom) | 
|  | m_delegate.quantifyAtom(min, max, !tryConsume('?')); | 
|  | else | 
|  | m_errorCode = ErrorCode::QuantifierWithoutAtom; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * parseTokens(): | 
|  | * | 
|  | * This method loops over the input pattern reporting tokens to the delegate. | 
|  | * The method returns when a parse error is detected, or the end of the pattern | 
|  | * is reached.  One piece of state is tracked around the loop, which is whether | 
|  | * the last token passed to the delegate was an atom (this is necessary to detect | 
|  | * a parse error when a quantifier provided without an atom to quantify). | 
|  | */ | 
|  | void parseTokens() | 
|  | { | 
|  | bool lastTokenWasAnAtom = false; | 
|  |  | 
|  | while (!atEndOfPattern()) { | 
|  | switch (peek()) { | 
|  | case '|': | 
|  | consume(); | 
|  | m_delegate.disjunction(); | 
|  | lastTokenWasAnAtom = false; | 
|  | break; | 
|  |  | 
|  | case '(': | 
|  | parseParenthesesBegin(); | 
|  | lastTokenWasAnAtom = false; | 
|  | break; | 
|  |  | 
|  | case ')': | 
|  | lastTokenWasAnAtom = parseParenthesesEnd(); | 
|  | break; | 
|  |  | 
|  | case '^': | 
|  | consume(); | 
|  | m_delegate.assertionBOL(); | 
|  | lastTokenWasAnAtom = false; | 
|  | break; | 
|  |  | 
|  | case '$': | 
|  | consume(); | 
|  | m_delegate.assertionEOL(); | 
|  | lastTokenWasAnAtom = false; | 
|  | break; | 
|  |  | 
|  | case '.': | 
|  | consume(); | 
|  | m_delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DotClassID, false); | 
|  | lastTokenWasAnAtom = true; | 
|  | break; | 
|  |  | 
|  | case '[': | 
|  | parseCharacterClass(); | 
|  | lastTokenWasAnAtom = true; | 
|  | break; | 
|  |  | 
|  | case ']': | 
|  | case '}': | 
|  | if (m_isUnicode) { | 
|  | m_errorCode = ErrorCode::BracketUnmatched; | 
|  | break; | 
|  | } | 
|  |  | 
|  | m_delegate.atomPatternCharacter(consume()); | 
|  | lastTokenWasAnAtom = true; | 
|  | break; | 
|  |  | 
|  | case '\\': | 
|  | lastTokenWasAnAtom = parseAtomEscape(); | 
|  | break; | 
|  |  | 
|  | case '*': | 
|  | consume(); | 
|  | parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); | 
|  | lastTokenWasAnAtom = false; | 
|  | break; | 
|  |  | 
|  | case '+': | 
|  | consume(); | 
|  | parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); | 
|  | lastTokenWasAnAtom = false; | 
|  | break; | 
|  |  | 
|  | case '?': | 
|  | consume(); | 
|  | parseQuantifier(lastTokenWasAnAtom, 0, 1); | 
|  | lastTokenWasAnAtom = false; | 
|  | break; | 
|  |  | 
|  | case '{': { | 
|  | ParseState state = saveState(); | 
|  |  | 
|  | consume(); | 
|  | if (peekIsDigit()) { | 
|  | unsigned min = consumeNumber(); | 
|  | unsigned max = min; | 
|  |  | 
|  | if (tryConsume(',')) | 
|  | max = peekIsDigit() ? consumeNumber() : quantifyInfinite; | 
|  |  | 
|  | if (tryConsume('}')) { | 
|  | if (min <= max) | 
|  | parseQuantifier(lastTokenWasAnAtom, min, max); | 
|  | else | 
|  | m_errorCode = ErrorCode::QuantifierOutOfOrder; | 
|  | lastTokenWasAnAtom = false; | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (m_isUnicode) { | 
|  | m_errorCode = ErrorCode::QuantifierIncomplete; | 
|  | break; | 
|  | } | 
|  |  | 
|  | restoreState(state); | 
|  | // if we did not find a complete quantifer, fall through to the default case. | 
|  | FALLTHROUGH; | 
|  | } | 
|  |  | 
|  | default: | 
|  | m_delegate.atomPatternCharacter(consumePossibleSurrogatePair<UnicodeParseContext::PatternCodePoint>()); | 
|  | lastTokenWasAnAtom = true; | 
|  | } | 
|  |  | 
|  | if (hasError(m_errorCode)) | 
|  | return; | 
|  | } | 
|  |  | 
|  | if (!m_parenthesesStack.isEmpty()) | 
|  | m_errorCode = ErrorCode::MissingParentheses; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * parse(): | 
|  | * | 
|  | * This method calls parseTokens() to parse over the input and returns error code for a result. | 
|  | */ | 
|  | ErrorCode parse() | 
|  | { | 
|  | if (m_size > MAX_PATTERN_SIZE) | 
|  | return ErrorCode::PatternTooLarge; | 
|  |  | 
|  | parseTokens(); | 
|  |  | 
|  | if (!hasError(m_errorCode)) { | 
|  | ASSERT(atEndOfPattern()); | 
|  | handleIllegalReferences(); | 
|  | ASSERT(atEndOfPattern()); | 
|  | } | 
|  |  | 
|  | return m_errorCode; | 
|  | } | 
|  |  | 
|  | void handleIllegalReferences() | 
|  | { | 
|  | bool shouldReparse = false; | 
|  |  | 
|  | if (m_maxSeenBackReference > m_numSubpatterns) { | 
|  | // Contains illegal numeric backreference. See https://tc39.es/ecma262/#prod-annexB-AtomEscape | 
|  | if (m_isUnicode) { | 
|  | m_errorCode = ErrorCode::InvalidBackreference; | 
|  | return; | 
|  | } | 
|  |  | 
|  | m_backReferenceLimit = m_numSubpatterns; | 
|  | shouldReparse = true; | 
|  | } | 
|  |  | 
|  | if (m_kIdentityEscapeSeen && !m_captureGroupNames.isEmpty()) { | 
|  | m_errorCode = ErrorCode::InvalidNamedBackReference; | 
|  | return; | 
|  | } | 
|  |  | 
|  | if (containsIllegalNamedForwardReference()) { | 
|  | // \k<a> is parsed as named reference in Unicode patterns because of strict IdentityEscape grammar. | 
|  | // See https://tc39.es/ecma262/#sec-patterns-static-semantics-early-errors | 
|  | if (m_isUnicode || !m_captureGroupNames.isEmpty()) { | 
|  | m_errorCode = ErrorCode::InvalidNamedBackReference; | 
|  | return; | 
|  | } | 
|  |  | 
|  | m_isNamedForwardReferenceAllowed = false; | 
|  | shouldReparse = true; | 
|  | } | 
|  |  | 
|  | if (shouldReparse) { | 
|  | resetForReparsing(); | 
|  | parseTokens(); | 
|  | } | 
|  | } | 
|  |  | 
|  | bool containsIllegalNamedForwardReference() | 
|  | { | 
|  | if (m_forwardReferenceNames.isEmpty()) | 
|  | return false; | 
|  |  | 
|  | if (m_captureGroupNames.isEmpty()) | 
|  | return true; | 
|  |  | 
|  | for (auto& entry : m_forwardReferenceNames) { | 
|  | if (!m_captureGroupNames.contains(entry)) | 
|  | return true; | 
|  | } | 
|  |  | 
|  | return false; | 
|  | } | 
|  |  | 
|  | void resetForReparsing() | 
|  | { | 
|  | ASSERT(!hasError(m_errorCode)); | 
|  |  | 
|  | m_delegate.resetForReparsing(); | 
|  | m_index = 0; | 
|  | m_numSubpatterns = 0; | 
|  | m_maxSeenBackReference = 0; | 
|  | m_kIdentityEscapeSeen = false; | 
|  | m_parenthesesStack.clear(); | 
|  | m_captureGroupNames.clear(); | 
|  | m_forwardReferenceNames.clear(); | 
|  | } | 
|  |  | 
|  | // Misc helper functions: | 
|  |  | 
|  | typedef unsigned ParseState; | 
|  |  | 
|  | ParseState saveState() | 
|  | { | 
|  | return m_index; | 
|  | } | 
|  |  | 
|  | void restoreState(ParseState state) | 
|  | { | 
|  | m_index = state; | 
|  | } | 
|  |  | 
|  | bool atEndOfPattern() | 
|  | { | 
|  | ASSERT(m_index <= m_size); | 
|  | return m_index == m_size; | 
|  | } | 
|  |  | 
|  | unsigned patternRemaining() | 
|  | { | 
|  | ASSERT(m_index <= m_size); | 
|  | return m_size - m_index; | 
|  | } | 
|  |  | 
|  | int peek() | 
|  | { | 
|  | ASSERT(m_index < m_size); | 
|  | return m_data[m_index]; | 
|  | } | 
|  |  | 
|  | bool peekIsDigit() | 
|  | { | 
|  | return !atEndOfPattern() && WTF::isASCIIDigit(peek()); | 
|  | } | 
|  |  | 
|  | unsigned peekDigit() | 
|  | { | 
|  | ASSERT(peekIsDigit()); | 
|  | return peek() - '0'; | 
|  | } | 
|  |  | 
|  | template<UnicodeParseContext context> | 
|  | int tryConsumeUnicodeEscape() | 
|  | { | 
|  | ASSERT(!hasError(m_errorCode)); | 
|  |  | 
|  | bool unicodePatternOrGroupName = m_isUnicode || context == UnicodeParseContext::GroupName; | 
|  |  | 
|  | if (!tryConsume('u') || atEndOfPattern()) { | 
|  | if (unicodePatternOrGroupName) | 
|  | m_errorCode = ErrorCode::InvalidUnicodeEscape; | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | if (unicodePatternOrGroupName && tryConsume('{')) { | 
|  | int codePoint = 0; | 
|  | do { | 
|  | if (atEndOfPattern() || !isASCIIHexDigit(peek())) { | 
|  | m_errorCode = ErrorCode::InvalidUnicodeCodePointEscape; | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | codePoint = (codePoint << 4) | toASCIIHexValue(consume()); | 
|  |  | 
|  | if (codePoint > UCHAR_MAX_VALUE) { | 
|  | m_errorCode = ErrorCode::InvalidUnicodeCodePointEscape; | 
|  | return -1; | 
|  | } | 
|  | } while (!atEndOfPattern() && peek() != '}'); | 
|  |  | 
|  | if (!tryConsume('}')) { | 
|  | m_errorCode = ErrorCode::InvalidUnicodeCodePointEscape; | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | return codePoint; | 
|  | } | 
|  |  | 
|  | int codeUnit = tryConsumeHex(4); | 
|  | if (codeUnit == -1) { | 
|  | if (unicodePatternOrGroupName) | 
|  | m_errorCode = ErrorCode::InvalidUnicodeEscape; | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | // If we have the first of a surrogate pair, look for the second. | 
|  | if (U16_IS_LEAD(codeUnit) && unicodePatternOrGroupName && patternRemaining() >= 6 && peek() == '\\') { | 
|  | ParseState state = saveState(); | 
|  | consume(); | 
|  |  | 
|  | if (tryConsume('u')) { | 
|  | int surrogate2 = tryConsumeHex(4); | 
|  | if (U16_IS_TRAIL(surrogate2)) | 
|  | return U16_GET_SUPPLEMENTARY(codeUnit, surrogate2); | 
|  | } | 
|  |  | 
|  | restoreState(state); | 
|  | } | 
|  |  | 
|  | return codeUnit; | 
|  | } | 
|  |  | 
|  | int tryConsumeIdentifierCharacter() | 
|  | { | 
|  | if (tryConsume('\\')) | 
|  | return tryConsumeUnicodeEscape<UnicodeParseContext::GroupName>(); | 
|  |  | 
|  | return consumePossibleSurrogatePair<UnicodeParseContext::GroupName>(); | 
|  | } | 
|  |  | 
|  | bool isIdentifierStart(int ch) | 
|  | { | 
|  | return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & U_GC_L_MASK); | 
|  | } | 
|  |  | 
|  | bool isIdentifierPart(int ch) | 
|  | { | 
|  | return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & (U_GC_L_MASK | U_GC_MN_MASK | U_GC_MC_MASK | U_GC_ND_MASK | U_GC_PC_MASK)) || ch == 0x200C || ch == 0x200D; | 
|  | } | 
|  |  | 
|  | bool isUnicodePropertyValueExpressionChar(int ch) | 
|  | { | 
|  | return WTF::isASCIIAlphanumeric(ch) || ch == '_' || ch == '='; | 
|  | } | 
|  |  | 
|  | int consume() | 
|  | { | 
|  | ASSERT(m_index < m_size); | 
|  | return m_data[m_index++]; | 
|  | } | 
|  |  | 
|  | unsigned consumeDigit() | 
|  | { | 
|  | ASSERT(peekIsDigit()); | 
|  | return consume() - '0'; | 
|  | } | 
|  |  | 
|  | unsigned consumeNumber() | 
|  | { | 
|  | Checked<unsigned, RecordOverflow> n = consumeDigit(); | 
|  | while (peekIsDigit()) | 
|  | n = n * 10 + consumeDigit(); | 
|  | return n.hasOverflowed() ? quantifyInfinite : n.unsafeGet(); | 
|  | } | 
|  |  | 
|  | // https://tc39.es/ecma262/#prod-annexB-LegacyOctalEscapeSequence | 
|  | unsigned consumeOctal(unsigned count) | 
|  | { | 
|  | unsigned octal = 0; | 
|  | while (count-- && octal < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) | 
|  | octal = octal * 8 + consumeDigit(); | 
|  | return octal; | 
|  | } | 
|  |  | 
|  | bool tryConsume(UChar ch) | 
|  | { | 
|  | if (atEndOfPattern() || (m_data[m_index] != ch)) | 
|  | return false; | 
|  | ++m_index; | 
|  | return true; | 
|  | } | 
|  |  | 
|  | int tryConsumeHex(int count) | 
|  | { | 
|  | ParseState state = saveState(); | 
|  |  | 
|  | int n = 0; | 
|  | while (count--) { | 
|  | if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { | 
|  | restoreState(state); | 
|  | return -1; | 
|  | } | 
|  | n = (n << 4) | WTF::toASCIIHexValue(consume()); | 
|  | } | 
|  | return n; | 
|  | } | 
|  |  | 
|  | Optional<String> tryConsumeGroupName() | 
|  | { | 
|  | if (atEndOfPattern()) | 
|  | return WTF::nullopt; | 
|  |  | 
|  | ParseState state = saveState(); | 
|  |  | 
|  | int ch = tryConsumeIdentifierCharacter(); | 
|  |  | 
|  | if (isIdentifierStart(ch)) { | 
|  | StringBuilder identifierBuilder; | 
|  | identifierBuilder.appendCharacter(ch); | 
|  |  | 
|  | while (!atEndOfPattern()) { | 
|  | ch = tryConsumeIdentifierCharacter(); | 
|  | if (ch == '>') | 
|  | return Optional<String>(identifierBuilder.toString()); | 
|  |  | 
|  | if (!isIdentifierPart(ch)) | 
|  | break; | 
|  |  | 
|  | identifierBuilder.appendCharacter(ch); | 
|  | } | 
|  | } | 
|  |  | 
|  | restoreState(state); | 
|  |  | 
|  | return WTF::nullopt; | 
|  | } | 
|  |  | 
|  | Optional<BuiltInCharacterClassID> tryConsumeUnicodePropertyExpression() | 
|  | { | 
|  | if (atEndOfPattern() || !isUnicodePropertyValueExpressionChar(peek())) { | 
|  | m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; | 
|  | return WTF::nullopt; | 
|  | } | 
|  |  | 
|  | StringBuilder expressionBuilder; | 
|  | String unicodePropertyName; | 
|  | bool foundEquals = false; | 
|  | unsigned errors = 0; | 
|  |  | 
|  | expressionBuilder.appendCharacter(consume()); | 
|  |  | 
|  | while (!atEndOfPattern()) { | 
|  | int ch = peek(); | 
|  | if (ch == '}') { | 
|  | consume(); | 
|  | if (errors) { | 
|  | m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; | 
|  | return WTF::nullopt; | 
|  | } | 
|  |  | 
|  | if (foundEquals) { | 
|  | auto result = unicodeMatchPropertyValue(unicodePropertyName, expressionBuilder.toString()); | 
|  | if (!result) | 
|  | m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; | 
|  | return result; | 
|  | } | 
|  |  | 
|  | auto result = unicodeMatchProperty(expressionBuilder.toString()); | 
|  | if (!result) | 
|  | m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; | 
|  | return result; | 
|  | } | 
|  |  | 
|  | consume(); | 
|  | if (ch == '=') { | 
|  | if (!foundEquals) { | 
|  | foundEquals = true; | 
|  | unicodePropertyName = expressionBuilder.toString(); | 
|  | expressionBuilder.clear(); | 
|  | } else | 
|  | errors++; | 
|  | } else if (!isUnicodePropertyValueExpressionChar(ch)) | 
|  | errors++; | 
|  | else | 
|  | expressionBuilder.appendCharacter(ch); | 
|  | } | 
|  |  | 
|  | m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; | 
|  | return WTF::nullopt; | 
|  | } | 
|  |  | 
|  | enum class ParenthesesType : uint8_t { Subpattern, Assertion }; | 
|  |  | 
|  | Delegate& m_delegate; | 
|  | ErrorCode m_errorCode { ErrorCode::NoError }; | 
|  | const CharType* m_data; | 
|  | unsigned m_size; | 
|  | unsigned m_index { 0 }; | 
|  | bool m_isUnicode; | 
|  | unsigned m_backReferenceLimit; | 
|  | unsigned m_numSubpatterns { 0 }; | 
|  | unsigned m_maxSeenBackReference { 0 }; | 
|  | bool m_isNamedForwardReferenceAllowed; | 
|  | bool m_kIdentityEscapeSeen { false }; | 
|  | Vector<ParenthesesType, 16> m_parenthesesStack; | 
|  | HashSet<String> m_captureGroupNames; | 
|  | HashSet<String> m_forwardReferenceNames; | 
|  |  | 
|  | // Derived by empirical testing of compile time in PCRE and WREC. | 
|  | static constexpr unsigned MAX_PATTERN_SIZE = 1024 * 1024; | 
|  | }; | 
|  |  | 
|  | /* | 
|  | * Yarr::parse(): | 
|  | * | 
|  | * The parse method is passed a pattern to be parsed and a delegate upon which | 
|  | * callbacks will be made to record the parsed tokens forming the regex. | 
|  | * Yarr::parse() returns null on success, or a const C string providing an error | 
|  | * message where a parse error occurs. | 
|  | * | 
|  | * The Delegate must implement the following interface: | 
|  | * | 
|  | *    void assertionBOL(); | 
|  | *    void assertionEOL(); | 
|  | *    void assertionWordBoundary(bool invert); | 
|  | * | 
|  | *    void atomPatternCharacter(UChar32 ch); | 
|  | *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); | 
|  | *    void atomCharacterClassBegin(bool invert) | 
|  | *    void atomCharacterClassAtom(UChar32 ch) | 
|  | *    void atomCharacterClassRange(UChar32 begin, UChar32 end) | 
|  | *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) | 
|  | *    void atomCharacterClassEnd() | 
|  | *    void atomParenthesesSubpatternBegin(bool capture = true, Optional<String> groupName); | 
|  | *    void atomParentheticalAssertionBegin(bool invert = false); | 
|  | *    void atomParenthesesEnd(); | 
|  | *    void atomBackReference(unsigned subpatternId); | 
|  | *    void atomNamedBackReference(const String& subpatternName); | 
|  | *    void atomNamedForwardReference(const String& subpatternName); | 
|  | * | 
|  | *    void quantifyAtom(unsigned min, unsigned max, bool greedy); | 
|  | * | 
|  | *    void disjunction(); | 
|  | * | 
|  | *    void resetForReparsing(); | 
|  | * | 
|  | * The regular expression is described by a sequence of assertion*() and atom*() | 
|  | * callbacks to the delegate, describing the terms in the regular expression. | 
|  | * Following an atom a quantifyAtom() call may occur to indicate that the previous | 
|  | * atom should be quantified.  In the case of atoms described across multiple | 
|  | * calls (parentheses and character classes) the call to quantifyAtom() will come | 
|  | * after the call to the atom*End() method, never after atom*Begin(). | 
|  | * | 
|  | * Character classes may either be described by a single call to | 
|  | * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. | 
|  | * In the latter case, ...Begin() will be called, followed by a sequence of | 
|  | * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). | 
|  | * | 
|  | * Sequences of atoms and assertions are broken into alternatives via calls to | 
|  | * disjunction().  Assertions, atoms, and disjunctions emitted between calls to | 
|  | * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. | 
|  | * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular | 
|  | * capturing subpattern, this will be the subpatternId associated with these | 
|  | * parentheses, and will also by definition be the lowest subpatternId of these | 
|  | * parentheses and of any nested paretheses.  The atomParenthesesEnd() method | 
|  | * is passed the subpatternId of the last capturing subexpression nested within | 
|  | * these paretheses.  In the case of a capturing subpattern with no nested | 
|  | * capturing subpatterns, the same subpatternId will be passed to the begin and | 
|  | * end functions.  In the case of non-capturing subpatterns the subpatternId | 
|  | * passed to the begin method is also the first possible subpatternId that might | 
|  | * be nested within these paretheses.  If a set of non-capturing parentheses does | 
|  | * not contain any capturing subpatterns, then the subpatternId passed to begin | 
|  | * will be greater than the subpatternId passed to end. | 
|  | */ | 
|  |  | 
|  | template<class Delegate> | 
|  | ErrorCode parse(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit = quantifyInfinite, bool isNamedForwardReferenceAllowed = true) | 
|  | { | 
|  | if (pattern.is8Bit()) | 
|  | return Parser<Delegate, LChar>(delegate, pattern, isUnicode, backReferenceLimit, isNamedForwardReferenceAllowed).parse(); | 
|  | return Parser<Delegate, UChar>(delegate, pattern, isUnicode, backReferenceLimit, isNamedForwardReferenceAllowed).parse(); | 
|  | } | 
|  |  | 
|  | } } // namespace JSC::Yarr |