|  | /* | 
|  | * Copyright (C) 2009 Apple Inc. All rights reserved. | 
|  | * | 
|  | * Redistribution and use in source and binary forms, with or without | 
|  | * modification, are permitted provided that the following conditions | 
|  | * are met: | 
|  | * 1. Redistributions of source code must retain the above copyright | 
|  | *    notice, this list of conditions and the following disclaimer. | 
|  | * 2. Redistributions in binary form must reproduce the above copyright | 
|  | *    notice, this list of conditions and the following disclaimer in the | 
|  | *    documentation and/or other materials provided with the distribution. | 
|  | * | 
|  | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | 
|  | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
|  | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | 
|  | * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR | 
|  | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | 
|  | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | 
|  | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | 
|  | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | 
|  | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 
|  | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 
|  | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 
|  | */ | 
|  |  | 
|  | #ifndef YarrParser_h | 
|  | #define YarrParser_h | 
|  |  | 
|  | #include "Yarr.h" | 
|  | #include <wtf/ASCIICType.h> | 
|  | #include <wtf/text/WTFString.h> | 
|  | #include <wtf/unicode/Unicode.h> | 
|  |  | 
|  | namespace JSC { namespace Yarr { | 
|  |  | 
|  | #define REGEXP_ERROR_PREFIX "Invalid regular expression: " | 
|  |  | 
|  | enum BuiltInCharacterClassID { | 
|  | DigitClassID, | 
|  | SpaceClassID, | 
|  | WordClassID, | 
|  | NewlineClassID, | 
|  | }; | 
|  |  | 
|  | // The Parser class should not be used directly - only via the Yarr::parse() method. | 
|  | template<class Delegate, typename CharType> | 
|  | class Parser { | 
|  | private: | 
|  | template<class FriendDelegate> | 
|  | friend const char* parse(FriendDelegate&, const String& pattern, unsigned backReferenceLimit); | 
|  |  | 
|  | enum ErrorCode { | 
|  | NoError, | 
|  | PatternTooLarge, | 
|  | QuantifierOutOfOrder, | 
|  | QuantifierWithoutAtom, | 
|  | QuantifierTooLarge, | 
|  | MissingParentheses, | 
|  | ParenthesesUnmatched, | 
|  | ParenthesesTypeInvalid, | 
|  | CharacterClassUnmatched, | 
|  | CharacterClassOutOfOrder, | 
|  | EscapeUnterminated, | 
|  | NumberOfErrorCodes | 
|  | }; | 
|  |  | 
|  | /* | 
|  | * CharacterClassParserDelegate: | 
|  | * | 
|  | * The class CharacterClassParserDelegate is used in the parsing of character | 
|  | * classes.  This class handles detection of character ranges.  This class | 
|  | * implements enough of the delegate interface such that it can be passed to | 
|  | * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused | 
|  | * to perform the parsing of escape characters in character sets. | 
|  | */ | 
|  | class CharacterClassParserDelegate { | 
|  | public: | 
|  | CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) | 
|  | : m_delegate(delegate) | 
|  | , m_err(err) | 
|  | , m_state(Empty) | 
|  | , m_character(0) | 
|  | { | 
|  | } | 
|  |  | 
|  | /* | 
|  | * begin(): | 
|  | * | 
|  | * Called at beginning of construction. | 
|  | */ | 
|  | void begin(bool invert) | 
|  | { | 
|  | m_delegate.atomCharacterClassBegin(invert); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * atomPatternCharacter(): | 
|  | * | 
|  | * This method is called either from parseCharacterClass() (for an unescaped | 
|  | * character in a character class), or from parseEscape(). In the former case | 
|  | * the value true will be passed for the argument 'hyphenIsRange', and in this | 
|  | * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ | 
|  | * is different to /[a\-z]/). | 
|  | */ | 
|  | void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) | 
|  | { | 
|  | switch (m_state) { | 
|  | case AfterCharacterClass: | 
|  | // Following a builtin character class we need look out for a hyphen. | 
|  | // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. | 
|  | // If we see a hyphen following a charater class then unlike usual | 
|  | // we'll report it to the delegate immediately, and put ourself into | 
|  | // a poisoned state. Any following calls to add another character or | 
|  | // character class will result in an error. (A hypen following a | 
|  | // character-class is itself valid, but only  at the end of a regex). | 
|  | if (hyphenIsRange && ch == '-') { | 
|  | m_delegate.atomCharacterClassAtom('-'); | 
|  | m_state = AfterCharacterClassHyphen; | 
|  | return; | 
|  | } | 
|  | // Otherwise just fall through - cached character so treat this as Empty. | 
|  |  | 
|  | case Empty: | 
|  | m_character = ch; | 
|  | m_state = CachedCharacter; | 
|  | return; | 
|  |  | 
|  | case CachedCharacter: | 
|  | if (hyphenIsRange && ch == '-') | 
|  | m_state = CachedCharacterHyphen; | 
|  | else { | 
|  | m_delegate.atomCharacterClassAtom(m_character); | 
|  | m_character = ch; | 
|  | } | 
|  | return; | 
|  |  | 
|  | case CachedCharacterHyphen: | 
|  | if (ch < m_character) { | 
|  | m_err = CharacterClassOutOfOrder; | 
|  | return; | 
|  | } | 
|  | m_delegate.atomCharacterClassRange(m_character, ch); | 
|  | m_state = Empty; | 
|  | return; | 
|  |  | 
|  | // See coment in atomBuiltInCharacterClass below. | 
|  | // This too is technically an error, per ECMA-262, and again we | 
|  | // we chose to allow this.  Note a subtlely here that while we | 
|  | // diverge from the spec's definition of CharacterRange we do | 
|  | // remain in compliance with the grammar.  For example, consider | 
|  | // the expression /[\d-a-z]/.  We comply with the grammar in | 
|  | // this case by not allowing a-z to be matched as a range. | 
|  | case AfterCharacterClassHyphen: | 
|  | m_delegate.atomCharacterClassAtom(ch); | 
|  | m_state = Empty; | 
|  | return; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | * atomBuiltInCharacterClass(): | 
|  | * | 
|  | * Adds a built-in character class, called by parseEscape(). | 
|  | */ | 
|  | void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) | 
|  | { | 
|  | switch (m_state) { | 
|  | case CachedCharacter: | 
|  | // Flush the currently cached character, then fall through. | 
|  | m_delegate.atomCharacterClassAtom(m_character); | 
|  |  | 
|  | case Empty: | 
|  | case AfterCharacterClass: | 
|  | m_state = AfterCharacterClass; | 
|  | m_delegate.atomCharacterClassBuiltIn(classID, invert); | 
|  | return; | 
|  |  | 
|  | // If we hit either of these cases, we have an invalid range that | 
|  | // looks something like /[x-\d]/ or /[\d-\d]/. | 
|  | // According to ECMA-262 this should be a syntax error, but | 
|  | // empirical testing shows this to break teh webz.  Instead we | 
|  | // comply with to the ECMA-262 grammar, and assume the grammar to | 
|  | // have matched the range correctly, but tweak our interpretation | 
|  | // of CharacterRange.  Effectively we implicitly handle the hyphen | 
|  | // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/. | 
|  | case CachedCharacterHyphen: | 
|  | m_delegate.atomCharacterClassAtom(m_character); | 
|  | m_delegate.atomCharacterClassAtom('-'); | 
|  | // fall through | 
|  | case AfterCharacterClassHyphen: | 
|  | m_delegate.atomCharacterClassBuiltIn(classID, invert); | 
|  | m_state = Empty; | 
|  | return; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* | 
|  | * end(): | 
|  | * | 
|  | * Called at end of construction. | 
|  | */ | 
|  | void end() | 
|  | { | 
|  | if (m_state == CachedCharacter) | 
|  | m_delegate.atomCharacterClassAtom(m_character); | 
|  | else if (m_state == CachedCharacterHyphen) { | 
|  | m_delegate.atomCharacterClassAtom(m_character); | 
|  | m_delegate.atomCharacterClassAtom('-'); | 
|  | } | 
|  | m_delegate.atomCharacterClassEnd(); | 
|  | } | 
|  |  | 
|  | // parseEscape() should never call these delegate methods when | 
|  | // invoked with inCharacterClass set. | 
|  | NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); } | 
|  | NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); } | 
|  |  | 
|  | private: | 
|  | Delegate& m_delegate; | 
|  | ErrorCode& m_err; | 
|  | enum CharacterClassConstructionState { | 
|  | Empty, | 
|  | CachedCharacter, | 
|  | CachedCharacterHyphen, | 
|  | AfterCharacterClass, | 
|  | AfterCharacterClassHyphen, | 
|  | } m_state; | 
|  | UChar m_character; | 
|  | }; | 
|  |  | 
|  | Parser(Delegate& delegate, const String& pattern, unsigned backReferenceLimit) | 
|  | : m_delegate(delegate) | 
|  | , m_backReferenceLimit(backReferenceLimit) | 
|  | , m_err(NoError) | 
|  | , m_data(pattern.getCharacters<CharType>()) | 
|  | , m_size(pattern.length()) | 
|  | , m_index(0) | 
|  | , m_parenthesesNestingDepth(0) | 
|  | { | 
|  | } | 
|  |  | 
|  | /* | 
|  | * parseEscape(): | 
|  | * | 
|  | * Helper for parseTokens() AND parseCharacterClass(). | 
|  | * Unlike the other parser methods, this function does not report tokens | 
|  | * directly to the member delegate (m_delegate), instead tokens are | 
|  | * emitted to the delegate provided as an argument.  In the case of atom | 
|  | * escapes, parseTokens() will call parseEscape() passing m_delegate as | 
|  | * an argument, and as such the escape will be reported to the delegate. | 
|  | * | 
|  | * However this method may also be used by parseCharacterClass(), in which | 
|  | * case a CharacterClassParserDelegate will be passed as the delegate that | 
|  | * tokens should be added to.  A boolean flag is also provided to indicate | 
|  | * whether that an escape in a CharacterClass is being parsed (some parsing | 
|  | * rules change in this context). | 
|  | * | 
|  | * The boolean value returned by this method indicates whether the token | 
|  | * parsed was an atom (outside of a characted class \b and \B will be | 
|  | * interpreted as assertions). | 
|  | */ | 
|  | template<bool inCharacterClass, class EscapeDelegate> | 
|  | bool parseEscape(EscapeDelegate& delegate) | 
|  | { | 
|  | ASSERT(!m_err); | 
|  | ASSERT(peek() == '\\'); | 
|  | consume(); | 
|  |  | 
|  | if (atEndOfPattern()) { | 
|  | m_err = EscapeUnterminated; | 
|  | return false; | 
|  | } | 
|  |  | 
|  | switch (peek()) { | 
|  | // Assertions | 
|  | case 'b': | 
|  | consume(); | 
|  | if (inCharacterClass) | 
|  | delegate.atomPatternCharacter('\b'); | 
|  | else { | 
|  | delegate.assertionWordBoundary(false); | 
|  | return false; | 
|  | } | 
|  | break; | 
|  | case 'B': | 
|  | consume(); | 
|  | if (inCharacterClass) | 
|  | delegate.atomPatternCharacter('B'); | 
|  | else { | 
|  | delegate.assertionWordBoundary(true); | 
|  | return false; | 
|  | } | 
|  | break; | 
|  |  | 
|  | // CharacterClassEscape | 
|  | case 'd': | 
|  | consume(); | 
|  | delegate.atomBuiltInCharacterClass(DigitClassID, false); | 
|  | break; | 
|  | case 's': | 
|  | consume(); | 
|  | delegate.atomBuiltInCharacterClass(SpaceClassID, false); | 
|  | break; | 
|  | case 'w': | 
|  | consume(); | 
|  | delegate.atomBuiltInCharacterClass(WordClassID, false); | 
|  | break; | 
|  | case 'D': | 
|  | consume(); | 
|  | delegate.atomBuiltInCharacterClass(DigitClassID, true); | 
|  | break; | 
|  | case 'S': | 
|  | consume(); | 
|  | delegate.atomBuiltInCharacterClass(SpaceClassID, true); | 
|  | break; | 
|  | case 'W': | 
|  | consume(); | 
|  | delegate.atomBuiltInCharacterClass(WordClassID, true); | 
|  | break; | 
|  |  | 
|  | // DecimalEscape | 
|  | case '1': | 
|  | case '2': | 
|  | case '3': | 
|  | case '4': | 
|  | case '5': | 
|  | case '6': | 
|  | case '7': | 
|  | case '8': | 
|  | case '9': { | 
|  | // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. | 
|  | // First, try to parse this as backreference. | 
|  | if (!inCharacterClass) { | 
|  | ParseState state = saveState(); | 
|  |  | 
|  | unsigned backReference = consumeNumber(); | 
|  | if (backReference <= m_backReferenceLimit) { | 
|  | delegate.atomBackReference(backReference); | 
|  | break; | 
|  | } | 
|  |  | 
|  | restoreState(state); | 
|  | } | 
|  |  | 
|  | // Not a backreference, and not octal. | 
|  | if (peek() >= '8') { | 
|  | delegate.atomPatternCharacter('\\'); | 
|  | break; | 
|  | } | 
|  |  | 
|  | // Fall-through to handle this as an octal escape. | 
|  | } | 
|  |  | 
|  | // Octal escape | 
|  | case '0': | 
|  | delegate.atomPatternCharacter(consumeOctal()); | 
|  | break; | 
|  |  | 
|  | // ControlEscape | 
|  | case 'f': | 
|  | consume(); | 
|  | delegate.atomPatternCharacter('\f'); | 
|  | break; | 
|  | case 'n': | 
|  | consume(); | 
|  | delegate.atomPatternCharacter('\n'); | 
|  | break; | 
|  | case 'r': | 
|  | consume(); | 
|  | delegate.atomPatternCharacter('\r'); | 
|  | break; | 
|  | case 't': | 
|  | consume(); | 
|  | delegate.atomPatternCharacter('\t'); | 
|  | break; | 
|  | case 'v': | 
|  | consume(); | 
|  | delegate.atomPatternCharacter('\v'); | 
|  | break; | 
|  |  | 
|  | // ControlLetter | 
|  | case 'c': { | 
|  | ParseState state = saveState(); | 
|  | consume(); | 
|  | if (!atEndOfPattern()) { | 
|  | int control = consume(); | 
|  |  | 
|  | // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. | 
|  | if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { | 
|  | delegate.atomPatternCharacter(control & 0x1f); | 
|  | break; | 
|  | } | 
|  | } | 
|  | restoreState(state); | 
|  | delegate.atomPatternCharacter('\\'); | 
|  | break; | 
|  | } | 
|  |  | 
|  | // HexEscape | 
|  | case 'x': { | 
|  | consume(); | 
|  | int x = tryConsumeHex(2); | 
|  | if (x == -1) | 
|  | delegate.atomPatternCharacter('x'); | 
|  | else | 
|  | delegate.atomPatternCharacter(x); | 
|  | break; | 
|  | } | 
|  |  | 
|  | // UnicodeEscape | 
|  | case 'u': { | 
|  | consume(); | 
|  | int u = tryConsumeHex(4); | 
|  | if (u == -1) | 
|  | delegate.atomPatternCharacter('u'); | 
|  | else | 
|  | delegate.atomPatternCharacter(u); | 
|  | break; | 
|  | } | 
|  |  | 
|  | // IdentityEscape | 
|  | default: | 
|  | delegate.atomPatternCharacter(consume()); | 
|  | } | 
|  |  | 
|  | return true; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * parseAtomEscape(), parseCharacterClassEscape(): | 
|  | * | 
|  | * These methods alias to parseEscape(). | 
|  | */ | 
|  | bool parseAtomEscape() | 
|  | { | 
|  | return parseEscape<false>(m_delegate); | 
|  | } | 
|  | void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) | 
|  | { | 
|  | parseEscape<true>(delegate); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * parseCharacterClass(): | 
|  | * | 
|  | * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) | 
|  | * to an instance of CharacterClassParserDelegate, to describe the character class to the | 
|  | * delegate. | 
|  | */ | 
|  | void parseCharacterClass() | 
|  | { | 
|  | ASSERT(!m_err); | 
|  | ASSERT(peek() == '['); | 
|  | consume(); | 
|  |  | 
|  | CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); | 
|  |  | 
|  | characterClassConstructor.begin(tryConsume('^')); | 
|  |  | 
|  | while (!atEndOfPattern()) { | 
|  | switch (peek()) { | 
|  | case ']': | 
|  | consume(); | 
|  | characterClassConstructor.end(); | 
|  | return; | 
|  |  | 
|  | case '\\': | 
|  | parseCharacterClassEscape(characterClassConstructor); | 
|  | break; | 
|  |  | 
|  | default: | 
|  | characterClassConstructor.atomPatternCharacter(consume(), true); | 
|  | } | 
|  |  | 
|  | if (m_err) | 
|  | return; | 
|  | } | 
|  |  | 
|  | m_err = CharacterClassUnmatched; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * parseParenthesesBegin(): | 
|  | * | 
|  | * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. | 
|  | */ | 
|  | void parseParenthesesBegin() | 
|  | { | 
|  | ASSERT(!m_err); | 
|  | ASSERT(peek() == '('); | 
|  | consume(); | 
|  |  | 
|  | if (tryConsume('?')) { | 
|  | if (atEndOfPattern()) { | 
|  | m_err = ParenthesesTypeInvalid; | 
|  | return; | 
|  | } | 
|  |  | 
|  | switch (consume()) { | 
|  | case ':': | 
|  | m_delegate.atomParenthesesSubpatternBegin(false); | 
|  | break; | 
|  |  | 
|  | case '=': | 
|  | m_delegate.atomParentheticalAssertionBegin(); | 
|  | break; | 
|  |  | 
|  | case '!': | 
|  | m_delegate.atomParentheticalAssertionBegin(true); | 
|  | break; | 
|  |  | 
|  | default: | 
|  | m_err = ParenthesesTypeInvalid; | 
|  | } | 
|  | } else | 
|  | m_delegate.atomParenthesesSubpatternBegin(); | 
|  |  | 
|  | ++m_parenthesesNestingDepth; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * parseParenthesesEnd(): | 
|  | * | 
|  | * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). | 
|  | */ | 
|  | void parseParenthesesEnd() | 
|  | { | 
|  | ASSERT(!m_err); | 
|  | ASSERT(peek() == ')'); | 
|  | consume(); | 
|  |  | 
|  | if (m_parenthesesNestingDepth > 0) | 
|  | m_delegate.atomParenthesesEnd(); | 
|  | else | 
|  | m_err = ParenthesesUnmatched; | 
|  |  | 
|  | --m_parenthesesNestingDepth; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * parseQuantifier(): | 
|  | * | 
|  | * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. | 
|  | */ | 
|  | void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) | 
|  | { | 
|  | ASSERT(!m_err); | 
|  | ASSERT(min <= max); | 
|  |  | 
|  | if (min == UINT_MAX) { | 
|  | m_err = QuantifierTooLarge; | 
|  | return; | 
|  | } | 
|  |  | 
|  | if (lastTokenWasAnAtom) | 
|  | m_delegate.quantifyAtom(min, max, !tryConsume('?')); | 
|  | else | 
|  | m_err = QuantifierWithoutAtom; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * parseTokens(): | 
|  | * | 
|  | * This method loops over the input pattern reporting tokens to the delegate. | 
|  | * The method returns when a parse error is detected, or the end of the pattern | 
|  | * is reached.  One piece of state is tracked around the loop, which is whether | 
|  | * the last token passed to the delegate was an atom (this is necessary to detect | 
|  | * a parse error when a quantifier provided without an atom to quantify). | 
|  | */ | 
|  | void parseTokens() | 
|  | { | 
|  | bool lastTokenWasAnAtom = false; | 
|  |  | 
|  | while (!atEndOfPattern()) { | 
|  | switch (peek()) { | 
|  | case '|': | 
|  | consume(); | 
|  | m_delegate.disjunction(); | 
|  | lastTokenWasAnAtom = false; | 
|  | break; | 
|  |  | 
|  | case '(': | 
|  | parseParenthesesBegin(); | 
|  | lastTokenWasAnAtom = false; | 
|  | break; | 
|  |  | 
|  | case ')': | 
|  | parseParenthesesEnd(); | 
|  | lastTokenWasAnAtom = true; | 
|  | break; | 
|  |  | 
|  | case '^': | 
|  | consume(); | 
|  | m_delegate.assertionBOL(); | 
|  | lastTokenWasAnAtom = false; | 
|  | break; | 
|  |  | 
|  | case '$': | 
|  | consume(); | 
|  | m_delegate.assertionEOL(); | 
|  | lastTokenWasAnAtom = false; | 
|  | break; | 
|  |  | 
|  | case '.': | 
|  | consume(); | 
|  | m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); | 
|  | lastTokenWasAnAtom = true; | 
|  | break; | 
|  |  | 
|  | case '[': | 
|  | parseCharacterClass(); | 
|  | lastTokenWasAnAtom = true; | 
|  | break; | 
|  |  | 
|  | case '\\': | 
|  | lastTokenWasAnAtom = parseAtomEscape(); | 
|  | break; | 
|  |  | 
|  | case '*': | 
|  | consume(); | 
|  | parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite); | 
|  | lastTokenWasAnAtom = false; | 
|  | break; | 
|  |  | 
|  | case '+': | 
|  | consume(); | 
|  | parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite); | 
|  | lastTokenWasAnAtom = false; | 
|  | break; | 
|  |  | 
|  | case '?': | 
|  | consume(); | 
|  | parseQuantifier(lastTokenWasAnAtom, 0, 1); | 
|  | lastTokenWasAnAtom = false; | 
|  | break; | 
|  |  | 
|  | case '{': { | 
|  | ParseState state = saveState(); | 
|  |  | 
|  | consume(); | 
|  | if (peekIsDigit()) { | 
|  | unsigned min = consumeNumber(); | 
|  | unsigned max = min; | 
|  |  | 
|  | if (tryConsume(',')) | 
|  | max = peekIsDigit() ? consumeNumber() : quantifyInfinite; | 
|  |  | 
|  | if (tryConsume('}')) { | 
|  | if (min <= max) | 
|  | parseQuantifier(lastTokenWasAnAtom, min, max); | 
|  | else | 
|  | m_err = QuantifierOutOfOrder; | 
|  | lastTokenWasAnAtom = false; | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | restoreState(state); | 
|  | } // if we did not find a complete quantifer, fall through to the default case. | 
|  |  | 
|  | default: | 
|  | m_delegate.atomPatternCharacter(consume()); | 
|  | lastTokenWasAnAtom = true; | 
|  | } | 
|  |  | 
|  | if (m_err) | 
|  | return; | 
|  | } | 
|  |  | 
|  | if (m_parenthesesNestingDepth > 0) | 
|  | m_err = MissingParentheses; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * parse(): | 
|  | * | 
|  | * This method calls parseTokens() to parse over the input and converts any | 
|  | * error code to a const char* for a result. | 
|  | */ | 
|  | const char* parse() | 
|  | { | 
|  | if (m_size > MAX_PATTERN_SIZE) | 
|  | m_err = PatternTooLarge; | 
|  | else | 
|  | parseTokens(); | 
|  | ASSERT(atEndOfPattern() || m_err); | 
|  |  | 
|  | // The order of this array must match the ErrorCode enum. | 
|  | static const char* errorMessages[NumberOfErrorCodes] = { | 
|  | 0, // NoError | 
|  | REGEXP_ERROR_PREFIX "regular expression too large", | 
|  | REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", | 
|  | REGEXP_ERROR_PREFIX "nothing to repeat", | 
|  | REGEXP_ERROR_PREFIX "number too large in {} quantifier", | 
|  | REGEXP_ERROR_PREFIX "missing )", | 
|  | REGEXP_ERROR_PREFIX "unmatched parentheses", | 
|  | REGEXP_ERROR_PREFIX "unrecognized character after (?", | 
|  | REGEXP_ERROR_PREFIX "missing terminating ] for character class", | 
|  | REGEXP_ERROR_PREFIX "range out of order in character class", | 
|  | REGEXP_ERROR_PREFIX "\\ at end of pattern" | 
|  | }; | 
|  |  | 
|  | return errorMessages[m_err]; | 
|  | } | 
|  |  | 
|  | // Misc helper functions: | 
|  |  | 
|  | typedef unsigned ParseState; | 
|  |  | 
|  | ParseState saveState() | 
|  | { | 
|  | return m_index; | 
|  | } | 
|  |  | 
|  | void restoreState(ParseState state) | 
|  | { | 
|  | m_index = state; | 
|  | } | 
|  |  | 
|  | bool atEndOfPattern() | 
|  | { | 
|  | ASSERT(m_index <= m_size); | 
|  | return m_index == m_size; | 
|  | } | 
|  |  | 
|  | int peek() | 
|  | { | 
|  | ASSERT(m_index < m_size); | 
|  | return m_data[m_index]; | 
|  | } | 
|  |  | 
|  | bool peekIsDigit() | 
|  | { | 
|  | return !atEndOfPattern() && WTF::isASCIIDigit(peek()); | 
|  | } | 
|  |  | 
|  | unsigned peekDigit() | 
|  | { | 
|  | ASSERT(peekIsDigit()); | 
|  | return peek() - '0'; | 
|  | } | 
|  |  | 
|  | int consume() | 
|  | { | 
|  | ASSERT(m_index < m_size); | 
|  | return m_data[m_index++]; | 
|  | } | 
|  |  | 
|  | unsigned consumeDigit() | 
|  | { | 
|  | ASSERT(peekIsDigit()); | 
|  | return consume() - '0'; | 
|  | } | 
|  |  | 
|  | unsigned consumeNumber() | 
|  | { | 
|  | unsigned n = consumeDigit(); | 
|  | // check for overflow. | 
|  | for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { | 
|  | n = newValue; | 
|  | consume(); | 
|  | } | 
|  | return n; | 
|  | } | 
|  |  | 
|  | unsigned consumeOctal() | 
|  | { | 
|  | ASSERT(WTF::isASCIIOctalDigit(peek())); | 
|  |  | 
|  | unsigned n = consumeDigit(); | 
|  | while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) | 
|  | n = n * 8 + consumeDigit(); | 
|  | return n; | 
|  | } | 
|  |  | 
|  | bool tryConsume(UChar ch) | 
|  | { | 
|  | if (atEndOfPattern() || (m_data[m_index] != ch)) | 
|  | return false; | 
|  | ++m_index; | 
|  | return true; | 
|  | } | 
|  |  | 
|  | int tryConsumeHex(int count) | 
|  | { | 
|  | ParseState state = saveState(); | 
|  |  | 
|  | int n = 0; | 
|  | while (count--) { | 
|  | if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { | 
|  | restoreState(state); | 
|  | return -1; | 
|  | } | 
|  | n = (n << 4) | WTF::toASCIIHexValue(consume()); | 
|  | } | 
|  | return n; | 
|  | } | 
|  |  | 
|  | Delegate& m_delegate; | 
|  | unsigned m_backReferenceLimit; | 
|  | ErrorCode m_err; | 
|  | const CharType* m_data; | 
|  | unsigned m_size; | 
|  | unsigned m_index; | 
|  | unsigned m_parenthesesNestingDepth; | 
|  |  | 
|  | // Derived by empirical testing of compile time in PCRE and WREC. | 
|  | static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; | 
|  | }; | 
|  |  | 
|  | /* | 
|  | * Yarr::parse(): | 
|  | * | 
|  | * The parse method is passed a pattern to be parsed and a delegate upon which | 
|  | * callbacks will be made to record the parsed tokens forming the regex. | 
|  | * Yarr::parse() returns null on success, or a const C string providing an error | 
|  | * message where a parse error occurs. | 
|  | * | 
|  | * The Delegate must implement the following interface: | 
|  | * | 
|  | *    void assertionBOL(); | 
|  | *    void assertionEOL(); | 
|  | *    void assertionWordBoundary(bool invert); | 
|  | * | 
|  | *    void atomPatternCharacter(UChar ch); | 
|  | *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); | 
|  | *    void atomCharacterClassBegin(bool invert) | 
|  | *    void atomCharacterClassAtom(UChar ch) | 
|  | *    void atomCharacterClassRange(UChar begin, UChar end) | 
|  | *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) | 
|  | *    void atomCharacterClassEnd() | 
|  | *    void atomParenthesesSubpatternBegin(bool capture = true); | 
|  | *    void atomParentheticalAssertionBegin(bool invert = false); | 
|  | *    void atomParenthesesEnd(); | 
|  | *    void atomBackReference(unsigned subpatternId); | 
|  | * | 
|  | *    void quantifyAtom(unsigned min, unsigned max, bool greedy); | 
|  | * | 
|  | *    void disjunction(); | 
|  | * | 
|  | * The regular expression is described by a sequence of assertion*() and atom*() | 
|  | * callbacks to the delegate, describing the terms in the regular expression. | 
|  | * Following an atom a quantifyAtom() call may occur to indicate that the previous | 
|  | * atom should be quantified.  In the case of atoms described across multiple | 
|  | * calls (parentheses and character classes) the call to quantifyAtom() will come | 
|  | * after the call to the atom*End() method, never after atom*Begin(). | 
|  | * | 
|  | * Character classes may either be described by a single call to | 
|  | * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. | 
|  | * In the latter case, ...Begin() will be called, followed by a sequence of | 
|  | * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). | 
|  | * | 
|  | * Sequences of atoms and assertions are broken into alternatives via calls to | 
|  | * disjunction().  Assertions, atoms, and disjunctions emitted between calls to | 
|  | * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. | 
|  | * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular | 
|  | * capturing subpattern, this will be the subpatternId associated with these | 
|  | * parentheses, and will also by definition be the lowest subpatternId of these | 
|  | * parentheses and of any nested paretheses.  The atomParenthesesEnd() method | 
|  | * is passed the subpatternId of the last capturing subexpression nested within | 
|  | * these paretheses.  In the case of a capturing subpattern with no nested | 
|  | * capturing subpatterns, the same subpatternId will be passed to the begin and | 
|  | * end functions.  In the case of non-capturing subpatterns the subpatternId | 
|  | * passed to the begin method is also the first possible subpatternId that might | 
|  | * be nested within these paretheses.  If a set of non-capturing parentheses does | 
|  | * not contain any capturing subpatterns, then the subpatternId passed to begin | 
|  | * will be greater than the subpatternId passed to end. | 
|  | */ | 
|  |  | 
|  | template<class Delegate> | 
|  | const char* parse(Delegate& delegate, const String& pattern, unsigned backReferenceLimit = quantifyInfinite) | 
|  | { | 
|  | if (pattern.is8Bit()) | 
|  | return Parser<Delegate, LChar>(delegate, pattern, backReferenceLimit).parse(); | 
|  | return Parser<Delegate, UChar>(delegate, pattern, backReferenceLimit).parse(); | 
|  | } | 
|  |  | 
|  | } } // namespace JSC::Yarr | 
|  |  | 
|  | #endif // YarrParser_h |