| /* |
| * Copyright (C) 2009, 2013-2017 Apple Inc. All rights reserved. |
| * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "config.h" |
| #include "YarrInterpreter.h" |
| |
| #include "Options.h" |
| #include "SuperSampler.h" |
| #include "Yarr.h" |
| #include "YarrCanonicalize.h" |
| #include <wtf/BitVector.h> |
| #include <wtf/BumpPointerAllocator.h> |
| #include <wtf/CheckedArithmetic.h> |
| #include <wtf/DataLog.h> |
| #include <wtf/StackCheck.h> |
| #include <wtf/text/WTFString.h> |
| |
| namespace JSC { namespace Yarr { |
| |
| class ByteTermDumper { |
| public: |
| ByteTermDumper(YarrPattern* pattern = nullptr) |
| : m_pattern(pattern) |
| { |
| if (pattern) |
| m_compileMode = pattern->compileMode(); |
| } |
| |
| ByteTermDumper(CompileMode compileMode) |
| : m_compileMode(compileMode) |
| { |
| } |
| |
| void dumpTerm(size_t idx, ByteTerm); |
| void dumpDisjunction(ByteDisjunction*, unsigned nesting = 0); |
| bool unicode() const { return m_compileMode == CompileMode::Unicode; } |
| bool unicodeSets() const { return m_compileMode == CompileMode::UnicodeSets; } |
| bool eitherUnicode() const { return unicode() || unicodeSets(); } |
| |
| private: |
| YarrPattern* m_pattern { nullptr }; |
| unsigned m_nesting { 0 }; |
| unsigned m_lineIndent { 0 }; |
| CompileMode m_compileMode { CompileMode::Legacy }; |
| bool m_recursiveDump { false }; |
| }; |
| |
| template<typename CharType> |
| class Interpreter { |
| public: |
| static constexpr bool verbose = false; |
| |
| struct ParenthesesDisjunctionContext; |
| |
| struct BackTrackInfoParentheses { |
| uintptr_t begin; |
| uintptr_t matchAmount; |
| ParenthesesDisjunctionContext* lastContext; |
| }; |
| |
| static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context) |
| { |
| context->next = backTrack->lastContext; |
| backTrack->lastContext = context; |
| ++backTrack->matchAmount; |
| } |
| |
| static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack) |
| { |
| RELEASE_ASSERT(backTrack->matchAmount); |
| RELEASE_ASSERT(backTrack->lastContext); |
| backTrack->lastContext = backTrack->lastContext->next; |
| --backTrack->matchAmount; |
| } |
| |
| struct DisjunctionContext |
| { |
| DisjunctionContext() = default; |
| |
| void* operator new(size_t, void* where) |
| { |
| return where; |
| } |
| |
| static size_t allocationSize(unsigned numberOfFrames) |
| { |
| static_assert(alignof(DisjunctionContext) <= sizeof(void*)); |
| size_t rawSize = sizeof(DisjunctionContext) - sizeof(uintptr_t) + Checked<size_t>(numberOfFrames) * sizeof(uintptr_t); |
| size_t roundedSize = roundUpToMultipleOf<sizeof(void*)>(rawSize); |
| RELEASE_ASSERT(roundedSize >= rawSize); |
| return roundedSize; |
| } |
| |
| int term { 0 }; |
| unsigned matchBegin; |
| unsigned matchEnd; |
| uintptr_t frame[1]; |
| }; |
| |
| DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction) |
| { |
| size_t size = DisjunctionContext::allocationSize(disjunction->m_frameSize); |
| allocatorPool = allocatorPool->ensureCapacity(size); |
| RELEASE_ASSERT(allocatorPool); |
| return new (allocatorPool->alloc(size)) DisjunctionContext(); |
| } |
| |
| void freeDisjunctionContext(DisjunctionContext* context) |
| { |
| allocatorPool = allocatorPool->dealloc(context); |
| } |
| |
| struct ParenthesesDisjunctionContext |
| { |
| ParenthesesDisjunctionContext(BytecodePattern* pattern, unsigned* output, ByteTerm& term, unsigned numDuplicateNamedGroups, BitVector& duplicateNamedGroups) |
| : m_pattern(pattern) |
| , m_numNestedSubpatterns(term.atom.parenthesesDisjunction->m_numSubpatterns) |
| , m_duplicateNamedGroups(duplicateNamedGroups) |
| |
| { |
| m_numBackupIds = m_numNestedSubpatterns * 2 + numDuplicateNamedGroups; |
| unsigned firstSubpatternId = term.subpatternId(); |
| |
| for (unsigned i = 0; i < (m_numNestedSubpatterns << 1); ++i) { |
| subpatternAndGroupIdBackup[i] = output[(firstSubpatternId << 1) + i]; |
| output[(firstSubpatternId << 1) + i] = offsetNoMatch; |
| } |
| |
| unsigned nameGroupIdx = 0; |
| for (unsigned duplicateNamedGroupId : m_duplicateNamedGroups) { |
| subpatternAndGroupIdBackup[backupOffsetForDuplicateNamedGroup(nameGroupIdx)] = output[m_pattern->offsetForDuplicateNamedGroupId(duplicateNamedGroupId)]; |
| output[pattern->offsetForDuplicateNamedGroupId(duplicateNamedGroupId)] = 0; |
| ++nameGroupIdx; |
| } |
| |
| new (getDisjunctionContext()) DisjunctionContext(); |
| } |
| |
| void* operator new(size_t, void* where) |
| { |
| return where; |
| } |
| |
| void restoreOutput(unsigned* output, unsigned firstSubpatternId) |
| { |
| for (unsigned i = 0; i < (m_numNestedSubpatterns << 1); ++i) |
| output[(firstSubpatternId << 1) + i] = subpatternAndGroupIdBackup[i]; |
| |
| unsigned nameGroupIdx = 0; |
| for (unsigned duplicateNamedGroupId : m_duplicateNamedGroups) { |
| output[m_pattern->offsetForDuplicateNamedGroupId(duplicateNamedGroupId)] = subpatternAndGroupIdBackup[backupOffsetForDuplicateNamedGroup(nameGroupIdx)]; |
| ++nameGroupIdx; |
| } |
| } |
| |
| DisjunctionContext* getDisjunctionContext() |
| { |
| return bitwise_cast<DisjunctionContext*>(bitwise_cast<uintptr_t>(this) + allocationSize(m_numBackupIds)); |
| } |
| |
| unsigned backupOffsetForDuplicateNamedGroup(unsigned duplicateNamedGroup) |
| { |
| unsigned offset = (m_numNestedSubpatterns << 1) + duplicateNamedGroup; |
| ASSERT(offset < m_numBackupIds); |
| return offset; |
| } |
| |
| static size_t allocationSize(unsigned numberOfSubpatterns, unsigned numDuplicateNamedGroups) |
| { |
| Checked<size_t> numBackupIds = (Checked<size_t>(numberOfSubpatterns) * 2U) + Checked<size_t>(numDuplicateNamedGroups); |
| return allocationSize(numBackupIds.value()); |
| } |
| |
| static size_t allocationSize(unsigned numBackupIds) |
| { |
| static_assert(alignof(ParenthesesDisjunctionContext) <= sizeof(void*)); |
| size_t rawSize = sizeof(ParenthesesDisjunctionContext) + Checked<size_t>(numBackupIds) * sizeof(unsigned); |
| size_t roundedSize = roundUpToMultipleOf<sizeof(void*)>(rawSize); |
| RELEASE_ASSERT(roundedSize >= rawSize); |
| return roundedSize; |
| } |
| |
| ParenthesesDisjunctionContext* next { nullptr }; |
| BytecodePattern* m_pattern; |
| unsigned m_numNestedSubpatterns; |
| size_t m_numBackupIds; |
| BitVector m_duplicateNamedGroups; |
| unsigned subpatternAndGroupIdBackup[1]; |
| }; |
| |
| ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term) |
| { |
| BitVector duplicateNamedCaptureGroups; |
| unsigned firstSubpatternId = term.subpatternId(); |
| unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns; |
| unsigned numDuplicateNamedGroups = 0; |
| |
| if (pattern->hasDuplicateNamedCaptureGroups()) { |
| for (unsigned i = 0; i < numNestedSubpatterns; ++i) { |
| unsigned subpatternId = firstSubpatternId + i; |
| unsigned duplicateNamedGroup = pattern->m_duplicateNamedGroupForSubpatternId[subpatternId]; |
| if (duplicateNamedGroup) |
| duplicateNamedCaptureGroups.set(duplicateNamedGroup); |
| } |
| |
| numDuplicateNamedGroups = duplicateNamedCaptureGroups.bitCount(); |
| } |
| |
| size_t size = Checked<size_t>(ParenthesesDisjunctionContext::allocationSize(numNestedSubpatterns, numDuplicateNamedGroups)) + DisjunctionContext::allocationSize(disjunction->m_frameSize); |
| allocatorPool = allocatorPool->ensureCapacity(size); |
| RELEASE_ASSERT(allocatorPool); |
| return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(pattern, output, term, numDuplicateNamedGroups, duplicateNamedCaptureGroups); |
| } |
| |
| void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context) |
| { |
| allocatorPool = allocatorPool->dealloc(context); |
| } |
| |
| class InputStream { |
| public: |
| InputStream(const CharType* input, unsigned start, unsigned length, bool decodeSurrogatePairs) |
| : input(input) |
| , pos(start) |
| , length(length) |
| , decodeSurrogatePairs(decodeSurrogatePairs) |
| { |
| } |
| |
| void next() |
| { |
| ++pos; |
| } |
| |
| void rewind(unsigned amount) |
| { |
| ASSERT(pos >= amount); |
| pos -= amount; |
| } |
| |
| int read() |
| { |
| ASSERT(pos < length); |
| if (pos < length) |
| return input[pos]; |
| return -1; |
| } |
| |
| int readChecked(unsigned negativePositionOffest) |
| { |
| RELEASE_ASSERT(pos >= negativePositionOffest); |
| unsigned p = pos - negativePositionOffest; |
| ASSERT(p < length); |
| int result = input[p]; |
| if (U16_IS_LEAD(result) && decodeSurrogatePairs && p + 1 < length && U16_IS_TRAIL(input[p + 1])) { |
| if (atEnd()) |
| return -1; |
| |
| result = U16_GET_SUPPLEMENTARY(result, input[p + 1]); |
| next(); |
| } |
| return result; |
| } |
| |
| int readCheckedDontAdvance(unsigned negativePositionOffest) |
| { |
| RELEASE_ASSERT(pos >= negativePositionOffest); |
| unsigned p = pos - negativePositionOffest; |
| ASSERT(p < length); |
| int result = input[p]; |
| if (U16_IS_LEAD(result) && decodeSurrogatePairs && p + 1 < length && U16_IS_TRAIL(input[p + 1])) { |
| if (atEnd()) |
| return -1; |
| |
| result = U16_GET_SUPPLEMENTARY(result, input[p + 1]); |
| } |
| return result; |
| } |
| |
| // readForCharacterDump() is only for use by the DUMP_CURR_CHAR macro. |
| // We don't want any side effects like the next() in readChecked() above. |
| int readForCharacterDump(unsigned negativePositionOffest) |
| { |
| RELEASE_ASSERT(pos >= negativePositionOffest); |
| unsigned p = pos - negativePositionOffest; |
| ASSERT(p < length); |
| int result = input[p]; |
| if (U16_IS_LEAD(result) && decodeSurrogatePairs && p + 1 < length && U16_IS_TRAIL(input[p + 1])) { |
| if (atEnd()) |
| return -1; |
| |
| result = U16_GET_SUPPLEMENTARY(result, input[p + 1]); |
| } |
| return result; |
| } |
| |
| int tryReadBackward(unsigned negativePositionOffest) |
| { |
| if (pos < negativePositionOffest) |
| return -1; |
| |
| unsigned p = pos - negativePositionOffest; |
| ASSERT(p < length); |
| int result = input[p]; |
| if (U16_IS_TRAIL(result) && decodeSurrogatePairs && p > 0 && U16_IS_LEAD(input[p - 1])) { |
| result = U16_GET_SUPPLEMENTARY(input[p - 1], result); |
| rewind(1); |
| } |
| return result; |
| } |
| |
| int readSurrogatePairChecked(unsigned negativePositionOffset) |
| { |
| RELEASE_ASSERT(pos >= negativePositionOffset); |
| unsigned p = pos - negativePositionOffset; |
| ASSERT(p < length); |
| if (p + 1 >= length) |
| return -1; |
| |
| int first = input[p]; |
| int second = input[p + 1]; |
| if (U16_IS_LEAD(first) && U16_IS_TRAIL(second)) |
| return U16_GET_SUPPLEMENTARY(first, second); |
| |
| return -1; |
| } |
| |
| int reread(unsigned from) |
| { |
| ASSERT(from < length); |
| int result = input[from]; |
| if (U16_IS_LEAD(result) && decodeSurrogatePairs && from + 1 < length && U16_IS_TRAIL(input[from + 1])) |
| result = U16_GET_SUPPLEMENTARY(result, input[from + 1]); |
| return result; |
| } |
| |
| int prev() |
| { |
| ASSERT(!(pos > length)); |
| if (pos && length) |
| return input[pos - 1]; |
| return -1; |
| } |
| |
| unsigned getPos() |
| { |
| return pos; |
| } |
| |
| void setPos(unsigned p) |
| { |
| pos = p; |
| } |
| |
| bool atStart() |
| { |
| return pos == 0; |
| } |
| |
| bool atEnd() |
| { |
| return pos == length; |
| } |
| |
| unsigned end() |
| { |
| return length; |
| } |
| |
| bool checkInput(unsigned count) |
| { |
| if (((pos + count) <= length) && ((pos + count) >= pos)) { |
| pos += count; |
| return true; |
| } |
| return false; |
| } |
| |
| void uncheckInput(unsigned count) |
| { |
| RELEASE_ASSERT(pos >= count); |
| pos -= count; |
| } |
| |
| bool tryUncheckInput(unsigned count) |
| { |
| if (count > pos) |
| return false; |
| pos -= count; |
| return true; |
| } |
| |
| bool atStart(unsigned negativePositionOffset) |
| { |
| return pos == negativePositionOffset; |
| } |
| |
| bool atEnd(unsigned negativePositionOffest) |
| { |
| RELEASE_ASSERT(pos >= negativePositionOffest); |
| return (pos - negativePositionOffest) == length; |
| } |
| |
| bool isAvailableInput(unsigned offset) |
| { |
| return (((pos + offset) <= length) && ((pos + offset) >= pos)); |
| } |
| |
| bool isValidNegativeInputOffset(unsigned offset) |
| { |
| return (pos >= offset) && ((pos - offset) < length); |
| } |
| |
| void dump(PrintStream& out) const |
| { |
| const unsigned maxPrintLen = 50; |
| |
| out.print(sizeof(CharType) == 1 ? "8bit" : "16bit", " len: ", length, " \""); |
| unsigned printLen = std::min(length, maxPrintLen); |
| for (unsigned i = 0; i < printLen; ++i) |
| out.print(CharacterDump(input[i])); |
| if (length > maxPrintLen) |
| out.print("..."); |
| out.print("\""); |
| } |
| |
| private: |
| const CharType* input; |
| unsigned pos; |
| unsigned length; |
| bool decodeSurrogatePairs; |
| }; |
| |
| bool testCharacterClass(CharacterClass* characterClass, int ch) |
| { |
| auto linearSearchMatches = [&ch](const Vector<UChar32>& matches) { |
| for (unsigned i = 0; i < matches.size(); ++i) { |
| if (ch == matches[i]) |
| return true; |
| } |
| |
| return false; |
| }; |
| |
| auto binarySearchMatches = [&ch](const Vector<UChar32>& matches) { |
| size_t low = 0; |
| size_t high = matches.size() - 1; |
| |
| while (low <= high) { |
| size_t mid = low + (high - low) / 2; |
| int diff = ch - matches[mid]; |
| if (!diff) |
| return true; |
| |
| if (diff < 0) { |
| if (mid == low) |
| return false; |
| high = mid - 1; |
| } else |
| low = mid + 1; |
| } |
| return false; |
| }; |
| |
| auto linearSearchRanges = [&ch](const Vector<CharacterRange>& ranges) { |
| for (unsigned i = 0; i < ranges.size(); ++i) { |
| if ((ch >= ranges[i].begin) && (ch <= ranges[i].end)) |
| return true; |
| } |
| |
| return false; |
| }; |
| |
| auto binarySearchRanges = [&ch](const Vector<CharacterRange>& ranges) { |
| size_t low = 0; |
| size_t high = ranges.size() - 1; |
| |
| while (low <= high) { |
| size_t mid = low + (high - low) / 2; |
| int rangeBeginDiff = ch - ranges[mid].begin; |
| if (rangeBeginDiff >= 0 && ch <= ranges[mid].end) |
| return true; |
| |
| if (rangeBeginDiff < 0) { |
| if (mid == low) |
| return false; |
| high = mid - 1; |
| } else |
| low = mid + 1; |
| } |
| return false; |
| }; |
| |
| if (characterClass->m_anyCharacter) |
| return true; |
| |
| const size_t thresholdForBinarySearch = 6; |
| |
| if (!isASCII(ch)) { |
| if (characterClass->m_matchesUnicode.size()) { |
| if (characterClass->m_matchesUnicode.size() > thresholdForBinarySearch) { |
| if (binarySearchMatches(characterClass->m_matchesUnicode)) |
| return true; |
| } else if (linearSearchMatches(characterClass->m_matchesUnicode)) |
| return true; |
| } |
| |
| if (characterClass->m_rangesUnicode.size()) { |
| if (characterClass->m_rangesUnicode.size() > thresholdForBinarySearch) { |
| if (binarySearchRanges(characterClass->m_rangesUnicode)) |
| return true; |
| } else if (linearSearchRanges(characterClass->m_rangesUnicode)) |
| return true; |
| } |
| } else { |
| if (characterClass->m_matches.size()) { |
| if (characterClass->m_matches.size() > thresholdForBinarySearch) { |
| if (binarySearchMatches(characterClass->m_matches)) |
| return true; |
| } else if (linearSearchMatches(characterClass->m_matches)) |
| return true; |
| } |
| |
| if (characterClass->m_ranges.size()) { |
| if (characterClass->m_ranges.size() > thresholdForBinarySearch) { |
| if (binarySearchRanges(characterClass->m_ranges)) |
| return true; |
| } else if (linearSearchRanges(characterClass->m_ranges)) |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| |
| bool checkCharacter(ByteTerm& term, unsigned negativeInputOffset) |
| { |
| ASSERT(term.isCharacterType()); |
| if (term.matchDirection() == Forward) |
| return term.atom.patternCharacter == input.readChecked(negativeInputOffset); |
| |
| return term.atom.patternCharacter == input.tryReadBackward(negativeInputOffset); |
| } |
| |
| bool checkSurrogatePair(ByteTerm& term, unsigned negativeInputOffset) |
| { |
| ASSERT(term.isCharacterType()); |
| return term.atom.patternCharacter == input.readSurrogatePairChecked(negativeInputOffset); |
| } |
| |
| bool checkCasedCharacter(ByteTerm& term, unsigned negativeInputOffset) |
| { |
| ASSERT(term.isCasedCharacterType()); |
| int ch = term.matchDirection() == Forward ? input.readChecked(negativeInputOffset) : input.tryReadBackward(negativeInputOffset); |
| return (term.atom.casedCharacter.lo == ch) || (term.atom.casedCharacter.hi == ch); |
| } |
| |
| bool checkCharacterClass(ByteTerm& term, unsigned negativeInputOffset) |
| { |
| ASSERT(term.isCharacterClass()); |
| |
| int inputChar = term.matchDirection() == Forward ? input.readChecked(negativeInputOffset) : input.tryReadBackward(negativeInputOffset); |
| if (inputChar < 0) |
| return false; |
| |
| bool match = testCharacterClass(term.atom.characterClass, inputChar); |
| return term.invert() ? !match : match; |
| } |
| |
| bool checkCharacterClassDontAdvanceInputForNonBMP(ByteTerm& term, unsigned negativeInputOffset) |
| { |
| ASSERT(term.isCharacterClass()); |
| CharacterClass* characterClass = term.atom.characterClass; |
| |
| if (term.matchDirection() == Backward && negativeInputOffset > input.getPos()) |
| return false; |
| |
| int readCharacter = characterClass->hasOnlyNonBMPCharacters() ? input.readSurrogatePairChecked(negativeInputOffset) : input.readChecked(negativeInputOffset); |
| |
| if (readCharacter < 0) |
| return false; |
| |
| return testCharacterClass(characterClass, readCharacter); |
| } |
| |
| bool tryConsumeBackReference(int matchBegin, int matchEnd, ByteTerm& term) |
| { |
| unsigned matchSize = (unsigned)(matchEnd - matchBegin); |
| |
| if (term.matchDirection() == Forward) { |
| if (!input.checkInput(matchSize)) |
| return false; |
| } |
| |
| for (unsigned i = 0; i < matchSize; ++i) { |
| unsigned negativeInputOffset = term.inputPosition + matchSize - i; |
| if (term.matchDirection() == Backward && negativeInputOffset > input.getPos()) |
| return false; |
| |
| int oldCh = input.reread(matchBegin + i); |
| int ch; |
| if (!U_IS_BMP(oldCh)) { |
| ch = input.readSurrogatePairChecked(negativeInputOffset); |
| ++i; |
| } else |
| ch = term.matchDirection() == Forward ? input.readChecked(negativeInputOffset) : input.tryReadBackward(negativeInputOffset); |
| |
| if (oldCh == ch) |
| continue; |
| |
| if (pattern->ignoreCase()) { |
| // See ES 6.0, 21.2.2.8.2 for the definition of Canonicalize(). For non-Unicode |
| // patterns, Unicode values are never allowed to match against ASCII ones. |
| // For Unicode, we need to check all canonical equivalents of a character. |
| if (isLegacyCompilation() && (isASCII(oldCh) || isASCII(ch))) { |
| if (toASCIIUpper(oldCh) == toASCIIUpper(ch)) |
| continue; |
| } else if (areCanonicallyEquivalent(oldCh, ch, isEitherUnicodeCompilation() ? CanonicalMode::Unicode : CanonicalMode::UCS2)) |
| continue; |
| } |
| |
| if (term.matchDirection() == Forward) |
| input.uncheckInput(matchSize); |
| |
| return false; |
| } |
| |
| if (term.matchDirection() == Backward) |
| input.uncheckInput(matchSize); |
| |
| return true; |
| } |
| |
| bool matchAssertionBOL(ByteTerm& term) |
| { |
| return (input.atStart(term.inputPosition)) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.readCheckedDontAdvance(term.inputPosition + 1))); |
| } |
| |
| bool matchAssertionEOL(ByteTerm& term) |
| { |
| if (term.inputPosition) |
| return (input.atEnd(term.inputPosition)) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.readCheckedDontAdvance(term.inputPosition))); |
| |
| return (input.atEnd()) || (pattern->multiline() && testCharacterClass(pattern->newlineCharacterClass, input.read())); |
| } |
| |
| bool matchAssertionWordBoundary(ByteTerm& term) |
| { |
| unsigned inputOffset = term.inputPosition; |
| |
| bool prevIsWordchar = !input.atStart(inputOffset) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(inputOffset + 1)); |
| bool readIsWordchar; |
| if (inputOffset) |
| readIsWordchar = !input.atEnd(inputOffset) && testCharacterClass(pattern->wordcharCharacterClass, input.readChecked(inputOffset)); |
| else |
| readIsWordchar = !input.atEnd() && testCharacterClass(pattern->wordcharCharacterClass, input.read()); |
| |
| bool wordBoundary = prevIsWordchar != readIsWordchar; |
| return term.invert() ? !wordBoundary : wordBoundary; |
| } |
| |
| bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context) |
| { |
| BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::FixedCount: |
| break; |
| |
| case QuantifierType::Greedy: |
| if (backTrack->matchAmount) { |
| --backTrack->matchAmount; |
| if (term.matchDirection() == Forward) |
| input.uncheckInput(U16_LENGTH(term.atom.patternCharacter)); |
| else { |
| if (!input.checkInput(U16_LENGTH(term.atom.patternCharacter))) |
| break; |
| } |
| return true; |
| } |
| break; |
| |
| case QuantifierType::NonGreedy: |
| if (term.matchDirection() == Forward) { |
| if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) { |
| ++backTrack->matchAmount; |
| if (checkCharacter(term, term.inputPosition + 1)) |
| return true; |
| } |
| input.setPos(backTrack->begin); |
| break; |
| } |
| // matchDirection Backward |
| unsigned position = input.getPos(); |
| |
| if (position < term.inputPosition) |
| break; |
| |
| if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.tryUncheckInput(1)) { |
| ++backTrack->matchAmount; |
| if (checkCharacter(term, term.inputPosition)) |
| return true; |
| } |
| input.setPos(backTrack->begin); |
| break; |
| |
| } |
| |
| return false; |
| } |
| |
| bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context) |
| { |
| BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::FixedCount: |
| break; |
| |
| case QuantifierType::Greedy: |
| if (backTrack->matchAmount) { |
| --backTrack->matchAmount; |
| if (term.matchDirection() == Forward) |
| input.uncheckInput(1); |
| else { |
| if (!input.checkInput(1)) |
| break; |
| } |
| return true; |
| } |
| break; |
| |
| case QuantifierType::NonGreedy: |
| if (term.matchDirection() == Forward) { |
| if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) { |
| ++backTrack->matchAmount; |
| if (checkCasedCharacter(term, term.inputPosition + 1)) |
| return true; |
| } |
| input.uncheckInput(backTrack->matchAmount); |
| break; |
| } |
| // matchDirection Backward |
| if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.tryUncheckInput(1)) { |
| ++backTrack->matchAmount; |
| if (checkCasedCharacter(term, term.inputPosition)) |
| return true; |
| } |
| input.uncheckInput(backTrack->matchAmount); |
| break; |
| } |
| |
| return false; |
| } |
| |
| bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::CharacterClass); |
| BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation); |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::FixedCount: { |
| if (term.matchDirection() == Forward) { |
| if (isEitherUnicodeCompilation()) { |
| backTrack->begin = input.getPos(); |
| unsigned matchAmount = 0; |
| for (matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) { |
| if (term.invert()) { |
| if (!checkCharacterClass(term, term.inputPosition - matchAmount)) { |
| input.setPos(backTrack->begin); |
| return false; |
| } |
| } else { |
| unsigned matchOffset = matchAmount * (term.atom.characterClass->hasOnlyNonBMPCharacters() ? 2 : 1); |
| if (!checkCharacterClassDontAdvanceInputForNonBMP(term, term.inputPosition - matchOffset)) { |
| input.setPos(backTrack->begin); |
| return false; |
| } |
| } |
| } |
| |
| return true; |
| } |
| |
| for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) { |
| if (!checkCharacterClass(term, term.inputPosition - matchAmount)) |
| return false; |
| } |
| return true; |
| } |
| |
| // matchDirection is Backward |
| if (isEitherUnicodeCompilation()) { |
| backTrack->begin = input.getPos(); |
| for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) { |
| unsigned matchOffset = term.atom.quantityMaxCount - 1 - matchAmount; |
| if (term.invert()) { |
| if (!checkCharacterClass(term, term.inputPosition - matchOffset)) { |
| input.setPos(backTrack->begin); |
| return false; |
| } |
| } else { |
| matchOffset = matchOffset * (term.atom.characterClass->hasOnlyNonBMPCharacters() ? 2 : 1); |
| if (!checkCharacterClassDontAdvanceInputForNonBMP(term, term.inputPosition - matchOffset)) { |
| input.setPos(backTrack->begin); |
| return false; |
| } |
| } |
| } |
| |
| return true; |
| } |
| |
| if (input.getPos() < term.inputPosition) |
| return false; |
| |
| for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) { |
| if (!checkCharacterClass(term, term.inputPosition - term.atom.quantityMaxCount + matchAmount + 1)) |
| return false; |
| } |
| return true; |
| } |
| |
| case QuantifierType::Greedy: { |
| unsigned position = input.getPos(); |
| unsigned matchAmount = 0; |
| if (term.matchDirection() == Forward) { |
| while ((matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) { |
| if (!checkCharacterClass(term, term.inputPosition + 1)) { |
| input.setPos(position); |
| break; |
| } |
| ++matchAmount; |
| position = input.getPos(); |
| } |
| backTrack->matchAmount = matchAmount; |
| return true; |
| } |
| |
| // matchDirection = Backward |
| if (input.getPos() < term.inputPosition) |
| return false; |
| |
| while ((matchAmount < term.atom.quantityMaxCount) && input.tryUncheckInput(1)) { |
| if (!checkCharacterClass(term, term.inputPosition)) { |
| input.setPos(position); |
| break; |
| } |
| ++matchAmount; |
| position = input.getPos(); |
| } |
| backTrack->matchAmount = matchAmount; |
| return true; |
| } |
| |
| case QuantifierType::NonGreedy: |
| backTrack->begin = input.getPos(); |
| backTrack->matchAmount = 0; |
| return true; |
| } |
| |
| RELEASE_ASSERT_NOT_REACHED(); |
| return false; |
| } |
| |
| bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::CharacterClass); |
| BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation); |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::FixedCount: |
| if (isEitherUnicodeCompilation()) |
| input.setPos(backTrack->begin); |
| break; |
| |
| case QuantifierType::Greedy: |
| if (backTrack->matchAmount) { |
| if (isEitherUnicodeCompilation()) { |
| // Unmatch one codepoint |
| if (term.matchDirection() == Forward) { |
| --backTrack->matchAmount; |
| input.uncheckInput(1); |
| input.tryReadBackward(term.inputPosition); |
| return true; |
| } |
| // matchDirection Backwards |
| --backTrack->matchAmount; |
| input.readChecked(term.inputPosition); |
| input.checkInput(1); |
| return true; |
| } |
| --backTrack->matchAmount; |
| if (term.matchDirection() == Forward) |
| input.uncheckInput(1); |
| else |
| input.checkInput(1); |
| return true; |
| } |
| break; |
| |
| case QuantifierType::NonGreedy: |
| if (term.matchDirection() == Forward) { |
| if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) { |
| ++backTrack->matchAmount; |
| if (checkCharacterClass(term, term.inputPosition + 1)) |
| return true; |
| } |
| input.setPos(backTrack->begin); |
| break; |
| } |
| // matchDirection Backward |
| if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.tryUncheckInput(1)) { |
| ++backTrack->matchAmount; |
| if (checkCharacterClass(term, term.inputPosition)) |
| return true; |
| } |
| input.setPos(backTrack->begin); |
| break; |
| } |
| |
| return false; |
| } |
| |
| bool matchBackReference(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::BackReference); |
| BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); |
| |
| unsigned subpatternId; |
| |
| if (auto duplicateNamedGroupId = term.duplicateNamedGroupId()) { |
| subpatternId = output[pattern->offsetForDuplicateNamedGroupId(duplicateNamedGroupId)]; |
| if (subpatternId < 1) { |
| // If we don't have a subpattern that matched, then the string to match is empty. |
| return true; |
| } |
| } else |
| subpatternId = term.subpatternId(); |
| |
| unsigned matchBegin = output[(subpatternId << 1)]; |
| unsigned matchEnd = output[(subpatternId << 1) + 1]; |
| |
| // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that. |
| // In this case the result of match is empty string like when it references to a parentheses with zero-width match. |
| // Eg.: /(a\1)/ |
| if (matchEnd == offsetNoMatch) |
| return true; |
| |
| if (matchBegin == offsetNoMatch) |
| return true; |
| |
| ASSERT(matchBegin <= matchEnd); |
| |
| if (matchBegin == matchEnd) |
| return true; |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::FixedCount: { |
| backTrack->begin = input.getPos(); |
| for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) { |
| if (!tryConsumeBackReference(matchBegin, matchEnd, term)) { |
| input.setPos(backTrack->begin); |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| case QuantifierType::Greedy: { |
| unsigned matchAmount = 0; |
| while ((matchAmount < term.atom.quantityMaxCount) && tryConsumeBackReference(matchBegin, matchEnd, term)) |
| ++matchAmount; |
| backTrack->matchAmount = matchAmount; |
| backTrack->backReferenceSize = matchEnd - matchBegin; |
| return true; |
| } |
| |
| case QuantifierType::NonGreedy: |
| backTrack->begin = input.getPos(); |
| backTrack->matchAmount = 0; |
| return true; |
| } |
| |
| RELEASE_ASSERT_NOT_REACHED(); |
| return false; |
| } |
| |
| bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::BackReference); |
| BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); |
| |
| unsigned matchBegin = output[(term.subpatternId() << 1)]; |
| unsigned matchEnd = output[(term.subpatternId() << 1) + 1]; |
| |
| if (matchBegin == offsetNoMatch) |
| return false; |
| |
| ASSERT(matchBegin <= matchEnd); |
| |
| if (matchBegin == matchEnd) |
| return false; |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::FixedCount: |
| // for quantityMaxCount == 1, could rewind. |
| input.setPos(backTrack->begin); |
| break; |
| |
| case QuantifierType::Greedy: |
| if (backTrack->matchAmount) { |
| --backTrack->matchAmount; |
| if (term.matchDirection() == Backward) |
| return input.checkInput(matchEnd - matchBegin); |
| input.rewind(matchEnd - matchBegin); |
| return true; |
| } |
| break; |
| |
| case QuantifierType::NonGreedy: |
| if ((backTrack->matchAmount < term.atom.quantityMaxCount) && tryConsumeBackReference(matchBegin, matchEnd, term)) { |
| ++backTrack->matchAmount; |
| return true; |
| } |
| input.setPos(backTrack->begin); |
| break; |
| } |
| |
| return false; |
| } |
| |
| void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context) |
| { |
| if (term.capture()) { |
| unsigned subpatternId = term.subpatternId(); |
| // For Backward matches, the captured indexes are recorded end then start. |
| output[(subpatternId << 1) + term.matchDirection()] = context->getDisjunctionContext()->matchBegin - term.inputPosition; |
| output[(subpatternId << 1) + 1 - term.matchDirection()] = context->getDisjunctionContext()->matchEnd - term.inputPosition; |
| |
| if (term.duplicateNamedGroupId()) { |
| // Record which of the duplicate named subpatterns matched. |
| output[pattern->offsetForDuplicateNamedGroupId(term.duplicateNamedGroupId())] = subpatternId; |
| } |
| } |
| } |
| void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context) |
| { |
| unsigned firstSubpatternId = term.subpatternId(); |
| context->restoreOutput(output, firstSubpatternId); |
| } |
| |
| JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) |
| { |
| while (backTrack->matchAmount) { |
| ParenthesesDisjunctionContext* context = backTrack->lastContext; |
| |
| JSRegExpResult result = matchDisjunction(term.atom.parenthesesDisjunction, context->getDisjunctionContext(), true); |
| if (result == JSRegExpResult::Match) |
| return JSRegExpResult::Match; |
| |
| resetMatches(term, context); |
| popParenthesesDisjunctionContext(backTrack); |
| freeParenthesesDisjunctionContext(context); |
| |
| if (result != JSRegExpResult::NoMatch) |
| return result; |
| } |
| |
| return JSRegExpResult::NoMatch; |
| } |
| |
| bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParenthesesSubpatternOnceBegin); |
| ASSERT(term.atom.quantityMaxCount == 1); |
| |
| BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::Greedy: { |
| // set this speculatively; if we get to the parens end this will be true. |
| backTrack->begin = input.getPos(); |
| break; |
| } |
| case QuantifierType::NonGreedy: { |
| backTrack->begin = notFound; |
| context->term += term.atom.parenthesesWidth; |
| return true; |
| } |
| case QuantifierType::FixedCount: |
| break; |
| } |
| |
| if (term.capture()) { |
| unsigned subpatternId = term.subpatternId(); |
| // For Backward matches, the captured indexes are recorded end then start. |
| output[(subpatternId << 1) + term.matchDirection()] = input.getPos() - term.inputPosition; |
| } |
| |
| return true; |
| } |
| |
| bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParenthesesSubpatternOnceEnd); |
| ASSERT(term.atom.quantityMaxCount == 1); |
| |
| if (term.capture()) { |
| unsigned subpatternId = term.subpatternId(); |
| // For Backward matches, the captured indexes are recorded end then start. |
| output[(subpatternId << 1) + 1 - term.matchDirection()] = input.getPos() - term.inputPosition; |
| |
| if (term.duplicateNamedGroupId()) { |
| // Record which of the duplicate named subpatterns matched. |
| output[pattern->offsetForDuplicateNamedGroupId(term.duplicateNamedGroupId())] = subpatternId; |
| } |
| } |
| |
| if (term.atom.quantityType == QuantifierType::FixedCount) |
| return true; |
| |
| BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); |
| return backTrack->begin != input.getPos(); |
| } |
| |
| bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParenthesesSubpatternOnceBegin); |
| ASSERT(term.atom.quantityMaxCount == 1); |
| |
| BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); |
| |
| if (term.capture()) { |
| unsigned subpatternId = term.subpatternId(); |
| output[(subpatternId << 1)] = offsetNoMatch; |
| output[(subpatternId << 1) + 1] = offsetNoMatch; |
| |
| if (term.duplicateNamedGroupId()) { |
| // Clear matching subpatternId. |
| output[pattern->offsetForDuplicateNamedGroupId(term.duplicateNamedGroupId())] = 0; |
| } |
| } |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::Greedy: |
| // if we backtrack to this point, there is another chance - try matching nothing. |
| ASSERT(backTrack->begin != notFound); |
| backTrack->begin = notFound; |
| context->term += term.atom.parenthesesWidth; |
| return true; |
| case QuantifierType::NonGreedy: |
| ASSERT(backTrack->begin != notFound); |
| FALLTHROUGH; |
| case QuantifierType::FixedCount: |
| break; |
| } |
| |
| return false; |
| } |
| |
| bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParenthesesSubpatternOnceEnd); |
| ASSERT(term.atom.quantityMaxCount == 1); |
| |
| BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::Greedy: |
| if (backTrack->begin == notFound) { |
| context->term -= term.atom.parenthesesWidth; |
| return false; |
| } |
| FALLTHROUGH; |
| case QuantifierType::NonGreedy: |
| if (backTrack->begin == notFound) { |
| backTrack->begin = input.getPos(); |
| if (term.capture()) { |
| // Technically this access to inputPosition should be accessing the begin term's |
| // inputPosition, but for repeats other than fixed these values should be |
| // the same anyway! (We don't pre-check for greedy or non-greedy matches.) |
| ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::Type::ParenthesesSubpatternOnceBegin); |
| ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition); |
| unsigned subpatternId = term.subpatternId(); |
| // For Backward matches, the captured indexes are recorded end then start. |
| output[(subpatternId << 1) + term.matchDirection()] = input.getPos() - term.inputPosition; |
| } |
| context->term -= term.atom.parenthesesWidth; |
| return true; |
| } |
| FALLTHROUGH; |
| case QuantifierType::FixedCount: |
| break; |
| } |
| |
| return false; |
| } |
| |
| bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParenthesesSubpatternTerminalBegin); |
| ASSERT(term.atom.quantityType == QuantifierType::Greedy); |
| ASSERT(term.atom.quantityMaxCount == quantifyInfinite); |
| ASSERT(!term.capture()); |
| |
| BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); |
| backTrack->begin = input.getPos(); |
| return true; |
| } |
| |
| bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParenthesesSubpatternTerminalEnd); |
| |
| BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation); |
| // Empty match is a failed match. |
| if (backTrack->begin == input.getPos()) |
| return false; |
| |
| // Successful match! Okay, what's next? - loop around and try to match more! |
| context->term -= (term.atom.parenthesesWidth + 1); |
| return true; |
| } |
| |
| bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParenthesesSubpatternTerminalBegin); |
| ASSERT(term.atom.quantityType == QuantifierType::Greedy); |
| ASSERT(term.atom.quantityMaxCount == quantifyInfinite); |
| ASSERT(!term.capture()); |
| |
| // If we backtrack to this point, we have failed to match this iteration of the parens. |
| // Since this is greedy / zero minimum a failed is also accepted as a match! |
| context->term += term.atom.parenthesesWidth; |
| return true; |
| } |
| |
| bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*) |
| { |
| // 'Terminal' parentheses are at the end of the regex, and as such a match past end |
| // should always be returned as a successful match - we should never backtrack to here. |
| RELEASE_ASSERT_NOT_REACHED(); |
| return false; |
| } |
| |
| bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParentheticalAssertionBegin); |
| ASSERT(term.atom.quantityMaxCount == 1); |
| |
| BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); |
| |
| backTrack->begin = input.getPos(); |
| |
| return true; |
| } |
| |
| bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParentheticalAssertionEnd); |
| ASSERT(term.atom.quantityMaxCount == 1); |
| |
| BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); |
| |
| input.setPos(backTrack->begin); |
| |
| // We've reached the end of the parens; if they are inverted, this is failure. |
| if (term.invert()) { |
| if (term.containsAnyCaptures()) { |
| for (unsigned subpattern = term.subpatternId(); subpattern <= term.lastSubpatternId(); subpattern++) |
| output[subpattern << 1] = offsetNoMatch; |
| } |
| context->term -= term.atom.parenthesesWidth; |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParentheticalAssertionBegin); |
| ASSERT(term.atom.quantityMaxCount == 1); |
| |
| // We've failed to match parens; if they are inverted, this is win! |
| if (term.invert()) { |
| context->term += term.atom.parenthesesWidth; |
| return true; |
| } |
| |
| if (term.matchDirection() == Backward) { |
| BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); |
| input.setPos(backTrack->begin); |
| } |
| |
| return false; |
| } |
| |
| bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParentheticalAssertionEnd); |
| ASSERT(term.atom.quantityMaxCount == 1); |
| |
| BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); |
| |
| input.setPos(backTrack->begin); |
| |
| if (term.containsAnyCaptures()) { |
| for (unsigned subpattern = term.subpatternId(); subpattern <= term.lastSubpatternId(); subpattern++) |
| output[subpattern << 1] = offsetNoMatch; |
| } |
| |
| context->term -= term.atom.parenthesesWidth; |
| return false; |
| } |
| |
| JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParenthesesSubpattern); |
| |
| BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); |
| ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; |
| |
| backTrack->begin = input.getPos(); |
| backTrack->matchAmount = 0; |
| backTrack->lastContext = nullptr; |
| |
| ASSERT(term.atom.quantityType != QuantifierType::FixedCount || term.atom.quantityMinCount == term.atom.quantityMaxCount); |
| |
| unsigned minimumMatchCount = term.atom.quantityMinCount; |
| JSRegExpResult fixedMatchResult; |
| |
| // Handle fixed matches and the minimum part of a variable length match. |
| if (minimumMatchCount) { |
| // While we haven't yet reached our fixed limit, |
| while (backTrack->matchAmount < minimumMatchCount) { |
| // Try to do a match, and it it succeeds, add it to the list. |
| ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); |
| fixedMatchResult = matchDisjunction(disjunctionBody, context->getDisjunctionContext()); |
| if (fixedMatchResult == JSRegExpResult::Match) |
| appendParenthesesDisjunctionContext(backTrack, context); |
| else { |
| // The match failed; try to find an alternate point to carry on from. |
| resetMatches(term, context); |
| freeParenthesesDisjunctionContext(context); |
| |
| if (fixedMatchResult != JSRegExpResult::NoMatch) |
| return fixedMatchResult; |
| JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); |
| if (backtrackResult != JSRegExpResult::Match) |
| return backtrackResult; |
| } |
| } |
| |
| ParenthesesDisjunctionContext* context = backTrack->lastContext; |
| recordParenthesesMatch(term, context); |
| } |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::FixedCount: { |
| ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount); |
| return JSRegExpResult::Match; |
| } |
| |
| case QuantifierType::Greedy: { |
| while (backTrack->matchAmount < term.atom.quantityMaxCount) { |
| ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); |
| JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext()); |
| if (result == JSRegExpResult::Match) |
| appendParenthesesDisjunctionContext(backTrack, context); |
| else { |
| resetMatches(term, context); |
| freeParenthesesDisjunctionContext(context); |
| |
| if (result != JSRegExpResult::NoMatch) |
| return result; |
| |
| break; |
| } |
| } |
| |
| if (backTrack->matchAmount) { |
| ParenthesesDisjunctionContext* context = backTrack->lastContext; |
| recordParenthesesMatch(term, context); |
| } |
| return JSRegExpResult::Match; |
| } |
| |
| case QuantifierType::NonGreedy: |
| return JSRegExpResult::Match; |
| } |
| |
| RELEASE_ASSERT_NOT_REACHED(); |
| return JSRegExpResult::ErrorNoMatch; |
| } |
| |
| // Rules for backtracking differ depending on whether this is greedy or non-greedy. |
| // |
| // Greedy matches never should try just adding more - you should already have done |
| // the 'more' cases. Always backtrack, at least a leetle bit. However cases where |
| // you backtrack an item off the list needs checking, since we'll never have matched |
| // the one less case. Tracking forwards, still add as much as possible. |
| // |
| // Non-greedy, we've already done the one less case, so don't match on popping. |
| // We haven't done the one more case, so always try to add that. |
| // |
| JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context) |
| { |
| ASSERT(term.type == ByteTerm::Type::ParenthesesSubpattern); |
| |
| BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); |
| ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; |
| |
| switch (term.atom.quantityType) { |
| case QuantifierType::FixedCount: { |
| ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount); |
| |
| ParenthesesDisjunctionContext* context = nullptr; |
| JSRegExpResult result = parenthesesDoBacktrack(term, backTrack); |
| |
| if (result != JSRegExpResult::Match) |
| return result; |
| |
| // While we haven't yet reached our fixed limit, |
| while (backTrack->matchAmount < term.atom.quantityMaxCount) { |
| // Try to do a match, and it it succeeds, add it to the list. |
| context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); |
| result = matchDisjunction(disjunctionBody, context->getDisjunctionContext()); |
| |
| if (result == JSRegExpResult::Match) |
| appendParenthesesDisjunctionContext(backTrack, context); |
| else { |
| // The match failed; try to find an alternate point to carry on from. |
| resetMatches(term, context); |
| freeParenthesesDisjunctionContext(context); |
| |
| if (result != JSRegExpResult::NoMatch) |
| return result; |
| JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack); |
| if (backtrackResult != JSRegExpResult::Match) |
| return backtrackResult; |
| } |
| } |
| |
| ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount); |
| context = backTrack->lastContext; |
| recordParenthesesMatch(term, context); |
| return JSRegExpResult::Match; |
| } |
| |
| case QuantifierType::Greedy: { |
| if (!backTrack->matchAmount) |
| return JSRegExpResult::NoMatch; |
| |
| ParenthesesDisjunctionContext* context = backTrack->lastContext; |
| JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(), true); |
| if (result == JSRegExpResult::Match) { |
| while (backTrack->matchAmount < term.atom.quantityMaxCount) { |
| ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); |
| JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext()); |
| if (parenthesesResult == JSRegExpResult::Match) |
| appendParenthesesDisjunctionContext(backTrack, context); |
| else { |
| resetMatches(term, context); |
| freeParenthesesDisjunctionContext(context); |
| |
| if (parenthesesResult != JSRegExpResult::NoMatch) |
| return parenthesesResult; |
| |
| break; |
| } |
| } |
| } else { |
| resetMatches(term, context); |
| popParenthesesDisjunctionContext(backTrack); |
| freeParenthesesDisjunctionContext(context); |
| |
| if (backTrack->matchAmount < term.atom.quantityMinCount) { |
| while (backTrack->matchAmount) { |
| context = backTrack->lastContext; |
| resetMatches(term, context); |
| popParenthesesDisjunctionContext(backTrack); |
| freeParenthesesDisjunctionContext(context); |
| } |
| |
| input.setPos(backTrack->begin); |
| return result; |
| } |
| |
| if (result != JSRegExpResult::NoMatch) |
| return result; |
| } |
| |
| if (backTrack->matchAmount) { |
| ParenthesesDisjunctionContext* context = backTrack->lastContext; |
| recordParenthesesMatch(term, context); |
| } |
| return JSRegExpResult::Match; |
| } |
| |
| case QuantifierType::NonGreedy: { |
| // If we've not reached the limit, try to add one more match. |
| if (backTrack->matchAmount < term.atom.quantityMaxCount) { |
| ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunctionBody, output, term); |
| JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext()); |
| if (result == JSRegExpResult::Match) { |
| appendParenthesesDisjunctionContext(backTrack, context); |
| recordParenthesesMatch(term, context); |
| return JSRegExpResult::Match; |
| } |
| |
| resetMatches(term, context); |
| freeParenthesesDisjunctionContext(context); |
| |
| if (result != JSRegExpResult::NoMatch) |
| return result; |
| } |
| |
| // Nope - okay backtrack looking for an alternative. |
| while (backTrack->matchAmount) { |
| ParenthesesDisjunctionContext* context = backTrack->lastContext; |
| JSRegExpResult result = matchNonZeroDisjunction(disjunctionBody, context->getDisjunctionContext(), true); |
| if (result == JSRegExpResult::Match) { |
| // successful backtrack! we're back in the game! |
| if (backTrack->matchAmount) { |
| context = backTrack->lastContext; |
| recordParenthesesMatch(term, context); |
| } |
| return JSRegExpResult::Match; |
| } |
| |
| // pop a match off the stack |
| resetMatches(term, context); |
| popParenthesesDisjunctionContext(backTrack); |
| freeParenthesesDisjunctionContext(context); |
| |
| if (result != JSRegExpResult::NoMatch) |
| return result; |
| } |
| |
| return JSRegExpResult::NoMatch; |
| } |
| } |
| |
| RELEASE_ASSERT_NOT_REACHED(); |
| return JSRegExpResult::ErrorNoMatch; |
| } |
| |
| bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context) |
| { |
| UNUSED_PARAM(term); |
| |
| if (pattern->dotAll()) { |
| context->matchBegin = startOffset; |
| context->matchEnd = input.end(); |
| return true; |
| } |
| |
| unsigned matchBegin = context->matchBegin; |
| |
| if (matchBegin > startOffset) { |
| for (matchBegin--; true; matchBegin--) { |
| if (testCharacterClass(pattern->newlineCharacterClass, input.reread(matchBegin))) { |
| ++matchBegin; |
| break; |
| } |
| |
| if (matchBegin == startOffset) |
| break; |
| } |
| } |
| |
| unsigned matchEnd = input.getPos(); |
| |
| for (; (matchEnd != input.end()) |
| && (!testCharacterClass(pattern->newlineCharacterClass, input.reread(matchEnd))); matchEnd++) { } |
| |
| if (((matchBegin && term.anchors.m_bol) |
| || ((matchEnd != input.end()) && term.anchors.m_eol)) |
| && !pattern->multiline()) |
| return false; |
| |
| context->matchBegin = matchBegin; |
| context->matchEnd = matchEnd; |
| return true; |
| } |
| |
| #define MATCH_NEXT() { if (verbose) { dataLog(" - Match\n"); if (btrack) { dataLog(" Matching\n"); btrack = false; } } ++context->term; goto matchAgain; } |
| #define BACKTRACK() { if (verbose) { dataLog(" - Backtrack\n"); if (!btrack) { dataLog(" Backtracking\n"); btrack = true; } } --context->term; goto backtrack; } |
| #define currentTerm() (disjunction->terms[context->term]) |
| |
| #define DUMP_TERM() \ |
| { \ |
| if (verbose) { \ |
| termDumper.dumpTerm(context->term, currentTerm()); \ |
| dataLog(" pos:", input.getPos()); \ |
| } \ |
| } |
| |
| #define DUMP_EXTRA(...) \ |
| { \ |
| dataLogIf(verbose, " ", __VA_ARGS__); \ |
| } |
| |
| #define DUMP_EXTRA_IF(predicate, ...) \ |
| { \ |
| dataLogIf(verbose && predicate, " ", __VA_ARGS__); \ |
| } |
| |
| #define DUMP_CURR_CHAR() \ |
| { \ |
| if (verbose) { \ |
| unsigned offset = currentTerm().inputPosition; \ |
| if (currentTerm().matchDirection() == Backward && currentTerm().atom.quantityType == QuantifierType::FixedCount) \ |
| offset -= currentTerm().atom.quantityMaxCount - 1; \ |
| dataLog(" off:", offset, " got:'"); \ |
| int ch = -1; \ |
| if (input.isValidNegativeInputOffset(offset)) \ |
| ch = input.readForCharacterDump(offset); \ |
| if (ch == -1) \ |
| dataLog("<illegal>"); \ |
| else if (ch < 0x10000 && (ch < 0xd800 || ch > 0xdfff))\ |
| dataLog(static_cast<char16_t>(ch)); \ |
| else \ |
| dataLogF("\\u{%x}", ch); \ |
| dataLog("'"); \ |
| } \ |
| } |
| |
| JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) |
| { |
| if (UNLIKELY(!isSafeToRecurse())) |
| return JSRegExpResult::ErrorNoMemory; |
| |
| if (!--remainingMatchCount) { |
| dataLogLnIf(verbose, " Reached match limit - Returning ErrorHitLimit"); |
| return JSRegExpResult::ErrorHitLimit; |
| } |
| |
| ByteTermDumper termDumper(compileMode); |
| |
| if (btrack) |
| BACKTRACK(); |
| |
| context->matchBegin = input.getPos(); |
| context->term = 0; |
| |
| matchAgain: |
| ASSERT(context->term < static_cast<int>(disjunction->terms.size())); |
| |
| DUMP_TERM(); |
| |
| switch (currentTerm().type) { |
| case ByteTerm::Type::SubpatternBegin: |
| DUMP_EXTRA_IF(currentTerm().capture(), "id:", currentTerm().subpatternId()); |
| MATCH_NEXT(); |
| case ByteTerm::Type::SubpatternEnd: |
| DUMP_EXTRA_IF(currentTerm().capture(), "id:", currentTerm().subpatternId(), " - Return Match\n"); |
| context->matchEnd = input.getPos(); |
| return JSRegExpResult::Match; |
| |
| case ByteTerm::Type::BodyAlternativeBegin: |
| MATCH_NEXT(); |
| case ByteTerm::Type::BodyAlternativeDisjunction: |
| case ByteTerm::Type::BodyAlternativeEnd: |
| context->matchEnd = input.getPos(); |
| DUMP_EXTRA("- Return Match\n"); |
| return JSRegExpResult::Match; |
| |
| case ByteTerm::Type::AlternativeBegin: |
| MATCH_NEXT(); |
| case ByteTerm::Type::AlternativeDisjunction: |
| case ByteTerm::Type::AlternativeEnd: { |
| int offset = currentTerm().alternative.end; |
| BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); |
| backTrack->offset = offset; |
| context->term += offset; |
| MATCH_NEXT(); |
| } |
| |
| case ByteTerm::Type::AssertionBOL: |
| if (matchAssertionBOL(currentTerm())) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::AssertionEOL: |
| if (matchAssertionEOL(currentTerm())) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::AssertionWordBoundary: |
| if (matchAssertionWordBoundary(currentTerm())) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| |
| case ByteTerm::Type::PatternCharacterOnce: |
| case ByteTerm::Type::PatternCharacterFixed: { |
| DUMP_CURR_CHAR(); |
| if (currentTerm().matchDirection() == Forward) { |
| if (isEitherUnicodeCompilation()) { |
| if (!U_IS_BMP(currentTerm().atom.patternCharacter)) { |
| for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) { |
| if (!checkSurrogatePair(currentTerm(), currentTerm().inputPosition - 2 * matchAmount)) |
| BACKTRACK(); |
| } |
| MATCH_NEXT(); |
| } |
| } |
| |
| unsigned position = input.getPos(); // May need to back out reading a surrogate pair. |
| |
| for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) { |
| if (!checkCharacter(currentTerm(), currentTerm().inputPosition - matchAmount)) { |
| input.setPos(position); |
| BACKTRACK(); |
| } |
| } |
| } else { |
| auto& term = currentTerm(); |
| |
| if (isEitherUnicodeCompilation()) { |
| if (!U_IS_BMP(term.atom.patternCharacter)) { |
| for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) { |
| auto inputPosition = term.inputPosition + 2 * matchAmount; |
| if (input.getPos() < inputPosition) |
| BACKTRACK(); |
| if (!checkSurrogatePair(term, inputPosition)) |
| BACKTRACK(); |
| } |
| MATCH_NEXT(); |
| } |
| } |
| |
| if (input.getPos() < term.inputPosition) |
| BACKTRACK(); |
| |
| unsigned position = input.getPos(); // May need to back out reading a surrogate pair. |
| |
| for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) { |
| if (!checkCharacter(term, term.inputPosition + matchAmount + 1 - term.atom.quantityMaxCount)) { |
| input.setPos(position); |
| BACKTRACK(); |
| } |
| } |
| } |
| MATCH_NEXT(); |
| } |
| case ByteTerm::Type::PatternCharacterGreedy: { |
| DUMP_CURR_CHAR(); |
| BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); |
| unsigned matchAmount = 0; |
| unsigned position = input.getPos(); // May need to back out reading a surrogate pair. |
| if (currentTerm().matchDirection() == Forward) { |
| while ((matchAmount < currentTerm().atom.quantityMaxCount) && input.checkInput(1)) { |
| if (!checkCharacter(currentTerm(), currentTerm().inputPosition + 1)) { |
| input.setPos(position); |
| break; |
| } |
| ++matchAmount; |
| position = input.getPos(); |
| } |
| } else { |
| auto& term = currentTerm(); |
| if (input.getPos() < term.inputPosition) |
| BACKTRACK(); |
| |
| while ((matchAmount < term.atom.quantityMaxCount) && input.tryUncheckInput(1)) { |
| if (!checkCharacter(currentTerm(), term.inputPosition)) { |
| input.setPos(position); |
| break; |
| } |
| ++matchAmount; |
| position = input.getPos(); |
| } |
| } |
| backTrack->matchAmount = matchAmount; |
| |
| MATCH_NEXT(); |
| } |
| case ByteTerm::Type::PatternCharacterNonGreedy: { |
| DUMP_CURR_CHAR(); |
| BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); |
| backTrack->begin = input.getPos(); |
| backTrack->matchAmount = 0; |
| MATCH_NEXT(); |
| } |
| |
| case ByteTerm::Type::PatternCasedCharacterOnce: |
| case ByteTerm::Type::PatternCasedCharacterFixed: { |
| DUMP_CURR_CHAR(); |
| if (isEitherUnicodeCompilation()) { |
| // Case insensitive matching of unicode characters is handled as Type::CharacterClass. |
| ASSERT(U_IS_BMP(currentTerm().atom.patternCharacter)); |
| |
| unsigned position = input.getPos(); // May need to back out reading a surrogate pair. |
| |
| if (currentTerm().matchDirection() == Forward) { |
| for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) { |
| if (!checkCasedCharacter(currentTerm(), currentTerm().inputPosition - matchAmount)) { |
| input.setPos(position); |
| BACKTRACK(); |
| } |
| } |
| } else { |
| auto& term = currentTerm(); |
| |
| if (input.getPos() < term.inputPosition) |
| BACKTRACK(); |
| |
| for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) { |
| if (!checkCasedCharacter(term, term.inputPosition - term.atom.quantityMaxCount + matchAmount + 1)) { |
| input.setPos(position); |
| BACKTRACK(); |
| } |
| } |
| } |
| MATCH_NEXT(); |
| } |
| |
| for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) { |
| if (!checkCasedCharacter(currentTerm(), currentTerm().inputPosition - matchAmount)) |
| BACKTRACK(); |
| } |
| MATCH_NEXT(); |
| } |
| case ByteTerm::Type::PatternCasedCharacterGreedy: { |
| DUMP_CURR_CHAR(); |
| BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); |
| |
| // Case insensitive matching of unicode characters is handled as Type::CharacterClass. |
| ASSERT(!isEitherUnicodeCompilation() || U_IS_BMP(currentTerm().atom.patternCharacter)); |
| |
| if (currentTerm().matchDirection() == Forward) { |
| unsigned matchAmount = 0; |
| while ((matchAmount < currentTerm().atom.quantityMaxCount) && input.checkInput(1)) { |
| if (!checkCasedCharacter(currentTerm(), currentTerm().inputPosition + 1)) { |
| input.uncheckInput(1); |
| break; |
| } |
| ++matchAmount; |
| } |
| backTrack->matchAmount = matchAmount; |
| |
| MATCH_NEXT(); |
| } else { |
| auto& term = currentTerm(); |
| |
| if (input.getPos() < term.inputPosition) |
| BACKTRACK(); |
| |
| unsigned position = input.getPos(); |
| unsigned matchAmount = 0; |
| while ((matchAmount < term.atom.quantityMaxCount) && input.tryUncheckInput(1)) { |
| if (!checkCasedCharacter(term, term.inputPosition)) { |
| input.setPos(position); |
| break; |
| } |
| |
| ++matchAmount; |
| position = input.getPos(); |
| } |
| backTrack->matchAmount = matchAmount; |
| |
| MATCH_NEXT(); |
| } |
| } |
| case ByteTerm::Type::PatternCasedCharacterNonGreedy: { |
| DUMP_CURR_CHAR(); |
| BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); |
| |
| // Case insensitive matching of unicode characters is handled as Type::CharacterClass. |
| ASSERT(!isEitherUnicodeCompilation() || U_IS_BMP(currentTerm().atom.patternCharacter)); |
| |
| backTrack->matchAmount = 0; |
| MATCH_NEXT(); |
| } |
| |
| case ByteTerm::Type::CharacterClass: |
| DUMP_CURR_CHAR(); |
| if (matchCharacterClass(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::BackReference: |
| if (matchBackReference(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParenthesesSubpattern: { |
| DUMP_EXTRA("\n"); |
| JSRegExpResult result = matchParentheses(currentTerm(), context); |
| |
| if (result == JSRegExpResult::Match) { |
| DUMP_EXTRA(" ParenthesesSubpattern"); |
| MATCH_NEXT(); |
| } else if (result != JSRegExpResult::NoMatch) |
| return result; |
| |
| DUMP_EXTRA(" ParenthesesSubpattern"); |
| BACKTRACK(); |
| } |
| case ByteTerm::Type::ParenthesesSubpatternOnceBegin: |
| if (matchParenthesesOnceBegin(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParenthesesSubpatternOnceEnd: |
| if (matchParenthesesOnceEnd(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParenthesesSubpatternTerminalBegin: |
| if (matchParenthesesTerminalBegin(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParenthesesSubpatternTerminalEnd: |
| if (matchParenthesesTerminalEnd(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParentheticalAssertionBegin: |
| if (matchParentheticalAssertionBegin(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParentheticalAssertionEnd: |
| if (matchParentheticalAssertionEnd(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| |
| case ByteTerm::Type::CheckInput: |
| DUMP_EXTRA("count:", currentTerm().checkInputCount); |
| if (input.checkInput(currentTerm().checkInputCount)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| |
| case ByteTerm::Type::UncheckInput: |
| DUMP_EXTRA("count:", currentTerm().checkInputCount); |
| input.uncheckInput(currentTerm().checkInputCount); |
| MATCH_NEXT(); |
| |
| case ByteTerm::Type::HaveCheckedInput: |
| DUMP_EXTRA("count:", currentTerm().checkInputCount); |
| if (input.isValidNegativeInputOffset(currentTerm().checkInputCount)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| |
| case ByteTerm::Type::DotStarEnclosure: |
| if (matchDotStarEnclosure(currentTerm(), context)) { |
| DUMP_EXTRA("- Return Match\n"); |
| return JSRegExpResult::Match; |
| } |
| BACKTRACK(); |
| } |
| |
| // We should never fall-through to here. |
| RELEASE_ASSERT_NOT_REACHED(); |
| |
| backtrack: |
| ASSERT(context->term < static_cast<int>(disjunction->terms.size())); |
| |
| DUMP_TERM(); |
| |
| switch (currentTerm().type) { |
| case ByteTerm::Type::SubpatternBegin: |
| DUMP_EXTRA("id:", currentTerm().subpatternId(), " - Return NoMatch\n"); |
| return JSRegExpResult::NoMatch; |
| case ByteTerm::Type::SubpatternEnd: |
| RELEASE_ASSERT_NOT_REACHED(); |
| |
| case ByteTerm::Type::BodyAlternativeBegin: |
| case ByteTerm::Type::BodyAlternativeDisjunction: { |
| int offset = currentTerm().alternative.next; |
| context->term += offset; |
| if (offset > 0) |
| MATCH_NEXT(); |
| |
| if (input.atEnd() || pattern->sticky()) { |
| DUMP_EXTRA("- Return NoMatch\n"); |
| return JSRegExpResult::NoMatch; |
| } |
| |
| input.next(); |
| |
| context->matchBegin = input.getPos(); |
| |
| if (currentTerm().alternative.onceThrough) |
| context->term += currentTerm().alternative.next; |
| |
| MATCH_NEXT(); |
| } |
| case ByteTerm::Type::BodyAlternativeEnd: |
| RELEASE_ASSERT_NOT_REACHED(); |
| |
| case ByteTerm::Type::AlternativeBegin: |
| case ByteTerm::Type::AlternativeDisjunction: { |
| int offset = currentTerm().alternative.next; |
| context->term += offset; |
| if (offset > 0) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| } |
| case ByteTerm::Type::AlternativeEnd: { |
| // We should never backtrack back into an alternative of the main body of the regex. |
| BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); |
| unsigned offset = backTrack->offset; |
| context->term -= offset; |
| BACKTRACK(); |
| } |
| |
| case ByteTerm::Type::AssertionBOL: |
| case ByteTerm::Type::AssertionEOL: |
| case ByteTerm::Type::AssertionWordBoundary: |
| BACKTRACK(); |
| |
| case ByteTerm::Type::PatternCharacterOnce: |
| case ByteTerm::Type::PatternCharacterFixed: |
| case ByteTerm::Type::PatternCharacterGreedy: |
| case ByteTerm::Type::PatternCharacterNonGreedy: |
| if (backtrackPatternCharacter(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::PatternCasedCharacterOnce: |
| case ByteTerm::Type::PatternCasedCharacterFixed: |
| case ByteTerm::Type::PatternCasedCharacterGreedy: |
| case ByteTerm::Type::PatternCasedCharacterNonGreedy: |
| if (backtrackPatternCasedCharacter(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::CharacterClass: |
| if (backtrackCharacterClass(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::BackReference: |
| if (backtrackBackReference(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParenthesesSubpattern: { |
| JSRegExpResult result = backtrackParentheses(currentTerm(), context); |
| |
| if (result == JSRegExpResult::Match) { |
| MATCH_NEXT(); |
| } else if (result != JSRegExpResult::NoMatch) |
| return result; |
| |
| BACKTRACK(); |
| } |
| case ByteTerm::Type::ParenthesesSubpatternOnceBegin: |
| if (backtrackParenthesesOnceBegin(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParenthesesSubpatternOnceEnd: |
| if (backtrackParenthesesOnceEnd(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParenthesesSubpatternTerminalBegin: |
| if (backtrackParenthesesTerminalBegin(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParenthesesSubpatternTerminalEnd: |
| if (backtrackParenthesesTerminalEnd(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParentheticalAssertionBegin: |
| if (backtrackParentheticalAssertionBegin(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| case ByteTerm::Type::ParentheticalAssertionEnd: |
| if (backtrackParentheticalAssertionEnd(currentTerm(), context)) |
| MATCH_NEXT(); |
| BACKTRACK(); |
| |
| case ByteTerm::Type::CheckInput: |
| DUMP_EXTRA("count:", currentTerm().checkInputCount); |
| input.uncheckInput(currentTerm().checkInputCount); |
| BACKTRACK(); |
| |
| case ByteTerm::Type::UncheckInput: |
| DUMP_EXTRA("count:", currentTerm().checkInputCount); |
| input.checkInput(currentTerm().checkInputCount); |
| BACKTRACK(); |
| |
| case ByteTerm::Type::HaveCheckedInput: |
| DUMP_EXTRA("count:", currentTerm().checkInputCount); |
| BACKTRACK(); |
| |
| case ByteTerm::Type::DotStarEnclosure: |
| RELEASE_ASSERT_NOT_REACHED(); |
| } |
| |
| RELEASE_ASSERT_NOT_REACHED(); |
| return JSRegExpResult::ErrorNoMatch; |
| } |
| |
| JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) |
| { |
| JSRegExpResult result = matchDisjunction(disjunction, context, btrack); |
| |
| if (result == JSRegExpResult::Match) { |
| while (context->matchBegin == context->matchEnd) { |
| result = matchDisjunction(disjunction, context, true); |
| if (result != JSRegExpResult::Match) |
| return result; |
| } |
| return JSRegExpResult::Match; |
| } |
| |
| return result; |
| } |
| |
| // WTF_IGNORES_THREAD_SAFETY_ANALYSIS because this function does conditional locking. |
| unsigned interpret() WTF_IGNORES_THREAD_SAFETY_ANALYSIS |
| { |
| // FIXME: https://bugs.webkit.org/show_bug.cgi?id=195970 |
| // [Yarr Interpreter] The interpreter doesn't have checks for stack overflow due to deep recursion |
| if (!input.isAvailableInput(0)) |
| return offsetNoMatch; |
| |
| if (pattern->m_lock) |
| pattern->m_lock->lock(); |
| |
| for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i) |
| output[i << 1] = offsetNoMatch; |
| |
| for (unsigned i = pattern->m_offsetVectorBaseForNamedCaptures; i < pattern->m_offsetsSize; ++i) |
| output[i] = 0; |
| |
| allocatorPool = pattern->m_allocator->startAllocator(); |
| RELEASE_ASSERT(allocatorPool); |
| |
| DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get()); |
| |
| dataLogLnIf(verbose, " Interpret input: ", input, "\n Matching"); |
| |
| JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false); |
| if (result == JSRegExpResult::Match) { |
| output[0] = context->matchBegin; |
| output[1] = context->matchEnd; |
| } |
| |
| freeDisjunctionContext(context); |
| |
| pattern->m_allocator->stopAllocator(); |
| |
| ASSERT((result == JSRegExpResult::Match) == (output[0] != offsetNoMatch)); |
| |
| if (pattern->m_lock) |
| pattern->m_lock->unlock(); |
| |
| return output[0]; |
| } |
| |
| Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start) |
| : pattern(pattern) |
| , compileMode(pattern->compileMode()) |
| , output(output) |
| , input(input, start, length, pattern->eitherUnicode()) |
| , startOffset(start) |
| , remainingMatchCount(matchLimit) |
| { |
| } |
| |
| private: |
| inline bool isLegacyCompilation() const { return compileMode == CompileMode::Legacy; } |
| inline bool isUnicodeCompilation() const { return compileMode == CompileMode::Unicode; } |
| inline bool isUnicodeSetsCompilation() const { return compileMode == CompileMode::UnicodeSets; } |
| inline bool isEitherUnicodeCompilation() const { return isUnicodeCompilation() || isUnicodeSetsCompilation(); } |
| |
| inline bool isSafeToRecurse() { return m_stackCheck.isSafeToRecurse(); } |
| |
| BytecodePattern* pattern; |
| CompileMode compileMode; |
| unsigned* output; |
| InputStream input; |
| StackCheck m_stackCheck; |
| WTF::BumpPointerPool* allocatorPool { nullptr }; |
| unsigned startOffset; |
| unsigned remainingMatchCount; |
| }; |
| |
| class ByteCompiler { |
| struct ParenthesesStackEntry { |
| unsigned beginTerm; |
| unsigned savedAlternativeIndex; |
| ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/) |
| : beginTerm(beginTerm) |
| , savedAlternativeIndex(savedAlternativeIndex) |
| { |
| } |
| }; |
| |
| public: |
| ByteCompiler(YarrPattern& pattern) |
| : m_pattern(pattern) |
| { |
| } |
| |
| std::unique_ptr<BytecodePattern> compile(BumpPointerAllocator* allocator, ConcurrentJSLock* lock, ErrorCode& errorCode) |
| { |
| if (UNLIKELY(!isSafeToRecurse())) { |
| errorCode = ErrorCode::TooManyDisjunctions; |
| return nullptr; |
| } |
| |
| regexBegin(m_pattern.m_numSubpatterns, m_pattern.m_body->m_callFrameSize, m_pattern.m_body->m_alternatives[0]->onceThrough()); |
| if (auto error = emitDisjunction(m_pattern.m_body, 0, 0)) { |
| errorCode = error.value(); |
| return nullptr; |
| } |
| regexEnd(); |
| |
| #ifdef ASSERT_ENABLED |
| if (Options::dumpCompiledRegExpPatterns()) |
| ByteTermDumper(&m_pattern).dumpDisjunction(m_bodyDisjunction.get()); |
| #endif |
| |
| return makeUnique<BytecodePattern>(WTFMove(m_bodyDisjunction), m_allParenthesesInfo, m_pattern, allocator, lock, m_pattern.offsetVectorBaseForNamedCaptures(), m_pattern.offsetsSize()); |
| } |
| |
| void checkInput(unsigned count) |
| { |
| m_bodyDisjunction->terms.append(ByteTerm::CheckInput(count)); |
| } |
| |
| void uncheckInput(unsigned count) |
| { |
| m_bodyDisjunction->terms.append(ByteTerm::UncheckInput(count)); |
| } |
| |
| void haveCheckedInput(unsigned count) |
| { |
| m_bodyDisjunction->terms.append(ByteTerm::HaveCheckedInput(count)); |
| } |
| |
| void assertionBOL(unsigned inputPosition) |
| { |
| m_bodyDisjunction->terms.append(ByteTerm::BOL(inputPosition)); |
| } |
| |
| void assertionEOL(unsigned inputPosition) |
| { |
| m_bodyDisjunction->terms.append(ByteTerm::EOL(inputPosition)); |
| } |
| |
| void assertionWordBoundary(bool invert, MatchDirection matchDirection, unsigned inputPosition) |
| { |
| m_bodyDisjunction->terms.append(ByteTerm::WordBoundary(invert, matchDirection, inputPosition)); |
| } |
| |
| void atomPatternCharacter(UChar32 ch, MatchDirection matchDirection, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) |
| { |
| if (m_pattern.ignoreCase()) { |
| UChar32 lo = u_tolower(ch); |
| UChar32 hi = u_toupper(ch); |
| |
| if (lo != hi) { |
| m_bodyDisjunction->terms.append(ByteTerm(lo, hi, inputPosition, frameLocation, quantityMaxCount, quantityType)); |
| m_bodyDisjunction->terms.last().m_matchDirection = matchDirection; |
| return; |
| } |
| } |
| |
| m_bodyDisjunction->terms.append(ByteTerm(ch, inputPosition, frameLocation, quantityMaxCount, quantityType)); |
| m_bodyDisjunction->terms.last().m_matchDirection = matchDirection; |
| } |
| |
| void atomCharacterClass(CharacterClass* characterClass, bool invert, MatchDirection matchDirection, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) |
| { |
| m_bodyDisjunction->terms.append(ByteTerm(characterClass, invert, inputPosition)); |
| |
| if (quantityType != QuantifierType::FixedCount) |
| m_bodyDisjunction->terms.last().atom.quantityMinCount = 0; |
| m_bodyDisjunction->terms.last().atom.quantityMaxCount = quantityMaxCount; |
| m_bodyDisjunction->terms.last().atom.quantityType = quantityType; |
| m_bodyDisjunction->terms.last().frameLocation = frameLocation; |
| m_bodyDisjunction->terms.last().m_matchDirection = matchDirection; |
| } |
| |
| void atomBackReference(unsigned subpatternId, MatchDirection matchDirection, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) |
| { |
| ASSERT(subpatternId); |
| |
| m_bodyDisjunction->terms.append(ByteTerm::BackReference(subpatternId, matchDirection, inputPosition)); |
| |
| if (m_pattern.hasDuplicateNamedCaptureGroups()) { |
| auto duplicateNamedGroupId = m_pattern.m_duplicateNamedGroupForSubpatternId[subpatternId]; |
| if (duplicateNamedGroupId) |
| m_bodyDisjunction->terms.last().atom.parenIds.duplicateNamedGroupId = duplicateNamedGroupId; |
| } |
| m_bodyDisjunction->terms.last().atom.quantityMaxCount = quantityMaxCount; |
| m_bodyDisjunction->terms.last().atom.quantityType = quantityType; |
| m_bodyDisjunction->terms.last().frameLocation = frameLocation; |
| } |
| |
| void atomParenthesesOnceBegin(unsigned subpatternId, MatchDirection matchDirection, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) |
| { |
| unsigned beginTerm = m_bodyDisjunction->terms.size(); |
| |
| m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::Type::ParenthesesSubpatternOnceBegin, subpatternId, capture, false, matchDirection, inputPosition)); |
| m_bodyDisjunction->terms.last().frameLocation = frameLocation; |
| m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); |
| m_bodyDisjunction->terms.last().frameLocation = alternativeFrameLocation; |
| |
| m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); |
| m_currentAlternativeIndex = beginTerm + 1; |
| } |
| |
| void atomParenthesesTerminalBegin(unsigned subpatternId, MatchDirection matchDirection, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) |
| { |
| unsigned beginTerm = m_bodyDisjunction->terms.size(); |
| |
| m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::Type::ParenthesesSubpatternTerminalBegin, subpatternId, capture, false, matchDirection, inputPosition)); |
| m_bodyDisjunction->terms.last().frameLocation = frameLocation; |
| m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); |
| m_bodyDisjunction->terms.last().frameLocation = alternativeFrameLocation; |
| |
| m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); |
| m_currentAlternativeIndex = beginTerm + 1; |
| } |
| |
| void atomParenthesesSubpatternBegin(unsigned subpatternId, MatchDirection matchDirection, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) |
| { |
| // Errrk! - this is a little crazy, we initially generate as a Type::ParenthesesSubpatternOnceBegin, |
| // then fix this up at the end! - simplifying this should make it much clearer. |
| // https://bugs.webkit.org/show_bug.cgi?id=50136 |
| |
| unsigned beginTerm = m_bodyDisjunction->terms.size(); |
| |
| m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::Type::ParenthesesSubpatternOnceBegin, subpatternId, capture, false, matchDirection, inputPosition)); |
| m_bodyDisjunction->terms.last().frameLocation = frameLocation; |
| m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); |
| m_bodyDisjunction->terms.last().frameLocation = alternativeFrameLocation; |
| |
| m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); |
| m_currentAlternativeIndex = beginTerm + 1; |
| } |
| |
| void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, MatchDirection matchDirection, unsigned frameLocation, unsigned alternativeFrameLocation) |
| { |
| unsigned beginTerm = m_bodyDisjunction->terms.size(); |
| |
| m_bodyDisjunction->terms.append(ByteTerm::ParentheticalAssertionBegin(subpatternId, invert, matchDirection)); |
| m_bodyDisjunction->terms.last().frameLocation = frameLocation; |
| m_bodyDisjunction->terms.append(ByteTerm::AlternativeBegin()); |
| m_bodyDisjunction->terms.last().frameLocation = alternativeFrameLocation; |
| |
| m_parenthesesStack.append(ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); |
| m_currentAlternativeIndex = beginTerm + 1; |
| } |
| |
| void atomParentheticalAssertionEnd(unsigned lastSubpatternId, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) |
| { |
| unsigned beginTerm = popParenthesesStack(); |
| closeAlternative(beginTerm + 1); |
| unsigned endTerm = m_bodyDisjunction->terms.size(); |
| |
| ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::Type::ParentheticalAssertionBegin); |
| |
| bool invert = m_bodyDisjunction->terms[beginTerm].invert(); |
| MatchDirection matchDirection = m_bodyDisjunction->terms[beginTerm].matchDirection(); |
| unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].subpatternId(); |
| |
| m_bodyDisjunction->terms.append(ByteTerm::ParentheticalAssertionEnd(subpatternId, lastSubpatternId, invert, matchDirection)); |
| m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; |
| m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; |
| m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; |
| |
| m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount; |
| m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; |
| m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount; |
| m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; |
| } |
| |
| void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored) |
| { |
| m_bodyDisjunction->terms.append(ByteTerm::DotStarEnclosure(bolAnchored, eolAnchored)); |
| } |
| |
| unsigned popParenthesesStack() |
| { |
| ASSERT(m_parenthesesStack.size()); |
| unsigned beginTerm = m_parenthesesStack.last().beginTerm; |
| m_currentAlternativeIndex = m_parenthesesStack.last().savedAlternativeIndex; |
| m_parenthesesStack.removeLast(); |
| |
| ASSERT(beginTerm < m_bodyDisjunction->terms.size()); |
| ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size()); |
| |
| return beginTerm; |
| } |
| |
| void closeAlternative(unsigned beginTerm) |
| { |
| unsigned origBeginTerm = beginTerm; |
| ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::Type::AlternativeBegin); |
| unsigned endIndex = m_bodyDisjunction->terms.size(); |
| |
| unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; |
| |
| if (!m_bodyDisjunction->terms[beginTerm].alternative.next) |
| m_bodyDisjunction->terms.remove(beginTerm); |
| else { |
| while (m_bodyDisjunction->terms[beginTerm].alternative.next) { |
| beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; |
| ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::Type::AlternativeDisjunction); |
| m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; |
| m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; |
| } |
| |
| m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; |
| |
| m_bodyDisjunction->terms.append(ByteTerm::AlternativeEnd()); |
| m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; |
| } |
| } |
| |
| void closeBodyAlternative() |
| { |
| unsigned beginTerm = 0; |
| unsigned origBeginTerm = 0; |
| ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::Type::BodyAlternativeBegin); |
| unsigned endIndex = m_bodyDisjunction->terms.size(); |
| |
| unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; |
| |
| while (m_bodyDisjunction->terms[beginTerm].alternative.next) { |
| beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; |
| ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::Type::BodyAlternativeDisjunction); |
| m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; |
| m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; |
| } |
| |
| m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; |
| |
| m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeEnd()); |
| m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; |
| } |
| |
| void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType, unsigned callFrameSize = 0) |
| { |
| unsigned beginTerm = popParenthesesStack(); |
| closeAlternative(beginTerm + 1); |
| unsigned endTerm = m_bodyDisjunction->terms.size(); |
| |
| ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::Type::ParenthesesSubpatternOnceBegin); |
| |
| ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; |
| |
| auto parenthesesMatchDirection = parenthesesBegin.matchDirection(); |
| bool capture = parenthesesBegin.capture(); |
| unsigned subpatternId = parenthesesBegin.subpatternId(); |
| |
| unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; |
| auto parenthesesDisjunction = makeUnique<ByteDisjunction>(numSubpatterns, callFrameSize); |
| |
| unsigned firstTermInParentheses = beginTerm + 1; |
| parenthesesDisjunction->terms.reserveInitialCapacity(endTerm - firstTermInParentheses + 2); |
| |
| parenthesesDisjunction->terms.append(ByteTerm::SubpatternBegin()); |
| for (unsigned termInParentheses = firstTermInParentheses; termInParentheses < endTerm; ++termInParentheses) |
| parenthesesDisjunction->terms.append(m_bodyDisjunction->terms[termInParentheses]); |
| parenthesesDisjunction->terms.append(ByteTerm::SubpatternEnd()); |
| |
| m_bodyDisjunction->terms.shrink(beginTerm); |
| |
| m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::Type::ParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, inputPosition)); |
| m_bodyDisjunction->terms.last().m_matchDirection = parenthesesMatchDirection; |
| m_allParenthesesInfo.append(WTFMove(parenthesesDisjunction)); |
| |
| if (m_pattern.hasDuplicateNamedCaptureGroups() && capture) { |
| auto duplicateNamedGroupId = m_pattern.m_duplicateNamedGroupForSubpatternId[subpatternId]; |
| if (duplicateNamedGroupId) |
| m_bodyDisjunction->terms[beginTerm].atom.parenIds.duplicateNamedGroupId = duplicateNamedGroupId; |
| } |
| |
| m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount; |
| m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount; |
| m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; |
| m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; |
| } |
| |
| void atomParenthesesOnceEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) |
| { |
| unsigned beginTerm = popParenthesesStack(); |
| closeAlternative(beginTerm + 1); |
| unsigned endTerm = m_bodyDisjunction->terms.size(); |
| |
| ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::Type::ParenthesesSubpatternOnceBegin); |
| |
| bool capture = m_bodyDisjunction->terms[beginTerm].capture(); |
| unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].subpatternId(); |
| |
| m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::Type::ParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition)); |
| if (m_bodyDisjunction->terms[beginTerm].matchDirection() == Backward) { |
| // Swap input positions for backward captures. |
| m_bodyDisjunction->terms[endTerm].inputPosition = m_bodyDisjunction->terms[beginTerm].inputPosition; |
| m_bodyDisjunction->terms[beginTerm].inputPosition = inputPosition; |
| } |
| |
| if (m_pattern.hasDuplicateNamedCaptureGroups() && m_bodyDisjunction->terms[beginTerm].capture()) { |
| auto duplicateNamedGroupId = m_pattern.m_duplicateNamedGroupForSubpatternId[subpatternId]; |
| if (duplicateNamedGroupId) { |
| m_bodyDisjunction->terms[endTerm].atom.parenIds.duplicateNamedGroupId = duplicateNamedGroupId; |
| m_bodyDisjunction->terms[beginTerm].atom.parenIds.duplicateNamedGroupId = duplicateNamedGroupId; |
| } |
| } |
| |
| m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; |
| m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; |
| m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; |
| m_bodyDisjunction->terms[endTerm].m_matchDirection = m_bodyDisjunction->terms[beginTerm].matchDirection(); |
| |
| m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount; |
| m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount; |
| m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; |
| m_bodyDisjunction->terms[endTerm].atom.quantityMinCount = quantityMinCount; |
| m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount; |
| m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; |
| } |
| |
| void atomParenthesesTerminalEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType) |
| { |
| unsigned beginTerm = popParenthesesStack(); |
| closeAlternative(beginTerm + 1); |
| unsigned endTerm = m_bodyDisjunction->terms.size(); |
| |
| ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::Type::ParenthesesSubpatternTerminalBegin); |
| |
| if (m_bodyDisjunction->terms[beginTerm].matchDirection() == Backward) |
| inputPosition = 0; |
| bool capture = m_bodyDisjunction->terms[beginTerm].capture(); |
| unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].subpatternId(); |
| |
| m_bodyDisjunction->terms.append(ByteTerm(ByteTerm::Type::ParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition)); |
| m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; |
| m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; |
| m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; |
| |
| if (m_pattern.hasDuplicateNamedCaptureGroups() && m_bodyDisjunction->terms[beginTerm].capture()) { |
| auto duplicateNamedGroupId = m_pattern.m_duplicateNamedGroupForSubpatternId[subpatternId]; |
| if (duplicateNamedGroupId) { |
| m_bodyDisjunction->terms[endTerm].atom.parenIds.duplicateNamedGroupId = duplicateNamedGroupId; |
| m_bodyDisjunction->terms[beginTerm].atom.parenIds.duplicateNamedGroupId = duplicateNamedGroupId; |
| } |
| } |
| |
| m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount; |
| m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount; |
| m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; |
| m_bodyDisjunction->terms[endTerm].atom.quantityMinCount = quantityMinCount; |
| m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount; |
| m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; |
| } |
| |
| void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough) |
| { |
| m_bodyDisjunction = makeUnique<ByteDisjunction>(numSubpatterns, callFrameSize); |
| m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeBegin(onceThrough)); |
| m_bodyDisjunction->terms[0].frameLocation = 0; |
| m_currentAlternativeIndex = 0; |
| } |
| |
| void regexEnd() |
| { |
| closeBodyAlternative(); |
| } |
| |
| void alternativeBodyDisjunction(bool onceThrough) |
| { |
| unsigned newAlternativeIndex = m_bodyDisjunction->terms.size(); |
| m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; |
| m_bodyDisjunction->terms.append(ByteTerm::BodyAlternativeDisjunction(onceThrough)); |
| |
| m_currentAlternativeIndex = newAlternativeIndex; |
| } |
| |
| void alternativeDisjunction() |
| { |
| unsigned newAlternativeIndex = m_bodyDisjunction->terms.size(); |
| m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; |
| m_bodyDisjunction->terms.append(ByteTerm::AlternativeDisjunction()); |
| |
| m_currentAlternativeIndex = newAlternativeIndex; |
| } |
| |
| std::optional<ErrorCode> WARN_UNUSED_RETURN emitDisjunction(PatternDisjunction* disjunction, CheckedUint32 inputCountAlreadyChecked, unsigned parenthesesInputCountAlreadyChecked, MatchDirection matchDirection = Forward) |
| { |
| if (UNLIKELY(!isSafeToRecurse())) |
| return ErrorCode::TooManyDisjunctions; |
| |
| for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { |
| auto currentCountAlreadyChecked = inputCountAlreadyChecked; |
| |
| PatternAlternative* alternative = disjunction->m_alternatives[alt].get(); |
| |
| if (alt) { |
| if (disjunction == m_pattern.m_body) |
| alternativeBodyDisjunction(alternative->onceThrough()); |
| else |
| alternativeDisjunction(); |
| } |
| |
| unsigned minimumSize = alternative->m_minimumSize; |
| ASSERT(matchDirection == Backward || minimumSize >= parenthesesInputCountAlreadyChecked); |
| |
| unsigned countToCheck = 0; |
| |
| if (matchDirection == Forward) |
| countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; |
| else if (minimumSize > parenthesesInputCountAlreadyChecked) { |
| countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; |
| haveCheckedInput(minimumSize); |
| } else if (minimumSize > disjunction->m_minimumSize) { |
| countToCheck = minimumSize - disjunction->m_minimumSize; |
| haveCheckedInput(currentCountAlreadyChecked); |
| } |
| |
| if (countToCheck) { |
| if (matchDirection == Forward) |
| checkInput(countToCheck); |
| |
| currentCountAlreadyChecked += countToCheck; |
| if (currentCountAlreadyChecked.hasOverflowed()) |
| return ErrorCode::OffsetTooLarge; |
| } |
| |
| auto termCount = alternative->m_terms.size(); |
| for (unsigned i = 0; i < termCount; ++i) { |
| unsigned termIndex = matchDirection == Forward ? i : termCount - 1 - i; |
| auto& term = alternative->m_terms[termIndex]; |
| |
| auto currentInputPosition = [&]() { |
| return currentCountAlreadyChecked - term.inputPosition; |
| }; |
| |
| switch (term.type) { |
| case PatternTerm::Type::AssertionBOL: |
| assertionBOL(currentInputPosition()); |
| break; |
| |
| case PatternTerm::Type::AssertionEOL: |
| assertionEOL(currentInputPosition()); |
| break; |
| |
| case PatternTerm::Type::AssertionWordBoundary: |
| assertionWordBoundary(term.invert(), matchDirection, currentInputPosition()); |
| break; |
| |
| case PatternTerm::Type::PatternCharacter: |
| atomPatternCharacter(term.patternCharacter, matchDirection, currentInputPosition(), term.frameLocation, term.quantityMaxCount, term.quantityType); |
| break; |
| |
| case PatternTerm::Type::CharacterClass: |
| atomCharacterClass(term.characterClass, term.invert(), matchDirection, currentInputPosition(), term.frameLocation, term.quantityMaxCount, term.quantityType); |
| break; |
| |
| case PatternTerm::Type::BackReference: |
| atomBackReference(term.backReferenceSubpatternId, matchDirection, currentInputPosition(), term.frameLocation, term.quantityMaxCount, term.quantityType); |
| break; |
| |
| case PatternTerm::Type::ForwardReference: |
| break; |
| |
| case PatternTerm::Type::ParenthesesSubpattern: { |
| unsigned disjunctionAlreadyCheckedCount = 0; |
| if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) { |
| unsigned alternativeFrameLocation = term.frameLocation; |
| // For QuantifierType::FixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame. |
| if (term.quantityType == QuantifierType::FixedCount) |
| disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; |
| else |
| alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; |
| unsigned delegateEndInputOffset = currentInputPosition(); |
| atomParenthesesOnceBegin(term.parentheses.subpatternId, matchDirection, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, alternativeFrameLocation); |
| if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount, matchDirection)) |
| return error; |
| atomParenthesesOnceEnd(delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType); |
| } else if (term.parentheses.isTerminal) { |
| unsigned delegateEndInputOffset = currentInputPosition(); |
| atomParenthesesTerminalBegin(term.parentheses.subpatternId, matchDirection, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesTerminal); |
| if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, disjunctionAlreadyCheckedCount, matchDirection)) |
| return error; |
| atomParenthesesTerminalEnd(delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType); |
| } else { |
| unsigned delegateEndInputOffset = currentInputPosition(); |
| atomParenthesesSubpatternBegin(term.parentheses.subpatternId, matchDirection, term.capture(), disjunctionAlreadyCheckedCount + delegateEndInputOffset, term.frameLocation, 0); |
| unsigned inputOffset = 0; |
| if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, inputOffset, matchDirection)) |
| return error; |
| atomParenthesesSubpatternEnd(term.parentheses.lastSubpatternId, delegateEndInputOffset, term.frameLocation, term.quantityMinCount, term.quantityMaxCount, term.quantityType, term.parentheses.disjunction->m_callFrameSize); |
| } |
| break; |
| } |
| |
| case PatternTerm::Type::ParentheticalAssertion: { |
| unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion; |
| unsigned positiveInputOffset = currentInputPosition(); |
| if (term.matchDirection() == Forward) { |
| unsigned uncheckAmount = 0; |
| if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) { |
| uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize; |
| uncheckInput(uncheckAmount); |
| currentCountAlreadyChecked -= uncheckAmount; |
| if (currentCountAlreadyChecked.hasOverflowed()) |
| return ErrorCode::OffsetTooLarge; |
| } |
| |
| atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.matchDirection(), term.frameLocation, alternativeFrameLocation); |
| if (auto error = emitDisjunction(term.parentheses.disjunction, currentCountAlreadyChecked, positiveInputOffset - uncheckAmount, term.matchDirection())) |
| return error; |
| atomParentheticalAssertionEnd(term.parentheses.lastSubpatternId, term.frameLocation, term.quantityMaxCount, term.quantityType); |
| if (uncheckAmount) { |
| checkInput(uncheckAmount); |
| currentCountAlreadyChecked += uncheckAmount; |
| if (currentCountAlreadyChecked.hasOverflowed()) |
| return ErrorCode::OffsetTooLarge; |
| } |
| } else { // Backward |
| unsigned uncheckAmount = 0; |
| CheckedUint32 checkedCountForLookbehind = currentCountAlreadyChecked; |
| ASSERT(checkedCountForLookbehind >= term.inputPosition); |
| checkedCountForLookbehind -= term.inputPosition; |
| auto minimumSize = term.parentheses.disjunction->m_minimumSize; |
| if (minimumSize) { |
| checkedCountForLookbehind += minimumSize; |
| if (checkedCountForLookbehind.hasOverflowed()) |
| return ErrorCode::OffsetTooLarge; |
| if (checkedCountForLookbehind > currentCountAlreadyChecked |
| && !term.invert()) { |
| // Do a quick check for what is required for the lookbehind. |
| // An inverted lookbehind can "match" without processing any input. |
| haveCheckedInput(checkedCountForLookbehind); |
| } |
| } |
| atomParentheticalAssertionBegin(term.parentheses.subpatternId, term.invert(), term.matchDirection(), term.frameLocation, alternativeFrameLocation); |
| |
| if (auto error = emitDisjunction(term.parentheses.disjunction, checkedCountForLookbehind, positiveInputOffset + minimumSize, term.matchDirection())) |
| return error; |
| atomParentheticalAssertionEnd(term.parentheses.lastSubpatternId, term.frameLocation, term.quantityMaxCount, term.quantityType); |
| |
| if (uncheckAmount) { |
| checkInput(uncheckAmount); |
| currentCountAlreadyChecked += uncheckAmount; |
| if (currentCountAlreadyChecked.hasOverflowed()) |
| return ErrorCode::OffsetTooLarge; |
| } |
| } |
| break; |
| } |
| |
| case PatternTerm::Type::DotStarEnclosure: |
| assertionDotStarEnclosure(term.anchors.bolAnchor, term.anchors.eolAnchor); |
| break; |
| } |
| } |
| |
| if (matchDirection == Backward && countToCheck) |
| uncheckInput(countToCheck); |
| } |
| return std::nullopt; |
| } |
| |
| private: |
| inline bool isSafeToRecurse() { return m_stackCheck.isSafeToRecurse(); } |
| |
| YarrPattern& m_pattern; |
| std::unique_ptr<ByteDisjunction> m_bodyDisjunction; |
| StackCheck m_stackCheck; |
| unsigned m_currentAlternativeIndex { 0 }; |
| Vector<ParenthesesStackEntry> m_parenthesesStack; |
| Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo; |
| }; |
| |
| void ByteTermDumper::dumpTerm(size_t idx, ByteTerm term) |
| { |
| PrintStream& out = WTF::dataFile(); |
| |
| auto outputTermIndexAndNest = [&](size_t index, unsigned termNesting) { |
| if (!m_recursiveDump) |
| termNesting = 1; |
| |
| for (unsigned lineIndent = 0; lineIndent < m_lineIndent; lineIndent++) |
| out.print(" "); |
| out.printf("%4zu", index); |
| for (unsigned nestingDepth = 0; nestingDepth < termNesting; nestingDepth++) |
| out.print(" "); |
| }; |
| |
| auto dumpQuantity = [&](ByteTerm& term) { |
| if (term.atom.quantityType == QuantifierType::FixedCount) { |
| if (term.atom.quantityMaxCount > 1) |
| out.print(" {", term.atom.quantityMaxCount, "}"); |
| return; |
| } |
| |
| out.print(" {", term.atom.quantityMinCount); |
| if (term.atom.quantityMaxCount == UINT_MAX) |
| out.print(",inf"); |
| else |
| out.print(",", term.atom.quantityMaxCount); |
| out.print("}"); |
| |
| if (term.atom.quantityType == QuantifierType::Greedy) |
| out.print(" greedy"); |
| else if (term.atom.quantityType == QuantifierType::NonGreedy) |
| out.print(" non-greedy"); |
| }; |
| |
| auto dumpCaptured = [&](ByteTerm& term) { |
| if (term.capture()) |
| out.print(" captured (#", term.subpatternId(), ")"); |
| }; |
| |
| auto dumpInverted = [&](ByteTerm& term) { |
| if (term.invert()) |
| out.print(" inverted"); |
| }; |
| |
| auto dumpMatchDirection = [&](ByteTerm& term) { |
| if (term.matchDirection() == Backward) { |
| if (term.type == ByteTerm::Type::ParentheticalAssertionBegin |
| || term.type == ByteTerm::Type::ParentheticalAssertionEnd) { |
| out.print(" lookbehind"); |
| } else |
| out.print(" backward"); |
| } |
| }; |
| |
| auto dumpInputPosition = [&](ByteTerm& term) { |
| out.printf(" inputPosition %u", term.inputPosition); |
| }; |
| |
| auto dumpFrameLocation = [&](ByteTerm& term) { |
| out.printf(" frameLocation %u", term.frameLocation); |
| }; |
| |
| auto dumpCharacter = [&](ByteTerm& term) { |
| out.print(" "); |
| dumpUChar32(out, term.atom.patternCharacter); |
| }; |
| |
| auto dumpCharClass = [&](ByteTerm& term) { |
| out.print(" "); |
| dumpCharacterClass(out, m_pattern, term.atom.characterClass); |
| }; |
| |
| switch (term.type) { |
| case ByteTerm::Type::BodyAlternativeBegin: |
| outputTermIndexAndNest(idx, m_nesting++); |
| out.print("BodyAlternativeBegin"); |
| if (term.alternative.onceThrough) |
| out.print(" onceThrough"); |
| break; |
| case ByteTerm::Type::BodyAlternativeDisjunction: |
| outputTermIndexAndNest(idx, m_nesting - 1); |
| out.print("BodyAlternativeDisjunction"); |
| break; |
| case ByteTerm::Type::BodyAlternativeEnd: |
| outputTermIndexAndNest(idx, --m_nesting); |
| out.print("BodyAlternativeEnd"); |
| break; |
| case ByteTerm::Type::AlternativeBegin: |
| outputTermIndexAndNest(idx, m_nesting++); |
| out.print("AlternativeBegin"); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::AlternativeDisjunction: |
| outputTermIndexAndNest(idx, m_nesting - 1); |
| out.print("AlternativeDisjunction"); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::AlternativeEnd: |
| outputTermIndexAndNest(idx, --m_nesting); |
| out.print("AlternativeEnd"); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::SubpatternBegin: |
| outputTermIndexAndNest(idx, m_nesting++); |
| out.print("SubpatternBegin"); |
| dumpMatchDirection(term); |
| break; |
| case ByteTerm::Type::SubpatternEnd: |
| outputTermIndexAndNest(idx, --m_nesting); |
| out.print("SubpatternEnd"); |
| dumpMatchDirection(term); |
| break; |
| case ByteTerm::Type::AssertionBOL: |
| outputTermIndexAndNest(idx, m_nesting); |
| out.print("AssertionBOL"); |
| break; |
| case ByteTerm::Type::AssertionEOL: |
| outputTermIndexAndNest(idx, m_nesting); |
| out.print("AssertionEOL"); |
| break; |
| case ByteTerm::Type::AssertionWordBoundary: |
| outputTermIndexAndNest(idx, m_nesting); |
| out.print("AssertionWordBoundary"); |
| dumpInverted(term); |
| dumpMatchDirection(term); |
| break; |
| case ByteTerm::Type::PatternCharacterOnce: |
| outputTermIndexAndNest(idx, m_nesting); |
| out.print("PatternCharacterOnce"); |
| dumpInverted(term); |
| dumpInputPosition(term); |
| dumpCharacter(term); |
| dumpQuantity(term); |
| dumpMatchDirection(term); |
| break; |
| case ByteTerm::Type::PatternCharacterFixed: |
| outputTermIndexAndNest(idx, m_nesting); |
| out.print("PatternCharacterFixed"); |
| dumpInverted(term); |
| dumpInputPosition(term); |
| dumpFrameLocation(term); |
| dumpCharacter(term); |
| out.print(" {", term.atom.quantityMinCount, "}"); |
| dumpMatchDirection(term); |
| break; |
| case ByteTerm::Type::PatternCharacterGreedy: |
| outputTermIndexAndNest(idx, m_nesting); |
| out.print("PatternCharacterGreedy"); |
| dumpInverted(term); |
| dumpInputPosition(term); |
| dumpFrameLocation(term); |
| dumpCharacter(term); |
| dumpQuantity(term); |
| dumpMatchDirection(term); |
| break; |
| case ByteTerm::Type::PatternCharacterNonGreedy: |
| outputTermIndexAndNest(idx, m_nesting); |
| out.print("PatternCharacterNonGreedy"); |
| dumpInverted(term); |
| dumpInputPosition(term); |
| dumpFrameLocation(term); |
| dumpCharacter(term); |
| dumpQuantity(term); |
| dumpMatchDirection(term); |
| break; |
| case ByteTerm::Type::PatternCasedCharacterOnce: |
| outputTermIndexAndNest(idx, m_nesting); |
| out.print("PatternCasedCharacterOnce"); |
| dumpMatchDirection(term); |
| break; |
| case ByteTerm::Type::PatternCasedCharacterFixed: |
| outputTermIndexAndNest(idx, m_nesting); |
| out.print("PatternCasedCharacterFixed"); |
| dumpMatchDirection(term); |
| break; |
| case ByteTerm::Type::PatternCasedCharacterGreedy: |
| outputTermIndexAndNest(idx, m_nesting); |
| out.print("PatternCasedCharacterGreedy"); |
| dumpMatchDirection(term); |
| break; |
| case ByteTerm::Type::PatternCasedCharacterNonGreedy: |
| outputTermIndexAndNest(idx, m_nesting); |
| out.print("PatternCasedCharacterNonGreedy"); |
| dumpMatchDirection(term); |
| break; |
| case ByteTerm::Type::CharacterClass: |
| outputTermIndexAndNest(idx, m_nesting); |
| out.print("CharacterClass"); |
| dumpInverted(term); |
| dumpInputPosition(term); |
| if (term.atom.quantityType != QuantifierType::FixedCount || eitherUnicode()) |
| dumpFrameLocation(term); |
| dumpCharClass(term); |
| dumpQuantity(term); |
| dumpMatchDirection(term); |
| break; |
| case ByteTerm::Type::BackReference: |
| // Need to update this for named capture group back references |
| outputTermIndexAndNest(idx, m_nesting); |
| out.print("BackReference #", term.subpatternId()); |
| dumpInputPosition(term); |
| dumpQuantity(term); |
| break; |
| case ByteTerm::Type::ParenthesesSubpattern: |
| outputTermIndexAndNest(idx, m_nesting); |
| out.print("ParenthesesSubpattern"); |
| dumpCaptured(term); |
| dumpInverted(term); |
| dumpMatchDirection(term); |
| dumpInputPosition(term); |
| dumpFrameLocation(term); |
| dumpQuantity(term); |
| if (m_recursiveDump) { |
| out.print("\n"); |
| dumpDisjunction(term.atom.parenthesesDisjunction, m_nesting); |
| } |
| break; |
| case ByteTerm::Type::ParenthesesSubpatternOnceBegin: |
| outputTermIndexAndNest(idx, m_nesting++); |
| out.print("ParenthesesSubpatternOnceBegin"); |
| dumpCaptured(term); |
| dumpInverted(term); |
| dumpMatchDirection(term); |
| dumpInputPosition(term); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::ParenthesesSubpatternOnceEnd: |
| outputTermIndexAndNest(idx, --m_nesting); |
| out.print("ParenthesesSubpatternOnceEnd"); |
| dumpCaptured(term); |
| dumpInverted(term); |
| dumpMatchDirection(term); |
| dumpInputPosition(term); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::ParenthesesSubpatternTerminalBegin: |
| outputTermIndexAndNest(idx, m_nesting++); |
| out.print("ParenthesesSubpatternTerminalBegin"); |
| dumpInverted(term); |
| dumpMatchDirection(term); |
| dumpInputPosition(term); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::ParenthesesSubpatternTerminalEnd: |
| outputTermIndexAndNest(idx, --m_nesting); |
| out.print("ParenthesesSubpatternTerminalEnd"); |
| dumpInverted(term); |
| dumpMatchDirection(term); |
| dumpInputPosition(term); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::ParentheticalAssertionBegin: |
| outputTermIndexAndNest(idx, m_nesting++); |
| out.print("ParentheticalAssertionBegin"); |
| dumpInverted(term); |
| dumpMatchDirection(term); |
| dumpInputPosition(term); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::ParentheticalAssertionEnd: |
| outputTermIndexAndNest(idx, --m_nesting); |
| out.print("ParentheticalAssertionEnd"); |
| dumpInverted(term); |
| dumpMatchDirection(term); |
| dumpInputPosition(term); |
| dumpFrameLocation(term); |
| break; |
| case ByteTerm::Type::CheckInput: |
| outputTermIndexAndNest(idx, m_nesting); |
| out.print("CheckInput ", term.checkInputCount); |
| break; |
| case ByteTerm::Type::UncheckInput: |
| outputTermIndexAndNest(idx, m_nesting); |
| out.print("UncheckInput ", term.checkInputCount); |
| break; |
| case ByteTerm::Type::HaveCheckedInput: |
| outputTermIndexAndNest(idx, m_nesting); |
| out.print("HaveCheckedInput ", term.checkInputCount); |
| break; |
| case ByteTerm::Type::DotStarEnclosure: |
| outputTermIndexAndNest(idx, m_nesting); |
| out.print("DotStarEnclosure"); |
| break; |
| } |
| } |
| |
| void ByteTermDumper::dumpDisjunction(ByteDisjunction* disjunction, unsigned nesting) |
| { |
| PrintStream& out = WTF::dataFile(); |
| |
| unsigned savedLineIndent = m_lineIndent; |
| if (!nesting) { |
| out.printf("ByteDisjunction(%p):\n", disjunction); |
| m_recursiveDump = true; |
| m_nesting = 1; |
| } else |
| m_lineIndent = nesting - 1; |
| |
| for (size_t idx = 0; idx < disjunction->terms.size(); ++idx) { |
| ByteTerm term = disjunction->terms[idx]; |
| |
| dumpTerm(idx, term); |
| |
| if (term.type != ByteTerm::Type::ParenthesesSubpattern) |
| out.print("\n"); |
| } |
| |
| m_lineIndent = savedLineIndent; |
| } |
| |
| std::unique_ptr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator, ErrorCode& errorCode, ConcurrentJSLock* lock) |
| { |
| return ByteCompiler(pattern).compile(allocator, lock, errorCode); |
| } |
| |
| unsigned interpret(BytecodePattern* bytecode, StringView input, unsigned start, unsigned* output) |
| { |
| SuperSamplerScope superSamplerScope(false); |
| if (input.is8Bit()) |
| return Interpreter<LChar>(bytecode, output, input.characters8(), input.length(), start).interpret(); |
| return Interpreter<UChar>(bytecode, output, input.characters16(), input.length(), start).interpret(); |
| } |
| |
| unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output) |
| { |
| SuperSamplerScope superSamplerScope(false); |
| return Interpreter<LChar>(bytecode, output, input, length, start).interpret(); |
| } |
| |
| unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output) |
| { |
| SuperSamplerScope superSamplerScope(false); |
| return Interpreter<UChar>(bytecode, output, input, length, start).interpret(); |
| } |
| |
| // These should be the same for both UChar & LChar. |
| static_assert(sizeof(BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t))); |
| static_assert(sizeof(BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t))); |
| static_assert(sizeof(BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t))); |
| static_assert(sizeof(BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t))); |
| static_assert(sizeof(BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t))); |
| static_assert(sizeof(BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t))); |
| static_assert(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) <= (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t))); |
| |
| |
| } } |