blob: 9157e2163720570029d0bd5ee6b2b43d7132e869 [file] [log] [blame]
//-------------------------------------------------------------------------------------------------------
// Copyright (C) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
//-------------------------------------------------------------------------------------------------------
#include "ParserPch.h"
namespace UnifiedRegex
{
template <typename C>
void TextbookBoyerMooreSetup<C>::Init()
{
Assert(patLen > 0);
for (uint i = 0; i < MaxCharMapLinearChars; i++)
{
lastOcc[i] = -1;
}
numLinearChars = 1;
// Always put the last character in the first index
linearChar[0] = pat[patLen - 1];
for (CharCount i = 0; i < patLen; i++)
{
if (numLinearChars <= MaxCharMapLinearChars)
{
uint j = 0;
for (; j < numLinearChars; j++)
{
if (linearChar[j] == pat[i])
{
lastOcc[j] = i;
break;
}
}
if (j == numLinearChars)
{
if (numLinearChars < MaxCharMapLinearChars)
{
linearChar[numLinearChars] = pat[i];
lastOcc[numLinearChars] = i;
}
numLinearChars++;
}
}
if (numLinearChars > MaxCharMapLinearChars)
{
break;
}
}
if (numLinearChars <= MaxCharMapLinearChars)
{
scheme = LinearScheme;
}
else
{
scheme = DefaultScheme;
}
}
template <typename C>
void TextbookBoyerMoore<C>::Setup(ArenaAllocator* allocator, TextbookBoyerMooreSetup<C> const& info)
{
Assert(info.GetScheme() == TextbookBoyerMooreSetup<C>::DefaultScheme);
this->Setup(allocator, info.pat, info.patLen, 1);
}
template <typename C>
void TextbookBoyerMoore<C>::Setup(ArenaAllocator * allocator, const Char * pat, CharCount patLen, int skip)
{
// character c |-> index of last occurrence of c in pat, otherwise -1
for (CharCount i = 0; i < patLen; i++)
{
for (int j = 0; j < skip; j++)
lastOccurrence.Set(allocator, pat[i * skip + j], i);
}
goodSuffix = TextbookBoyerMooreSetup<C>::GetGoodSuffix(allocator, pat, patLen, skip);
}
template <typename C>
int32 * TextbookBoyerMooreSetup<C>::GetGoodSuffix(ArenaAllocator* allocator, const Char * pat, CharCount patLen, int skip)
{
// pat offset q |-> longest prefix of pat which is a proper suffix of pat[0..q]
// (thanks to equivalence classes being in canonical order we only need to look at the first
// character of each skip grouping in the pattern)
int32* prefix = AnewArray(allocator, int32, patLen);
prefix[0] = 0;
int32 k = 0;
for (CharCount q = 1; q < patLen; q++)
{
while (k > 0 && pat[k * skip] != pat[q * skip])
k = prefix[k - 1];
if (pat[k * skip] == pat[q * skip])
k++;
prefix[q] = k;
}
// As above, but for rev(pat)
int32* revPrefix = AnewArray(allocator, int32, patLen);
revPrefix[0] = 0;
k = 0;
for (CharCount q = 1; q < patLen; q++)
{
while (k > 0 && pat[(patLen - k - 1) * skip] != pat[(patLen - q - 1) * skip])
k = revPrefix[k - 1];
if (pat[(patLen - k - 1) * skip] == pat[(patLen - q - 1) * skip])
k++;
revPrefix[q] = k;
}
// pat prefix length l |-> least shift s.t. pat[0..l-1] is not mismatched
int32 * goodSuffix = AnewArray(allocator, int32, patLen + 1);
for (CharCount j = 0; j <= patLen; j++)
goodSuffix[j] = patLen - prefix[patLen - 1];
for (CharCount l = 1; l <= patLen; l++)
{
CharCount j = patLen - revPrefix[l - 1];
int32 s = l - revPrefix[l - 1];
if (goodSuffix[j] > s)
goodSuffix[j] = s;
}
// shift above one to the left
for (CharCount j = 0; j < patLen; j++)
goodSuffix[j] = goodSuffix[j + 1];
AdeleteArray(allocator, patLen, prefix);
AdeleteArray(allocator, patLen, revPrefix);
return goodSuffix;
}
template <typename C>
void TextbookBoyerMoore<C>::FreeBody(ArenaAllocator* allocator, CharCount patLen)
{
if(goodSuffix)
{
AdeleteArray(allocator, patLen + 1, PointerValue(goodSuffix));
#if DBG
goodSuffix = nullptr;
#endif
}
lastOccurrence.FreeBody(allocator);
}
template <uint equivClassSize, uint compareCount>
static bool MatchPatternAt(uint inputChar, char16 const * pat, CharCount index);
template <>
bool MatchPatternAt<1, 1>(uint inputChar, char16 const* pat, CharCount index)
{
return inputChar == pat[index];
}
template <>
bool MatchPatternAt<CaseInsensitive::EquivClassSize, CaseInsensitive::EquivClassSize>(uint inputChar, char16 const * pat, CharCount index)
{
CompileAssert(CaseInsensitive::EquivClassSize == 4);
return inputChar == pat[index * CaseInsensitive::EquivClassSize]
|| inputChar == pat[index * CaseInsensitive::EquivClassSize + 1]
|| inputChar == pat[index * CaseInsensitive::EquivClassSize + 2]
|| inputChar == pat[index * CaseInsensitive::EquivClassSize + 3];
}
template <>
bool MatchPatternAt<CaseInsensitive::EquivClassSize, 1>(uint inputChar, char16 const * pat, CharCount index)
{
CompileAssert(CaseInsensitive::EquivClassSize == 4);
return inputChar == pat[index * 4];
}
template <typename C>
template <uint equivClassSize, uint lastPatCharEquivClass>
bool TextbookBoyerMoore<C>::Match
( const Char *const input
, const CharCount inputLength
, CharCount& inputOffset
, const Char* pat
, const CharCount patLen
#if ENABLE_REGEX_CONFIG_OPTIONS
, RegexStats* stats
#endif
) const
{
Assert(input != 0);
Assert(inputOffset <= inputLength);
if (inputLength < patLen)
return false;
CharCount offset = inputOffset;
const CharCount endOffset = inputLength - (patLen - 1);
const int32* const localGoodSuffix = goodSuffix;
const LastOccMap* const localLastOccurrence = &lastOccurrence;
const CharCount lastPatCharIndex = (patLen - 1);
while (offset < endOffset)
{
// A separate tight loop to find the last character
while (true)
{
uint inputChar = Chars<Char>::CTU(input[offset + lastPatCharIndex]);
if (MatchPatternAt<equivClassSize, lastPatCharEquivClass>(inputChar, pat, lastPatCharIndex))
{
// Found a match. Break out of this loop and go to the match pattern loop
break;
}
// Negative case is more common,
// Write the checks so that we have a super tight loop
int lastOcc;
if (inputChar < localLastOccurrence->GetDirectMapSize())
{
if (!localLastOccurrence->IsInDirectMap(inputChar))
{
offset += patLen;
if (offset >= endOffset)
{
return false;
}
continue;
}
lastOcc = localLastOccurrence->GetDirectMap(inputChar);
}
else if (!localLastOccurrence->GetNonDirect(inputChar, lastOcc))
{
offset += patLen;
if (offset >= endOffset)
{
return false;
}
continue;
}
Assert((int)lastPatCharIndex - lastOcc >= localGoodSuffix[lastPatCharIndex]);
offset += lastPatCharIndex - lastOcc;
if (offset >= endOffset)
{
return false;
}
}
// CONSIDER: we can remove this check if we stop using TextbookBoyerMoore for one char pattern
if (lastPatCharIndex == 0)
{
inputOffset = offset;
return true;
}
// Match the rest of the pattern
int32 j = lastPatCharIndex - 1;
while (true)
{
#if ENABLE_REGEX_CONFIG_OPTIONS
if (stats != 0)
stats->numCompares++;
#endif
uint inputChar = Chars<Char>::CTU(input[offset + j]);
if (!MatchPatternAt<equivClassSize, equivClassSize>(inputChar, pat, j))
{
const int32 e = j - localLastOccurrence->Get((Char)inputChar);
offset += e > localGoodSuffix[j] ? e : localGoodSuffix[j];
break;
}
if (--j < 0)
{
inputOffset = offset;
return true;
}
}
}
return false;
}
// Specialized linear char map version
template <typename C>
void TextbookBoyerMooreWithLinearMap<C>::Setup(ArenaAllocator * allocator, TextbookBoyerMooreSetup<C> const& setup)
{
Assert(setup.GetScheme() == TextbookBoyerMooreSetup<C>::LinearScheme);
lastOccurrence.Set(setup.numLinearChars, setup.linearChar, setup.lastOcc);
goodSuffix = TextbookBoyerMooreSetup<C>::GetGoodSuffix(allocator, setup.pat, setup.patLen);
}
template <typename C>
void TextbookBoyerMooreWithLinearMap<C>::FreeBody(ArenaAllocator* allocator, CharCount patLen)
{
if(goodSuffix)
{
AdeleteArray(allocator, patLen + 1, goodSuffix);
#if DBG
goodSuffix = 0;
#endif
}
}
template <typename C>
template <uint equivClassSize>
bool TextbookBoyerMooreWithLinearMap<C>::Match
( const Char *const input
, const CharCount inputLength
, CharCount& inputOffset
, const Char* pat
, const CharCount patLen
#if ENABLE_REGEX_CONFIG_OPTIONS
, RegexStats* stats
#endif
) const
{
CompileAssert(equivClassSize == 1);
Assert(input != 0);
Assert(inputOffset <= inputLength);
if (inputLength < patLen)
return false;
const int32* const localGoodSuffix = goodSuffix;
const LastOccMap* const localLastOccurrence = &lastOccurrence;
CharCount offset = inputOffset;
const CharCount lastPatCharIndex = (patLen - 1);
const CharCount endOffset = inputLength - lastPatCharIndex;
// Using int size instead of Char value is faster
const uint lastPatChar = pat[lastPatCharIndex];
Assert(lastPatChar == localLastOccurrence->GetChar(0));
while (offset < endOffset)
{
// A separate tight loop to find the last character
while (true)
{
#if ENABLE_REGEX_CONFIG_OPTIONS
if (stats != 0)
stats->numCompares++;
#endif
uint inputChar = Chars<Char>::CTU(input[offset + lastPatCharIndex]);
if (inputChar == lastPatChar)
{
// Found a match. Break out of this loop and go to the match pattern loop
break;
}
// Negative case is more common,
// Write the checks so that we have a super tight loop
Assert(inputChar != localLastOccurrence->GetChar(0));
int32 lastOcc;
if (localLastOccurrence->GetChar(1) != inputChar)
{
if (localLastOccurrence->GetChar(2) != inputChar)
{
if (localLastOccurrence->GetChar(3) != inputChar)
{
offset += patLen;
if (offset >= endOffset)
{
return false;
}
continue;
}
lastOcc = localLastOccurrence->GetLastOcc(3);
}
else
{
lastOcc = localLastOccurrence->GetLastOcc(2);
}
}
else
{
lastOcc = localLastOccurrence->GetLastOcc(1);
}
Assert((int)lastPatCharIndex - lastOcc >= localGoodSuffix[lastPatCharIndex]);
offset += lastPatCharIndex - lastOcc;
if (offset >= endOffset)
{
return false;
}
}
// CONSIDER: we can remove this check if we stop using
// TextbookBoyerMoore for one char pattern
if (lastPatCharIndex == 0)
{
inputOffset = offset;
return true;
}
// Match the rest of the pattern
int32 j = lastPatCharIndex - 1;
while (true)
{
#if ENABLE_REGEX_CONFIG_OPTIONS
if (stats != 0)
stats->numCompares++;
#endif
uint inputChar = Chars<Char>::CTU(input[offset + j]);
if (inputChar != pat[j])
{
int goodSuffix = localGoodSuffix[j];
Assert(patLen <= MaxCharCount);
if (goodSuffix == (int)patLen)
{
offset += patLen;
}
else
{
const int32 e = j - localLastOccurrence->Get(inputChar);
offset += e > goodSuffix ? e : goodSuffix;
}
break;
}
if (--j < 0)
{
inputOffset = offset;
return true;
}
}
}
return false;
}
// explicit instantiation
template struct TextbookBoyerMooreSetup<char16>;
template class TextbookBoyerMoore<char16>;
template class TextbookBoyerMooreWithLinearMap<char16>;
template
bool TextbookBoyerMoore<char16>::Match<1>
( const Char *const input
, const CharCount inputLength
, CharCount& inputOffset
, const Char* pat
, const CharCount patLen
#if ENABLE_REGEX_CONFIG_OPTIONS
, RegexStats* stats
#endif
) const;
template
bool TextbookBoyerMoore<char16>::Match<CaseInsensitive::EquivClassSize>
( const Char *const input
, const CharCount inputLength
, CharCount& inputOffset
, const Char* pat
, const CharCount patLen
#if ENABLE_REGEX_CONFIG_OPTIONS
, RegexStats* stats
#endif
) const;
template
bool TextbookBoyerMoore<char16>::Match<CaseInsensitive::EquivClassSize, 1>
( const Char *const input
, const CharCount inputLength
, CharCount& inputOffset
, const Char* pat
, const CharCount patLen
#if ENABLE_REGEX_CONFIG_OPTIONS
, RegexStats* stats
#endif
) const;
template
bool TextbookBoyerMooreWithLinearMap<char16>::Match<1>
( const Char *const input
, const CharCount inputLength
, CharCount& inputOffset
, const Char* pat
, const CharCount patLen
#if ENABLE_REGEX_CONFIG_OPTIONS
, RegexStats* stats
#endif
) const;
}