blob: 97b0f19da0d7a31164f0503c87ea0d94a6c1d2e6 [file] [log] [blame]
//-------------------------------------------------------------------------------------------------------
// Copyright (C) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
//-------------------------------------------------------------------------------------------------------
#pragma once
namespace UnifiedRegex
{
template <typename C>
class StandardChars {};
class ASCIIChars : public Chars<char>
{
private:
enum CharClass : uint8
{
Word = 1 << 0,
Newline = 1 << 1,
Whitespace = 1 << 2,
Letter = 1 << 3,
Digit = 1 << 4,
Octal = 1 << 5,
Hex = 1 << 6
};
static const uint8 classes[NumChars];
static const uint8 values[NumChars];
public:
inline static bool IsWord(Char c)
{
return (classes[CTU(c)] & Word) != 0;
}
inline static bool IsNewline(Char c)
{
return (classes[CTU(c)] & Newline) != 0;
}
inline static bool IsWhitespace(Char c)
{
return (classes[CTU(c)] & Whitespace) != 0;
}
inline static bool IsLetter(Char c)
{
return (classes[CTU(c)] & Letter) != 0;
}
inline static bool IsDigit(Char c)
{
return (classes[CTU(c)] & Digit) != 0;
}
inline static bool IsOctal(Char c)
{
return (classes[CTU(c)] & Octal) != 0;
}
inline static bool IsHex(Char c)
{
return (classes[CTU(c)] & Hex) != 0;
}
inline static uint DigitValue(Char c)
{
return values[CTU(c)];
}
};
template <>
class StandardChars<uint8> : Chars<uint8>
{
public:
inline StandardChars(ArenaAllocator* allocator) {}
inline bool IsWord(Char c) const
{
return ASCIIChars::IsWord(ASCIIChars::UTC(CTU(c)));
}
inline bool IsNewline(Char c) const
{
return ASCIIChars::IsNewline(ASCIIChars::UTC(CTU(c)));
}
inline bool IsWhitespaceOrNewline(Char c) const
{
return ASCIIChars::IsWhitespace(ASCIIChars::UTC(CTU(c)));
}
inline bool IsLetter(Char c) const
{
return ASCIIChars::IsLetter(ASCIIChars::UTC(CTU(c)));
}
inline bool IsDigit(Char c) const
{
return ASCIIChars::IsDigit(ASCIIChars::UTC(CTU(c)));
}
inline bool IsOctal(Char c) const
{
return ASCIIChars::IsOctal(ASCIIChars::UTC(CTU(c)));
}
inline bool IsHex(Char c) const
{
return ASCIIChars::IsHex(ASCIIChars::UTC(CTU(c)));
}
inline uint DigitValue(Char c) const
{
return ASCIIChars::DigitValue(ASCIIChars::UTC(CTU(c)));
}
};
template <typename FallbackCaseMapper>
class CaseMapper
{
public:
CaseMapper(ArenaAllocator *allocator, CaseInsensitive::MappingSource mappingSource, const FallbackCaseMapper *fallbackMapper) :
toEquivs((uint64) -1),
fallbackMapper(fallbackMapper)
{
CompileAssert(sizeof(char16) == 2);
CompileAssert(sizeof(uint) > sizeof(char16));
const uint maxUChar = Chars<char16>::MaxUChar;
uint l = 0;
uint h = maxUChar;
uint tblidx = 0;
do {
uint acth;
char16 equivl[CaseInsensitive::EquivClassSize];
bool isNonTrivial = CaseInsensitive::RangeToEquivClassOnlyInSource(mappingSource, tblidx, l, h, acth, equivl);
if (isNonTrivial)
{
__assume(acth <= maxUChar); // property of algorithm: acth never greater than h
do
{
uint64 r = 0;
CompileAssert(sizeof(r) >= sizeof(char16) * CaseInsensitive::EquivClassSize);
for (int i = CaseInsensitive::EquivClassSize - 1; i >= 0; i--)
{
__assume(equivl[i] <= maxUChar); // property of algorithm: never map outside of range
r <<= 16;
r |= Chars<char16>::CTU(equivl[i]++);
}
toEquivs.Set(allocator, Chars<char16>::UTC(l++), r);
}
while (l <= acth);
}
else
{
l = acth + 1;
}
}
while (l <= h);
}
inline char16 ToCanonical(char16 c) const
{
uint64 r = toEquivs.Get(c);
return r == EQUIV_MISSING ? fallbackMapper->ToCanonical(c) : Chars<char16>::UTC(r & 0xffff);
}
CompileAssert(CaseInsensitive::EquivClassSize == 4);
inline bool ToEquivs(char16 c, __out_ecount(4) char16* equivs) const
{
uint64 r = toEquivs.Get(c);
if (r == EQUIV_MISSING)
{
return fallbackMapper->ToEquivs(c, equivs);
}
else
{
for (int i = 0; i < CaseInsensitive::EquivClassSize; i++)
{
equivs[i] = Chars<char16>::UTC(r & 0xffff);
r >>= 16;
}
return true;
}
}
inline bool IsTrivialString(const char16* str, CharCount strLen) const
{
for (CharCount i = 0; i < strLen; i++)
{
if (toEquivs.Get(str[i]) != EQUIV_MISSING)
return false;
}
return fallbackMapper->IsTrivialString(str, strLen);
}
private:
// Map character to:
// - -1 if trivial equivalence class
// - otherwise to four 16-bit fields: <equiv 4><equiv 3><equiv 2><equiv 1>
const static uint64 EQUIV_MISSING = static_cast<uint64>(-1);
CharMap<char16, uint64> toEquivs;
const FallbackCaseMapper *fallbackMapper;
};
class TrivialCaseMapper
{
public:
inline char16 ToCanonical(char16 c) const
{
return c;
}
CompileAssert(CaseInsensitive::EquivClassSize == 4);
inline bool ToEquivs(char16 c, __out_ecount(4) char16* equivs) const
{
for (int i = 0; i < CaseInsensitive::EquivClassSize; i++)
equivs[i] = c;
return false;
}
inline bool IsTrivialString(const char16* str, CharCount strLen) const
{
return true;
}
// This class is instantiated as a global const instance
// C++ requires that a default constructor be provided in that case
// See http://stackoverflow.com/questions/7411515/why-does-c-require-a-user-provided-default-constructor-to-default-construct-a
TrivialCaseMapper() {}
static const TrivialCaseMapper Instance;
};
template <>
class StandardChars<char16> : public Chars<char16>
{
private:
static const int numDigitPairs;
static const Char* const digitStr;
static const int numWhitespacePairs;
static const Char* const whitespaceStr;
static const int numWordPairs;
static const Char* const wordStr;
static const int numWordIUPairs;
static const Char* const wordIUStr;
static const int numNewlinePairs;
static const Char* const newlineStr;
ArenaAllocator* allocator;
typedef CaseMapper<TrivialCaseMapper> UnicodeDataCaseMapper;
const UnicodeDataCaseMapper unicodeDataCaseMapper;
typedef CaseMapper<UnicodeDataCaseMapper> CaseFoldingCaseMapper;
const CaseFoldingCaseMapper caseFoldingCaseMapper;
CharSet<Char>* fullSet;
CharSet<Char>* emptySet;
CharSet<Char>* wordSet;
CharSet<Char>* nonWordSet;
CharSet<Char>* wordIUSet;
CharSet<Char>* nonWordIUSet;
CharSet<Char>* newlineSet;
CharSet<Char>* whitespaceSet;
CharSet<Char>* surrogateUpperRange;
public:
StandardChars(ArenaAllocator* allocator);
inline bool IsWord(Char c) const
{
return CTU(c) < ASCIIChars::NumChars && ASCIIChars::IsWord(ASCIIChars::UTC(CTU(c)));
}
inline bool IsNewline(Char c) const
{
return CTU(c) < ASCIIChars::NumChars ? ASCIIChars::IsNewline(ASCIIChars::UTC(CTU(c))) : (CTU(c) & 0xfffe) == 0x2028;
}
inline bool IsWhitespaceOrNewline(Char c) const
{
if (CTU(c) < ASCIIChars::NumChars)
return ASCIIChars::IsWhitespace(ASCIIChars::UTC(CTU(c)));
else
return CTU(c) == 0x1680 || (CTU(c) >= 0x2000 && CTU(c) <= 0x200a) ||
CTU(c) == 0x2028 || CTU(c) == 0x2029 || CTU(c) == 0x202f || CTU(c) == 0x205f ||
CTU(c) == 0x3000 || CTU(c) == 0xfeff;
}
inline bool IsLetter(Char c) const
{
return CTU(c) < ASCIIChars::NumChars && ASCIIChars::IsLetter(ASCIIChars::UTC(CTU(c)));
}
inline bool IsDigit(Char c) const
{
return CTU(c) < ASCIIChars::NumChars && ASCIIChars::IsDigit(ASCIIChars::UTC(CTU(c)));
}
inline bool IsOctal(Char c) const
{
return CTU(c) < ASCIIChars::NumChars && ASCIIChars::IsOctal(ASCIIChars::UTC(CTU(c)));
}
inline bool IsHex(Char c) const
{
return CTU(c) < ASCIIChars::NumChars && ASCIIChars::IsHex(ASCIIChars::UTC(CTU(c)));
}
inline uint DigitValue(Char c) const
{
return CTU(c) < ASCIIChars::NumChars ? ASCIIChars::DigitValue(ASCIIChars::UTC(CTU(c))) : 0;
}
void SetDigits(ArenaAllocator* setAllocator, CharSet<Char> &set);
void SetNonDigits(ArenaAllocator* setAllocator, CharSet<Char> &set);
void SetWhitespace(ArenaAllocator* setAllocator, CharSet<Char> &set);
void SetNonWhitespace(ArenaAllocator* setAllocator, CharSet<Char> &set);
void SetWordChars(ArenaAllocator* setAllocator, CharSet<Char> &set);
void SetNonWordChars(ArenaAllocator* setAllocator, CharSet<Char> &set);
void SetWordIUChars(ArenaAllocator* setAllocator, CharSet<Char> &set);
void SetNonWordIUChars(ArenaAllocator* setAllocator, CharSet<Char> &set);
void SetNewline(ArenaAllocator* setAllocator, CharSet<Char> &set);
void SetNonNewline(ArenaAllocator* setAllocator, CharSet<Char> &set);
CharSet<Char>* GetFullSet();
CharSet<Char>* GetEmptySet();
CharSet<Char>* GetWordSet();
CharSet<Char>* GetNonWordSet();
CharSet<Char>* GetNewlineSet();
CharSet<Char>* GetWhitespaceSet();
CharSet<Char>* GetSurrogateUpperRange();
inline Char ToCanonical(CaseInsensitive::MappingSource mappingSource, Char c) const
{
if (mappingSource == CaseInsensitive::MappingSource::UnicodeData)
{
return unicodeDataCaseMapper.ToCanonical(c);
}
else
{
Assert(mappingSource == CaseInsensitive::MappingSource::CaseFolding);
return caseFoldingCaseMapper.ToCanonical(c);
}
}
CompileAssert(CaseInsensitive::EquivClassSize == 4);
inline bool ToEquivs(CaseInsensitive::MappingSource mappingSource, Char c, __out_ecount(4) Char* equivs) const
{
if (mappingSource == CaseInsensitive::MappingSource::UnicodeData)
{
return unicodeDataCaseMapper.ToEquivs(c, equivs);
}
else
{
Assert(mappingSource == CaseInsensitive::MappingSource::CaseFolding);
return caseFoldingCaseMapper.ToEquivs(c, equivs);
}
}
inline bool IsTrivialString(CaseInsensitive::MappingSource mappingSource, const Char* str, CharCount strLen) const
{
if (mappingSource == CaseInsensitive::MappingSource::UnicodeData)
{
return unicodeDataCaseMapper.IsTrivialString(str, strLen);
}
else
{
Assert(mappingSource == CaseInsensitive::MappingSource::CaseFolding);
return caseFoldingCaseMapper.IsTrivialString(str, strLen);
}
}
};
typedef UnifiedRegex::StandardChars<uint8> UTF8StandardChars;
typedef UnifiedRegex::StandardChars<char16> UnicodeStandardChars;
}