blob: e8fe8564bce7a9b4827294a5079c18046691a694 [file] [log] [blame]
//-------------------------------------------------------------------------------------------------------
// Copyright (C) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
//-------------------------------------------------------------------------------------------------------
namespace UnifiedRegex
{
template <typename C>
class CharSet;
template <typename C>
class RuntimeCharSet;
class CharBitvec : private Chars<char>
{
public:
static const int Width = Chars<char>::CharWidth;
static const int Size = NumChars;
private:
static const int wordSize = sizeof(uint32) * 8;
static const int vecSize = Size / wordSize;
static const uint32 ones = (uint32)-1;
static const uint8 oneBits[Size];
uint32 vec[vecSize];
inline static void setrng(uint32 &v, uint l, uint h)
{
uint w = h - l + 1;
if (w == wordSize)
{
v = ones;
}
else
{
v |= ((1U << w) - 1) << l;
}
}
inline static void clearrng(uint32 &v, uint l, uint h)
{
uint w = h - l + 1;
if (w == wordSize)
{
v = 0;
}
else
{
v &= ~(((1U << w) - 1) << l);
}
}
public:
inline void CloneFrom(const CharBitvec& other)
{
for (int w = 0; w < vecSize; w++)
{
vec[w] = other.vec[w];
}
}
inline void Clear()
{
for (int w = 0; w < vecSize; w++)
{
vec[w] = 0;
}
}
inline void SetAll()
{
for (int w = 0; w < vecSize; w++)
{
vec[w] = ones;
}
}
inline void Set(uint k)
{
Assert(k < Size);
__assume(k < Size);
if (k < Size)
{
vec[k / wordSize] |= 1U << (k % wordSize);
}
}
inline void SetRange(uint l, uint h)
{
Assert(l < Size);
Assert(h < Size);
__assume(l < Size);
__assume(h < Size);
if (l < Size && h < Size)
{
if (l == h)
{
vec[l / wordSize] |= 1U << (l % wordSize);
}
else if (l < h)
{
int lw = l / wordSize;
int hw = h / wordSize;
int lo = l % wordSize;
int hio = h % wordSize;
if (lw == hw)
{
setrng(vec[lw], lo, hio);
}
else
{
setrng(vec[lw], lo, wordSize - 1);
for (int w = lw + 1; w < hw; w++)
{
vec[w] = ones;
}
setrng(vec[hw], 0, hio);
}
}
}
}
inline void ClearRange(uint l, uint h)
{
Assert(l < Size);
Assert(h < Size);
__assume(l < Size);
__assume(h < Size);
if (l < Size && h < Size)
{
if (l == h)
{
vec[l / wordSize] &= ~(1U << (l % wordSize));
}
else if (l < h)
{
int lw = l / wordSize;
int hw = h / wordSize;
int lo = l % wordSize;
int hio = h % wordSize;
if (lw == hw)
{
clearrng(vec[lw], lo, hio);
}
else
{
clearrng(vec[lw], lo, wordSize - 1);
for (int w = lw + 1; w < hw; w++)
{
vec[w] = 0;
}
clearrng(vec[hw], 0, hio);
}
}
}
}
inline bool IsEmpty()
{
for (int i = 0; i < vecSize; i++)
{
if (vec[i] != 0)
{
return false;
}
}
return true;
}
inline void UnionInPlace(const CharBitvec& other)
{
for (int w = 0; w < vecSize; w++)
{
vec[w] |= other.vec[w];
}
}
inline bool UnionInPlaceFullCheck(const CharBitvec& other)
{
bool isFull = true;
for (int w = 0; w < vecSize; w++)
{
vec[w] |= other.vec[w];
if (vec[w] != ones)
{
isFull = false;
}
}
return isFull;
}
inline bool Get(uint k) const
{
Assert(k < Size);
__assume(k < Size);
return ((vec[k / wordSize] >> (k % wordSize)) & 1) != 0;
}
inline bool IsFull() const
{
for (int w = 0; w < vecSize; w++)
{
if (vec[w] != ones)
{
return false;
}
}
return true;
}
inline bool IsSubsetOf(const CharBitvec& other) const
{
for (int w = 0; w < vecSize; w++)
{
uint32 v = other.vec[w];
if (v != (vec[w] | v))
{
return false;
}
}
return true;
}
inline bool IsEqualTo(const CharBitvec& other) const
{
for (int w = 0; w < vecSize; w++)
{
if (vec[w] != other.vec[w])
{
return false;
}
}
return true;
}
uint Count() const;
int NextSet(int k) const;
int NextClear(int k) const;
template <typename C>
void ToComplement(ArenaAllocator* allocator, uint base, CharSet<C>& result) const;
template <typename C>
void ToEquivClass(ArenaAllocator* allocator, uint base, uint& tblidx, CharSet<C>& result, codepoint_t baseOffset = 0x0) const;
};
template <typename C>
class CharSet {};
struct CharSetNode : protected Chars<char16>
{
static const int directBits = Chars<char>::CharWidth;
static const uint directSize = Chars<char>::NumChars;
static const uint bitsPerInnerLevel = 4;
static const uint branchingPerInnerLevel = 1 << bitsPerInnerLevel;
static const uint innerMask = branchingPerInnerLevel - 1;
static const int bitsPerLeafLevel = CharBitvec::Width;
static const int branchingPerLeafLevel = CharBitvec::Size;
static const uint leafMask = branchingPerLeafLevel - 1;
static const uint levels = 1 + (CharWidth - bitsPerLeafLevel) / bitsPerInnerLevel;
inline static uint innerIdx(uint level, uint v)
{
return (v >> ((level + 1) * bitsPerInnerLevel)) & innerMask;
}
inline static uint indexToValue(uint level, uint index, uint offset)
{
Assert((index & innerMask) == index);
Assert((uint)(1 << ((level + 1) * bitsPerInnerLevel)) > offset);
return (index << ((level + 1) * bitsPerInnerLevel)) + offset;
}
inline static uint leafIdx(uint v)
{
return v & leafMask;
}
inline static uint lim(uint level)
{
return (1U << (bitsPerLeafLevel + level * bitsPerInnerLevel)) - 1;
}
inline static uint remain(uint level, uint v)
{
return v & lim(level);
}
virtual void FreeSelf(ArenaAllocator* allocator) = 0;
virtual CharSetNode* Clone(ArenaAllocator* allocator) const = 0;
virtual CharSetNode* Set(ArenaAllocator* allocator, uint level, uint l, uint h) = 0;
virtual CharSetNode* ClearRange(ArenaAllocator* allocator, uint level, uint l, uint h) = 0;
virtual CharSetNode* UnionInPlace(ArenaAllocator* allocator, uint level, const CharSetNode* other) = 0;
virtual bool Get(uint level, uint k) const = 0;
virtual void ToComplement(ArenaAllocator* allocator, uint level, uint base, CharSet<Char>& result) const = 0;
virtual void ToEquivClassW(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<char16>& result) const = 0;
virtual void ToEquivClassCP(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<codepoint_t>& result, codepoint_t baseOffset) const = 0;
virtual bool IsSubsetOf(uint level, const CharSetNode* other) const = 0;
virtual bool IsEqualTo(uint level, const CharSetNode* other) const = 0;
virtual uint Count(uint level) const = 0;
_Success_(return) virtual bool GetNextRange(uint level, Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar) const = 0;
#if DBG
virtual bool IsLeaf() const = 0;
#endif
static CharSetNode* For(ArenaAllocator* allocator, int level);
};
struct CharSetFull : CharSetNode
{
private:
template <typename C>
void ToEquivClass(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<C>& result, codepoint_t baseOffset = 0x0) const;
public:
static CharSetFull Instance;
static CharSetFull * const TheFullNode;
CharSetFull();
void FreeSelf(ArenaAllocator* allocator) override;
CharSetNode* Clone(ArenaAllocator* allocator) const override;
CharSetNode* Set(ArenaAllocator* allocator, uint level, uint l, uint h) override;
CharSetNode* ClearRange(ArenaAllocator* allocator, uint level, uint l, uint h) override;
CharSetNode* UnionInPlace(ArenaAllocator* allocator, uint level, const CharSetNode* other) override;
bool Get(uint level, uint k) const override;
void ToComplement(ArenaAllocator* allocator, uint level, uint base, CharSet<Char>& result) const override;
void ToEquivClassW(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<char16>& result) const override;
void ToEquivClassCP(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<codepoint_t>& result, codepoint_t baseOffset) const override;
bool IsSubsetOf(uint level, const CharSetNode* other) const override;
bool IsEqualTo(uint level, const CharSetNode* other) const override;
uint Count(uint level) const override;
_Success_(return) bool GetNextRange(uint level, Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar) const override;
#if DBG
bool IsLeaf() const override;
#endif
};
struct CharSetInner sealed : CharSetNode
{
private:
template <typename C>
void ToEquivClass(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<C>& result) const;
public:
CharSetNode * children[branchingPerInnerLevel];
CharSetInner();
void FreeSelf(ArenaAllocator* allocator) override;
CharSetNode* Clone(ArenaAllocator* allocator) const override;
CharSetNode* Set(ArenaAllocator* allocator, uint level, uint l, uint h) override;
CharSetNode* ClearRange(ArenaAllocator* allocator, uint level, uint l, uint h) override;
CharSetNode* UnionInPlace(ArenaAllocator* allocator, uint level, const CharSetNode* other) override;
bool Get(uint level, uint k) const override;
void ToComplement(ArenaAllocator* allocator, uint level, uint base, CharSet<Char>& result) const override;
void ToEquivClassW(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<char16>& result) const override;
void ToEquivClassCP(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<codepoint_t>& result, codepoint_t baseOffset) const override;
bool IsSubsetOf(uint level, const CharSetNode* other) const override;
bool IsEqualTo(uint level, const CharSetNode* other) const override;
uint Count(uint level) const override;
_Success_(return) bool GetNextRange(uint level, Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar) const override;
#if DBG
bool IsLeaf() const override;
#endif
};
struct CharSetLeaf sealed : CharSetNode
{
private:
template <typename C>
void ToEquivClass(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<C>& result, codepoint_t baseOffset = 0x0) const;
public:
CharBitvec vec;
CharSetLeaf();
void FreeSelf(ArenaAllocator* allocator) override;
CharSetNode* Clone(ArenaAllocator* allocator) const override;
CharSetNode* Set(ArenaAllocator* allocator, uint level, uint l, uint h) override;
CharSetNode* ClearRange(ArenaAllocator* allocator, uint level, uint l, uint h) override;
CharSetNode* UnionInPlace(ArenaAllocator* allocator, uint level, const CharSetNode* other) override;
bool Get(uint level, uint k) const override;
void ToComplement(ArenaAllocator* allocator, uint level, uint base, CharSet<Char>& result) const override;
void ToEquivClassW(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<char16>& result) const override;
void ToEquivClassCP(ArenaAllocator* allocator, uint level, uint base, uint& tblidx, CharSet<codepoint_t>& result, codepoint_t baseOffset) const override;
bool IsSubsetOf(uint level, const CharSetNode* other) const override;
bool IsEqualTo(uint level, const CharSetNode* other) const override;
uint Count(uint level) const override;
_Success_(return) bool GetNextRange(uint level, Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar) const override;
#if DBG
bool IsLeaf() const override;
#endif
};
template <>
class CharSet<char16> : private Chars<char16>
{
public:
static const uint MaxCompact = 4;
static const uint emptySlot = (uint)-1;
struct CompactRep
{
// 1 + number of distinct characters, 1..MaxCompact+1
size_t countPlusOne;
// Characters, in no particular order, or (uint)-1 for tail empty slots
uint cs[MaxCompact];
uint8 padding[sizeof(CharBitvec) - sizeof(uint) * MaxCompact];
};
struct FullRep
{
// Trie for remaining characters. Pointer value will be 0 or >> MaxCompact.
CharSetNode* root;
// Entries for first 256 characters
CharBitvec direct;
};
union Rep
{
struct CompactRep compact;
struct FullRep full;
} rep;
static const int compactSize = sizeof(CompactRep);
static const int fullSize = sizeof(FullRep);
inline bool IsCompact() const { return rep.compact.countPlusOne - 1 <= MaxCompact; }
void SwitchRepresentations(ArenaAllocator* allocator);
void Sort();
public:
CharSet();
void FreeBody(ArenaAllocator* allocator);
void Clear(ArenaAllocator* allocator);
void CloneFrom(ArenaAllocator* allocator, const CharSet<Char>& other);
void CloneNonSurrogateCodeUnitsTo(ArenaAllocator* allocator, CharSet<Char>& other);
void CloneSurrogateCodeUnitsTo(ArenaAllocator* allocator, CharSet<Char>& other);
inline void Set(ArenaAllocator* allocator, Char kc) { SetRange(allocator, kc, kc); }
void SetRange(ArenaAllocator* allocator, Char lc, Char hc);
void SubtractRange(ArenaAllocator* allocator, Char lc, Char hc);
void SetRanges(ArenaAllocator* allocator, int numSortedPairs, const Char* sortedPairs);
void SetNotRanges(ArenaAllocator* allocator, int numSortedPairs, const Char* sortedPairs);
void UnionInPlace(ArenaAllocator* allocator, const CharSet<Char>& other);
_Success_(return) bool GetNextRange(Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar);
bool Get_helper(uint k) const;
inline bool Get(Char kc) const
{
if (IsCompact())
{
Assert(MaxCompact == 4);
return rep.compact.cs[0] == CTU(kc) ||
rep.compact.cs[1] == CTU(kc) ||
rep.compact.cs[2] == CTU(kc) ||
rep.compact.cs[3] == CTU(kc);
}
else
{
if (CTU(kc) < CharSetNode::directSize)
{
return rep.full.direct.Get(CTU(kc));
}
else if (rep.full.root == 0)
{
return false;
}
else
{
return Get_helper(CTU(kc));
}
}
}
inline bool IsEmpty() const
{
return rep.compact.countPlusOne == 1;
}
inline bool IsSingleton() const
{
return rep.compact.countPlusOne == 2;
}
// Helpers to clean up the code
inline uint GetCompactLength() const
{
Assert(IsCompact());
return (uint)(rep.compact.countPlusOne - 1u);
}
inline void SetCompactLength(size_t length)
{
rep.compact.countPlusOne = length + 1;
}
inline uint GetCompactCharU(uint index) const
{
Assert(index < this->GetCompactLength());
Assert(IsCompact());
Assert(rep.compact.cs[index] <= MaxUChar);
return rep.compact.cs[index];
}
inline Char GetCompactChar(uint index) const
{
return (Char)(GetCompactCharU(index));
}
//Replaces an existing character with a new value
inline void ReplaceCompactCharU(uint index, uint value)
{
Assert(index < this->GetCompactLength());
Assert(IsCompact());
Assert(value <= MaxUChar);
rep.compact.cs[index] = value;
}
inline void ClearCompactChar(uint index)
{
Assert(index < this->GetCompactLength());
Assert(IsCompact());
rep.compact.cs[index] = emptySlot;
}
// Adds the character to the end, assuming there is enough space. (Assert in place)
// Increments count.
inline void AddCompactCharU(uint value)
{
Assert(this->GetCompactLength() < MaxCompact);
Assert(IsCompact());
rep.compact.cs[this->GetCompactLength()] = value;
rep.compact.countPlusOne += 1;
}
// Adds the character to the end, assuming there is enough space. (Assert in place)
// Increments count.
inline void AddCompactChar(Char value)
{
AddCompactCharU((Char)(value));
}
// This performs a check to see if the index is the last char, if so sets it to emptySlot
// If not, replaces it with last index.
inline void RemoveCompactChar(uint index)
{
Assert(index < this->GetCompactLength());
Assert(IsCompact());
if (index == this->GetCompactLength() - 1)
{
this->ClearCompactChar(index);
}
else
{
this->ReplaceCompactCharU(index, this->GetCompactCharU((uint)this->GetCompactLength() - 1));
}
rep.compact.countPlusOne -= 1;
}
inline char16 Singleton() const
{
Assert(IsSingleton());
Assert(rep.compact.cs[0] <= MaxUChar);
return UTC(rep.compact.cs[0]);
}
int GetCompactEntries(uint max, __out_ecount(max) Char* entries) const;
bool IsSubsetOf(const CharSet<Char>& other) const;
bool IsEqualTo(const CharSet<Char>& other) const;
inline uint Count() const
{
if (IsCompact())
{
return (uint)rep.compact.countPlusOne - 1;
}
else if (rep.full.root == 0)
{
return rep.full.direct.Count();
}
else
{
//The bit vector
Assert(rep.full.root == CharSetFull::TheFullNode || rep.full.root->Count(CharSetNode::levels - 1) <= 0xFF00);
return rep.full.direct.Count() + (rep.full.root == CharSetFull::TheFullNode ? 0xFF00 : rep.full.root->Count(CharSetNode::levels - 1));
}
}
// NOTE: These are not 'const' methods since they may sort the compact representation internally
void ToComplement(ArenaAllocator* allocator, CharSet<Char>& result);
void ToEquivClass(ArenaAllocator* allocator, CharSet<Char>& result);
void ToEquivClassCP(ArenaAllocator* allocator, CharSet<codepoint_t>& result, codepoint_t baseOffset);
#if ENABLE_REGEX_CONFIG_OPTIONS
void Print(DebugWriter* w) const;
#endif
};
template <>
class CharSet<codepoint_t> : private Chars<codepoint_t>
{
static const int NumberOfPlanes = 17;
private:
// Character planes are composed of 65536 characters each.
// First plane is the Basic Multilingual Plane (characters 0 - 65535)
// Every subsequent plane also stores characters in the form [0 - 65535]; to get the actual value, add 'index * 0x10000' to it
CharSet<char16> characterPlanes[NumberOfPlanes];
// Takes a character, and returns the index of the CharSet<char16> that holds it.
inline int CharToIndex(Char c) const
{
Assert(c <= Chars<codepoint_t>::MaxUChar);
return (int)(CTU(c) / (Chars<char16>::MaxUChar + 1));
}
// Takes a character, and removes the offset to make it < 0x10000
inline char16 RemoveOffset(Char c) const
{
Assert(c <= Chars<codepoint_t>::MaxUChar);
return (char16)(CTU(c) % 0x10000);
}
// Takes a character, and removes the offset to make it < 0x10000
inline Char AddOffset(char16 c, int index) const
{
Assert(c <= Chars<char16>::MaxUChar);
Assert(index >= 0);
Assert(index < NumberOfPlanes);
return ((Char)c) + 0x10000 * index;
}
public:
CharSet();
void FreeBody(ArenaAllocator* allocator);
void Clear(ArenaAllocator* allocator);
void CloneFrom(ArenaAllocator* allocator, const CharSet<Char>& other);
void CloneSimpleCharsTo(ArenaAllocator* allocator, CharSet<char16>& other) const;
inline void CloneNonSurrogateCodeUnitsTo(ArenaAllocator* allocator, CharSet<char16>& other)
{
Assert(this->SimpleCharCount() > 0);
AssertMsg(this->ContainSurrogateCodeUnits(), "This doesn't contain surrogate code units, a simple clone is faster.");
this->characterPlanes[0].CloneNonSurrogateCodeUnitsTo(allocator, other);
}
inline void CloneSurrogateCodeUnitsTo(ArenaAllocator* allocator, CharSet<char16>& other)
{
Assert(this->SimpleCharCount() > 0);
AssertMsg(this->ContainSurrogateCodeUnits(), "This doesn't contain surrogate code units, will not produce any result.");
this->characterPlanes[0].CloneSurrogateCodeUnitsTo(allocator, other);
}
inline void Set(ArenaAllocator* allocator, Char kc) { SetRange(allocator, kc, kc); }
inline bool ContainSurrogateCodeUnits()
{
char16 outLower = 0xFFFF, ignore = 0x0;
return this->characterPlanes[0].GetNextRange(0xD800, &outLower, &ignore) ? outLower <= 0xDFFF : false;
}
void SetRange(ArenaAllocator* allocator, Char lc, Char hc);
void SetRanges(ArenaAllocator* allocator, int numSortedPairs, const Char* sortedPairs);
void SetNotRanges(ArenaAllocator* allocator, int numSortedPairs, const Char* sortedPairs);
void UnionInPlace(ArenaAllocator* allocator, const CharSet<Char>& other);
void UnionInPlace(ArenaAllocator* allocator, const CharSet<char16>& other);
_Success_(return) bool GetNextRange(Char searchCharStart, _Out_ Char *outLowerChar, _Out_ Char *outHigherChar);
inline bool Get(Char kc) const
{
return this->characterPlanes[CharToIndex(kc)].Get(RemoveOffset(kc));
}
inline bool IsEmpty() const
{
for (int i = 0; i < NumberOfPlanes; i++)
{
if (!this->characterPlanes[i].IsEmpty())
{
return false;
}
}
return true;
}
inline bool IsSimpleCharASingleton() const
{
return this->characterPlanes[0].IsSingleton();
}
inline char16 SimpleCharSingleton() const
{
return this->characterPlanes[0].Singleton();
}
inline bool IsSingleton() const
{
return this->Count() == 1;
}
inline codepoint_t Singleton() const
{
Assert(IsSingleton());
for (int i = 0; i < NumberOfPlanes; i++)
{
if (this->characterPlanes[i].IsSingleton())
{
return AddOffset(this->characterPlanes[i].Singleton(), i);
}
}
AssertMsg(false, "Should not reach here, first Assert verifies we are a singleton.");
return INVALID_CODEPOINT;
}
bool IsSubsetOf(const CharSet<Char>& other) const;
bool IsEqualTo(const CharSet<Char>& other) const;
inline uint Count() const
{
uint totalCount = 0;
for (int i = 0; i < NumberOfPlanes; i++)
{
totalCount += this->characterPlanes[i].Count();
}
return totalCount;
}
inline uint SimpleCharCount() const
{
return this->characterPlanes[0].Count();
}
// NOTE: These are not 'const' methods since they may sort the compact representation internally
void ToComplement(ArenaAllocator* allocator, CharSet<Char>& result);
void ToSimpleComplement(ArenaAllocator* allocator, CharSet<codepoint_t>& result);
void ToSimpleComplement(ArenaAllocator* allocator, CharSet<char16>& result);
void ToEquivClass(ArenaAllocator* allocator, CharSet<Char>& result);
void ToSurrogateEquivClass(ArenaAllocator* allocator, CharSet<Char>& result);
#if ENABLE_REGEX_CONFIG_OPTIONS
void Print(DebugWriter* w) const;
#endif
};
template <>
class RuntimeCharSet<char16> : private Chars<char16>
{
private:
// Trie for remaining characters. Pointer value will be 0 or >> MaxCompact.
CharSetNode * root;
// Entries for first 256 characters
CharBitvec direct;
public:
RuntimeCharSet();
void FreeBody(ArenaAllocator* allocator);
void CloneFrom(ArenaAllocator* allocator, const CharSet<Char>& other);
bool Get_helper(uint k) const;
inline bool Get(Char kc) const
{
if (CTU(kc) < CharSetNode::directSize)
{
return direct.Get(CTU(kc));
}
else if (root == 0)
{
return false;
}
else
{
return Get_helper(CTU(kc));
}
}
#if ENABLE_REGEX_CONFIG_OPTIONS
void Print(DebugWriter* w) const;
#endif
};
typedef CharSet<char16> UnicodeCharSet;
typedef RuntimeCharSet<char16> UnicodeRuntimeCharSet;
}