| //------------------------------------------------------------------------------------------------------- |
| // Copyright (C) Microsoft. All rights reserved. |
| // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information. |
| //------------------------------------------------------------------------------------------------------- |
| // |
| // Matchers for pattern of form: |
| // pattern ::= atom{8} '|' atom{8} |
| // atom ::= A | [...charset drawn from A's...] |
| // where: |
| // - A is a set of exactly four ASCII characters |
| // - The pattern ignores case |
| // - The pattern includes the global flag |
| // An example pattern would be "ABCdABCd|aDcAbBcD". |
| #pragma once |
| |
| namespace UnifiedRegex |
| { |
| // ---------------------------------------------------------------------- |
| // Trigrams |
| // ---------------------------------------------------------------------- |
| |
| struct TrigramInfo { |
| static const int PatternLength=8; |
| static const int MaxResults=32; |
| Field(bool) isTrigramPattern; |
| Field(bool) hasCachedResultString; |
| Field(int) triPat1; |
| Field(int) triPat2; |
| Field(int) resultCount; |
| Field(int) offsets[MaxResults]; |
| Field(Js::JavascriptString *) cachedResult[MaxResults]; |
| |
| TrigramInfo(__in_ecount(PatternLength) char* pat1,__in_ecount(PatternLength) char* pat2, Recycler* recycler); |
| }; |
| |
| struct PatternTri { |
| RegexPattern* pattern; |
| int encodedPattern; |
| }; |
| |
| struct TrigramStart { |
| static const int MaxPatPerStart=12; |
| int count; |
| PatternTri patterns[MaxPatPerStart]; |
| }; |
| |
| struct TrigramAlphabet { |
| static const int AlphaCount=4; |
| static const int AsciiTableSize=128; |
| static const int BitsNotInAlpha=4; |
| static const int TrigramMapSize=221; |
| static const int TrigramNotInPattern=65; |
| static const char LowerCaseBit=0x20; |
| static const char UpperCaseMask=0x5f; |
| static const int TrigramCount=64; |
| static const int MaxCachedStarts=48; |
| |
| TrigramStart trigramStarts[TrigramCount]; |
| |
| char alpha[AlphaCount]; |
| char alphaBits[AsciiTableSize]; |
| char trigramMap[TrigramMapSize]; |
| const char16* input; |
| int inputLen; |
| |
| void InitTrigramMap(); |
| bool AddStarts(__in_xcount(TrigramInfo::PatternLength) char* pat1,__in_xcount(TrigramInfo::PatternLength) char* pat2, RegexPattern* pattern); |
| void MegaMatch(__in_ecount(inputLen) const char16* input,int inputLen); |
| }; |
| |
| // ---------------------------------------------------------------------- |
| // OctoquadIdentifier |
| // ---------------------------------------------------------------------- |
| |
| class OctoquadIdentifier : private Chars<char16> |
| { |
| friend class OctoquadMatcher; |
| public: |
| static const int NumPatterns = 2; |
| |
| private: |
| // Number of characters in the alphabet encountered so far |
| int numCodes; |
| |
| // Maps a character code to the character |
| char (&codeToChar)[TrigramAlphabet::AlphaCount]; |
| |
| // Maps a character to its code 0-3. This array is passed into the constructor and only indexes for characters in the |
| // alphabet are updated. |
| char (&charToCode)[TrigramAlphabet::AsciiTableSize]; |
| |
| |
| // For each octoquad pattern, each byte contains a 4-bit pattern. One character will be represented as 0x1, 0x2, 0x4, or |
| // 0x8 since it's a quad alphabet. A character class in the pattern can cause the bit pattern to be a combination of the |
| // character bits. |
| char patternBits[NumPatterns][TrigramInfo::PatternLength]; |
| |
| int currPatternLength; |
| int currPatternNum; |
| |
| void SetTrigramAlphabet(Js::ScriptContext * scriptContext, |
| __in_xcount(regex::TrigramAlphabet::AlphaCount) char* alpha, |
| __in_xcount(regex::TrigramAlphabet::AsciiTableSize) char* alphaBits); |
| public: |
| static bool Qualifies(const Program *const program); |
| |
| OctoquadIdentifier( |
| const int numCodes, |
| char (&codeToChar)[TrigramAlphabet::AlphaCount], |
| char (&charToCode)[TrigramAlphabet::AsciiTableSize]); |
| |
| // Returns -1 if character not in quad alphabet and the alphabet is full |
| int GetOrAddCharCode(const Char c); |
| |
| bool BeginConcat(); |
| bool CouldAppend(const CharCount n) const; |
| bool AppendChar(Char c); |
| bool BeginUnions(); |
| bool UnionChar(Char c); |
| void EndUnions(); |
| bool IsOctoquad(); |
| |
| void InitializeTrigramInfo(Js::ScriptContext* scriptContext, RegexPattern* const pattern); |
| |
| }; |
| |
| // ---------------------------------------------------------------------- |
| // OctoquadMatcher |
| // ---------------------------------------------------------------------- |
| |
| class OctoquadMatcher : private Chars<char16> |
| { |
| private: |
| OctoquadMatcher(const StandardChars<Char>* standardChars, CaseInsensitive::MappingSource mappingSource, OctoquadIdentifier* identifier); |
| |
| Field(Char) codeToChar[TrigramAlphabet::AlphaCount]; |
| |
| // Maps characters (0..AsciTableSize-1) to 0 if not in alphabet, or 0x1, 0x2, 0x4 or 0x8. |
| // Allocated and filled only if invoke Match below. |
| Field(uint8) charToBits[TrigramAlphabet::AsciiTableSize]; |
| |
| Field(uint32) patterns[OctoquadIdentifier::NumPatterns]; |
| |
| public: |
| static OctoquadMatcher *New(Recycler* recycler, const StandardChars<Char>* standardChars, CaseInsensitive::MappingSource mappingSource, OctoquadIdentifier* identifier); |
| |
| bool Match |
| ( const Char* const input |
| , const CharCount inputLength |
| , CharCount& offset |
| #if ENABLE_REGEX_CONFIG_OPTIONS |
| , RegexStats* stats |
| #endif |
| ); |
| |
| #if ENABLE_REGEX_CONFIG_OPTIONS |
| void Print(DebugWriter* w) const; |
| #endif |
| |
| }; |
| } |