// blob: 73de5f3ec0573e108fd2f616c4b4775677587af3 [file] [log] [blame]
//-------------------------------------------------------------------------------------------------------
// Copyright (C) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
//-------------------------------------------------------------------------------------------------------
//
// Matchers for pattern of form:
// pattern ::= atom{8} '|' atom{8}
// atom ::= A | [...charset drawn from A's...]
// where:
// - A is a set of exactly four ASCII characters
// - The pattern ignores case
// - The pattern includes the global flag
// An example pattern would be "ABCdABCd|aDcAbBcD".
#pragma once
namespace UnifiedRegex
{
// ----------------------------------------------------------------------
// Trigrams
// ----------------------------------------------------------------------
// Per-pattern trigram bookkeeping: two integer pattern values plus fixed-capacity
// tables of offsets and cached result strings.
// NOTE(review): the per-field notes below are inferred from names only; confirm
// against the implementation before relying on them.
struct TrigramInfo
{
    // Length, in chars, that the constructor's pat1/pat2 buffers are annotated with.
    static const int PatternLength = 8;
    // Capacity of the offsets[] and cachedResult[] tables.
    static const int MaxResults = 32;

    Field(bool) isTrigramPattern;
    Field(bool) hasCachedResultString;

    // presumably encoded forms of the two patterns -- TODO confirm
    Field(int) triPat1;
    Field(int) triPat2;

    // presumably the number of live entries in offsets[] -- TODO confirm
    Field(int) resultCount;
    Field(int) offsets[MaxResults];
    Field(Js::JavascriptString *) cachedResult[MaxResults];

    // pat1 and pat2 are each annotated as buffers of PatternLength chars.
    TrigramInfo(
        __in_ecount(PatternLength) char* pat1,
        __in_ecount(PatternLength) char* pat2,
        Recycler* recycler);
};
// Pairs a regex pattern with an integer encoding associated with it.
// NOTE(review): how encodedPattern is computed is not visible in this header.
struct PatternTri
{
    RegexPattern* pattern;
    int encodedPattern;
};
// Fixed-capacity list of PatternTri entries.
// NOTE(review): presumably the patterns that begin with a given trigram, with
// `count` being the number of used slots -- confirm against the implementation.
struct TrigramStart
{
    // Capacity of patterns[].
    static const int MaxPatPerStart = 12;

    int count;
    PatternTri patterns[MaxPatPerStart];
};
// State shared by trigram matching over a four-character alphabet: the alphabet
// itself, per-character lookup tables, one TrigramStart per trigram slot, and the
// input currently associated with this alphabet.
// NOTE(review): the meaning of several constants below is not established by this
// header; the hedged notes are guesses to be confirmed against the implementation.
struct TrigramAlphabet
{
    // Number of characters in the alphabet; sizes alpha[].
    static const int AlphaCount = 4;
    // Sizes alphaBits[] (one entry per 7-bit ASCII code).
    static const int AsciiTableSize = 128;
    static const int BitsNotInAlpha = 4;
    // Sizes trigramMap[].
    static const int TrigramMapSize = 221;
    static const int TrigramNotInPattern = 65;
    // 0x20 is the bit that differs between ASCII upper- and lower-case letters;
    // 0x5f is 0x7f with that bit cleared.
    static const char LowerCaseBit = 0x20;
    static const char UpperCaseMask = 0x5f;
    // Sizes trigramStarts[]; presumably AlphaCount^3 -- TODO confirm.
    static const int TrigramCount = 64;
    static const int MaxCachedStarts = 48;

    TrigramStart trigramStarts[TrigramCount];
    char alpha[AlphaCount];
    char alphaBits[AsciiTableSize];
    char trigramMap[TrigramMapSize];

    const char16* input;
    int inputLen;

    void InitTrigramMap();

    // pat1 and pat2 are annotated as TrigramInfo::PatternLength chars each.
    bool AddStarts(
        __in_xcount(TrigramInfo::PatternLength) char* pat1,
        __in_xcount(TrigramInfo::PatternLength) char* pat2,
        RegexPattern* pattern);

    // input is annotated as a buffer of inputLen char16 elements.
    void MegaMatch(__in_ecount(inputLen) const char16* input, int inputLen);
};
// ----------------------------------------------------------------------
// OctoquadIdentifier
// ----------------------------------------------------------------------
// Incrementally records pattern pieces (single characters and character unions)
// into per-position bit patterns over a four-character alphabet, so that a caller
// can ask whether the result is an octoquad pattern.
class OctoquadIdentifier : private Chars<char16>
{
    friend class OctoquadMatcher;

public:
    // Number of alternatives tracked (the two sides of the '|').
    static const int NumPatterns = 2;

private:
    // How many alphabet characters have been encountered so far.
    int numCodes;

    // Code -> character table (caller-owned array bound by reference).
    char (&codeToChar)[TrigramAlphabet::AlphaCount];

    // Character -> code (0-3) table. The array is supplied to the constructor, and
    // only the entries for characters in the alphabet are updated.
    char (&charToCode)[TrigramAlphabet::AsciiTableSize];

    // One byte per pattern position, holding a 4-bit pattern. A single character is
    // 0x1, 0x2, 0x4 or 0x8 (quad alphabet); a character class in the pattern can
    // produce a combination of those character bits.
    char patternBits[NumPatterns][TrigramInfo::PatternLength];

    int currPatternLength;
    int currPatternNum;

    // NOTE(review): the annotations below qualify TrigramAlphabet with `regex::`,
    // whereas this header's namespace is UnifiedRegex -- confirm whether that
    // qualifier is stale before changing it.
    void SetTrigramAlphabet(Js::ScriptContext * scriptContext,
        __in_xcount(regex::TrigramAlphabet::AlphaCount) char* alpha,
        __in_xcount(regex::TrigramAlphabet::AsciiTableSize) char* alphaBits);

public:
    static bool Qualifies(const Program *const program);

    // Binds the two caller-provided lookup tables and takes the starting code count.
    OctoquadIdentifier(
        const int numCodes,
        char (&codeToChar)[TrigramAlphabet::AlphaCount],
        char (&charToCode)[TrigramAlphabet::AsciiTableSize]);

    // Yields -1 when the character is not in the quad alphabet and the alphabet
    // has no room left.
    int GetOrAddCharCode(const Char c);

    bool BeginConcat();
    bool CouldAppend(const CharCount n) const;
    bool AppendChar(Char c);

    bool BeginUnions();
    bool UnionChar(Char c);
    void EndUnions();

    bool IsOctoquad();

    void InitializeTrigramInfo(Js::ScriptContext* scriptContext, RegexPattern* const pattern);
};
// ----------------------------------------------------------------------
// OctoquadMatcher
// ----------------------------------------------------------------------
// Matcher built from an OctoquadIdentifier. Instances are obtained through New();
// the constructor itself is private.
class OctoquadMatcher : private Chars<char16>
{
private:
    OctoquadMatcher(
        const StandardChars<Char>* standardChars,
        CaseInsensitive::MappingSource mappingSource,
        OctoquadIdentifier* identifier);

    Field(Char) codeToChar[TrigramAlphabet::AlphaCount];

    // Maps characters (0..AsciiTableSize-1) to 0 when not in the alphabet, otherwise
    // to 0x1, 0x2, 0x4 or 0x8.
    // NOTE(review): the original note says this table is only filled when Match is
    // invoked -- confirm against the implementation.
    Field(uint8) charToBits[TrigramAlphabet::AsciiTableSize];

    // One 32-bit value per alternative of the pattern.
    Field(uint32) patterns[OctoquadIdentifier::NumPatterns];

public:
    static OctoquadMatcher *New(
        Recycler* recycler,
        const StandardChars<Char>* standardChars,
        CaseInsensitive::MappingSource mappingSource,
        OctoquadIdentifier* identifier);

    // offset is passed by non-const reference; the stats parameter exists only when
    // ENABLE_REGEX_CONFIG_OPTIONS is set.
    bool Match(
        const Char* const input,
        const CharCount inputLength,
        CharCount& offset
#if ENABLE_REGEX_CONFIG_OPTIONS
        , RegexStats* stats
#endif
        );

#if ENABLE_REGEX_CONFIG_OPTIONS
    void Print(DebugWriter* w) const;
#endif
};
}