blob: 9980c32aff1ee08e14031aa2bda61967af9711d1 [file] [log] [blame]
//-------------------------------------------------------------------------------------------------------
// Copyright (C) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
//-------------------------------------------------------------------------------------------------------
#include "ParserPch.h"
namespace UnifiedRegex
{
// ----------------------------------------------------------------------
// ASCIIChars
// ----------------------------------------------------------------------
/*
To get these two tables run:
ch.exe ascii.js
where ascii.js is:
----------------------------------------------------------------------
function echo(s) { WScript.Echo(s); }
var NumChars = 1 << 8;
var Word = 1 << 0;
var Newline = 1 << 1;
var Whitespace = 1 << 2;
var Letter = 1 << 3;
var Digit = 1 << 4;
var Octal = 1 << 5;
var Hex = 1 << 6;
var classes = [];
var values = [];
function cc(s) {
return s.charCodeAt(0);
}
var c;
for (c = 0; c < NumChars; c++)
{
classes[c] = 0;
values[c] = 0;
}
for (c = cc('0'); c <= cc('7'); c++)
{
classes[c] |= Word | Octal | Digit | Hex;
values[c] = c - cc('0');
}
for (c = cc('8'); c <= cc('9'); c++)
{
classes[c] |= Word | Digit | Hex;
values[c] = c - cc('0');
}
for (c = cc('a'); c <= cc('f'); c++)
{
classes[c] |= Word | Hex | Letter;
values[c] = 10 + c - cc('a');
}
for (c = cc('g'); c <= cc('z'); c++)
classes[c] |= Word | Letter;
for (c = cc('A'); c <= cc('F'); c++)
{
classes[c] |= Word | Hex | Letter;
values[c] = 10 + c - cc('A');
}
for (c = cc('G'); c <= cc('Z'); c++)
classes[c] |= Word | Letter;
classes[cc('_')] |= Word;
classes[cc('\n')] |= Newline;
classes[cc('\r')] |= Newline;
for (c = cc('\t'); c <= cc('\r'); c++)
classes[c] |= Whitespace;
classes[cc(' ')] |= Whitespace;
classes[cc('\x85')] |= Whitespace;
classes[cc('\xa0')] |= Whitespace;
hex = "0123456789abcdef";
function toHex(n) {
return "0x" + hex[n >> 4] + hex[n & 0xf];
}
function dump(a) {
for (c = 0; c < NumChars; c++) {
if (c % 16 == 0)
str = " ";
else
str += ", ";
str += toHex(a[c]);
if (c % 16 == 15)
{
if (c < NumChars - 1)
str += ",";
echo(str);
}
}
}
echo(" const uint8 ASCIIChars::classes[] = {");
dump(classes);
echo(" };");
echo(" const uint8 ASCIIChars::values[] = {");
dump(values);
echo(" };");
----------------------------------------------------------------------
*/
// Character classes represented as a bit vector for each character.
// The table has one entry per character code 0x00-0xFF, laid out 16 entries per row.
// Bit assignments (taken from the generator script in the comment above):
//   0x01 Word, 0x02 Newline, 0x04 Whitespace, 0x08 Letter, 0x10 Digit, 0x20 Octal, 0x40 Hex.
// Example: '0'-'7' are 0x71 (Word | Digit | Octal | Hex), 'a'-'f' are 0x49 (Word | Letter | Hex).
// This is generated data: re-run the script instead of editing individual entries.
const uint8 ASCIIChars::classes[] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x06, 0x04, 0x04, 0x06, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x71, 0x71, 0x71, 0x71, 0x71, 0x71, 0x71, 0x71, 0x51, 0x51, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x49, 0x49, 0x49, 0x49, 0x49, 0x49, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x00, 0x00, 0x00, 0x00, 0x01,
0x00, 0x49, 0x49, 0x49, 0x49, 0x49, 0x49, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
// Numeric values of ASCII characters interpreted as hex digits (applies to [0-9a-fA-F], all others are 0x00).
// The table has one entry per character code 0x00-0xFF, laid out 16 entries per row:
// '0'-'9' hold 0x00-0x09, and both 'a'-'f' and 'A'-'F' hold 0x0a-0x0f.
// Note that '0' and every non-hex character share the value 0x00, so this table alone cannot tell
// them apart; the Hex bit in ASCIIChars::classes carries that information.
// This is generated data: re-run the script in the comment above instead of editing entries.
const uint8 ASCIIChars::values[] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
// ----------------------------------------------------------------------
// TrivialCaseMapper
// ----------------------------------------------------------------------
// Out-of-class definition of the single shared TrivialCaseMapper object.
// Its address is handed to unicodeDataCaseMapper in the StandardChars<char16> constructor below.
const TrivialCaseMapper TrivialCaseMapper::Instance;
// ----------------------------------------------------------------------
// StandardChars<char16>
// ----------------------------------------------------------------------
/*
To get the whitespace string, download
http://www.unicode.org/Public/9.0.0/ucd/UnicodeData.txt
and run:
gawk -f spaces.gawk UnicodeData.txt
where spaces.gawk is:
----------------------------------------------------------------------
BEGIN {
FS = ";";
start = -1;
last = -1;
str = "";
}
{
code = strtonum("0x" $1);
if ($3 == "Zs" || code == 0x0009 || code == 0x000B || code == 0x000C || code == 0x0020 || code == 0x00A0 || code == 0xFEFF || code == 0x000A || code == 0x000D || code == 0x2028 || code == 0x2029)
{
if (start < 0)
start = code;
else if (code > last + 1) {
str = sprintf("%s\\x%04x\\x%04x", str, start, last);
start = code;
}
last = code;
}
}
END {
str = sprintf("%s\\x%04x\\x%04x", str, start, last);
print str;
}
----------------------------------------------------------------------
*/
// Each xxxStr below is a flat list of (first, last) character pairs in ascending order, and the
// matching numXxxPairs is the number of pairs, i.e. half the string length.
// Keep each count in sync with its string when editing.

// Decimal digits: '0'-'9'.
const int StandardChars<char16>::numDigitPairs = 1;
const char16* const StandardChars<char16>::digitStr = _u("09");
// Whitespace: output of the gawk script in the comment above (Zs category plus the explicitly listed code points).
const int StandardChars<char16>::numWhitespacePairs = 10;
const char16* const StandardChars<char16>::whitespaceStr = _u("\x0009\x000d\x0020\x0020\x00a0\x00a0\x1680\x1680\x2000\x200a\x2028\x2029\x202f\x202f\x205f\x205f\x3000\x3000\xfeff\xfeff");
// Word characters: '0'-'9', 'A'-'Z', '_', 'a'-'z'.
const int StandardChars<char16>::numWordPairs = 4;
const char16* const StandardChars<char16>::wordStr = _u("09AZ__az");
// Word characters for /iu: the four ranges above plus U+017F and U+212A.
const int StandardChars<char16>::numWordIUPairs = 6; // Under /iu flags, Long S (U+017F) and Kelvin sign (U+212A) map to S and K, respectively.
const char16* const StandardChars<char16>::wordIUStr = _u("09AZ__az\x017F\x017F\x212A\x212A");
// Newlines: U+000A, U+000D, and U+2028-U+2029.
const int StandardChars<char16>::numNewlinePairs = 3;
const char16* const StandardChars<char16>::newlineStr = _u("\x000a\x000a\x000d\x000d\x2028\x2029");
// Constructs the UTF-16 standard-character helper.
//
// allocator: arena stored in this object; it is passed to both case mappers and later used by
//            the Get*Set() accessors to allocate the cached sets.
//
// The case mappers are chained: caseFoldingCaseMapper receives &unicodeDataCaseMapper, which in
// turn receives &TrivialCaseMapper::Instance (presumably as successive fallbacks; see the mapper
// class for the exact meaning of that argument).
//
// Every lazily created set pointer starts out null, because each Get*Set() accessor uses
// "pointer is null" as its "not built yet" test. surrogateUpperRange must be initialized here as
// well: GetSurrogateUpperRange() reads it before ever assigning it, so leaving it out of the
// initializer list would make that first read operate on an indeterminate pointer.
StandardChars<char16>::StandardChars(ArenaAllocator* allocator)
    : allocator(allocator)
    , unicodeDataCaseMapper(allocator, CaseInsensitive::MappingSource::UnicodeData, &TrivialCaseMapper::Instance)
    , caseFoldingCaseMapper(allocator, CaseInsensitive::MappingSource::CaseFolding, &unicodeDataCaseMapper)
    , fullSet(nullptr)
    , emptySet(nullptr)
    , wordSet(nullptr)
    , nonWordSet(nullptr)
    , newlineSet(nullptr)
    , whitespaceSet(nullptr)
    , surrogateUpperRange(nullptr)
{
}
// Fills the caller-supplied set with the digit ranges (digitStr, numDigitPairs pairs).
// setAllocator is forwarded to CharSet::SetRanges.
void StandardChars<char16>::SetDigits(ArenaAllocator* setAllocator, CharSet<Char> &set)
{
set.SetRanges(setAllocator, numDigitPairs, digitStr);
}
// Passes the digit ranges (digitStr, numDigitPairs pairs) to CharSet::SetNotRanges on the
// caller-supplied set (presumably producing the complement of the digits; see CharSet).
// setAllocator is forwarded to that call.
void StandardChars<char16>::SetNonDigits(ArenaAllocator* setAllocator, CharSet<Char> &set)
{
set.SetNotRanges(setAllocator, numDigitPairs, digitStr);
}
// Fills the caller-supplied set with the whitespace ranges (whitespaceStr, numWhitespacePairs pairs).
// setAllocator is forwarded to CharSet::SetRanges.
void StandardChars<char16>::SetWhitespace(ArenaAllocator* setAllocator, CharSet<Char> &set)
{
set.SetRanges(setAllocator, numWhitespacePairs, whitespaceStr);
}
// Passes the whitespace ranges (whitespaceStr, numWhitespacePairs pairs) to CharSet::SetNotRanges
// on the caller-supplied set (presumably producing the complement of whitespace; see CharSet).
// setAllocator is forwarded to that call.
void StandardChars<char16>::SetNonWhitespace(ArenaAllocator* setAllocator, CharSet<Char> &set)
{
set.SetNotRanges(setAllocator, numWhitespacePairs, whitespaceStr);
}
// Fills the caller-supplied set with the word-character ranges (wordStr, numWordPairs pairs).
// setAllocator is forwarded to CharSet::SetRanges.
void StandardChars<char16>::SetWordChars(ArenaAllocator* setAllocator, CharSet<Char> &set)
{
set.SetRanges(setAllocator, numWordPairs, wordStr);
}
// Passes the word-character ranges (wordStr, numWordPairs pairs) to CharSet::SetNotRanges on the
// caller-supplied set (presumably producing the complement of the word characters; see CharSet).
// setAllocator is forwarded to that call.
void StandardChars<char16>::SetNonWordChars(ArenaAllocator* setAllocator, CharSet<Char> &set)
{
set.SetNotRanges(setAllocator, numWordPairs, wordStr);
}
// Fills the caller-supplied set with the /iu word-character ranges (wordIUStr, numWordIUPairs
// pairs), i.e. the plain word ranges plus U+017F and U+212A.
// setAllocator is forwarded to CharSet::SetRanges.
void StandardChars<char16>::SetWordIUChars(ArenaAllocator* setAllocator, CharSet<Char> &set)
{
set.SetRanges(setAllocator, numWordIUPairs, wordIUStr);
}
// Passes the /iu word-character ranges (wordIUStr, numWordIUPairs pairs) to CharSet::SetNotRanges
// on the caller-supplied set (presumably producing the complement of those ranges; see CharSet).
// setAllocator is forwarded to that call.
void StandardChars<char16>::SetNonWordIUChars(ArenaAllocator* setAllocator, CharSet<Char> &set)
{
set.SetNotRanges(setAllocator, numWordIUPairs, wordIUStr);
}
// Fills the caller-supplied set with the newline ranges (newlineStr, numNewlinePairs pairs).
// setAllocator is forwarded to CharSet::SetRanges.
void StandardChars<char16>::SetNewline(ArenaAllocator* setAllocator, CharSet<Char> &set)
{
set.SetRanges(setAllocator, numNewlinePairs, newlineStr);
}
// Passes the newline ranges (newlineStr, numNewlinePairs pairs) to CharSet::SetNotRanges on the
// caller-supplied set (presumably producing the complement of the newlines; see CharSet).
// setAllocator is forwarded to that call.
void StandardChars<char16>::SetNonNewline(ArenaAllocator* setAllocator, CharSet<Char> &set)
{
set.SetNotRanges(setAllocator, numNewlinePairs, newlineStr);
}
// Returns the cached set covering MinChar..MaxChar.
// The set is created from this object's allocator on the first call; every later call returns
// the same pointer.
CharSet<char16>* StandardChars<char16>::GetFullSet()
{
    // Fast path: the set was already built by an earlier call.
    if (fullSet != nullptr)
    {
        return fullSet;
    }

    fullSet = Anew(allocator, UnicodeCharSet);
    fullSet->SetRange(allocator, MinChar, MaxChar);
    return fullSet;
}
// Returns the cached set to which no ranges are ever added.
// The set is created from this object's allocator on the first call; every later call returns
// the same pointer.
CharSet<char16>* StandardChars<char16>::GetEmptySet()
{
    // Fast path: the set was already built by an earlier call.
    if (emptySet != nullptr)
    {
        return emptySet;
    }

    // Deliberately add nothing: the set stays exactly as constructed.
    emptySet = Anew(allocator, UnicodeCharSet);
    return emptySet;
}
// Returns the cached set built from the word-character ranges (wordStr, numWordPairs pairs).
// The set is created from this object's allocator on the first call; every later call returns
// the same pointer.
CharSet<char16>* StandardChars<char16>::GetWordSet()
{
    // Fast path: the set was already built by an earlier call.
    if (wordSet != nullptr)
    {
        return wordSet;
    }

    wordSet = Anew(allocator, UnicodeCharSet);
    wordSet->SetRanges(allocator, numWordPairs, wordStr);
    return wordSet;
}
// Returns the cached set built by passing the word-character ranges (wordStr, numWordPairs
// pairs) to CharSet::SetNotRanges.
// The set is created from this object's allocator on the first call; every later call returns
// the same pointer.
CharSet<char16>* StandardChars<char16>::GetNonWordSet()
{
    // Fast path: the set was already built by an earlier call.
    if (nonWordSet != nullptr)
    {
        return nonWordSet;
    }

    nonWordSet = Anew(allocator, UnicodeCharSet);
    nonWordSet->SetNotRanges(allocator, numWordPairs, wordStr);
    return nonWordSet;
}
// Returns the cached set built from the newline ranges (newlineStr, numNewlinePairs pairs).
// The set is created from this object's allocator on the first call; every later call returns
// the same pointer.
CharSet<char16>* StandardChars<char16>::GetNewlineSet()
{
    // Fast path: the set was already built by an earlier call.
    if (newlineSet != nullptr)
    {
        return newlineSet;
    }

    newlineSet = Anew(allocator, UnicodeCharSet);
    newlineSet->SetRanges(allocator, numNewlinePairs, newlineStr);
    return newlineSet;
}
// Returns the cached set built from the whitespace ranges (whitespaceStr, numWhitespacePairs
// pairs).
// The set is created from this object's allocator on the first call; every later call returns
// the same pointer.
CharSet<char16>* StandardChars<char16>::GetWhitespaceSet()
{
    // Fast path: the set was already built by an earlier call.
    if (whitespaceSet != nullptr)
    {
        return whitespaceSet;
    }

    whitespaceSet = Anew(allocator, UnicodeCharSet);
    whitespaceSet->SetRanges(allocator, numWhitespacePairs, whitespaceStr);
    return whitespaceSet;
}
// Returns the cached set covering U+DC00..U+DFFF, the numerically upper half of the UTF-16
// surrogate block (the code units that Unicode calls low, or trail, surrogates).
// The set is created from this object's allocator on the first call; every later call returns
// the same pointer.
CharSet<char16>* StandardChars<char16>::GetSurrogateUpperRange()
{
    // Fast path: the set was already built by an earlier call.
    if (surrogateUpperRange != nullptr)
    {
        return surrogateUpperRange;
    }

    const char16 rangeFirst = static_cast<char16>(0xDC00u);
    const char16 rangeLast = static_cast<char16>(0xDFFFu);

    surrogateUpperRange = Anew(allocator, UnicodeCharSet);
    surrogateUpperRange->SetRange(allocator, rangeFirst, rangeLast);
    return surrogateUpperRange;
}
}