// lib/Parser/StandardChars.cpp - external/github.com/Microsoft/ChakraCore - Git at Google

 //-------------------------------------------------------------------------------------------------------
 // Copyright (C) Microsoft. All rights reserved.
 // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
 //-------------------------------------------------------------------------------------------------------
 #include "ParserPch.h"

 namespace UnifiedRegex
 {

     // ----------------------------------------------------------------------
     // ASCIIChars
     // ----------------------------------------------------------------------

 /*
 To get these two tables run:
   ch.exe ascii.js
 where ascii.js is:
 ----------------------------------------------------------------------
 function echo(s) { WScript.Echo(s); }

 var NumChars = 1 << 8;

 var Word = 1 << 0;
 var Newline = 1 << 1;
 var Whitespace = 1 << 2;
 var Letter     = 1 << 3;
 var Digit      = 1 << 4;
 var Octal      = 1 << 5;
 var Hex        = 1 << 6;

 var classes = [];
 var values = [];

 function cc(s) {
     return s.charCodeAt(0);
 }

 var c;
 for (c = 0; c < NumChars; c++)
 {
     classes[c] = 0;
     values[c] = 0;
 }
 for (c = cc('0'); c <= cc('7'); c++)
 {
     classes[c] |= Word | Octal | Digit | Hex;
     values[c] = c - cc('0');
 }
 for (c = cc('8'); c <= cc('9'); c++)
 {
     classes[c] |= Word | Digit | Hex;
     values[c] = c - cc('0');
 }
 for (c = cc('a'); c <= cc('f'); c++)
 {
     classes[c] |= Word | Hex | Letter;
     values[c] = 10 + c - cc('a');
 }
 for (c = cc('g'); c <= cc('z'); c++)
     classes[c] |= Word | Letter;
 for (c = cc('A'); c <= cc('F'); c++)
 {
     classes[c] |= Word | Hex | Letter;
     values[c] = 10 + c - cc('A');
 }
 for (c = cc('G'); c <= cc('Z'); c++)
     classes[c] |= Word | Letter;
 classes[cc('_')] |= Word;

 classes[cc('\n')] |= Newline;
 classes[cc('\r')] |= Newline;

 for (c = cc('\t'); c <= cc('\r'); c++)
     classes[c] |= Whitespace;
 classes[cc(' ')] |= Whitespace;
 classes[cc('\x85')] |= Whitespace;
 classes[cc('\xa0')] |= Whitespace;

 hex = "0123456789abcdef";
 function toHex(n) {
     return "0x" + hex[n >> 4] + hex[n & 0xf];
 }

 function dump(a) {
     for (c = 0; c < NumChars; c++) {
         if (c % 16 == 0)
             str = "        ";
         else
             str += ", ";
         str += toHex(a[c]);
         if (c % 16 == 15)
         {
             if (c < NumChars - 1)
                 str += ",";
             echo(str);
         }
     }
 }

 echo("    const uint8 ASCIIChars::classes[] = {");
 dump(classes);
 echo("    };");
 echo("    const uint8 ASCIIChars::values[] = {");
 dump(values);
 echo("    };");
 ----------------------------------------------------------------------
 */

     // Character classes represented as a bit vector for each character.
     // One entry per 8-bit character code (16 rows of 16 entries). Per the generator script in the
     // comment above, the bits are: Word = 0x01, Newline = 0x02, Whitespace = 0x04, Letter = 0x08,
     // Digit = 0x10, Octal = 0x20, Hex = 0x40. For example '0'-'7' are 0x71 (Word|Digit|Octal|Hex),
     // 'a'-'f' and 'A'-'F' are 0x49 (Word|Letter|Hex), and '\n' / '\r' are 0x06 (Newline|Whitespace).
     // NOTE(review): entry 0x85 carries the Whitespace bit here, but StandardChars<char16>::whitespaceStr
     // further down has no range covering U+0085 - confirm this difference is intentional.
     const uint8 ASCIIChars::classes[] = {
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x06, 0x04, 0x04, 0x06, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x71, 0x71, 0x71, 0x71, 0x71, 0x71, 0x71, 0x71, 0x51, 0x51, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x49, 0x49, 0x49, 0x49, 0x49, 0x49, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
         0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x00, 0x00, 0x00, 0x00, 0x01,
         0x00, 0x49, 0x49, 0x49, 0x49, 0x49, 0x49, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
         0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
     };

     // Numeric values of ASCII characters interpreted as hex digits (applies to [0-9a-fA-F], all others are 0x00).
     // One entry per 8-bit character code: '0'-'9' give 0x00-0x09, and both 'a'-'f' and 'A'-'F' give
     // 0x0a-0x0f. Since '0' and every non-hex character all yield 0x00, this table alone cannot tell
     // whether a character is a hex digit; the Hex bit (0x40) in ASCIIChars::classes records that.
     const uint8 ASCIIChars::values[] = {
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
     };

     // ----------------------------------------------------------------------
     // TrivialCaseMapper
     // ----------------------------------------------------------------------

     // Out-of-class definition of the static TrivialCaseMapper::Instance object. Its address is passed
     // as the last constructor argument of unicodeDataCaseMapper in StandardChars<char16>::StandardChars.
     const TrivialCaseMapper TrivialCaseMapper::Instance;

     // ----------------------------------------------------------------------
     // StandardChars<char16>
     // ----------------------------------------------------------------------

 /*
 To get the whitespaces string, run:
   gawk -f spaces.gawk http://www.unicode.org/Public/9.0.0/ucd/UnicodeData.txt
 where spaces.gawk is
 ----------------------------------------------------------------------
 BEGIN {
   FS = ";";
   start = -1;
   last = -1;
   str = "";
 }
 {
   code = strtonum("0x" $1);
   if ($3 == "Zs" || code == 0x0009 || code == 0x000B || code == 0x000C || code == 0x0020 || code == 0x00A0 || code == 0xFEFF || code == 0x000A || code == 0x000D || code == 0x2028 || code == 0x2029)
   {
     if (start < 0)
       start = code;
     else if (code > last + 1) {
       str = sprintf("%s\\x%04x\\x%04x", str, start, last);
       start = code;
     }
     last = code;
   }
 }
 END {
   str = sprintf("%s\\x%04x\\x%04x", str, start, last);
   print str;
 }
 ----------------------------------------------------------------------
 */

     // Range tables for the predefined character classes. Each string is a flat list of
     // (first, last) character pairs, and the matching num*Pairs constant is the number of pairs in
     // that string. Each table is handed, together with its count, to CharSet::SetRanges/SetNotRanges.

     // Decimal digits: '0'-'9'.
     const int StandardChars<char16>::numDigitPairs = 1;
     const char16* const StandardChars<char16>::digitStr = _u("09");
     // Whitespace ranges, as produced by the gawk script in the comment above.
     const int StandardChars<char16>::numWhitespacePairs = 10;
     const char16* const StandardChars<char16>::whitespaceStr = _u("\x0009\x000d\x0020\x0020\x00a0\x00a0\x1680\x1680\x2000\x200a\x2028\x2029\x202f\x202f\x205f\x205f\x3000\x3000\xfeff\xfeff");
     // Word characters: '0'-'9', 'A'-'Z', '_', 'a'-'z'.
     const int StandardChars<char16>::numWordPairs = 4;
     const char16* const StandardChars<char16>::wordStr = _u("09AZ__az");
     const int StandardChars<char16>::numWordIUPairs = 6; // Under /iu flags, long s (U+017F) and Kelvin sign (U+212A) map to S and K, respectively.
     const char16* const StandardChars<char16>::wordIUStr = _u("09AZ__az\x017F\x017F\x212A\x212A");
     // Newline characters: U+000A, U+000D and U+2028-U+2029.
     const int StandardChars<char16>::numNewlinePairs = 3;
     const char16* const StandardChars<char16>::newlineStr = _u("\x000a\x000a\x000d\x000d\x2028\x2029");

     // Constructs the standard character-class helper for 16-bit characters.
     //   allocator - stored in this object, passed to both case mappers, and used later by the
     //               Get*() accessors to allocate the sets they cache.
     // Every lazily-created set pointer starts out as 0 so that its accessor can tell whether the set
     // has been built yet. surrogateUpperRange is included because GetSurrogateUpperRange() compares it
     // with 0 before first use; without an initializer that comparison reads an indeterminate pointer.
     // NOTE(review): surrogateUpperRange is listed last on the assumption that it is declared after
     // whitespaceSet in the class - move it to match the declaration order if that is not the case.
     StandardChars<char16>::StandardChars(ArenaAllocator* allocator)
         : allocator(allocator)
         , unicodeDataCaseMapper(allocator, CaseInsensitive::MappingSource::UnicodeData, &TrivialCaseMapper::Instance)
         , caseFoldingCaseMapper(allocator, CaseInsensitive::MappingSource::CaseFolding, &unicodeDataCaseMapper)
         , fullSet(0)
         , emptySet(0)
         , wordSet(0)
         , nonWordSet(0)
         , newlineSet(0)
         , whitespaceSet(0)
         , surrogateUpperRange(0)
     {
     }

     // Applies the digit range table to `set`: numDigitPairs pair(s) from digitStr ('0'-'9') are
     // forwarded to CharSet::SetRanges together with setAllocator.
     // NOTE(review): whether `set` must be empty beforehand is not visible in this file.
     void StandardChars<char16>::SetDigits(ArenaAllocator* setAllocator, CharSet<Char> &set)
     {
         set.SetRanges(setAllocator, numDigitPairs, digitStr);
     }

     // Forwards the digit range table (digitStr, numDigitPairs) and setAllocator to
     // CharSet::SetNotRanges on `set` - presumably leaving every character outside '0'-'9';
     // confirm against SetNotRanges.
     void StandardChars<char16>::SetNonDigits(ArenaAllocator* setAllocator, CharSet<Char> &set)
     {
         set.SetNotRanges(setAllocator, numDigitPairs, digitStr);
     }

     // Applies the whitespace range table to `set`: numWhitespacePairs pairs from whitespaceStr are
     // forwarded to CharSet::SetRanges together with setAllocator.
     void StandardChars<char16>::SetWhitespace(ArenaAllocator* setAllocator, CharSet<Char> &set)
     {
         set.SetRanges(setAllocator, numWhitespacePairs, whitespaceStr);
     }

     // Forwards the whitespace range table (whitespaceStr, numWhitespacePairs) and setAllocator to
     // CharSet::SetNotRanges on `set` - presumably the complement of the whitespace ranges; confirm
     // against SetNotRanges.
     void StandardChars<char16>::SetNonWhitespace(ArenaAllocator* setAllocator, CharSet<Char> &set)
     {
         set.SetNotRanges(setAllocator, numWhitespacePairs, whitespaceStr);
     }

     // Applies the word-character range table to `set`: numWordPairs pairs from wordStr
     // ('0'-'9', 'A'-'Z', '_', 'a'-'z') are forwarded to CharSet::SetRanges with setAllocator.
     void StandardChars<char16>::SetWordChars(ArenaAllocator* setAllocator, CharSet<Char> &set)
     {
         set.SetRanges(setAllocator, numWordPairs, wordStr);
     }

     // Forwards the word-character range table (wordStr, numWordPairs) and setAllocator to
     // CharSet::SetNotRanges on `set` - presumably the complement of the word characters; confirm
     // against SetNotRanges.
     void StandardChars<char16>::SetNonWordChars(ArenaAllocator* setAllocator, CharSet<Char> &set)
     {
         set.SetNotRanges(setAllocator, numWordPairs, wordStr);
     }

     // Applies the /iu word-character range table to `set`: numWordIUPairs pairs from wordIUStr
     // (the wordStr ranges plus U+017F and U+212A) are forwarded to CharSet::SetRanges with setAllocator.
     void StandardChars<char16>::SetWordIUChars(ArenaAllocator* setAllocator, CharSet<Char> &set)
     {
         set.SetRanges(setAllocator, numWordIUPairs, wordIUStr);
     }

     // Forwards the /iu word-character range table (wordIUStr, numWordIUPairs) and setAllocator to
     // CharSet::SetNotRanges on `set` - presumably the complement of those ranges; confirm against
     // SetNotRanges.
     void StandardChars<char16>::SetNonWordIUChars(ArenaAllocator* setAllocator, CharSet<Char> &set)
     {
         set.SetNotRanges(setAllocator, numWordIUPairs, wordIUStr);
     }

     // Applies the newline range table to `set`: numNewlinePairs pairs from newlineStr
     // (U+000A, U+000D, U+2028-U+2029) are forwarded to CharSet::SetRanges with setAllocator.
     void StandardChars<char16>::SetNewline(ArenaAllocator* setAllocator, CharSet<Char> &set)
     {
         set.SetRanges(setAllocator, numNewlinePairs, newlineStr);
     }

     // Forwards the newline range table (newlineStr, numNewlinePairs) and setAllocator to
     // CharSet::SetNotRanges on `set` - presumably the complement of the newline characters; confirm
     // against SetNotRanges.
     void StandardChars<char16>::SetNonNewline(ArenaAllocator* setAllocator, CharSet<Char> &set)
     {
         set.SetNotRanges(setAllocator, numNewlinePairs, newlineStr);
     }

     // Returns the cached set spanning MinChar..MaxChar, building it from `allocator` on first use.
     CharSet<char16>* StandardChars<char16>::GetFullSet()
     {
         // Already built by an earlier call: hand back the cached pointer.
         if (fullSet != nullptr)
         {
             return fullSet;
         }

         fullSet = Anew(allocator, UnicodeCharSet);
         fullSet->SetRange(allocator, MinChar, MaxChar);
         return fullSet;
     }

     // Returns the cached set that has no ranges added to it, allocating it from `allocator` on first use.
     CharSet<char16>* StandardChars<char16>::GetEmptySet()
     {
         if (emptySet != nullptr)
         {
             return emptySet;
         }

         // Freshly constructed and deliberately left without any ranges.
         emptySet = Anew(allocator, UnicodeCharSet);
         return emptySet;
     }

     // Returns the cached set built from the wordStr ranges ('0'-'9', 'A'-'Z', '_', 'a'-'z'),
     // creating it from `allocator` on first use.
     CharSet<char16>* StandardChars<char16>::GetWordSet()
     {
         if (wordSet != nullptr)
         {
             return wordSet;
         }

         wordSet = Anew(allocator, UnicodeCharSet);
         wordSet->SetRanges(allocator, numWordPairs, wordStr);
         return wordSet;
     }

     // Returns the cached set produced by SetNotRanges over the wordStr ranges, creating it from
     // `allocator` on first use.
     CharSet<char16>* StandardChars<char16>::GetNonWordSet()
     {
         if (nonWordSet != nullptr)
         {
             return nonWordSet;
         }

         nonWordSet = Anew(allocator, UnicodeCharSet);
         nonWordSet->SetNotRanges(allocator, numWordPairs, wordStr);
         return nonWordSet;
     }

     // Returns the cached set built from the newlineStr ranges (U+000A, U+000D, U+2028-U+2029),
     // creating it from `allocator` on first use.
     CharSet<char16>* StandardChars<char16>::GetNewlineSet()
     {
         if (newlineSet != nullptr)
         {
             return newlineSet;
         }

         newlineSet = Anew(allocator, UnicodeCharSet);
         newlineSet->SetRanges(allocator, numNewlinePairs, newlineStr);
         return newlineSet;
     }

     // Returns the cached set built from the whitespaceStr ranges, creating it from `allocator`
     // on first use.
     CharSet<char16>* StandardChars<char16>::GetWhitespaceSet()
     {
         if (whitespaceSet != nullptr)
         {
             return whitespaceSet;
         }

         whitespaceSet = Anew(allocator, UnicodeCharSet);
         whitespaceSet->SetRanges(allocator, numWhitespacePairs, whitespaceStr);
         return whitespaceSet;
     }
     // Returns the cached set covering the code units 0xDC00..0xDFFF, creating it from `allocator`
     // on first use.
     CharSet<char16>* StandardChars<char16>::GetSurrogateUpperRange()
     {
         if (surrogateUpperRange != nullptr)
         {
             return surrogateUpperRange;
         }

         surrogateUpperRange = Anew(allocator, UnicodeCharSet);
         surrogateUpperRange->SetRange(allocator, (char16)0xDC00u, (char16)0xDFFFu);
         return surrogateUpperRange;
     }
 }
	//-------------------------------------------------------------------------------------------------------
	// Copyright (C) Microsoft. All rights reserved.
	// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
	//-------------------------------------------------------------------------------------------------------
	#include "ParserPch.h"

	namespace UnifiedRegex
	{

	// ----------------------------------------------------------------------
	// ASCIIChars
	// ----------------------------------------------------------------------

	/*
	To get these two tables run:
	ch.exe ascii.js
	where ascii.js is:
	----------------------------------------------------------------------
	function echo(s) { WScript.Echo(s); }

	var NumChars = 1 << 8;

	var Word = 1 << 0;
	var Newline = 1 << 1;
	var Whitespace = 1 << 2;
	var Letter = 1 << 3;
	var Digit = 1 << 4;
	var Octal = 1 << 5;
	var Hex = 1 << 6;

	var classes = [];
	var values = [];

	function cc(s) {
	return s.charCodeAt(0);
	}

	var c;
	for (c = 0; c < NumChars; c++)
	{
	classes[c] = 0;
	values[c] = 0;
	}
	for (c = cc('0'); c <= cc('7'); c++)
	{
	classes[c] |= Word | Octal | Digit | Hex;
	values[c] = c - cc('0');
	}
	for (c = cc('8'); c <= cc('9'); c++)
	{
	classes[c] |= Word | Digit | Hex;
	values[c] = c - cc('0');
	}
	for (c = cc('a'); c <= cc('f'); c++)
	{
	classes[c] |= Word | Hex | Letter;
	values[c] = 10 + c - cc('a');
	}
	for (c = cc('g'); c <= cc('z'); c++)
	classes[c] |= Word | Letter;
	for (c = cc('A'); c <= cc('F'); c++)
	{
	classes[c] |= Word | Hex | Letter;
	values[c] = 10 + c - cc('A');
	}
	for (c = cc('G'); c <= cc('Z'); c++)
	classes[c] |= Word | Letter;
	classes[cc('_')] |= Word;

	classes[cc('\n')] |= Newline;
	classes[cc('\r')] |= Newline;

	for (c = cc('\t'); c <= cc('\r'); c++)
	classes[c] |= Whitespace;
	classes[cc(' ')] |= Whitespace;
	classes[cc('\x85')] |= Whitespace;
	classes[cc('\xa0')] |= Whitespace;

	hex = "0123456789abcdef";
	function toHex(n) {
	return "0x" + hex[n >> 4] + hex[n & 0xf];
	}

	function dump(a) {
	for (c = 0; c < NumChars; c++) {
	if (c % 16 == 0)
	str = "        ";
	else
	str += ", ";
	str += toHex(a[c]);
	if (c % 16 == 15)
	{
	if (c < NumChars - 1)
	str += ",";
	echo(str);
	}
	}
	}

	echo("    const uint8 ASCIIChars::classes[] = {");
	dump(classes);
	echo("    };");
	echo("    const uint8 ASCIIChars::values[] = {");
	dump(values);
	echo("    };");
	----------------------------------------------------------------------
	*/

	// Character classes represented as a bit vector for each character.
	// One entry per 8-bit character code (16 rows of 16 entries). Per the generator script in the
	// comment above, the bits are: Word = 0x01, Newline = 0x02, Whitespace = 0x04, Letter = 0x08,
	// Digit = 0x10, Octal = 0x20, Hex = 0x40. For example '0'-'7' are 0x71 (Word|Digit|Octal|Hex),
	// 'a'-'f' and 'A'-'F' are 0x49 (Word|Letter|Hex), and '\n' / '\r' are 0x06 (Newline|Whitespace).
	// NOTE(review): entry 0x85 carries the Whitespace bit here, but StandardChars<char16>::whitespaceStr
	// further down has no range covering U+0085 - confirm this difference is intentional.
	const uint8 ASCIIChars::classes[] = {
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x06, 0x04, 0x04, 0x06, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x71, 0x71, 0x71, 0x71, 0x71, 0x71, 0x71, 0x71, 0x51, 0x51, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x49, 0x49, 0x49, 0x49, 0x49, 0x49, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
	0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x00, 0x00, 0x00, 0x00, 0x01,
	0x00, 0x49, 0x49, 0x49, 0x49, 0x49, 0x49, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
	0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
	};

	// Numeric values of ASCII characters interpreted as hex digits (applies to [0-9a-fA-F], all others are 0x00).
	// One entry per 8-bit character code: '0'-'9' give 0x00-0x09, and both 'a'-'f' and 'A'-'F' give
	// 0x0a-0x0f. Since '0' and every non-hex character all yield 0x00, this table alone cannot tell
	// whether a character is a hex digit; the Hex bit (0x40) in ASCIIChars::classes records that.
	const uint8 ASCIIChars::values[] = {
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
	};

	// ----------------------------------------------------------------------
	// TrivialCaseMapper
	// ----------------------------------------------------------------------

	// Out-of-class definition of the static TrivialCaseMapper::Instance object. Its address is passed
	// as the last constructor argument of unicodeDataCaseMapper in StandardChars<char16>::StandardChars.
	const TrivialCaseMapper TrivialCaseMapper::Instance;

	// ----------------------------------------------------------------------
	// StandardChars<char16>
	// ----------------------------------------------------------------------

	/*
	To get the whitespaces string, run:
	gawk -f spaces.gawk http://www.unicode.org/Public/9.0.0/ucd/UnicodeData.txt
	where spaces.gawk is
	----------------------------------------------------------------------
	BEGIN {
	FS = ";";
	start = -1;
	last = -1;
	str = "";
	}
	{
	code = strtonum("0x" $1);
	if ($3 == "Zs" || code == 0x0009 || code == 0x000B || code == 0x000C || code == 0x0020 || code == 0x00A0 || code == 0xFEFF || code == 0x000A || code == 0x000D || code == 0x2028 || code == 0x2029)
	{
	if (start < 0)
	start = code;
	else if (code > last + 1) {
	str = sprintf("%s\\x%04x\\x%04x", str, start, last);
	start = code;
	}
	last = code;
	}
	}
	END {
	str = sprintf("%s\\x%04x\\x%04x", str, start, last);
	print str;
	}
	----------------------------------------------------------------------
	*/

	// Range tables for the predefined character classes. Each string is a flat list of
	// (first, last) character pairs, and the matching num*Pairs constant is the number of pairs in
	// that string. Each table is handed, together with its count, to CharSet::SetRanges/SetNotRanges.

	// Decimal digits: '0'-'9'.
	const int StandardChars<char16>::numDigitPairs = 1;
	const char16* const StandardChars<char16>::digitStr = _u("09");
	// Whitespace ranges, as produced by the gawk script in the comment above.
	const int StandardChars<char16>::numWhitespacePairs = 10;
	const char16* const StandardChars<char16>::whitespaceStr = _u("\x0009\x000d\x0020\x0020\x00a0\x00a0\x1680\x1680\x2000\x200a\x2028\x2029\x202f\x202f\x205f\x205f\x3000\x3000\xfeff\xfeff");
	// Word characters: '0'-'9', 'A'-'Z', '_', 'a'-'z'.
	const int StandardChars<char16>::numWordPairs = 4;
	const char16* const StandardChars<char16>::wordStr = _u("09AZ__az");
	const int StandardChars<char16>::numWordIUPairs = 6; // Under /iu flags, long s (U+017F) and Kelvin sign (U+212A) map to S and K, respectively.
	const char16* const StandardChars<char16>::wordIUStr = _u("09AZ__az\x017F\x017F\x212A\x212A");
	// Newline characters: U+000A, U+000D and U+2028-U+2029.
	const int StandardChars<char16>::numNewlinePairs = 3;
	const char16* const StandardChars<char16>::newlineStr = _u("\x000a\x000a\x000d\x000d\x2028\x2029");

	// Constructs the standard character-class helper for 16-bit characters.
	//   allocator - stored in this object, passed to both case mappers, and used later by the
	//               Get*() accessors to allocate the sets they cache.
	// Every lazily-created set pointer starts out as 0 so that its accessor can tell whether the set
	// has been built yet. surrogateUpperRange is included because GetSurrogateUpperRange() compares it
	// with 0 before first use; without an initializer that comparison reads an indeterminate pointer.
	// NOTE(review): surrogateUpperRange is listed last on the assumption that it is declared after
	// whitespaceSet in the class - move it to match the declaration order if that is not the case.
	StandardChars<char16>::StandardChars(ArenaAllocator* allocator)
	    : allocator(allocator)
	    , unicodeDataCaseMapper(allocator, CaseInsensitive::MappingSource::UnicodeData, &TrivialCaseMapper::Instance)
	    , caseFoldingCaseMapper(allocator, CaseInsensitive::MappingSource::CaseFolding, &unicodeDataCaseMapper)
	    , fullSet(0)
	    , emptySet(0)
	    , wordSet(0)
	    , nonWordSet(0)
	    , newlineSet(0)
	    , whitespaceSet(0)
	    , surrogateUpperRange(0)
	{
	}

	// Applies the digit range table to `set`: numDigitPairs pair(s) from digitStr ('0'-'9') are
	// forwarded to CharSet::SetRanges together with setAllocator.
	// NOTE(review): whether `set` must be empty beforehand is not visible in this file.
	void StandardChars<char16>::SetDigits(ArenaAllocator* setAllocator, CharSet<Char> &set)
	{
	set.SetRanges(setAllocator, numDigitPairs, digitStr);
	}

	// Forwards the digit range table (digitStr, numDigitPairs) and setAllocator to
	// CharSet::SetNotRanges on `set` - presumably leaving every character outside '0'-'9';
	// confirm against SetNotRanges.
	void StandardChars<char16>::SetNonDigits(ArenaAllocator* setAllocator, CharSet<Char> &set)
	{
	set.SetNotRanges(setAllocator, numDigitPairs, digitStr);
	}

	// Applies the whitespace range table to `set`: numWhitespacePairs pairs from whitespaceStr are
	// forwarded to CharSet::SetRanges together with setAllocator.
	void StandardChars<char16>::SetWhitespace(ArenaAllocator* setAllocator, CharSet<Char> &set)
	{
	set.SetRanges(setAllocator, numWhitespacePairs, whitespaceStr);
	}

	// Forwards the whitespace range table (whitespaceStr, numWhitespacePairs) and setAllocator to
	// CharSet::SetNotRanges on `set` - presumably the complement of the whitespace ranges; confirm
	// against SetNotRanges.
	void StandardChars<char16>::SetNonWhitespace(ArenaAllocator* setAllocator, CharSet<Char> &set)
	{
	set.SetNotRanges(setAllocator, numWhitespacePairs, whitespaceStr);
	}

	// Applies the word-character range table to `set`: numWordPairs pairs from wordStr
	// ('0'-'9', 'A'-'Z', '_', 'a'-'z') are forwarded to CharSet::SetRanges with setAllocator.
	void StandardChars<char16>::SetWordChars(ArenaAllocator* setAllocator, CharSet<Char> &set)
	{
	set.SetRanges(setAllocator, numWordPairs, wordStr);
	}

	// Forwards the word-character range table (wordStr, numWordPairs) and setAllocator to
	// CharSet::SetNotRanges on `set` - presumably the complement of the word characters; confirm
	// against SetNotRanges.
	void StandardChars<char16>::SetNonWordChars(ArenaAllocator* setAllocator, CharSet<Char> &set)
	{
	set.SetNotRanges(setAllocator, numWordPairs, wordStr);
	}

	// Applies the /iu word-character range table to `set`: numWordIUPairs pairs from wordIUStr
	// (the wordStr ranges plus U+017F and U+212A) are forwarded to CharSet::SetRanges with setAllocator.
	void StandardChars<char16>::SetWordIUChars(ArenaAllocator* setAllocator, CharSet<Char> &set)
	{
	set.SetRanges(setAllocator, numWordIUPairs, wordIUStr);
	}

	// Forwards the /iu word-character range table (wordIUStr, numWordIUPairs) and setAllocator to
	// CharSet::SetNotRanges on `set` - presumably the complement of those ranges; confirm against
	// SetNotRanges.
	void StandardChars<char16>::SetNonWordIUChars(ArenaAllocator* setAllocator, CharSet<Char> &set)
	{
	set.SetNotRanges(setAllocator, numWordIUPairs, wordIUStr);
	}

	// Applies the newline range table to `set`: numNewlinePairs pairs from newlineStr
	// (U+000A, U+000D, U+2028-U+2029) are forwarded to CharSet::SetRanges with setAllocator.
	void StandardChars<char16>::SetNewline(ArenaAllocator* setAllocator, CharSet<Char> &set)
	{
	set.SetRanges(setAllocator, numNewlinePairs, newlineStr);
	}

	// Forwards the newline range table (newlineStr, numNewlinePairs) and setAllocator to
	// CharSet::SetNotRanges on `set` - presumably the complement of the newline characters; confirm
	// against SetNotRanges.
	void StandardChars<char16>::SetNonNewline(ArenaAllocator* setAllocator, CharSet<Char> &set)
	{
	set.SetNotRanges(setAllocator, numNewlinePairs, newlineStr);
	}

	// Returns the cached set spanning MinChar..MaxChar, building it from `allocator` on first use.
	CharSet<char16>* StandardChars<char16>::GetFullSet()
	{
	    // Already built by an earlier call: hand back the cached pointer.
	    if (fullSet != nullptr)
	    {
	        return fullSet;
	    }

	    fullSet = Anew(allocator, UnicodeCharSet);
	    fullSet->SetRange(allocator, MinChar, MaxChar);
	    return fullSet;
	}

	// Returns the cached set that has no ranges added to it, allocating it from `allocator` on first use.
	CharSet<char16>* StandardChars<char16>::GetEmptySet()
	{
	    if (emptySet != nullptr)
	    {
	        return emptySet;
	    }

	    // Freshly constructed and deliberately left without any ranges.
	    emptySet = Anew(allocator, UnicodeCharSet);
	    return emptySet;
	}

	// Returns the cached set built from the wordStr ranges ('0'-'9', 'A'-'Z', '_', 'a'-'z'),
	// creating it from `allocator` on first use.
	CharSet<char16>* StandardChars<char16>::GetWordSet()
	{
	    if (wordSet != nullptr)
	    {
	        return wordSet;
	    }

	    wordSet = Anew(allocator, UnicodeCharSet);
	    wordSet->SetRanges(allocator, numWordPairs, wordStr);
	    return wordSet;
	}

	// Returns the cached set produced by SetNotRanges over the wordStr ranges, creating it from
	// `allocator` on first use.
	CharSet<char16>* StandardChars<char16>::GetNonWordSet()
	{
	    if (nonWordSet != nullptr)
	    {
	        return nonWordSet;
	    }

	    nonWordSet = Anew(allocator, UnicodeCharSet);
	    nonWordSet->SetNotRanges(allocator, numWordPairs, wordStr);
	    return nonWordSet;
	}

	// Returns the cached set built from the newlineStr ranges (U+000A, U+000D, U+2028-U+2029),
	// creating it from `allocator` on first use.
	CharSet<char16>* StandardChars<char16>::GetNewlineSet()
	{
	    if (newlineSet != nullptr)
	    {
	        return newlineSet;
	    }

	    newlineSet = Anew(allocator, UnicodeCharSet);
	    newlineSet->SetRanges(allocator, numNewlinePairs, newlineStr);
	    return newlineSet;
	}

	// Returns the cached set built from the whitespaceStr ranges, creating it from `allocator`
	// on first use.
	CharSet<char16>* StandardChars<char16>::GetWhitespaceSet()
	{
	    if (whitespaceSet != nullptr)
	    {
	        return whitespaceSet;
	    }

	    whitespaceSet = Anew(allocator, UnicodeCharSet);
	    whitespaceSet->SetRanges(allocator, numWhitespacePairs, whitespaceStr);
	    return whitespaceSet;
	}
	// Returns the cached set covering the code units 0xDC00..0xDFFF, creating it from `allocator`
	// on first use.
	CharSet<char16>* StandardChars<char16>::GetSurrogateUpperRange()
	{
	    if (surrogateUpperRange != nullptr)
	    {
	        return surrogateUpperRange;
	    }

	    surrogateUpperRange = Anew(allocator, UnicodeCharSet);
	    surrogateUpperRange->SetRange(allocator, (char16)0xDC00u, (char16)0xDFFFu);
	    return surrogateUpperRange;
	}
	}