internal/cldutil_shared.cc - external/github.com/CLD2Owners/cld2 - Git at Google

 // Copyright 2013 Google Inc. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 //
 // Author: dsites@google.com (Dick Sites)
 //

 #include "cldutil_shared.h"
 #include <string>

 #include "cld2tablesummary.h"
 #include "integral_types.h"
 #include "port.h"
 #include "utf8statetable.h"

 namespace CLD2 {

 // Runtime routines for hashing, looking up, and scoring
 // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
 // Unigrams and bigrams are for CJK languages only, including simplified/
 // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
 // Zhuang Han characters. Surrounding spaces are not considered.
 // Quadgrams and octagrams are for non-CJK and include two bits indicating
 // preceding and trailing spaces (word boundaries).


 // Indicator bits for leading/trailing space around quad/octagram
 // NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
 // 1-, 2-, or 3-bytes each.
 // The hash routines below XOR these into the low 32 bits of the mixed value.
 static const uint32 kPreSpaceIndicator =  0x00004444;
 static const uint32 kPostSpaceIndicator = 0x44440000;

 // Little-endian masks applied to the last 32-bit word picked up for a gram.
 // Indexed by (bytecount & 3): entries 1..3 keep only the low 1, 2, or 3 bytes;
 // entry 0 keeps the whole word (byte count is a multiple of four).
 static const uint32 kWordMask0[4] = {
   0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
 };

 // NOTE(review): the four constants below are not referenced by any routine
 // visible in this file -- confirm whether they are still needed here.
 static const int kMinCJKUTF8CharBytes = 3;

 static const int kMinGramCount = 3;
 static const int kMaxGramCount = 16;

 static const int UTFmax = 4;        // Max number of bytes in a UTF-8 character


 // Routines to access a hash table of <key:wordhash, value:probs> pairs
 // Buckets have 4-byte wordhash for sizes < 32K buckets, but only
 // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
 // bucket subscript.
 // Probs is packed: three languages plus a subscript for probability table.
 // Buckets have all the keys together, then all the values. Key array never
 // crosses a cache-line boundary, so no-match case takes exactly one cache miss.
 // Match case may sometimes take an additional cache miss on value access.
 //
 // Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64
 // byte buckets with single cache miss.
 // Or 2-byte key and 6-byte value, allowing 5 languages instead  of three.


 //----------------------------------------------------------------------------//
 // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores      //
 //----------------------------------------------------------------------------//

 // Design principles for these hash functions
 // - Few operations
 // - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
 //   Latin script expect 1- and 2-byte mixtures.
 // - Last byte of each character has about 5 bits of information
 // - Spread good bits around so they can interact in at least two ways
 //   with other characters
 // - Use add for additional mixing through carries

 // CJK Three-byte bigram
 //   ....dddd..cccccc..bbbbbb....aaaa
 //   ..................ffffff..eeeeee
 // make
 //   ....dddd..cccccc..bbbbbb....aaaa
 //   000....dddd..cccccc..bbbbbb....a
 //   ..................ffffff..eeeeee
 //   ffffff..eeeeee000000000000000000
 //
 // CJK Four-byte bigram
 //   ..dddddd..cccccc....bbbb....aaaa
 //   ..hhhhhh..gggggg....ffff....eeee
 // make
 //   ..dddddd..cccccc....bbbb....aaaa
 //   000..dddddd..cccccc....bbbb....a
 //   ..hhhhhh..gggggg....ffff....eeee
 //   ..ffff....eeee000000000000000000

 // BIGRAM hash, for runtime use of tables.
 // Hashes the first 1..8 bytes at word_ptr via mask/shift/add. No pre/post
 // space indicator is mixed in. Returns 0 when bytecount is 0.
 // Memory is read in whole 32-bit words through UNALIGNED_LOAD32, so up to
 // 3 bytes past the gram may be touched (they are masked out of the result).
 uint32 BiHashV2(const char* word_ptr, int bytecount) {
   if (bytecount == 0) {
     return 0;
   }
   const uint32* words = reinterpret_cast<const uint32*>(word_ptr);
   // Keeps only the valid bytes of the final, possibly partial, word.
   const uint32 tail_mask = kWordMask0[bytecount & 3];

   if (bytecount > 4) {
     // Two words: the first is taken whole, the second is masked.
     uint32 low = UNALIGNED_LOAD32(words);
     uint32 high = UNALIGNED_LOAD32(words + 1) & tail_mask;
     low ^= low >> 3;
     high ^= high << 18;
     return low + high;
   }

   // At most four bytes: a single masked word.
   uint32 only = UNALIGNED_LOAD32(words) & tail_mask;
   return only ^ (only >> 3);
 }

 //
 // Ascii-7 One-byte chars
 //   ...ddddd...ccccc...bbbbb...aaaaa
 // make
 //   ...ddddd...ccccc...bbbbb...aaaaa
 //   000...ddddd...ccccc...bbbbb...aa
 //
 // Latin 1- and 2-byte chars
 //   ...ddddd...ccccc...bbbbb...aaaaa
 //   ...................fffff...eeeee
 // make
 //   ...ddddd...ccccc...bbbbb...aaaaa
 //   000...ddddd...ccccc...bbbbb...aa
 //   ...................fffff...eeeee
 //   ...............fffff...eeeee0000
 //
 // Non-CJK Two-byte chars
 //   ...ddddd...........bbbbb........
 //   ...hhhhh...........fffff........
 // make
 //   ...ddddd...........bbbbb........
 //   000...ddddd...........bbbbb.....
 //   ...hhhhh...........fffff........
 //   hhhh...........fffff........0000
 //
 // Non-CJK Three-byte chars
 //   ...........ccccc................
 //   ...................fffff........
 //   ...lllll...................iiiii
 // make
 //   ...........ccccc................
 //   000...........ccccc.............
 //   ...................fffff........
 //   ...............fffff........0000
 //   ...lllll...................iiiii
 //   .lllll...................iiiii00
 //

 // QUADGRAM mixing core, for runtime use of tables.
 // Hashes the first 1..12 bytes at word_ptr via mask/shift/add and XORs the
 // caller-supplied prepost indicator bits into the first word.
 // Memory is read in whole 32-bit words through UNALIGNED_LOAD32, so up to
 // 3 bytes past the gram may be touched (they are masked out of the result).
 uint32 QuadHashV2Mix(const char* word_ptr, int bytecount, uint32 prepost) {
   const uint32* words = reinterpret_cast<const uint32*>(word_ptr);
   // Keeps only the valid bytes of whichever word turns out to be the last.
   const uint32 tail_mask = kWordMask0[bytecount & 3];

   // First word: always present; masked only when it is also the last word.
   const bool one_word = (bytecount <= 4);
   uint32 first = UNALIGNED_LOAD32(words);
   if (one_word) {
     first &= tail_mask;
   }
   first ^= first >> 3;
   uint32 hash = first ^ prepost;
   if (one_word) {
     return hash;
   }

   // Second word: masked when the gram ends inside it (5..8 bytes).
   const bool two_words = (bytecount <= 8);
   uint32 second = UNALIGNED_LOAD32(words + 1);
   if (two_words) {
     second &= tail_mask;
   }
   second ^= second << 4;
   hash += second;
   if (two_words) {
     return hash;
   }

   // Third word: always treated as the last one (bytes beyond 12 are ignored).
   uint32 third = UNALIGNED_LOAD32(words + 2) & tail_mask;
   third ^= third << 2;
   return hash + third;
 }


 // QUADGRAM wrapper with surrounding spaces, for runtime use of tables.
 // Looks at the byte just before word_ptr and the byte just after the gram;
 // each one that is a space contributes its indicator to the hash.
 // Returns 0 when bytecount is 0.
 // UNDERSHOOTS 1 byte (reads word_ptr[-1]), OVERSHOOTS up to 3 bytes.
 uint32 QuadHashV2(const char* word_ptr, int bytecount) {
   if (bytecount == 0) {
     return 0;
   }
   const uint32 pre_bits = (word_ptr[-1] == ' ') ? kPreSpaceIndicator : 0;
   const uint32 post_bits =
       (word_ptr[bytecount] == ' ') ? kPostSpaceIndicator : 0;
   return QuadHashV2Mix(word_ptr, bytecount, pre_bits | post_bits);
 }

 // QUADGRAM wrapper with surrounding underscores, for offline construction
 // of tables.
 // A leading and/or trailing '_' inside [word_ptr, word_ptr + bytecount) is
 // stripped from the gram and replaced by the matching pre/post indicator.
 // Returns 0 when bytecount is 0. OVERSHOOTS up to 3 bytes.
 // NOTE(review): for an input that is only "_", the same byte is seen as both
 // the leading and the trailing marker and a length of -1 is passed on --
 // confirm callers never supply such a gram.
 uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount) {
   if (bytecount == 0) {
     return 0;
   }

   // Drop a leading underscore from the gram, if there is one.
   const bool has_leading = (word_ptr[0] == '_');
   const char* gram = has_leading ? word_ptr + 1 : word_ptr;
   int gram_bytes = has_leading ? bytecount - 1 : bytecount;

   // Drop a trailing underscore from what remains.
   const bool has_trailing = (gram[gram_bytes - 1] == '_');
   if (has_trailing) {
     --gram_bytes;
   }

   uint32 prepost = 0;
   if (has_leading) {
     prepost |= kPreSpaceIndicator;
   }
   if (has_trailing) {
     prepost |= kPostSpaceIndicator;
   }
   return QuadHashV2Mix(gram, gram_bytes, prepost);
 }


 // OCTAGRAM mixing core, for runtime use of tables V3.
 // Hashes the first 1..24 bytes at word_ptr (anything beyond 24 is ignored)
 // one 32-bit word at a time via mask/shift/add.
 //
 // The low 32 bits follow the quadgram pattern, extended to six words.
 // An 8-bit fold of the plain sum of the words is added at bits 32..39.
 // Mixing is done in 64-bit arithmetic, so bits pushed up by the left shifts
 // can also land above bit 31.
 //
 // In addition to the caller-supplied prepost bits, the bytes at
 // word_ptr[-1] and word_ptr[bytecount] are checked for a space here.
 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes.
 // Loads go through UNALIGNED_LOAD32.
 uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
   // Word k is XORed with itself shifted by kMixShift[k] bits: to the right
   // where kMixShiftRight[k] is true, to the left otherwise.
   static const int kMixShift[6] = {3, 4, 2, 8, 4, 6};
   static const bool kMixShiftRight[6] = {true, false, false, true, true, true};

   const uint32* words = reinterpret_cast<const uint32*>(word_ptr);

   if (word_ptr[-1] == ' ') {
     prepost |= kPreSpaceIndicator;
   }
   if (word_ptr[bytecount] == ' ') {
     prepost |= kPostSpaceIndicator;
   }

   // 1..4 bytes -> 1 word, 5..8 -> 2, ... 17..20 -> 5; every other byte count
   // (including non-positive ones) takes the six-word path.
   const int group = (bytecount - 1) >> 2;
   const int word_count = (0 <= group && group <= 4) ? group + 1 : 6;
   const int last = word_count - 1;

   uint64 mixed = 0;   // running mask/shift/add hash
   uint64 sum = 0;     // plain sum of the (masked) words
   for (int k = 0; k < word_count; ++k) {
     uint64 word = UNALIGNED_LOAD32(words + k);
     if (k == last) {
       // Only the final word may be partial.
       word &= kWordMask0[bytecount & 3];
     }
     sum += word;
     word ^= kMixShiftRight[k] ? (word >> kMixShift[k]) : (word << kMixShift[k]);
     mixed += word;
   }

   sum += (sum >> 17);             // extra 1-bit shift for bytes 2 & 3
   sum += (sum >> 9);              // extra 1-bit shift for bytes 1 & 3
   const uint64 high_bits = (sum & 0xff) << 32;
   return (mixed ^ prepost) + high_bits;
 }

 // OCTAGRAM wrapper with surrounding spaces, for runtime use of tables V3.
 // Sets the pre/post indicator for a space immediately before word_ptr and
 // immediately after the gram, then hands off to OctaHash40Mix.
 // Returns 0 when bytecount is 0.
 // UNDERSHOOTS 1 byte (reads word_ptr[-1]), OVERSHOOTS up to 3 bytes.
 uint64 OctaHash40(const char* word_ptr, int bytecount) {
   if (bytecount == 0) {
     return 0;
   }
   const uint64 pre_bits = (word_ptr[-1] == ' ') ? kPreSpaceIndicator : 0;
   const uint64 post_bits =
       (word_ptr[bytecount] == ' ') ? kPostSpaceIndicator : 0;
   return OctaHash40Mix(word_ptr, bytecount, pre_bits | post_bits);
 }


 // OCTAGRAM wrapper with surrounding underscores, for offline construction
 // of tables.
 // A leading and/or trailing '_' inside [word_ptr, word_ptr + bytecount) is
 // stripped from the gram and replaced by the matching pre/post indicator
 // before calling OctaHash40Mix. Returns 0 when bytecount is 0.
 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes (inside OctaHash40Mix).
 // NOTE(review): for an input that is only "_", the same byte is seen as both
 // the leading and the trailing marker and a length of -1 is passed on --
 // confirm callers never supply such a gram.
 uint64 OctaHash40underscore(const char* word_ptr, int bytecount) {
   if (bytecount == 0) {
     return 0;
   }

   // Drop a leading underscore from the gram, if there is one.
   const bool has_leading = (word_ptr[0] == '_');
   const char* gram = has_leading ? word_ptr + 1 : word_ptr;
   int gram_bytes = has_leading ? bytecount - 1 : bytecount;

   // Drop a trailing underscore from what remains.
   const bool has_trailing = (gram[gram_bytes - 1] == '_');
   if (has_trailing) {
     --gram_bytes;
   }

   uint64 prepost = 0;
   if (has_leading) {
     prepost |= kPreSpaceIndicator;
   }
   if (has_trailing) {
     prepost |= kPostSpaceIndicator;
   }
   return OctaHash40Mix(gram, gram_bytes, prepost);
 }

 // Hash a consecutive pair of tokens/words A B.
 // Old: hash was B - A, which gave too many false hits on one-char diffs.
 // Now: A rotated right by 13 bit positions, plus B.
 uint64 PairHash(uint64 worda_hash, uint64 wordb_hash) {
   const uint64 wrapped_low_bits = worda_hash << (64 - 13);
   const uint64 shifted_down = worda_hash >> 13;
   return (shifted_down | wrapped_low_bits) + wordb_hash;
 }


 //----------------------------------------------------------------------------//
 // Finding groups of 1/2/4/8 letters                                          //
 //----------------------------------------------------------------------------//

 // src points to a letter. Find the byte length of a unigram starting there.
 int UniLen(const char* src) {
   const char* src_end = src;
   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   return src_end - src;
 }

 // src points to a letter. Find the byte length of a bigram starting there.
 int BiLen(const char* src) {
   const char* src_end = src;
   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   return src_end - src;
 }

 // src points to a letter. Find the byte length of a quadgram starting there.
 int QuadLen(const char* src) {
   const char* src_end = src;
   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
   return src_end - src;
 }

 // src points to a letter. Returns the byte length of the octagram starting
 // there: up to eight characters, stopping early at the first space.
 int OctaLen(const char* src) {
   const char* src_end = src;
   for (int charcount = 0; charcount < 8 && src_end[0] != ' '; ++charcount) {
     // Step by the length of the character under the cursor. Using the first
     // character's length for every step (as before) lands mid-character on
     // mixed-width text and can jump over the terminating space.
     src_end += UTF8OneCharLen(src_end);
   }
   return static_cast<int>(src_end - src);
 }

 }       // End namespace CLD2
	// Copyright 2013 Google Inc. All Rights Reserved.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	//
	// Author: dsites@google.com (Dick Sites)
	//

	#include "cldutil_shared.h"
	#include <string>

	#include "cld2tablesummary.h"
	#include "integral_types.h"
	#include "port.h"
	#include "utf8statetable.h"

	namespace CLD2 {

	// Runtime routines for hashing, looking up, and scoring
	// unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
	// Unigrams and bigrams are for CJK languages only, including simplified/
	// traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
	// Zhuang Han characters. Surrounding spaces are not considered.
	// Quadgrams and octagrams are for non-CJK and include two bits indicating
	// preceding and trailing spaces (word boundaries).


	// Indicator bits for leading/trailing space around quad/octagram
	// NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
	// 1-, 2-, or 3-bytes each.
	// The hash routines below XOR these into the low 32 bits of the mixed value.
	static const uint32 kPreSpaceIndicator = 0x00004444;
	static const uint32 kPostSpaceIndicator = 0x44440000;

	// Little-endian masks applied to the last 32-bit word picked up for a gram.
	// Indexed by (bytecount & 3): entries 1..3 keep only the low 1, 2, or 3 bytes;
	// entry 0 keeps the whole word (byte count is a multiple of four).
	static const uint32 kWordMask0[4] = {
	0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
	};

	// NOTE(review): the four constants below are not referenced by any routine
	// visible in this file -- confirm whether they are still needed here.
	static const int kMinCJKUTF8CharBytes = 3;

	static const int kMinGramCount = 3;
	static const int kMaxGramCount = 16;

	static const int UTFmax = 4; // Max number of bytes in a UTF-8 character


	// Routines to access a hash table of <key:wordhash, value:probs> pairs
	// Buckets have 4-byte wordhash for sizes < 32K buckets, but only
	// 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
	// bucket subscript.
	// Probs is packed: three languages plus a subscript for probability table.
	// Buckets have all the keys together, then all the values. Key array never
	// crosses a cache-line boundary, so no-match case takes exactly one cache miss.
	// Match case may sometimes take an additional cache miss on value access.
	//
	// Other possibilities include 5 or 10 6-byte entries plus pad to make 32 or 64
	// byte buckets with single cache miss.
	// Or 2-byte key and 6-byte value, allowing 5 languages instead of three.


	//----------------------------------------------------------------------------//
	// Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores //
	//----------------------------------------------------------------------------//

	// Design principles for these hash functions
	// - Few operations
	// - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
	// Latin script expect 1- and 2-byte mixtures.
	// - Last byte of each character has about 5 bits of information
	// - Spread good bits around so they can interact in at least two ways
	// with other characters
	// - Use add for additional mixing through carries

	// CJK Three-byte bigram
	// ....dddd..cccccc..bbbbbb....aaaa
	// ..................ffffff..eeeeee
	// make
	// ....dddd..cccccc..bbbbbb....aaaa
	// 000....dddd..cccccc..bbbbbb....a
	// ..................ffffff..eeeeee
	// ffffff..eeeeee000000000000000000
	//
	// CJK Four-byte bigram
	// ..dddddd..cccccc....bbbb....aaaa
	// ..hhhhhh..gggggg....ffff....eeee
	// make
	// ..dddddd..cccccc....bbbb....aaaa
	// 000..dddddd..cccccc....bbbb....a
	// ..hhhhhh..gggggg....ffff....eeee
	// ..ffff....eeee000000000000000000

	// BIGRAM hash, for runtime use of tables.
	// Hashes the first 1..8 bytes at word_ptr via mask/shift/add. No pre/post
	// space indicator is mixed in. Returns 0 when bytecount is 0.
	// Memory is read in whole 32-bit words through UNALIGNED_LOAD32, so up to
	// 3 bytes past the gram may be touched (they are masked out of the result).
	uint32 BiHashV2(const char* word_ptr, int bytecount) {
	  if (bytecount == 0) {
	    return 0;
	  }
	  const uint32* words = reinterpret_cast<const uint32*>(word_ptr);
	  // Keeps only the valid bytes of the final, possibly partial, word.
	  const uint32 tail_mask = kWordMask0[bytecount & 3];

	  if (bytecount > 4) {
	    // Two words: the first is taken whole, the second is masked.
	    uint32 low = UNALIGNED_LOAD32(words);
	    uint32 high = UNALIGNED_LOAD32(words + 1) & tail_mask;
	    low ^= low >> 3;
	    high ^= high << 18;
	    return low + high;
	  }

	  // At most four bytes: a single masked word.
	  uint32 only = UNALIGNED_LOAD32(words) & tail_mask;
	  return only ^ (only >> 3);
	}

	//
	// Ascii-7 One-byte chars
	// ...ddddd...ccccc...bbbbb...aaaaa
	// make
	// ...ddddd...ccccc...bbbbb...aaaaa
	// 000...ddddd...ccccc...bbbbb...aa
	//
	// Latin 1- and 2-byte chars
	// ...ddddd...ccccc...bbbbb...aaaaa
	// ...................fffff...eeeee
	// make
	// ...ddddd...ccccc...bbbbb...aaaaa
	// 000...ddddd...ccccc...bbbbb...aa
	// ...................fffff...eeeee
	// ...............fffff...eeeee0000
	//
	// Non-CJK Two-byte chars
	// ...ddddd...........bbbbb........
	// ...hhhhh...........fffff........
	// make
	// ...ddddd...........bbbbb........
	// 000...ddddd...........bbbbb.....
	// ...hhhhh...........fffff........
	// hhhh...........fffff........0000
	//
	// Non-CJK Three-byte chars
	// ...........ccccc................
	// ...................fffff........
	// ...lllll...................iiiii
	// make
	// ...........ccccc................
	// 000...........ccccc.............
	// ...................fffff........
	// ...............fffff........0000
	// ...lllll...................iiiii
	// .lllll...................iiiii00
	//

	// QUADGRAM mixing core, for runtime use of tables.
	// Hashes the first 1..12 bytes at word_ptr via mask/shift/add and XORs the
	// caller-supplied prepost indicator bits into the first word.
	// Memory is read in whole 32-bit words through UNALIGNED_LOAD32, so up to
	// 3 bytes past the gram may be touched (they are masked out of the result).
	uint32 QuadHashV2Mix(const char* word_ptr, int bytecount, uint32 prepost) {
	  const uint32* words = reinterpret_cast<const uint32*>(word_ptr);
	  // Keeps only the valid bytes of whichever word turns out to be the last.
	  const uint32 tail_mask = kWordMask0[bytecount & 3];

	  // First word: always present; masked only when it is also the last word.
	  const bool one_word = (bytecount <= 4);
	  uint32 first = UNALIGNED_LOAD32(words);
	  if (one_word) {
	    first &= tail_mask;
	  }
	  first ^= first >> 3;
	  uint32 hash = first ^ prepost;
	  if (one_word) {
	    return hash;
	  }

	  // Second word: masked when the gram ends inside it (5..8 bytes).
	  const bool two_words = (bytecount <= 8);
	  uint32 second = UNALIGNED_LOAD32(words + 1);
	  if (two_words) {
	    second &= tail_mask;
	  }
	  second ^= second << 4;
	  hash += second;
	  if (two_words) {
	    return hash;
	  }

	  // Third word: always treated as the last one (bytes beyond 12 are ignored).
	  uint32 third = UNALIGNED_LOAD32(words + 2) & tail_mask;
	  third ^= third << 2;
	  return hash + third;
	}


	// QUADGRAM wrapper with surrounding spaces, for runtime use of tables.
	// Looks at the byte just before word_ptr and the byte just after the gram;
	// each one that is a space contributes its indicator to the hash.
	// Returns 0 when bytecount is 0.
	// UNDERSHOOTS 1 byte (reads word_ptr[-1]), OVERSHOOTS up to 3 bytes.
	uint32 QuadHashV2(const char* word_ptr, int bytecount) {
	  if (bytecount == 0) {
	    return 0;
	  }
	  // The indicator bits are combined with a plain bitwise OR; the escaped
	  // "\|=" spelling previously here is not valid C++.
	  const uint32 pre_bits = (word_ptr[-1] == ' ') ? kPreSpaceIndicator : 0;
	  const uint32 post_bits =
	      (word_ptr[bytecount] == ' ') ? kPostSpaceIndicator : 0;
	  return QuadHashV2Mix(word_ptr, bytecount, pre_bits | post_bits);
	}

	// QUADGRAM wrapper with surrounding underscores, for offline construction
	// of tables.
	// A leading and/or trailing '_' inside [word_ptr, word_ptr + bytecount) is
	// stripped from the gram and replaced by the matching pre/post indicator.
	// Returns 0 when bytecount is 0. OVERSHOOTS up to 3 bytes.
	// NOTE(review): for an input that is only "_", the same byte is seen as both
	// the leading and the trailing marker and a length of -1 is passed on --
	// confirm callers never supply such a gram.
	uint32 QuadHashV2Underscore(const char* word_ptr, int bytecount) {
	  if (bytecount == 0) {
	    return 0;
	  }

	  // Drop a leading underscore from the gram, if there is one.
	  const bool has_leading = (word_ptr[0] == '_');
	  const char* gram = has_leading ? word_ptr + 1 : word_ptr;
	  int gram_bytes = has_leading ? bytecount - 1 : bytecount;

	  // Drop a trailing underscore from what remains.
	  const bool has_trailing = (gram[gram_bytes - 1] == '_');
	  if (has_trailing) {
	    --gram_bytes;
	  }

	  // Plain "|=" here; the escaped "\|=" spelling is not valid C++.
	  uint32 prepost = 0;
	  if (has_leading) {
	    prepost |= kPreSpaceIndicator;
	  }
	  if (has_trailing) {
	    prepost |= kPostSpaceIndicator;
	  }
	  return QuadHashV2Mix(gram, gram_bytes, prepost);
	}


	// OCTAGRAM mixing core, for runtime use of tables V3.
	// Hashes the first 1..24 bytes at word_ptr (anything beyond 24 is ignored)
	// one 32-bit word at a time via mask/shift/add.
	//
	// The low 32 bits follow the quadgram pattern, extended to six words.
	// An 8-bit fold of the plain sum of the words is added at bits 32..39.
	// Mixing is done in 64-bit arithmetic, so bits pushed up by the left shifts
	// can also land above bit 31.
	//
	// In addition to the caller-supplied prepost bits, the bytes at
	// word_ptr[-1] and word_ptr[bytecount] are checked for a space here.
	// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes.
	// Loads go through UNALIGNED_LOAD32.
	uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
	  // Word k is XORed with itself shifted by kMixShift[k] bits: to the right
	  // where kMixShiftRight[k] is true, to the left otherwise.
	  static const int kMixShift[6] = {3, 4, 2, 8, 4, 6};
	  static const bool kMixShiftRight[6] = {true, false, false, true, true, true};

	  const uint32* words = reinterpret_cast<const uint32*>(word_ptr);

	  // Plain "|=" here; the escaped "\|=" spelling is not valid C++.
	  if (word_ptr[-1] == ' ') {
	    prepost |= kPreSpaceIndicator;
	  }
	  if (word_ptr[bytecount] == ' ') {
	    prepost |= kPostSpaceIndicator;
	  }

	  // 1..4 bytes -> 1 word, 5..8 -> 2, ... 17..20 -> 5; every other byte count
	  // (including non-positive ones) takes the six-word path.
	  const int group = (bytecount - 1) >> 2;
	  const int word_count = (0 <= group && group <= 4) ? group + 1 : 6;
	  const int last = word_count - 1;

	  uint64 mixed = 0;   // running mask/shift/add hash
	  uint64 sum = 0;     // plain sum of the (masked) words
	  for (int k = 0; k < word_count; ++k) {
	    uint64 word = UNALIGNED_LOAD32(words + k);
	    if (k == last) {
	      // Only the final word may be partial.
	      word &= kWordMask0[bytecount & 3];
	    }
	    sum += word;
	    word ^= kMixShiftRight[k] ? (word >> kMixShift[k]) : (word << kMixShift[k]);
	    mixed += word;
	  }

	  sum += (sum >> 17);             // extra 1-bit shift for bytes 2 & 3
	  sum += (sum >> 9);              // extra 1-bit shift for bytes 1 & 3
	  const uint64 high_bits = (sum & 0xff) << 32;
	  return (mixed ^ prepost) + high_bits;
	}

	// OCTAGRAM wrapper with surrounding spaces, for runtime use of tables V3.
	// Sets the pre/post indicator for a space immediately before word_ptr and
	// immediately after the gram, then hands off to OctaHash40Mix.
	// Returns 0 when bytecount is 0.
	// UNDERSHOOTS 1 byte (reads word_ptr[-1]), OVERSHOOTS up to 3 bytes.
	uint64 OctaHash40(const char* word_ptr, int bytecount) {
	  if (bytecount == 0) {
	    return 0;
	  }
	  // The indicator bits are combined with a plain bitwise OR; the escaped
	  // "\|=" spelling previously here is not valid C++.
	  const uint64 pre_bits = (word_ptr[-1] == ' ') ? kPreSpaceIndicator : 0;
	  const uint64 post_bits =
	      (word_ptr[bytecount] == ' ') ? kPostSpaceIndicator : 0;
	  return OctaHash40Mix(word_ptr, bytecount, pre_bits | post_bits);
	}


	// OCTAGRAM wrapper with surrounding underscores, for offline construction
	// of tables.
	// A leading and/or trailing '_' inside [word_ptr, word_ptr + bytecount) is
	// stripped from the gram and replaced by the matching pre/post indicator
	// before calling OctaHash40Mix. Returns 0 when bytecount is 0.
	// UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes (inside OctaHash40Mix).
	// NOTE(review): for an input that is only "_", the same byte is seen as both
	// the leading and the trailing marker and a length of -1 is passed on --
	// confirm callers never supply such a gram.
	uint64 OctaHash40underscore(const char* word_ptr, int bytecount) {
	  if (bytecount == 0) {
	    return 0;
	  }

	  // Drop a leading underscore from the gram, if there is one.
	  const bool has_leading = (word_ptr[0] == '_');
	  const char* gram = has_leading ? word_ptr + 1 : word_ptr;
	  int gram_bytes = has_leading ? bytecount - 1 : bytecount;

	  // Drop a trailing underscore from what remains.
	  const bool has_trailing = (gram[gram_bytes - 1] == '_');
	  if (has_trailing) {
	    --gram_bytes;
	  }

	  // Plain "|=" here; the escaped "\|=" spelling is not valid C++.
	  uint64 prepost = 0;
	  if (has_leading) {
	    prepost |= kPreSpaceIndicator;
	  }
	  if (has_trailing) {
	    prepost |= kPostSpaceIndicator;
	  }
	  return OctaHash40Mix(gram, gram_bytes, prepost);
	}

	// Hash a consecutive pair of tokens/words A B.
	// Old: hash was B - A, which gave too many false hits on one-char diffs.
	// Now: A rotated right by 13 bit positions, plus B.
	uint64 PairHash(uint64 worda_hash, uint64 wordb_hash) {
	  // The two halves of the rotation are joined with a plain bitwise OR; the
	  // escaped "\|" spelling previously here is not valid C++.
	  const uint64 wrapped_low_bits = worda_hash << (64 - 13);
	  const uint64 shifted_down = worda_hash >> 13;
	  return (shifted_down | wrapped_low_bits) + wordb_hash;
	}




	//----------------------------------------------------------------------------//
	// Finding groups of 1/2/4/8 letters //
	//----------------------------------------------------------------------------//

	// src points to a letter. Find the byte length of a unigram starting there.
	int UniLen(const char* src) {
	const char* src_end = src;
	src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
	return src_end - src;
	}

	// src points to a letter. Find the byte length of a bigram starting there.
	int BiLen(const char* src) {
	const char* src_end = src;
	src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
	src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
	return src_end - src;
	}

	// src points to a letter. Find the byte length of a quadgram starting there.
	int QuadLen(const char* src) {
	const char* src_end = src;
	src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
	src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
	src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
	src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
	return src_end - src;
	}

	// src points to a letter. Returns the byte length of the octagram starting
	// there: up to eight characters, stopping early at the first space.
	int OctaLen(const char* src) {
	  const char* src_end = src;
	  for (int charcount = 0; charcount < 8 && src_end[0] != ' '; ++charcount) {
	    // Step by the length of the character under the cursor. Using the first
	    // character's length for every step (as before) lands mid-character on
	    // mixed-width text and can jump over the terminating space.
	    src_end += UTF8OneCharLen(src_end);
	  }
	  return static_cast<int>(src_end - src);
	}

	} // End namespace CLD2