blob: fcac75a9311a37ed53ec617c528185f00319a5e0 [file] [log] [blame]
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
////////////////////////////////////////////////////////////////////////////////
#include "compact_enc_det/compact_enc_det.h"
#include <math.h> // for sqrt
#include <stddef.h> // for size_t
#include <stdio.h> // for printf, fprintf, NULL, etc
#include <stdlib.h> // for qsort
#include <string.h> // for memset, memcpy, memcmp, etc
#include <memory>
#include <string> // for string, operator==, etc
#include "compact_enc_det/compact_enc_det_hint_code.h"
#include "util/string_util.h"
#include "util/basictypes.h"
#include "util/commandlineflags.h"
#include "util/logging.h"
using std::string;
// TODO as of 2007.10.09:
//
// Consider font=TT-BHxxx as user-defined => binary
// Demote GB18030 if no 8x3x pair
// Map byte2 ascii punct to 0x60, digits to 0x7e, gets them into hires
// Consider removing/ignoring bytes 01-1F to avoid crap pollution
// Possibly boost declared encoding in robust scan
// googlebot tiny files
// look for ranges of encodings
// consider tags just as > < within aligned block of 32
// flag too few characters in postproc (Latin 6 problem)
// Remove slow scan beyond 16KB
// Consider removing kMostLikelyEncoding or cut it in half
// A note on mixed encodings
//
// The most common encoding error on the web is a page containing a mixture of
// CP-1252 and UTF-8. A less common encoding error is a third-party feed that
// has been converted from CP-1252 to UTF-8 and then those bytes converted a
// second time to UTF-8. CED originally attempted to detect these error cases
// by using two synthetic encodings, UTF8CP1252 and UTF8UTF8. The intended
// implementation was to start these just below CP1252 and UTF8 respectively in
// overall likelihood, and allow 1252 and UTF8 to fall behind if mixtures are
// found.
//
// The UTF8UTF8 encoding is a possible outcome from CED, but unfortunately the
// UTF8CP1252 internal encoding was added late and not put into encodings.proto,
// so at the final step it is mapped to UTF8UTF8 also. This was a bad idea and
// is removed in this November 2011 CL.
//
// Mixed encoding detection never worked out as well as envisioned, so the
// ced_allow_utf8utf8 flag normally disables all this.
//
// The effect is that CP-1252 and UTF-8 mixtures will usually be detected as
// UTF8, and the inputconverter code for UTF8 normally will convert bare
// CP-1252 bytes to UTF-8, instead of the less-helpful FFFD substitution. UTF-8
// and double-UTF-8 mixtures will be detected as UTF-8, and the double
// conversion will stand.
//
// However, it is occasionally useful to use CED to detect double-converted
// UTF-8 coming from third-party data feeds, so they can be fixed at the source.
// For this purpose, the UTF8UTF8 encoding remains available under the
// ced_allow_utf8utf8 flag.
//
// When UTF8UTF8 is detected, the inputconverter code will undo the double
// conversion, giving good text.
// Norbert Runge has noted these words in CP1252 that are mistakenly identified
// as UTF-8 because of the last pair of characters:
// NESTLÉ® 0xC9 0xAE U+00C9 U+00AE C9AE = U+026E;SMALL LEZH
// drauß\u2019 0xDF 0x92 U+00DF U+2019 DF92 = U+07D2;NKO LETTER N
// Mutterschoß\u201c 0xDF 0x93 U+00DF U+201C DF93 = U+07D3;NKO LETTER BA
// Schoß\u201c 0xDF 0x93 U+00DF U+201C
// weiß\u201c 0xDF 0x93 U+00DF U+201C
// Schnellfuß\u201c 0xDF 0x93 U+00DF U+201C
// süß« 0xDF 0xAB U+00DF U+00AB DFAB = U+07EB;NKO HIGH TONE
// These four byte combinations now explicitly boost Latin1/CP1252.
// And for reference, here are a couple of Portuguese spellings
// that may be mistaken as double-byte encodings.
// informações 0xE7 0xF5
// traição 0xE7 0xE3
// Version string for this detector source. Where it is reported is not
// visible in this part of the file.
static const char* kVersion = "2.2";
// Opt-in for the UTF8UTF8 (double-converted UTF-8) result described in
// "A note on mixed encodings" above. Off by default.
DEFINE_bool(ced_allow_utf8utf8, false, "Allow the UTF8UTF8 encoding, "
"to handle mixtures of CP1252 "
"converted to UTF-8 zero, one, "
"or two times");
// Scan-size limit in KB for the 7-bit-only encodings named in the help text.
DEFINE_int32(enc_detect_slow_max_kb, 16,
"Maximum number of Kbytes to examine for "
"7-bit-only (2022, Hz, UTF7) encoding detect. "
"You are unlikely to want to change this.");
// Overall scan-size limit in KB.
DEFINE_int32(enc_detect_fast_max_kb, 256,
"Maximum number of Kbytes to examine for encoding detect. "
"You are unlikely to want to change this.");
// Reliability threshold. The unit is 30 per bit (see help text), so the
// default of 300 is 10 bits, i.e. the "min 1024x difference" row.
DEFINE_int32(ced_reliable_difference, 300, "30 * Bits of minimum probablility "
"difference 1st - 2nd to be considered reliable \n"
" 2 corresponds to min 4x difference\n"
" 4 corresponds to min 16x difference\n"
" 8 corresponds to min 256x difference\n"
" 10 corresponds to min 1024x difference\n"
" 20 corresponds to min 1Mx difference.");
// Text debug output options
DEFINE_bool(enc_detect_summary, false,
"Print first 16 interesting pairs at exit.");
DEFINE_bool(counts, false, "Count major-section usage");
// PostScript debug output options
DEFINE_bool(enc_detect_detail, false,
"Print PostScript of every update, to stderr.");
DEFINE_bool(enc_detect_detail2, false,
"More PostScript detail of every update, to stderr.");
DEFINE_bool(enc_detect_source, false, "Include source text in detail");
// Encoding name must exactly match FIRST column of kI18NInfoByEncoding in
// lang_enc.cc
// Following flags are not in use. Replace them with constants to
// avoid static initialization.
//DEFINE_string(enc_detect_watch1, "", "Do detail2 about this encoding name.");
//DEFINE_string(enc_detect_watch2, "", "Do detail2 about this encoding name.");
// Constant stand-ins for the two disabled string flags above; both are empty.
static const char* const FLAGS_enc_detect_watch1 = "";
static const char* const FLAGS_enc_detect_watch2 = "";
// Only for experiments. Delete soon.
DEFINE_bool(force127, false, "Force Latin1, Latin2, Latin7 based on trigrams");
// Demo-mode/debugging experiment
DEFINE_bool(demo_nodefault, false,
"Default to all equal; no boost for declared encoding.");
DEFINE_bool(dirtsimple, false, "Just scan and count for all encodings");
DEFINE_bool(ced_echo_input, false, "Echo ced input to stderr");
// ---- Log-probability scaling -------------------------------------------------
// Multiplier for log base 2 ** n/10
static const int XDECILOG2 = 3;
// Multiplier for log base 2 ** n
static const int XLOG2 = 30;

// ---- Pruning thresholds ------------------------------------------------------
// Final bits of minimum probability difference 1st-nth to be pruned
static const int kFinalPruneDifference = 10 * XLOG2;
// Initial bits of minimum probability difference 1st-nth to be pruned
static const int kInititalPruneDifference = kFinalPruneDifference * 4;
// Decrements bits of minimum probability difference 1st-nth to be pruned
static const int kPruneDiffDecrement = kFinalPruneDifference;

// ---- Boosts and whacks -------------------------------------------------------
// Bits of minimum probability difference, base to superset encodings
static const int kSmallInitDiff = 2 * XLOG2;
// Bits of boost for initial byte patterns (BOM, 00)
static const int kBoostInitial = 20 * XLOG2;
// Bits of whack for one bad pair
static const int kBadPairWhack = 20 * XLOG2;
// Bits of boost for one good pair in Hz, etc.
static const int kBoostOnePair = 20 * XLOG2;
// Bits of boost for one good sequence
static const int kGentleOnePair = 4 * XLOG2;
// Bits of whack for ill-formed sequence
static const int kGentlePairWhack = 2 * XLOG2;
// Bits of boost for well-formed sequence
static const int kGentlePairBoost = 2 * XLOG2;
// Bits/10 of boost for best declared encoding per bigram
static const int kDeclaredEncBoost = 5 * XDECILOG2;
// Bits/10 of boost for best encoding per bigram
static const int kBestEncBoost = 5 * XDECILOG2;
// Bits of boost for Latin127 tri
static const int kTrigramBoost = 2 * XLOG2;

// ---- Interesting-pair bookkeeping --------------------------------------------
// Max interesting pairs to look at. If you change this, adjust *PruneDiff*
static const int kMaxPairs = 48;
// Prune every 8 interesting pairs
static const int kPruneMask = 0x07;
// For first N pairs, do extra boost based on most likely encoding of pair
// over entire web
static const int kBestPairsCount = 16;
// If we have fewer than N bigrams, weaken the hints enough that unhinted
// encodings have a hope of rising to the top
static const int kDerateHintsBelow = 12;
// Don't bother rescanning for unreliable encoding if fewer than this many
// bytes unscanned. We will rescan at most last half of this.
static const int kMinRescanLength = 800;

// ---- Binary detection --------------------------------------------------------
// Make F_BINARY the only encoding
static const int kStrongBinary = 12;
// Make F_BINARY likely encoding
static const int kWeakerBinary = 4;
// These are byte counts from front of file
// Not binary if all ASCII
static const int kBinaryHardAsciiLimit = 6 * 1024;
// Not binary if mostly ASCII
static const int kBinarySoftAsciiLimit = 8 * 1024;

// ---- Tag/title text handling -------------------------------------------------
// We try here to avoid having title text dominate the encoding detection,
// for the not-infrequent error case of title in encoding1, body in encoding2:
// we want to bias toward encoding2 winning.
//
// kMaxBigramsTagTitleText should be a multiple of 2, 3, and 4, so that we
// rarely cut off mid-character in the original (not-yet-detected) encoding.
// This matters most for UTF-8 two- and three-byte codes and for
// Shift-JIS three-byte codes.
//
// Keep only some tag text
static const int kMaxBigramsTagTitleText = 12;
// Give text in tags, etc. 1/16 normal weight
static const int kWeightshiftForTagTitleText = 4;
// Let reliable enc with this many pairs overcome missing hint
static const int kStrongPairs = 6;
// Control flags passed as the first argument of InternalDetectEncoding.
// Each non-zero value is a distinct single bit.
enum CEDInternalFlags {
  kCEDNone = 0,             // The empty flag
  kCEDRescanning = 1 << 0,  // Do not further recurse
  kCEDSlowscore = 1 << 1,   // Do extra scoring
  kCEDForceTags = 1 << 2,   // Always examine text inside tags
};
// Forward declaration of the internal detection entry point; its definition is
// not in this part of the file. It takes the internal control flags, the text
// buffer and its length, and the URL / HTTP charset / META charset / encoding /
// language / corpus hints. bytes_consumed, is_reliable and second_best_enc are
// pointer parameters -- presumably outputs; confirm against the definition.
Encoding InternalDetectEncoding(
CEDInternalFlags flags, const char* text, int text_length,
const char* url_hint, const char* http_charset_hint,
const char* meta_charset_hint, const int encoding_hint,
const Language language_hint, // User interface lang
const CompactEncDet::TextCorpusType corpus_type,
bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable,
Encoding* second_best_enc);
// Probability data for one encoding: optional high-resolution bigram deltas,
// byte-value statistics, a scaling offset, and three 256-entry unigram tables.
// NOTE(review): presumably the element type of the generated unigram_table
// included below; if so, member order and types must not change -- confirm.
typedef struct {
const uint8* hires[4]; // Pointers to possible high-resolution bigram deltas
uint8 x_bar; // Average byte2 value
uint8 y_bar; // Average byte1 value
uint8 x_stddev; // Standard deviation of byte2 value
uint8 y_stddev; // Standard deviation of byte1 value
int so; // Scaling offset -- add to probabilities below
uint8 b1[256]; // Unigram probability for first byte of aligned bigram
uint8 b2[256]; // Unigram probability for second byte of aligned bigram
uint8 b12[256]; // Unigram probability for cross bytes of aligned bigram
} UnigramEntry;
//typedef struct {
// uint8 b12[256*256]; // Bigram probability for aligned bigram
//} FullBigramEntry;
// Include all the postproc-generated tables here:
// RankedEncoding
// kMapToEncoding
// unigram_table
// kMostLikelyEncoding
// kTLDHintProbs
// kCharsetHintProbs
// HintEntry, kMaxTldKey kMaxTldVector, etc.
// =============================================================================
#include "compact_enc_det/compact_enc_det_generated_tables.h"
// Local aliases onto F_* names. The targets are not defined in this part of the
// file -- presumably they come from the generated tables header included above.
#define F_ASCII F_Latin1 // "ASCII" is a misnomer, so this code uses "Latin1"
#define F_BINARY F_X_BINARYENC // We are mid-update for name change
#define F_UTF8UTF8 F_X_UTF8UTF8 // We are mid-update for name change
#define F_BIG5_CP950 F_BIG5 // We are mid-update for name change
#define F_Unicode F_UTF_16LE // We are mid-update for name change
// =============================================================================
// 7-bit encodings have at least one "interesting" byte value < 0x80
// (00 0E 1B + ~)
// JIS 2022-cn 2022-kr hz utf7
// Unicode UTF-16 UTF-32
// 8-bit encodings have no interesting byte values < 0x80
// Special-case property bits, one bit each. They are summed into per-encoding
// masks in kSpecialMask.
static const uint32 kSevenBitActive = 1u << 0;   // needs <80 to detect
static const uint32 kUTF7Active = 1u << 1;       // <80 and +
static const uint32 kHzActive = 1u << 2;         // <80 and ~
static const uint32 kIso2022Active = 1u << 3;    // <80 and 1B 0E 0F
static const uint32 kUTF8Active = 1u << 4;
static const uint32 kUTF8UTF8Active = 1u << 5;
static const uint32 kUTF1632Active = 1u << 6;    // <80 and 00
static const uint32 kBinaryActive = 1u << 7;     // <80 and 00
static const uint32 kTwobyteCode = 1u << 8;      // Needs 8xxx
static const uint32 kIsIndicCode = 1u << 9;      //
static const uint32 kHighAlphaCode = 1u << 10;   // full alphabet in 8x-Fx
static const uint32 kHighAccentCode = 1u << 11;  // accents in 8x-Fx
static const uint32 kEUCJPActive = 1u << 12;     // Have to mess with phase
// Debug only. not thread safe
// File-scope counters, all starting at zero. They are plain ints with no
// synchronization. Where they are incremented and printed is not visible here
// -- presumably tied to the "counts" flag; confirm.
static int encdet_used = 0;
static int rescore_used = 0;
static int rescan_used = 0;
static int robust_used = 0;
static int looking_used = 0;
static int doing_used = 0;
// For debugging only -- about 256B/entry times about 500 = 128KB
// TODO: only allocate this if being used
// One debug record: a source offset, the best encoding (or -1), a text label,
// and a copy of the per-encoding probability array. Records are filled in by
// the SetDetails* functions below through DetectEncodingState::debug_data.
typedef struct {
int offset;
int best_enc; // Best ranked encoding for this bigram, or
// -1 for overhead entries
string label;
int detail_enc_prob[NUM_RANKEDENCODING];
} DetailEntry;
// Two debug-only globals, both starting at -1. Presumably the ranked-encoding
// indexes matching FLAGS_enc_detect_watch1/2 -- confirm where they are set.
static int watch1_rankedenc = -1; // Debug. not threadsafe
static int watch2_rankedenc = -1; // Debug. not threadsafe
////static int next_detail_entry = 0; // Debug. not threadsafe
////static DetailEntry details[kMaxPairs * 10]; // Allow 10 details per bigram
// End For debugging only
// Must match kTestPrintableAsciiTildePlus exit codes, minus one
// Which set of interesting pairs an entry belongs to. NUM_PAIR_SETS is the
// number of sets and is used as an array dimension.
enum PairSet {
  AsciiPair,     // 0
  OtherPair,     // 1
  NUM_PAIR_SETS  // 2
};

// The reasons for pruning
enum PruneReason {
  PRUNE_NORMAL,
  PRUNE_SLOWEND,
  PRUNE_FINAL
};

// Printable name for each PairSet value, indexed by PairSet.
static const char* kWhatSetName[] = {
  "Ascii",
  "Other"
};

// State for encodings that do shift-out/shift-in between one- and two-byte
// regions (ISO-2022-xx, HZ)
enum StateSoSi {
  SOSI_NONE,
  SOSI_ERROR,
  SOSI_ONEBYTE,
  SOSI_TWOBYTE
};
// All working state for one detection run: source-range pointers, debug-record
// bookkeeping, hints, mini state machines for the special-case encodings, the
// active-encoding list with cumulative probabilities, and the buffers of
// "interesting" byte pairs. The code that initializes these members is not in
// this part of the file.
typedef struct {
const uint8* initial_src; // For calculating byte offsets
const uint8* limit_src; // Range of input source
const uint8* prior_src; // Source consumed by prior call to BoostPrune
const uint8* last_pair; // Last pair inserted into interesting_pairs
DetailEntry* debug_data; // Normally NULL. Ptr to debug data for
// FLAGS_enc_detect_detail PostScript data
int next_detail_entry; // Debug: index of the next debug_data slot to write
bool done;
bool reliable;
bool hints_derated;
int declared_enc_1; // From http/meta hint
int declared_enc_2; // from http/meta hint
int prune_count; // Number of times we have pruned
int trigram_highwater_mark; // Byte offset of last trigram processing
bool looking_for_latin_trigrams; // True if we should test for doing
// Latin1/2/7 trigram processing
bool do_latin_trigrams; // True if we actually are scoring trigrams
// Miscellaneous state variables for difficult encodings
int binary_quadrants_count; // Number of four bigram quadrants seen:
// 0xxxxxxx0xxxxxxx 0xxxxxxx1xxxxxxx
// 1xxxxxxx0xxxxxxx 1xxxxxxx1xxxxxxx
int binary_8x4_count; // Number of 8x4 buckets seen:
uint32 binary_quadrants_seen; // Bit[i] set if bigram i.......i....... seen
uint32 binary_8x4_seen; // Bit[i] set if bigram iii.....ii...... seen
int utf7_starts; // Count of possible UTF-7 beginnings seen
int prior_utf7_offset; // Source consumed by prior UTF-7 string
int next_utf8_ministate; // Mini state for UTF-8 sequences
int utf8_minicount[6]; // Number of correct 2- 3- 4-byte seq, errors
int next_utf8utf8_ministate; // Mini state for UTF8UTF8 sequences
int utf8utf8_odd_byte; // UTF8UTF8 seq has odd number of bytes
int utf8utf8_minicount[6]; // Number of correct 2- 3- 4-byte seq, errors
StateSoSi next_2022_state; // Mini state for 2022 sequences
StateSoSi next_hz_state; // Mini state for HZ sequences
bool next_eucjp_oddphase; // Mini state for EUC-JP sequences
int byte32_count[8]; // Count of top 3 bits of byte1 of bigram
// 0x1x 2x3x 4x5x 6x7x 8x9x AxBx CxDx ExFx
uint32 active_special; // Bits showing which special cases are active
Encoding tld_hint; // Top TLD encoding or UNKNOWN
Encoding http_hint; // What the document says about itself, or
// UNKNOWN_ENCODING
Encoding meta_hint; // What the document says about itself, or
// UNKNOWN_ENCODING
Encoding bom_hint; // BOM is the initial byte order mark for UTF-xx
// small cache of previous interesting bigrams
int next_prior_bigram;
int prior_bigram[4];
int prior_binary[1];
int top_rankedencoding; // Top two probabilities and families
int second_top_rankedencoding;
int top_prob;
int second_top_prob;
int prune_difference; // Prune things this much below the top prob
int rankedencoding_list_len; // Number of active encodings
int rankedencoding_list[NUM_RANKEDENCODING]; // List of active encodings
//
int enc_prob[NUM_RANKEDENCODING]; // Cumulative probability per enc
// This is where all the action is
int hint_prob[NUM_RANKEDENCODING]; // Initial hint probabilities
int hint_weight[NUM_RANKEDENCODING]; // Number of hints for this enc
// Two sets -- one for printable ASCII, one for the rest
int prior_interesting_pair[NUM_PAIR_SETS]; // Pairs consumed by prior call
int next_interesting_pair[NUM_PAIR_SETS]; // Next pair to write
char interesting_pairs[NUM_PAIR_SETS][kMaxPairs * 2]; // Two bytes per pair
int interesting_offsets[NUM_PAIR_SETS][kMaxPairs]; // Src offset of pair
int interesting_weightshift[NUM_PAIR_SETS][kMaxPairs]; // weightshift of pair
} DetectEncodingState;
// Append one debug record that snapshots the current per-encoding
// probabilities.
//   destatep  detection state; debug_data must point at a record array with a
//             free slot at index next_detail_entry (neither is checked here)
//   offset    stored in the record's offset field
//   best_enc  stored in the record's best_enc field
//   label     text copied into the record's label
void SetDetailsEncProb(DetectEncodingState* destatep,
                       int offset, int best_enc, const char* label) {
  DetailEntry& entry = destatep->debug_data[destatep->next_detail_entry];
  entry.offset = offset;
  entry.best_enc = best_enc;
  entry.label = label;
  // Copy the whole enc_prob array into this record.
  memcpy(entry.detail_enc_prob, destatep->enc_prob,
         sizeof(destatep->enc_prob));
  destatep->next_detail_entry += 1;
}
// Record a debug event that changes probabilities, reusing the offset of the
// previous debug record.
//   destatep  detection state; debug_data must point at a record array with a
//             free slot at index next_detail_entry (neither is checked here)
//   best_enc  stored in the record's best_enc field
//   label     text copied into the record's label
// If this is the first record (next_detail_entry == 0) there is no previous
// record to copy from, so the offset is recorded as 0.
void SetDetailsEncProbCopyOffset(DetectEncodingState* destatep,
                                 int best_enc, const char* label) {
  const int next = destatep->next_detail_entry;
  DetailEntry& entry = destatep->debug_data[next];
  // Guard the look-back: with next == 0, debug_data[next - 1] is index -1,
  // which is outside the array.
  entry.offset = (next > 0) ? destatep->debug_data[next - 1].offset : 0;
  entry.best_enc = best_enc;
  entry.label = label;
  // Snapshot the current probabilities into this record.
  memcpy(&entry.detail_enc_prob,
         &destatep->enc_prob,
         sizeof(destatep->enc_prob));
  ++destatep->next_detail_entry;
}
// Record a debug event that changes probs and has simple text label.
// The record reuses the previous record's offset, sets best_enc to -1 (the
// marker for overhead entries), and snapshots the current probabilities.
//   destatep  detection state; debug_data must point at a record array with a
//             free slot at index next_detail_entry (neither is checked here)
//   label     text copied into the record's label
// If this is the first record (next_detail_entry == 0) there is no previous
// record to copy from, so the offset is recorded as 0.
void SetDetailsEncLabel(DetectEncodingState* destatep, const char* label) {
  const int next = destatep->next_detail_entry;
  DetailEntry& entry = destatep->debug_data[next];
  // Guard the look-back: with next == 0, debug_data[next - 1] is index -1,
  // which is outside the array.
  entry.offset = (next > 0) ? destatep->debug_data[next - 1].offset : 0;
  entry.best_enc = -1;
  entry.label = label;
  memcpy(&entry.detail_enc_prob,
         &destatep->enc_prob,
         sizeof(destatep->enc_prob));
  ++destatep->next_detail_entry;
}
// Record a debug event that is just a text label, no change in probs.
// The record repeats the previous record's offset and probability snapshot and
// sets best_enc to -1 (the marker for overhead entries).
//   destatep  detection state; debug_data must point at a record array with a
//             free slot at index next_detail_entry (neither is checked here)
//   label     text copied into the record's label
// If this is the first record (next_detail_entry == 0) there is no previous
// record to repeat, so the offset is 0 and the snapshot is taken from the
// current enc_prob instead.
void SetDetailsLabel(DetectEncodingState* destatep, const char* label) {
  const int next = destatep->next_detail_entry;
  DetailEntry& entry = destatep->debug_data[next];
  entry.best_enc = -1;
  entry.label = label;
  if (next > 0) {
    const DetailEntry& prev = destatep->debug_data[next - 1];
    entry.offset = prev.offset;
    memcpy(&entry.detail_enc_prob,
           &prev.detail_enc_prob,
           sizeof(destatep->enc_prob));
  } else {
    // Guard the look-back: debug_data[-1] is outside the array.
    entry.offset = 0;
    memcpy(&entry.detail_enc_prob,
           &destatep->enc_prob,
           sizeof(destatep->enc_prob));
  }
  ++destatep->next_detail_entry;
}
// Maps superset encodings to base, to see if 2 encodings are compatible
// (Non-identity mappings are marked "-->" below.)
// One entry per Encoding value, in enum order; the number in each comment is
// the entry's index. The COMPILE_ASSERT after the table enforces that it has
// exactly NUM_ENCODINGS entries.
static const Encoding kMapEncToBaseEncoding[] = {
ISO_8859_1, // 0: Teragram ASCII
ISO_8859_2, // 1: Teragram Latin2
ISO_8859_3, // 2: in BasisTech but not in Teragram
ISO_8859_4, // 3: Teragram Latin4
ISO_8859_5, // 4: Teragram ISO-8859-5
ISO_8859_6, // 5: Teragram Arabic
ISO_8859_7, // 6: Teragram Greek
MSFT_CP1255, // 7: Teragram Hebrew --> 36
ISO_8859_9, // 8: in BasisTech but not in Teragram
ISO_8859_10, // 9: in BasisTech but not in Teragram
JAPANESE_EUC_JP, // 10: Teragram EUC_JP
JAPANESE_SHIFT_JIS, // 11: Teragram SJS
JAPANESE_JIS, // 12: Teragram JIS
CHINESE_BIG5, // 13: Teragram BIG5
CHINESE_GB, // 14: Teragram GB
CHINESE_EUC_CN, // 15: Teragram EUC-CN
KOREAN_EUC_KR, // 16: Teragram KSC
UNICODE, // 17: Teragram Unicode
CHINESE_EUC_CN, // 18: Teragram EUC --> 15
CHINESE_EUC_CN, // 19: Teragram CNS --> 15
CHINESE_BIG5, // 20: Teragram BIG5_CP950 --> 13
JAPANESE_SHIFT_JIS, // 21: Teragram CP932 --> 11
UTF8, // 22
UNKNOWN_ENCODING, // 23
ISO_8859_1, // 24: ISO_8859_1 with all characters <= 127 --> 0
RUSSIAN_KOI8_R, // 25: Teragram KOI8R
RUSSIAN_CP1251, // 26: Teragram CP1251
ISO_8859_1, // 27: CP1252 aka MSFT euro ascii --> 0
RUSSIAN_KOI8_RU, // 28: CP21866 aka KOI8_RU, used for Ukrainian
MSFT_CP1250, // 29: CP1250 aka MSFT eastern european
ISO_8859_1, // 30: aka ISO_8859_0 aka ISO_8859_1 euroized --> 0
ISO_8859_9, // 31: used for Turkish
ISO_8859_13, // 32: used in Baltic countries --> 43
ISO_8859_11, // 33: aka TIS-620, used for Thai
ISO_8859_11, // 34: used for Thai --> 33
MSFT_CP1256, // 35: used for Arabic
MSFT_CP1255, // 36: Logical Hebrew Microsoft
MSFT_CP1255, // 37: Iso Hebrew Logical --> 36
MSFT_CP1255, // 38: Iso Hebrew Visual --> 36
CZECH_CP852, // 39
ISO_8859_2, // 40: aka ISO_IR_139 aka KOI8_CS --> 1
MSFT_CP1253, // 41: used for Greek, but NOT a superset of 8859-7
RUSSIAN_CP866, // 42
ISO_8859_13, // 43
ISO_2022_KR, // 44
CHINESE_GB, // 45 GBK --> 14
CHINESE_GB, // 46 GB18030 --> 14
CHINESE_BIG5, // 47 BIG5_HKSCS --> 13
ISO_2022_KR, // 48 ISO_2022_CN --> 44
TSCII, // 49 Indic encoding
TAMIL_MONO, // 50 Indic encoding - Tamil
TAMIL_BI, // 51 Indic encoding - Tamil
JAGRAN, // 52 Indic encoding - Devanagari
MACINTOSH_ROMAN, // 53
UTF7, // 54
BHASKAR, // 55 Indic encoding - Devanagari
HTCHANAKYA, // 56 Indic encoding - Devanagari
UTF16BE, // 57
UTF16LE, // 58
UTF32BE, // 59
UTF32LE, // 60
BINARYENC, // 61
HZ_GB_2312, // 62
UTF8UTF8, // 63
TAM_ELANGO, // 64 Elango - Tamil
TAM_LTTMBARANI, // 65 Barani - Tamil
TAM_SHREE, // 66 Shree - Tamil
TAM_TBOOMIS, // 67 TBoomis - Tamil
TAM_TMNEWS, // 68 TMNews - Tamil
TAM_WEBTAMIL, // 69 Webtamil - Tamil
KDDI_SHIFT_JIS, // 70 KDDI Shift_JIS
DOCOMO_SHIFT_JIS, // 71 DoCoMo Shift_JIS
SOFTBANK_SHIFT_JIS, // 72 SoftBank Shift_JIS
KDDI_ISO_2022_JP, // 73 KDDI ISO-2022-JP
SOFTBANK_ISO_2022_JP, // 74 SOFTBANK ISO-2022-JP
};
COMPILE_ASSERT(arraysize(kMapEncToBaseEncoding) == NUM_ENCODINGS,
kMapEncToBaseEncoding_has_incorrect_size);
// Maps base encodings to 0, supersets to 1+, undesired to -1
// (Non-identity mappings are marked "-->" below.)
// One entry per Encoding value, in the same order as kMapEncToBaseEncoding;
// the "-->" targets repeat the base mapping from that table. The
// COMPILE_ASSERT after the table enforces exactly NUM_ENCODINGS entries.
static const int kMapEncToSuperLevel[] = {
0, // 0: Teragram ASCII
0, // 1: Teragram Latin2
0, // 2: in BasisTech but not in Teragram
0, // 3: Teragram Latin4
0, // 4: Teragram ISO-8859-5
0, // 5: Teragram Arabic
0, // 6: Teragram Greek
0, // 7: Teragram Hebrew
0, // 8: in BasisTech but not in Teragram
0, // 9: in BasisTech but not in Teragram
0, // 10: Teragram EUC_JP
0, // 11: Teragram SJS
0, // 12: Teragram JIS
0, // 13: Teragram BIG5
0, // 14: Teragram GB
0, // 15: Teragram EUC-CN
0, // 16: Teragram KSC
0, // 17: Teragram Unicode
-1, // 18: Teragram EUC --> 15
-1, // 19: Teragram CNS --> 15
1, // 20: Teragram BIG5_CP950 --> 13
1, // 21: Teragram CP932 --> 11
0, // 22
-1, // 23
-1, // 24: ISO_8859_1 with all characters <= 127 --> 0
0, // 25: Teragram KOI8R
0, // 26: Teragram CP1251
1, // 27: CP1252 aka MSFT euro ascii --> 0
0, // 28: CP21866 aka KOI8_RU, used for Ukrainian
0, // 29: CP1250 aka MSFT eastern european
1, // 30: aka ISO_8859_0 aka ISO_8859_1 euroized --> 0
0, // 31: used for Turkish
1, // 32: used in Baltic countries --> 43
0, // 33: aka TIS-620, used for Thai
1, // 34: used for Thai --> 33
0, // 35: used for Arabic
0, // 36: Logical Hebrew Microsoft
-1, // 37: Iso Hebrew Logical --> 36
-1, // 38: Iso Hebrew Visual --> 36
0, // 39
1, // 40: aka ISO_IR_139 aka KOI8_CS --> 1
0, // 41: used for Greek, NOT superset of 8859-7
0, // 42
0, // 43
0, // 44
1, // 45 GBK --> 14
1, // 46 GB18030 --> 14
1, // 47 BIG5_HKSCS --> 13
1, // 48 ISO_2022_CN --> 44
0, // 49 Indic encoding
0, // 50 Indic encoding - Tamil
0, // 51 Indic encoding - Tamil
0, // 52 Indic encoding - Devanagari
0, // 53
0, // 54
0, // 55 Indic encoding - Devanagari
0, // 56 Indic encoding - Devanagari
0, // 57
0, // 58
0, // 59
0, // 60
0, // 61
0, // 62
2, // 63
0, 0, 0, 0, 0, 0, // add six more Tamil
0, 0, 0, 0, 0, // add five encodings with emoji
};
COMPILE_ASSERT(arraysize(kMapEncToSuperLevel) == NUM_ENCODINGS,
kMapEncToSuperLevel_has_incorrect_size);
// Subscripted by Encoding enum value
// Each entry is the sum of the special-case property bits (kSevenBitActive,
// kTwobyteCode, kHighAlphaCode, ...) that apply to that encoding; 0 means
// none. The COMPILE_ASSERT after the table enforces exactly NUM_ENCODINGS
// entries.
static const uint32 kSpecialMask[] = {
kHighAccentCode, // 0
kHighAccentCode,
kHighAccentCode,
kHighAccentCode,
kHighAlphaCode, // 4
kHighAlphaCode,
kHighAlphaCode,
kHighAlphaCode,
kHighAccentCode,
kHighAccentCode,
kTwobyteCode + kEUCJPActive, // 10 euc-jp
kTwobyteCode,
kSevenBitActive + kIso2022Active, // jis
kTwobyteCode,
kTwobyteCode,
kTwobyteCode,
kTwobyteCode,
kSevenBitActive + kUTF1632Active, // Unicode
kTwobyteCode,
kTwobyteCode,
kTwobyteCode, // 20
kTwobyteCode,
kUTF8Active, // UTF-8
0,
0,
kHighAlphaCode, // 25
kHighAlphaCode,
kHighAccentCode,
kHighAlphaCode,
kHighAccentCode,
kHighAccentCode, // 30
kHighAccentCode,
kHighAccentCode,
kHighAlphaCode,
kHighAlphaCode,
kHighAlphaCode, // 35
kHighAlphaCode,
kHighAlphaCode,
kHighAlphaCode,
0,
0, // 40
kHighAlphaCode,
kHighAlphaCode,
kHighAccentCode,
kSevenBitActive + kIso2022Active, // 2022-kr
kTwobyteCode,
kTwobyteCode,
kTwobyteCode,
kSevenBitActive + kIso2022Active, // 2022-cn
kHighAlphaCode + kIsIndicCode, // 49 TSCII
kHighAlphaCode + kIsIndicCode, // 50 TAMIL_MONO
kHighAlphaCode + kIsIndicCode, // 51 TAMIL_BI
kHighAlphaCode + kIsIndicCode, // 52 JAGRAN
kHighAccentCode, // 53 MACINTOSH_ROMAN
kSevenBitActive + kUTF7Active, // 54 UTF-7
kHighAlphaCode + kIsIndicCode, // 55 BHASKAR Indic encoding - Devanagari
kHighAlphaCode + kIsIndicCode, // 56 HTCHANAKYA Indic encoding - Devanagari
kSevenBitActive + kUTF1632Active, // 57 UTF16BE
kSevenBitActive + kUTF1632Active, // 58 UTF16LE
kSevenBitActive + kUTF1632Active, // 59 UTF32BE
kSevenBitActive + kUTF1632Active, // 60 UTF32LE
kSevenBitActive + kBinaryActive, // 61 BINARYENC
kSevenBitActive + kHzActive, // 62 HZ_GB_2312
kHighAccentCode + kUTF8Active + kUTF8UTF8Active, // 63 UTF8UTF8
kHighAlphaCode + kIsIndicCode, // 64 Elango - Tamil
kHighAlphaCode + kIsIndicCode, // 65 Barani - Tamil
kHighAlphaCode + kIsIndicCode, // 66 Shree - Tamil
kHighAlphaCode + kIsIndicCode, // 67 TBoomis - Tamil
kHighAlphaCode + kIsIndicCode, // 68 TMNews - Tamil
kHighAlphaCode + kIsIndicCode, // 69 Webtamil - Tamil
kTwobyteCode, // 70 KDDI Shift_JIS
kTwobyteCode, // 71 DoCoMo Shift_JIS
kTwobyteCode, // 72 SoftBank Shift_JIS
kSevenBitActive + kIso2022Active, // 73 KDDI-ISO-2022-JP
kSevenBitActive + kIso2022Active, // 74 SOFTBANK-ISO-2022-JP
};
COMPILE_ASSERT(arraysize(kSpecialMask) == NUM_ENCODINGS,
kSpecialMask_has_incorrect_size);
/***
kHighAlphaCode -- full alphabet in 8x-Fx range, not just accents
ISO_8859_5, // 4: Teragram ISO-8859-5 Cyrl UL bd
RUSSIAN_CP1251, // 26: Teragram CP1251 UL cdef
RUSSIAN_KOI8_R, // 25: Teragram KOI8R LU cdef
RUSSIAN_KOI8_RU, // 28: CP21866 aka KOI8_RU, LU cdef
RUSSIAN_CP866, // 42 89ae
ISO_8859_6, // 5: Teragram Arabic nocase cde
MSFT_CP1256, // 35: used for Arabic nocase cde
ISO_8859_7, // 6: Teragram Greek UL cdef
MSFT_CP1253, // 41: used for Greek UL cdef
ISO_8859_8, // 7: Teragram Hebrew nocase ef
MSFT_CP1255, // 36: Logical Hebrew Microsoft nocase ef
ISO_8859_8_I, // 37: Iso Hebrew Logical nocase ef
HEBREW_VISUAL, // 38: Iso Hebrew Visual nocase ef
ISO_8859_11, // 33: aka TIS-620, used for Thai nocase abcde
MSFT_CP874, // 34: used for Thai nocase abcde
TSCII, // 49 8-f
TAMIL_MONO, // 50
TAMIL_BI, // 51
JAGRAN, // 52
BHASKAR, // 55 Indic encoding - Devanagari
HTCHANAKYA, // 56 Indic encoding - Devanagari
***/
// We can scan bytes using this at about 500 MB/sec 2.8GHz P4
// Slow scan uses this, stopping on NUL ESC SO SI bad C0 and + ~
// We allow FF, 0x0C, here because it gives a better result for old
// Ascii text formatted for a TTY
// non-zero exits scan loop -- 1 for printable ASCII, 2 otherwise
// Indexed by byte value, 32 entries per row (rows start at 0x00, 0x20, ...).
// Entries are 0 for HT, LF, FF, CR and for 0x20-0x7D except '+';
// 1 for '+' (0x2B) and '~' (0x7E); 2 for every other byte, including 0x7F
// and all bytes >= 0x80.
static const char kTestPrintableAsciiTildePlus[256] = {
2,2,2,2,2,2,2,2, 2,0,0,2,0,0,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
0,0,0,0,0,0,0,0, 0,0,0,1,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,1,2,
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
};
// We can scan bytes using this at about 550 MB/sec 2.8GHz P4
// Slow scan uses this, stopping on NUL ESC SO SI and bad C0
// after Hz and UTF7 are pruned away
// We allow Form Feed, 0x0C, here
// Indexed by byte value, 32 entries per row. Entries are 0 for HT, LF, FF, CR
// and for 0x20-0x7E (so '+' and '~' no longer stop the scan); 2 for every
// other byte, including 0x7F and all bytes >= 0x80.
static const char kTestPrintableAscii[256] = {
2,2,2,2,2,2,2,2, 2,0,0,2,0,0,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,2,
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
};
// Used in first-four-byte testing
// Indexed by byte value, 32 entries per row. Entries are 1 for HT (0x09),
// LF (0x0A), CR (0x0D) and for 0x20-0x7E; 0 for every other byte. Unlike the
// two scan tables above, Form Feed (0x0C) is 0 here.
static const char kIsPrintableAscii[256] = {
0,0,0,0,0,0,0,0, 0,1,1,0,0,1,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
// Indexed by byte value, 16 entries per row. Gives the 6-bit value of a
// base64 character: 'A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61,
// '+' -> 62, '/' -> 63. Every other byte, including '=', maps to -1.
static const signed char kBase64Value[256] = {
-1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,62,-1,-1,-1,63,
52,53,54,55,56,57,58,59, 60,61,-1,-1,-1,-1,-1,-1,
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,
15,16,17,18,19,20,21,22, 23,24,25,-1,-1,-1,-1,-1,
-1,26,27,28,29,30,31,32, 33,34,35,36,37,38,39,40,
41,42,43,44,45,46,47,48, 49,50,51,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
};
// Subscripted by <state, byte/16>
// Next-state table: the row is the current state, the column is the high
// nibble of the byte, and the entry is the next state. State 0 is the start
// state and state 7 is the error state.
// Accepts Cx->8x Dx->8x Ex->8x->8x Fx->8x->8x->8x
//
// Fixed Problem: GB has sequences like B2DB B8D6 BDE1 B9B9
// which we can mis-parse as an error byte followed by good UTF-8:
// B2 DBB8 D6BD E1B9B9
// To counteract this, we now require an ASCII7 byte to resync out
// of the error state
// Next problem: good UTF-8 with bad byte
// efbc a012 eea4 bee7 b280 c2b7
// efbca0 12 eea4be e7b280 c2b7
// ^^ bad byte
// fix: change state0 byte 1x to be don't-care
//
// Short UTF-8 ending in ASCII7 byte should resync immediately:
// E0 20 E0 A6 AA should give one error and resync at 2nd E0
//
static const char kMiniUTF8State[8][16] = {
{0,0,0,0,0,0,0,0, 7,7,7,7,1,1,2,4,}, // [0] start char (allow cr/lf/ht)
{0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,}, // [1] continue 1 of 2
{0,7,0,0,0,0,0,0, 3,3,3,3,7,7,7,7,}, // [2] continue 1 of 3
{0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,}, // [3] continue 2 of 3
{0,7,0,0,0,0,0,0, 5,5,5,5,7,7,7,7,}, // [4] continue 1 of 4
{0,7,0,0,0,0,0,0, 6,6,6,6,7,7,7,7,}, // [5] continue 2 of 4
{0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,}, // [6] continue 3 of 4
{0,7,0,0,0,0,0,0, 7,7,7,7,7,7,7,7,}, // [7] error, soak up continues,
// ONLY resync after Ascii char
// then restart
};
// Counter to increment: 0-don'tcare 1-error 2-good_2B 3-good_3B 4-good_4B
// Companion to kMiniUTF8State with the same [8][16] shape; presumably
// subscripted the same way (<state, byte/16>) -- confirm against the scanner.
// Each entry names the counter to bump for that transition:
// 1 = error, 2/3/4 = a complete good 2-/3-/4-byte sequence, 0 = nothing.
static const char kMiniUTF8Count[8][16] = {
{0,0,0,0,0,0,0,0, 1,1,1,1,0,0,0,0,}, // [0] start char (allow cr/lf/ht)
{1,1,1,1,1,1,1,1, 2,2,2,2,1,1,1,1,}, // [1] continue 1 of 2
{1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [2] continue 1 of 3
{1,1,1,1,1,1,1,1, 3,3,3,3,1,1,1,1,}, // [3] continue 2 of 3
{1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [4] continue 1 of 4
{1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [5] continue 2 of 4
{1,1,1,1,1,1,1,1, 4,4,4,4,1,1,1,1,}, // [6] continue 3 of 4
{0,1,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,}, // [7] error, soak up continues,
// then restart
};
// Subscripted by <state, f(byte1) + g(byte2)>
// where f(x)= E2->4, Cx->8 and C3->12 and 0 otherwise
// and g(x) = (x >> 4) & 3 8x->0 9x->1 Ax->2 Bx->3 Cx->0, etc.
// (no checking for illegal bytes)
// Here are example patterns of CP1252 converted to UTF-8 0/1/2 times. We want
// to detect two, so we can back-convert to one.
// zero one two pattern
// ---- ------ ---------------- -----------------
// 81 C281 C382C281 C3->8x->C2->xx
// 98 CB9C C38BC593 C3->8x->C5->xx
// C3 C383 C383C692 C3->8x->C6->xx
// C8 C388 C383CB86 C3->8x->CB->xx
// 83 C692 C386E28099 C3->8x->E2->xx->8x
// 80 E282AC C3A2E2809AC2AC C3->A2->E2->xx->xx->Cx->xx
// 92 E28099 C3A2E282ACE284A2 C3->A2->E2->xx->xx->E2->xx->xx
//
// We also want to detect bare-byte extra UTF-8 conversions:
// zero one two pattern
// ---- ------ ---------------- -----------------
// C3 C3 C383 C3->8x->C2->xx
// D3 D3 C393 C3->9x->C2->xx->C2->xx
// E3 E3 C3A3 C3->Ax->C2->xx->C2->xx->C2->xx
// F3 F3 C3B3 C3->Bx->C2->xx->C2->xx->C2->xx->C2->xx
//
/**
CP1252 => UTF8 => UTF8UTF8
80 => E282AC => C3A2E2809AC2AC
81 => C281 => C382C281
82 => E2809A => C3A2E282ACC5A1
83 => C692 => C386E28099
84 => E2809E => C3A2E282ACC5BE
85 => E280A6 => C3A2E282ACC2A6
86 => E280A0 => C3A2E282ACC2A0
87 => E280A1 => C3A2E282ACC2A1
88 => CB86 => C38BE280A0
89 => E280B0 => C3A2E282ACC2B0
8A => C5A0 => C385C2A0
8B => E280B9 => C3A2E282ACC2B9
8C => C592 => C385E28099
8D => C28D => C382C28D
8E => C5BD => C385C2BD
8F => C28F => C382C28F
90 => C290 => C382C290
91 => E28098 => C3A2E282ACCB9C
92 => E28099 => C3A2E282ACE284A2
93 => E2809C => C3A2E282ACC593
94 => E2809D => C3A2E282ACC29D
95 => E280A2 => C3A2E282ACC2A2
96 => E28093 => C3A2E282ACE2809C
97 => E28094 => C3A2E282ACE2809D
98 => CB9C => C38BC593
99 => E284A2 => C3A2E2809EC2A2
9A => C5A1 => C385C2A1
9B => E280BA => C3A2E282ACC2BA
9C => C593 => C385E2809C
9D => C29D => C382C29D
9E => C5BE => C385C2BE
9F => C5B8 => C385C2B8
A0 => C2A0 => C382C2A0
A1 => C2A1 => C382C2A1
A2 => C2A2 => C382C2A2
A3 => C2A3 => C382C2A3
A4 => C2A4 => C382C2A4
A5 => C2A5 => C382C2A5
A6 => C2A6 => C382C2A6
A7 => C2A7 => C382C2A7
A8 => C2A8 => C382C2A8
A9 => C2A9 => C382C2A9
AA => C2AA => C382C2AA
AB => C2AB => C382C2AB
AC => C2AC => C382C2AC
AD => C2AD => C382C2AD
AE => C2AE => C382C2AE
AF => C2AF => C382C2AF
B0 => C2B0 => C382C2B0
B1 => C2B1 => C382C2B1
B2 => C2B2 => C382C2B2
B3 => C2B3 => C382C2B3
B4 => C2B4 => C382C2B4
B5 => C2B5 => C382C2B5
B6 => C2B6 => C382C2B6
B7 => C2B7 => C382C2B7
B8 => C2B8 => C382C2B8
B9 => C2B9 => C382C2B9
BA => C2BA => C382C2BA
BB => C2BB => C382C2BB
BC => C2BC => C382C2BC
BD => C2BD => C382C2BD
BE => C2BE => C382C2BE
BF => C2BF => C382C2BF
C0 => C380 => C383E282AC
C1 => C381 => C383C281
C2 => C382 => C383E2809A
C3 => C383 => C383C692
C4 => C384 => C383E2809E
C5 => C385 => C383E280A6
C6 => C386 => C383E280A0
C7 => C387 => C383E280A1
C8 => C388 => C383CB86
C9 => C389 => C383E280B0
CA => C38A => C383C5A0
CB => C38B => C383E280B9
CC => C38C => C383C592
CD => C38D => C383C28D
CE => C38E => C383C5BD
CF => C38F => C383C28F
D0 => C390 => C383C290
D1 => C391 => C383E28098
D2 => C392 => C383E28099
D3 => C393 => C383E2809C
D4 => C394 => C383E2809D
D5 => C395 => C383E280A2
D6 => C396 => C383E28093
D7 => C397 => C383E28094
D8 => C398 => C383CB9C
D9 => C399 => C383E284A2
DA => C39A => C383C5A1
DB => C39B => C383E280BA
DC => C39C => C383C593
DD => C39D => C383C29D
DE => C39E => C383C5BE
DF => C39F => C383C5B8
E0 => C3A0 => C383C2A0
E1 => C3A1 => C383C2A1
E2 => C3A2 => C383C2A2
E3 => C3A3 => C383C2A3
E4 => C3A4 => C383C2A4
E5 => C3A5 => C383C2A5
E6 => C3A6 => C383C2A6
E7 => C3A7 => C383C2A7
E8 => C3A8 => C383C2A8
E9 => C3A9 => C383C2A9
EA => C3AA => C383C2AA
EB => C3AB => C383C2AB
EC => C3AC => C383C2AC
ED => C3AD => C383C2AD
EE => C3AE => C383C2AE
EF => C3AF => C383C2AF
F0 => C3B0 => C383C2B0
F1 => C3B1 => C383C2B1
F2 => C3B2 => C383C2B2
F3 => C3B3 => C383C2B3
F4 => C3B4 => C383C2B4
F5 => C3B5 => C383C2B5
F6 => C3B6 => C383C2B6
F7 => C3B7 => C383C2B7
F8 => C3B8 => C383C2B8
F9 => C3B9 => C383C2B9
FA => C3BA => C383C2BA
FB => C3BB => C383C2BB
FC => C3BC => C383C2BC
FD => C3BD => C383C2BD
FE => C3BE => C383C2BE
FF => C3BF => C383C2BF
**/
// Subscripted by <state, f(byte1) + g(byte2)>
// where f(x)= E2->4, C2/5/6/B->8 and C3->12 and 0 otherwise
// and g(x) = (x >> 4) & 3 8x->0 9x->1 Ax->2 Bx->3 Cx->0, etc.
// 81 C281 C382C281 C3->8x->C2->xx
// 98 CB9C C38BC593 C3->8x->C5->xx
// C3 C383 C383C692 C3->8x->C6->xx
// C8 C388 C383CB86 C3->8x->CB->xx
// [0] [2] [0]
// 83 C692 C386E28099 C3->8x->E2->xx->xx
// odd_byte=0 [0] [2] [0+] odd_byte flipped
// odd_byte=1 [0+] [2] [0] [0] odd_byte unflipped
// 80 E282AC C3A2E2809AC2AC C3->A2->E2->xx->xx->Cx->xx
// odd_byte=0 [0] [3] [4] [0+]
// odd_byte=1 [0+] [3] [4] [4] [0]
// 92 E28099 C3A2E282ACE284A2 C3->A2->E2->xx->xx->E2->xx->xx
// odd_byte=0 [0] [3] [4] [0] [0]
// odd_byte=1 [0+] [3] [4] [4] [0+]
//
// When an E2xxxx sequence is encountered, we absorb the two bytes E2xx and flip
// the odd_byte state. If that goes from 0 to 1, the next pair is offset up
// by one byte, picking up the two bytes just after E2xxxx. If odd_byte goes
// from 1 to 0, the next two bytes picked up are the two bytes xxxx of E2xxxx.
// These are absorbed with no error in state 0 or state 4
//
// C3 C3 C383 C3->8x->C2->xx
// D3 D3 C393 C3->9x->C2->xx->C2->xx
// E3 E3 C3A3 C3->Ax->C2->xx->C2->xx->C2->xx
// F3 F3 C3B3 C3->Bx->C2->xx->C2->xx->C2->xx->C2->xx
// Counter3 for Fx Ex sequences is incremented at last C2
// Next-state table for the doubly-converted-UTF-8 (UTF8UTF8) scanner.
// Row = current state, column = subscript built from a byte pair by UTF88Sub():
// columns 0-3 other lead byte, 4-7 E2xx, 8-11 C2/C5/C6/CBxx, 12-15 C3xx,
// with the low two bits taken from bits 5..4 of the second byte.
// State 1 is the error state; its row is identical to state 0.
static const char kMiniUTF8UTF8State[8][16] = {
// xxxx E2xx CXxx C3xx
// 8 9 a b 8 9 a b 8 9 a b
{0,0,0,0,1,1,1,1, 1,1,1,1,2,2,3,5,}, // [0] looking for C38x/C3Ax/2020/8x8x, or err
{0,0,0,0,1,1,1,1, 1,1,1,1,2,2,3,5,}, // [1] error, back to looking
{1,1,1,1,0,0,0,0, 0,0,0,0,1,1,1,1,}, // [2] C38x looking for CXxx/E2xxxx
// + + + + // E2xxxx flips odd_byte
{1,1,1,1,4,4,4,4, 7,7,7,7,1,1,1,1,}, // [3] C3Ax looking for E2xx or C2xxC2xx
// + + + + // E2xxxx flips odd_byte
{4,4,4,4,0,0,0,0, 0,0,0,0,1,1,1,1,}, // [4] C3AxE2xx-- looking for C2xx/E2xxxx
// + + + + // E2xxxx flips odd_byte
{1,1,1,1,1,1,1,1, 6,6,6,6,1,1,1,1,}, // [5] C3Bx -- looking for C2xxC2xxC2xx
{1,1,1,1,1,1,1,1, 7,7,7,7,1,1,1,1,}, // [6] C3Bx -- looking for C2xxC2xx
{1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [7] C3Bx -- looking for C2xx
};
// Counter to increment: 0-don'tcare 1-error 2-good_2B 3-good_3B 4-good_4B
// Counter-selection table for the UTF8UTF8 scanner; same shape and row/column
// labels as kMiniUTF8UTF8State. Each entry names the counter to bump for that
// transition: 1 = error, 2/3/4 = good sequence of that class, 0 = nothing.
static const char kMiniUTF8UTF8Count[8][16] = {
// xxxx E2xx C2Xx C3xx
// 8 9 a b 8 9 a b 8 9 a b
{0,0,0,0,1,1,1,1, 1,1,1,1,0,0,0,0,}, // [0] looking for C38x/C3Ax/2020/8x8x, or err
{0,0,0,0,1,1,1,1, 1,1,1,1,0,0,0,0,}, // [1] error, back to looking
{1,1,1,1,3,3,3,3, 2,2,2,2,1,1,1,1,}, // [2] C38x looking for CXxx/E2xxxx
// + + + + // E2xxxx flips odd_byte
{1,1,1,1,0,0,0,0, 0,0,0,0,1,1,1,1,}, // [3] C3Ax looking for E2xx
// + + + + // E2xxxx flips odd_byte
{1,1,1,1,4,4,4,4, 4,4,4,4,1,1,1,1,}, // [4] C3AxE2xx-- looking for C2xx/E2xxxx
// + + + + // E2xxxx flips odd_byte
{1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [5] C3Bx -- looking for C2xxC2xxC2xx
{1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,}, // [6] C3Bx -- looking for C2xxC2xx
{1,1,1,1,1,1,1,1, 3,3,3,3,1,1,1,1,}, // [7] C3Bx -- looking for C2xx
};
// Odd-byte flip table for the UTF8UTF8 scanner; same shape and row/column
// labels as kMiniUTF8UTF8State. An entry of 1 marks a transition that flips
// the odd_byte phase; the only such entries are the E2xx columns of
// states 2, 3 and 4.
static const char kMiniUTF8UTF8Odd[8][16] = {
// xxxx E2xx C2Xx C3xx
// 8 9 a b 8 9 a b 8 9 a b
{0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,}, // [0] looking for C38x/C3Ax/2020/8x8x, or err
{0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,}, // [1] error, back to looking
{0,0,0,0,1,1,1,1, 0,0,0,0,0,0,0,0,}, // [2] C38x looking for CXxx/E2xxxx
// + + + + // E2xxxx flips odd_byte
{0,0,0,0,1,1,1,1, 0,0,0,0,0,0,0,0,}, // [3] C3Ax looking for E2xx
// + + + + // E2xxxx flips odd_byte
{0,0,0,0,1,1,1,1, 0,0,0,0,0,0,0,0,}, // [4] C3AxE2xx-- looking for C2xx/E2xxxx
// + + + + // E2xxxx flips odd_byte
{0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,}, // [5] C3Bx -- looking for C2xxC2xxC2xx
{0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,}, // [6] C3Bx -- looking for C2xxC2xx
{0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,}, // [7] C3Bx -- looking for C2xx
};
// Turn a pair of bytes into the subscript for UTF8UTF8 tables above
int UTF88Sub(char s0, char s1) {
int sub = (s1 >> 4) & 0x03;
uint8 u0 = static_cast<uint8>(s0);
if (u0 == 0xc3) {
sub += 12;
} else if ((u0 & 0xf0) == 0xc0) {
if ((u0 == 0xc2) || (u0 == 0xc5) || (u0 == 0xc6) || (u0 == 0xcb)) {
sub += 8;
}
} else if (u0 == 0xe2) {
sub += 4;
}
return sub;
}
// Default probability for an encoding rankedencoding
// Based on a scan of 55M web pages
// These values are 255 - log base 2**1/10 (occurrences / total)
// Large values are most likely. This is the reverse of some Google code
// 255 = 1.0, 245 = 1/2, 235 = 1/4, 15 = 1/2**24, 0 = 0 (< 1/50M)
//
// TODO change this to be per encoding, not permuted
//
// Support function for unit test program
// Return ranked encoding corresponding to enc
// (also exported to compact_enc_det_text.cc)
// Linear search of kMapToEncoding for enc.
// Returns the lowest ranked-encoding subscript that maps to enc, or -1 if
// no entry matches.
int CompactEncDet::BackmapEncodingToRankedEncoding(Encoding enc) {
  int ranked = 0;
  while ((ranked < NUM_RANKEDENCODING) && (kMapToEncoding[ranked] != enc)) {
    ++ranked;
  }
  return (ranked < NUM_RANKEDENCODING) ? ranked : -1;
}
// Render the bits set in active as a human-readable list.
// Each recognized bit contributes its name followed by one space, in the
// fixed order below. Returns an empty string if no recognized bit is set.
string DecodeActive(uint32 active) {
  string names;
  if ((active & kBinaryActive) != 0) {names += "Binary ";}
  if ((active & kUTF1632Active) != 0) {names += "UTF1632 ";}
  if ((active & kUTF8UTF8Active) != 0) {names += "UTF8UTF8 ";}
  if ((active & kUTF8Active) != 0) {names += "UTF8 ";}
  if ((active & kIso2022Active) != 0) {names += "Iso2022 ";}
  if ((active & kHzActive) != 0) {names += "Hz ";}
  if ((active & kUTF7Active) != 0) {names += "UTF7A ";}
  if ((active & kSevenBitActive) != 0) {names += "SevenBit ";}
  if ((active & kIsIndicCode) != 0) {names += "Indic ";}
  if ((active & kHighAlphaCode) != 0) {names += "HighAlpha ";}
  if ((active & kHighAccentCode) != 0) {names += "HighAccent ";}
  if ((active & kEUCJPActive) != 0) {names += "EUCJP ";}
  return names;
}
// True if kSpecialMask[enc] has the kSevenBitActive bit set.
static inline bool SevenBitEncoding(int enc) {
  const bool is_seven_bit = (kSpecialMask[enc] & kSevenBitActive) != 0;
  return is_seven_bit;
}
// True if kSpecialMask[enc] has the kTwobyteCode bit set.
static inline bool TwoByteEncoding(int enc) {
  const bool is_two_byte = (kSpecialMask[enc] & kTwobyteCode) != 0;
  return is_two_byte;
}
// True if kSpecialMask[enc] has the kIsIndicCode bit set.
static inline bool IndicEncoding(int enc) {
  const bool is_indic = (kSpecialMask[enc] & kIsIndicCode) != 0;
  return is_indic;
}
// True if kSpecialMask[enc] has the kHighAlphaCode bit set.
static inline bool HighAlphaEncoding(int enc) {
  const bool is_high_alpha = (kSpecialMask[enc] & kHighAlphaCode) != 0;
  return is_high_alpha;
}
// True if kSpecialMask[enc] has the kHighAccentCode bit set.
static inline bool HighAccentEncoding(int enc) {
  const bool is_high_accent = (kSpecialMask[enc] & kHighAccentCode) != 0;
  return is_high_accent;
}
// True if any bit at all is set in destatep->active_special.
static inline bool AnyActive(DetectEncodingState* destatep) {
  if (destatep->active_special != 0) {
    return true;
  }
  return false;
}
// True if the kSevenBitActive bit is set in destatep->active_special.
static inline bool SevenBitActive(DetectEncodingState* destatep) {
  const bool bit_set = (destatep->active_special & kSevenBitActive) != 0;
  return bit_set;
}
// True if the kHzActive bit is set in destatep->active_special.
static inline bool HzActive(DetectEncodingState* destatep) {
  const bool bit_set = (destatep->active_special & kHzActive) != 0;
  return bit_set;
}
// True if the kIso2022Active bit is set in destatep->active_special.
static inline bool Iso2022Active(DetectEncodingState* destatep) {
  const bool bit_set = (destatep->active_special & kIso2022Active) != 0;
  return bit_set;
}
// True if the kUTF8Active bit is set in destatep->active_special.
static inline bool UTF8Active(DetectEncodingState* destatep) {
  const bool bit_set = (destatep->active_special & kUTF8Active) != 0;
  return bit_set;
}
// True if the kUTF8UTF8Active bit is set in destatep->active_special.
static inline bool UTF8UTF8Active(DetectEncodingState* destatep) {
  const bool bit_set = (destatep->active_special & kUTF8UTF8Active) != 0;
  return bit_set;
}
// True if the kUTF1632Active bit is set in destatep->active_special.
static inline bool UTF1632Active(DetectEncodingState* destatep) {
  const bool bit_set = (destatep->active_special & kUTF1632Active) != 0;
  return bit_set;
}
// True if the kBinaryActive bit is set in destatep->active_special.
static inline bool BinaryActive(DetectEncodingState* destatep) {
  const bool bit_set = (destatep->active_special & kBinaryActive) != 0;
  return bit_set;
}
// True if destatep->active_special has any bit of (kHzActive + kUTF7Active).
static inline bool UTF7OrHzActive(DetectEncodingState* destatep) {
  const bool either_set =
      (destatep->active_special & (kHzActive + kUTF7Active)) != 0;
  return either_set;
}
// True if the kEUCJPActive bit is set in destatep->active_special.
static inline bool EUCJPActive(DetectEncodingState* destatep) {
  const bool bit_set = (destatep->active_special & kEUCJPActive) != 0;
  return bit_set;
}
// True if destatep->active_special has any bit of the combined mask
// kIso2022Active + kBinaryActive + kUTF8Active + kUTF8UTF8Active +
// kUTF1632Active + kEUCJPActive.
static inline bool OtherActive(DetectEncodingState* destatep) {
  const bool any_set =
      (destatep->active_special & (kIso2022Active + kBinaryActive +
                                   kUTF8Active + kUTF8UTF8Active +
                                   kUTF1632Active + kEUCJPActive)) != 0;
  return any_set;
}
// True if the kCEDRescanning bit is set in flags.
static inline bool CEDFlagRescanning(CEDInternalFlags flags) {
  const bool rescanning = (flags & kCEDRescanning) != 0;
  return rescanning;
}
// True if the kCEDForceTags bit is set in flags.
static inline bool CEDFlagForceTags(CEDInternalFlags flags) {
  const bool force_tags = (flags & kCEDForceTags) != 0;
  return force_tags;
}
// Returns the larger of a and b (b when they are equal).
static inline int maxint(int a, int b) {
  if (a > b) {
    return a;
  }
  return b;
}
// Returns the smaller of a and b (b when they are equal).
static inline int minint(int a, int b) {
  if (a < b) {
    return a;
  }
  return b;
}
static inline const char* MyRankedEncName(int r_enc) {
return MyEncodingName(kMapToEncoding[r_enc]);
}
// Only for debugging. not thread safe
// State shared by the PsSource* debug-dump helpers below.
static const int kPsSourceWidth = 32;
// Lowest source offset not yet dumped; PsSource skips offsets below this.
static int pssourcenext = 0; // debug only. not threadsafe. dump only >= this
// Number of source bytes shown per dumped line; set by PsSourceInit.
static int pssourcewidth = 0; // debug only.
// Mark line printed above each source line: two characters per source byte
// plus 8 bytes of overscan. Allocated by PsSourceInit, freed by PsSourceFinish.
static char* pssource_mark_buffer = NULL;
// Count of source lines dumped so far; its low 4 bits index do_src_offset.
int next_do_src_line;
// Starting offsets of the 16 most recently dumped source lines.
int do_src_offset[16];
// Start a debug source dump showing len source bytes per line.
// Frees any previous mark buffer, allocates a blank one (two characters per
// source byte plus 8 NUL bytes of overscan) and clears the dumped-line history.
// Debug only; not thread safe.
void PsSourceInit(int len) {
  const int mark_chars = len * 2;  // 2 Ascii characters per input byte
  pssourcenext = 0;
  pssourcewidth = len;
  delete[] pssource_mark_buffer;
  pssource_mark_buffer = new char[mark_chars + 8];  // 8 = overscan
  memset(pssource_mark_buffer, ' ', mark_chars);
  memset(pssource_mark_buffer + mark_chars, '\0', 8);
  next_do_src_line = 0;
  memset(do_src_offset, 0, sizeof(do_src_offset));
}
// Finish a debug source dump: print the pending mark line (trailing blanks
// trimmed) to stderr and release the mark buffer.
// Does nothing if there is no mark buffer (PsSourceInit was never called, or
// PsSourceFinish has already run); without this check the trim below would
// read and write through a NULL pointer.
// Debug only; not thread safe.
void PsSourceFinish() {
  if (pssource_mark_buffer == NULL) {
    return;
  }
  // Print preceding mark buffer
  int j = (pssourcewidth * 2) - 1;
  while ((0 <= j) && (pssource_mark_buffer[j] == ' ')) {--j;}  // trim
  pssource_mark_buffer[j + 1] = '\0';
  fprintf(stderr, "( %s) do-src\n", pssource_mark_buffer);
  // No need to re-blank the buffer; it is released right here
  delete[] pssource_mark_buffer;
  pssource_mark_buffer = NULL;
}
// Dump aligned len bytes src... if not already dumped
void PsSource(const uint8* src, const uint8* isrc, const uint8* srclimit) {
int offset = src - isrc;
offset -= (offset % pssourcewidth); // round down to multiple of len bytes
if (offset < pssourcenext) {
return;
}
pssourcenext = offset + pssourcewidth; // Min offset for next dump
// Print preceding mark buffer
int j = (pssourcewidth * 2) - 1;
while ((0 <= j) && (pssource_mark_buffer[j] == ' ')) {--j;} // trim
pssource_mark_buffer[j + 1] = '\0';
fprintf(stderr, "( %s) do-src\n", pssource_mark_buffer);
memset(pssource_mark_buffer, ' ', pssourcewidth * 2);
memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8);
// Print source bytes
const uint8* src_aligned = isrc + offset;
int length = srclimit - src_aligned;
length = minint(pssourcewidth, length);
fprintf(stderr, "(%05x ", offset);
for (int i = 0; i < length; ++i) {
char c = src_aligned[i];
if (c == '\n') {c = ' ';}
if (c == '\r') {c = ' ';}
if (c == '\t') {c = ' ';}
if (c == '(') {
fprintf(stderr, "%s", "\\( ");
} else if (c == ')') {
fprintf(stderr, "%s", "\\) ");
} else if (c == '\\') {
fprintf(stderr, "%s", "\\\\ ");
} else if ((0x20 <= c) && (c <= 0x7e)) {
fprintf(stderr, "%c ", c);
} else {
fprintf(stderr, "%02x", c);
}
}
fprintf(stderr, ") do-src\n");
// Remember which source offsets are where, mod 16
do_src_offset[next_do_src_line & 0x0f] = offset;
++next_do_src_line;
}
// Mark bytes in just-previous source bytes
// Mark len bytes starting at src in the mark line for the just-dumped source.
//   src          first byte to mark
//   len          number of bytes to mark
//   isrc         start of the source buffer
//   weightshift  0 marks the following bytes with '-', anything else with 'x'
// The first byte is always marked "==". Each byte takes two mark columns.
// Debug only; not thread safe.
void PsMark(const uint8* src, int len, const uint8* isrc, int weightshift) {
  const int column = static_cast<int>(src - isrc) % pssourcewidth;  // mod len
  const char fill = (weightshift == 0) ? '-' : 'x';
  char* cell = pssource_mark_buffer + (column * 2);
  cell[0] = '=';
  cell[1] = '=';
  for (int i = 1; i < len; ++i) {
    cell[i * 2] = fill;
    cell[(i * 2) + 1] = fill;
  }
}
// Highlight trigram bytes in just-previous source bytes
// Unfortunately, we have to skip back N lines since source was printed for
// up to 8 bigrams before we get here. Match on src+1 to handle 0/31 better
// Emit a do-highlight command for the trigram at src.
// Searches the last 16 dumped source lines, most recent first, for the one
// whose start offset contains src+1, and prints how many lines back it is,
// the column of src within it, trigram_val, and n as the command suffix.
// Prints nothing if no recorded line matches.
// Debug only; not thread safe.
void PsHighlight(const uint8* src, const uint8* isrc, int trigram_val, int n) {
  const int position = static_cast<int>((src + 1) - isrc);
  const int column = position % pssourcewidth;   // mod len bytes
  const int line_start = position - column;      // round down to line start
  int lines_back = 1;
  while (lines_back <= 16) {
    if (do_src_offset[(next_do_src_line - lines_back) & 0x0f] == line_start) {
      fprintf(stderr, "%d %d %d do-highlight%d\n",
              lines_back, column - 1, trigram_val, n);
      return;
    }
    ++lines_back;
  }
}
// Put *destatep into its starting condition before a detection run.
// Clears pointers, flags, counters, mini state machines and hints, sets every
// special-detector bit active, fills rankedencoding_list with all non-Indic
// ranked encodings, and zeroes the probability and hint arrays.
// declared_enc_1/declared_enc_2 and the interesting_pairs/offsets/weightshifts
// arrays are deliberately left untouched here.
void InitDetectEncodingState(DetectEncodingState* destatep) {
  DetectEncodingState& st = *destatep;

  // Source pointers
  st.initial_src = NULL;  // Filled in by caller
  st.limit_src = NULL;
  st.prior_src = NULL;
  st.last_pair = NULL;

  // Debug detail recording
  st.debug_data = NULL;
  st.next_detail_entry = 0;

  // Overall status
  st.done = false;
  st.reliable = false;
  st.hints_derated = false;
  // st.declared_enc_1 init in ApplyHints
  // st.declared_enc_2 init in ApplyHints
  st.prune_count = 0;
  st.trigram_highwater_mark = 0;
  st.looking_for_latin_trigrams = false;
  st.do_latin_trigrams = false;

  // Miscellaneous state variables for difficult encodings
  st.binary_quadrants_count = 0;
  st.binary_8x4_count = 0;
  st.binary_quadrants_seen = 0;
  st.binary_8x4_seen = 0;
  st.utf7_starts = 0;
  st.prior_utf7_offset = 0;
  st.next_utf8_ministate = 0;
  for (int n = 0; n < 6; ++n) {
    st.utf8_minicount[n] = 0;
  }
  st.next_utf8utf8_ministate = 0;
  st.utf8utf8_odd_byte = 0;
  for (int n = 0; n < 6; ++n) {
    st.utf8utf8_minicount[n] = 0;
  }
  st.next_2022_state = SOSI_NONE;
  st.next_hz_state = SOSI_NONE;
  st.next_eucjp_oddphase = false;
  for (int n = 0; n < 8; ++n) {
    st.byte32_count[n] = 0;
  }

  // Every special detector starts out active
  st.active_special = 0xffffffff;

  // No hints seen yet
  st.tld_hint = UNKNOWN_ENCODING;
  st.http_hint = UNKNOWN_ENCODING;
  st.meta_hint = UNKNOWN_ENCODING;
  st.bom_hint = UNKNOWN_ENCODING;

  // ASCII [seven-bit] is the default for both top slots
  st.top_rankedencoding = 0;
  st.second_top_rankedencoding = 0;
  st.top_prob = -1;
  st.second_top_prob = -1;

  // This is wide for first pruning, shrinks for 2nd and later
  st.prune_difference = kInititalPruneDifference;

  st.next_prior_bigram = 0;
  for (int n = 0; n < 4; ++n) {
    st.prior_bigram[n] = -1;
  }
  st.prior_binary[0] = -1;

  // Initialize with all but Indic encodings, which we never detect
  int kept = 0;
  for (int ranked = 0; ranked < NUM_RANKEDENCODING; ++ranked) {
    if (IndicEncoding(kMapToEncoding[ranked])) {
      continue;
    }
    st.rankedencoding_list[kept++] = ranked;
  }
  st.rankedencoding_list_len = kept;

  // This is where all the action is
  memset(st.enc_prob, 0, sizeof(st.enc_prob));
  memset(st.hint_prob, 0, sizeof(st.hint_prob));
  memset(st.hint_weight, 0, sizeof(st.hint_weight));

  st.prior_interesting_pair[AsciiPair] = 0;
  st.prior_interesting_pair[OtherPair] = 0;
  st.next_interesting_pair[AsciiPair] = 0;
  st.next_interesting_pair[OtherPair] = 0;
  // interesting_pairs/offsets/weightshifts not initialized; no need
}
// Probability strings are uint8, with zeros removed via simple run-length:
// (<skip-take byte> <data bytes>)*
// skip-take:
// 00 end
// x0 skip 16*x locations, take 0 data values
// xy skip x locations, take y data values
// Multiply all the incoming values by 3 to account for 3x unigram sums
//
// {{0x77,0x69,0x6e,0x64,0x31,0x32,0x35,0x35,
// 0x01,0xc2,0x10,0x41,0xfe,0x71,0xba,0x00,}}, // "wind1255"
//
// Weight is 0..100 percent
//
// Returns subscript of largest (most probable) value
//
// {{0x6e,0x6c,0x5f,0x5f, 0x05,0xb2,0xae,0xa0,0x32,0xa1,0x36,0x31,0x42,0x39,0x3b,0x33,0x45,0x11,0x6f,0x00,}}, // "nl__"
// // ASCII-7-bit=178 Latin1=174 UTF8=160 GB=50 CP1252=161 BIG5=49 Latin2=66 CP1251=57 CP1256=59 CP1250=51 Latin5=69 ISO-8859-15=111 [top ASCII-7-bit]
int ApplyCompressedProb(const char* iprob, int len,
int weight, DetectEncodingState* destatep) {
int* dst = &destatep->enc_prob[0];
int* dst2 = &destatep->hint_weight[0];
const uint8* prob = reinterpret_cast<const uint8*>(iprob);
const uint8* problimit = prob + len;
int largest = -1;
int subscript_of_largest = 0;
// Continue with first byte and subsequent ones
while (prob < problimit) {
int skiptake = *prob++;
int skip = (skiptake & 0xf0) >> 4;
int take = skiptake & 0x0f;
if (skiptake == 00) {
break;
} else if (take == 0) {
dst += (skip << 4);
dst2 += (skip << 4);
} else {
dst += skip; // Normal case
dst2 += skip; // Normal case
for (int i = 0; i < take; i++) {
int enc = static_cast<int>(dst - &destatep->enc_prob[0]) + i;
if (largest < prob[i]) {
largest = prob[i];
subscript_of_largest = enc;
}
int increment = prob[i] * 3; // The actual increment
// Do maximum of previous hints plus this new one
if (weight > 0) {
increment = (increment * weight) / 100;
dst[i] = maxint(dst[i], increment);
dst2[i] = 1; // New total weight
}
}
prob += take;
dst += take;
dst2 += take;
}
}
return subscript_of_largest;
}
// Returns subscript of largest (most probable) value [for unit test]
int TopCompressedProb(const char* iprob, int len) {
const uint8* prob = reinterpret_cast<const uint8*>(iprob);
const uint8* problimit = prob + len;
int next_prob_sub = 0;
int topprob = 0;
int toprankenc = 0;
while (prob < problimit) {
int skiptake = *prob++;
int skip = (skiptake & 0xf0) >> 4;
int take = skiptake & 0x0f;
if (skiptake == 0) {
break;
} else if (take == 0) {
next_prob_sub += (skip << 4);
} else {
next_prob_sub += skip; // Normal case
for (int i = 0; i < take; i++) {
if (topprob < prob[i]) {
topprob = prob[i];
toprankenc = next_prob_sub + i;
}
}
prob += take;
next_prob_sub += take;
}
}
return toprankenc;
}
// Find subscript of matching key in first 8 bytes of sorted hint array, or -1
// Binary search of a sorted hint array on the first 8 bytes of each key.
//   hintprobs      array sorted by key
//   hintprobssize  number of entries in hintprobs
//   norm_key       normalized key; its first 8 bytes are compared
// Returns the subscript of a matching entry, or -1 if none matches.
int HintBinaryLookup8(const HintEntry* hintprobs, int hintprobssize,
                      const char* norm_key) {
  // Key, if present, is always in range [lo..hi)
  for (int lo = 0, hi = hintprobssize; lo < hi; ) {
    const int mid = (lo + hi) / 2;
    const int order = memcmp(&hintprobs[mid].key_prob[0], norm_key, 8);
    if (order == 0) {
      return mid;
    }
    if (order < 0) {
      lo = mid + 1;   // Entry sorts before key; look above
    } else {
      hi = mid;       // Entry sorts after key; look below
    }
  }
  return -1;
}
// Find subscript of matching key in first 4 bytes of sorted hint array, or -1
// Binary search of a sorted hint array on the first 4 bytes of each key.
//   hintprobs      array sorted by key
//   hintprobssize  number of entries in hintprobs
//   norm_key       normalized key; its first 4 bytes are compared
// Returns the subscript of a matching entry, or -1 if none matches.
int HintBinaryLookup4(const HintEntry* hintprobs, int hintprobssize,
                      const char* norm_key) {
  // Key, if present, is always in range [lo..hi)
  for (int lo = 0, hi = hintprobssize; lo < hi; ) {
    const int mid = (lo + hi) / 2;
    const int order = memcmp(&hintprobs[mid].key_prob[0], norm_key, 4);
    if (order == 0) {
      return mid;
    }
    if (order < 0) {
      lo = mid + 1;   // Entry sorts before key; look above
    } else {
      hi = mid;       // Entry sorts after key; look below
    }
  }
  return -1;
}
// Raise the probability of ranked encoding r_enc by boost.
static inline void Boost(DetectEncodingState* destatep, int r_enc, int boost) {
  int* const prob = &destatep->enc_prob[r_enc];
  *prob += boost;
}
// Lower the probability of ranked encoding r_enc by whack.
static inline void Whack(DetectEncodingState* destatep, int r_enc, int whack) {
  int* const prob = &destatep->enc_prob[r_enc];
  *prob -= whack;
}
// Apply initial probability hint based on top level domain name
// Weight is 0..100 percent
// Return 1 if name match found
int ApplyTldHint(const char* url_tld_hint, int weight,
DetectEncodingState* destatep) {
if (url_tld_hint[0] == '~') {
return 0;
}
string normalized_tld = MakeChar4(string(url_tld_hint));
int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize,
normalized_tld.c_str());
if (n >= 0) {
// TLD is four bytes, probability table is ~12 bytes
int best_sub = ApplyCompressedProb(&kTLDHintProbs[n].key_prob[kMaxTldKey],
kMaxTldVector, weight, destatep);
// Never boost ASCII7; do CP1252 instead
if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;}
destatep->declared_enc_1 = best_sub;
if (destatep->debug_data != NULL) {
// Show TLD hint
SetDetailsEncProb(destatep, 0, best_sub, url_tld_hint);
}
return 1;
}
return 0;
}
// Apply initial probability hint based on charset= name
// Weight is 0..100 percent
// Return 1 if name match found
int ApplyCharsetHint(const char* charset_hint, int weight,
DetectEncodingState* destatep) {
if (charset_hint[0] == '~') {
return 0;
}
string normalized_charset = MakeChar44(string(charset_hint));
int n = HintBinaryLookup8(kCharsetHintProbs, kCharsetHintProbsSize,
normalized_charset.c_str());
if (n >= 0) {
// Charset is eight bytes, probability table is ~eight bytes
int best_sub = ApplyCompressedProb(&kCharsetHintProbs[n].key_prob[kMaxCharsetKey],
kMaxCharsetVector, weight, destatep);
// Never boost ASCII7; do CP1252 instead
if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;}
destatep->declared_enc_1 = best_sub;
// If first explicitly declared charset is confusable with Latin1/1252, put
// both declared forms in declared_enc_*, displacing Latin1/1252.
// This avoids a bit of Latin1 creep.
// Also boost the declared encoding and its pair
// TODO: This should all be folded into postproc-enc-detect.cc
if ((destatep->http_hint == UNKNOWN_ENCODING) &&
(destatep->meta_hint == UNKNOWN_ENCODING)) {
// This is the first charset=hint
switch (best_sub) {
case F_Latin2: // 8859-2 Latin2, east euro
destatep->declared_enc_2 = F_CP1250;
Boost(destatep, F_Latin2, kGentleOnePair);
Boost(destatep, F_CP1250, kGentleOnePair);
break;
case F_CP1250:
destatep->declared_enc_2 = F_Latin2;
Boost(destatep, F_Latin2, kGentleOnePair);
Boost(destatep, F_CP1250, kGentleOnePair);
break;
case F_Latin3: // 8859-3 Latin3, south euro, Esperanto
destatep->declared_enc_2 = F_ASCII_7_bit;
Boost(destatep, F_Latin3, kGentleOnePair);
break;
case F_Latin4: // 8859-4 Latin4, north euro
destatep->declared_enc_2 = F_ASCII_7_bit;
Boost(destatep, F_Latin4, kGentleOnePair);
break;
case F_ISO_8859_5: // 8859-5 Cyrillic
destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost 1251
Boost(destatep, F_ISO_8859_5, kGentleOnePair); // (too different)
break;
case F_CP1251:
destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost -5
Boost(destatep, F_CP1251, kGentleOnePair); // (too different)
break;
case F_Arabic: // 8859-6 Arabic
destatep->declared_enc_2 = F_CP1256;
Boost(destatep, F_Arabic, kGentleOnePair);
Boost(destatep, F_CP1256, kGentleOnePair);
break;
case F_CP1256:
destatep->declared_enc_2 = F_Arabic;
Boost(destatep, F_Arabic, kGentleOnePair);
Boost(destatep, F_CP1256, kGentleOnePair);
break;
case F_Greek: // 8859-7 Greek
destatep->declared_enc_2 = F_CP1253;
Boost(destatep, F_Greek, kGentleOnePair);
Boost(destatep, F_CP1253, kGentleOnePair);
break;
case F_CP1253:
destatep->declared_enc_2 = F_Greek;
Boost(destatep, F_Greek, kGentleOnePair);
Boost(destatep, F_CP1253, kGentleOnePair);
break;
case F_Hebrew: // 8859-8 Hebrew
destatep->declared_enc_2 = F_CP1255;
Boost(destatep, F_Hebrew, kGentleOnePair);
Boost(destatep, F_CP1255, kGentleOnePair);
break;
case F_CP1255:
destatep->declared_enc_2 = F_Hebrew;
Boost(destatep, F_Hebrew, kGentleOnePair);
Boost(destatep, F_CP1255, kGentleOnePair);
break;
case F_Latin5: // 8859-9 Latin5, Turkish
destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost 1254
Boost(destatep, F_Latin5, kGentleOnePair); // (too different)
break;
case F_CP1254:
destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost Latin5
Boost(destatep, F_CP1254, kGentleOnePair); // (too different)
break;
case F_Latin6: // 8859-10 Latin6, Nordic
destatep->declared_enc_2 = F_ASCII_7_bit;
Boost(destatep, F_Latin6, kGentleOnePair);
break;
case F_ISO_8859_11: // 8859-11 Thai,
destatep->declared_enc_2 = F_CP874;
Boost(destatep, F_ISO_8859_11, kGentleOnePair);
Boost(destatep, F_CP874, kGentleOnePair);
break;
case F_CP874:
destatep->declared_enc_2 = F_ISO_8859_11;
Boost(destatep, F_ISO_8859_11, kGentleOnePair);
Boost(destatep, F_CP874, kGentleOnePair);
break;
case F_ISO_8859_13: // 8859-13 Latin7, Baltic
destatep->declared_enc_2 = F_CP1257;
Boost(destatep, F_ISO_8859_13, kGentleOnePair);
Boost(destatep, F_CP1257, kGentleOnePair);
break;
case F_CP1257:
destatep->declared_enc_2 = F_ISO_8859_13;
Boost(destatep, F_ISO_8859_13, kGentleOnePair);
Boost(destatep, F_CP1257, kGentleOnePair);
break;
case F_ISO_8859_15: // 8859-15 Latin9, Latin0, Euro-ized Latin1
destatep->declared_enc_2 = F_ASCII_7_bit;
Boost(destatep, F_ISO_8859_15, kGentleOnePair);
break;
// Greek all-caps is confusable with KOI8x all-lower and Hebrew.
// This turns some Greek documents into Cyrillic, etc. by mistake.
// Greek and Hebrew are boosted explicitly above; do KOI8x here.
// Boosting the declared encodingmakes it harder for the wrong one to
// creep up.
case F_KOI8R:
Boost(destatep, F_KOI8R, kGentleOnePair);
break;
case F_KOI8U:
Boost(destatep, F_KOI8U, kGentleOnePair);
break;
default:
break;
}
}
if (destatep->debug_data != NULL) {
// Show charset hint
SetDetailsEncProb(destatep, 0, best_sub, charset_hint);
}
//
// Some fix-ups for the declared encodings
//
// If non-UTF8, non-Latin1/1252 encoding declared, disable UTF8 combos
// TODO: This should all be folded into postproc-enc-detect.cc
if ((best_sub != F_UTF8) &&
(best_sub != F_Latin1) &&
(best_sub != F_CP1252)) {
Whack(destatep, F_UTF8UTF8, kBadPairWhack * 4); // demote
}
// Latin2 and CP1250 differ in the overlap part, such as B1 or B9
// The initial probabilites for charset=Latin2 explicitly put CP1250
// down twice as far as normal, and vice versa. This is done in
// postproc-enc-detect.cc
// If charset=user-defined, treat as Binary --
// we can safely only do low ASCII, might be Indic
if (normalized_charset.substr(0,4) == "user") {
Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
}
return 1;
}
return 0;
}
// Apply initial probability hint based on caller-supplied encoding
// Negative hint whacks ~encoding, non-negative boosts encoding
//
// Negative hints are an experiment to see if they might be useful.
// Not operator used instead of unary minus to allow specifying not-zero
// Apply the initial probability hint for a caller-supplied encoding.
//   encoding_hint  non-negative: boost that encoding;
//                  negative: whack the encoding ~encoding_hint
//   weight         percent of kBoostOnePair to apply (100% = 1 bigram)
//   destatep       state to update
// Returns 1 if the hint was applied, 0 if the encoding has no ranked-encoding
// entry. In that case nothing is changed: BackmapEncodingToRankedEncoding
// returns -1, and indexing enc_prob with it would write out of bounds.
int ApplyEncodingHint(const int encoding_hint, int weight,
                      DetectEncodingState* destatep) {
  Encoding enc_hint = static_cast<Encoding>((encoding_hint < 0) ?
                                            ~encoding_hint : encoding_hint);
  // Map to the right internal subscript
  int rankedenc_hint = CompactEncDet::BackmapEncodingToRankedEncoding(enc_hint);
  if (rankedenc_hint < 0) {
    // Unknown to the detector; there is no probability slot to adjust
    return 0;
  }

  // I'm not sure how strong this hint should be. Weight 100% = 1 bigram
  int increment = (kBoostOnePair * weight) / 100;
  if (encoding_hint < 0) {
    destatep->enc_prob[rankedenc_hint] -= increment;
  } else {
    destatep->enc_prob[rankedenc_hint] += increment;
  }

  if (destatep->debug_data != NULL) {
    // Show encoding hint
    SetDetailsEncProb(destatep, 0, -1, MyEncodingName(enc_hint));
  }
  return 1;
}
// Apply initial probability hint based on user interface language
// Weight is 0..100 percent
// Return 1 if name match found
int ApplyUILanguageHint(const Language language_hint,
int weight, DetectEncodingState* destatep) {
if (language_hint == UNKNOWN_LANGUAGE) {
return 0;
}
string normalized_lang = MakeChar8(LanguageName(language_hint));
int n = HintBinaryLookup8(kLangHintProbs, kLangHintProbsSize,
normalized_lang.c_str());
if (n >= 0) {
// Language is eight bytes, probability table is ~eight bytes
int best_sub = ApplyCompressedProb(&kLangHintProbs[n].key_prob[kMaxLangKey],
kMaxLangVector, weight, destatep);
// Never boost ASCII7; do CP1252 instead
if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;}
destatep->declared_enc_1 = best_sub;
if (destatep->debug_data != NULL) {
// Show language hint
SetDetailsEncProb(destatep, 0, best_sub, normalized_lang.c_str());
}
return 1;
}
return 0;
}
// Apply initial probability hint based on corpus type (web, email, etc)
// Return 1 if name match found
// Set every encoding's starting probability from the default table, with a
// small adjustment for the corpus type (web, email, etc).
//   corpus_type  kind of text being scanned
//   destatep     state whose enc_prob array is overwritten
// Always returns 1.
int ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type,
                     DetectEncodingState* destatep) {
  int* const probs = &destatep->enc_prob[0];

  // Default probability for each encoding, except that 2022 seven-bit
  // encodings are deliberately set to zero so we can look for actual use
  // TODO: This should all be folded into postproc-enc-detect.cc
  for (int ranked = 0; ranked < NUM_RANKEDENCODING; ++ranked) {
    probs[ranked] = SevenBitEncoding(kMapToEncoding[ranked])
                        ? 0
                        : (kDefaultProb[ranked] * 3);
  }

  // A little corpus distinction; query, email and all other corpus types
  // get no adjustment
  const bool web_or_xml = (corpus_type == CompactEncDet::WEB_CORPUS) ||
                          (corpus_type == CompactEncDet::XML_CORPUS);
  if (web_or_xml) {
    // Allow double-converted UTF-8 to start nearly equal to normal UTF-8
    probs[F_UTF8UTF8] = probs[F_UTF8] - kSmallInitDiff;
  }

  if (FLAGS_demo_nodefault) {
    // Demo, make initial probs all zero
    for (int ranked = 0; ranked < NUM_RANKEDENCODING; ++ranked) {
      probs[ranked] = 0;
    }
  }

  if (destatep->debug_data != NULL) {
    // Show default hint
    SetDetailsEncProb(destatep, 0, -1, "Default");
  }
  return 1;
}
// Do reverse search for c in [str..str+len)
// Note: initial pointer is to FRONT of string, not back
// Returns a pointer to the last occurrence of c, or NULL if c does not occur
// (always NULL when len is 0).
const char* MyMemrchr(const char* str, char c, size_t len) {
  // Count an index down rather than walking a pointer, so that no pointer
  // before the start of the buffer is ever formed (that is undefined behavior).
  for (size_t i = len; i > 0; --i) {
    if (str[i - 1] == c) {return str + (i - 1);}
  }
  return NULL;
}

// Minimum real URL is 11 bytes: "http://a.bc" -- shorter is assumed to be TLD
// Now that we are no longer trying to do Indic font-based encodings, we
// don't need the full URL and can go back to simple TLD. This test remains for
// backwards compatibility with any caller using full URL.
static const int kMinURLLength = 11;

// Extract TLD from a full URL or just a TLD
// Return hostname and length if a full URL
//
//   url_hint        NUL-terminated full URL or bare TLD; may be NULL or empty
//   tld_hint        output buffer; always left NUL-terminated
//   tld_hint_len    size of tld_hint in bytes; must be at least 1
//   ret_host_start  set to the start of the hostname inside url_hint when a
//                   full URL is recognized, otherwise NULL
//   ret_host_len    set to the hostname length (port excluded), otherwise 0
//
// tld_hint receives:
//   "~"             if url_hint is NULL or empty, if it is kMinURLLength bytes
//                   or longer but is not recognized as a URL, or if it is a
//                   URL whose hostname contains no '.'
//   the text after the last '.' of the hostname, for a recognized URL
//   url_hint itself (truncated to fit), if shorter than kMinURLLength
void ExtractTLD(const char* url_hint, char* tld_hint, int tld_hint_len,
                const char** ret_host_start, int* ret_host_len) {
  // Start from the "no TLD" answer; it is overwritten below on success
  strncpy(tld_hint, "~", tld_hint_len);
  tld_hint[tld_hint_len - 1] = '\0';
  *ret_host_start = NULL;
  *ret_host_len = 0;

  int url_len = (url_hint != NULL) ? static_cast<int>(strlen(url_hint)) : 0;
  if (url_len == 0) {
    // Empty TLD
    return;
  }

  // Minimum real URL is 11 bytes: "http://a.bc" -- shorter is assumed to be TLD
  if (kMinURLLength <= url_len) {
    // See if it really is a URL
    const char* first_slash = strchr(url_hint, '/');
    // Use the file's own MyMemrchr rather than memrchr(), which is a GNU
    // extension and is not available on every platform.
    if ((first_slash != NULL) && (first_slash != url_hint) &&
        (first_slash[-1] == ':') && (first_slash[1] == '/') &&
        (MyMemrchr(url_hint, '.', first_slash - url_hint) == NULL)) {
      // We found :// and no dot in front of it, so declare a real URL
      const char* hostname_start = first_slash + 2;
      const char* hostname_end = strchr(hostname_start, '/');
      if (hostname_end == NULL) {
        // No slash; end is first byte off end of the URL string
        hostname_end = url_hint + url_len;
      }
      size_t hostname_len = hostname_end - hostname_start;
      const char* port_start =
          static_cast<const char*>(memchr(hostname_start, ':', hostname_len));
      if (port_start != NULL) {
        // Port; shorten hostname
        hostname_end = port_start;
        hostname_len = hostname_end - hostname_start;
      }

      const char* tld_start = MyMemrchr(hostname_start, '.', hostname_len);
      if (tld_start != NULL) {
        // Remember the TLD we just found, truncated to fit the buffer
        int tld_len =
            static_cast<int>(hostname_start + hostname_len - tld_start - 1);
        if (tld_len > (tld_hint_len - 1)) {
          tld_len = tld_hint_len - 1;
        }
        memcpy(tld_hint, tld_start + 1, tld_len);
        tld_hint[tld_len] = '\0';
      }
      *ret_host_start = hostname_start;
      *ret_host_len = static_cast<int>(hostname_len);
      return;
    }
    // Long enough to be a URL but not shaped like one: tld_hint stays "~"
  } else {
    // Too short to be a URL; treat the whole hint as the TLD
    strncpy(tld_hint, url_hint, tld_hint_len);
    tld_hint[tld_hint_len - 1] = '\0';
  }
}
// Apply hints, if any, to probabilities
// NOTE: Encoding probabilities are all zero at this point
//
// url_hint          Full URL or bare top-level domain; may be NULL. The TLD
//                   is extracted with ExtractTLD into a 16-byte local buffer.
// http_charset_hint,
// meta_charset_hint Charset names, each applied at weight 100. A hint is
//                   ignored when it is NULL or its first character is '~'.
// encoding_hint     Applied at weight 50 unless it is UNKNOWN_ENCODING.
// language_hint     Applied at weight 50 unless it is UNKNOWN_LANGUAGE.
// corpus_type       Only passed to ApplyDefaultHint, which runs when no
//                   other hint has been counted.
// destatep          Detection state, updated in place. On return, hint_prob
//                   holds a copy of the resulting enc_prob vector.
void ApplyHints(const char* url_hint,
const char* http_charset_hint,
const char* meta_charset_hint,
const int encoding_hint,
const Language language_hint,
const CompactEncDet::TextCorpusType corpus_type,
DetectEncodingState* destatep) {
// Sum of the values returned by the ApplyXxx routines called below
int hint_count = 0;
// url_hint can either be a full URL (preferred) or just top-level domain name
// Extract the TLD from a full URL and use it for
// a normal TLD hint
char tld_hint[16];
const char* hostname_start = NULL;
int hostname_len = 0;
ExtractTLD(url_hint, tld_hint, sizeof(tld_hint),
&hostname_start, &hostname_len);
// Initial hints give slight boost to Ascii-7-bit and code page 1252
// ApplyXxx routines copy enc_1 to enc_2 then update declared_enc_1
// This gives a boost to 1252 if one of HTTP/META is specified,
// but this could be the wrong thing to do if Latin2/3/4/etc. is specified
destatep->declared_enc_1 = F_CP1252;
destatep->declared_enc_2 = F_ASCII_7_bit;
// Applying various hints takes max of new hint and any old hint.
// This does better on multiple hints than a weighted average
// Weight is 0..100 percent
// HTTP charset hint, weight 100%
if ((http_charset_hint != NULL) && (http_charset_hint[0] != '~')) {
destatep->declared_enc_2 = destatep->declared_enc_1;
hint_count += ApplyCharsetHint(http_charset_hint, 100, destatep);
destatep->http_hint = kMapToEncoding[destatep->declared_enc_1];
if ((destatep->declared_enc_1 == F_CP1252) ||
(destatep->declared_enc_1 == F_Latin1)) {
destatep->looking_for_latin_trigrams = true;
}
}
// META charset hint, weight 100%
if ((meta_charset_hint != NULL) && (meta_charset_hint[0] != '~')) {
destatep->declared_enc_2 = destatep->declared_enc_1;
hint_count += ApplyCharsetHint(meta_charset_hint, 100, destatep);
destatep->meta_hint = kMapToEncoding[destatep->declared_enc_1];
if ((destatep->declared_enc_1 == F_CP1252) ||
(destatep->declared_enc_1 == F_Latin1)) {
destatep->looking_for_latin_trigrams = true;
}
}
// Explicit encoding hint, weight 50%
if (encoding_hint != UNKNOWN_ENCODING) {
destatep->declared_enc_2 = destatep->declared_enc_1;
hint_count += ApplyEncodingHint(encoding_hint, 50, destatep);
}
// User-interface language hint, weight 50%
if (language_hint != UNKNOWN_LANGUAGE) {
destatep->declared_enc_2 = destatep->declared_enc_1;
hint_count += ApplyUILanguageHint(language_hint, 50, destatep);
}
// Use top level domain if not .com and <=1 other hint was available
if (url_hint != NULL) {
// The TLD's top encoding is recorded whenever a URL hint is given, even if
// the TLD probabilities themselves are not applied below
destatep->tld_hint = CompactEncDet::TopEncodingOfTLDHint(tld_hint);
if (hint_count == 0) {
// No other hint has been counted
// Apply with weight 100%
destatep->declared_enc_2 = destatep->declared_enc_1;
hint_count += ApplyTldHint(tld_hint, 100, destatep);
if ((destatep->declared_enc_1 == F_CP1252) ||
(destatep->declared_enc_1 == F_Latin1)) {
destatep->looking_for_latin_trigrams = true;
}
if (strcmp("hu", tld_hint) == 0) {
// Hungarian is particularly difficult to separate Latin2 from Latin1,
// so always look for trigram scanning if bare TLD=hu hint
destatep->looking_for_latin_trigrams = true;
}
// Treat .com as no TLD hint at all
} else if ((hint_count == 1) && (strcmp("com", tld_hint) != 0)) {
// Either shift weighting or consider doing no TLD here -- seems to
// distract from correct charset= hints. Or perhaps apply only if
// charset = Latin1/1252...
// Apply with weight 50%
destatep->declared_enc_2 = destatep->declared_enc_1;
hint_count += ApplyTldHint(tld_hint, 50, destatep);
if ((destatep->declared_enc_1 == F_CP1252) ||
(destatep->declared_enc_1 == F_Latin1)) {
destatep->looking_for_latin_trigrams = true; // These need trigrams
}
}
// Else ignore TLD hint entirely
}
// Use all-web default distribution if not even a TLD hint
if (hint_count == 0) {
destatep->looking_for_latin_trigrams = true; // Default needs trigrams
destatep->declared_enc_2 = destatep->declared_enc_1;
hint_count += ApplyDefaultHint(corpus_type, destatep);
}
// ISO-Microsoft Pairs
// F_Latin1, F_CP1252,
// F_Latin2, F_CP1250, NOT really strict subset/superset pairs
// F_Latin3,
// F_Latin4,
// F_ISO_8859_5, F_CP1251,
// F_Arabic, F_CP1256, NOT
// F_Greek, F_CP1253, NOT really pairs
// (or upgrade incvt to make Greek use CP)
// F_Hebrew, F_CP1255, NOT really pairs
// F_Latin5, F_CP1254,
// F_Latin6,
// F_ISO_8859_11,
// F_ISO_8859_13, F_CP1257,
// F_ISO_8859_15,
// ISO-Microsoft Pairs
// Get important families started together
// This should fall out of the initialization vectors for charset,
// but we need to get rid of families altogether
//
// TODO make this more graceful
// Add small bias for subsets
// Subtract small bias for supersets
// Each superset below starts kSmallInitDiff under its base encoding
destatep->enc_prob[F_CP932] = destatep->enc_prob[F_SJS] - kSmallInitDiff;
destatep->enc_prob[F_GBK] = destatep->enc_prob[F_GB] - kSmallInitDiff;
destatep->enc_prob[F_GB18030] = destatep->enc_prob[F_GB] - kSmallInitDiff;
destatep->enc_prob[F_BIG5_CP950] = destatep->enc_prob[F_BIG5] -
kSmallInitDiff;
destatep->enc_prob[F_BIG5_HKSCS] = destatep->enc_prob[F_BIG5] -
kSmallInitDiff;
// Deliberate over-bias Ascii7 and underbias Binary [unneeded]
// destatep->enc_prob[F_ASCII_7_bit] = destatep->enc_prob[F_ASCII_7_bit] + kSmallInitDiff;
// destatep->enc_prob[F_BINARY] = destatep->enc_prob[F_BINARY] - (kBoostInitial / 2);
if (destatep->debug_data != NULL) {
// Show state at end of hints
SetDetailsEncProb(destatep, 0, -1, "Endhints");
if(FLAGS_enc_detect_detail2) {
// Add a line showing the watched encoding(s)
if (watch1_rankedenc >= 0) {
SetDetailsEncProb(destatep, 0,
watch1_rankedenc, FLAGS_enc_detect_watch1);
}
if (watch2_rankedenc >= 0) {
SetDetailsEncProb(destatep, 0,
watch2_rankedenc, FLAGS_enc_detect_watch2);
}
} // End detail2
}
// If duplicate hints, set second one to ASCII_7BIT to prevent double-boost
if (destatep->declared_enc_1 == destatep->declared_enc_2) {
destatep->declared_enc_2 = F_ASCII_7_bit;
}
// FLAGS_force127 turns on do_latin_trigrams regardless of the hints
if (FLAGS_force127) {
destatep->do_latin_trigrams = true;
if (FLAGS_enc_detect_source) {
PsHighlight(0, destatep->initial_src, 0, 2);
}
}
// With FLAGS_counts set, count how often each trigram flag ends up set
if (FLAGS_counts && destatep->looking_for_latin_trigrams) {++looking_used;}
if (FLAGS_counts && destatep->do_latin_trigrams) {++doing_used;}
//
// At this point, destatep->enc_prob[] is an initial probability vector based
// on the given hints/default. In general, it spreads out least-likely
// encodings to be about 2**-25 below the most-likely encoding.
// For input text with lots of bigrams, an unlikely encoding can rise to
// the top at a rate of about 2**6 per bigram, and more commonly 2**2 per
// bigram. So more than 4 bigrams and commonly more than 12 are
// needed to overcome the initial hints when the least-likely encoding
// is in fact the correct answer. So if the entire text has very few bigrams
// (as a two-word query might), it can be impossible for the correct
// encoding to win.
//
// To compensate for this, we take the initial hint vector and effectively
// apply it at the rate of 1/16 every bigram for the first 16 bigrams. The
// actual mechanism is done just before the last prune.
//
// Remember Initial hint probabilities
memcpy(destatep->hint_prob, destatep->enc_prob, sizeof(destatep->enc_prob));
}
// Look for specific high-value patterns in the first 4 bytes
// Byte order marks (BOM)
//   EFBBBF    UTF-8
//   FEFF      UTF-16 BE
//   FFFE      UTF-16 LE
//   0000FEFF  UTF-32 BE
//   FFFE0000  UTF-32 LE
//
// Likely UTF-x of seven-bit ASCII
//   00xx      UTF-16 BE  xx printable ASCII
//   xx00      UTF-16 LE
//   000000xx  UTF-32 BE
//   xx000000  UTF-32 LE
//
// Known binary file signatures boost F_BINARY instead.
//
// src must point to at least text_length readable bytes. When text_length is
// less than 4 the function returns at once, so none of the boosts or whacks
// below (including the final UTF-16/32 whacks) are applied.
void InitialBytesBoost(const uint8* src,
                       int text_length,
                       DetectEncodingState* destatep) {
  if (text_length < 4) {return;}

  // First four bytes as big-endian 16-bit pairs and one 32-bit quad
  uint32 pair01 = (src[0] << 8) | src[1];
  uint32 pair23 = (src[2] << 8) | src[3];
  uint32 quad0123 = (pair01 << 16) | pair23;

  bool utf_16_indication = false;
  bool utf_32_indication = false;
  // Ranked encoding reported in the debug line; -1 if none was picked
  int best_enc = -1;

  // Byte order marks
  // UTF-8
  if ((quad0123 & 0xffffff00) == 0xEFBBBF00) {
    destatep->bom_hint = UTF8;
    Boost(destatep, F_UTF8, kBoostInitial * 2);
    Boost(destatep, F_UTF8UTF8, kBoostInitial * 2);
    best_enc = F_UTF8;
  // UTF-32 (test before UTF-16)
  } else if (quad0123 == 0x0000FEFF) {
    destatep->bom_hint = UTF32BE;
    Boost(destatep, F_UTF_32BE, kBoostInitial * 2);
    best_enc = F_UTF_32BE;
  } else if (quad0123 == 0xFFFE0000) {
    destatep->bom_hint = UTF32LE;
    Boost(destatep, F_UTF_32LE, kBoostInitial * 2);
    best_enc = F_UTF_32LE;
  // UTF-16
  } else if (pair01 == 0xFEFF) {
    destatep->bom_hint = UTF16BE;
    Boost(destatep, F_UTF_16BE, kBoostInitial * 3);
    best_enc = F_UTF_16BE;
  } else if (pair01 == 0xFFFE) {
    destatep->bom_hint = UTF16LE;
    Boost(destatep, F_UTF_16LE, kBoostInitial * 3);
    best_enc = F_UTF_16LE;

  // Possible seven-bit ASCII encoded as UTF-16/32
  // UTF-32 (test before UTF-16)
  } else if (((quad0123 & 0xffffff00) == 0) &&
             (kIsPrintableAscii[src[3]] != 0)) {
    Boost(destatep, F_UTF_32BE, kBoostInitial);
    Whack(destatep, F_UTF_32LE, kBadPairWhack);           // Illegal char
    best_enc = F_UTF_32BE;
  } else if (((quad0123 & 0x00ffffff) == 0) &&
             (kIsPrintableAscii[src[0]] != 0)) {
    Boost(destatep, F_UTF_32LE, kBoostInitial);
    Whack(destatep, F_UTF_32BE, kBadPairWhack);           // Illegal char
    best_enc = F_UTF_32LE;
  } else if ((src[0] == 0x00) && (kIsPrintableAscii[src[1]] != 0)) {
    Boost(destatep, F_UTF_16BE, kBoostInitial);
    best_enc = F_UTF_16BE;
  } else if ((src[1] == 0x00) && (kIsPrintableAscii[src[0]] != 0)) {
    Boost(destatep, F_UTF_16LE, kBoostInitial);
    best_enc = F_UTF_16LE;

  // Whack if 0000 or FFFF
  // UTF-32 (test before UTF-16)
  } else if (quad0123 == 0x00000000) {
    Whack(destatep, F_UTF_32BE, kBadPairWhack);           // Illegal char
    Whack(destatep, F_UTF_32LE, kBadPairWhack);
    Whack(destatep, F_UTF_16BE, kBadPairWhack);
    Whack(destatep, F_UTF_16LE, kBadPairWhack);
    best_enc = -1;
  } else if (quad0123 == 0xffffffff) {
    Whack(destatep, F_UTF_32BE, kBadPairWhack);           // Illegal char
    Whack(destatep, F_UTF_32LE, kBadPairWhack);
    Whack(destatep, F_UTF_16BE, kBadPairWhack);
    Whack(destatep, F_UTF_16LE, kBadPairWhack);
    best_enc = -1;
  } else if (pair01 == 0x0000) {
    Whack(destatep, F_UTF_16BE, kBadPairWhack);           // Illegal char
    Whack(destatep, F_UTF_16LE, kBadPairWhack);
    best_enc = -1;
  } else if (pair01 == 0xffff) {
    Whack(destatep, F_UTF_16BE, kBadPairWhack);           // Illegal char
    Whack(destatep, F_UTF_16LE, kBadPairWhack);
    best_enc = -1;

  // These are the first four bytes of some known binary file formats
  // Boost BINARY bigtime if JPEG FFD8FFxx
  // Boost BINARY bigtime if png  89504E47 (.PNG)
  // Boost BINARY bigtime if gif  47494638 (GIF8)
  // Boost BINARY bigtime if zip  504B0304 (PK..)
  // Boost BINARY bigtime if gzip 1F8B08xx
  // Boost BINARY bigtime if zlib 78DAxxxx
  // Boost BINARY if PDF 25504446 (%PDF)
  // Boost BINARY if SWF (FWSx or CWSx where x <= 0x1f)
  } else if ((quad0123 & 0xffffff00) == 0xFFD8FF00) {     // JPEG FFD8FFxx
    Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  } else if (quad0123 == 0x89504E47) {                    // Hex 89 P N G
    Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  } else if (quad0123 == 0x47494638) {                    // Hex GIF8
    Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  } else if (quad0123 == 0x504B0304) {                    // Hex P K 03 04
    Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  } else if ((quad0123 & 0xffffff00) == 0x1F8B0800) {     // gzip 1F8B08xx
    Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  } else if (pair01 == 0x78DA) {                          // zlib 78DAxxxx
    Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  } else if (quad0123 == 0x25504446) {                    // Hex %PDF
    Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  // SWF signatures are the ASCII bytes 'F' 'W' 'S' (46 57 53) or
  // 'C' 'W' 'S' (43 57 53) followed by a version byte; the 0xffffffe0 mask
  // accepts version bytes 0x00..0x1f.
  } else if ((quad0123 & 0xffffffe0) == 0x46575300) {     // Hex FWSx
    Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  } else if ((quad0123 & 0xffffffe0) == 0x43575300) {     // Hex CWSx
    Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

  // More binary detect prefixes
  //  7F E L F    Executable and linking format
  //  M M 00 *    TIFF (big-endian)
  //  I I * 00    TIFF (little-endian)
  //  * 00 M M    (legacy pattern, kept for compatibility)
  //  01 f c p    Final cut pro
  } else if (quad0123 == 0x7F454C46) {                    // Hex 7F E L F
    Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  } else if (quad0123 == 0x4D4D002A) {                    // Hex M M 00 *
    Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  } else if (quad0123 == 0x49492A00) {                    // Hex I I * 00
    Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  } else if (quad0123 == 0x2A004D4D) {                    // Hex * 00 M M
    Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  } else if (quad0123 == 0x01666370) {                    // Hex 01 f c p
    Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

  // More binary detect prefixes; all-ASCII names; heavy weight to avoid ASCII
  // prefix overcoming binary
  //  C C S D     USGS ISIS 3-D cube files
  //  S I M P     FITS image header    "SIMPLE "
  } else if (quad0123 == 0x43435344) {                    // Hex C C S D
    Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  } else if (quad0123 == 0x53494D50) {                    // Hex S I M P
    Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);

  // More binary detect prefixes; all-ASCII names; lighter weight
  //  H W P       Hangul word processor
  //  8 B P S     Photoshop
  //  P D S _ xx  "PDS_VERSION_ID "
  } else if (quad0123 == 0x48575020) {                    // Hex H W P
    // Strong boost only when the full document header is present
    if ((19 <= text_length) &&
        (memcmp(src, "HWP.Document.File.V", 19) == 0)) {
      Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
    } else if ((19 <= text_length) &&
               (memcmp(src, "HWP Document File V", 19) == 0)) {
      Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
    } else {
      Boost(destatep, F_BINARY, kBoostInitial * kWeakerBinary);
    }
  } else if (quad0123 == 0x38425053) {                    // Hex 8 B P S
    Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
  } else if (quad0123 == 0x5044535F) {                    // Hex P D S _
    if ((14 <= text_length) && (memcmp(src, "PDS_VERSION_ID", 14) == 0)) {
      Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
    } else {
      Boost(destatep, F_BINARY, kBoostInitial * kWeakerBinary);
    }
  }

  // There are several main Windows EXE file formats.
  // Not examined here (prefix too short; never see them in Google pipeline)
  //  M Z         DOS .exe  Mark Zbikowski
  //  N E         DOS 4.0 16-bit
  //  L E         OS/2 VxD drivers
  //  L X         OS/2
  //  P E         Windows NT

  // More user-defined
  // http://www.freenet.am/armscii/ Armenian

  // If any hints or BOM, etc. keep UTF 16/32 around
  if ((destatep->enc_prob[F_UTF_16BE] > 0) ||
      (destatep->enc_prob[F_UTF_16LE] > 0)) {
    utf_16_indication = true;
  }
  if ((destatep->enc_prob[F_UTF_32BE] > 0) ||
      (destatep->enc_prob[F_UTF_32LE] > 0)) {
    utf_32_indication = true;
  }

  // Kill UTF16/32 right now if no positive indication of them
  // Otherwise, they tend to rise to the top in 7-bit files with an
  // occasional 0x02 byte in some comment or javascript
  if (!utf_16_indication) {
    Whack(destatep, F_UTF_16BE, kBadPairWhack * 8);
    Whack(destatep, F_UTF_16LE, kBadPairWhack * 8);
    Whack(destatep, F_Unicode, kBadPairWhack * 8);
  }
  if (!utf_32_indication) {
    Whack(destatep, F_UTF_32BE, kBadPairWhack * 8);
    Whack(destatep, F_UTF_32LE, kBadPairWhack * 8);
  }

  // Usually kill mixed encodings
  if (!FLAGS_ced_allow_utf8utf8) {
    Whack(destatep, F_UTF8UTF8, kBadPairWhack * 8);
  }
  // 2011.11.07 never use UTF8CP1252 -- answer will be UTF8 instead
  Whack(destatep, F_UTF8CP1252, kBadPairWhack * 8);

  if (destatep->debug_data != NULL) {
    // Show first four bytes of the input
    char buff[16];
    snprintf(buff, sizeof(buff), "%04x%04x", pair01, pair23);
    SetDetailsEncProb(destatep, 0, best_enc, buff);
  }
}
// qsort-style comparator over ints that yields descending order:
// returns 1 when *v1 < *v2, -1 when *v1 > *v2, and 0 when they are equal.
int IntCompare(const void* v1, const void* v2) {
  const int lhs = *static_cast<const int*>(v1);
  const int rhs = *static_cast<const int*>(v2);
  // Each comparison is 0 or 1, so the difference is exactly -1, 0 or 1
  return (lhs < rhs) - (lhs > rhs);
}
// True if c is in the base64 alphabet: A-Z, a-z, 0-9, '+' or '/'.
bool Base64Char(uint8 c) {
  const bool is_upper = ('A' <= c) && (c <= 'Z');
  const bool is_lower = ('a' <= c) && (c <= 'z');
  const bool is_digit = ('0' <= c) && (c <= '9');
  return is_upper || is_lower || is_digit || (c == '+') || (c == '/');
}
int Base64ScanLen(const uint8* start, const uint8* limit) {
// We have a plausible beginning; scan entire base64 string
const uint8* ib64str = start;
const uint8* b64str = ib64str;
const uint8* b64strlimit = limit;
// if starts with + +++, assume it is drawing, so bogus
if (((limit - start) > 3) && (start[0] == '+') &&
(start[1] == '+') && (start[2] == '+')) {
return 81;
}
// Scan over base64
while ((b64str < b64strlimit) && (kBase64Value[*b64str++] >= 0)) {
}
b64str--; // We overshot by 1
return b64str - ib64str;
}
// Input is at least 8-character legal base64 string after +.
// But might be say + "Presse+Termine"
//
// Returns true if the base64 bytes in [start, limit) have a character mix
// and trailing padding bits that are plausible for UTF-7, false otherwise.
bool GoodUnicodeFromBase64(const uint8* start, const uint8* limit) {
  // Reject base64 string len N if density of '+' is > 1 + N/16 (expect 1/64)
  // Reject base64 string len N if density of A-Z is < 1 + N/16 (expect 26/64)
  // Reject base64 string len N if density of a-z is < 1 + N/16 (expect 26/64)
  // Reject base64 string len N if density of 0-9 is < 1 + N/32 (expect 10/64)
  // NOTE: this requires at least one lower AND one upper AND one digit to pass
  //
  int plus_count = 0;
  int lower_count = 0;
  int upper_count = 0;
  int digit_count = 0;
  int len = static_cast<int>(limit - start);
  for (const uint8* src = start; src < limit; ++src) {
    uint8 c = *src;
    if (('a' <= c) && (c <= 'z')) {
      ++lower_count;
    } else if (('A' <= c) && (c <= 'Z')) {
      ++upper_count;
    } else if (('0' <= c) && (c <= '9')) {
      // Count every digit 0-9, not just '0'
      ++digit_count;
    } else if (*src == '+') {
      ++plus_count;
    }
  }

  if (plus_count > (1 + (len >> 4))) {return false;}
  if (lower_count < (1 + (len >> 4))) {return false;}
  if (upper_count < (1 + (len >> 4))) {return false;}
  if (digit_count < (1 + (len >> 5))) {return false;}
  // An empty range never gets here: its counts are zero, so the checks above
  // have already rejected it and start[len - 1] below is always in range.

  // checking the last character to reduce false positive
  // since the last character may be padded to 0 bits at the end.
  // refer to http://en.wikipedia.org/wiki/UTF-7
  int nmod8 = len & 7;
  const uint8 last = *(start + len - 1);
  // When UTF-7 string length%8=3, the last two bits must be padded as 0
  if ((nmod8 == 3) && (kBase64Value[last] & 3)) {return false;}
  // When UTF-7 string length%8=6, the last four bits must be padded as 0
  if ((nmod8 == 6) && (kBase64Value[last] & 15)) {return false;}
  return true;
}
// Prune here after N bytes
// Boost here for seven-bit sequences (at every prune)
// if (sevenbitrankedencoding)
// + UTF7 scan and boost/demote len mod 8 = 0 3 6
// ~ Hz scan and boost/demote len mod 8 = 0 2 4 6
// 1B 2022 scan and boost/demote len mod 8 = 0 2 4 6
// 0E 2022 scan and boost/demote len mod 8 = 0 2 4 6
// [0F 2022 boost/demote]
// 00 UTF16/32 scan and boost/demote offset = even/odd
//
// If still some seven-bit possibilities > pure ASCII,
// scan each possibility for clearer prob, s.t. about
// two good sequences is a clear win
// A-Z 00-19 00xx-64xx (B = 04xx)
// a-z 1A-33 68xx-CCxx (f = 7Cxx)
// 0-9 34-3D D0xx-F4xx (1 = D4xx)
// + 3E F8xx
// / 3F FCxx
// do another chunk with slow scan
// Boost, whack, or leave alone UTF-7 probability
// next_pair indexes the AsciiPair entry being examined; byte2 is tested as
// the byte after the '+' (the base64 scan starts one byte past the offset).
void UTF7BoostWhack(DetectEncodingState* destatep, int next_pair, uint8 byte2) {
  const int plus_offset = destatep->interesting_offsets[AsciiPair][next_pair];
  if (plus_offset < destatep->prior_utf7_offset) {
    // Part of a previous successful UTF-7 string; nothing to do
    return;
  }
  ++destatep->utf7_starts;

  if (byte2 == '-') {
    // +- encoding for '+' is neutral
    return;
  }
  if (!Base64Char(byte2)) {
    // Not base64 -- not UTF-7, whack
    Whack(destatep, F_UTF7, kBadPairWhack);               // Illegal pair
    return;
  }

  // Starts with base64 byte, might be a good UTF7 sequence
  const uint8* b64_start = destatep->initial_src + plus_offset + 1;  // over +
  const int b64_len = Base64ScanLen(b64_start, destatep->limit_src);
  if ((b64_len == 3) || (b64_len == 6)) {
    // short but legal -- treat as neutral
    return;
  }

  const int len_mod8 = b64_len & 7;
  const bool legal_length =
      (len_mod8 == 0) || (len_mod8 == 3) || (len_mod8 == 6);
  // The Unicode check is only made when the length is legal
  if (legal_length &&
      GoodUnicodeFromBase64(b64_start, b64_start + b64_len)) {
    // Good length and Unicode, boost, and remember where this string ends
    Boost(destatep, F_UTF7, kBoostOnePair);               // Found good
    destatep->prior_utf7_offset = plus_offset + b64_len + 1;
  } else {
    // Bad length or bad Unicode. Whack
    Whack(destatep, F_UTF7, kBadPairWhack);
  }
}
// Boost, whack, or leave alone HZ probability, based on byte2, which the
// comments below treat as the byte after a '~'.
void HzBoostWhack(DetectEncodingState* destatep, uint8 byte2) {
  switch (byte2) {
    case '{':
    case '}':
      Boost(destatep, F_HZ_GB_2312, kBoostOnePair);       // Found ~{ or ~}
      break;
    case '~':
    case '\n':
      // neutral: probability is left unchanged
      break;
    default:
      Whack(destatep, F_HZ_GB_2312, kBadPairWhack);       // Illegal pair
      break;
  }
}
// Boost, whack, or leave alone BINARY probability
// Tracks which high-bit patterns of the pair (byte1, byte2) have been seen:
//   quadrant   top bit of byte1 and top bit of byte2          (4 values)
//   bucket8x4  top three bits of byte1, top two bits of byte2 (32 values)
// F_BINARY is boosted once when all four quadrants have been seen, and again
// for every newly seen bucket from the 11th onward.
void BinaryBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
  int quadrant = ((byte1 & 0x80) >> 6) | ((byte2 & 0x80) >> 7);
  int bucket8x4 = ((byte1 & 0xe0) >> 3) | ((byte2 & 0xc0) >> 6);
  // bucket8x4 can be 31, so shift an unsigned 1; shifting a signed int into
  // its sign bit is not portable.
  uint32 quad_mask = static_cast<uint32>(1) << quadrant;
  uint32 bucket8x4_mask = static_cast<uint32>(1) << bucket8x4;

  if ((destatep->binary_quadrants_seen & quad_mask) == 0) {
    // First time this quadrant is seen
    destatep->binary_quadrants_seen |= quad_mask;
    destatep->binary_quadrants_count += 1;
    if (destatep->binary_quadrants_count == 4) {
      Boost(destatep, F_BINARY, kBoostOnePair * 2);       // Found all 4 quadrants,
                                                          // boost 2 pairs
    }
  }
  if ((destatep->binary_8x4_seen & bucket8x4_mask) == 0) {
    // First time this bucket is seen
    destatep->binary_8x4_seen |= bucket8x4_mask;
    destatep->binary_8x4_count += 1;
    if (destatep->binary_8x4_count >= 11) {
      Boost(destatep, F_BINARY, kBoostOnePair * 4);       // Found 11+ buckets,
                                                          // boost 4 pairs each time
    }
  }
}
// Demote UTF-16/32 on 0000 or FFFF, favoring Binary
// byte1 == 0 is handled as the pair 0000; any other value as ffff.
// offset is the position of the pair; its low two bits choose which UTF-32
// byte order a 0000 pair supports.
void UTF1632BoostWhack(DetectEncodingState* destatep, int offset, uint8 byte1) {
  if (byte1 != 0) {
    // We have ffff: illegal in all four encodings
    Whack(destatep, F_UTF_32BE, kBadPairWhack);           // Illegal pair
    Whack(destatep, F_UTF_32LE, kBadPairWhack);           // Illegal pair
    Whack(destatep, F_UTF_16BE, kBadPairWhack);           // Illegal pair
    Whack(destatep, F_UTF_16LE, kBadPairWhack);           // Illegal pair
    return;
  }

  // We have 0000: illegal in UTF-16 of either byte order
  Whack(destatep, F_UTF_16BE, kBadPairWhack);             // Illegal pair
  Whack(destatep, F_UTF_16LE, kBadPairWhack);             // Illegal pair

  const int phase = offset & 3;
  if (phase == 0) {
    // We get called with 0 4 8, etc. for ASCII/BMP as UTF-32BE
    Whack(destatep, F_UTF_32LE, kBadPairWhack);           // Illegal pair
    Boost(destatep, F_UTF_32BE, kSmallInitDiff);          // Good pair
  } else if ((phase == 1) || (phase == 2)) {
    // We get called with 1 5 9, etc. for ASCII as UTF-32LE
    // and with 2 6 10, etc. for BMP as UTF-32LE
    Whack(destatep, F_UTF_32BE, kBadPairWhack);           // Illegal pair
    Boost(destatep, F_UTF_32LE, kSmallInitDiff);          // Good pair
  }
  // phase 3 is ambiguous; UTF-32 is left alone
}
// Round the recorded offset of OtherPair entry next_pair down to an even
// value by clearing its low bit.
void UTF16MakeEven(DetectEncodingState* destatep, int next_pair) {
  destatep->interesting_offsets[OtherPair][next_pair] =
      destatep->interesting_offsets[OtherPair][next_pair] & ~1;
}
// True if OtherPair entry i starts exactly two bytes after entry i-1.
// Entry 0 (or any i <= 0) has no predecessor, so the result is false.
bool ConsecutivePair(DetectEncodingState* destatep, int i) {
  return (i > 0) &&
         (destatep->interesting_offsets[OtherPair][i] ==
          (destatep->interesting_offsets[OtherPair][i - 1] + 2));
}
// Boost, whack, or leave alone UTF-8 probability
// Any whacks are also applied to UTF8UTF8; CheckUTF8UTF8Seq assumes good UTF8
// Returns total boost
int CheckUTF8Seq(DetectEncodingState* destatep, int weightshift) {
int startcount = destatep->prior_interesting_pair[OtherPair];
int endcount = destatep->next_interesting_pair[OtherPair];
int demotion_count = 0;
for (int i = startcount; i < endcount; ++i) {
int sub;
char* s = &destatep->interesting_pairs[OtherPair][i * 2];
// Demote four byte patterns that are more likely Latin1 than UTF-8
// C9AE, DF92, DF93, DFAB. See note at top.
// Demotion also boosts Latin1 and CP1252
uint8 s0 = static_cast<uint8>(s[0]);
uint8 s1 = static_cast<uint8>(s[1]);
if ((s0 == 0xc9) && (s1 == 0xae)) {++demotion_count;}
if ((s0 == 0xdf) && (s1 == 0x92)) {++demotion_count;}
if ((s0 == 0xdf) && (s1 == 0x93)) {++demotion_count;}
if ((s0 == 0xdf) && (s1 == 0xab)) {++demotion_count;}
if (!ConsecutivePair(destatep, i)) {
// Insert a blank into the sequence; avoid wrong splices
sub = (' ' >> 4) & 0x0f;
++destatep->utf8_minicount[
static_cast<int>(kMiniUTF8Count[static_cast<int>(destatep->next_utf8_ministate)][sub])];