| // Copyright 2016 Google Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| #include "compact_enc_det/compact_enc_det.h" |
| |
| #include <math.h> // for sqrt |
| #include <stddef.h> // for size_t |
| #include <stdio.h> // for printf, fprintf, NULL, etc |
| #include <stdlib.h> // for qsort |
| #include <string.h> // for memset, memcpy, memcmp, etc |
| #include <memory> |
| #include <string> // for string, operator==, etc |
| |
| #include "compact_enc_det/compact_enc_det_hint_code.h" |
| #include "util/string_util.h" |
| #include "util/basictypes.h" |
| #include "util/commandlineflags.h" |
| #include "util/logging.h" |
| |
| using std::string; |
| |
| // TODO as of 2007.10.09: |
| // |
| // Consider font=TT-BHxxx as user-defined => binary |
| // Demote GB18030 if no 8x3x pair |
| // Map byte2 ascii punct to 0x60, digits to 0x7e, gets them into hires |
| // Consider removing/ignoring bytes 01-1F to avoid crap pollution |
| // Possibly boost declared encoding in robust scan |
| // googlebot tiny files |
| // look for ranges of encodings |
| // consider tags just as > < within aligned block of 32 |
| // flag too few characters in postproc (Latin 6 problem) |
| // Remove slow scan beyond 16KB |
| // Consider removing kMostLikelyEncoding or cut it in half |
| |
| |
| // A note on mixed encodings |
| // |
| // The most common encoding error on the web is a page containing a mixture of |
| // CP-1252 and UTF-8. A less common encoding error is a third-party feed that |
| // has been converted from CP-1252 to UTF-8 and then those bytes converted a |
| // second time to UTF-8. CED originally attempted to detect these error cases |
| // by using two synthetic encodings, UTF8CP1252 and UTF8UTF8. The intended |
| // implementation was to start these just below CP1252 and UTF8 respectively in |
// overall likelihood, and allow 1252 and UTF8 to fall behind if mixtures are
| // found. |
| // |
| // The UTF8UTF8 encoding is a possible outcome from CED, but unfortunately the |
| // UTF8CP1252 internal encoding was added late and not put into encodings.proto, |
| // so at the final step it is mapped to UTF8UTF8 also. This was a bad idea and |
| // is removed in this November 2011 CL. |
| // |
| // Mixed encoding detection never worked out as well as envisioned, so the |
| // ced_allow_utf8utf8 flag normally disables all this. |
| // |
| // The effect is that CP-1252 and UTF-8 mixtures will usually be detected as |
| // UTF8, and the inputconverter code for UTF8 normally will convert bare |
| // CP-1252 bytes to UTF-8, instead of the less-helpful FFFD substitution. UTF-8 |
| // and double-UTF-8 mixtures will be detected as UTF-8, and the double |
| // conversion will stand. |
| // |
| // However, it is occasionally useful to use CED to detect double-converted |
| // UTF-8 coming from third-party data feeds, so they can be fixed at the source. |
| // For this purpose, the UTF8UTF8 encoding remains available under the |
| // ced_allow_utf8utf8 flag. |
| // |
| // When UTF8UTF8 is detected, the inputconverter code will undo the double |
| // conversion, giving good text. |
| |
| // Norbert Runge has noted these words in CP1252 that are mistakenly identified |
| // as UTF-8 because of the last pair of characters: |
| // NESTLÉ® 0xC9 0xAE U+00C9 U+00AE C9AE = U+026E;SMALL LEZH |
| // drauß\u2019 0xDF 0x92 U+00DF U+2019 DF92 = U+07D2;NKO LETTER N |
| // Mutterschoß\u201c 0xDF 0x93 U+00DF U+201C DF93 = U+07D3;NKO LETTER BA |
| // Schoß\u201c 0xDF 0x93 U+00DF U+201C |
// weiß\u201c 0xDF 0x93 U+00DF U+201C
| // Schnellfuß\u201c 0xDF 0x93 U+00DF U+201C |
| // süß« 0xDF 0xAB U+00DF U+00AB DFAB = U+07EB;NKO HIGH TONE |
| // These four byte combinations now explicitly boost Latin1/CP1252. |
| |
| // And for reference, here are a couple of Portuguese spellings |
| // that may be mistaken as double-byte encodings. |
| // informações 0xE7 0xF5 |
| // traição 0xE7 0xE3 |
| |
| |
// Version string of this detector. Both the pointer and the characters it
// points at are const, so the version cannot be retargeted at run time.
static const char* const kVersion = "2.2";
| |
// Detection-control flags. Each DEFINE_* invocation takes
// (flag name, default value, help text). The macros themselves are not
// defined in this file (presumably util/commandlineflags.h -- confirm).

// Off by default. Per the note on mixed encodings above, the UTF8UTF8
// encoding is only available when this flag is set.
DEFINE_bool(ced_allow_utf8utf8, false, "Allow the UTF8UTF8 encoding, "
            "to handle mixtures of CP1252 "
            "converted to UTF-8 zero, one, "
            "or two times");
// Scan limit, in Kbytes, for the 7-bit-only encodings. Default 16.
DEFINE_int32(enc_detect_slow_max_kb, 16,
             "Maximum number of Kbytes to examine for "
             "7-bit-only (2022, Hz, UTF7) encoding detect. "
             "You are unlikely to want to change this.");
// Overall scan limit, in Kbytes. Default 256.
DEFINE_int32(enc_detect_fast_max_kb, 256,
             "Maximum number of Kbytes to examine for encoding detect. "
             "You are unlikely to want to change this.");

// The value is 30 * bits (the same multiplier as XLOG2 below), so the
// default of 300 corresponds to 10 bits.
DEFINE_int32(ced_reliable_difference, 300, "30 * Bits of minimum probablility "
             "difference 1st - 2nd to be considered reliable \n"
             " 2 corresponds to min 4x difference\n"
             " 4 corresponds to min 16x difference\n"
             " 8 corresponds to min 256x difference\n"
             " 10 corresponds to min 1024x difference\n"
             " 20 corresponds to min 1Mx difference.");
| |
// Text debug output options. All default to false.
DEFINE_bool(enc_detect_summary, false,
            "Print first 16 interesting pairs at exit.");
DEFINE_bool(counts, false, "Count major-section usage");

// PostScript debug output options. All default to false; per the help text
// the detail output goes to stderr.
DEFINE_bool(enc_detect_detail, false,
            "Print PostScript of every update, to stderr.");
DEFINE_bool(enc_detect_detail2, false,
            "More PostScript detail of every update, to stderr.");
DEFINE_bool(enc_detect_source, false, "Include source text in detail");
| // Encoding name must exactly match FIRST column of kI18NInfoByEncoding in |
| // lang_enc.cc |
| |
| // Following flags are not in use. Replace them with constants to |
| // avoid static initialization. |
| |
| //DEFINE_string(enc_detect_watch1, "", "Do detail2 about this encoding name."); |
| //DEFINE_string(enc_detect_watch2, "", "Do detail2 about this encoding name."); |
| |
// Constant stand-ins for the two unused enc_detect_watch string flags
// (commented out above). Both are the empty string.
static const char* const FLAGS_enc_detect_watch1 = "";
static const char* const FLAGS_enc_detect_watch2 = "";
| |
// Only for experiments. Delete soon.
// Default false.
DEFINE_bool(force127, false, "Force Latin1, Latin2, Latin7 based on trigrams");

// Demo-mode/debugging experiment flags. All default to false.
DEFINE_bool(demo_nodefault, false,
            "Default to all equal; no boost for declared encoding.");
DEFINE_bool(dirtsimple, false, "Just scan and count for all encodings");
DEFINE_bool(ced_echo_input, false, "Echo ced input to stderr");
| |
| |
// Probabilities are handled as scaled base-2 logarithms. XLOG2 is the
// multiplier for whole powers of two ("bits"); XDECILOG2 is the multiplier
// for tenths of a power of two.
static const int XDECILOG2 = 3;
static const int XLOG2 = 30;

// Final bits of minimum probability difference 1st-nth to be pruned.
static const int kFinalPruneDifference = XLOG2 * 10;

// Initial bits of minimum probability difference 1st-nth to be pruned.
static const int kInititalPruneDifference = 4 * kFinalPruneDifference;

// Decrement of the bits of minimum probability difference 1st-nth to be
// pruned.
static const int kPruneDiffDecrement = kFinalPruneDifference;

// Bits of minimum probability difference, base to superset encodings.
static const int kSmallInitDiff = XLOG2 * 2;

// Bits of boost for initial byte patterns (BOM, 00).
static const int kBoostInitial = XLOG2 * 20;

// Bits of whack for one bad pair.
static const int kBadPairWhack = XLOG2 * 20;

// Bits of boost for one good pair in Hz, etc.
static const int kBoostOnePair = XLOG2 * 20;

// Bits of boost for one good sequence.
static const int kGentleOnePair = XLOG2 * 4;

// Bits of whack for an ill-formed sequence.
static const int kGentlePairWhack = XLOG2 * 2;

// Bits of boost for a well-formed sequence.
static const int kGentlePairBoost = XLOG2 * 2;

// Bits/10 of boost for best declared encoding per bigram.
static const int kDeclaredEncBoost = XDECILOG2 * 5;

// Bits/10 of boost for best encoding per bigram.
static const int kBestEncBoost = XDECILOG2 * 5;

// Bits of boost for Latin127 trigrams.
static const int kTrigramBoost = XLOG2 * 2;
| |
// Max interesting pairs to look at.
// If you change this, adjust *PruneDiff*.
static const int kMaxPairs = 48;

// Prune every 8 interesting pairs (mask of the low three bits).
static const int kPruneMask = 8 - 1;

// For the first N pairs, do extra boost based on the most likely encoding
// of the pair over the entire web.
static const int kBestPairsCount = 16;

// If we have fewer than N bigrams, weaken the hints enough that unhinted
// encodings have a hope of rising to the top.
static const int kDerateHintsBelow = 12;

// Don't bother rescanning for unreliable encoding if fewer than this many
// bytes are unscanned. We will rescan at most the last half of this.
static const int kMinRescanLength = 800;

// Make F_BINARY the only encoding.
static const int kStrongBinary = 12;
// Make F_BINARY a likely encoding.
static const int kWeakerBinary = 4;

// These are byte counts from the front of the file.
// Not binary if all ASCII within the hard limit; not binary if mostly ASCII
// within the soft limit.
static const int kBinaryHardAsciiLimit = 1024 * 6;
static const int kBinarySoftAsciiLimit = 1024 * 8;

// We try here to avoid having title text dominate the encoding detection,
// for the not-infrequent error case of title in encoding1, body in encoding2:
// we want to bias toward encoding2 winning.
//
// kMaxBigramsTagTitleText should be a multiple of 2, 3, and 4, so that we
// rarely cut off mid-character in the original (not-yet-detected) encoding.
// This matters most for UTF-8 two- and three-byte codes and for
// Shift-JIS three-byte codes.

// Keep only some tag text.
static const int kMaxBigramsTagTitleText = 12;
// Give text in tags, etc. 1/16 normal weight (a shift of 4).
static const int kWeightshiftForTagTitleText = 4;

// Let a reliable encoding with this many pairs overcome a missing hint.
static const int kStrongPairs = 6;
| |
// Internal option bits passed to InternalDetectEncoding. Each non-zero value
// occupies its own bit.
enum CEDInternalFlags {
  kCEDNone       = 0,       // The empty flag
  kCEDRescanning = 1 << 0,  // Do not further recurse
  kCEDSlowscore  = 1 << 1,  // Do extra scoring
  kCEDForceTags  = 1 << 2,  // Always examine text inside tags
};
| |
| // Forward declaration |
| Encoding InternalDetectEncoding( |
| CEDInternalFlags flags, const char* text, int text_length, |
| const char* url_hint, const char* http_charset_hint, |
| const char* meta_charset_hint, const int encoding_hint, |
| const Language language_hint, // User interface lang |
| const CompactEncDet::TextCorpusType corpus_type, |
| bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable, |
| Encoding* second_best_enc); |
| |
| typedef struct { |
| const uint8* hires[4]; // Pointers to possible high-resolution bigram deltas |
| uint8 x_bar; // Average byte2 value |
| uint8 y_bar; // Average byte1 value |
| uint8 x_stddev; // Standard deviation of byte2 value |
| uint8 y_stddev; // Standard deviation of byte1 value |
| int so; // Scaling offset -- add to probabilities below |
| uint8 b1[256]; // Unigram probability for first byte of aligned bigram |
| uint8 b2[256]; // Unigram probability for second byte of aligned bigram |
| uint8 b12[256]; // Unigram probability for cross bytes of aligned bigram |
| } UnigramEntry; |
| |
| //typedef struct { |
| // uint8 b12[256*256]; // Bigram probability for aligned bigram |
| //} FullBigramEntry; |
| |
| |
| // Include all the postproc-generated tables here: |
| // RankedEncoding |
| // kMapToEncoding |
| // unigram_table |
// kMostLikelyEncoding
| // kTLDHintProbs |
| // kCharsetHintProbs |
| // HintEntry, kMaxTldKey kMaxTldVector, etc. |
| // ============================================================================= |
| |
| #include "compact_enc_det/compact_enc_det_generated_tables.h" |
| |
| |
// Local aliases for ranked-encoding names. The F_* targets on the right are
// not defined in this part of the file (presumably they come from the
// generated tables included above -- confirm).
#define F_ASCII F_Latin1  // "ASCII" is a misnomer, so this code uses "Latin1"

// The remaining aliases exist because a name change is in progress.
#define F_BINARY F_X_BINARYENC    // We are mid-update for name change
#define F_UTF8UTF8 F_X_UTF8UTF8   // We are mid-update for name change
#define F_BIG5_CP950 F_BIG5       // We are mid-update for name change
#define F_Unicode F_UTF_16LE      // We are mid-update for name change
| // ============================================================================= |
| |
| // 7-bit encodings have at least one "interesting" byte value < 0x80 |
| // (00 0E 1B + ~) |
| // JIS 2022-cn 2022-kr hz utf7 |
| // Unicode UTF-16 UTF-32 |
| // 8-bit encodings have no interesting byte values < 0x80 |
| static const uint32 kSevenBitActive = 0x00000001; // needs <80 to detect |
| static const uint32 kUTF7Active = 0x00000002; // <80 and + |
| static const uint32 kHzActive = 0x00000004; // <80 and ~ |
| static const uint32 kIso2022Active = 0x00000008; // <80 and 1B 0E 0F |
| static const uint32 kUTF8Active = 0x00000010; |
| static const uint32 kUTF8UTF8Active = 0x00000020; |
| static const uint32 kUTF1632Active = 0x00000040; // <80 and 00 |
| static const uint32 kBinaryActive = 0x00000080; // <80 and 00 |
| static const uint32 kTwobyteCode = 0x00000100; // Needs 8xxx |
| static const uint32 kIsIndicCode = 0x00000200; // |
| static const uint32 kHighAlphaCode = 0x00000400; // full alphabet in 8x-Fx |
| static const uint32 kHighAccentCode = 0x00000800; // accents in 8x-Fx |
| static const uint32 kEUCJPActive = 0x00001000; // Have to mess with phase |
| |
| |
// Debug only. not thread safe
// File-scope counters, all starting at zero. They are plain ints with no
// synchronization; where they are incremented is not visible in this part
// of the file.
static int encdet_used = 0;
static int rescore_used = 0;
static int rescan_used = 0;
static int robust_used = 0;
static int looking_used = 0;
static int doing_used = 0;
| |
| |
| // For debugging only -- about 256B/entry times about 500 = 128KB |
| // TODO: only allocate this if being used |
| typedef struct { |
| int offset; |
| int best_enc; // Best ranked encoding for this bigram, or |
| // -1 for overhead entries |
| string label; |
| int detail_enc_prob[NUM_RANKEDENCODING]; |
| } DetailEntry; |
| |
// Both start at -1. NOTE(review): presumably the ranked encodings selected by
// FLAGS_enc_detect_watch1/2, with -1 meaning "none" -- confirm at the point
// where they are assigned.
static int watch1_rankedenc = -1;  // Debug. not threadsafe
static int watch2_rankedenc = -1;  // Debug. not threadsafe
| ////static int next_detail_entry = 0; // Debug. not threadsafe |
| ////static DetailEntry details[kMaxPairs * 10]; // Allow 10 details per bigram |
| // End For debugging only |
| |
// Which set an interesting pair belongs to.
// Must match kTestPrintableAsciiTildePlus exit codes, minus one.
enum PairSet {
  AsciiPair = 0,
  OtherPair = 1,
  NUM_PAIR_SETS = 2
};
| |
// The reasons for pruning
enum PruneReason {
  PRUNE_NORMAL = 0,
  PRUNE_SLOWEND = 1,
  PRUNE_FINAL = 2
};
| |
// Printable names, in the same order as the PairSet values (Ascii, Other).
// The element pointers are const as well as the characters, so entries of
// this table cannot be reassigned.
static const char* const kWhatSetName[] = {"Ascii", "Other"};
| |
| |
// State for encodings that do shift-out/shift-in between one- and two-byte
// regions (ISO-2022-xx, HZ)
enum StateSoSi {
  SOSI_NONE = 0,
  SOSI_ERROR = 1,
  SOSI_ONEBYTE = 2,
  SOSI_TWOBYTE = 3
};
| |
| typedef struct { |
| const uint8* initial_src; // For calculating byte offsets |
| const uint8* limit_src; // Range of input source |
| const uint8* prior_src; // Source consumed by prior call to BoostPrune |
| const uint8* last_pair; // Last pair inserted into interesting_pairs |
| |
| DetailEntry* debug_data; // Normally NULL. Ptr to debug data for |
| // FLAGS_enc_detect_detail PostScript data |
| int next_detail_entry; // Debug |
| |
| bool done; |
| bool reliable; |
| bool hints_derated; |
| int declared_enc_1; // From http/meta hint |
| int declared_enc_2; // from http/meta hint |
| int prune_count; // Number of times we have pruned |
| |
| int trigram_highwater_mark; // Byte offset of last trigram processing |
| bool looking_for_latin_trigrams; // True if we should test for doing |
| // Latin1/2/7 trigram processing |
| bool do_latin_trigrams; // True if we actually are scoring trigrams |
| |
| // Miscellaneous state variables for difficult encodings |
| int binary_quadrants_count; // Number of four bigram quadrants seen: |
| // 0xxxxxxx0xxxxxxx 0xxxxxxx1xxxxxx |
| // 1xxxxxxx0xxxxxxx 1xxxxxxx1xxxxxx |
| int binary_8x4_count; // Number of 8x4 buckets seen: |
| uint32 binary_quadrants_seen; // Bit[i] set if bigram i.......i....... seen |
| uint32 binary_8x4_seen; // Bit[i] set if bigram iii.....ii...... seen |
| int utf7_starts; // Count of possible UTF-7 beginnings seen |
| int prior_utf7_offset; // Source consumed by prior UTF-7 string |
| int next_utf8_ministate; // Mini state for UTF-8 sequences |
| int utf8_minicount[6]; // Number of correct 2- 3- 4-byte seq, errors |
| int next_utf8utf8_ministate; // Mini state for UTF8UTF8 sequences |
| int utf8utf8_odd_byte; // UTF8UTF8 seq has odd number of bytes |
| int utf8utf8_minicount[6]; // Number of correct 2- 3- 4-byte seq, errors |
| StateSoSi next_2022_state; // Mini state for 2022 sequences |
| StateSoSi next_hz_state; // Mini state for HZ sequences |
| bool next_eucjp_oddphase; // Mini state for EUC-JP sequences |
| int byte32_count[8]; // Count of top 3 bits of byte1 of bigram |
| // 0x1x 2x3x 4x5x 6x7x 8x9x AxBx CxDx ExFx |
| uint32 active_special; // Bits showing which special cases are active |
| |
| Encoding tld_hint; // Top TLD encoding or UNKNOWN |
| Encoding http_hint; // What the document says about itself or |
| Encoding meta_hint; // UNKNOWN_ENCODING. BOM is initial byte |
| Encoding bom_hint; // order mark for UTF-xx |
| |
| // small cache of previous interesting bigrams |
| int next_prior_bigram; |
| int prior_bigram[4]; |
| int prior_binary[1]; |
| |
| int top_rankedencoding; // Top two probabilities and families |
| int second_top_rankedencoding; |
| int top_prob; |
| int second_top_prob; |
| int prune_difference; // Prune things this much below the top prob |
| int rankedencoding_list_len; // Number of active encodings |
| int rankedencoding_list[NUM_RANKEDENCODING]; // List of active encodings |
| // |
| int enc_prob[NUM_RANKEDENCODING]; // Cumulative probability per enc |
| // This is where all the action is |
| int hint_prob[NUM_RANKEDENCODING]; // Initial hint probabilities |
| int hint_weight[NUM_RANKEDENCODING]; // Number of hints for this enc |
| |
| // Two sets -- one for printable ASCII, one for the rest |
| int prior_interesting_pair[NUM_PAIR_SETS]; // Pairs consumed by prior call |
| int next_interesting_pair[NUM_PAIR_SETS]; // Next pair to write |
| char interesting_pairs[NUM_PAIR_SETS][kMaxPairs * 2]; // Two bytes per pair |
| int interesting_offsets[NUM_PAIR_SETS][kMaxPairs]; // Src offset of pair |
| int interesting_weightshift[NUM_PAIR_SETS][kMaxPairs]; // weightshift of pair |
| } DetectEncodingState; |
| |
| |
| // Record a debug event that changes probabilities |
| void SetDetailsEncProb(DetectEncodingState* destatep, |
| int offset, int best_enc, const char* label) { |
| int next = destatep->next_detail_entry; |
| destatep->debug_data[next].offset = offset; |
| destatep->debug_data[next].best_enc = best_enc; |
| destatep->debug_data[next].label = label; |
| memcpy(&destatep->debug_data[next].detail_enc_prob, |
| &destatep->enc_prob, |
| sizeof(destatep->enc_prob)); |
| ++destatep->next_detail_entry; |
| } |
| |
| // Record a debug event that changes probabilities, copy offset |
| void SetDetailsEncProbCopyOffset(DetectEncodingState* destatep, |
| int best_enc, const char* label) { |
| int next = destatep->next_detail_entry; |
| destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset; |
| destatep->debug_data[next].best_enc = best_enc; |
| destatep->debug_data[next].label = label; |
| memcpy(&destatep->debug_data[next].detail_enc_prob, |
| &destatep->enc_prob, |
| sizeof(destatep->enc_prob)); |
| ++destatep->next_detail_entry; |
| } |
| |
| // Record a debug event that changes probs and has simple text label |
| void SetDetailsEncLabel(DetectEncodingState* destatep, const char* label) { |
| int next = destatep->next_detail_entry; |
| destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset; |
| destatep->debug_data[next].best_enc = -1; |
| destatep->debug_data[next].label = label; |
| memcpy(&destatep->debug_data[next].detail_enc_prob, |
| &destatep->enc_prob, |
| sizeof(destatep->enc_prob)); |
| ++destatep->next_detail_entry; |
| } |
| |
| // Record a debug event that is just a text label, no change in probs |
| void SetDetailsLabel(DetectEncodingState* destatep, const char* label) { |
| int next = destatep->next_detail_entry; |
| destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset; |
| destatep->debug_data[next].best_enc = -1; |
| destatep->debug_data[next].label = label; |
| memcpy(&destatep->debug_data[next].detail_enc_prob, |
| &destatep->debug_data[next - 1].detail_enc_prob, |
| sizeof(destatep->enc_prob)); |
| ++destatep->next_detail_entry; |
| } |
| |
| |
// Maps superset encodings to base, to see if 2 encodings are compatible
// (Non-identity mappings are marked "-->" below.)
// One entry per Encoding value, in index order; the number on each line is
// the entry's index. The COMPILE_ASSERT after the table checks that it has
// exactly NUM_ENCODINGS entries.
static const Encoding kMapEncToBaseEncoding[] = {
  ISO_8859_1,            // 0: Teragram ASCII
  ISO_8859_2,            // 1: Teragram Latin2
  ISO_8859_3,            // 2: in BasisTech but not in Teragram
  ISO_8859_4,            // 3: Teragram Latin4
  ISO_8859_5,            // 4: Teragram ISO-8859-5
  ISO_8859_6,            // 5: Teragram Arabic
  ISO_8859_7,            // 6: Teragram Greek
  MSFT_CP1255,           // 7: Teragram Hebrew --> 36
  ISO_8859_9,            // 8: in BasisTech but not in Teragram
  ISO_8859_10,           // 9: in BasisTech but not in Teragram
  JAPANESE_EUC_JP,       // 10: Teragram EUC_JP
  JAPANESE_SHIFT_JIS,    // 11: Teragram SJS
  JAPANESE_JIS,          // 12: Teragram JIS
  CHINESE_BIG5,          // 13: Teragram BIG5
  CHINESE_GB,            // 14: Teragram GB
  CHINESE_EUC_CN,        // 15: Teragram EUC-CN
  KOREAN_EUC_KR,         // 16: Teragram KSC
  UNICODE,               // 17: Teragram Unicode
  CHINESE_EUC_CN,        // 18: Teragram EUC --> 15
  CHINESE_EUC_CN,        // 19: Teragram CNS --> 15
  CHINESE_BIG5,          // 20: Teragram BIG5_CP950 --> 13
  JAPANESE_SHIFT_JIS,    // 21: Teragram CP932 --> 11
  UTF8,                  // 22
  UNKNOWN_ENCODING,      // 23
  ISO_8859_1,            // 24: ISO_8859_1 with all characters <= 127 --> 0
  RUSSIAN_KOI8_R,        // 25: Teragram KOI8R
  RUSSIAN_CP1251,        // 26: Teragram CP1251
  ISO_8859_1,            // 27: CP1252 aka MSFT euro ascii --> 0
  RUSSIAN_KOI8_RU,       // 28: CP21866 aka KOI8_RU, used for Ukrainian
  MSFT_CP1250,           // 29: CP1250 aka MSFT eastern european
  ISO_8859_1,            // 30: aka ISO_8859_0 aka ISO_8859_1 euroized --> 0
  ISO_8859_9,            // 31: used for Turkish
  ISO_8859_13,           // 32: used in Baltic countries --> 43
  ISO_8859_11,           // 33: aka TIS-620, used for Thai
  ISO_8859_11,           // 34: used for Thai --> 33
  MSFT_CP1256,           // 35: used for Arabic
  MSFT_CP1255,           // 36: Logical Hebrew Microsoft
  MSFT_CP1255,           // 37: Iso Hebrew Logical --> 36
  MSFT_CP1255,           // 38: Iso Hebrew Visual --> 36
  CZECH_CP852,           // 39
  ISO_8859_2,            // 40: aka ISO_IR_139 aka KOI8_CS --> 1
  MSFT_CP1253,           // 41: used for Greek, but NOT a superset of 8859-7
  RUSSIAN_CP866,         // 42
  ISO_8859_13,           // 43
  ISO_2022_KR,           // 44
  CHINESE_GB,            // 45 GBK --> 14
  CHINESE_GB,            // 46 GB18030 --> 14
  CHINESE_BIG5,          // 47 BIG5_HKSCS --> 13
  ISO_2022_KR,           // 48 ISO_2022_CN --> 44
  TSCII,                 // 49 Indic encoding
  TAMIL_MONO,            // 50 Indic encoding - Tamil
  TAMIL_BI,              // 51 Indic encoding - Tamil
  JAGRAN,                // 52 Indic encoding - Devanagari
  MACINTOSH_ROMAN,       // 53
  UTF7,                  // 54
  BHASKAR,               // 55 Indic encoding - Devanagari
  HTCHANAKYA,            // 56 Indic encoding - Devanagari
  UTF16BE,               // 57
  UTF16LE,               // 58
  UTF32BE,               // 59
  UTF32LE,               // 60
  BINARYENC,             // 61
  HZ_GB_2312,            // 62
  UTF8UTF8,              // 63
  TAM_ELANGO,            // 64 Elango - Tamil
  TAM_LTTMBARANI,        // 65 Barani - Tamil
  TAM_SHREE,             // 66 Shree - Tamil
  TAM_TBOOMIS,           // 67 TBoomis - Tamil
  TAM_TMNEWS,            // 68 TMNews - Tamil
  TAM_WEBTAMIL,          // 69 Webtamil - Tamil
  KDDI_SHIFT_JIS,        // 70 KDDI Shift_JIS
  DOCOMO_SHIFT_JIS,      // 71 DoCoMo Shift_JIS
  SOFTBANK_SHIFT_JIS,    // 72 SoftBank Shift_JIS
  KDDI_ISO_2022_JP,      // 73 KDDI ISO-2022-JP
  SOFTBANK_ISO_2022_JP,  // 74 SOFTBANK ISO-2022-JP
};

// Compile-time check that the table has one entry per Encoding value.
COMPILE_ASSERT(arraysize(kMapEncToBaseEncoding) == NUM_ENCODINGS,
               kMapEncToBaseEncoding_has_incorrect_size);
| |
// Maps base encodings to 0, supersets to 1+, undesired to -1
// (Non-identity mappings are marked "-->" below.)
// One entry per encoding, in index order; the number on each line is the
// entry's index.
static const int kMapEncToSuperLevel[] = {
   0,  //  0: Teragram ASCII
   0,  //  1: Teragram Latin2
   0,  //  2: in BasisTech but not in Teragram
   0,  //  3: Teragram Latin4
   0,  //  4: Teragram ISO-8859-5
   0,  //  5: Teragram Arabic
   0,  //  6: Teragram Greek
   0,  //  7: Teragram Hebrew
   0,  //  8: in BasisTech but not in Teragram
   0,  //  9: in BasisTech but not in Teragram
   0,  // 10: Teragram EUC_JP
   0,  // 11: Teragram SJS
   0,  // 12: Teragram JIS
   0,  // 13: Teragram BIG5
   0,  // 14: Teragram GB
   0,  // 15: Teragram EUC-CN
   0,  // 16: Teragram KSC
   0,  // 17: Teragram Unicode
  -1,  // 18: Teragram EUC --> 15
  -1,  // 19: Teragram CNS --> 15
   1,  // 20: Teragram BIG5_CP950 --> 13
   1,  // 21: Teragram CP932 --> 11
   0,  // 22
  -1,  // 23
  -1,  // 24: ISO_8859_1 with all characters <= 127 --> 0
   0,  // 25: Teragram KOI8R
   0,  // 26: Teragram CP1251
   1,  // 27: CP1252 aka MSFT euro ascii --> 0
   0,  // 28: CP21866 aka KOI8_RU, used for Ukrainian
   0,  // 29: CP1250 aka MSFT eastern european
   1,  // 30: aka ISO_8859_0 aka ISO_8859_1 euroized --> 0
   0,  // 31: used for Turkish
   1,  // 32: used in Baltic countries --> 43
   0,  // 33: aka TIS-620, used for Thai
   1,  // 34: used for Thai --> 33
   0,  // 35: used for Arabic
   0,  // 36: Logical Hebrew Microsoft
  -1,  // 37: Iso Hebrew Logical --> 36
  -1,  // 38: Iso Hebrew Visual --> 36
   0,  // 39
   1,  // 40: aka ISO_IR_139 aka KOI8_CS --> 1
   0,  // 41: used for Greek, NOT superset of 8859-7
   0,  // 42
   0,  // 43
   0,  // 44
   1,  // 45 GBK --> 14
   1,  // 46 GB18030 --> 14
   1,  // 47 BIG5_HKSCS --> 13
   1,  // 48 ISO_2022_CN --> 44
   0,  // 49 Indic encoding
   0,  // 50 Indic encoding - Tamil
   0,  // 51 Indic encoding - Tamil
   0,  // 52 Indic encoding - Devanagari
   0,  // 53
   0,  // 54
   0,  // 55 Indic encoding - Devanagari
   0,  // 56 Indic encoding - Devanagari
   0,  // 57
   0,  // 58
   0,  // 59
   0,  // 60
   0,  // 61
   0,  // 62
   2,  // 63

  // 64-69: six more Tamil
   0,  // 64
   0,  // 65
   0,  // 66
   0,  // 67
   0,  // 68
   0,  // 69

  // 70-74: five encodings with emoji
   0,  // 70
   0,  // 71
   0,  // 72
   0,  // 73
   0,  // 74
};
| |
// Compile-time check that kMapEncToSuperLevel has exactly NUM_ENCODINGS
// entries.
COMPILE_ASSERT(arraysize(kMapEncToSuperLevel) == NUM_ENCODINGS,
               kMapEncToSuperLevel_has_incorrect_size);
| |
| |
| |
| // Subscripted by Encoding enum value |
| static const uint32 kSpecialMask[] = { |
| kHighAccentCode, // 0 |
| kHighAccentCode, |
| kHighAccentCode, |
| kHighAccentCode, |
| kHighAlphaCode, // 4 |
| kHighAlphaCode, |
| kHighAlphaCode, |
| kHighAlphaCode, |
| kHighAccentCode, |
| kHighAccentCode, |
| |
| kTwobyteCode + kEUCJPActive, // 10 euc-jp |
| kTwobyteCode, |
| kSevenBitActive + kIso2022Active, // jis |
| kTwobyteCode, |
| kTwobyteCode, |
| kTwobyteCode, |
| kTwobyteCode, |
| kSevenBitActive + kUTF1632Active, // Unicode |
| kTwobyteCode, |
| kTwobyteCode, |
| |
| kTwobyteCode, // 20 |
| kTwobyteCode, |
| kUTF8Active, // UTF-8 |
| 0, |
| 0, |
| kHighAlphaCode, // 25 |
| kHighAlphaCode, |
| kHighAccentCode, |
| kHighAlphaCode, |
| kHighAccentCode, |
| |
| kHighAccentCode, // 30 |
| kHighAccentCode, |
| kHighAccentCode, |
| kHighAlphaCode, |
| kHighAlphaCode, |
| kHighAlphaCode, // 35 |
| kHighAlphaCode, |
| kHighAlphaCode, |
| kHighAlphaCode, |
| 0, |
| |
| 0, // 40 |
| kHighAlphaCode, |
| kHighAlphaCode, |
| kHighAccentCode, |
| kSevenBitActive + kIso2022Active, // 2022-kr |
| kTwobyteCode, |
| kTwobyteCode, |
| kTwobyteCode, |
| kSevenBitActive + kIso2022Active, // 2022-cn |
| kHighAlphaCode + kIsIndicCode, // 49 TSCII |
| |
| kHighAlphaCode + kIsIndicCode, // 50 TAMIL_MONO |
| kHighAlphaCode + kIsIndicCode, // 51 TAMIL_BI |
| kHighAlphaCode + kIsIndicCode, // 52 JAGRAN |
| kHighAccentCode, // 53 MACINTOSH_ROMAN |
| kSevenBitActive + kUTF7Active, // 54 UTF-7 |
| kHighAlphaCode + kIsIndicCode, // 55 BHASKAR Indic encoding - Devanagari |
| kHighAlphaCode + kIsIndicCode, // 56 HTCHANAKYA Indic encoding - Devanagari |
| kSevenBitActive + kUTF1632Active, // 57 UTF16BE |
| kSevenBitActive + kUTF1632Active, // 58 UTF16LE |
| kSevenBitActive + kUTF1632Active, // 59 UTF32BE |
| kSevenBitActive + kUTF1632Active, // 60 UTF32LE |
| |
| kSevenBitActive + kBinaryActive, // 61 BINARYENC |
| kSevenBitActive + kHzActive, // 62 HZ_GB_2312 |
| kHighAccentCode + kUTF8Active + kUTF8UTF8Active, // 63 UTF8UTF8 |
| kHighAlphaCode + kIsIndicCode, // 64 Elango - Tamil |
| kHighAlphaCode + kIsIndicCode, // 65 Barani - Tamil |
| kHighAlphaCode + kIsIndicCode, // 66 Shree - Tamil |
| kHighAlphaCode + kIsIndicCode, // 67 TBoomis - Tamil |
| kHighAlphaCode + kIsIndicCode, // 68 TMNews - Tamil |
| kHighAlphaCode + kIsIndicCode, // 69 Webtamil - Tamil |
| kTwobyteCode, // 70 KDDI Shift_JIS |
| kTwobyteCode, // 71 DoCoMo Shift_JIS |
| kTwobyteCode, // 72 SoftBank Shift_JIS |
| kSevenBitActive + kIso2022Active, // 73 KDDI-ISO-2022-JP |
| kSevenBitActive + kIso2022Active, // 74 SOFTBANK-ISO-2022-JP |
| }; |
| |
| COMPILE_ASSERT(arraysize(kSpecialMask) == NUM_ENCODINGS, |
| kSpecialMask_has_incorrect_size); |
| |
| |
| /*** |
| kHighAlphaCode -- full alphabet in 8x-Fx range, not just accents |
| |
| ISO_8859_5, // 4: Teragram ISO-8859-5 Cyrl UL bd |
| RUSSIAN_CP1251, // 26: Teragram CP1251 UL cdef |
| RUSSIAN_KOI8_R, // 25: Teragram KOI8R LU cdef |
| RUSSIAN_KOI8_RU, // 28: CP21866 aka KOI8_RU, LU cdef |
| RUSSIAN_CP866, // 42 89ae |
| |
| ISO_8859_6, // 5: Teragram Arabic nocase cde |
| MSFT_CP1256, // 35: used for Arabic nocase cde |
| |
| ISO_8859_7, // 6: Teragram Greek UL cdef |
| MSFT_CP1253, // 41: used for Greek UL cdef |
| |
| ISO_8859_8, // 7: Teragram Hebrew nocase ef |
| MSFT_CP1255, // 36: Logical Hebrew Microsoft nocase ef |
| ISO_8859_8_I, // 37: Iso Hebrew Logical nocase ef |
| HEBREW_VISUAL, // 38: Iso Hebrew Visual nocase ef |
| |
| ISO_8859_11, // 33: aka TIS-620, used for Thai nocase abcde |
| MSFT_CP874, // 34: used for Thai nocase abcde |
| |
| TSCII, // 49 8-f |
| TAMIL_MONO, // 50 |
| TAMIL_BI, // 51 |
| JAGRAN, // 52 |
| BHASKAR, // 55 Indic encoding - Devanagari |
| HTCHANAKYA, // 56 Indic encoding - Devanagari |
| ***/ |
| |
// Byte classifier for the slow scan, one entry per byte value (16 per row).
// We can scan bytes using this at about 500 MB/sec 2.8GHz P4
// Slow scan uses this, stopping on NUL ESC SO SI bad C0 and + ~
// We allow FF, 0x0C, here because it gives a better result for old
// Ascii text formatted for a TTY
// A non-zero entry exits the scan loop: 1 for '+' and '~', 2 for every other
// stopping byte (0x7F, 0x80-0xFF, and the C0 controls other than TAB, LF,
// FF and CR).
static const char kTestPrintableAsciiTildePlus[256] = {
  /* 0x00 */ 2,2,2,2,2,2,2,2, 2,0,0,2,0,0,2,2,
  /* 0x10 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0x20 */ 0,0,0,0,0,0,0,0, 0,0,0,1,0,0,0,0,
  /* 0x30 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x40 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x50 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x60 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x70 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,1,2,

  /* 0x80 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0x90 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xA0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xB0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xC0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xD0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xE0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xF0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
};
| |
// Byte classifier for the slow scan, one entry per byte value (16 per row).
// We can scan bytes using this at about 550 MB/sec 2.8GHz P4
// Slow scan uses this, stopping on NUL ESC SO SI and bad C0
// after Hz and UTF7 are pruned away
// We allow Form Feed, 0x0C, here
// Entries are 0 for TAB, LF, FF, CR and 0x20-0x7E (including '+' and '~'),
// and 2 for every other byte.
static const char kTestPrintableAscii[256] = {
  /* 0x00 */ 2,2,2,2,2,2,2,2, 2,0,0,2,0,0,2,2,
  /* 0x10 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0x20 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x30 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x40 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x50 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x60 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  /* 0x70 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,2,

  /* 0x80 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0x90 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xA0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xB0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xC0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xD0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xE0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  /* 0xF0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
};
| |
// Used in first-four-byte testing.
//   1 = HT (0x09), LF (0x0A), CR (0x0D), or printable ASCII 0x20..0x7E
//   0 = everything else, including FF, DEL and all bytes 0x80..0xFF
static const char kIsPrintableAscii[256] = {
  /* 0x00 */ 0,0,0,0, 0,0,0,0, 0,1,1,0, 0,1,0,0,
  /* 0x10 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  /* 0x20 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x30 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x40 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x50 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x60 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x70 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,0,

  /* 0x80 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  /* 0x90 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  /* 0xa0 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  /* 0xb0 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  /* 0xc0 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  /* 0xd0 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  /* 0xe0 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  /* 0xf0 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
};
| |
| |
// Six-bit value of each byte under the standard base64 alphabet:
//   'A'..'Z' -> 0..25, 'a'..'z' -> 26..51, '0'..'9' -> 52..61,
//   '+' -> 62, '/' -> 63. Every other byte maps to -1.
static const signed char kBase64Value[256] = {
  /* 0x00 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
  /* 0x10 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
  /* 0x20 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
  /* 0x30 */ 52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,

  /* 0x40 */ -1, 0, 1, 2,  3, 4, 5, 6,  7, 8, 9,10, 11,12,13,14,
  /* 0x50 */ 15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
  /* 0x60 */ -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
  /* 0x70 */ 41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1,

  /* 0x80 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
  /* 0x90 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
  /* 0xa0 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
  /* 0xb0 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,

  /* 0xc0 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
  /* 0xd0 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
  /* 0xe0 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
  /* 0xf0 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
};
| |
| |
| // Subscripted by <state, byte/16> |
| // Accepts Cx->8x Dx->8x Ex->8x->8x Fx->8x->8x->8x |
| // |
| // Fixed Problem: GB has sequences like B2DB B8D6 BDE1 B9B9 |
| // which we can mis-parse as an error byte followed by good UTF-8: |
| // B2 DBB8 D6BD E1B9B9 |
| // To counteract this, we now require an ASCII7 byte to resync out |
| // of the error state |
| // Next problem: good UTF-8 with bad byte |
| // efbc a012 eea4 bee7 b280 c2b7 |
| // efbca0 12 eea4be e7b280 c2b7 |
| // ^^ bad byte |
| // fix: change state0 byte 1x to be don't-care |
| // |
| // Short UTF-8 ending in ASCII7 byte should resync immediately: |
| // E0 20 E0 A6 AA should give one error and resync at 2nd E0 |
| // |
static const char kMiniUTF8State[8][16] = {
  // Next state, indexed [current state][byte >> 4].
  // Columns: 0x 1x 2x 3x  4x 5x 6x 7x  8x 9x Ax Bx  Cx Dx Ex Fx
  /* [0] start char (allow cr/lf/ht)         */
  {0,0,0,0, 0,0,0,0, 7,7,7,7, 1,1,2,4},
  /* [1] continue 1 of 2                     */
  {0,7,0,0, 0,0,0,0, 0,0,0,0, 7,7,7,7},
  /* [2] continue 1 of 3                     */
  {0,7,0,0, 0,0,0,0, 3,3,3,3, 7,7,7,7},
  /* [3] continue 2 of 3                     */
  {0,7,0,0, 0,0,0,0, 0,0,0,0, 7,7,7,7},
  /* [4] continue 1 of 4                     */
  {0,7,0,0, 0,0,0,0, 5,5,5,5, 7,7,7,7},
  /* [5] continue 2 of 4                     */
  {0,7,0,0, 0,0,0,0, 6,6,6,6, 7,7,7,7},
  /* [6] continue 3 of 4                     */
  {0,7,0,0, 0,0,0,0, 0,0,0,0, 7,7,7,7},
  /* [7] error: soak up continuation bytes;
         ONLY resync after an Ascii char,
         then restart                        */
  {0,7,0,0, 0,0,0,0, 7,7,7,7, 7,7,7,7},
};
// Which counter to bump for each <state, byte >> 4> transition:
//   0 = don't care, 1 = error, 2 = good 2-byte, 3 = good 3-byte,
//   4 = good 4-byte
static const char kMiniUTF8Count[8][16] = {
  // Columns: 0x 1x 2x 3x  4x 5x 6x 7x  8x 9x Ax Bx  Cx Dx Ex Fx
  /* [0] start char (allow cr/lf/ht)         */
  {0,0,0,0, 0,0,0,0, 1,1,1,1, 0,0,0,0},
  /* [1] continue 1 of 2                     */
  {1,1,1,1, 1,1,1,1, 2,2,2,2, 1,1,1,1},
  /* [2] continue 1 of 3                     */
  {1,1,1,1, 1,1,1,1, 0,0,0,0, 1,1,1,1},
  /* [3] continue 2 of 3                     */
  {1,1,1,1, 1,1,1,1, 3,3,3,3, 1,1,1,1},
  /* [4] continue 1 of 4                     */
  {1,1,1,1, 1,1,1,1, 0,0,0,0, 1,1,1,1},
  /* [5] continue 2 of 4                     */
  {1,1,1,1, 1,1,1,1, 0,0,0,0, 1,1,1,1},
  /* [6] continue 3 of 4                     */
  {1,1,1,1, 1,1,1,1, 4,4,4,4, 1,1,1,1},
  /* [7] error: soak up continuation bytes,
         then restart                        */
  {0,1,0,0, 0,0,0,0, 1,1,1,1, 1,1,1,1},
};
| |
| // Subscripted by <state, f(byte1) + g(byte2)> |
| // where f(x)= E2->4, Cx->8 and C3->12 and 0 otherwise |
| // and g(x) = (x >> 4) & 3 8x->0 9x->1 Ax->2 Bx->3 Cx->0, etc. |
| // (no checking for illegal bytes) |
| // Here are example patterns of CP1252 converted to UTF-8 0/1/2 times. We want |
| // to detect two, so we can back-convert to one. |
| // zero one two pattern |
| // ---- ------ ---------------- ----------------- |
| // 81 C281 C382C281 C3->8x->C2->xx |
| // 98 CB9C C38BC593 C3->8x->C5->xx |
| // C3 C383 C383C692 C3->8x->C6->xx |
| // C8 C388 C383CB86 C3->8x->CB->xx |
| // 83 C692 C386E28099 C3->8x->E2->xx->8x |
| // 80 E282AC C3A2E2809AC2AC C3->A2->E2->xx->xx->Cx->xx |
| // 92 E28099 C3A2E282ACE284A2 C3->A2->E2->xx->xx->E2->xx->xx |
| // |
| // We also want to detect bare-byte extra UTF-8 conversions: |
| // zero one two pattern |
| // ---- ------ ---------------- ----------------- |
| // C3 C3 C383 C3->8x->C2->xx |
| // D3 D3 C393 C3->9x->C2->xx->C2->xx |
| // E3 E3 C3A3 C3->Ax->C2->xx->C2->xx->C2->xx |
// F3 F3 C3B3 C3->Bx->C2->xx->C2->xx->C2->xx->C2->xx
| // |
| |
| /** |
| CP1252 => UTF8 => UTF8UTF8 |
| 80 => E282AC => C3A2E2809AC2AC |
| 81 => C281 => C382C281 |
| 82 => E2809A => C3A2E282ACC5A1 |
| 83 => C692 => C386E28099 |
| 84 => E2809E => C3A2E282ACC5BE |
| 85 => E280A6 => C3A2E282ACC2A6 |
| 86 => E280A0 => C3A2E282ACC2A0 |
| 87 => E280A1 => C3A2E282ACC2A1 |
| 88 => CB86 => C38BE280A0 |
| 89 => E280B0 => C3A2E282ACC2B0 |
| 8A => C5A0 => C385C2A0 |
| 8B => E280B9 => C3A2E282ACC2B9 |
| 8C => C592 => C385E28099 |
| 8D => C28D => C382C28D |
| 8E => C5BD => C385C2BD |
| 8F => C28F => C382C28F |
| 90 => C290 => C382C290 |
| 91 => E28098 => C3A2E282ACCB9C |
| 92 => E28099 => C3A2E282ACE284A2 |
| 93 => E2809C => C3A2E282ACC593 |
| 94 => E2809D => C3A2E282ACC29D |
| 95 => E280A2 => C3A2E282ACC2A2 |
| 96 => E28093 => C3A2E282ACE2809C |
| 97 => E28094 => C3A2E282ACE2809D |
| 98 => CB9C => C38BC593 |
| 99 => E284A2 => C3A2E2809EC2A2 |
| 9A => C5A1 => C385C2A1 |
| 9B => E280BA => C3A2E282ACC2BA |
| 9C => C593 => C385E2809C |
| 9D => C29D => C382C29D |
| 9E => C5BE => C385C2BE |
| 9F => C5B8 => C385C2B8 |
| A0 => C2A0 => C382C2A0 |
| A1 => C2A1 => C382C2A1 |
| A2 => C2A2 => C382C2A2 |
| A3 => C2A3 => C382C2A3 |
| A4 => C2A4 => C382C2A4 |
| A5 => C2A5 => C382C2A5 |
| A6 => C2A6 => C382C2A6 |
| A7 => C2A7 => C382C2A7 |
| A8 => C2A8 => C382C2A8 |
| A9 => C2A9 => C382C2A9 |
| AA => C2AA => C382C2AA |
| AB => C2AB => C382C2AB |
| AC => C2AC => C382C2AC |
| AD => C2AD => C382C2AD |
| AE => C2AE => C382C2AE |
| AF => C2AF => C382C2AF |
| B0 => C2B0 => C382C2B0 |
| B1 => C2B1 => C382C2B1 |
| B2 => C2B2 => C382C2B2 |
| B3 => C2B3 => C382C2B3 |
| B4 => C2B4 => C382C2B4 |
| B5 => C2B5 => C382C2B5 |
| B6 => C2B6 => C382C2B6 |
| B7 => C2B7 => C382C2B7 |
| B8 => C2B8 => C382C2B8 |
| B9 => C2B9 => C382C2B9 |
| BA => C2BA => C382C2BA |
| BB => C2BB => C382C2BB |
| BC => C2BC => C382C2BC |
| BD => C2BD => C382C2BD |
| BE => C2BE => C382C2BE |
| BF => C2BF => C382C2BF |
| C0 => C380 => C383E282AC |
| C1 => C381 => C383C281 |
| C2 => C382 => C383E2809A |
| C3 => C383 => C383C692 |
| C4 => C384 => C383E2809E |
| C5 => C385 => C383E280A6 |
| C6 => C386 => C383E280A0 |
| C7 => C387 => C383E280A1 |
| C8 => C388 => C383CB86 |
| C9 => C389 => C383E280B0 |
| CA => C38A => C383C5A0 |
| CB => C38B => C383E280B9 |
| CC => C38C => C383C592 |
| CD => C38D => C383C28D |
| CE => C38E => C383C5BD |
| CF => C38F => C383C28F |
| D0 => C390 => C383C290 |
| D1 => C391 => C383E28098 |
| D2 => C392 => C383E28099 |
| D3 => C393 => C383E2809C |
| D4 => C394 => C383E2809D |
| D5 => C395 => C383E280A2 |
| D6 => C396 => C383E28093 |
| D7 => C397 => C383E28094 |
| D8 => C398 => C383CB9C |
| D9 => C399 => C383E284A2 |
| DA => C39A => C383C5A1 |
| DB => C39B => C383E280BA |
| DC => C39C => C383C593 |
| DD => C39D => C383C29D |
| DE => C39E => C383C5BE |
| DF => C39F => C383C5B8 |
| E0 => C3A0 => C383C2A0 |
| E1 => C3A1 => C383C2A1 |
| E2 => C3A2 => C383C2A2 |
| E3 => C3A3 => C383C2A3 |
| E4 => C3A4 => C383C2A4 |
| E5 => C3A5 => C383C2A5 |
| E6 => C3A6 => C383C2A6 |
| E7 => C3A7 => C383C2A7 |
| E8 => C3A8 => C383C2A8 |
| E9 => C3A9 => C383C2A9 |
| EA => C3AA => C383C2AA |
| EB => C3AB => C383C2AB |
| EC => C3AC => C383C2AC |
| ED => C3AD => C383C2AD |
| EE => C3AE => C383C2AE |
| EF => C3AF => C383C2AF |
| F0 => C3B0 => C383C2B0 |
| F1 => C3B1 => C383C2B1 |
| F2 => C3B2 => C383C2B2 |
| F3 => C3B3 => C383C2B3 |
| F4 => C3B4 => C383C2B4 |
| F5 => C3B5 => C383C2B5 |
| F6 => C3B6 => C383C2B6 |
| F7 => C3B7 => C383C2B7 |
| F8 => C3B8 => C383C2B8 |
| F9 => C3B9 => C383C2B9 |
| FA => C3BA => C383C2BA |
| FB => C3BB => C383C2BB |
| FC => C3BC => C383C2BC |
| FD => C3BD => C383C2BD |
| FE => C3BE => C383C2BE |
| FF => C3BF => C383C2BF |
| **/ |
| |
| // Subscripted by <state, f(byte1) + g(byte2)> |
| // where f(x)= E2->4, C2/5/6/B->8 and C3->12 and 0 otherwise |
| // and g(x) = (x >> 4) & 3 8x->0 9x->1 Ax->2 Bx->3 Cx->0, etc. |
| |
| // 81 C281 C382C281 C3->8x->C2->xx |
| // 98 CB9C C38BC593 C3->8x->C5->xx |
| // C3 C383 C383C692 C3->8x->C6->xx |
| // C8 C388 C383CB86 C3->8x->CB->xx |
| // [0] [2] [0] |
| // 83 C692 C386E28099 C3->8x->E2->xx->xx |
| // odd_byte=0 [0] [2] [0+] odd_byte flipped |
| // odd_byte=1 [0+] [2] [0] [0] odd_byte unflipped |
| // 80 E282AC C3A2E2809AC2AC C3->A2->E2->xx->xx->Cx->xx |
| // odd_byte=0 [0] [3] [4] [0+] |
| // odd_byte=1 [0+] [3] [4] [4] [0] |
| // 92 E28099 C3A2E282ACE284A2 C3->A2->E2->xx->xx->E2->xx->xx |
| // odd_byte=0 [0] [3] [4] [0] [0] |
| // odd_byte=1 [0+] [3] [4] [4] [0+] |
| // |
| // When an E2xxxx sequence is encountered, we absorb the two bytes E2xx and flip |
| // the odd_byte state. If that goes from 0 to 1, the next pair is offset up |
| // by one byte, picking up the two bytes just after E2xxxx. If odd_byte goes |
| // from 1 to 0, the next two bytes picked up are the two bytes xxxx of E2xxxx. |
| // These are absorbed with no error in state 0 or state 4 |
| // |
| // C3 C3 C383 C3->8x->C2->xx |
| // D3 D3 C393 C3->9x->C2->xx->C2->xx |
| // E3 E3 C3A3 C3->Ax->C2->xx->C2->xx->C2->xx |
// F3 F3 C3B3 C3->Bx->C2->xx->C2->xx->C2->xx->C2->xx
| // Counter3 for Fx Ex sequences is incremented at last C2 |
| |
static const char kMiniUTF8UTF8State[8][16] = {
  // Next state, indexed [current state][UTF88Sub(byte1, byte2)].
  // Column groups by byte1: other, E2xx, CXxx, C3xx; within each group the
  // four entries are byte2 = 8x, 9x, Ax, Bx.
  //  other    E2xx     CXxx     C3xx
  /* [0] looking for C38x/C3Ax/2020/8x8x, or err */
  {0,0,0,0, 1,1,1,1, 1,1,1,1, 2,2,3,5},
  /* [1] error, back to looking */
  {0,0,0,0, 1,1,1,1, 1,1,1,1, 2,2,3,5},
  /* [2] C38x looking for CXxx/E2xxxx (E2xxxx flips odd_byte) */
  {1,1,1,1, 0,0,0,0, 0,0,0,0, 1,1,1,1},
  /* [3] C3Ax looking for E2xx or C2xxC2xx (E2xxxx flips odd_byte) */
  {1,1,1,1, 4,4,4,4, 7,7,7,7, 1,1,1,1},
  /* [4] C3AxE2xx-- looking for C2xx/E2xxxx (E2xxxx flips odd_byte) */
  {4,4,4,4, 0,0,0,0, 0,0,0,0, 1,1,1,1},
  /* [5] C3Bx -- looking for C2xxC2xxC2xx */
  {1,1,1,1, 1,1,1,1, 6,6,6,6, 1,1,1,1},
  /* [6] C3Bx -- looking for C2xxC2xx */
  {1,1,1,1, 1,1,1,1, 7,7,7,7, 1,1,1,1},
  /* [7] C3Bx -- looking for C2xx */
  {1,1,1,1, 1,1,1,1, 0,0,0,0, 1,1,1,1},
};
// Which counter to bump for each <state, UTF88Sub(byte1, byte2)> transition:
//   0 = don't care, 1 = error, 2 = good 2-byte, 3 = good 3-byte,
//   4 = good 4-byte
static const char kMiniUTF8UTF8Count[8][16] = {
  // Column groups by byte1: other, E2xx, C2xx-class, C3xx; within each group
  // the four entries are byte2 = 8x, 9x, Ax, Bx.
  //  other    E2xx     C2Xx     C3xx
  /* [0] looking for C38x/C3Ax/2020/8x8x, or err */
  {0,0,0,0, 1,1,1,1, 1,1,1,1, 0,0,0,0},
  /* [1] error, back to looking */
  {0,0,0,0, 1,1,1,1, 1,1,1,1, 0,0,0,0},
  /* [2] C38x looking for CXxx/E2xxxx (E2xxxx flips odd_byte) */
  {1,1,1,1, 3,3,3,3, 2,2,2,2, 1,1,1,1},
  /* [3] C3Ax looking for E2xx (E2xxxx flips odd_byte) */
  {1,1,1,1, 0,0,0,0, 0,0,0,0, 1,1,1,1},
  /* [4] C3AxE2xx-- looking for C2xx/E2xxxx (E2xxxx flips odd_byte) */
  {1,1,1,1, 4,4,4,4, 4,4,4,4, 1,1,1,1},
  /* [5] C3Bx -- looking for C2xxC2xxC2xx */
  {1,1,1,1, 1,1,1,1, 0,0,0,0, 1,1,1,1},
  /* [6] C3Bx -- looking for C2xxC2xx */
  {1,1,1,1, 1,1,1,1, 0,0,0,0, 1,1,1,1},
  /* [7] C3Bx -- looking for C2xx */
  {1,1,1,1, 1,1,1,1, 3,3,3,3, 1,1,1,1},
};
| |
// Odd-byte flip flag for each <state, UTF88Sub(byte1, byte2)> transition.
// Only the E2xx columns of states 2, 3 and 4 hold 1 (E2xxxx flips odd_byte);
// every other entry is 0.
static const char kMiniUTF8UTF8Odd[8][16] = {
  //  other    E2xx     C2Xx     C3xx
  /* [0] looking for C38x/C3Ax/2020/8x8x, or err */
  {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0},
  /* [1] error, back to looking */
  {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0},
  /* [2] C38x looking for CXxx/E2xxxx */
  {0,0,0,0, 1,1,1,1, 0,0,0,0, 0,0,0,0},
  /* [3] C3Ax looking for E2xx */
  {0,0,0,0, 1,1,1,1, 0,0,0,0, 0,0,0,0},
  /* [4] C3AxE2xx-- looking for C2xx/E2xxxx */
  {0,0,0,0, 1,1,1,1, 0,0,0,0, 0,0,0,0},
  /* [5] C3Bx -- looking for C2xxC2xxC2xx */
  {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0},
  /* [6] C3Bx -- looking for C2xxC2xx */
  {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0},
  /* [7] C3Bx -- looking for C2xx */
  {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0},
};
| |
// Turn a pair of bytes into the subscript for the UTF8UTF8 tables above.
//   s0 picks the column group: 0xC3 -> 12, 0xC2/0xC5/0xC6/0xCB -> 8,
//      0xE2 -> 4, any other byte -> 0.
//   s1 contributes bits 5..4 of its value (0..3) as the offset in the group.
// Both bytes are widened through unsigned char before any arithmetic, so the
// shift never operates on a negative value when plain char is signed.
int UTF88Sub(char s0, char s1) {
  const unsigned int lead = static_cast<unsigned char>(s0);
  const unsigned int trail = static_cast<unsigned char>(s1);

  int sub = static_cast<int>((trail >> 4) & 0x03);
  switch (lead) {
    case 0xc3:
      sub += 12;
      break;
    case 0xc2:
    case 0xc5:
    case 0xc6:
    case 0xcb:
      sub += 8;
      break;
    case 0xe2:
      sub += 4;
      break;
    default:
      break;      // Other lead bytes stay in the first column group
  }
  return sub;
}
| |
| |
| |
| |
| |
| // Default probability for an encoding rankedencoding |
| // Based on a scan of 55M web pages |
| // These values are 255 - log base 2**1/10 (occurrences / total) |
// Large values are most likely. This is the reverse of some Google code
| // 255 = 1.0, 245 = 1/2, 235 = 1/4, 15 = 1/2**24, 0 = 0 (< 1/50M) |
| // |
| // TODO change this to be per encoding, not permuted |
| // |
| |
| |
| // Support function for unit test program |
| // Return ranked encoding corresponding to enc |
| // (also exported to compact_enc_det_text.cc) |
| int CompactEncDet::BackmapEncodingToRankedEncoding(Encoding enc) { |
| for (int i = 0; i < NUM_RANKEDENCODING; ++i) { |
| if (kMapToEncoding[i] == enc) { |
| return i; |
| } |
| } |
| return -1; |
| } |
| |
| |
| string DecodeActive(uint32 active) { |
| string temp(""); |
| if (active & kBinaryActive) { |
| temp.append("Binary "); |
| } |
| if (active & kUTF1632Active) { |
| temp.append("UTF1632 "); |
| } |
| if (active & kUTF8UTF8Active) { |
| temp.append("UTF8UTF8 "); |
| } |
| if (active & kUTF8Active) { |
| temp.append("UTF8 "); |
| } |
| if (active & kIso2022Active) { |
| temp.append("Iso2022 "); |
| } |
| if (active & kHzActive) { |
| temp.append("Hz "); |
| } |
| if (active & kUTF7Active) { |
| temp.append("UTF7A "); |
| } |
| if (active & kSevenBitActive) { |
| temp.append("SevenBit "); |
| } |
| if (active & kIsIndicCode) { |
| temp.append("Indic "); |
| } |
| if (active & kHighAlphaCode) { |
| temp.append("HighAlpha "); |
| } |
| if (active & kHighAccentCode) { |
| temp.append("HighAccent "); |
| } |
| if (active & kEUCJPActive) { |
| temp.append("EUCJP "); |
| } |
| return temp; |
| } |
| |
| static inline bool SevenBitEncoding(int enc) { |
| return ((kSpecialMask[enc] & kSevenBitActive) != 0); |
| } |
| static inline bool TwoByteEncoding(int enc) { |
| return ((kSpecialMask[enc] & kTwobyteCode) != 0); |
| } |
| static inline bool IndicEncoding(int enc) { |
| return ((kSpecialMask[enc] & kIsIndicCode) != 0); |
| } |
| static inline bool HighAlphaEncoding(int enc) { |
| return ((kSpecialMask[enc] & kHighAlphaCode) != 0); |
| } |
| static inline bool HighAccentEncoding(int enc) { |
| return ((kSpecialMask[enc] & kHighAccentCode) != 0); |
| } |
| |
| |
| static inline bool AnyActive(DetectEncodingState* destatep) { |
| return (destatep->active_special != 0); |
| } |
| static inline bool SevenBitActive(DetectEncodingState* destatep) { |
| return (destatep->active_special & kSevenBitActive) != 0; |
| } |
| static inline bool HzActive(DetectEncodingState* destatep) { |
| return (destatep->active_special & kHzActive) != 0; |
| } |
| static inline bool Iso2022Active(DetectEncodingState* destatep) { |
| return (destatep->active_special & kIso2022Active) != 0; |
| } |
| static inline bool UTF8Active(DetectEncodingState* destatep) { |
| return (destatep->active_special & kUTF8Active) != 0; |
| } |
| static inline bool UTF8UTF8Active(DetectEncodingState* destatep) { |
| return (destatep->active_special & kUTF8UTF8Active) != 0; |
| } |
| static inline bool UTF1632Active(DetectEncodingState* destatep) { |
| return (destatep->active_special & kUTF1632Active) != 0; |
| } |
| static inline bool BinaryActive(DetectEncodingState* destatep) { |
| return (destatep->active_special & kBinaryActive) != 0; |
| } |
| static inline bool UTF7OrHzActive(DetectEncodingState* destatep) { |
| return (destatep->active_special & (kHzActive + kUTF7Active)) != 0; |
| } |
| static inline bool EUCJPActive(DetectEncodingState* destatep) { |
| return ((destatep->active_special & kEUCJPActive) != 0); |
| } |
| static inline bool OtherActive(DetectEncodingState* destatep) { |
| return (destatep->active_special & (kIso2022Active + kBinaryActive + |
| kUTF8Active + kUTF8UTF8Active + |
| kUTF1632Active + kEUCJPActive)) != 0; |
| } |
| |
| |
| static inline bool CEDFlagRescanning(CEDInternalFlags flags) { |
| return (flags & kCEDRescanning) != 0; |
| } |
| |
| static inline bool CEDFlagForceTags(CEDInternalFlags flags) { |
| return (flags & kCEDForceTags) != 0; |
| } |
| |
| |
// Larger of two ints; returns b when they are equal
static inline int maxint(int a, int b) {
  if (a > b) {
    return a;
  }
  return b;
}

// Smaller of two ints; returns b when they are equal
static inline int minint(int a, int b) {
  if (a < b) {
    return a;
  }
  return b;
}
| |
| static inline const char* MyRankedEncName(int r_enc) { |
| return MyEncodingName(kMapToEncoding[r_enc]); |
| } |
| |
| |
// State shared by the PsSource* / PsMark / PsHighlight debug-dump helpers.
// Only for debugging. not thread safe
// NOTE(review): kPsSourceWidth is not referenced in the code visible here;
// presumably it is the width callers pass to PsSourceInit -- confirm.
static const int kPsSourceWidth = 32;
// PsSource skips any aligned offset below this value (already dumped)
static int pssourcenext = 0; // debug only. not threadsafe. dump only >= this
// Source bytes per dumped line; set by PsSourceInit
static int pssourcewidth = 0; // debug only.
// Mark line under the dumped source: 2 chars per source byte plus 8 bytes of
// overscan; allocated by PsSourceInit, released by PsSourceFinish
static char* pssource_mark_buffer = NULL;
// Count of source lines dumped so far; indexes do_src_offset modulo 16
int next_do_src_line;
// Source offsets of the most recent dumped lines, stored modulo 16
int do_src_offset[16];
| |
| |
| void PsSourceInit(int len) { |
| pssourcenext = 0; |
| pssourcewidth = len; |
| delete[] pssource_mark_buffer; |
| // Allocate 2 Ascii characters per input byte |
| pssource_mark_buffer = new char[(pssourcewidth * 2) + 8]; // 8 = overscan |
| memset(pssource_mark_buffer, ' ', pssourcewidth * 2); |
| memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8); |
| |
| next_do_src_line = 0; |
| memset(do_src_offset, 0, sizeof(do_src_offset)); |
| } |
| |
| void PsSourceFinish() { |
| // Print preceding mark buffer |
| int j = (pssourcewidth * 2) - 1; |
| while ((0 <= j) && (pssource_mark_buffer[j] == ' ')) {--j;} // trim |
| pssource_mark_buffer[j + 1] = '\0'; |
| fprintf(stderr, "( %s) do-src\n", pssource_mark_buffer); |
| memset(pssource_mark_buffer, ' ', pssourcewidth * 2); |
| memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8); |
| |
| delete[] pssource_mark_buffer; |
| pssource_mark_buffer = NULL; |
| } |
| |
| // Dump aligned len bytes src... if not already dumped |
| void PsSource(const uint8* src, const uint8* isrc, const uint8* srclimit) { |
| int offset = src - isrc; |
| offset -= (offset % pssourcewidth); // round down to multiple of len bytes |
| if (offset < pssourcenext) { |
| return; |
| } |
| pssourcenext = offset + pssourcewidth; // Min offset for next dump |
| |
| // Print preceding mark buffer |
| int j = (pssourcewidth * 2) - 1; |
| while ((0 <= j) && (pssource_mark_buffer[j] == ' ')) {--j;} // trim |
| pssource_mark_buffer[j + 1] = '\0'; |
| fprintf(stderr, "( %s) do-src\n", pssource_mark_buffer); |
| memset(pssource_mark_buffer, ' ', pssourcewidth * 2); |
| memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8); |
| |
| // Print source bytes |
| const uint8* src_aligned = isrc + offset; |
| int length = srclimit - src_aligned; |
| length = minint(pssourcewidth, length); |
| |
| fprintf(stderr, "(%05x ", offset); |
| for (int i = 0; i < length; ++i) { |
| char c = src_aligned[i]; |
| if (c == '\n') {c = ' ';} |
| if (c == '\r') {c = ' ';} |
| if (c == '\t') {c = ' ';} |
| if (c == '(') { |
| fprintf(stderr, "%s", "\\( "); |
| } else if (c == ')') { |
| fprintf(stderr, "%s", "\\) "); |
| } else if (c == '\\') { |
| fprintf(stderr, "%s", "\\\\ "); |
| } else if ((0x20 <= c) && (c <= 0x7e)) { |
| fprintf(stderr, "%c ", c); |
| } else { |
| fprintf(stderr, "%02x", c); |
| } |
| } |
| fprintf(stderr, ") do-src\n"); |
| // Remember which source offsets are where, mod 16 |
| do_src_offset[next_do_src_line & 0x0f] = offset; |
| ++next_do_src_line; |
| } |
| |
| // Mark bytes in just-previous source bytes |
| void PsMark(const uint8* src, int len, const uint8* isrc, int weightshift) { |
| int offset = src - isrc; |
| offset = (offset % pssourcewidth); // mod len bytes |
| char mark = (weightshift == 0) ? '-' : 'x'; |
| |
| pssource_mark_buffer[(offset * 2)] = '='; |
| pssource_mark_buffer[(offset * 2) + 1] = '='; |
| for (int i = 1; i < len; ++i) { |
| pssource_mark_buffer[(offset + i) * 2] = mark; |
| pssource_mark_buffer[((offset + i) * 2) + 1] = mark; |
| } |
| } |
| |
| |
| // Highlight trigram bytes in just-previous source bytes |
| // Unfortunately, we have to skip back N lines since source was printed for |
| // up to 8 bigrams before we get here. Match on src+1 to handle 0/31 better |
| void PsHighlight(const uint8* src, const uint8* isrc, int trigram_val, int n) { |
| int offset = (src + 1) - isrc; |
| int offset32 = (offset % pssourcewidth); // mod len bytes |
| offset -= offset32; // round down to multiple of len bytes |
| |
| for (int i = 1; i <= 16; ++i) { |
| if (do_src_offset[(next_do_src_line - i) & 0x0f] == offset) { |
| fprintf(stderr, "%d %d %d do-highlight%d\n", |
| i, offset32 - 1, trigram_val, n); |
| break; |
| } |
| } |
| } |
| |
| |
| void InitDetectEncodingState(DetectEncodingState* destatep) { |
| destatep->initial_src = NULL; // Filled in by caller |
| destatep->limit_src = NULL; |
| destatep->prior_src = NULL; |
| destatep->last_pair = NULL; |
| |
| destatep->debug_data = NULL; |
| destatep->next_detail_entry = 0; |
| |
| destatep->done = false; |
| destatep->reliable = false; |
| destatep->hints_derated = false; |
| //destatep->declared_enc_1 init in ApplyHints |
| //destatep->declared_enc_2 init in ApplyHints |
| destatep->prune_count = 0; |
| |
| destatep->trigram_highwater_mark = 0; |
| destatep->looking_for_latin_trigrams = false; |
| destatep->do_latin_trigrams = false; |
| |
| // Miscellaneous state variables for difficult encodings |
| destatep->binary_quadrants_count = 0; |
| destatep->binary_8x4_count = 0; |
| destatep->binary_quadrants_seen = 0; |
| destatep->binary_8x4_seen = 0; |
| destatep->utf7_starts = 0; |
| destatep->prior_utf7_offset = 0; |
| destatep->next_utf8_ministate = 0; |
| for (int i = 0; i < 6; i++) {destatep->utf8_minicount[i] = 0;} |
| destatep->next_utf8utf8_ministate = 0; |
| destatep->utf8utf8_odd_byte = 0; |
| for (int i = 0; i < 6; i++) {destatep->utf8utf8_minicount[i] = 0;} |
| destatep->next_2022_state = SOSI_NONE; |
| destatep->next_hz_state = SOSI_NONE; |
| destatep->next_eucjp_oddphase = false; |
| for (int i = 0; i < 8; i++) {destatep->byte32_count[i] = 0;} |
| destatep->active_special = 0xffffffff; |
| destatep->tld_hint = UNKNOWN_ENCODING; |
| destatep->http_hint = UNKNOWN_ENCODING; |
| destatep->meta_hint = UNKNOWN_ENCODING; |
| destatep->bom_hint = UNKNOWN_ENCODING; |
| destatep->top_rankedencoding = 0; // ASCII [seven-bit] is the default |
| destatep->second_top_rankedencoding = 0; // ASCII [seven-bit] is the default |
| destatep->top_prob = -1; |
| destatep->second_top_prob = -1; |
| // This is wide for first pruning, shrinks for 2nd and later |
| destatep->prune_difference = kInititalPruneDifference; |
| |
| destatep->next_prior_bigram = 0; |
| destatep->prior_bigram[0] = -1; |
| destatep->prior_bigram[1] = -1; |
| destatep->prior_bigram[2] = -1; |
| destatep->prior_bigram[3] = -1; |
| |
| destatep->prior_binary[0] = -1; |
| |
| // Initialize with all but Indic encodings, which we never detect |
| int k = 0; |
| for (int rankedencoding = 0; |
| rankedencoding < NUM_RANKEDENCODING; |
| rankedencoding++) { |
| Encoding enc = kMapToEncoding[rankedencoding]; |
| if (!IndicEncoding(enc)) { |
| destatep->rankedencoding_list[k++] = rankedencoding; |
| } |
| } |
| destatep->rankedencoding_list_len = k; |
| |
| // This is where all the action is |
| memset(destatep->enc_prob, 0, sizeof(destatep->enc_prob)); |
| |
| memset(destatep->hint_prob, 0, sizeof(destatep->hint_prob)); |
| memset(destatep->hint_weight, 0, sizeof(destatep->hint_weight)); |
| |
| destatep->prior_interesting_pair[AsciiPair] = 0; |
| destatep->prior_interesting_pair[OtherPair] = 0; |
| destatep->next_interesting_pair[AsciiPair] = 0; |
| destatep->next_interesting_pair[OtherPair] = 0; |
| // interesting_pairs/offsets/weightshifts not initialized; no need |
| } |
| |
| // Probability strings are uint8, with zeros removed via simple run-length: |
| // (<skip-take byte> <data bytes>)* |
| // skip-take: |
| // 00 end |
| // x0 skip 16 x locations, take 0 data values |
| // xy skip x locations, take y data values |
| // Multiply all the incoming values by 3 to account for 3x unigram sums |
| // |
| // {{0x77,0x69,0x6e,0x64,0x31,0x32,0x35,0x35, |
| // 0x01,0xc2,0x10,0x41,0xfe,0x71,0xba,0x00,}}, // "wind1255" |
| // |
| // Weight is 0..100 percent |
| // |
| // Returns subscript of largest (most probable) value |
| // |
| |
| |
| // {{0x6e,0x6c,0x5f,0x5f, 0x05,0xb2,0xae,0xa0,0x32,0xa1,0x36,0x31,0x42,0x39,0x3b,0x33,0x45,0x11,0x6f,0x00,}}, // "nl__" |
| // // ASCII-7-bit=178 Latin1=174 UTF8=160 GB=50 CP1252=161 BIG5=49 Latin2=66 CP1251=57 CP1256=59 CP1250=51 Latin5=69 ISO-8859-15=111 [top ASCII-7-bit] |
| int ApplyCompressedProb(const char* iprob, int len, |
| int weight, DetectEncodingState* destatep) { |
| int* dst = &destatep->enc_prob[0]; |
| int* dst2 = &destatep->hint_weight[0]; |
| const uint8* prob = reinterpret_cast<const uint8*>(iprob); |
| const uint8* problimit = prob + len; |
| |
| int largest = -1; |
| int subscript_of_largest = 0; |
| |
| // Continue with first byte and subsequent ones |
| while (prob < problimit) { |
| int skiptake = *prob++; |
| int skip = (skiptake & 0xf0) >> 4; |
| int take = skiptake & 0x0f; |
| if (skiptake == 00) { |
| break; |
| } else if (take == 0) { |
| dst += (skip << 4); |
| dst2 += (skip << 4); |
| } else { |
| dst += skip; // Normal case |
| dst2 += skip; // Normal case |
| for (int i = 0; i < take; i++) { |
| int enc = static_cast<int>(dst - &destatep->enc_prob[0]) + i; |
| if (largest < prob[i]) { |
| largest = prob[i]; |
| subscript_of_largest = enc; |
| } |
| |
| int increment = prob[i] * 3; // The actual increment |
| |
| // Do maximum of previous hints plus this new one |
| if (weight > 0) { |
| increment = (increment * weight) / 100; |
| dst[i] = maxint(dst[i], increment); |
| dst2[i] = 1; // New total weight |
| } |
| } |
| prob += take; |
| dst += take; |
| dst2 += take; |
| } |
| } |
| return subscript_of_largest; |
| } |
| |
| |
// Returns subscript of largest (most probable) value [for unit test].
//   iprob, len - a run-length-compressed probability vector: a sequence of
//                skip-take bytes, each followed by its data bytes, ended by
//                a 00 byte or by len
// Returns 0 when the vector holds no data value greater than zero.
// Data bytes are never read past iprob + len, even if the final skip-take
// byte asks for more values than remain.
int TopCompressedProb(const char* iprob, int len) {
  const unsigned char* prob = reinterpret_cast<const unsigned char*>(iprob);
  const unsigned char* problimit = prob + len;
  int next_prob_sub = 0;
  int topprob = 0;
  int toprankenc = 0;

  while (prob < problimit) {
    int skiptake = *prob++;
    int skip = (skiptake & 0xf0) >> 4;
    int take = skiptake & 0x0f;
    if (skiptake == 0) {
      break;
    } else if (take == 0) {
      next_prob_sub += (skip << 4);
    } else {
      next_prob_sub += skip;    // Normal case

      // Clamp to the bytes actually present in the input
      int remaining = static_cast<int>(problimit - prob);
      if (take > remaining) {
        take = remaining;
      }

      for (int i = 0; i < take; i++) {
        if (topprob < prob[i]) {
          topprob = prob[i];
          toprankenc = next_prob_sub + i;
        }
      }
      prob += take;
      next_prob_sub += take;
    }
  }
  return toprankenc;
}
| |
| |
| // Find subscript of matching key in first 8 bytes of sorted hint array, or -1 |
| int HintBinaryLookup8(const HintEntry* hintprobs, int hintprobssize, |
| const char* norm_key) { |
| // Key is always in range [lo..hi) |
| int lo = 0; |
| int hi = hintprobssize; |
| while (lo < hi) { |
| int mid = (lo + hi) >> 1; |
| int comp = memcmp(&hintprobs[mid].key_prob[0], norm_key, 8); |
| if (comp < 0) { |
| lo = mid + 1; |
| } else if (comp > 0) { |
| hi = mid; |
| } else { |
| return mid; |
| } |
| } |
| return -1; |
| } |
| |
| // Find subscript of matching key in first 4 bytes of sorted hint array, or -1 |
| int HintBinaryLookup4(const HintEntry* hintprobs, int hintprobssize, |
| const char* norm_key) { |
| // Key is always in range [lo..hi) |
| int lo = 0; |
| int hi = hintprobssize; |
| while (lo < hi) { |
| int mid = (lo + hi) >> 1; |
| int comp = memcmp(&hintprobs[mid].key_prob[0], norm_key, 4); |
| if (comp < 0) { |
| lo = mid + 1; |
| } else if (comp > 0) { |
| hi = mid; |
| } else { |
| return mid; |
| } |
| } |
| return -1; |
| } |
| |
| static inline void Boost(DetectEncodingState* destatep, int r_enc, int boost) { |
| destatep->enc_prob[r_enc] += boost; |
| } |
| |
| static inline void Whack(DetectEncodingState* destatep, int r_enc, int whack) { |
| destatep->enc_prob[r_enc] -= whack; |
| } |
| |
| // Apply initial probability hint based on top level domain name |
| // Weight is 0..100 percent |
| // Return 1 if name match found |
| int ApplyTldHint(const char* url_tld_hint, int weight, |
| DetectEncodingState* destatep) { |
| if (url_tld_hint[0] == '~') { |
| return 0; |
| } |
| string normalized_tld = MakeChar4(string(url_tld_hint)); |
| int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize, |
| normalized_tld.c_str()); |
| if (n >= 0) { |
| // TLD is four bytes, probability table is ~12 bytes |
| int best_sub = ApplyCompressedProb(&kTLDHintProbs[n].key_prob[kMaxTldKey], |
| kMaxTldVector, weight, destatep); |
| // Never boost ASCII7; do CP1252 instead |
| if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;} |
| destatep->declared_enc_1 = best_sub; |
| if (destatep->debug_data != NULL) { |
| // Show TLD hint |
| SetDetailsEncProb(destatep, 0, best_sub, url_tld_hint); |
| } |
| return 1; |
| } |
| return 0; |
| } |
| |
| // Apply initial probability hint based on charset= name |
| // Weight is 0..100 percent |
| // Return 1 if name match found |
| int ApplyCharsetHint(const char* charset_hint, int weight, |
| DetectEncodingState* destatep) { |
| if (charset_hint[0] == '~') { |
| return 0; |
| } |
| string normalized_charset = MakeChar44(string(charset_hint)); |
| int n = HintBinaryLookup8(kCharsetHintProbs, kCharsetHintProbsSize, |
| normalized_charset.c_str()); |
| if (n >= 0) { |
| // Charset is eight bytes, probability table is ~eight bytes |
| int best_sub = ApplyCompressedProb(&kCharsetHintProbs[n].key_prob[kMaxCharsetKey], |
| kMaxCharsetVector, weight, destatep); |
| // Never boost ASCII7; do CP1252 instead |
| if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;} |
| destatep->declared_enc_1 = best_sub; |
| |
| // If first explicitly declared charset is confusable with Latin1/1252, put |
| // both declared forms in declared_enc_*, displacing Latin1/1252. |
| // This avoids a bit of Latin1 creep. |
| // Also boost the declared encoding and its pair |
| // TODO: This should all be folded into postproc-enc-detect.cc |
| if ((destatep->http_hint == UNKNOWN_ENCODING) && |
| (destatep->meta_hint == UNKNOWN_ENCODING)) { |
| // This is the first charset=hint |
| switch (best_sub) { |
| case F_Latin2: // 8859-2 Latin2, east euro |
| destatep->declared_enc_2 = F_CP1250; |
| Boost(destatep, F_Latin2, kGentleOnePair); |
| Boost(destatep, F_CP1250, kGentleOnePair); |
| break; |
| case F_CP1250: |
| destatep->declared_enc_2 = F_Latin2; |
| Boost(destatep, F_Latin2, kGentleOnePair); |
| Boost(destatep, F_CP1250, kGentleOnePair); |
| break; |
| |
| case F_Latin3: // 8859-3 Latin3, south euro, Esperanto |
| destatep->declared_enc_2 = F_ASCII_7_bit; |
| Boost(destatep, F_Latin3, kGentleOnePair); |
| break; |
| |
| case F_Latin4: // 8859-4 Latin4, north euro |
| destatep->declared_enc_2 = F_ASCII_7_bit; |
| Boost(destatep, F_Latin4, kGentleOnePair); |
| break; |
| |
| case F_ISO_8859_5: // 8859-5 Cyrillic |
| destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost 1251 |
| Boost(destatep, F_ISO_8859_5, kGentleOnePair); // (too different) |
| break; |
| case F_CP1251: |
| destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost -5 |
| Boost(destatep, F_CP1251, kGentleOnePair); // (too different) |
| break; |
| |
| case F_Arabic: // 8859-6 Arabic |
| destatep->declared_enc_2 = F_CP1256; |
| Boost(destatep, F_Arabic, kGentleOnePair); |
| Boost(destatep, F_CP1256, kGentleOnePair); |
| break; |
| case F_CP1256: |
| destatep->declared_enc_2 = F_Arabic; |
| Boost(destatep, F_Arabic, kGentleOnePair); |
| Boost(destatep, F_CP1256, kGentleOnePair); |
| break; |
| |
| case F_Greek: // 8859-7 Greek |
| destatep->declared_enc_2 = F_CP1253; |
| Boost(destatep, F_Greek, kGentleOnePair); |
| Boost(destatep, F_CP1253, kGentleOnePair); |
| break; |
| case F_CP1253: |
| destatep->declared_enc_2 = F_Greek; |
| Boost(destatep, F_Greek, kGentleOnePair); |
| Boost(destatep, F_CP1253, kGentleOnePair); |
| break; |
| |
| case F_Hebrew: // 8859-8 Hebrew |
| destatep->declared_enc_2 = F_CP1255; |
| Boost(destatep, F_Hebrew, kGentleOnePair); |
| Boost(destatep, F_CP1255, kGentleOnePair); |
| break; |
| case F_CP1255: |
| destatep->declared_enc_2 = F_Hebrew; |
| Boost(destatep, F_Hebrew, kGentleOnePair); |
| Boost(destatep, F_CP1255, kGentleOnePair); |
| break; |
| |
| case F_Latin5: // 8859-9 Latin5, Turkish |
| destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost 1254 |
| Boost(destatep, F_Latin5, kGentleOnePair); // (too different) |
| break; |
| case F_CP1254: |
| destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost Latin5 |
| Boost(destatep, F_CP1254, kGentleOnePair); // (too different) |
| break; |
| |
| case F_Latin6: // 8859-10 Latin6, Nordic |
| destatep->declared_enc_2 = F_ASCII_7_bit; |
| Boost(destatep, F_Latin6, kGentleOnePair); |
| break; |
| |
| case F_ISO_8859_11: // 8859-11 Thai, |
| destatep->declared_enc_2 = F_CP874; |
| Boost(destatep, F_ISO_8859_11, kGentleOnePair); |
| Boost(destatep, F_CP874, kGentleOnePair); |
| break; |
| case F_CP874: |
| destatep->declared_enc_2 = F_ISO_8859_11; |
| Boost(destatep, F_ISO_8859_11, kGentleOnePair); |
| Boost(destatep, F_CP874, kGentleOnePair); |
| break; |
| |
| case F_ISO_8859_13: // 8859-13 Latin7, Baltic |
| destatep->declared_enc_2 = F_CP1257; |
| Boost(destatep, F_ISO_8859_13, kGentleOnePair); |
| Boost(destatep, F_CP1257, kGentleOnePair); |
| break; |
| case F_CP1257: |
| destatep->declared_enc_2 = F_ISO_8859_13; |
| Boost(destatep, F_ISO_8859_13, kGentleOnePair); |
| Boost(destatep, F_CP1257, kGentleOnePair); |
| break; |
| |
| case F_ISO_8859_15: // 8859-15 Latin9, Latin0, Euro-ized Latin1 |
| destatep->declared_enc_2 = F_ASCII_7_bit; |
| Boost(destatep, F_ISO_8859_15, kGentleOnePair); |
| break; |
| |
| |
| // Greek all-caps is confusable with KOI8x all-lower and Hebrew. |
| // This turns some Greek documents into Cyrillic, etc. by mistake. |
| // Greek and Hebrew are boosted explicitly above; do KOI8x here. |
| // Boosting the declared encodingmakes it harder for the wrong one to |
| // creep up. |
| case F_KOI8R: |
| Boost(destatep, F_KOI8R, kGentleOnePair); |
| break; |
| case F_KOI8U: |
| Boost(destatep, F_KOI8U, kGentleOnePair); |
| break; |
| |
| default: |
| break; |
| } |
| } |
| |
| if (destatep->debug_data != NULL) { |
| // Show charset hint |
| SetDetailsEncProb(destatep, 0, best_sub, charset_hint); |
| } |
| |
| // |
| // Some fix-ups for the declared encodings |
| // |
| |
| // If non-UTF8, non-Latin1/1252 encoding declared, disable UTF8 combos |
| // TODO: This should all be folded into postproc-enc-detect.cc |
| if ((best_sub != F_UTF8) && |
| (best_sub != F_Latin1) && |
| (best_sub != F_CP1252)) { |
| Whack(destatep, F_UTF8UTF8, kBadPairWhack * 4); // demote |
| } |
| |
| // Latin2 and CP1250 differ in the overlap part, such as B1 or B9 |
| // The initial probabilites for charset=Latin2 explicitly put CP1250 |
| // down twice as far as normal, and vice versa. This is done in |
| // postproc-enc-detect.cc |
| |
| // If charset=user-defined, treat as Binary -- |
| // we can safely only do low ASCII, might be Indic |
| if (normalized_charset.substr(0,4) == "user") { |
| Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| } |
| |
| return 1; |
| } |
| return 0; |
| } |
| |
| // Apply initial probability hint based on caller-supplied encoding |
| // Negative hint whacks ~encoding, non-negative boosts encoding |
| // |
| // Negative hints are an experiment to see if they might be useful. |
| // Not operator used instead of unary minus to allow specifying not-zero |
| int ApplyEncodingHint(const int encoding_hint, int weight, |
| DetectEncodingState* destatep) { |
| Encoding enc_hint = static_cast<Encoding>((encoding_hint < 0) ? |
| ~encoding_hint : encoding_hint); |
| // Map to the right internal subscript |
| int rankedenc_hint = CompactEncDet::BackmapEncodingToRankedEncoding(enc_hint); |
| |
| // I'm not sure how strong this hint should be. Weight 100% = 1 bigram |
| int increment = (kBoostOnePair * weight) / 100; |
| |
| if (encoding_hint < 0) { |
| destatep->enc_prob[rankedenc_hint] -= increment; |
| } else { |
| destatep->enc_prob[rankedenc_hint] += increment; |
| } |
| |
| if (destatep->debug_data != NULL) { |
| // Show encoding hint |
| SetDetailsEncProb(destatep, 0, -1, MyEncodingName(enc_hint)); |
| } |
| return 1; |
| } |
| |
| // Apply initial probability hint based on user interface language |
| // Weight is 0..100 percent |
| // Return 1 if name match found |
| int ApplyUILanguageHint(const Language language_hint, |
| int weight, DetectEncodingState* destatep) { |
| if (language_hint == UNKNOWN_LANGUAGE) { |
| return 0; |
| } |
| string normalized_lang = MakeChar8(LanguageName(language_hint)); |
| int n = HintBinaryLookup8(kLangHintProbs, kLangHintProbsSize, |
| normalized_lang.c_str()); |
| if (n >= 0) { |
| // Language is eight bytes, probability table is ~eight bytes |
| int best_sub = ApplyCompressedProb(&kLangHintProbs[n].key_prob[kMaxLangKey], |
| kMaxLangVector, weight, destatep); |
| // Never boost ASCII7; do CP1252 instead |
| if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;} |
| destatep->declared_enc_1 = best_sub; |
| if (destatep->debug_data != NULL) { |
| // Show language hint |
| SetDetailsEncProb(destatep, 0, best_sub, normalized_lang.c_str()); |
| } |
| return 1; |
| } |
| return 0; |
| } |
| |
| // Apply initial probability hint based on corpus type (web, email, etc) |
| // Return 1 if name match found |
| int ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type, |
| DetectEncodingState* destatep) { |
| |
| for (int i = 0; i < NUM_RANKEDENCODING; i++) { |
| // Set the default probability |
| destatep->enc_prob[i] = kDefaultProb[i] * 3; |
| // Deliberately set 2022 seven-bit encodings to zero, |
| // so we can look for actual use |
| // TODO: This should all be folded into postproc-enc-detect.cc |
| if (SevenBitEncoding(kMapToEncoding[i])) { |
| destatep->enc_prob[i] = 0; |
| } |
| } |
| |
| // A little corpus distinction |
| switch (corpus_type) { |
| case CompactEncDet::WEB_CORPUS: |
| case CompactEncDet::XML_CORPUS: |
| // Allow double-converted UTF-8 to start nearly equal to normal UTF-8 |
| destatep->enc_prob[F_UTF8UTF8] = |
| destatep->enc_prob[F_UTF8] - kSmallInitDiff; |
| break; |
| case CompactEncDet::QUERY_CORPUS: |
| case CompactEncDet::EMAIL_CORPUS: |
| default: |
| break; |
| } |
| |
| if (FLAGS_demo_nodefault) { |
| // Demo, make initial probs all zero |
| for (int i = 0; i < NUM_RANKEDENCODING; i++) { |
| destatep->enc_prob[i] = 0; |
| } |
| } |
| |
| if (destatep->debug_data != NULL) { |
| // Show default hint |
| SetDetailsEncProb(destatep, 0, -1, "Default"); |
| } |
| return 1; |
| } |
| |
| |
| |
// Do reverse search for c in [str..str+len)
// Note: initial pointer is to FRONT of string, not back
// Returns a pointer to the last occurrence of c, or NULL if c does not occur
// (always NULL when len is zero).
const char* MyMemrchr(const char* str, char c, size_t len) {
  // Count the index down instead of decrementing a pointer, so that no
  // pointer to before the start of the buffer is ever formed.
  for (size_t remaining = len; remaining > 0; --remaining) {
    if (str[remaining - 1] == c) {return str + (remaining - 1);}
  }
  return NULL;
}
| |
| |
// The shortest real URL, "http://a.bc", is 11 bytes; any shorter hint is
// assumed to be a bare TLD.
// Now that we are no longer trying to do Indic font-based encodings, we
// don't need the full URL and can go back to simple TLD. This test remains
// for backwards compatibility with any caller using a full URL.
static const int kMinURLLength = static_cast<int>(sizeof("http://a.bc")) - 1;
| |
| // Extract TLD from a full URL or just a TLD |
| // Return hostname and length if a full URL |
| void ExtractTLD(const char* url_hint, char* tld_hint, int tld_hint_len, |
| const char** ret_host_start, int* ret_host_len) { |
| // url_hint can either be a full URL (preferred) or just top-level domain name |
| // Extract the TLD from a full URL and use it for |
| // a normal TLD hint |
| |
| strncpy(tld_hint, "~", tld_hint_len); |
| tld_hint[tld_hint_len - 1] = '\0'; |
| *ret_host_start = NULL; |
| *ret_host_len = 0; |
| |
| int url_len = (url_hint != NULL) ? strlen(url_hint) : 0; |
| if (url_len == 0) { |
| // Empty TLD |
| return; |
| } |
| |
| // Minimum real URL is 11 bytes: "http://a.bc" -- shorter is assumed to be TLD |
| if (kMinURLLength <= url_len) { |
| // See if it really is a URL |
| const char* first_slash = strchr(url_hint, '/'); |
| if ((first_slash != NULL) && (first_slash != url_hint) && |
| (first_slash[-1] == ':') && (first_slash[1] == '/') && |
| (memrchr(url_hint, '.', first_slash - url_hint) == NULL)) { |
| // We found :// and no dot in front of it, so declare a real URL |
| |
| const char* hostname_start = first_slash + 2; |
| const char* hostname_end = strchr(hostname_start, '/'); |
| if (hostname_end == NULL) { |
| // No slash; end is first byte off end of the URL string |
| hostname_end = url_hint + url_len; |
| } |
| size_t hostname_len = hostname_end - hostname_start; |
| const char* port_start = |
| (const char*)memchr(hostname_start, ':', hostname_len); |
| if (port_start != NULL) { |
| // Port; shorten hostname |
| hostname_end = port_start; |
| hostname_len = hostname_end - hostname_start; |
| } |
| |
| const char* tld_start = MyMemrchr(hostname_start, '.', hostname_len); |
| if (tld_start != NULL) { |
| // Remember the TLD we just found |
| int tld_len = hostname_start + hostname_len - tld_start - 1; |
| if (tld_len > (tld_hint_len - 1)) { |
| tld_len = tld_hint_len - 1; |
| } |
| memcpy(tld_hint, tld_start + 1, tld_len); |
| tld_hint[tld_len] = '\0'; |
| } |
| *ret_host_start = hostname_start; |
| *ret_host_len = hostname_len; |
| return; |
| } |
| } else { |
| strncpy(tld_hint, url_hint, tld_hint_len); |
| tld_hint[tld_hint_len - 1] = '\0'; |
| } |
| } |
| |
| // Apply hints, if any, to probabilities |
| // NOTE: Encoding probabilites are all zero at this point |
| void ApplyHints(const char* url_hint, |
| const char* http_charset_hint, |
| const char* meta_charset_hint, |
| const int encoding_hint, |
| const Language language_hint, |
| const CompactEncDet::TextCorpusType corpus_type, |
| DetectEncodingState* destatep) { |
| int hint_count = 0; |
| // url_hint can either be a full URL (preferred) or just top-level domain name |
| // Extract the TLD from a full URL and use it for |
| // a normal TLD hint |
| |
| char tld_hint[16]; |
| const char* hostname_start = NULL; |
| int hostname_len = 0; |
| ExtractTLD(url_hint, tld_hint, sizeof(tld_hint), |
| &hostname_start, &hostname_len); |
| |
| |
| // Initial hints give slight boost to Ascii-7-bit and code page 1252 |
| // ApplyXxx routines copy enc_1 to enc_2 then update declared_enc_1 |
| // This gives a boost to 1252 if one of HTTP/META is specified, |
| // but this could be the wrong thing to do if Latin2/3/4/etc. is specified |
| destatep->declared_enc_1 = F_CP1252; |
| destatep->declared_enc_2 = F_ASCII_7_bit; |
| |
| // Applying various hints takes max of new hint and any old hint. |
| // This does better on multiple hints that a weighted average |
| |
| // Weight is 0..100 percent |
| if ((http_charset_hint != NULL) && (http_charset_hint[0] != '~')) { |
| destatep->declared_enc_2 = destatep->declared_enc_1; |
| hint_count += ApplyCharsetHint(http_charset_hint, 100, destatep); |
| destatep->http_hint = kMapToEncoding[destatep->declared_enc_1]; |
| if ((destatep->declared_enc_1 == F_CP1252) || |
| (destatep->declared_enc_1 == F_Latin1)) { |
| destatep->looking_for_latin_trigrams = true; |
| } |
| } |
| if ((meta_charset_hint != NULL) && (meta_charset_hint[0] != '~')) { |
| destatep->declared_enc_2 = destatep->declared_enc_1; |
| hint_count += ApplyCharsetHint(meta_charset_hint, 100, destatep); |
| destatep->meta_hint = kMapToEncoding[destatep->declared_enc_1]; |
| if ((destatep->declared_enc_1 == F_CP1252) || |
| (destatep->declared_enc_1 == F_Latin1)) { |
| destatep->looking_for_latin_trigrams = true; |
| } |
| } |
| if (encoding_hint != UNKNOWN_ENCODING) { |
| destatep->declared_enc_2 = destatep->declared_enc_1; |
| hint_count += ApplyEncodingHint(encoding_hint, 50, destatep); |
| } |
| if (language_hint != UNKNOWN_LANGUAGE) { |
| destatep->declared_enc_2 = destatep->declared_enc_1; |
| hint_count += ApplyUILanguageHint(language_hint, 50, destatep); |
| } |
| // Use top level domain if not .com and <=1 other hint was available |
| if (url_hint != NULL) { |
| destatep->tld_hint = CompactEncDet::TopEncodingOfTLDHint(tld_hint); |
| if (hint_count == 0) { |
| // Apply with weight 100% |
| destatep->declared_enc_2 = destatep->declared_enc_1; |
| hint_count += ApplyTldHint(tld_hint, 100, destatep); |
| if ((destatep->declared_enc_1 == F_CP1252) || |
| (destatep->declared_enc_1 == F_Latin1)) { |
| destatep->looking_for_latin_trigrams = true; |
| } |
| if (strcmp("hu", tld_hint) == 0) { |
| // Hungarian is particularly difficult to separate Latin2 from Latin1, |
| // so always look for trigram scanning if bare TLD=hu hint |
| destatep->looking_for_latin_trigrams = true; |
| } |
| // Treat .com as no TLD hint at all |
| } else if ((hint_count == 1) && (strcmp("com", tld_hint) != 0)) { |
| // Either shift weighting or consider doing no TLD here -- seems to |
| // distract from correct charset= hints. Or perhaps apply only if |
| // charset = Latin1/1252... |
| // Apply with weight 50% |
| destatep->declared_enc_2 = destatep->declared_enc_1; |
| hint_count += ApplyTldHint(tld_hint, 50, destatep); |
| if ((destatep->declared_enc_1 == F_CP1252) || |
| (destatep->declared_enc_1 == F_Latin1)) { |
| destatep->looking_for_latin_trigrams = true; // These need trigrams |
| } |
| } |
| // Else ignore TLD hint entirely |
| } |
| |
| // Use all-web default distribution if not even a TLD hint |
| if (hint_count == 0) { |
| destatep->looking_for_latin_trigrams = true; // Default needs trigrams |
| destatep->declared_enc_2 = destatep->declared_enc_1; |
| hint_count += ApplyDefaultHint(corpus_type, destatep); |
| } |
| |
| |
| // ISO-Microsoft Pairs |
| // F_Latin1, F_CP1252, |
| // F_Latin2, F_CP1250, NOT really strict subset/superset pairs |
| // F_Latin3, |
| // F_Latin4, |
| // F_ISO_8859_5, F_CP1251, |
| // F_Arabic, F_CP1256, NOT |
| // F_Greek, F_CP1253, NOT really pairs |
| // (or upgrade incvt to make Greek use CP) |
| // F_Hebrew, F_CP1255, NOT really pairs |
| // F_Latin5, F_CP1254, |
| // F_Latin6, |
| // F_ISO_8859_11, |
| // F_ISO_8859_13, F_CP1257, |
| // F_ISO_8859_15, |
| // ISO-Microsoft Pairs |
| |
| // Get important families started together |
| // // This should fall out of the initializatoin vectors for charset, |
| // but we need to get rid of families alltogetrher |
| // |
| // TODO make this more graceful |
| |
| // Add small bias for subsets |
| |
| // Subtract small bias for supersets |
| destatep->enc_prob[F_CP932] = destatep->enc_prob[F_SJS] - kSmallInitDiff; |
| |
| destatep->enc_prob[F_GBK] = destatep->enc_prob[F_GB] - kSmallInitDiff; |
| destatep->enc_prob[F_GB18030] = destatep->enc_prob[F_GB] - kSmallInitDiff; |
| |
| destatep->enc_prob[F_BIG5_CP950] = destatep->enc_prob[F_BIG5] - |
| kSmallInitDiff; |
| destatep->enc_prob[F_BIG5_HKSCS] = destatep->enc_prob[F_BIG5] - |
| kSmallInitDiff; |
| |
| // Deliberate over-bias Ascii7 and underbias Binary [unneeded] |
| // destatep->enc_prob[F_ASCII_7_bit] = destatep->enc_prob[F_ASCII_7_bit] + kSmallInitDiff; |
| // destatep->enc_prob[F_BINARY] = destatep->enc_prob[F_BINARY] - (kBoostInitial / 2); |
| |
| if (destatep->debug_data != NULL) { |
| // Show state at end of hints |
| SetDetailsEncProb(destatep, 0, -1, "Endhints"); |
| if(FLAGS_enc_detect_detail2) { |
| // Add a line showing the watched encoding(s) |
| if (watch1_rankedenc >= 0) { |
| SetDetailsEncProb(destatep, 0, |
| watch1_rankedenc, FLAGS_enc_detect_watch1); |
| } |
| if (watch2_rankedenc >= 0) { |
| SetDetailsEncProb(destatep, 0, |
| watch2_rankedenc, FLAGS_enc_detect_watch2); |
| } |
| } // End detail2 |
| } |
| |
| // If duplicate hints, set second one to ASCII_7BIT to prevent double-boost |
| if (destatep->declared_enc_1 == destatep->declared_enc_2) { |
| destatep->declared_enc_2 = F_ASCII_7_bit; |
| } |
| |
| if (FLAGS_force127) { |
| destatep->do_latin_trigrams = true; |
| if (FLAGS_enc_detect_source) { |
| PsHighlight(0, destatep->initial_src, 0, 2); |
| } |
| } |
| |
| |
| if (FLAGS_counts && destatep->looking_for_latin_trigrams) {++looking_used;} |
| if (FLAGS_counts && destatep->do_latin_trigrams) {++doing_used;} |
| |
| // |
| // At this point, destatep->enc_prob[] is an initial probability vector based |
| // on the given hints/default. In general, it spreads out least-likely |
| // encodings to be about 2**-25 below the most-likely encoding. |
| // For input text with lots of bigrams, an unlikely encoding can rise to |
| // the top at a rate of about 2**6 per bigram, and more commonly 2**2 per |
| // bigram. So more than 4 bigrams and commonly more than 12 are |
| // needed to overcome the initial hints when the least-likely encoding |
| // is in fact the correct answer. So if the entire text has very few bigrams |
| // (as a two-word query might), it can be impossible for the correct |
| // encoding to win. |
| // |
| // To compensate for this, we take the initial hint vector and effectively |
| // apply it at the rate of 1/16 every bigram for the first 16 bigrams. The |
| // actual mechanism is done just before the last prune. |
| // |
| |
| // Remember Initial hint probabilities |
| memcpy(destatep->hint_prob, destatep->enc_prob, sizeof(destatep->enc_prob)); |
| } |
| |
| // Look for specific high-value patterns in the first 4 bytes |
| // Byte order marks (BOM) |
| // EFBBBF UTF-8 |
| // FEFF UTF-16 BE |
| // FFFE UTF-16 LE |
| // FFFE0000 UTF-32 BE |
| // 0000FEFF UTF-32 LE |
| // |
| // Likely UTF-x of seven-bit ASCII |
| // 00xx UTF-16 BE xx printable ASCII |
| // xx00 UTF-16 LE |
| // 000000xx UTF-32 BE |
| // xx000000 UTF-32 LE |
| // |
| void InitialBytesBoost(const uint8* src, |
| int text_length, |
| DetectEncodingState* destatep) { |
| if (text_length < 4) {return;} |
| |
| uint32 pair01 = (src[0] << 8) | src[1]; |
| uint32 pair23 = (src[2] << 8) | src[3]; |
| uint32 quad0123 = (pair01 << 16) | pair23; |
| |
| bool utf_16_indication = false; |
| bool utf_32_indication = false; |
| int best_enc = -1; |
| |
| // Byte order marks |
| // UTF-8 |
| if ((quad0123 & 0xffffff00) == 0xEFBBBF00) { |
| destatep->bom_hint = UTF8; |
| Boost(destatep, F_UTF8, kBoostInitial * 2); |
| Boost(destatep, F_UTF8UTF8, kBoostInitial * 2); |
| best_enc = F_UTF8; |
| // UTF-32 (test before UTF-16) |
| } else if (quad0123 == 0x0000FEFF) { |
| destatep->bom_hint = UTF32BE; |
| Boost(destatep, F_UTF_32BE, kBoostInitial * 2); |
| best_enc = F_UTF_32BE; |
| } else if (quad0123 == 0xFFFE0000) { |
| destatep->bom_hint = UTF32LE; |
| Boost(destatep, F_UTF_32LE, kBoostInitial * 2); |
| best_enc = F_UTF_32LE; |
| // UTF-16 |
| } else if (pair01 == 0xFEFF) { |
| destatep->bom_hint = UTF16BE; |
| Boost(destatep, F_UTF_16BE, kBoostInitial * 3); |
| best_enc = F_UTF_16BE; |
| } else if (pair01 == 0xFFFE) { |
| destatep->bom_hint = UTF16LE; |
| Boost(destatep, F_UTF_16LE, kBoostInitial * 3); |
| best_enc = F_UTF_16LE; |
| |
| // Possible seven-bit ASCII encoded as UTF-16/32 |
| // UTF-32 (test before UTF-16) |
| } else if (((quad0123 & 0xffffff00) == 0) && |
| (kIsPrintableAscii[src[3]] != 0)) { |
| Boost(destatep, F_UTF_32BE, kBoostInitial); |
| Whack(destatep, F_UTF_32LE, kBadPairWhack); // Illegal char |
| best_enc = F_UTF_32BE; |
| } else if (((quad0123 & 0x00ffffff) == 0) && |
| (kIsPrintableAscii[src[0]] != 0)) { |
| Boost(destatep, F_UTF_32LE, kBoostInitial); |
| Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal char |
| best_enc = F_UTF_32LE; |
| } else if ((src[0] == 0x00) && (kIsPrintableAscii[src[1]] != 0)) { |
| Boost(destatep, F_UTF_16BE, kBoostInitial); |
| best_enc = F_UTF_16BE; |
| } else if ((src[1] == 0x00) && (kIsPrintableAscii[src[0]] != 0)) { |
| Boost(destatep, F_UTF_16LE, kBoostInitial); |
| best_enc = F_UTF_16LE; |
| |
| // Whack if 0000 or FFFF |
| // UTF-32 (test before UTF-16) |
| } else if (quad0123 == 0x00000000) { |
| Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal char |
| Whack(destatep, F_UTF_32LE, kBadPairWhack); |
| Whack(destatep, F_UTF_16BE, kBadPairWhack); |
| Whack(destatep, F_UTF_16LE, kBadPairWhack); |
| best_enc = -1; |
| } else if (quad0123 == 0xffffffff) { |
| Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal char |
| Whack(destatep, F_UTF_32LE, kBadPairWhack); |
| Whack(destatep, F_UTF_16BE, kBadPairWhack); |
| Whack(destatep, F_UTF_16LE, kBadPairWhack); |
| best_enc = -1; |
| } else if (pair01 == 0x0000) { |
| Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal char |
| Whack(destatep, F_UTF_16LE, kBadPairWhack); |
| best_enc = -1; |
| } else if (pair01 == 0xffff) { |
| Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal char |
| Whack(destatep, F_UTF_16LE, kBadPairWhack); |
| best_enc = -1; |
| |
| |
| // These are the first four bytes of some known binary file formats |
| |
| // Boost BINARY bigtime if JPEG FFD8FFxx |
| // Boost BINARY bigtime if png 89504E47 (.PNG) |
| // Boost BINARY bigtime if gif 47494638 (GIF8) |
| // Boost BINARY bigtime if zip 504B0304 (PK..) |
| // Boost BINARY bigtime if gzip 1F8B08xx |
| // Boost BINARY bigtime if gzip 78DAxxxx |
| // Boost BINARY if PDF 25504446 (%PDF) |
| // Boost BINARY if SWF (FWSx or CWSx where x <= 0x1f) |
| } else if ((quad0123 & 0xffffff00) == 0xFFD8FF00) { // JPEG FFD8FFxx |
| Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| } else if (quad0123 == 0x89504E47) { // Hex 89 P N G |
| Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| } else if (quad0123 == 0x47494638) { // Hex GIF8 |
| Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| } else if (quad0123 == 0x504B0304) { // Hex P K 03 04 |
| Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| } else if ((quad0123 & 0xffffff00) == 0x1F8B0800) { // gzip 1F8B08xx |
| Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| } else if (pair01 == 0x78DA) { // gzip 78DAxxxx |
| Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| } else if (quad0123 == 0x25504446) { // Hex %PDF |
| Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| } else if ((quad0123 & 0xffffff1f) == 0x66535700) { // Hex FWSx |
| Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| } else if ((quad0123 & 0xffffff1f) == 0x63535700) { // Hex CWSx |
| Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| |
| // More binary detect prefixes |
| // 7F E L F Executable and linking format |
| // M M 00 * TIFF (little-endian) |
| // * 00 M M TIFF (big-endian) |
| // 01 f c p Final cut pro |
| } else if (quad0123 == 0x7F454C46) { // Hex 7F E L F |
| Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| } else if (quad0123 == 0x4D4D002A) { // Hex M M 00 * |
| Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| } else if (quad0123 == 0x2A004D4D) { // Hex * 00 M M |
| Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| } else if (quad0123 == 0x01666370) { // Hex 01 f c p |
| Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| |
| // More binary detect prefixes; all-ASCII names; heavy weight to avoid ASCII |
| // prefix overcoming binary |
| // C C S D USGS ISIS 3-D cube files |
| // S I M P FITS image header "SIMPLE " |
| } else if (quad0123 == 0x43435344) { // Hex C C S D |
| Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| } else if (quad0123 == 0x53494D50) { // Hex S I M P |
| Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| |
| // More binary detect prefixes; all-ASCII names; lighter weight |
| // H W P Hangul word processor |
| // 8 B P S Photoshop |
| // P D S _ xx "PDS_VERSION_ID " |
| } else if (quad0123 == 0x48575020) { // Hex H W P |
| if ((19 <= text_length) && |
| (memcmp(src, "HWP.Document.File.V", 19) == 0)) { |
| Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| } else if ((19 <= text_length) && |
| (memcmp(src, "HWP Document File V", 19) == 0)) { |
| Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| } else { |
| Boost(destatep, F_BINARY, kBoostInitial * kWeakerBinary); |
| } |
| } else if (quad0123 == 0x38425053) { // Hex 8 B P S |
| Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| } else if (quad0123 == 0x5044535F) { // Hex P D S _ |
| if ((14 <= text_length) && (memcmp(src, "PDS_VERSION_ID", 14) == 0)) { |
| Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary); |
| } else { |
| Boost(destatep, F_BINARY, kBoostInitial * kWeakerBinary); |
| } |
| } |
| |
| // There are several main Windows EXE file formats. |
| // Not examined here (prefix too short; never see them in Google pipeline) |
| // M Z DOS .exe Mark Zbikowski |
| // N E DOS 4.0 16-bit |
| // L E OS/2 VxD drivers |
| // L X OS/2 |
| // P E Windows NT |
| |
| |
| // More user-defined |
| // http://www.freenet.am/armscii/ Armenian |
| |
| // If any hints or BOM, etc. keep UTF 16/32 around |
| if ((destatep->enc_prob[F_UTF_16BE] > 0) || |
| (destatep->enc_prob[F_UTF_16LE] > 0)) { |
| utf_16_indication = true; |
| } |
| if ((destatep->enc_prob[F_UTF_32BE] > 0) || |
| (destatep->enc_prob[F_UTF_32LE] > 0)) { |
| utf_32_indication = true; |
| } |
| |
| |
| // Kill UTF16/32 right now if no positive indication of them |
| // Otherwise, they tend to rise to the top in 7-bit files with an |
| // occasional 0x02 byte in some comment or javascript |
| if (!utf_16_indication) { |
| Whack(destatep, F_UTF_16BE, kBadPairWhack * 8); |
| Whack(destatep, F_UTF_16LE, kBadPairWhack * 8); |
| Whack(destatep, F_Unicode, kBadPairWhack * 8); |
| } |
| if (!utf_32_indication) { |
| Whack(destatep, F_UTF_32BE, kBadPairWhack * 8); |
| Whack(destatep, F_UTF_32LE, kBadPairWhack * 8); |
| } |
| |
| // Usually kill mixed encodings |
| if (!FLAGS_ced_allow_utf8utf8) { |
| Whack(destatep, F_UTF8UTF8, kBadPairWhack * 8); |
| } |
| // 2011.11.07 never use UTF8CP1252 -- answer will be UTF8 instead |
| Whack(destatep, F_UTF8CP1252, kBadPairWhack * 8); |
| |
| if (destatep->debug_data != NULL) { |
| // Show first four bytes of the input |
| char buff[16]; |
| snprintf(buff, sizeof(buff), "%04x%04x", pair01, pair23); |
| SetDetailsEncProb(destatep, 0, best_enc, buff); |
| } |
| } |
| |
| |
| |
// qsort-style comparison of two ints that sorts in descending order:
// returns 1 if *v1 < *v2, -1 if *v1 > *v2, and 0 if they are equal
int IntCompare(const void* v1, const void* v2) {
  const int lhs = *static_cast<const int*>(v1);
  const int rhs = *static_cast<const int*>(v2);
  return static_cast<int>(lhs < rhs) - static_cast<int>(lhs > rhs);
}
| |
| bool Base64Char(uint8 c) { |
| if (('A' <= c) && (c <= 'Z')) {return true;} |
| if (('a' <= c) && (c <= 'z')) {return true;} |
| if (('0' <= c) && (c <= '9')) {return true;} |
| if ('+' == c) {return true;} |
| if ('/' == c) {return true;} |
| return false; |
| } |
| |
| int Base64ScanLen(const uint8* start, const uint8* limit) { |
| // We have a plausible beginning; scan entire base64 string |
| const uint8* ib64str = start; |
| const uint8* b64str = ib64str; |
| const uint8* b64strlimit = limit; |
| // if starts with + +++, assume it is drawing, so bogus |
| if (((limit - start) > 3) && (start[0] == '+') && |
| (start[1] == '+') && (start[2] == '+')) { |
| return 81; |
| } |
| // Scan over base64 |
| while ((b64str < b64strlimit) && (kBase64Value[*b64str++] >= 0)) { |
| } |
| b64str--; // We overshot by 1 |
| return b64str - ib64str; |
| } |
| |
| // Input is at least 8-character legal base64 string after +. |
| // But might be say + "Presse+Termine" |
| bool GoodUnicodeFromBase64(const uint8* start, const uint8* limit) { |
| // Reject base64 string len N if density of '+' is > 1 + N/16 (expect 1/64) |
| // Reject base64 string len N if density of A-Z is < 1 + N/16 (expect 26/64) |
| // Reject base64 string len N if density of a-z is < 1 + N/16 (expect 26/64) |
| // Reject base64 string len N if density of 0-9 is < 1 + N/32 (expect 10/64) |
| // NOTE: this requires at least one lower AND one upper AND one digit to pass |
| // |
| int plus_count = 0; |
| int lower_count = 0; |
| int upper_count = 0; |
| int digit_count = 0; |
| int len = limit - start; |
| for (const uint8* src = start; src < limit; ++src) { |
| uint8 c = *src; |
| if (('a' <= c) && (c <= 'z')) { |
| ++lower_count; |
| } else if (('A' <= c) && (c <= 'Z')) { |
| ++upper_count; |
| } else if (('0' <= c) && (c <= '0')) { |
| ++digit_count; |
| } else if (*src == '+') { |
| ++plus_count; |
| } |
| } |
| |
| if (plus_count > (1 + (len >> 4))) {return false;} |
| if (lower_count < (1 + (len >> 4))) {return false;} |
| if (upper_count < (1 + (len >> 4))) {return false;} |
| if (digit_count < (1 + (len >> 5))) {return false;} |
| |
| // checking the last character to reduce false positive |
| // since the last character may be padded to 0 bits at the end. |
| // refer to http://en.wikipedia.org/wiki/UTF-7 |
| int nmod8 = len & 7; |
| const uint8 last = *(start+len-1); |
| // When UTF-7 string length%8=3, the last two bits must be padded as 0 |
| if ((nmod8 == 3) && (kBase64Value[last] & 3)) {return false;} |
| // When UTF-7 string length%8=6, the last four bits must be padded as 0 |
| if ((nmod8 == 6) && (kBase64Value[last] & 15)) {return false;} |
| return true; |
| } |
| |
| // Prune here after N bytes |
| // Boost here for seven-bit sequences (at every prune) |
| // if (sevenbitrankedencoding) |
| // + UTF7 scan and boost/demote len mod 8 = 0 3 6 |
| // ~ Hz scan and boost/demote len mod 8 = 0 2 4 6 |
| // 1B 2022 scan and boost/demote len mod 8 = 0 2 4 6 |
| // 0E 2022 scan and boost/demote len mod 8 = 0 2 4 6 |
| // [0F 2022 boost/demote] |
| // 00 UTF16/32 scan and boost/demote offset = even/odd |
| // |
| // If still some seven-bit possibilities > pure ASCII, |
| // scan each possibility for clearer prob, s.t. about |
| // two good sequences is a clear win |
| // A-Z 00-19 00xx-64xx (B = 04xx) |
| // a-z 1A-33 68xx-CCxx (f = 7Cxx) |
| // 0-9 34-3D D0xx-F4xx (1 = D4xx) |
| // + 3E F8xx |
| // / 3F FCxx |
| // do another chunk with slow scan |
| |
| |
| // Boost, whack, or leave alone UTF-7 probablilty |
| void UTF7BoostWhack(DetectEncodingState* destatep, int next_pair, uint8 byte2) { |
| int off = destatep->interesting_offsets[AsciiPair][next_pair]; |
| if (off >= destatep->prior_utf7_offset) { |
| // Not part of a previous successful UTF-7 string |
| ++destatep->utf7_starts; |
| |
| if (byte2 == '-') { |
| // +- encoding for '+' neutral |
| } else if (!Base64Char(byte2)) { |
| // Not base64 -- not UTF-7, whack |
| Whack(destatep, F_UTF7, kBadPairWhack); // Illegal pair |
| } else { |
| // Starts with base64 byte, might be a good UTF7 sequence |
| const uint8* start = destatep->initial_src + off + 1; // over the + |
| int n = Base64ScanLen(start, destatep->limit_src); |
| int nmod8 = n & 7; |
| if ((n == 3) || (n == 6)) { |
| // short but legal -- treat as neutral |
| } else if ((nmod8 == 0) | (nmod8 == 3) | (nmod8 == 6)) { |
| // Good length. Check for good Unicode. |
| if (GoodUnicodeFromBase64(start, start + n)) { |
| // Good length and Unicode, boost |
| Boost(destatep, F_UTF7, kBoostOnePair); // Found good |
| destatep->prior_utf7_offset = off + n + 1; |
| } else { |
| // Bad Unicode. Whack |
| Whack(destatep, F_UTF7, kBadPairWhack); // Illegal length |
| } |
| } else { |
| // Bad length. Whack |
| Whack(destatep, F_UTF7, kBadPairWhack); // Illegal length |
| } |
| } |
| } |
| } |
| |
| // Boost, whack, or leave alone HZ probablilty |
| void HzBoostWhack(DetectEncodingState* destatep, uint8 byte2) { |
| if ((byte2 == '{') || (byte2 == '}')) { |
| Boost(destatep, F_HZ_GB_2312, kBoostOnePair); // Found ~{ or ~} |
| } else if ((byte2 == '~') || (byte2 == '\n')) { |
| destatep->enc_prob[F_HZ_GB_2312] += 0; // neutral |
| } else { |
| Whack(destatep, F_HZ_GB_2312, kBadPairWhack); // Illegal pair |
| } |
| } |
| |
| // Boost, whack, or leave alone BINARY probablilty |
| void BinaryBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) { |
| int quadrant = ((byte1 & 0x80) >> 6) | ((byte2 & 0x80) >> 7); |
| int bucket8x4 = ((byte1 & 0xe0) >> 3) | ((byte2 & 0xc0) >> 6); |
| uint32 quad_mask = 1 << quadrant; |
| uint32 bucket8x4_mask = 1 << bucket8x4; |
| if ((destatep->binary_quadrants_seen & quad_mask) == 0) { |
| destatep->binary_quadrants_seen |= quad_mask; |
| destatep->binary_quadrants_count += 1; |
| if (destatep->binary_quadrants_count == 4) { |
| Boost(destatep, F_BINARY, kBoostOnePair * 2); // Found all 4 quadrants, |
| // boost 2 pairs |
| } |
| } |
| if ((destatep->binary_8x4_seen & bucket8x4_mask) == 0) { |
| destatep->binary_8x4_seen |= bucket8x4_mask; |
| destatep->binary_8x4_count += 1; |
| if (destatep->binary_8x4_count >= 11) { |
| Boost(destatep, F_BINARY, kBoostOnePair * 4); // Found 11+/20 buckets, |
| // boost 4 pairs each time |
| } |
| } |
| } |
| |
| |
| // Demote UTF-16/32 on 0000 or FFFF, favoring Binary |
| void UTF1632BoostWhack(DetectEncodingState* destatep, int offset, uint8 byte1) { |
| if (byte1 == 0) { // We have 0000 |
| Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal pair |
| Whack(destatep, F_UTF_16LE, kBadPairWhack); // Illegal pair |
| switch (offset & 3) { |
| case 0: // We get called with 0 4 8, etc. for ASCII/BMP as UTF-32BE |
| Whack(destatep, F_UTF_32LE, kBadPairWhack); // Illegal pair |
| Boost(destatep, F_UTF_32BE, kSmallInitDiff); // Good pair |
| break; |
| case 1: // We get called with 1 5 9, etc. for ASCII as UTF-32LE |
| case 2: // We get called with 2 6 10, etc. for BMP as UTF-32LE |
| Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal pair |
| Boost(destatep, F_UTF_32LE, kSmallInitDiff); // Good pair |
| break; |
| case 3: // ambiguous |
| break; |
| } |
| } else { // We have ffff |
| Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal pair |
| Whack(destatep, F_UTF_32LE, kBadPairWhack); // Illegal pair |
| Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal pair |
| Whack(destatep, F_UTF_16LE, kBadPairWhack); // Illegal pair |
| } |
| } |
| |
| // Make even offset |
| void UTF16MakeEven(DetectEncodingState* destatep, int next_pair) { |
| destatep->interesting_offsets[OtherPair][next_pair] &= ~1; |
| } |
| |
| bool ConsecutivePair(DetectEncodingState* destatep, int i) { |
| if (i <= 0) { |
| return false; |
| } |
| return destatep->interesting_offsets[OtherPair][i] == |
| (destatep->interesting_offsets[OtherPair][i - 1] + 2); |
| } |
| |
| // Boost, whack, or leave alone UTF-8 probability |
| // Any whacks are also applied to UTF8UTF8; CheckUTF8UTF8Seq assumes good UTF8 |
| // Returns total boost |
| int CheckUTF8Seq(DetectEncodingState* destatep, int weightshift) { |
| int startcount = destatep->prior_interesting_pair[OtherPair]; |
| int endcount = destatep->next_interesting_pair[OtherPair]; |
| |
| int demotion_count = 0; |
| for (int i = startcount; i < endcount; ++i) { |
| int sub; |
| char* s = &destatep->interesting_pairs[OtherPair][i * 2]; |
| // Demote four byte patterns that are more likely Latin1 than UTF-8 |
| // C9AE, DF92, DF93, DFAB. See note at top. |
| // Demotion also boosts Latin1 and CP1252 |
| uint8 s0 = static_cast<uint8>(s[0]); |
| uint8 s1 = static_cast<uint8>(s[1]); |
| if ((s0 == 0xc9) && (s1 == 0xae)) {++demotion_count;} |
| if ((s0 == 0xdf) && (s1 == 0x92)) {++demotion_count;} |
| if ((s0 == 0xdf) && (s1 == 0x93)) {++demotion_count;} |
| if ((s0 == 0xdf) && (s1 == 0xab)) {++demotion_count;} |
| |
| if (!ConsecutivePair(destatep, i)) { |
| // Insert a blank into the sequence; avoid wrong splices |
| sub = (' ' >> 4) & 0x0f; |
| ++destatep->utf8_minicount[ |
| static_cast<int>(kMiniUTF8Count[static_cast<int>(destatep->next_utf8_ministate)][sub])]; |
|