| /* Copyright 2016 Google Inc. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
#include "utils.h"

#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>

#include "script_span/stringpiece.h"
| |
| namespace chrome_lang_id { |
| namespace utils { |
| |
// Parses |c_str| as an integer with strtol() in base 0, so decimal, octal
// ("0" prefix) and hexadecimal ("0x" prefix) spellings are accepted.
//
// Returns true only when at least one digit was converted, the conversion
// consumed the whole string, and the result fits in an int. |*value| is always
// written: the parsed number when it fits, 0 when nothing could be converted,
// and INT_MIN / INT_MAX when the number is out of range.
bool ParseInt32(const char *c_str, int *value) {
  char *end = nullptr;
  errno = 0;
  const long parsed = strtol(c_str, &end, 0);  // NOLINT

  // strtol() leaves |end| equal to the input pointer when no conversion was
  // performed. Without this check an empty string would count as a valid 0.
  if (end == c_str) {
    *value = 0;
    return false;
  }

  // Reject values strtol() itself could not represent (ERANGE) as well as
  // values that fit in a long but would be truncated when stored in an int.
  if (errno == ERANGE || parsed < INT_MIN || parsed > INT_MAX) {
    *value = (parsed < 0) ? INT_MIN : INT_MAX;
    return false;
  }

  *value = static_cast<int>(parsed);
  return *end == '\0';
}
| |
// Parses |c_str| as a floating-point number with strtod() and stores the
// result in |*value|.
//
// Returns true only when a number was actually converted and the conversion
// consumed the whole string. |*value| is always written with whatever strtod()
// returned (0 when nothing could be converted).
bool ParseDouble(const char *c_str, double *value) {
  char *end = nullptr;
  *value = strtod(c_str, &end);

  // strtod() leaves |end| equal to the input pointer when no conversion was
  // performed. Without this check an empty string would count as a valid 0.
  return (end != c_str) && (*end == '\0');
}
| |
// Digit characters used to spell the three-digit octal escapes emitted by
// CEscape().
static const char kOctalDigits[] = "01234567";

// Returns a copy of |src| in which newline, carriage return, tab, double
// quote, single quote and backslash are replaced by their two-character
// backslash escapes, and every other byte that is >= 0x80 or not printable is
// replaced by a backslash followed by exactly three octal digits. All other
// bytes are copied unchanged.
string CEscape(const string &src) {
  string escaped;

  for (const unsigned char byte : src) {
    // Bytes that have a dedicated short escape sequence.
    const char *short_escape = nullptr;
    if (byte == '\n') {
      short_escape = "\\n";
    } else if (byte == '\r') {
      short_escape = "\\r";
    } else if (byte == '\t') {
      short_escape = "\\t";
    } else if (byte == '\"') {
      short_escape = "\\\"";
    } else if (byte == '\'') {
      short_escape = "\\'";
    } else if (byte == '\\') {
      short_escape = "\\\\";
    }
    if (short_escape != nullptr) {
      escaped.append(short_escape);
      continue;
    }

    // Printable 7-bit characters pass through untouched.
    if (byte < 0x80 && isprint(byte)) {
      escaped.push_back(byte);
      continue;
    }

    // Everything else becomes \NNN. The octal form always has three digits,
    // so a digit that follows in |src| cannot be absorbed into the escape.
    escaped.push_back('\\');
    escaped.push_back(kOctalDigits[byte >> 6]);
    escaped.push_back(kOctalDigits[(byte >> 3) & 7]);
    escaped.push_back(kOctalDigits[byte & 7]);
  }

  return escaped;
}
| |
// Splits |text| at every occurrence of |delim| and returns the pieces in
// order. Adjacent, leading or trailing delimiters produce empty pieces, so a
// non-empty input with k delimiters yields k + 1 pieces. An empty |text|
// yields an empty vector.
std::vector<string> Split(const string &text, char delim) {
  std::vector<string> pieces;
  if (text.empty()) return pieces;

  size_t piece_begin = 0;
  for (;;) {
    const size_t delim_pos = text.find(delim, piece_begin);
    if (delim_pos == string::npos) {
      // Final piece: everything after the last delimiter (possibly empty).
      pieces.push_back(text.substr(piece_begin));
      break;
    }
    pieces.push_back(text.substr(piece_begin, delim_pos - piece_begin));
    piece_begin = delim_pos + 1;
  }
  return pieces;
}
| |
| int RemoveLeadingWhitespace(StringPiece *text) { |
| int count = 0; |
| const char *ptr = text->data(); |
| while (count < text->size() && isspace(*ptr)) { |
| count++; |
| ptr++; |
| } |
| text->remove_prefix(count); |
| return count; |
| } |
| |
| int RemoveTrailingWhitespace(StringPiece *text) { |
| int count = 0; |
| const char *ptr = text->data() + text->size() - 1; |
| while (count < text->size() && isspace(*ptr)) { |
| ++count; |
| --ptr; |
| } |
| text->remove_suffix(count); |
| return count; |
| } |
| |
| int RemoveWhitespaceContext(StringPiece *text) { |
| // use RemoveLeadingWhitespace() and RemoveTrailingWhitespace() to do the job |
| return RemoveLeadingWhitespace(text) + RemoveTrailingWhitespace(text); |
| } |
| |
namespace {
// Assembles the four bytes starting at |ptr| into a 32-bit value, with ptr[0]
// as the least significant byte. Performs no bounds checking.
inline uint32 DecodeFixed32(const char *ptr) {
  uint32 word = 0;
  for (int i = 3; i >= 0; --i) {
    word = (word << 8) |
           static_cast<uint32>(static_cast<unsigned char>(ptr[i]));
  }
  return word;
}

// Widens |c| to 32 bits as a value in [0, 255], regardless of whether char is
// signed.
static inline uint32 ByteAs32(char c) {
  return static_cast<uint32>(static_cast<unsigned char>(c));
}
}  // namespace

// Computes a 32-bit hash of the |n| bytes at |data|, starting from |seed|.
// The input is consumed four bytes at a time, the 1-3 leftover bytes are
// folded in afterwards, and a final mixing step is applied.
uint32 Hash32(const char *data, size_t n, uint32 seed) {
  // Multiplier and shift used by the mixing steps.
  const uint32 kMul = 0x5bd1e995;
  const int kShift = 24;

  // The starting state depends on both the seed and the input length.
  uint32 hash = static_cast<uint32>(seed ^ n);

  // Main loop: mix one 4-byte word per iteration.
  const char *cursor = data;
  size_t remaining = n;
  for (; remaining >= 4; cursor += 4, remaining -= 4) {
    uint32 word = DecodeFixed32(cursor) * kMul;
    word ^= word >> kShift;
    word *= kMul;
    hash = (hash * kMul) ^ word;
  }

  // Tail: XOR the remaining 1-3 bytes in at byte offsets 0, 8 and 16, then
  // multiply once. Nothing happens when no bytes are left.
  if (remaining > 0) {
    for (size_t i = 0; i < remaining; ++i) {
      hash ^= ByteAs32(cursor[i]) << (8 * i);
    }
    hash *= kMul;
  }

  // Final avalanche so the last bytes influence all output bits.
  hash ^= hash >> 13;
  hash *= kMul;
  hash ^= hash >> 15;
  return hash;
}
| |
| uint32 Hash32WithDefaultSeed(const string &input) { |
| return Hash32(input.data(), input.size(), 0xBEEF); |
| } |
| |
| PunctuationUtil::CharacterRange PunctuationUtil::kPunctuation[] = { |
| {33, 35}, {37, 42}, {44, 47}, {58, 59}, |
| {63, 64}, {91, 93}, {95, 95}, {123, 123}, |
| {125, 125}, {161, 161}, {171, 171}, {183, 183}, |
| {187, 187}, {191, 191}, {894, 894}, {903, 903}, |
| {1370, 1375}, {1417, 1418}, {1470, 1470}, {1472, 1472}, |
| {1475, 1475}, {1478, 1478}, {1523, 1524}, {1548, 1549}, |
| {1563, 1563}, {1566, 1567}, {1642, 1645}, {1748, 1748}, |
| {1792, 1805}, {2404, 2405}, {2416, 2416}, {3572, 3572}, |
| {3663, 3663}, {3674, 3675}, {3844, 3858}, {3898, 3901}, |
| {3973, 3973}, {4048, 4049}, {4170, 4175}, {4347, 4347}, |
| {4961, 4968}, {5741, 5742}, {5787, 5788}, {5867, 5869}, |
| {5941, 5942}, {6100, 6102}, {6104, 6106}, {6144, 6154}, |
| {6468, 6469}, {6622, 6623}, {6686, 6687}, {8208, 8231}, |
| {8240, 8259}, {8261, 8273}, {8275, 8286}, {8317, 8318}, |
| {8333, 8334}, {9001, 9002}, {9140, 9142}, {10088, 10101}, |
| {10181, 10182}, {10214, 10219}, {10627, 10648}, {10712, 10715}, |
| {10748, 10749}, {11513, 11516}, {11518, 11519}, {11776, 11799}, |
| {11804, 11805}, {12289, 12291}, {12296, 12305}, {12308, 12319}, |
| {12336, 12336}, {12349, 12349}, {12448, 12448}, {12539, 12539}, |
| {64830, 64831}, {65040, 65049}, {65072, 65106}, {65108, 65121}, |
| {65123, 65123}, {65128, 65128}, {65130, 65131}, {65281, 65283}, |
| {65285, 65290}, {65292, 65295}, {65306, 65307}, {65311, 65312}, |
| {65339, 65341}, {65343, 65343}, {65371, 65371}, {65373, 65373}, |
| {65375, 65381}, {65792, 65793}, {66463, 66463}, {68176, 68184}, |
| {-1, -1}}; |
| |
// Rewrites |*form| in place so that every ASCII digit ('0'-'9') becomes '9'.
// All other bytes are left untouched.
void NormalizeDigits(string *form) {
  for (char &ch : *form) {
    const bool is_ascii_digit = ('0' <= ch) && (ch <= '9');
    if (is_ascii_digit) ch = '9';
  }
}
| |
// Returns the length in bytes (1-4) of the UTF-8 sequence announced by the
// byte at |src|, based only on that byte's top four bits. Bytes below 0xC0
// (including NUL and continuation bytes) report 1. Does not validate the
// bytes that follow.
int OneCharLen(const char *src) {
  // char may be signed, so always inspect the byte as an unsigned value.
  const unsigned char lead = static_cast<unsigned char>(*src);
  if (lead < 0xC0) return 1;
  if (lead < 0xE0) return 2;
  if (lead < 0xF0) return 3;
  return 4;
}

// Returns the byte length of the first UTF-8 character of the NUL-terminated
// string |utf8_str|, or 0 when the string is empty.
int UTF8FirstLetterNumBytes(const char *utf8_str) {
  if (*utf8_str == '\0') return 0;
  return OneCharLen(utf8_str);
}

// Appends to |*chars| one string per UTF-8 character of |text|, in order.
// Character boundaries come from UTF8FirstLetterNumBytes(); the input is not
// validated. An embedded NUL byte is emitted as its own one-byte element, and
// a multi-byte sequence cut short by the end of |text| is emitted with only
// the bytes that are actually present.
void GetUTF8Chars(const string &text, std::vector<string> *chars) {
  const char *cursor = text.data();
  const char *const end = cursor + text.size();
  while (cursor < end) {
    const size_t remaining = static_cast<size_t>(end - cursor);
    size_t char_length =
        static_cast<size_t>(UTF8FirstLetterNumBytes(cursor));
    // A NUL byte inside |text| reports length 0. Treat it as a one-byte
    // character so the loop always makes progress instead of spinning forever.
    if (char_length == 0) char_length = 1;
    // Never read or advance past the end of |text| when the lead byte
    // announces more bytes than remain.
    if (char_length > remaining) char_length = remaining;
    chars->emplace_back(cursor, char_length);
    cursor += char_length;
  }
}
| |
| } // namespace utils |
| } // namespace chrome_lang_id |