| // Copyright 2020 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "chromeos/components/local_search_service/content_extraction_utils.h" |
| #include <memory> |
| #include <unordered_map> |
| #include <vector> |
| |
| #include "base/check.h" |
| #include "base/containers/flat_set.h" |
| #include "base/i18n/case_conversion.h" |
| #include "base/i18n/unicodestring.h" |
| #include "base/memory/ptr_util.h" |
| #include "base/no_destructor.h" |
| #include "base/strings/string16.h" |
| #include "base/strings/utf_string_conversions.h" |
| #include "chromeos/components/string_matching/tokenized_string.h" |
| #include "third_party/icu/source/i18n/unicode/translit.h" |
| |
| namespace chromeos { |
| namespace local_search_service { |
| |
| namespace { |
| using chromeos::string_matching::TokenizedString; |
| } // namespace |
| |
| std::vector<Token> ConsolidateToken(const std::vector<Token>& tokens) { |
| std::unordered_map<base::string16, std::vector<WeightedPosition>> dictionary; |
| for (const auto& token : tokens) { |
| dictionary[token.content].insert(dictionary[token.content].end(), |
| token.positions.begin(), |
| token.positions.end()); |
| } |
| |
| std::vector<Token> results; |
| for (const auto& item : dictionary) { |
| results.push_back(Token(item.first, item.second)); |
| } |
| return results; |
| } |
| |
| std::vector<Token> ExtractContent(const std::string& content_id, |
| const base::string16& text, |
| double weight, |
| const std::string& locale) { |
| // Use two different string tokenizing algorithms for Latin and non Latin |
| // locale. |
| TokenizedString::Mode mode; |
| if (IsNonLatinLocale(locale)) { |
| mode = TokenizedString::Mode::kCamelCase; |
| } else { |
| mode = TokenizedString::Mode::kWords; |
| } |
| |
| const TokenizedString tokenized_string(text, mode); |
| DCHECK(tokenized_string.tokens().size() == |
| tokenized_string.mappings().size()); |
| |
| const size_t num_tokens = tokenized_string.tokens().size(); |
| std::vector<Token> tokens; |
| |
| for (size_t i = 0; i < num_tokens; i++) { |
| const base::string16 word = Normalizer(tokenized_string.tokens()[i]); |
| if (IsStopword(word, locale)) |
| continue; |
| tokens.push_back(Token( |
| word, |
| {WeightedPosition( |
| weight, Position(content_id, tokenized_string.mappings()[i].start(), |
| tokenized_string.mappings()[i].end() - |
| tokenized_string.mappings()[i].start()))})); |
| } |
| |
| return tokens; |
| } |
| |
// Returns true if the first two characters of |locale| (e.g. "ja" for
// "ja-JP") match one of the non-Latin language codes listed below. The match
// is case-sensitive. Locales shorter than two characters never match.
bool IsNonLatinLocale(const std::string& locale) {
  // Two-letter language codes of the supported non-Latin locales. Hebrew is
  // listed under both its current code "he" and the deprecated code "iw", so
  // that either spelling is accepted.
  static constexpr const char* kNonLatinLanguages[] = {
      "am", "ar", "be", "bg", "bn", "el", "fa", "gu", "he", "hi", "hy",
      "iw", "ja", "ka", "kk", "km", "kn", "ko", "ky", "lo", "mk", "ml",
      "mn", "mr", "my", "pa", "ru", "sr", "ta", "te", "th", "uk", "zh"};

  // compare(0, 2, code) checks the first two characters of |locale| (or all
  // of it when it is shorter) against |code| without building a temporary
  // substring.
  for (const char* language : kNonLatinLanguages) {
    if (locale.compare(0, 2, language) == 0)
      return true;
  }
  return false;
}
| |
| bool IsStopword(const base::string16& word, const std::string& locale) { |
| // TODO(thanhdng): Currently we support stopword list for English only. In the |
| // future, when we need to support other languages, creates resource files to |
| // store the stopwords. |
| if (locale.substr(0, 2) != "en") |
| return false; |
| |
| // A set of stopwords in English. This set is taken from NLTK library. |
| static const base::NoDestructor<base::flat_set<std::string>> |
| english_stopwords( |
| {"i", "me", "my", "myself", "we", |
| "our", "ours", "ourselves", "you", "you're", |
| "you've", "you'll", "you'd", "your", "yours", |
| "yourself", "yourselves", "he", "him", "his", |
| "himself", "she", "she's", "her", "hers", |
| "herself", "it", "it's", "its", "itself", |
| "they", "them", "their", "theirs", "themselves", |
| "what", "which", "who", "whom", "this", |
| "that", "that'll", "these", "those", "am", |
| "is", "are", "was", "were", "be", |
| "been", "being", "have", "has", "had", |
| "having", "do", "does", "did", "doing", |
| "a", "an", "the", "and", "but", |
| "if", "or", "because", "as", "until", |
| "while", "of", "at", "by", "for", |
| "with", "about", "against", "between", "into", |
| "through", "during", "before", "after", "above", |
| "below", "to", "from", "up", "down", |
| "in", "out", "on", "off", "over", |
| "under", "again", "further", "then", "once", |
| "here", "there", "when", "where", "why", |
| "how", "all", "any", "both", "each", |
| "few", "more", "most", "other", "some", |
| "such", "no", "nor", "not", "only", |
| "own", "same", "so", "than", "too", |
| "very", "s", "t", "can", "will", |
| "just", "don", "don't", "should", "should've", |
| "now", "d", "ll", "m", "o", |
| "re", "ve", "y", "ain", "aren", |
| "aren't", "couldn", "couldn't", "didn", "didn't", |
| "doesn", "doesn't", "hadn", "hadn't", "hasn", |
| "hasn't", "haven", "haven't", "isn", "isn't", |
| "ma", "mightn", "mightn't", "mustn", "mustn't", |
| "needn", "needn't", "shan", "shan't", "shouldn", |
| "shouldn't", "wasn", "wasn't", "weren", "weren't", |
| "won", "won't", "wouldn", "wouldn't"}); |
| return base::Contains(*english_stopwords, base::UTF16ToUTF8(word)); |
| } |
| |
| base::string16 Normalizer(const base::string16& word, bool remove_hyphen) { |
| // Case folding. |
| icu::UnicodeString source = icu::UnicodeString::fromUTF8( |
| base::UTF16ToUTF8(base::i18n::FoldCase(word))); |
| |
| // Removes diacritic. |
| UErrorCode status = U_ZERO_ERROR; |
| UParseError parse_error; |
| |
| // Adds a rule to remove diacritic from text. Adds a few characters that are |
| // not handled by ICU (ł > l; ø > o; đ > d). |
| std::unique_ptr<icu::Transliterator> diacritic_remover = |
| base::WrapUnique(icu::Transliterator::createFromRules( |
| UNICODE_STRING_SIMPLE("RemoveDiacritic"), |
| icu::UnicodeString::fromUTF8("::NFD; ::[:Nonspacing Mark:] Remove; " |
| "::NFC; ł > l; ø > o; đ > d;"), |
| UTRANS_FORWARD, parse_error, status)); |
| diacritic_remover->transliterate(source); |
| |
| // Removes hyphen. |
| if (remove_hyphen) { |
| // Hyphen characters list is taken from here: http://jkorpela.fi/dashes.html |
| // U+002D(-), U+007E(~), U+058A(֊), U+05BE(־), U+1806(᠆), U+2010(‐), |
| // U+2011(‑), U+2012(‒), U+2013(–), U+2014(—), U+2015(―), U+2053(⁓), |
| // U+207B(⁻), U+208B(₋), U+2212(−), U+2E3A(⸺ ), U+2E3B(⸻ ), U+301C(〜), |
| // U+3030(〰), U+30A0(゠), U+FE58(﹘), U+FE63(﹣), U+FF0D(-). |
| std::unique_ptr<icu::Transliterator> hyphen_remover = |
| base::WrapUnique(icu::Transliterator::createFromRules( |
| UNICODE_STRING_SIMPLE("RemoveHyphen"), |
| icu::UnicodeString::fromUTF8( |
| "::[-~֊־᠆‐‑‒–—―⁓⁻₋−⸺⸻〜〰゠﹘﹣-] Remove;"), |
| UTRANS_FORWARD, parse_error, status)); |
| hyphen_remover->transliterate(source); |
| } |
| |
| return base::i18n::UnicodeStringToString16(source); |
| } |
| } // namespace local_search_service |
| } // namespace chromeos |