// blob: ec1270252ad3f22df609efae0cea8666bb902486 [file] [log] [blame]
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/url_pattern_index/ngram_extractor.h"
#include <stdint.h>
#include <string>
#include <vector>
#include "testing/gtest/include/gtest/gtest.h"
namespace url_pattern_index {
namespace {
// Separator predicate that classifies every character as a separator.
bool IsSeparatorTrue(char /*unused*/) {
  constexpr bool kEverythingIsSeparator = true;
  return kEverythingIsSeparator;
}
// Separator predicate that never classifies a character as a separator.
bool IsSeparatorFalse(char /*unused*/) {
  constexpr bool kNothingIsSeparator = false;
  return kNothingIsSeparator;
}
// Separator predicate that accepts exactly the characters '*' and '^'.
bool IsSpecialChar(char c) {
  switch (c) {
    case '*':
    case '^':
      return true;
    default:
      return false;
  }
}
// Packs the characters of |data| into a single IntType value. Characters are
// consumed left to right; each step shifts the accumulated value left by 8
// bits and ORs in the next character, so the first character ends up in the
// most significant occupied byte.
//
// Records a test failure if |data| has more characters than IntType has bytes.
template <typename IntType>
IntType EncodeStringToInteger(const std::string& data) {
  EXPECT_LE(data.size(), sizeof(IntType));

  IntType packed = 0;
  for (const char symbol : data)
    packed = (packed << 8) | static_cast<IntType>(symbol);
  return packed;
}
// Converts every string in |ngrams| to its integer encoding via
// EncodeStringToInteger<IntType>() and returns the results in the same order.
template <typename IntType>
std::vector<IntType> EncodeStringsToIntegers(
    const std::vector<std::string>& ngrams) {
  std::vector<IntType> int_grams;
  // The output has exactly one element per input string, so allocate once
  // up front instead of growing the vector while appending.
  int_grams.reserve(ngrams.size());
  for (const std::string& ngram : ngrams) {
    int_grams.push_back(EncodeStringToInteger<IntType>(ngram));
  }
  return int_grams;
}
} // namespace
// An empty input string must produce an empty n-gram range.
TEST(NGramExtractorTest, EmptyString) {
  constexpr char kEmptyInput[] = "";

  auto ngram_range =
      CreateNGramExtractor<3, uint32_t, NGramCaseExtraction::kLowerCase>(
          kEmptyInput, IsSpecialChar);

  EXPECT_EQ(ngram_range.begin(), ngram_range.end());
}
// The input has 6 characters while 7-grams are requested, so the extracted
// range is expected to be empty.
TEST(NGramExtractorTest, ShortString) {
  constexpr char kTooShortInput[] = "abacab";

  auto ngram_range =
      CreateNGramExtractor<7, uint64_t, NGramCaseExtraction::kLowerCase>(
          kTooShortInput, IsSeparatorFalse);

  EXPECT_EQ(ngram_range.begin(), ngram_range.end());
}
// Every run of non-special characters in the input is shorter than the
// requested n-gram length of 6, so the extracted range is expected to be
// empty.
TEST(NGramExtractorTest, ShortPieces) {
  constexpr char kInputWithShortPieces[] = "1**abac*abc*abcd*00";

  auto ngram_range =
      CreateNGramExtractor<6, uint64_t, NGramCaseExtraction::kLowerCase>(
          kInputWithShortPieces, IsSpecialChar);

  EXPECT_EQ(ngram_range.begin(), ngram_range.end());
}
// When the predicate reports every character as a separator, the extracted
// range is expected to be empty.
TEST(NGramExtractorTest, IsSeparatorAlwaysTrue) {
  constexpr char kInput[] = "abacaba";

  auto ngram_range =
      CreateNGramExtractor<3, uint32_t, NGramCaseExtraction::kLowerCase>(
          kInput, IsSeparatorTrue);

  EXPECT_EQ(ngram_range.begin(), ngram_range.end());
}
// When no character is a separator, every 3-character window of the input
// is expected to be reported, in left-to-right order.
TEST(NGramExtractorTest, IsSeparatorAlwaysFalse) {
  constexpr size_t kNGramSize = 3;
  const std::string kInput = "abacaba123";

  auto ngram_range =
      CreateNGramExtractor<kNGramSize, uint32_t,
                           NGramCaseExtraction::kLowerCase>(kInput,
                                                            IsSeparatorFalse);
  const std::vector<uint32_t> extracted(ngram_range.begin(),
                                        ngram_range.end());

  const std::vector<uint32_t> wanted = EncodeStringsToIntegers<uint32_t>(
      {"aba", "bac", "aca", "cab", "aba", "ba1", "a12", "123"});
  EXPECT_EQ(wanted, extracted);
}
// With kLowerCase extraction, a mixed-case input is expected to yield
// n-grams made of lower-case characters only.
TEST(NGramExtractorTest, LowerCaseExtraction) {
  constexpr size_t kNGramSize = 3;
  const std::string kMixedCaseInput = "aBcDEFG";

  auto ngram_range =
      CreateNGramExtractor<kNGramSize, uint32_t,
                           NGramCaseExtraction::kLowerCase>(kMixedCaseInput,
                                                            IsSeparatorFalse);
  const std::vector<uint32_t> extracted(ngram_range.begin(),
                                        ngram_range.end());

  const std::vector<uint32_t> wanted =
      EncodeStringsToIntegers<uint32_t>({"abc", "bcd", "cde", "def", "efg"});
  EXPECT_EQ(wanted, extracted);
}
// With kCaseSensitive extraction, the n-grams are expected to keep the
// original letter case of the input.
TEST(NGramExtractorTest, CaseSensitiveExtraction) {
  constexpr size_t kNGramSize = 3;
  const std::string kMixedCaseInput = "aBcDEFG";

  auto ngram_range =
      CreateNGramExtractor<kNGramSize, uint32_t,
                           NGramCaseExtraction::kCaseSensitive>(
          kMixedCaseInput, IsSeparatorFalse);
  const std::vector<uint32_t> extracted(ngram_range.begin(),
                                        ngram_range.end());

  const std::vector<uint32_t> wanted =
      EncodeStringsToIntegers<uint32_t>({"aBc", "BcD", "cDE", "DEF", "EFG"});
  EXPECT_EQ(wanted, extracted);
}
// For each input, builds the reference list of 6-grams by brute force (every
// 6-character window that contains no special character) and checks that the
// extractor, using IsSpecialChar as the separator predicate, reports the same
// sequence. The inputs contain no upper-case letters, so the reference
// n-grams are encoded without any case folding.
TEST(NGramExtractorTest, NGramsArePresent) {
  constexpr size_t kNGramSize = 6;
  const std::string kInputs[] = {
      "abcdef",   "abacaba",   "*abacaba",
      "abacaba*", "*abacaba*", "*abacaba*abc^1005001*",
  };

  for (const std::string& input : kInputs) {
    SCOPED_TRACE(testing::Message() << "String: " << input);

    std::vector<uint64_t> wanted;
    for (size_t start = 0; start + kNGramSize <= input.size(); ++start) {
      // Scan the window; |scanned| reaches kNGramSize only when no special
      // character was encountered.
      size_t scanned = 0;
      while (scanned < kNGramSize && !IsSpecialChar(input[start + scanned]))
        ++scanned;
      if (scanned != kNGramSize)
        continue;

      wanted.push_back(
          EncodeStringToInteger<uint64_t>(input.substr(start, kNGramSize)));
    }

    auto ngram_range =
        CreateNGramExtractor<kNGramSize, uint64_t,
                             NGramCaseExtraction::kLowerCase>(input,
                                                              IsSpecialChar);
    const std::vector<uint64_t> extracted(ngram_range.begin(),
                                          ngram_range.end());
    EXPECT_EQ(wanted, extracted);
  }
}
} // namespace url_pattern_index