|  | // Copyright 2017 The Chromium Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style license that can be | 
|  | // found in the LICENSE file. | 
|  |  | 
|  | #include "components/zucchini/suffix_array.h" | 
|  |  | 
|  | #include <stddef.h> | 
|  | #include <stdint.h> | 
|  |  | 
|  | #include <algorithm> | 
|  | #include <initializer_list> | 
|  | #include <string> | 
|  | #include <vector> | 
|  |  | 
|  | #include "testing/gtest/include/gtest/gtest.h" | 
|  |  | 
|  | namespace zucchini { | 
|  |  | 
|  | namespace { | 
|  |  | 
|  | using SLType = InducedSuffixSort::SLType; | 
|  |  | 
|  | }  // namespace | 
|  |  | 
|  | using ustring = std::basic_string<unsigned char>; | 
|  |  | 
|  | constexpr uint16_t kNumChar = 256; | 
|  |  | 
|  | ustring MakeUnsignedString(const std::string& str) { | 
|  | return {str.begin(), str.end()}; | 
|  | } | 
|  |  | 
|  | template <class T> | 
|  | std::vector<T> MakeVector(const std::initializer_list<T>& ilist) { | 
|  | return {ilist.begin(), ilist.end()}; | 
|  | } | 
|  |  | 
|  | void TestSlPartition(std::initializer_list<SLType> expected_sl_partition, | 
|  | std::initializer_list<size_t> expected_lms_indices, | 
|  | std::string str) { | 
|  | using SaisImpl = InducedSuffixSort::Implementation<size_t, uint16_t>; | 
|  |  | 
|  | std::vector<SLType> sl_partition(str.size()); | 
|  | EXPECT_EQ(expected_lms_indices.size(), | 
|  | SaisImpl::BuildSLPartition(str.begin(), str.size(), kNumChar, | 
|  | sl_partition.rbegin())); | 
|  | EXPECT_EQ(MakeVector(expected_sl_partition), sl_partition); | 
|  |  | 
|  | std::vector<size_t> lms_indices(expected_lms_indices.size()); | 
|  | SaisImpl::FindLmsSuffixes(expected_sl_partition, lms_indices.begin()); | 
|  | EXPECT_EQ(MakeVector(expected_lms_indices), lms_indices); | 
|  | } | 
|  |  | 
|  | TEST(InducedSuffixSortTest, BuildSLPartition) { | 
|  | TestSlPartition({}, {}, ""); | 
|  | TestSlPartition( | 
|  | { | 
|  | SLType::LType, | 
|  | }, | 
|  | {}, "a"); | 
|  | TestSlPartition( | 
|  | { | 
|  | SLType::LType, | 
|  | SLType::LType, | 
|  | }, | 
|  | {}, "ba"); | 
|  | TestSlPartition( | 
|  | { | 
|  | SLType::SType, | 
|  | SLType::LType, | 
|  | }, | 
|  | {}, "ab"); | 
|  | TestSlPartition( | 
|  | { | 
|  | SLType::SType, | 
|  | SLType::SType, | 
|  | SLType::LType, | 
|  | }, | 
|  | {}, "aab"); | 
|  | TestSlPartition( | 
|  | { | 
|  | SLType::LType, | 
|  | SLType::LType, | 
|  | SLType::LType, | 
|  | }, | 
|  | {}, "bba"); | 
|  | TestSlPartition( | 
|  | { | 
|  | SLType::LType, | 
|  | SLType::SType, | 
|  | SLType::LType, | 
|  | }, | 
|  | {1}, "bab"); | 
|  | TestSlPartition( | 
|  | { | 
|  | SLType::LType, | 
|  | SLType::SType, | 
|  | SLType::SType, | 
|  | SLType::LType, | 
|  | }, | 
|  | {1}, "baab"); | 
|  |  | 
|  | TestSlPartition( | 
|  | { | 
|  | SLType::LType,  // zucchini | 
|  | SLType::LType,  // ucchini | 
|  | SLType::SType,  // cchini | 
|  | SLType::SType,  // chini | 
|  | SLType::SType,  // hini | 
|  | SLType::SType,  // ini | 
|  | SLType::LType,  // ni | 
|  | SLType::LType,  // i | 
|  | }, | 
|  | {2}, "zucchini"); | 
|  | } | 
|  |  | 
|  | std::vector<size_t> BucketCount(const std::initializer_list<unsigned char> str, | 
|  | uint16_t max_key) { | 
|  | using SaisImpl = InducedSuffixSort::Implementation<size_t, uint16_t>; | 
|  | return SaisImpl::MakeBucketCount(str.begin(), str.size(), max_key); | 
|  | } | 
|  |  | 
|  | TEST(InducedSuffixSortTest, BucketCount) { | 
|  | using vec = std::vector<size_t>; | 
|  |  | 
|  | EXPECT_EQ(vec({0, 0, 0, 0}), BucketCount({}, 4)); | 
|  | EXPECT_EQ(vec({1, 0, 0, 0}), BucketCount({0}, 4)); | 
|  | EXPECT_EQ(vec({0, 2, 0, 1}), BucketCount({1, 1, 3}, 4)); | 
|  | } | 
|  |  | 
|  | std::vector<size_t> InducedSortSubstring(ustring str) { | 
|  | using SaisImpl = InducedSuffixSort::Implementation<size_t, uint16_t>; | 
|  | std::vector<SLType> sl_partition(str.size()); | 
|  | size_t lms_count = SaisImpl::BuildSLPartition( | 
|  | str.begin(), str.size(), kNumChar, sl_partition.rbegin()); | 
|  | std::vector<size_t> lms_indices(lms_count); | 
|  | SaisImpl::FindLmsSuffixes(sl_partition, lms_indices.begin()); | 
|  | auto buckets = SaisImpl::MakeBucketCount(str.begin(), str.size(), kNumChar); | 
|  |  | 
|  | std::vector<size_t> suffix_array(str.size()); | 
|  | SaisImpl::InducedSort(str, str.size(), sl_partition, lms_indices, buckets, | 
|  | suffix_array.begin()); | 
|  |  | 
|  | return suffix_array; | 
|  | } | 
|  |  | 
|  | TEST(InducedSuffixSortTest, InducedSortSubstring) { | 
|  | using vec = std::vector<size_t>; | 
|  |  | 
|  | auto us = MakeUnsignedString; | 
|  |  | 
|  | // L; a$ | 
|  | EXPECT_EQ(vec({0}), InducedSortSubstring(us("a"))); | 
|  |  | 
|  | // SL; ab$, b$ | 
|  | EXPECT_EQ(vec({0, 1}), InducedSortSubstring(us("ab"))); | 
|  |  | 
|  | // LL; a$, ba$ | 
|  | EXPECT_EQ(vec({1, 0}), InducedSortSubstring(us("ba"))); | 
|  |  | 
|  | // SLL; a$, aba$, ba$ | 
|  | EXPECT_EQ(vec({2, 0, 1}), InducedSortSubstring(us("aba"))); | 
|  |  | 
|  | // LSL; ab$, b$, ba | 
|  | EXPECT_EQ(vec({1, 2, 0}), InducedSortSubstring(us("bab"))); | 
|  |  | 
|  | // SSL; aab$, ab$, b$ | 
|  | EXPECT_EQ(vec({0, 1, 2}), InducedSortSubstring(us("aab"))); | 
|  |  | 
|  | // LSSL; aab$, ab$, b$, ba | 
|  | EXPECT_EQ(vec({1, 2, 3, 0}), InducedSortSubstring(us("baab"))); | 
|  | } | 
|  |  | 
|  | template <class Algorithm> | 
|  | void TestSuffixSort(ustring test_str) { | 
|  | std::vector<size_t> suffix_array = | 
|  | MakeSuffixArray<Algorithm>(test_str, kNumChar); | 
|  | EXPECT_EQ(test_str.size(), suffix_array.size()); | 
|  |  | 
|  | // Expect that I[] is a permutation of [0, len]. | 
|  | std::vector<size_t> sorted_suffix(suffix_array.begin(), suffix_array.end()); | 
|  | std::sort(sorted_suffix.begin(), sorted_suffix.end()); | 
|  | for (size_t i = 0; i < test_str.size(); ++i) | 
|  | EXPECT_EQ(i, sorted_suffix[i]); | 
|  |  | 
|  | // Expect that all suffixes are strictly ordered. | 
|  | auto end = test_str.end(); | 
|  | for (size_t i = 1; i < test_str.size(); ++i) { | 
|  | auto suf1 = test_str.begin() + suffix_array[i - 1]; | 
|  | auto suf2 = test_str.begin() + suffix_array[i]; | 
|  | bool is_less = std::lexicographical_compare(suf1, end, suf2, end); | 
|  | EXPECT_TRUE(is_less); | 
|  | } | 
|  | } | 
|  |  | 
|  | constexpr const char* test_strs[] = { | 
|  | "", | 
|  | "a", | 
|  | "aa", | 
|  | "za", | 
|  | "CACAO", | 
|  | "aaaaa", | 
|  | "banana", | 
|  | "tobeornottobe", | 
|  | "The quick brown fox jumps over the lazy dog.", | 
|  | "elephantelephantelephantelephantelephant", | 
|  | "walawalawashington", | 
|  | "-------------------------", | 
|  | "011010011001011010010110011010010", | 
|  | "3141592653589793238462643383279502884197169399375105", | 
|  | "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD", | 
|  | "abccbaabccbaabccbaabccbaabccbaabccbaabccbaabccba", | 
|  | "0123456789876543210", | 
|  | "9876543210123456789", | 
|  | "aababcabcdabcdeabcdefabcdefg", | 
|  | "asdhklgalksdjghalksdjghalksdjgh", | 
|  | }; | 
|  |  | 
|  | TEST(SuffixSortTest, NaiveSuffixSort) { | 
|  | for (const std::string& test_str : test_strs) { | 
|  | TestSuffixSort<NaiveSuffixSort>(MakeUnsignedString(test_str)); | 
|  | } | 
|  | } | 
|  |  | 
|  | TEST(SuffixSortTest, InducedSuffixSortSort) { | 
|  | for (const std::string& test_str : test_strs) { | 
|  | TestSuffixSort<InducedSuffixSort>(MakeUnsignedString(test_str)); | 
|  | } | 
|  | } | 
|  |  | 
|  | // Test with sequence that has every character. | 
|  | TEST(SuffixSortTest, AllChar) { | 
|  | std::vector<unsigned char> all_char(kNumChar); | 
|  | std::iota(all_char.begin(), all_char.end(), 0); | 
|  |  | 
|  | { | 
|  | std::vector<size_t> suffix_array = | 
|  | MakeSuffixArray<InducedSuffixSort>(all_char, kNumChar); | 
|  | for (size_t i = 0; i < kNumChar; ++i) | 
|  | EXPECT_EQ(i, suffix_array[i]); | 
|  | } | 
|  |  | 
|  | std::vector<unsigned char> all_char_reverse(all_char.rbegin(), | 
|  | all_char.rend()); | 
|  | { | 
|  | std::vector<size_t> suffix_array = | 
|  | MakeSuffixArray<InducedSuffixSort>(all_char_reverse, kNumChar); | 
|  | for (size_t i = 0; i < kNumChar; ++i) | 
|  | EXPECT_EQ(kNumChar - i - 1, suffix_array[i]); | 
|  | } | 
|  | } | 
|  |  | 
|  | void TestSuffixLowerBound(ustring base_str, ustring search_str) { | 
|  | std::vector<size_t> suffix_array = | 
|  | MakeSuffixArray<NaiveSuffixSort>(base_str, kNumChar); | 
|  |  | 
|  | auto pos = SuffixLowerBound(suffix_array, base_str.begin(), | 
|  | search_str.begin(), search_str.end()); | 
|  |  | 
|  | auto end = base_str.end(); | 
|  | if (pos != suffix_array.begin()) { | 
|  | // Previous suffix is less than |search_str|. | 
|  | auto suf = base_str.begin() + pos[-1]; | 
|  | bool is_less = std::lexicographical_compare(suf, end, search_str.begin(), | 
|  | search_str.end()); | 
|  | EXPECT_TRUE(is_less); | 
|  | } | 
|  | if (pos != suffix_array.end()) { | 
|  | // Current suffix is greater of equal to |search_str|. | 
|  | auto suf = base_str.begin() + *pos; | 
|  | bool is_less = std::lexicographical_compare(suf, end, search_str.begin(), | 
|  | search_str.end()); | 
|  | EXPECT_FALSE(is_less); | 
|  | } | 
|  | } | 
|  |  | 
|  | TEST(SuffixArrayTest, LowerBound) { | 
|  | auto us = MakeUnsignedString; | 
|  |  | 
|  | TestSuffixLowerBound(us(""), us("")); | 
|  | TestSuffixLowerBound(us(""), us("a")); | 
|  | TestSuffixLowerBound(us("b"), us("")); | 
|  | TestSuffixLowerBound(us("b"), us("a")); | 
|  | TestSuffixLowerBound(us("b"), us("c")); | 
|  | TestSuffixLowerBound(us("b"), us("bc")); | 
|  | TestSuffixLowerBound(us("aa"), us("a")); | 
|  | TestSuffixLowerBound(us("aa"), us("aa")); | 
|  |  | 
|  | ustring sentence = us("the quick brown fox jumps over the lazy dog."); | 
|  | // Entire string: exact and unique. | 
|  | TestSuffixLowerBound(sentence, sentence); | 
|  | // Empty string: exact and non-unique. | 
|  | TestSuffixLowerBound(sentence, us("")); | 
|  | // Exact and unique suffix matches. | 
|  | TestSuffixLowerBound(sentence, us(".")); | 
|  | TestSuffixLowerBound(sentence, us("the lazy dog.")); | 
|  | // Exact and unique non-suffix matches. | 
|  | TestSuffixLowerBound(sentence, us("quick")); | 
|  | TestSuffixLowerBound(sentence, us("the quick")); | 
|  | // Partial and unique matches. | 
|  | TestSuffixLowerBound(sentence, us("fox jumps with the hosps")); | 
|  | TestSuffixLowerBound(sentence, us("xyz")); | 
|  | // Exact and non-unique match: take lexicographical first. | 
|  | TestSuffixLowerBound(sentence, us("the")); | 
|  | TestSuffixLowerBound(sentence, us(" ")); | 
|  | // Partial and non-unique match. | 
|  | // query      < "the l"... < "the q"... | 
|  | TestSuffixLowerBound(sentence, us("the apple")); | 
|  | // "the l"... < query      < "the q"... | 
|  | TestSuffixLowerBound(sentence, us("the opera")); | 
|  | // "the l"... < "the q"... < query | 
|  | TestSuffixLowerBound(sentence, us("the zebra")); | 
|  | // Prefix match dominates suffix match (unique). | 
|  | TestSuffixLowerBound(sentence, us("over quick brown fox")); | 
|  | // Empty matchs. | 
|  | TestSuffixLowerBound(sentence, us(",")); | 
|  | TestSuffixLowerBound(sentence, us("1234")); | 
|  | TestSuffixLowerBound(sentence, us("THE QUICK BROWN FOX")); | 
|  | TestSuffixLowerBound(sentence, us("(the")); | 
|  | } | 
|  |  | 
|  | TEST(SuffixArrayTest, LowerBoundExact) { | 
|  | for (const std::string& test_str : test_strs) { | 
|  | ustring test_ustr = MakeUnsignedString(test_str); | 
|  |  | 
|  | std::vector<size_t> suffix_array = | 
|  | MakeSuffixArray<InducedSuffixSort>(test_ustr, kNumChar); | 
|  |  | 
|  | for (size_t lo = 0; lo < test_str.size(); ++lo) { | 
|  | for (size_t hi = lo + 1; hi <= test_str.size(); ++hi) { | 
|  | ustring query(test_ustr.begin() + lo, test_ustr.begin() + hi); | 
|  | ASSERT_EQ(query.size(), hi - lo); | 
|  | auto pos = SuffixLowerBound(suffix_array, test_ustr.begin(), | 
|  | query.begin(), query.end()); | 
|  | EXPECT_TRUE( | 
|  | std::equal(query.begin(), query.end(), test_ustr.begin() + *pos)); | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | }  // namespace zucchini |