| // Copyright 2017 The Chromium Authors. All rights reserved. | 
 | // Use of this source code is governed by a BSD-style license that can be | 
 | // found in the LICENSE file. | 
 |  | 
 | #ifndef COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_ | 
 | #define COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_ | 
 |  | 
 | #include <algorithm> | 
 | #include <iterator> | 
 | #include <numeric> | 
 | #include <vector> | 
 |  | 
 | #include "base/logging.h" | 
 | #include "base/macros.h" | 
 |  | 
 | namespace zucchini { | 
 |  | 
 | // A functor class that implements the naive suffix sorting algorithm that uses | 
 | // std::sort with lexicographical compare. This is only meant as reference of | 
 | // the interface. | 
 | class NaiveSuffixSort { | 
 |  public: | 
 |   // Type requirements: | 
 |   // |InputRng| is an input random access range. | 
 |   // |KeyType| is an unsigned integer type. | 
 |   // |SAIt| is a random access iterator with mutable references. | 
 |   template <class InputRng, class KeyType, class SAIt> | 
 |   // |str| is the input string on which suffix sort is applied. | 
 |   // Characters found in |str| must be in the range [0, |key_bound|) | 
 |   // |suffix_array| is the beginning of the destination range, which is at least | 
 |   // as large as |str|. | 
 |   void operator()(const InputRng& str, | 
 |                   KeyType key_bound, | 
 |                   SAIt suffix_array) const { | 
 |     using size_type = typename SAIt::value_type; | 
 |  | 
 |     size_type n = static_cast<size_type>(std::end(str) - std::begin(str)); | 
 |  | 
 |     // |suffix_array| is first filled with ordered indices of |str|. | 
 |     // Those indices are then sorted with lexicographical comparisons in |str|. | 
 |     std::iota(suffix_array, suffix_array + n, 0); | 
 |     std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) { | 
 |       return std::lexicographical_compare(std::begin(str) + i, std::end(str), | 
 |                                           std::begin(str) + j, std::end(str)); | 
 |     }); | 
 |   } | 
 | }; | 
 |  | 
 | // A functor class that implements suffix array induced sorting (SA-IS) | 
 | // algorithm with linear time and memory complexity, | 
 | // see http://ieeexplore.ieee.org/abstract/document/5582081/ | 
 | class InducedSuffixSort { | 
 |  public: | 
 |   // Type requirements: | 
 |   // |InputRng| is an input random access range. | 
 |   // |KeyType| is an unsigned integer type. | 
 |   // |SAIt| is a random access iterator with mutable values. | 
 |   template <class InputRng, class KeyType, class SAIt> | 
 |   // |str| is the input string on which suffix sort is applied. | 
 |   // Characters found in |str| must be in the range [0, |key_bound|) | 
 |   // |suffix_array| is the beginning of the destination range, which is at least | 
 |   // as large as |str|. | 
 |   void operator()(const InputRng& str, | 
 |                   KeyType key_bound, | 
 |                   SAIt suffix_array) const { | 
 |     using value_type = typename InputRng::value_type; | 
 |     using size_type = typename SAIt::value_type; | 
 |  | 
 |     static_assert(std::is_unsigned<value_type>::value, | 
 |                   "SA-IS only supports input string with unsigned values"); | 
 |     static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned"); | 
 |  | 
 |     size_type n = static_cast<size_type>(std::end(str) - std::begin(str)); | 
 |  | 
 |     Implementation<size_type, KeyType>::SuffixSort(std::begin(str), n, | 
 |                                                    key_bound, suffix_array); | 
 |   } | 
 |  | 
 |   // Given string S of length n. We assume S is terminated by a unique sentinel | 
 |   // $, which is considered as the smallest character. This sentinel does not | 
 |   // exist in memory and is only treated implicitly, hence |n| does not count | 
 |   // the sentinel in this implementation. We denote suf(S,i) the suffix formed | 
 |   // by S[i..n). | 
 |  | 
 |   // A suffix suf(S,i) is said to be S-type or L-type, if suf(S,i) < suf(S,i+1) | 
 |   // or suf(S,i) > suf(S,i+1), respectively. | 
 |   enum SLType : bool { SType, LType }; | 
 |  | 
 |   // A character S[i] is said to be S-type or L-type if the suffix suf(S,i) is | 
 |   // S-type or L-type, respectively. | 
 |  | 
 |   // A character S[i] is called LMS (leftmost S-type), if S[i] is S-type and | 
 |   // S[i-1] is L-type. A suffix suf(S,i) is called LMS, if S[i] is an LMS | 
 |   // character. | 
 |  | 
 |   // A substring S[i..j) is an LMS-substring if | 
 |   // (1) S[i] is LMS, S[j] is LMS or the sentinel $, and S[i..j) has no other | 
 |   //     LMS characters, or | 
 |   // (2) S[i..j) is the sentinel $. | 
 |  | 
 |   template <class SizeType, class KeyType> | 
 |   struct Implementation { | 
 |     static_assert(std::is_unsigned<SizeType>::value, | 
 |                   "SizeType must be unsigned"); | 
 |     static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned"); | 
 |     using size_type = SizeType; | 
 |     using key_type = KeyType; | 
 |  | 
 |     using iterator = typename std::vector<size_type>::iterator; | 
 |     using const_iterator = typename std::vector<size_type>::const_iterator; | 
 |  | 
 |     // Partition every suffix based on SL-type. Returns the number of LMS | 
 |     // suffixes. | 
 |     template <class StrIt> | 
 |     static size_type BuildSLPartition( | 
 |         StrIt str, | 
 |         size_type length, | 
 |         key_type key_bound, | 
 |         std::vector<SLType>::reverse_iterator sl_partition_it) { | 
 |       // We will count LMS suffixes (S to L-type or last S-type). | 
 |       size_type lms_count = 0; | 
 |  | 
 |       // |previous_type| is initialized to L-type to avoid counting an extra | 
 |       // LMS suffix at the end | 
 |       SLType previous_type = LType; | 
 |  | 
 |       // Initialized to dummy, impossible key. | 
 |       key_type previous_key = key_bound; | 
 |  | 
 |       // We're travelling backward to determine the partition, | 
 |       // as if we prepend one character at a time to the string, ex: | 
 |       // b$ is L-type because b > $. | 
 |       // ab$ is S-type because a < b, implying ab$ < b$. | 
 |       // bab$ is L-type because b > a, implying bab$ > ab$. | 
 |       // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$. | 
 |       for (auto str_it = std::reverse_iterator<StrIt>(str + length); | 
 |            str_it != std::reverse_iterator<StrIt>(str); | 
 |            ++str_it, ++sl_partition_it) { | 
 |         key_type current_key = *str_it; | 
 |  | 
 |         if (current_key > previous_key || previous_key == key_bound) { | 
 |           // S[i] > S[i + 1] or S[i] is last character. | 
 |           if (previous_type == SType) | 
 |             // suf(S,i) is L-type and suf(S,i + 1) is S-type, therefore, | 
 |             // suf(S,i+1) was a LMS suffix. | 
 |             ++lms_count; | 
 |  | 
 |           previous_type = LType;  // For next round. | 
 |         } else if (current_key < previous_key) { | 
 |           // S[i] < S[i + 1] | 
 |           previous_type = SType;  // For next round. | 
 |         } | 
 |         // Else, S[i] == S[i + 1]: | 
 |         // The next character that differs determines the SL-type, | 
 |         // so we reuse the last seen type. | 
 |  | 
 |         *sl_partition_it = previous_type; | 
 |         previous_key = current_key;  // For next round. | 
 |       } | 
 |  | 
 |       return lms_count; | 
 |     } | 
 |  | 
 |     // Find indices of LMS suffixes and write result to |lms_indices|. | 
 |     static void FindLmsSuffixes(const std::vector<SLType>& sl_partition, | 
 |                                 iterator lms_indices) { | 
 |       // |previous_type| is initialized to S-type to avoid counting an extra | 
 |       // LMS suffix at the beginning | 
 |       SLType previous_type = SType; | 
 |       for (size_type i = 0; i < sl_partition.size(); ++i) { | 
 |         if (sl_partition[i] == SType && previous_type == LType) | 
 |           *lms_indices++ = i; | 
 |         previous_type = sl_partition[i]; | 
 |       } | 
 |     } | 
 |  | 
 |     template <class StrIt> | 
 |     static std::vector<size_type> MakeBucketCount(StrIt str, | 
 |                                                   size_type length, | 
 |                                                   key_type key_bound) { | 
 |       // Occurrence of every unique character is counted in |buckets| | 
 |       std::vector<size_type> buckets(static_cast<size_type>(key_bound)); | 
 |  | 
 |       for (auto it = str; it != str + length; ++it) | 
 |         ++buckets[*it]; | 
 |       return buckets; | 
 |     } | 
 |  | 
 |     // Apply induced sort from |lms_indices| to |suffix_array| associated with | 
 |     // the string |str|. | 
 |     template <class StrIt, class SAIt> | 
 |     static void InducedSort(StrIt str, | 
 |                             size_type length, | 
 |                             const std::vector<SLType>& sl_partition, | 
 |                             const std::vector<size_type>& lms_indices, | 
 |                             const std::vector<size_type>& buckets, | 
 |                             SAIt suffix_array) { | 
 |       // All indices are first marked as unset with the illegal value |length|. | 
 |       std::fill(suffix_array, suffix_array + length, length); | 
 |  | 
 |       // Used to mark bucket boundaries (head or end) as indices in str. | 
 |       DCHECK(!buckets.empty()); | 
 |       std::vector<size_type> bucket_bounds(buckets.size()); | 
 |  | 
 |       // Step 1: Assign indices for LMS suffixes, populating the end of | 
 |       // respective buckets but keeping relative order. | 
 |  | 
 |       // Find the end of each bucket and write it to |bucket_bounds|. | 
 |       std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin()); | 
 |  | 
 |       // Process each |lms_indices| backward, and assign them to the end of | 
 |       // their respective buckets, so relative order is preserved. | 
 |       for (auto it = lms_indices.crbegin(); it != lms_indices.crend(); ++it) { | 
 |         key_type key = str[*it]; | 
 |         suffix_array[--bucket_bounds[key]] = *it; | 
 |       } | 
 |  | 
 |       // Step 2 | 
 |       // Scan forward |suffix_array|; for each modified suf(S,i) for which | 
 |       // suf(S,SA(i) - 1) is L-type, place suf(S,SA(i) - 1) to the current | 
 |       // head of the corresponding bucket and forward the bucket head to the | 
 |       // right. | 
 |  | 
 |       // Find the head of each bucket and write it to |bucket_bounds|. Since | 
 |       // only LMS suffixes where inserted in |suffix_array| during Step 1, | 
 |       // |bucket_bounds| does not contains the head of each bucket and needs to | 
 |       // be updated. | 
 |       bucket_bounds[0] = 0; | 
 |       std::partial_sum(buckets.begin(), buckets.end() - 1, | 
 |                        bucket_bounds.begin() + 1); | 
 |  | 
 |       // From Step 1, the sentinel $, which we treat implicitly, would have | 
 |       // been placed at the beginning of |suffix_array|, since $ is always | 
 |       // considered as the smallest character. We then have to deal with the | 
 |       // previous (last) suffix. | 
 |       if (sl_partition[length - 1] == LType) { | 
 |         key_type key = str[length - 1]; | 
 |         suffix_array[bucket_bounds[key]++] = length - 1; | 
 |       } | 
 |       for (auto it = suffix_array; it != suffix_array + length; ++it) { | 
 |         size_type suffix_index = *it; | 
 |  | 
 |         // While the original algorithm marks unset suffixes with -1, | 
 |         // we found that marking them with |length| is also possible and more | 
 |         // convenient because we are working with unsigned integers. | 
 |         if (suffix_index != length && suffix_index > 0 && | 
 |             sl_partition[--suffix_index] == LType) { | 
 |           key_type key = str[suffix_index]; | 
 |           suffix_array[bucket_bounds[key]++] = suffix_index; | 
 |         } | 
 |       } | 
 |  | 
 |       // Step 3 | 
 |       // Scan backward |suffix_array|; for each modified suf(S, i) for which | 
 |       // suf(S,SA(i) - 1) is S-type, place suf(S,SA(i) - 1) to the current | 
 |       // end of the corresponding bucket and forward the bucket head to the | 
 |       // left. | 
 |  | 
 |       // Find the end of each bucket and write it to |bucket_bounds|. Since | 
 |       // only L-type suffixes where inserted in |suffix_array| during Step 2, | 
 |       // |bucket_bounds| does not contain the end of each bucket and needs to | 
 |       // be updated. | 
 |       std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin()); | 
 |  | 
 |       for (auto it = std::reverse_iterator<SAIt>(suffix_array + length); | 
 |            it != std::reverse_iterator<SAIt>(suffix_array); ++it) { | 
 |         size_type suffix_index = *it; | 
 |         if (suffix_index != length && suffix_index > 0 && | 
 |             sl_partition[--suffix_index] == SType) { | 
 |           key_type key = str[suffix_index]; | 
 |           suffix_array[--bucket_bounds[key]] = suffix_index; | 
 |         } | 
 |       } | 
 |       // Deals with the last suffix, because of the sentinel. | 
 |       if (sl_partition[length - 1] == SType) { | 
 |         key_type key = str[length - 1]; | 
 |         suffix_array[--bucket_bounds[key]] = length - 1; | 
 |       } | 
 |     } | 
 |  | 
 |     // Given a string S starting at |str| with length |length|, an array | 
 |     // starting at |substring_array| containing lexicographically ordered LMS | 
 |     // terminated substring indices of S and an SL-Type partition |sl_partition| | 
 |     // of S, assigns a unique label to every unique LMS substring. The sorted | 
 |     // labels for all LMS substrings are written to |lms_str|, while the indices | 
 |     // of LMS suffixes are written to |lms_indices|. In addition, returns the | 
 |     // total number of unique labels. | 
 |     template <class StrIt, class SAIt> | 
 |     static size_type LabelLmsSubstrings(StrIt str, | 
 |                                         size_type length, | 
 |                                         const std::vector<SLType>& sl_partition, | 
 |                                         SAIt suffix_array, | 
 |                                         iterator lms_indices, | 
 |                                         iterator lms_str) { | 
 |       // Labelling starts at 0. | 
 |       size_type label = 0; | 
 |  | 
 |       // |previous_lms| is initialized to 0 to indicate it is unset. | 
 |       // Note that suf(S,0) is never a LMS suffix. Substrings will be visited in | 
 |       // lexicographical order. | 
 |       size_type previous_lms = 0; | 
 |       for (auto it = suffix_array; it != suffix_array + length; ++it) { | 
 |         if (*it > 0 && sl_partition[*it] == SType && | 
 |             sl_partition[*it - 1] == LType) { | 
 |           // suf(S, *it) is a LMS suffix. | 
 |  | 
 |           size_type current_lms = *it; | 
 |           if (previous_lms != 0) { | 
 |             // There was a previous LMS suffix. Check if the current LMS | 
 |             // substring is equal to the previous one. | 
 |             SLType current_lms_type = SType; | 
 |             SLType previous_lms_type = SType; | 
 |             for (size_type k = 0;; ++k) { | 
 |               // |current_lms_end| and |previous_lms_end| denote whether we have | 
 |               // reached the end of the current and previous LMS substring, | 
 |               // respectively | 
 |               bool current_lms_end = false; | 
 |               bool previous_lms_end = false; | 
 |  | 
 |               // Check for both previous and current substring ends. | 
 |               // Note that it is more convenient to check if | 
 |               // suf(S,current_lms + k) is an LMS suffix than to retrieve it | 
 |               // from lms_indices. | 
 |               if (current_lms + k >= length || | 
 |                   (current_lms_type == LType && | 
 |                    sl_partition[current_lms + k] == SType)) { | 
 |                 current_lms_end = true; | 
 |               } | 
 |               if (previous_lms + k >= length || | 
 |                   (previous_lms_type == LType && | 
 |                    sl_partition[previous_lms + k] == SType)) { | 
 |                 previous_lms_end = true; | 
 |               } | 
 |  | 
 |               if (current_lms_end && previous_lms_end) { | 
 |                 break;  // Previous and current substrings are identical. | 
 |               } else if (current_lms_end != previous_lms_end || | 
 |                          str[current_lms + k] != str[previous_lms + k]) { | 
 |                 // Previous and current substrings differ, a new label is used. | 
 |                 ++label; | 
 |                 break; | 
 |               } | 
 |  | 
 |               current_lms_type = sl_partition[current_lms + k]; | 
 |               previous_lms_type = sl_partition[previous_lms + k]; | 
 |             } | 
 |           } | 
 |           *lms_indices++ = *it; | 
 |           *lms_str++ = label; | 
 |           previous_lms = current_lms; | 
 |         } | 
 |       } | 
 |  | 
 |       return label + 1; | 
 |     } | 
 |  | 
 |     // Implementation of the SA-IS algorithm. |str| must be a random access | 
 |     // iterator pointing at the beginning of S with length |length|. The result | 
 |     // is writtend in |suffix_array|, a random access iterator. | 
 |     template <class StrIt, class SAIt> | 
 |     static void SuffixSort(StrIt str, | 
 |                            size_type length, | 
 |                            key_type key_bound, | 
 |                            SAIt suffix_array) { | 
 |       if (length == 1) | 
 |         *suffix_array = 0; | 
 |       if (length < 2) | 
 |         return; | 
 |  | 
 |       std::vector<SLType> sl_partition(length); | 
 |       size_type lms_count = | 
 |           BuildSLPartition(str, length, key_bound, sl_partition.rbegin()); | 
 |       std::vector<size_type> lms_indices(lms_count); | 
 |       FindLmsSuffixes(sl_partition, lms_indices.begin()); | 
 |       std::vector<size_type> buckets = MakeBucketCount(str, length, key_bound); | 
 |  | 
 |       if (lms_indices.size() > 1) { | 
 |         // Given |lms_indices| in the same order they appear in |str|, induce | 
 |         // LMS substrings relative order and write result to |suffix_array|. | 
 |         InducedSort(str, length, sl_partition, lms_indices, buckets, | 
 |                     suffix_array); | 
 |         std::vector<size_type> lms_str(lms_indices.size()); | 
 |  | 
 |         // Given LMS substrings in relative order found in |suffix_array|, | 
 |         // map LMS substrings to unique labels to form a new string, |lms_str|. | 
 |         size_type label_count = | 
 |             LabelLmsSubstrings(str, length, sl_partition, suffix_array, | 
 |                                lms_indices.begin(), lms_str.begin()); | 
 |  | 
 |         if (label_count < lms_str.size()) { | 
 |           // Reorder |lms_str| to have LMS suffixes in the same order they | 
 |           // appear in |str|. | 
 |           for (size_type i = 0; i < lms_indices.size(); ++i) | 
 |             suffix_array[lms_indices[i]] = lms_str[i]; | 
 |  | 
 |           SLType previous_type = SType; | 
 |           for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) { | 
 |             if (sl_partition[i] == SType && previous_type == LType) { | 
 |               lms_str[j] = suffix_array[i]; | 
 |               lms_indices[j++] = i; | 
 |             } | 
 |             previous_type = sl_partition[i]; | 
 |           } | 
 |  | 
 |           // Recursively apply SuffixSort on |lms_str|, which is formed from | 
 |           // labeled LMS suffixes in the same order they appear in |str|. | 
 |           // Note that |KeyType| will be size_type because |lms_str| contains | 
 |           // indices. |lms_str| is at most half the length of |str|. | 
 |           Implementation<size_type, size_type>::SuffixSort( | 
 |               lms_str.begin(), static_cast<size_type>(lms_str.size()), | 
 |               label_count, suffix_array); | 
 |  | 
 |           // Map LMS labels back to indices in |str| and write result to | 
 |           // |lms_indices|. We're using |suffix_array| as a temporary buffer. | 
 |           for (size_type i = 0; i < lms_indices.size(); ++i) | 
 |             suffix_array[i] = lms_indices[suffix_array[i]]; | 
 |           std::copy_n(suffix_array, lms_indices.size(), lms_indices.begin()); | 
 |  | 
 |           // At this point, |lms_indices| contains sorted LMS suffixes of |str|. | 
 |         } | 
 |       } | 
 |       // Given |lms_indices| where LMS suffixes are sorted, induce the full | 
 |       // order of suffixes in |str|. | 
 |       InducedSort(str, length, sl_partition, lms_indices, buckets, | 
 |                   suffix_array); | 
 |     } | 
 |  | 
 |    private: | 
 |     DISALLOW_IMPLICIT_CONSTRUCTORS(Implementation); | 
 |   }; | 
 | }; | 
 |  | 
 | // Generates a sorted suffix array for the input string |str| using the functor | 
 | // |Algorithm| which provides an interface equivalent to NaiveSuffixSort. | 
 | /// Characters found in |str| are assumed to be in range [0, |key_bound|). | 
 | // Returns the suffix array as a vector. | 
 | // |StrRng| is an input random access range. | 
 | // |KeyType| is an unsigned integer type. | 
 | template <class Algorithm, class StrRng, class KeyType> | 
 | std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str, | 
 |                                                         KeyType key_bound) { | 
 |   Algorithm sort; | 
 |   std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin()); | 
 |   sort(str, key_bound, suffix_array.begin()); | 
 |   return suffix_array; | 
 | } | 
 |  | 
 | // Type requirements: | 
 | // |SARng| is an input random access range. | 
 | // |StrIt1| is a random access iterator. | 
 | // |StrIt2| is a forward iterator. | 
 | template <class SARng, class StrIt1, class StrIt2> | 
 | // Lexicographical lower bound using binary search for | 
 | // [|str2_first|, |str2_last|) in the suffix array |suffix_array| of a string | 
 | // starting at |str1_first|. This does not necessarily return the index of | 
 | // the longest matching substring. | 
 | auto SuffixLowerBound(const SARng& suffix_array, | 
 |                       StrIt1 str1_first, | 
 |                       StrIt2 str2_first, | 
 |                       StrIt2 str2_last) -> decltype(std::begin(suffix_array)) { | 
 |   using size_type = typename SARng::value_type; | 
 |  | 
 |   size_t n = std::end(suffix_array) - std::begin(suffix_array); | 
 |   auto it = std::lower_bound( | 
 |       std::begin(suffix_array), std::end(suffix_array), str2_first, | 
 |       [str1_first, str2_last, n](size_type a, StrIt2 b) { | 
 |         return std::lexicographical_compare(str1_first + a, str1_first + n, b, | 
 |                                             str2_last); | 
 |       }); | 
 |   return it; | 
 | } | 
 |  | 
 | }  // namespace zucchini | 
 |  | 
 | #endif  // COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_ |