components/zucchini/suffix_array.h - chromium/src.git - Git at Google

 // Copyright 2017 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #ifndef COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_
 #define COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_

 #include <algorithm>
 #include <iterator>
 #include <numeric>
 #include <vector>

 #include "base/logging.h"
 #include "base/macros.h"

 namespace zucchini {

 // A functor class that implements the naive suffix sorting algorithm that uses
 // std::sort with lexicographical compare. This is only meant as reference of
 // the interface.
 class NaiveSuffixSort {
  public:
   // Type requirements:
   // |InputRng| is an input random access range.
   // |KeyType| is an unsigned integer type.
   // |SAIt| is a random access iterator with mutable references.
   template <class InputRng, class KeyType, class SAIt>
   // |str| is the input string on which suffix sort is applied.
   // Characters found in |str| must be in the range [0, |key_bound|)
   // |suffix_array| is the beginning of the destination range, which is at least
   // as large as |str|.
   void operator()(const InputRng& str,
                   KeyType key_bound,
                   SAIt suffix_array) const {
     using size_type = typename SAIt::value_type;

     size_type n = static_cast<size_type>(std::end(str) - std::begin(str));

     // |suffix_array| is first filled with ordered indices of |str|.
     // Those indices are then sorted with lexicographical comparisons in |str|.
     std::iota(suffix_array, suffix_array + n, 0);
     std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) {
       return std::lexicographical_compare(std::begin(str) + i, std::end(str),
                                           std::begin(str) + j, std::end(str));
     });
   }
 };

 // A functor class that implements suffix array induced sorting (SA-IS)
 // algorithm with linear time and memory complexity,
 // see http://ieeexplore.ieee.org/abstract/document/5582081/
 class InducedSuffixSort {
  public:
   // Type requirements:
   // |InputRng| is an input random access range.
   // |KeyType| is an unsigned integer type.
   // |SAIt| is a random access iterator with mutable values.
   template <class InputRng, class KeyType, class SAIt>
   // |str| is the input string on which suffix sort is applied.
   // Characters found in |str| must be in the range [0, |key_bound|)
   // |suffix_array| is the beginning of the destination range, which is at least
   // as large as |str|.
   void operator()(const InputRng& str,
                   KeyType key_bound,
                   SAIt suffix_array) const {
     using value_type = typename InputRng::value_type;
     using size_type = typename SAIt::value_type;

     static_assert(std::is_unsigned<value_type>::value,
                   "SA-IS only supports input string with unsigned values");
     static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");

     size_type n = static_cast<size_type>(std::end(str) - std::begin(str));

     Implementation<size_type, KeyType>::SuffixSort(std::begin(str), n,
                                                    key_bound, suffix_array);
   }

   // Given string S of length n. We assume S is terminated by a unique sentinel
   // $, which is considered as the smallest character. This sentinel does not
   // exist in memory and is only treated implicitly, hence |n| does not count
   // the sentinel in this implementation. We denote suf(S,i) the suffix formed
   // by S[i..n).

   // A suffix suf(S,i) is said to be S-type or L-type, if suf(S,i) < suf(S,i+1)
   // or suf(S,i) > suf(S,i+1), respectively.
   enum SLType : bool { SType, LType };

   // A character S[i] is said to be S-type or L-type if the suffix suf(S,i) is
   // S-type or L-type, respectively.

   // A character S[i] is called LMS (leftmost S-type), if S[i] is S-type and
   // S[i-1] is L-type. A suffix suf(S,i) is called LMS, if S[i] is an LMS
   // character.

   // A substring S[i..j) is an LMS-substring if
   // (1) S[i] is LMS, S[j] is LMS or the sentinel $, and S[i..j) has no other
   //     LMS characters, or
   // (2) S[i..j) is the sentinel $.

   template <class SizeType, class KeyType>
   struct Implementation {
     static_assert(std::is_unsigned<SizeType>::value,
                   "SizeType must be unsigned");
     static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
     using size_type = SizeType;
     using key_type = KeyType;

     using iterator = typename std::vector<size_type>::iterator;
     using const_iterator = typename std::vector<size_type>::const_iterator;

     // Partition every suffix based on SL-type. Returns the number of LMS
     // suffixes.
     template <class StrIt>
     static size_type BuildSLPartition(
         StrIt str,
         size_type length,
         key_type key_bound,
         std::vector<SLType>::reverse_iterator sl_partition_it) {
       // We will count LMS suffixes (S to L-type or last S-type).
       size_type lms_count = 0;

       // |previous_type| is initialized to L-type to avoid counting an extra
       // LMS suffix at the end
       SLType previous_type = LType;

       // Initialized to dummy, impossible key.
       key_type previous_key = key_bound;

       // We're travelling backward to determine the partition,
       // as if we prepend one character at a time to the string, ex:
       // b$ is L-type because b > $.
       // ab$ is S-type because a < b, implying ab$ < b$.
       // bab$ is L-type because b > a, implying bab$ > ab$.
       // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$.
       for (auto str_it = std::reverse_iterator<StrIt>(str + length);
            str_it != std::reverse_iterator<StrIt>(str);
            ++str_it, ++sl_partition_it) {
         key_type current_key = *str_it;

         if (current_key > previous_key || previous_key == key_bound) {
           // S[i] > S[i + 1] or S[i] is last character.
           if (previous_type == SType)
             // suf(S,i) is L-type and suf(S,i + 1) is S-type, therefore,
             // suf(S,i+1) was a LMS suffix.
             ++lms_count;

           previous_type = LType;  // For next round.
         } else if (current_key < previous_key) {
           // S[i] < S[i + 1]
           previous_type = SType;  // For next round.
         }
         // Else, S[i] == S[i + 1]:
         // The next character that differs determines the SL-type,
         // so we reuse the last seen type.

         *sl_partition_it = previous_type;
         previous_key = current_key;  // For next round.
       }

       return lms_count;
     }

     // Find indices of LMS suffixes and write result to |lms_indices|.
     static void FindLmsSuffixes(const std::vector<SLType>& sl_partition,
                                 iterator lms_indices) {
       // |previous_type| is initialized to S-type to avoid counting an extra
       // LMS suffix at the beginning
       SLType previous_type = SType;
       for (size_type i = 0; i < sl_partition.size(); ++i) {
         if (sl_partition[i] == SType && previous_type == LType)
           *lms_indices++ = i;
         previous_type = sl_partition[i];
       }
     }

     template <class StrIt>
     static std::vector<size_type> MakeBucketCount(StrIt str,
                                                   size_type length,
                                                   key_type key_bound) {
       // Occurrence of every unique character is counted in |buckets|
       std::vector<size_type> buckets(static_cast<size_type>(key_bound));

       for (auto it = str; it != str + length; ++it)
         ++buckets[*it];
       return buckets;
     }

     // Apply induced sort from |lms_indices| to |suffix_array| associated with
     // the string |str|.
     template <class StrIt, class SAIt>
     static void InducedSort(StrIt str,
                             size_type length,
                             const std::vector<SLType>& sl_partition,
                             const std::vector<size_type>& lms_indices,
                             const std::vector<size_type>& buckets,
                             SAIt suffix_array) {
       // All indices are first marked as unset with the illegal value |length|.
       std::fill(suffix_array, suffix_array + length, length);

       // Used to mark bucket boundaries (head or end) as indices in str.
       DCHECK(!buckets.empty());
       std::vector<size_type> bucket_bounds(buckets.size());

       // Step 1: Assign indices for LMS suffixes, populating the end of
       // respective buckets but keeping relative order.

       // Find the end of each bucket and write it to |bucket_bounds|.
       std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());

       // Process each |lms_indices| backward, and assign them to the end of
       // their respective buckets, so relative order is preserved.
       for (auto it = lms_indices.crbegin(); it != lms_indices.crend(); ++it) {
         key_type key = str[*it];
         suffix_array[--bucket_bounds[key]] = *it;
       }

       // Step 2
       // Scan forward |suffix_array|; for each modified suf(S,i) for which
       // suf(S,SA(i) - 1) is L-type, place suf(S,SA(i) - 1) to the current
       // head of the corresponding bucket and forward the bucket head to the
       // right.

       // Find the head of each bucket and write it to |bucket_bounds|. Since
       // only LMS suffixes where inserted in |suffix_array| during Step 1,
       // |bucket_bounds| does not contains the head of each bucket and needs to
       // be updated.
       bucket_bounds[0] = 0;
       std::partial_sum(buckets.begin(), buckets.end() - 1,
                        bucket_bounds.begin() + 1);

       // From Step 1, the sentinel $, which we treat implicitly, would have
       // been placed at the beginning of |suffix_array|, since $ is always
       // considered as the smallest character. We then have to deal with the
       // previous (last) suffix.
       if (sl_partition[length - 1] == LType) {
         key_type key = str[length - 1];
         suffix_array[bucket_bounds[key]++] = length - 1;
       }
       for (auto it = suffix_array; it != suffix_array + length; ++it) {
         size_type suffix_index = *it;

         // While the original algorithm marks unset suffixes with -1,
         // we found that marking them with |length| is also possible and more
         // convenient because we are working with unsigned integers.
         if (suffix_index != length && suffix_index > 0 &&
             sl_partition[--suffix_index] == LType) {
           key_type key = str[suffix_index];
           suffix_array[bucket_bounds[key]++] = suffix_index;
         }
       }

       // Step 3
       // Scan backward |suffix_array|; for each modified suf(S, i) for which
       // suf(S,SA(i) - 1) is S-type, place suf(S,SA(i) - 1) to the current
       // end of the corresponding bucket and forward the bucket head to the
       // left.

       // Find the end of each bucket and write it to |bucket_bounds|. Since
       // only L-type suffixes where inserted in |suffix_array| during Step 2,
       // |bucket_bounds| does not contain the end of each bucket and needs to
       // be updated.
       std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());

       for (auto it = std::reverse_iterator<SAIt>(suffix_array + length);
            it != std::reverse_iterator<SAIt>(suffix_array); ++it) {
         size_type suffix_index = *it;
         if (suffix_index != length && suffix_index > 0 &&
             sl_partition[--suffix_index] == SType) {
           key_type key = str[suffix_index];
           suffix_array[--bucket_bounds[key]] = suffix_index;
         }
       }
       // Deals with the last suffix, because of the sentinel.
       if (sl_partition[length - 1] == SType) {
         key_type key = str[length - 1];
         suffix_array[--bucket_bounds[key]] = length - 1;
       }
     }

     // Given a string S starting at |str| with length |length|, an array
     // starting at |substring_array| containing lexicographically ordered LMS
     // terminated substring indices of S and an SL-Type partition |sl_partition|
     // of S, assigns a unique label to every unique LMS substring. The sorted
     // labels for all LMS substrings are written to |lms_str|, while the indices
     // of LMS suffixes are written to |lms_indices|. In addition, returns the
     // total number of unique labels.
     template <class StrIt, class SAIt>
     static size_type LabelLmsSubstrings(StrIt str,
                                         size_type length,
                                         const std::vector<SLType>& sl_partition,
                                         SAIt suffix_array,
                                         iterator lms_indices,
                                         iterator lms_str) {
       // Labelling starts at 0.
       size_type label = 0;

       // |previous_lms| is initialized to 0 to indicate it is unset.
       // Note that suf(S,0) is never a LMS suffix. Substrings will be visited in
       // lexicographical order.
       size_type previous_lms = 0;
       for (auto it = suffix_array; it != suffix_array + length; ++it) {
         if (*it > 0 && sl_partition[*it] == SType &&
             sl_partition[*it - 1] == LType) {
           // suf(S, *it) is a LMS suffix.

           size_type current_lms = *it;
           if (previous_lms != 0) {
             // There was a previous LMS suffix. Check if the current LMS
             // substring is equal to the previous one.
             SLType current_lms_type = SType;
             SLType previous_lms_type = SType;
             for (size_type k = 0;; ++k) {
               // |current_lms_end| and |previous_lms_end| denote whether we have
               // reached the end of the current and previous LMS substring,
               // respectively
               bool current_lms_end = false;
               bool previous_lms_end = false;

               // Check for both previous and current substring ends.
               // Note that it is more convenient to check if
               // suf(S,current_lms + k) is an LMS suffix than to retrieve it
               // from lms_indices.
               if (current_lms + k >= length ||
                   (current_lms_type == LType &&
                    sl_partition[current_lms + k] == SType)) {
                 current_lms_end = true;
               }
               if (previous_lms + k >= length ||
                   (previous_lms_type == LType &&
                    sl_partition[previous_lms + k] == SType)) {
                 previous_lms_end = true;
               }

               if (current_lms_end && previous_lms_end) {
                 break;  // Previous and current substrings are identical.
               } else if (current_lms_end != previous_lms_end ||
                          str[current_lms + k] != str[previous_lms + k]) {
                 // Previous and current substrings differ, a new label is used.
                 ++label;
                 break;
               }

               current_lms_type = sl_partition[current_lms + k];
               previous_lms_type = sl_partition[previous_lms + k];
             }
           }
           *lms_indices++ = *it;
           *lms_str++ = label;
           previous_lms = current_lms;
         }
       }

       return label + 1;
     }

     // Implementation of the SA-IS algorithm. |str| must be a random access
     // iterator pointing at the beginning of S with length |length|. The result
     // is writtend in |suffix_array|, a random access iterator.
     template <class StrIt, class SAIt>
     static void SuffixSort(StrIt str,
                            size_type length,
                            key_type key_bound,
                            SAIt suffix_array) {
       if (length == 1)
         *suffix_array = 0;
       if (length < 2)
         return;

       std::vector<SLType> sl_partition(length);
       size_type lms_count =
           BuildSLPartition(str, length, key_bound, sl_partition.rbegin());
       std::vector<size_type> lms_indices(lms_count);
       FindLmsSuffixes(sl_partition, lms_indices.begin());
       std::vector<size_type> buckets = MakeBucketCount(str, length, key_bound);

       if (lms_indices.size() > 1) {
         // Given |lms_indices| in the same order they appear in |str|, induce
         // LMS substrings relative order and write result to |suffix_array|.
         InducedSort(str, length, sl_partition, lms_indices, buckets,
                     suffix_array);
         std::vector<size_type> lms_str(lms_indices.size());

         // Given LMS substrings in relative order found in |suffix_array|,
         // map LMS substrings to unique labels to form a new string, |lms_str|.
         size_type label_count =
             LabelLmsSubstrings(str, length, sl_partition, suffix_array,
                                lms_indices.begin(), lms_str.begin());

         if (label_count < lms_str.size()) {
           // Reorder |lms_str| to have LMS suffixes in the same order they
           // appear in |str|.
           for (size_type i = 0; i < lms_indices.size(); ++i)
             suffix_array[lms_indices[i]] = lms_str[i];

           SLType previous_type = SType;
           for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) {
             if (sl_partition[i] == SType && previous_type == LType) {
               lms_str[j] = suffix_array[i];
               lms_indices[j++] = i;
             }
             previous_type = sl_partition[i];
           }

           // Recursively apply SuffixSort on |lms_str|, which is formed from
           // labeled LMS suffixes in the same order they appear in |str|.
           // Note that |KeyType| will be size_type because |lms_str| contains
           // indices. |lms_str| is at most half the length of |str|.
           Implementation<size_type, size_type>::SuffixSort(
               lms_str.begin(), static_cast<size_type>(lms_str.size()),
               label_count, suffix_array);

           // Map LMS labels back to indices in |str| and write result to
           // |lms_indices|. We're using |suffix_array| as a temporary buffer.
           for (size_type i = 0; i < lms_indices.size(); ++i)
             suffix_array[i] = lms_indices[suffix_array[i]];
           std::copy_n(suffix_array, lms_indices.size(), lms_indices.begin());

           // At this point, |lms_indices| contains sorted LMS suffixes of |str|.
         }
       }
       // Given |lms_indices| where LMS suffixes are sorted, induce the full
       // order of suffixes in |str|.
       InducedSort(str, length, sl_partition, lms_indices, buckets,
                   suffix_array);
     }

    private:
     DISALLOW_IMPLICIT_CONSTRUCTORS(Implementation);
   };
 };

 // Generates a sorted suffix array for the input string |str| using the functor
 // |Algorithm| which provides an interface equivalent to NaiveSuffixSort.
 /// Characters found in |str| are assumed to be in range [0, |key_bound|).
 // Returns the suffix array as a vector.
 // |StrRng| is an input random access range.
 // |KeyType| is an unsigned integer type.
 template <class Algorithm, class StrRng, class KeyType>
 std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str,
                                                         KeyType key_bound) {
   Algorithm sort;
   std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin());
   sort(str, key_bound, suffix_array.begin());
   return suffix_array;
 }

 // Type requirements:
 // |SARng| is an input random access range.
 // |StrIt1| is a random access iterator.
 // |StrIt2| is a forward iterator.
 template <class SARng, class StrIt1, class StrIt2>
 // Lexicographical lower bound using binary search for
 // [|str2_first|, |str2_last|) in the suffix array |suffix_array| of a string
 // starting at |str1_first|. This does not necessarily return the index of
 // the longest matching substring.
 auto SuffixLowerBound(const SARng& suffix_array,
                       StrIt1 str1_first,
                       StrIt2 str2_first,
                       StrIt2 str2_last) -> decltype(std::begin(suffix_array)) {
   using size_type = typename SARng::value_type;

   size_t n = std::end(suffix_array) - std::begin(suffix_array);
   auto it = std::lower_bound(
       std::begin(suffix_array), std::end(suffix_array), str2_first,
       [str1_first, str2_last, n](size_type a, StrIt2 b) {
         return std::lexicographical_compare(str1_first + a, str1_first + n, b,
                                             str2_last);
       });
   return it;
 }

 }  // namespace zucchini

 #endif  // COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_
	// Copyright 2017 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#ifndef COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_
	#define COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_

	#include <algorithm>
	#include <iterator>
	#include <numeric>
	#include <vector>

	#include "base/logging.h"
	#include "base/macros.h"

	namespace zucchini {

	// A functor class that implements the naive suffix sorting algorithm that uses
	// std::sort with lexicographical compare. This is only meant as reference of
	// the interface.
	class NaiveSuffixSort {
	public:
	// Type requirements:
	// \|InputRng\| is an input random access range.
	// \|KeyType\| is an unsigned integer type.
	// \|SAIt\| is a random access iterator with mutable references.
	template <class InputRng, class KeyType, class SAIt>
	// \|str\| is the input string on which suffix sort is applied.
	// Characters found in \|str\| must be in the range [0, \|key_bound\|)
	// \|suffix_array\| is the beginning of the destination range, which is at least
	// as large as \|str\|.
	void operator()(const InputRng& str,
	KeyType key_bound,
	SAIt suffix_array) const {
	using size_type = typename SAIt::value_type;

	size_type n = static_cast<size_type>(std::end(str) - std::begin(str));

	// \|suffix_array\| is first filled with ordered indices of \|str\|.
	// Those indices are then sorted with lexicographical comparisons in \|str\|.
	std::iota(suffix_array, suffix_array + n, 0);
	std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) {
	return std::lexicographical_compare(std::begin(str) + i, std::end(str),
	std::begin(str) + j, std::end(str));
	});
	}
	};

	// A functor class that implements suffix array induced sorting (SA-IS)
	// algorithm with linear time and memory complexity,
	// see http://ieeexplore.ieee.org/abstract/document/5582081/
	class InducedSuffixSort {
	public:
	// Type requirements:
	// \|InputRng\| is an input random access range.
	// \|KeyType\| is an unsigned integer type.
	// \|SAIt\| is a random access iterator with mutable values.
	template <class InputRng, class KeyType, class SAIt>
	// \|str\| is the input string on which suffix sort is applied.
	// Characters found in \|str\| must be in the range [0, \|key_bound\|)
	// \|suffix_array\| is the beginning of the destination range, which is at least
	// as large as \|str\|.
	void operator()(const InputRng& str,
	KeyType key_bound,
	SAIt suffix_array) const {
	using value_type = typename InputRng::value_type;
	using size_type = typename SAIt::value_type;

	static_assert(std::is_unsigned<value_type>::value,
	"SA-IS only supports input string with unsigned values");
	static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");

	size_type n = static_cast<size_type>(std::end(str) - std::begin(str));

	Implementation<size_type, KeyType>::SuffixSort(std::begin(str), n,
	key_bound, suffix_array);
	}

	// Given string S of length n. We assume S is terminated by a unique sentinel
	// $, which is considered as the smallest character. This sentinel does not
	// exist in memory and is only treated implicitly, hence \|n\| does not count
	// the sentinel in this implementation. We denote suf(S,i) the suffix formed
	// by S[i..n).

	// A suffix suf(S,i) is said to be S-type or L-type, if suf(S,i) < suf(S,i+1)
	// or suf(S,i) > suf(S,i+1), respectively.
	enum SLType : bool { SType, LType };

	// A character S[i] is said to be S-type or L-type if the suffix suf(S,i) is
	// S-type or L-type, respectively.

	// A character S[i] is called LMS (leftmost S-type), if S[i] is S-type and
	// S[i-1] is L-type. A suffix suf(S,i) is called LMS, if S[i] is an LMS
	// character.

	// A substring S[i..j) is an LMS-substring if
	// (1) S[i] is LMS, S[j] is LMS or the sentinel $, and S[i..j) has no other
	// LMS characters, or
	// (2) S[i..j) is the sentinel $.

	template <class SizeType, class KeyType>
	struct Implementation {
	static_assert(std::is_unsigned<SizeType>::value,
	"SizeType must be unsigned");
	static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
	using size_type = SizeType;
	using key_type = KeyType;

	using iterator = typename std::vector<size_type>::iterator;
	using const_iterator = typename std::vector<size_type>::const_iterator;

	// Partition every suffix based on SL-type. Returns the number of LMS
	// suffixes.
	template <class StrIt>
	static size_type BuildSLPartition(
	StrIt str,
	size_type length,
	key_type key_bound,
	std::vector<SLType>::reverse_iterator sl_partition_it) {
	// We will count LMS suffixes (S to L-type or last S-type).
	size_type lms_count = 0;

	// \|previous_type\| is initialized to L-type to avoid counting an extra
	// LMS suffix at the end
	SLType previous_type = LType;

	// Initialized to dummy, impossible key.
	key_type previous_key = key_bound;

	// We're travelling backward to determine the partition,
	// as if we prepend one character at a time to the string, ex:
	// b$ is L-type because b > $.
	// ab$ is S-type because a < b, implying ab$ < b$.
	// bab$ is L-type because b > a, implying bab$ > ab$.
	// bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$.
	for (auto str_it = std::reverse_iterator<StrIt>(str + length);
	str_it != std::reverse_iterator<StrIt>(str);
	++str_it, ++sl_partition_it) {
	key_type current_key = *str_it;

	if (current_key > previous_key \|\| previous_key == key_bound) {
	// S[i] > S[i + 1] or S[i] is last character.
	if (previous_type == SType)
	// suf(S,i) is L-type and suf(S,i + 1) is S-type, therefore,
	// suf(S,i+1) was a LMS suffix.
	++lms_count;

	previous_type = LType; // For next round.
	} else if (current_key < previous_key) {
	// S[i] < S[i + 1]
	previous_type = SType; // For next round.
	}
	// Else, S[i] == S[i + 1]:
	// The next character that differs determines the SL-type,
	// so we reuse the last seen type.

	*sl_partition_it = previous_type;
	previous_key = current_key; // For next round.
	}

	return lms_count;
	}

	// Find indices of LMS suffixes and write result to \|lms_indices\|.
	static void FindLmsSuffixes(const std::vector<SLType>& sl_partition,
	iterator lms_indices) {
	// \|previous_type\| is initialized to S-type to avoid counting an extra
	// LMS suffix at the beginning
	SLType previous_type = SType;
	for (size_type i = 0; i < sl_partition.size(); ++i) {
	if (sl_partition[i] == SType && previous_type == LType)
	*lms_indices++ = i;
	previous_type = sl_partition[i];
	}
	}

	template <class StrIt>
	static std::vector<size_type> MakeBucketCount(StrIt str,
	size_type length,
	key_type key_bound) {
	// Occurrence of every unique character is counted in \|buckets\|
	std::vector<size_type> buckets(static_cast<size_type>(key_bound));

	for (auto it = str; it != str + length; ++it)
	++buckets[*it];
	return buckets;
	}

	// Apply induced sort from \|lms_indices\| to \|suffix_array\| associated with
	// the string \|str\|.
	template <class StrIt, class SAIt>
	static void InducedSort(StrIt str,
	size_type length,
	const std::vector<SLType>& sl_partition,
	const std::vector<size_type>& lms_indices,
	const std::vector<size_type>& buckets,
	SAIt suffix_array) {
	// All indices are first marked as unset with the illegal value \|length\|.
	std::fill(suffix_array, suffix_array + length, length);

	// Used to mark bucket boundaries (head or end) as indices in str.
	DCHECK(!buckets.empty());
	std::vector<size_type> bucket_bounds(buckets.size());

	// Step 1: Assign indices for LMS suffixes, populating the end of
	// respective buckets but keeping relative order.

	// Find the end of each bucket and write it to \|bucket_bounds\|.
	std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());

	// Process each \|lms_indices\| backward, and assign them to the end of
	// their respective buckets, so relative order is preserved.
	for (auto it = lms_indices.crbegin(); it != lms_indices.crend(); ++it) {
	key_type key = str[*it];
	suffix_array[--bucket_bounds[key]] = *it;
	}

	// Step 2
	// Scan forward \|suffix_array\|; for each modified suf(S,i) for which
	// suf(S,SA(i) - 1) is L-type, place suf(S,SA(i) - 1) to the current
	// head of the corresponding bucket and forward the bucket head to the
	// right.

	// Find the head of each bucket and write it to \|bucket_bounds\|. Since
	// only LMS suffixes where inserted in \|suffix_array\| during Step 1,
	// \|bucket_bounds\| does not contains the head of each bucket and needs to
	// be updated.
	bucket_bounds[0] = 0;
	std::partial_sum(buckets.begin(), buckets.end() - 1,
	bucket_bounds.begin() + 1);

	// From Step 1, the sentinel $, which we treat implicitly, would have
	// been placed at the beginning of \|suffix_array\|, since $ is always
	// considered as the smallest character. We then have to deal with the
	// previous (last) suffix.
	if (sl_partition[length - 1] == LType) {
	key_type key = str[length - 1];
	suffix_array[bucket_bounds[key]++] = length - 1;
	}
	for (auto it = suffix_array; it != suffix_array + length; ++it) {
	size_type suffix_index = *it;

	// While the original algorithm marks unset suffixes with -1,
	// we found that marking them with \|length\| is also possible and more
	// convenient because we are working with unsigned integers.
	if (suffix_index != length && suffix_index > 0 &&
	sl_partition[--suffix_index] == LType) {
	key_type key = str[suffix_index];
	suffix_array[bucket_bounds[key]++] = suffix_index;
	}
	}

	// Step 3
	// Scan backward \|suffix_array\|; for each modified suf(S, i) for which
	// suf(S,SA(i) - 1) is S-type, place suf(S,SA(i) - 1) to the current
	// end of the corresponding bucket and forward the bucket head to the
	// left.

	// Find the end of each bucket and write it to \|bucket_bounds\|. Since
	// only L-type suffixes where inserted in \|suffix_array\| during Step 2,
	// \|bucket_bounds\| does not contain the end of each bucket and needs to
	// be updated.
	std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());

	for (auto it = std::reverse_iterator<SAIt>(suffix_array + length);
	it != std::reverse_iterator<SAIt>(suffix_array); ++it) {
	size_type suffix_index = *it;
	if (suffix_index != length && suffix_index > 0 &&
	sl_partition[--suffix_index] == SType) {
	key_type key = str[suffix_index];
	suffix_array[--bucket_bounds[key]] = suffix_index;
	}
	}
	// Deals with the last suffix, because of the sentinel.
	if (sl_partition[length - 1] == SType) {
	key_type key = str[length - 1];
	suffix_array[--bucket_bounds[key]] = length - 1;
	}
	}

	// Given a string S starting at \|str\| with length \|length\|, an array
	// starting at \|substring_array\| containing lexicographically ordered LMS
	// terminated substring indices of S and an SL-Type partition \|sl_partition\|
	// of S, assigns a unique label to every unique LMS substring. The sorted
	// labels for all LMS substrings are written to \|lms_str\|, while the indices
	// of LMS suffixes are written to \|lms_indices\|. In addition, returns the
	// total number of unique labels.
	template <class StrIt, class SAIt>
	static size_type LabelLmsSubstrings(StrIt str,
	size_type length,
	const std::vector<SLType>& sl_partition,
	SAIt suffix_array,
	iterator lms_indices,
	iterator lms_str) {
	// Labelling starts at 0.
	size_type label = 0;

	// \|previous_lms\| is initialized to 0 to indicate it is unset.
	// Note that suf(S,0) is never a LMS suffix. Substrings will be visited in
	// lexicographical order.
	size_type previous_lms = 0;
	for (auto it = suffix_array; it != suffix_array + length; ++it) {
	if (it > 0 && sl_partition[it] == SType &&
	sl_partition[*it - 1] == LType) {
	// suf(S, *it) is a LMS suffix.

	size_type current_lms = *it;
	if (previous_lms != 0) {
	// There was a previous LMS suffix. Check if the current LMS
	// substring is equal to the previous one.
	SLType current_lms_type = SType;
	SLType previous_lms_type = SType;
	for (size_type k = 0;; ++k) {
	// \|current_lms_end\| and \|previous_lms_end\| denote whether we have
	// reached the end of the current and previous LMS substring,
	// respectively
	bool current_lms_end = false;
	bool previous_lms_end = false;

	// Check for both previous and current substring ends.
	// Note that it is more convenient to check if
	// suf(S,current_lms + k) is an LMS suffix than to retrieve it
	// from lms_indices.
	if (current_lms + k >= length \|\|
	(current_lms_type == LType &&
	sl_partition[current_lms + k] == SType)) {
	current_lms_end = true;
	}
	if (previous_lms + k >= length \|\|
	(previous_lms_type == LType &&
	sl_partition[previous_lms + k] == SType)) {
	previous_lms_end = true;
	}

	if (current_lms_end && previous_lms_end) {
	break; // Previous and current substrings are identical.
	} else if (current_lms_end != previous_lms_end \|\|
	str[current_lms + k] != str[previous_lms + k]) {
	// Previous and current substrings differ, a new label is used.
	++label;
	break;
	}

	current_lms_type = sl_partition[current_lms + k];
	previous_lms_type = sl_partition[previous_lms + k];
	}
	}
	lms_indices++ = it;
	*lms_str++ = label;
	previous_lms = current_lms;
	}
	}

	return label + 1;
	}

	// Implementation of the SA-IS algorithm. \|str\| must be a random access
	// iterator pointing at the beginning of S with length \|length\|. The result
	// is writtend in \|suffix_array\|, a random access iterator.
	template <class StrIt, class SAIt>
	static void SuffixSort(StrIt str,
	size_type length,
	key_type key_bound,
	SAIt suffix_array) {
	if (length == 1)
	*suffix_array = 0;
	if (length < 2)
	return;

	std::vector<SLType> sl_partition(length);
	size_type lms_count =
	BuildSLPartition(str, length, key_bound, sl_partition.rbegin());
	std::vector<size_type> lms_indices(lms_count);
	FindLmsSuffixes(sl_partition, lms_indices.begin());
	std::vector<size_type> buckets = MakeBucketCount(str, length, key_bound);

	if (lms_indices.size() > 1) {
	// Given \|lms_indices\| in the same order they appear in \|str\|, induce
	// LMS substrings relative order and write result to \|suffix_array\|.
	InducedSort(str, length, sl_partition, lms_indices, buckets,
	suffix_array);
	std::vector<size_type> lms_str(lms_indices.size());

	// Given LMS substrings in relative order found in \|suffix_array\|,
	// map LMS substrings to unique labels to form a new string, \|lms_str\|.
	size_type label_count =
	LabelLmsSubstrings(str, length, sl_partition, suffix_array,
	lms_indices.begin(), lms_str.begin());

	if (label_count < lms_str.size()) {
	// Reorder \|lms_str\| to have LMS suffixes in the same order they
	// appear in \|str\|.
	for (size_type i = 0; i < lms_indices.size(); ++i)
	suffix_array[lms_indices[i]] = lms_str[i];

	SLType previous_type = SType;
	for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) {
	if (sl_partition[i] == SType && previous_type == LType) {
	lms_str[j] = suffix_array[i];
	lms_indices[j++] = i;
	}
	previous_type = sl_partition[i];
	}

	// Recursively apply SuffixSort on \|lms_str\|, which is formed from
	// labeled LMS suffixes in the same order they appear in \|str\|.
	// Note that \|KeyType\| will be size_type because \|lms_str\| contains
	// indices. \|lms_str\| is at most half the length of \|str\|.
	Implementation<size_type, size_type>::SuffixSort(
	lms_str.begin(), static_cast<size_type>(lms_str.size()),
	label_count, suffix_array);

	// Map LMS labels back to indices in \|str\| and write result to
	// \|lms_indices\|. We're using \|suffix_array\| as a temporary buffer.
	for (size_type i = 0; i < lms_indices.size(); ++i)
	suffix_array[i] = lms_indices[suffix_array[i]];
	std::copy_n(suffix_array, lms_indices.size(), lms_indices.begin());

	// At this point, \|lms_indices\| contains sorted LMS suffixes of \|str\|.
	}
	}
	// Given \|lms_indices\| where LMS suffixes are sorted, induce the full
	// order of suffixes in \|str\|.
	InducedSort(str, length, sl_partition, lms_indices, buckets,
	suffix_array);
	}

	private:
	DISALLOW_IMPLICIT_CONSTRUCTORS(Implementation);
	};
	};

	// Generates a sorted suffix array for the input string \|str\| using the functor
	// \|Algorithm\| which provides an interface equivalent to NaiveSuffixSort.
	/// Characters found in \|str\| are assumed to be in range [0, \|key_bound\|).
	// Returns the suffix array as a vector.
	// \|StrRng\| is an input random access range.
	// \|KeyType\| is an unsigned integer type.
	template <class Algorithm, class StrRng, class KeyType>
	std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str,
	KeyType key_bound) {
	Algorithm sort;
	std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin());
	sort(str, key_bound, suffix_array.begin());
	return suffix_array;
	}

	// Type requirements:
	// \|SARng\| is an input random access range.
	// \|StrIt1\| is a random access iterator.
	// \|StrIt2\| is a forward iterator.
	template <class SARng, class StrIt1, class StrIt2>
	// Lexicographical lower bound using binary search for
	// [\|str2_first\|, \|str2_last\|) in the suffix array \|suffix_array\| of a string
	// starting at \|str1_first\|. This does not necessarily return the index of
	// the longest matching substring.
	auto SuffixLowerBound(const SARng& suffix_array,
	StrIt1 str1_first,
	StrIt2 str2_first,
	StrIt2 str2_last) -> decltype(std::begin(suffix_array)) {
	using size_type = typename SARng::value_type;

	size_t n = std::end(suffix_array) - std::begin(suffix_array);
	auto it = std::lower_bound(
	std::begin(suffix_array), std::end(suffix_array), str2_first,
	[str1_first, str2_last, n](size_type a, StrIt2 b) {
	return std::lexicographical_compare(str1_first + a, str1_first + n, b,
	str2_last);
	});
	return it;
	}

	} // namespace zucchini

	#endif // COMPONENTS_ZUCCHINI_SUFFIX_ARRAY_H_