blob: d91a04674b39c5500fd91336e25b0d9b72f71035 [file] [log] [blame]
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_
#define COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_
#include <stddef.h>
#include <iterator>
#include <type_traits>
#include "base/logging.h"
#include "base/strings/string_piece.h"
namespace url_pattern_index {
// The class used to iteratively extract N-grams from strings. An N-gram is a
// string consisting of N (up to 8) non-special characters, which are stored in
// the lowest N non-zero bytes, lower bytes corresponding to later symbols. The
// size of the integer type limits the maximum value of N. For example an
// uint64_t can store up to 8-grams.
//
// Note: If used for UTF-8 strings, the N-grams can have partial byte sequences.
//
// Template parameters:
// * N - the size of N-grams.
// * NGramType - the integer type used to encode N-grams.
// * IsSeparator - the type of a bool(char) functor.
template <size_t N, typename NGramType, typename IsSeparator>
class NGramExtractor {
public:
// An STL compatible input iterator over N-grams contained in a string.
class Iterator : public std::iterator<std::input_iterator_tag, NGramType> {
public:
// Creates an iterator, which points to the leftmost valid N-gram within the
// |extractor|'s string, starting from |head|.
Iterator(const NGramExtractor& extractor,
base::StringPiece::const_iterator head)
: extractor_(extractor), head_(head), end_(extractor.string_.end()) {
DCHECK_GE(head, extractor_.string_.begin());
DCHECK_LE(head, end_);
CompleteNGramFrom(0);
}
bool operator==(const Iterator& rhs) const { return head_ == rhs.head_; }
bool operator!=(const Iterator& rhs) const { return !operator==(rhs); }
NGramType operator*() const { return ngram_; }
NGramType* operator->() const { return &ngram_; }
Iterator& operator++() {
ngram_ &= ~(static_cast<NGramType>(0xFFu) << 8 * (N - 1));
++head_;
CompleteNGramFrom(N - 1);
return *this;
}
Iterator operator++(int) {
Iterator copy(*this);
operator++();
return copy;
}
private:
// Consumes characters starting with the one pointed to by |head_|, as many
// of them as needed to extend |ngram_| from its |current_length| to a
// length of N. Leaves |head_| pointing to the last character consumed.
void CompleteNGramFrom(size_t current_length) {
for (; head_ != end_; ++head_) {
if (extractor_.is_separator_(*head_)) {
current_length = 0;
ngram_ = 0;
} else {
ngram_ = ngram_ << 8 | static_cast<NGramType>(*head_);
if (++current_length == N)
break;
}
}
}
const NGramExtractor& extractor_;
// Always points to the last character included in the current |ngram_|.
base::StringPiece::const_iterator head_;
// Always points to extractor_.string_.end().
base::StringPiece::const_iterator end_;
// Contains the N-gram currently pointed to by the iterator. Undefined if
// the iterator is at the end.
NGramType ngram_ = 0;
};
// Constructs an extractor for iterating over N-grams contained in the
// |string|. |is_separator| is used to determine whether a certain character
// is a separator and should not be contained in an N-gram.
NGramExtractor(base::StringPiece string, IsSeparator is_separator)
: string_(string), is_separator_(is_separator) {}
Iterator begin() const { return Iterator(*this, string_.begin()); }
Iterator end() const { return Iterator(*this, string_.end()); }
private:
static_assert(std::is_integral<NGramType>::value, "Not an integral type.");
static_assert(std::is_unsigned<NGramType>::value, "Not an unsigned type.");
static_assert(N > 0u, "N should be positive.");
static_assert(N <= sizeof(NGramType), "N-gram doesn't fit into the type.");
base::StringPiece string_;
IsSeparator is_separator_;
};
// A helper function used to create an NGramExtractor for a |string| without
// knowing the direct type of the |is_separator| functor.
//
// Typical usage:
// const char* str = "no*abacaba*abcd";
// auto extractor = CreateNGramExtractor<5, uint64_t>(
// str, [](char c) { return c == '*'; });
// for (uint64_t ngram : extractor) {
// ... process the |ngram| ...
// }
template <size_t N, typename NGramType, typename IsSeparator>
NGramExtractor<N, NGramType, IsSeparator> CreateNGramExtractor(
base::StringPiece string,
IsSeparator is_separator) {
return NGramExtractor<N, NGramType, IsSeparator>(string, is_separator);
}
} // namespace url_pattern_index
#endif // COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_