components/query_parser/snippet.cc - chromium/src - Git at Google

 // Copyright 2014 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "components/query_parser/snippet.h"

 #include <stdint.h>

 #include <algorithm>

 #include "base/logging.h"
 #include "base/memory/scoped_ptr.h"
 #include "base/strings/string_split.h"
 #include "base/strings/string_util.h"
 #include "base/strings/utf_string_conversions.h"
 #include "third_party/icu/source/common/unicode/brkiter.h"
 #include "third_party/icu/source/common/unicode/utext.h"
 #include "third_party/icu/source/common/unicode/utf8.h"

 namespace query_parser {
 namespace {

 bool PairFirstLessThan(const Snippet::MatchPosition& a,
                        const Snippet::MatchPosition& b) {
   return a.first < b.first;
 }

 // Combines all pairs after offset in match_positions that are contained
 // or touch the pair at offset.
 void CoalescePositionsFrom(size_t offset,
                            Snippet::MatchPositions* match_positions) {
   DCHECK(offset < match_positions->size());
   Snippet::MatchPosition& pair((*match_positions)[offset]);
   ++offset;
   while (offset < match_positions->size() &&
          pair.second >= (*match_positions)[offset].first) {
     pair.second = std::max(pair.second, (*match_positions)[offset].second);
     match_positions->erase(match_positions->begin() + offset);
   }
 }

 // Makes sure there is a pair in match_positions that contains the specified
 // range. This keeps the pairs ordered in match_positions by first, and makes
 // sure none of the pairs in match_positions touch each other.
 void AddMatch(size_t start,
               size_t end,
               Snippet::MatchPositions* match_positions) {
   DCHECK(start < end);
   DCHECK(match_positions);
   Snippet::MatchPosition pair(start, end);
   if (match_positions->empty()) {
     match_positions->push_back(pair);
     return;
   }
   // There's at least one match. Find the position of the new match,
   // potentially extending pairs around it.
   Snippet::MatchPositions::iterator i =
       std::lower_bound(match_positions->begin(), match_positions->end(),
                        pair, &PairFirstLessThan);
   if (i != match_positions->end() && i->first == start) {
     // Match not at the end and there is already a pair with the same
     // start.
     if (end > i->second) {
       // New pair extends beyond existing pair. Extend existing pair and
       // coalesce matches after it.
       i->second = end;
       CoalescePositionsFrom(i - match_positions->begin(), match_positions);
     }  // else case, new pair completely contained in existing pair, nothing
        // to do.
   } else if (i == match_positions->begin()) {
     // Match at the beginning and the first pair doesn't have the same
     // start. Insert new pair and coalesce matches after it.
     match_positions->insert(i, pair);
     CoalescePositionsFrom(0, match_positions);
   } else {
     // Not at the beginning (but may be at the end).
     --i;
     if (start <= i->second && end > i->second) {
       // Previous element contains match. Extend it and coalesce.
       i->second = end;
       CoalescePositionsFrom(i - match_positions->begin(), match_positions);
     } else if (end > i->second) {
       // Region doesn't touch previous element. See if region touches current
       // element.
       ++i;
       if (i == match_positions->end() || end < i->first) {
         match_positions->insert(i, pair);
       } else {
         i->first = start;
         i->second = end;
         CoalescePositionsFrom(i - match_positions->begin(), match_positions);
       }
     }
   }
 }

 // Converts an index in a utf8 string into the index in the corresponding utf16
 // string and returns the utf16 index. This is intended to be called in a loop
 // iterating through a utf8 string.
 //
 // utf8_string: the utf8 string.
 // utf8_length: length of the utf8 string.
 // offset: the utf8 offset to convert.
 // utf8_pos: current offset in the utf8 string. This is modified and on return
 //           matches offset.
 // wide_pos: current index in the wide string. This is the same as the return
 //           value.
 size_t AdvanceAndReturnUTF16Pos(const char* utf8_string,
                                 int32_t utf8_length,
                                 int32_t offset,
                                 int32_t* utf8_pos,
                                 size_t* utf16_pos) {
   DCHECK(offset >= *utf8_pos && offset <= utf8_length);

   UChar32 wide_char;
   while (*utf8_pos < offset) {
     U8_NEXT(utf8_string, *utf8_pos, utf8_length, wide_char);
     *utf16_pos += (wide_char <= 0xFFFF) ? 1 : 2;
   }
   return *utf16_pos;
 }

 // Given a character break iterator over a UTF-8 string, set the iterator
 // position to |*utf8_pos| and move by |count| characters. |count| can
 // be either positive or negative.
 void MoveByNGraphemes(icu::BreakIterator* bi, int count, size_t* utf8_pos) {
   // Ignore the return value. A side effect of the current position
   // being set at or following |*utf8_pos| is exploited here.
   // It's simpler than calling following(n) and then previous().
   // isBoundary() is not very fast, but should be good enough for the
   // snippet generation. If not, revisit the way we scan in ComputeSnippet.
   bi->isBoundary(static_cast<int32_t>(*utf8_pos));
   bi->next(count);
   *utf8_pos = static_cast<size_t>(bi->current());
 }

 // The amount of context to include for a given hit. Note that it's counted
 // in terms of graphemes rather than bytes.
 const int kSnippetContext = 50;

 // Returns true if next match falls within a snippet window
 // from the previous match. The window size is counted in terms
 // of graphemes rather than bytes in UTF-8.
 bool IsNextMatchWithinSnippetWindow(icu::BreakIterator* bi,
                                     size_t previous_match_end,
                                     size_t next_match_start) {
   // If it's within a window in terms of bytes, it's certain
   // that it's within a window in terms of graphemes as well.
   if (next_match_start < previous_match_end + kSnippetContext)
     return true;
   bi->isBoundary(static_cast<int32_t>(previous_match_end));
   // An alternative to this is to call |bi->next()| at most
   // kSnippetContext times, compare |bi->current()| with |next_match_start|
   // after each call and return early if possible. There are other
   // heuristics to speed things up if necessary, but it's not likely that
   // we need to bother.
   bi->next(kSnippetContext);
   int64_t current = bi->current();
   return (next_match_start < static_cast<uint64_t>(current) ||
           current == icu::BreakIterator::DONE);
 }

 }  // namespace

 // static
 void Snippet::ExtractMatchPositions(const std::string& offsets_str,
                                     const std::string& column_num,
                                     MatchPositions* match_positions) {
   DCHECK(match_positions);
   if (offsets_str.empty())
     return;
   std::vector<std::string> offsets = base::SplitString(
       offsets_str, " ", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
   // SQLite offsets are sets of four integers:
   //   column, query term, match offset, match length
   // Matches within a string are marked by (start, end) pairs.
   for (size_t i = 0; i < offsets.size() - 3; i += 4) {
     if (offsets[i] != column_num)
       continue;
     const size_t start = atoi(offsets[i + 2].c_str());
     const size_t end = start + atoi(offsets[i + 3].c_str());
     // Switch to DCHECK after debugging http://crbug.com/15261.
     CHECK(end >= start);
     AddMatch(start, end, match_positions);
   }
 }

 // static
 void Snippet::ConvertMatchPositionsToWide(
     const std::string& utf8_string,
     Snippet::MatchPositions* match_positions) {
   DCHECK(match_positions);
   int32_t utf8_pos = 0;
   size_t utf16_pos = 0;
   const char* utf8_cstring = utf8_string.c_str();
   const int32_t utf8_length = static_cast<int32_t>(utf8_string.size());
   for (Snippet::MatchPositions::iterator i = match_positions->begin();
        i != match_positions->end(); ++i) {
     i->first = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length,
                                         static_cast<int32_t>(i->first),
                                         &utf8_pos, &utf16_pos);
     i->second = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length,
                                          static_cast<int32_t>(i->second),
                                          &utf8_pos, &utf16_pos);
   }
 }

 Snippet::Snippet() {
 }

 Snippet::Snippet(const Snippet& other) = default;

 Snippet::~Snippet() {
 }

 void Snippet::ComputeSnippet(const MatchPositions& match_positions,
                              const std::string& document) {
   // The length of snippets we try to produce.
   // We can generate longer snippets but stop once we cross kSnippetMaxLength.
   const size_t kSnippetMaxLength = 200;
   const base::string16 kEllipsis = base::ASCIIToUTF16(" ... ");

   UText* document_utext = NULL;
   UErrorCode status = U_ZERO_ERROR;
   document_utext = utext_openUTF8(document_utext, document.data(),
                                   document.size(), &status);
   // Locale does not matter because there's no per-locale customization
   // for character iterator.
   scoped_ptr<icu::BreakIterator> bi(icu::BreakIterator::createCharacterInstance(
       icu::Locale::getDefault(), status));
   bi->setText(document_utext, status);
   DCHECK(U_SUCCESS(status));

   // We build the snippet by iterating through the matches and then grabbing
   // context around each match.  If matches are near enough each other (within
   // kSnippetContext), we skip the "..." between them.
   base::string16 snippet;
   size_t start = 0;
   for (size_t i = 0; i < match_positions.size(); ++i) {
     // Some shorter names for the current match.
     const size_t match_start = match_positions[i].first;
     const size_t match_end = match_positions[i].second;

     // Switch to DCHECK after debugging http://crbug.com/15261.
     CHECK(match_end > match_start);
     CHECK(match_end <= document.size());

     // Add the context, if any, to show before the match.
     size_t context_start = match_start;
     MoveByNGraphemes(bi.get(), -kSnippetContext, &context_start);
     start = std::max(start, context_start);
     if (start < match_start) {
       if (start > 0)
         snippet += kEllipsis;
       // Switch to DCHECK after debugging http://crbug.com/15261.
       CHECK(start < document.size());
       snippet += base::UTF8ToUTF16(document.substr(start, match_start - start));
     }

     // Add the match.
     const size_t first = snippet.size();
     snippet += base::UTF8ToUTF16(document.substr(match_start,
                                                  match_end - match_start));
     matches_.push_back(std::make_pair(first, snippet.size()));

     // Compute the context, if any, to show after the match.
     size_t end;
     // Check if the next match falls within our snippet window.
     if (i + 1 < match_positions.size() &&
         IsNextMatchWithinSnippetWindow(bi.get(), match_end,
             match_positions[i + 1].first)) {
       // Yes, it's within the window.  Make the end context extend just up
       // to the next match.
       end = match_positions[i + 1].first;
       // Switch to DCHECK after debugging http://crbug.com/15261.
       CHECK(end >= match_end);
       CHECK(end <= document.size());
       snippet += base::UTF8ToUTF16(document.substr(match_end, end - match_end));
     } else {
       // No, there's either no next match or the next match is too far away.
       end = match_end;
       MoveByNGraphemes(bi.get(), kSnippetContext, &end);
       // Switch to DCHECK after debugging http://crbug.com/15261.
       CHECK(end >= match_end);
       CHECK(end <= document.size());
       snippet += base::UTF8ToUTF16(document.substr(match_end, end - match_end));
       if (end < document.size())
         snippet += kEllipsis;
     }
     start = end;

     // Stop here if we have enough snippet computed.
     if (snippet.size() >= kSnippetMaxLength)
       break;
   }

   utext_close(document_utext);
   swap(text_, snippet);
 }

 void Snippet::Swap(Snippet* other) {
   text_.swap(other->text_);
   matches_.swap(other->matches_);
 }

 }  // namespace query_parser
	// Copyright 2014 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "components/query_parser/snippet.h"

	#include <stdint.h>

	#include <algorithm>

	#include "base/logging.h"
	#include "base/memory/scoped_ptr.h"
	#include "base/strings/string_split.h"
	#include "base/strings/string_util.h"
	#include "base/strings/utf_string_conversions.h"
	#include "third_party/icu/source/common/unicode/brkiter.h"
	#include "third_party/icu/source/common/unicode/utext.h"
	#include "third_party/icu/source/common/unicode/utf8.h"

	namespace query_parser {
	namespace {

	bool PairFirstLessThan(const Snippet::MatchPosition& a,
	const Snippet::MatchPosition& b) {
	return a.first < b.first;
	}

	// Combines all pairs after offset in match_positions that are contained
	// or touch the pair at offset.
	void CoalescePositionsFrom(size_t offset,
	Snippet::MatchPositions* match_positions) {
	DCHECK(offset < match_positions->size());
	Snippet::MatchPosition& pair((*match_positions)[offset]);
	++offset;
	while (offset < match_positions->size() &&
	pair.second >= (*match_positions)[offset].first) {
	pair.second = std::max(pair.second, (*match_positions)[offset].second);
	match_positions->erase(match_positions->begin() + offset);
	}
	}

	// Makes sure there is a pair in match_positions that contains the specified
	// range. This keeps the pairs ordered in match_positions by first, and makes
	// sure none of the pairs in match_positions touch each other.
	void AddMatch(size_t start,
	size_t end,
	Snippet::MatchPositions* match_positions) {
	DCHECK(start < end);
	DCHECK(match_positions);
	Snippet::MatchPosition pair(start, end);
	if (match_positions->empty()) {
	match_positions->push_back(pair);
	return;
	}
	// There's at least one match. Find the position of the new match,
	// potentially extending pairs around it.
	Snippet::MatchPositions::iterator i =
	std::lower_bound(match_positions->begin(), match_positions->end(),
	pair, &PairFirstLessThan);
	if (i != match_positions->end() && i->first == start) {
	// Match not at the end and there is already a pair with the same
	// start.
	if (end > i->second) {
	// New pair extends beyond existing pair. Extend existing pair and
	// coalesce matches after it.
	i->second = end;
	CoalescePositionsFrom(i - match_positions->begin(), match_positions);
	} // else case, new pair completely contained in existing pair, nothing
	// to do.
	} else if (i == match_positions->begin()) {
	// Match at the beginning and the first pair doesn't have the same
	// start. Insert new pair and coalesce matches after it.
	match_positions->insert(i, pair);
	CoalescePositionsFrom(0, match_positions);
	} else {
	// Not at the beginning (but may be at the end).
	--i;
	if (start <= i->second && end > i->second) {
	// Previous element contains match. Extend it and coalesce.
	i->second = end;
	CoalescePositionsFrom(i - match_positions->begin(), match_positions);
	} else if (end > i->second) {
	// Region doesn't touch previous element. See if region touches current
	// element.
	++i;
	if (i == match_positions->end() \|\| end < i->first) {
	match_positions->insert(i, pair);
	} else {
	i->first = start;
	i->second = end;
	CoalescePositionsFrom(i - match_positions->begin(), match_positions);
	}
	}
	}
	}

	// Converts an index in a utf8 string into the index in the corresponding utf16
	// string and returns the utf16 index. This is intended to be called in a loop
	// iterating through a utf8 string.
	//
	// utf8_string: the utf8 string.
	// utf8_length: length of the utf8 string.
	// offset: the utf8 offset to convert.
	// utf8_pos: current offset in the utf8 string. This is modified and on return
	// matches offset.
	// wide_pos: current index in the wide string. This is the same as the return
	// value.
	size_t AdvanceAndReturnUTF16Pos(const char* utf8_string,
	int32_t utf8_length,
	int32_t offset,
	int32_t* utf8_pos,
	size_t* utf16_pos) {
	DCHECK(offset >= *utf8_pos && offset <= utf8_length);

	UChar32 wide_char;
	while (*utf8_pos < offset) {
	U8_NEXT(utf8_string, *utf8_pos, utf8_length, wide_char);
	*utf16_pos += (wide_char <= 0xFFFF) ? 1 : 2;
	}
	return *utf16_pos;
	}

	// Given a character break iterator over a UTF-8 string, set the iterator
	// position to \|*utf8_pos\| and move by \|count\| characters. \|count\| can
	// be either positive or negative.
	void MoveByNGraphemes(icu::BreakIterator* bi, int count, size_t* utf8_pos) {
	// Ignore the return value. A side effect of the current position
	// being set at or following \|*utf8_pos\| is exploited here.
	// It's simpler than calling following(n) and then previous().
	// isBoundary() is not very fast, but should be good enough for the
	// snippet generation. If not, revisit the way we scan in ComputeSnippet.
	bi->isBoundary(static_cast<int32_t>(*utf8_pos));
	bi->next(count);
	*utf8_pos = static_cast<size_t>(bi->current());
	}

	// The amount of context to include for a given hit. Note that it's counted
	// in terms of graphemes rather than bytes.
	const int kSnippetContext = 50;

	// Returns true if next match falls within a snippet window
	// from the previous match. The window size is counted in terms
	// of graphemes rather than bytes in UTF-8.
	bool IsNextMatchWithinSnippetWindow(icu::BreakIterator* bi,
	size_t previous_match_end,
	size_t next_match_start) {
	// If it's within a window in terms of bytes, it's certain
	// that it's within a window in terms of graphemes as well.
	if (next_match_start < previous_match_end + kSnippetContext)
	return true;
	bi->isBoundary(static_cast<int32_t>(previous_match_end));
	// An alternative to this is to call \|bi->next()\| at most
	// kSnippetContext times, compare \|bi->current()\| with \|next_match_start\|
	// after each call and return early if possible. There are other
	// heuristics to speed things up if necessary, but it's not likely that
	// we need to bother.
	bi->next(kSnippetContext);
	int64_t current = bi->current();
	return (next_match_start < static_cast<uint64_t>(current) \|\|
	current == icu::BreakIterator::DONE);
	}

	} // namespace

	// static
	void Snippet::ExtractMatchPositions(const std::string& offsets_str,
	const std::string& column_num,
	MatchPositions* match_positions) {
	DCHECK(match_positions);
	if (offsets_str.empty())
	return;
	std::vector<std::string> offsets = base::SplitString(
	offsets_str, " ", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
	// SQLite offsets are sets of four integers:
	// column, query term, match offset, match length
	// Matches within a string are marked by (start, end) pairs.
	for (size_t i = 0; i < offsets.size() - 3; i += 4) {
	if (offsets[i] != column_num)
	continue;
	const size_t start = atoi(offsets[i + 2].c_str());
	const size_t end = start + atoi(offsets[i + 3].c_str());
	// Switch to DCHECK after debugging http://crbug.com/15261.
	CHECK(end >= start);
	AddMatch(start, end, match_positions);
	}
	}

	// static
	void Snippet::ConvertMatchPositionsToWide(
	const std::string& utf8_string,
	Snippet::MatchPositions* match_positions) {
	DCHECK(match_positions);
	int32_t utf8_pos = 0;
	size_t utf16_pos = 0;
	const char* utf8_cstring = utf8_string.c_str();
	const int32_t utf8_length = static_cast<int32_t>(utf8_string.size());
	for (Snippet::MatchPositions::iterator i = match_positions->begin();
	i != match_positions->end(); ++i) {
	i->first = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length,
	static_cast<int32_t>(i->first),
	&utf8_pos, &utf16_pos);
	i->second = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length,
	static_cast<int32_t>(i->second),
	&utf8_pos, &utf16_pos);
	}
	}

	Snippet::Snippet() {
	}

	Snippet::Snippet(const Snippet& other) = default;

	Snippet::~Snippet() {
	}

	void Snippet::ComputeSnippet(const MatchPositions& match_positions,
	const std::string& document) {
	// The length of snippets we try to produce.
	// We can generate longer snippets but stop once we cross kSnippetMaxLength.
	const size_t kSnippetMaxLength = 200;
	const base::string16 kEllipsis = base::ASCIIToUTF16(" ... ");

	UText* document_utext = NULL;
	UErrorCode status = U_ZERO_ERROR;
	document_utext = utext_openUTF8(document_utext, document.data(),
	document.size(), &status);
	// Locale does not matter because there's no per-locale customization
	// for character iterator.
	scoped_ptr<icu::BreakIterator> bi(icu::BreakIterator::createCharacterInstance(
	icu::Locale::getDefault(), status));
	bi->setText(document_utext, status);
	DCHECK(U_SUCCESS(status));

	// We build the snippet by iterating through the matches and then grabbing
	// context around each match. If matches are near enough each other (within
	// kSnippetContext), we skip the "..." between them.
	base::string16 snippet;
	size_t start = 0;
	for (size_t i = 0; i < match_positions.size(); ++i) {
	// Some shorter names for the current match.
	const size_t match_start = match_positions[i].first;
	const size_t match_end = match_positions[i].second;

	// Switch to DCHECK after debugging http://crbug.com/15261.
	CHECK(match_end > match_start);
	CHECK(match_end <= document.size());

	// Add the context, if any, to show before the match.
	size_t context_start = match_start;
	MoveByNGraphemes(bi.get(), -kSnippetContext, &context_start);
	start = std::max(start, context_start);
	if (start < match_start) {
	if (start > 0)
	snippet += kEllipsis;
	// Switch to DCHECK after debugging http://crbug.com/15261.
	CHECK(start < document.size());
	snippet += base::UTF8ToUTF16(document.substr(start, match_start - start));
	}

	// Add the match.
	const size_t first = snippet.size();
	snippet += base::UTF8ToUTF16(document.substr(match_start,
	match_end - match_start));
	matches_.push_back(std::make_pair(first, snippet.size()));

	// Compute the context, if any, to show after the match.
	size_t end;
	// Check if the next match falls within our snippet window.
	if (i + 1 < match_positions.size() &&
	IsNextMatchWithinSnippetWindow(bi.get(), match_end,
	match_positions[i + 1].first)) {
	// Yes, it's within the window. Make the end context extend just up
	// to the next match.
	end = match_positions[i + 1].first;
	// Switch to DCHECK after debugging http://crbug.com/15261.
	CHECK(end >= match_end);
	CHECK(end <= document.size());
	snippet += base::UTF8ToUTF16(document.substr(match_end, end - match_end));
	} else {
	// No, there's either no next match or the next match is too far away.
	end = match_end;
	MoveByNGraphemes(bi.get(), kSnippetContext, &end);
	// Switch to DCHECK after debugging http://crbug.com/15261.
	CHECK(end >= match_end);
	CHECK(end <= document.size());
	snippet += base::UTF8ToUTF16(document.substr(match_end, end - match_end));
	if (end < document.size())
	snippet += kEllipsis;
	}
	start = end;

	// Stop here if we have enough snippet computed.
	if (snippet.size() >= kSnippetMaxLength)
	break;
	}

	utext_close(document_utext);
	swap(text_, snippet);
	}

	void Snippet::Swap(Snippet* other) {
	text_.swap(other->text_);
	matches_.swap(other->matches_);
	}

	} // namespace query_parser