content/common/android/address_parser.cc - chromium/chromium - Git at Google

 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "content/common/android/address_parser.h"

 #include "base/logging.h"
 #include "base/strings/string_util.h"
 #include "content/common/android/address_parser_internal.h"

 namespace {

 // --- Limits measured in words after the house number ---

 // A state is only looked for once more than this many words have followed
 // the house number. Raising it makes the parser miss short addresses.
 const size_t kMinAddressWords = 3;

 // A location name is only accepted if it appears within this many words
 // after the house number.
 const size_t kMaxLocationNameDistance = 4;

 // Upper bound on the number of words between the house number and the
 // state, neither of them counted.
 const size_t kMaxAddressWords = 12;

 // --- Other limits ---

 // Upper bound on the number of lines an address may span between the house
 // number and the state, neither of them counted.
 const size_t kMaxAddressLines = 5;

 // Upper bound on the length of any single word between the house number
 // and the state, neither of them counted.
 const size_t kMaxAddressNameWordLength = 25;

 // Characters that are treated as line breaks when counting address lines,
 // and that separate words in addition to whitespace. The trailing 0
 // terminates the array so it can be used to build a base::string16.
 const base::char16 kNewlineDelimiters[] = {'\n', ',', '*',
                                            0x2022,  // Unicode bullet
                                            0};

 }  // anonymous namespace

 namespace content {

 namespace address_parser {

 using namespace internal;

 // Convenience overload working on a whole string. Searches |text| for an
 // address; when one is found, copies the matching substring into |*address|
 // and returns true. Otherwise returns false and leaves |*address| unchanged.
 bool FindAddress(const base::string16& text, base::string16* address) {
   size_t match_begin, match_end;
   const bool found =
       FindAddress(text.begin(), text.end(), &match_begin, &match_end);
   if (!found)
     return false;

   // Guard against an inverted range so the length can never underflow.
   size_t match_length = 0;
   if (match_end >= match_begin)
     match_length = match_end - match_begin;

   address->assign(text.substr(match_begin, match_length));
   return true;
 }

 // Scans [begin, end) for the first span that looks like a postal address: a
 // house number followed, within the word/line limits defined at the top of
 // this file, by a word accepted by IsValidLocationName(), a state found by
 // FindStateStartingInWord() and a zip code accepted by IsZipValid() for that
 // state.
 //
 // On success returns true and stores offsets relative to |begin| in the
 // out-parameters: |*start_pos| is where the house number word begins and
 // |*end_pos| is where the zip code word ends. Returns false, leaving both
 // out-parameters untouched, when no house number can be parsed or when the
 // input runs out of tokens while a candidate is being examined.
 bool FindAddress(const base::string16::const_iterator& begin,
                  const base::string16::const_iterator& end,
                  size_t* start_pos,
                  size_t* end_pos) {
   HouseNumberParser house_number_parser;

   // Keep going through the input string until a potential house number is
   // detected. Start tokenizing the following words to find a valid
   // street name within a word range. Then, find a state name followed
   // by a valid zip code for that state. Also keep a look for any other
   // possible house numbers to continue from in case of no match and for
   // state names not followed by a zip code (e.g. New York, NY 10000).
   const base::string16 newline_delimiters = kNewlineDelimiters;
   const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters;
   for (base::string16::const_iterator it = begin; it != end; ) {
     Word house_number;
     if (!house_number_parser.Parse(it, end, &house_number))
       return false;

     // Delimiters are returned as tokens too, so that line breaks can be
     // counted while skipping over them.
     String16Tokenizer tokenizer(house_number.end, end, delimiters);
     tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);

     // words[0] is always the house number; the following entries are the
     // words after it, appended lazily as the tokenizer advances.
     WordList words;
     words.push_back(house_number);

     bool found_location_name = false;
     bool continue_on_house_number = true;
     bool consecutive_house_numbers = true;
     size_t next_house_number_word = 0;
     size_t num_lines = 1;

     // Don't include the house number in the word count.
     size_t next_word = 1;
     for (; next_word <= kMaxAddressWords + 1; ++next_word) {

       // Extract a new word from the tokenizer.
       if (next_word == words.size()) {
         do {
           if (!tokenizer.GetNext())
             return false;

           // Check the number of address lines.
           if (tokenizer.token_is_delim() && newline_delimiters.find(
               *tokenizer.token_begin()) != base::string16::npos) {
             ++num_lines;
           }
         } while (tokenizer.token_is_delim());

         // Note: on this exit the token has not been stored, so |next_word|
         // equals words.size(); it is clamped after the loop.
         if (num_lines > kMaxAddressLines)
           break;

         words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
       }

       // Check the word length. If too long, don't try to continue from
       // the next house number as no address can hold this word.
       //
       // The word is held by value on purpose: FindStateStartingInWord() below
       // may append to |words|, and if WordList reallocates on growth (as
       // std::vector does) a reference into the list would dangle by the time
       // the "et al" check reads it.
       const Word current_word = words[next_word];
       DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);
       size_t current_word_length = std::distance(
           current_word.begin, current_word.end);
       if (current_word_length > kMaxAddressNameWordLength) {
         continue_on_house_number = false;
         break;
       }

       // Check if the new word is a valid house number.
       if (house_number_parser.Parse(current_word.begin, current_word.end,
           NULL)) {
         // |consecutive_house_numbers| is still true only if every word seen
         // so far after the initial house number was itself a house number.
         if (consecutive_house_numbers) {
           // Check if there is a new line between consecutive house numbers.
           // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.."
           if (num_lines > 1) {
             next_house_number_word = next_word;
             break;
           }
         }

         // Keep the next candidate to resume parsing from in case of failure.
         if (next_house_number_word == 0) {
           next_house_number_word = next_word;
           continue;
         }
       } else {
         consecutive_house_numbers = false;
       }

       // Look for location names in the words after the house number.
       // A range limitation is introduced to avoid matching
       // anything that starts with a number before a legitimate address.
       if (next_word <= kMaxLocationNameDistance &&
           IsValidLocationName(current_word)) {
         found_location_name = true;
         continue;
       }

       // Don't count the house number.
       if (next_word > kMinAddressWords) {
         // Looking for the state is likely to add new words to the list while
         // checking for multi-word state names.
         size_t state_first_word = next_word;
         size_t state_last_word, state_index;
         if (FindStateStartingInWord(&words, state_first_word, &state_last_word,
                                     &tokenizer, &state_index)) {

           // A location name should have been found at this point.
           if (!found_location_name)
             break;

           // Explicitly exclude "et al", as "al" is a valid state code.
           if (current_word_length == 2 && words.size() > 2) {
             const Word& previous_word = words[state_first_word - 1];
             if (previous_word.end - previous_word.begin == 2 &&
                 LowerCaseEqualsASCII(previous_word.begin, previous_word.end,
                                      "et") &&
                 LowerCaseEqualsASCII(current_word.begin, current_word.end,
                                      "al"))
               break;
           }

           // Extract one more word from the tokenizer if not already available.
           size_t zip_word = state_last_word + 1;
           if (zip_word == words.size()) {
             do {
               if (!tokenizer.GetNext())
                 return false;
             } while (tokenizer.token_is_delim());
             words.push_back(Word(tokenizer.token_begin(),
                             tokenizer.token_end()));
           }

           // Check the parsing validity and state range of the zip code.
           // If the zip is rejected, the loop increment resumes the scan at
           // the word right after the state.
           next_word = state_last_word;
           if (!IsZipValid(words[zip_word], state_index))
             continue;

           *start_pos = words[0].begin - begin;
           *end_pos = words[zip_word].end - begin;
           return true;
         }
       }
     }

     // Avoid skipping too many words because of a non-address number
     // at the beginning of the contents to parse.
     if (continue_on_house_number && next_house_number_word > 0) {
       it = words[next_house_number_word].begin;
     } else {
       // |next_word| can be past the last stored word here (line-limit break
       // or loop bound reached), so clamp it before indexing.
       DCHECK(!words.empty());
       next_word = std::min(next_word, words.size() - 1);
       it = words[next_word].end;
     }
   }

   return false;
 }

 }  // namespace address_parser

 }  // namespace content
	// Copyright (c) 2012 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "content/common/android/address_parser.h"

	#include "base/logging.h"
	#include "base/strings/string_util.h"
	#include "content/common/android/address_parser_internal.h"

	namespace {

	// --- Limits measured in words after the house number ---

	// A state is only looked for once more than this many words have followed
	// the house number. Raising it makes the parser miss short addresses.
	const size_t kMinAddressWords = 3;

	// A location name is only accepted if it appears within this many words
	// after the house number.
	const size_t kMaxLocationNameDistance = 4;

	// Upper bound on the number of words between the house number and the
	// state, neither of them counted.
	const size_t kMaxAddressWords = 12;

	// --- Other limits ---

	// Upper bound on the number of lines an address may span between the house
	// number and the state, neither of them counted.
	const size_t kMaxAddressLines = 5;

	// Upper bound on the length of any single word between the house number
	// and the state, neither of them counted.
	const size_t kMaxAddressNameWordLength = 25;

	// Characters that are treated as line breaks when counting address lines,
	// and that separate words in addition to whitespace. The trailing 0
	// terminates the array so it can be used to build a base::string16.
	const base::char16 kNewlineDelimiters[] = {'\n', ',', '*',
	                                           0x2022,  // Unicode bullet
	                                           0};

	} // anonymous namespace

	namespace content {

	namespace address_parser {

	using namespace internal;

	// Convenience overload working on a whole string. Searches |text| for an
	// address; when one is found, copies the matching substring into |*address|
	// and returns true. Otherwise returns false and leaves |*address| unchanged.
	bool FindAddress(const base::string16& text, base::string16* address) {
	  size_t match_begin, match_end;
	  const bool found =
	      FindAddress(text.begin(), text.end(), &match_begin, &match_end);
	  if (!found)
	    return false;

	  // Guard against an inverted range so the length can never underflow.
	  size_t match_length = 0;
	  if (match_end >= match_begin)
	    match_length = match_end - match_begin;

	  address->assign(text.substr(match_begin, match_length));
	  return true;
	}

	// Scans [begin, end) for the first span that looks like a postal address: a
	// house number followed, within the word/line limits defined at the top of
	// this file, by a word accepted by IsValidLocationName(), a state found by
	// FindStateStartingInWord() and a zip code accepted by IsZipValid() for that
	// state.
	//
	// On success returns true and stores offsets relative to |begin| in the
	// out-parameters: |*start_pos| is where the house number word begins and
	// |*end_pos| is where the zip code word ends. Returns false, leaving both
	// out-parameters untouched, when no house number can be parsed or when the
	// input runs out of tokens while a candidate is being examined.
	bool FindAddress(const base::string16::const_iterator& begin,
	                 const base::string16::const_iterator& end,
	                 size_t* start_pos,
	                 size_t* end_pos) {
	  HouseNumberParser house_number_parser;

	  // Keep going through the input string until a potential house number is
	  // detected. Start tokenizing the following words to find a valid
	  // street name within a word range. Then, find a state name followed
	  // by a valid zip code for that state. Also keep a look for any other
	  // possible house numbers to continue from in case of no match and for
	  // state names not followed by a zip code (e.g. New York, NY 10000).
	  const base::string16 newline_delimiters = kNewlineDelimiters;
	  const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters;
	  for (base::string16::const_iterator it = begin; it != end; ) {
	    Word house_number;
	    if (!house_number_parser.Parse(it, end, &house_number))
	      return false;

	    // Delimiters are returned as tokens too, so that line breaks can be
	    // counted while skipping over them.
	    String16Tokenizer tokenizer(house_number.end, end, delimiters);
	    tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);

	    // words[0] is always the house number; the following entries are the
	    // words after it, appended lazily as the tokenizer advances.
	    WordList words;
	    words.push_back(house_number);

	    bool found_location_name = false;
	    bool continue_on_house_number = true;
	    bool consecutive_house_numbers = true;
	    size_t next_house_number_word = 0;
	    size_t num_lines = 1;

	    // Don't include the house number in the word count.
	    size_t next_word = 1;
	    for (; next_word <= kMaxAddressWords + 1; ++next_word) {

	      // Extract a new word from the tokenizer.
	      if (next_word == words.size()) {
	        do {
	          if (!tokenizer.GetNext())
	            return false;

	          // Check the number of address lines.
	          if (tokenizer.token_is_delim() && newline_delimiters.find(
	              *tokenizer.token_begin()) != base::string16::npos) {
	            ++num_lines;
	          }
	        } while (tokenizer.token_is_delim());

	        // Note: on this exit the token has not been stored, so |next_word|
	        // equals words.size(); it is clamped after the loop.
	        if (num_lines > kMaxAddressLines)
	          break;

	        words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
	      }

	      // Check the word length. If too long, don't try to continue from
	      // the next house number as no address can hold this word.
	      //
	      // The word is held by value on purpose: FindStateStartingInWord() below
	      // may append to |words|, and if WordList reallocates on growth (as
	      // std::vector does) a reference into the list would dangle by the time
	      // the "et al" check reads it.
	      const Word current_word = words[next_word];
	      DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);
	      size_t current_word_length = std::distance(
	          current_word.begin, current_word.end);
	      if (current_word_length > kMaxAddressNameWordLength) {
	        continue_on_house_number = false;
	        break;
	      }

	      // Check if the new word is a valid house number.
	      if (house_number_parser.Parse(current_word.begin, current_word.end,
	          NULL)) {
	        // |consecutive_house_numbers| is still true only if every word seen
	        // so far after the initial house number was itself a house number.
	        if (consecutive_house_numbers) {
	          // Check if there is a new line between consecutive house numbers.
	          // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.."
	          if (num_lines > 1) {
	            next_house_number_word = next_word;
	            break;
	          }
	        }

	        // Keep the next candidate to resume parsing from in case of failure.
	        if (next_house_number_word == 0) {
	          next_house_number_word = next_word;
	          continue;
	        }
	      } else {
	        consecutive_house_numbers = false;
	      }

	      // Look for location names in the words after the house number.
	      // A range limitation is introduced to avoid matching
	      // anything that starts with a number before a legitimate address.
	      if (next_word <= kMaxLocationNameDistance &&
	          IsValidLocationName(current_word)) {
	        found_location_name = true;
	        continue;
	      }

	      // Don't count the house number.
	      if (next_word > kMinAddressWords) {
	        // Looking for the state is likely to add new words to the list while
	        // checking for multi-word state names.
	        size_t state_first_word = next_word;
	        size_t state_last_word, state_index;
	        if (FindStateStartingInWord(&words, state_first_word, &state_last_word,
	                                    &tokenizer, &state_index)) {

	          // A location name should have been found at this point.
	          if (!found_location_name)
	            break;

	          // Explicitly exclude "et al", as "al" is a valid state code.
	          if (current_word_length == 2 && words.size() > 2) {
	            const Word& previous_word = words[state_first_word - 1];
	            if (previous_word.end - previous_word.begin == 2 &&
	                LowerCaseEqualsASCII(previous_word.begin, previous_word.end,
	                                     "et") &&
	                LowerCaseEqualsASCII(current_word.begin, current_word.end,
	                                     "al"))
	              break;
	          }

	          // Extract one more word from the tokenizer if not already available.
	          size_t zip_word = state_last_word + 1;
	          if (zip_word == words.size()) {
	            do {
	              if (!tokenizer.GetNext())
	                return false;
	            } while (tokenizer.token_is_delim());
	            words.push_back(Word(tokenizer.token_begin(),
	                            tokenizer.token_end()));
	          }

	          // Check the parsing validity and state range of the zip code.
	          // If the zip is rejected, the loop increment resumes the scan at
	          // the word right after the state.
	          next_word = state_last_word;
	          if (!IsZipValid(words[zip_word], state_index))
	            continue;

	          *start_pos = words[0].begin - begin;
	          *end_pos = words[zip_word].end - begin;
	          return true;
	        }
	      }
	    }

	    // Avoid skipping too many words because of a non-address number
	    // at the beginning of the contents to parse.
	    if (continue_on_house_number && next_house_number_word > 0) {
	      it = words[next_house_number_word].begin;
	    } else {
	      // |next_word| can be past the last stored word here (line-limit break
	      // or loop bound reached), so clamp it before indexing.
	      DCHECK(!words.empty());
	      next_word = std::min(next_word, words.size() - 1);
	      it = words[next_word].end;
	    }
	  }

	  return false;
	}

	} // namespace address_parser

	} // namespace content