blob: 30fa304ba4cd7d1219b5c244d1343bd0f953ae3e [file] [log] [blame]
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "content/common/android/address_parser.h"
#include "base/logging.h"
#include "base/strings/string_util.h"
#include "content/common/android/address_parser_internal.h"
namespace {
// Tuning constants for the address search implemented by FindAddress() below.
// Word positions are counted from the house number, which is word 0.

// Minimum number of words in an address after the house number
// before a state is expected to be found. The state lookup is only attempted
// on words whose position is greater than this value.
// A value too high can miss short addresses.
const size_t kMinAddressWords = 3;
// Maximum number of words allowed in an address between the house number
// and the state, both not included. Bounds how far past the house number a
// single candidate is scanned.
const size_t kMaxAddressWords = 12;
// Maximum number of lines allowed in an address between the house number
// and the state, both not included. The line count starts at 1 and grows by
// one for every newline delimiter met while tokenizing; a candidate is
// abandoned once the count exceeds this value.
const size_t kMaxAddressLines = 5;
// Maximum length allowed for any address word between the house number
// and the state, both not included. A longer word abandons the current
// candidate and also prevents resuming from a later house number within it.
const size_t kMaxAddressNameWordLength = 25;
// Maximum number of words after the house number in which the location name
// should be found. Words further away are not checked for a location name.
const size_t kMaxLocationNameDistance = 4;
// Additional characters used as new line delimiters. Each one both separates
// words (together with the regular whitespace characters) and counts as a
// line break for the kMaxAddressLines limit. The array is zero-terminated so
// that it can be used to initialize a base::string16.
const base::char16 kNewlineDelimiters[] = {
'\n',
',',
'*',
0x2022, // Unicode bullet
0, // String terminator
};
} // anonymous namespace
namespace content {
namespace address_parser {
using namespace internal;
// Convenience overload: looks for the first address in |text| and, when one
// is found, copies the matched substring into |*address| and returns true.
// Returns false and leaves |*address| untouched when no address is found.
bool FindAddress(const base::string16& text, base::string16* address) {
  size_t match_begin;
  size_t match_end;
  if (!FindAddress(text.begin(), text.end(), &match_begin, &match_end))
    return false;

  // An inverted range yields an empty match instead of an underflowed length.
  size_t match_length = 0;
  if (match_end >= match_begin)
    match_length = match_end - match_begin;

  address->assign(text.substr(match_begin, match_length));
  return true;
}
// Scans [begin, end) for the first address: a house number, a location name
// within the next kMaxLocationNameDistance words, then a state followed by a
// zip code that is valid for that state.
//
// On success returns true and stores in |*start_pos| and |*end_pos| the
// offsets, relative to |begin|, of the start of the house number and of the
// end of the zip code. Returns false, without writing to the output
// parameters, when no house number is left to try or when the input runs out
// while a candidate address is still being matched.
bool FindAddress(const base::string16::const_iterator& begin,
                 const base::string16::const_iterator& end,
                 size_t* start_pos,
                 size_t* end_pos) {
  HouseNumberParser house_number_parser;

  // Keep going through the input string until a potential house number is
  // detected. Start tokenizing the following words to find a valid
  // street name within a word range. Then, find a state name followed
  // by a valid zip code for that state. Also keep a lookout for any other
  // possible house numbers to continue from in case of no match and for
  // state names not followed by a zip code (e.g. New York, NY 10000).
  const base::string16 newline_delimiters = kNewlineDelimiters;
  const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters;
  for (base::string16::const_iterator it = begin; it != end; ) {
    Word house_number;
    if (!house_number_parser.Parse(it, end, &house_number))
      return false;

    // Delimiters are returned as tokens so that line breaks can be counted.
    String16Tokenizer tokenizer(house_number.end, end, delimiters);
    tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);

    // |words[0]| is always the house number of the current candidate.
    WordList words;
    words.push_back(house_number);

    bool found_location_name = false;
    bool continue_on_house_number = true;
    bool consecutive_house_numbers = true;
    size_t next_house_number_word = 0;
    size_t num_lines = 1;

    // Don't include the house number in the word count.
    size_t next_word = 1;
    for (; next_word <= kMaxAddressWords + 1; ++next_word) {
      // Extract a new word from the tokenizer.
      if (next_word == words.size()) {
        do {
          if (!tokenizer.GetNext())
            return false;

          // Check the number of address lines.
          if (tokenizer.token_is_delim() && newline_delimiters.find(
              *tokenizer.token_begin()) != base::string16::npos) {
            ++num_lines;
          }
        } while (tokenizer.token_is_delim());

        if (num_lines > kMaxAddressLines)
          break;

        words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
      }

      // Copy the word instead of binding a reference into |words|: the state
      // and zip code lookups below append to |words|, and if WordList
      // reallocates on growth (as std::vector does) a reference taken here
      // would dangle by the time the "et al" check reads |current_word|.
      const Word current_word = words[next_word];

      // Check the word length. If too long, don't try to continue from
      // the next house number as no address can hold this word.
      DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);
      size_t current_word_length = std::distance(
          current_word.begin, current_word.end);
      if (current_word_length > kMaxAddressNameWordLength) {
        continue_on_house_number = false;
        break;
      }

      // Check if the new word is a valid house number.
      if (house_number_parser.Parse(current_word.begin, current_word.end,
                                    NULL)) {
        // Every word seen so far in this candidate has been a house number.
        if (consecutive_house_numbers) {
          // Check if there is a new line between consecutive house numbers.
          // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.."
          if (num_lines > 1) {
            next_house_number_word = next_word;
            break;
          }
        }

        // Keep the next candidate to resume parsing from in case of failure.
        if (next_house_number_word == 0) {
          next_house_number_word = next_word;
          continue;
        }
      } else {
        consecutive_house_numbers = false;
      }

      // Look for location names in the words after the house number.
      // A range limitation is introduced to avoid matching
      // anything that starts with a number before a legitimate address.
      if (next_word <= kMaxLocationNameDistance &&
          IsValidLocationName(current_word)) {
        found_location_name = true;
        continue;
      }

      // Don't count the house number.
      if (next_word > kMinAddressWords) {
        // Looking for the state is likely to add new words to the list while
        // checking for multi-word state names.
        size_t state_first_word = next_word;
        size_t state_last_word, state_index;
        if (FindStateStartingInWord(&words, state_first_word, &state_last_word,
                                    &tokenizer, &state_index)) {
          // A location name should have been found at this point.
          if (!found_location_name)
            break;

          // Explicitly exclude "et al", as "al" is a valid state code.
          if (current_word_length == 2 && words.size() > 2) {
            const Word& previous_word = words[state_first_word - 1];
            if (previous_word.end - previous_word.begin == 2 &&
                LowerCaseEqualsASCII(previous_word.begin, previous_word.end,
                                     "et") &&
                LowerCaseEqualsASCII(current_word.begin, current_word.end,
                                     "al"))
              break;
          }

          // Extract one more word from the tokenizer if not already available.
          size_t zip_word = state_last_word + 1;
          if (zip_word == words.size()) {
            do {
              if (!tokenizer.GetNext())
                return false;
            } while (tokenizer.token_is_delim());
            words.push_back(Word(tokenizer.token_begin(),
                                 tokenizer.token_end()));
          }

          // Check the parsing validity and state range of the zip code.
          // On failure, scanning resumes with the word after the state name.
          next_word = state_last_word;
          if (!IsZipValid(words[zip_word], state_index))
            continue;

          *start_pos = words[0].begin - begin;
          *end_pos = words[zip_word].end - begin;
          return true;
        }
      }
    }

    // Avoid skipping too many words because of a non-address number
    // at the beginning of the contents to parse.
    if (continue_on_house_number && next_house_number_word > 0) {
      it = words[next_house_number_word].begin;
    } else {
      // |next_word| may be one past the last extracted word here, so clamp it
      // before resuming right after the last word that was examined.
      DCHECK(!words.empty());
      next_word = std::min(next_word, words.size() - 1);
      it = words[next_word].end;
    }
  }

  return false;
}
} // namespace address_parser
} // namespace content