blob: 5cb679025fbd6c7785c6b4f96515ff7faea2c10a [file] [log] [blame]
// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
package org.chromium.chrome.browser.contextualsearch;
import android.text.TextUtils;
import org.chromium.base.CollectionUtil;
import org.chromium.base.VisibleForTesting;
import org.chromium.chrome.browser.contextualsearch.ContextualSearchFieldTrial.ContextualSearchSwitch;
import java.util.HashSet;
import java.util.Locale;
/**
 * Implements a simple first-cut heuristic for whether a Tap is on an entity or not.
 * This is intended to be a proof-of-concept that entities are worth tapping upon.
 * This implementation only recognizes one simple pattern that we treat as a proper noun: two
 * adjacent capitalized (camel-case) words that are not at the beginning of a sentence, in a page
 * whose detected language is in our white-list of languages that capitalize proper nouns.
 * <p>
 * This is not a robust implementation -- it's only really suitable as a strong-positive signal
 * that can sometimes be extracted from the page content. The detected language of the context is
 * checked against a white-list of languages that use camel-case for proper nouns. German is
 * excluded because it capitalizes all nouns, and languages without clear word-breaks are excluded
 * too. Note that the white-list is not strictly limited to Roman-script languages: it contains
 * Russian as well. Leveraging an on-device entity recognizer is another natural extension to this
 * idea.
 * <p>
 * The current algorithm is designed to have relatively high precision at the expense of very low
 * recall (lots of false-negatives, but patterns that are "recognized" should usually be entities).
 * To handle the low recall we pass a separate signal to Ranker to let it know whether the page was
 * even eligible for this kind of entity-recognition.
 * <p>
 * We implement suppression, but only really apply that for testing and interactive demo purposes.
 */
class ContextualSearchEntityHeuristic extends ContextualSearchHeuristic {
    private static final int INVALID_OFFSET = ContextualSearchContext.INVALID_OFFSET;

    // Languages are ordered by popularity, and vetted with a simple web search for capitalization
    // rules.
    // Do not add German!
    private static final HashSet<String> ROMAN_CAMEL_CASE_PROPER_NOUN_LANGUAGES =
            CollectionUtil.newHashSet("es", // Spanish
                    "en", // English
                    "pt", // Portuguese
                    "ru", // Russian
                    "fr", // French
                    "it" // Italian
            );

    /** Whether this heuristic is allowed to suppress a Tap at all. */
    private final boolean mIsSuppressionEnabled;
    /** The suppression condition: true when the Tap does NOT look like it is on an entity. */
    private final boolean mIsConditionSatisfied;
    /** Whether the detected language of the context is in the white-list. */
    private final boolean mIsContextCamelCaseForProperNouns;
    /** Whether the Tap matched the two-capitalized-words pattern in an eligible language. */
    private final boolean mIsProbablyEntity;

    /**
     * Constructs a heuristic to determine if the current Tap looks like it was on a name or not.
     * Suppression is enabled or disabled according to the field-trial switch.
     * @param contextualSearchContext The current {@link ContextualSearchContext} so we can figure
     *                                out the words around what has been tapped.
     */
    ContextualSearchEntityHeuristic(ContextualSearchContext contextualSearchContext) {
        this(contextualSearchContext,
                ContextualSearchFieldTrial.getSwitch(
                        ContextualSearchSwitch.IS_NOT_AN_ENTITY_SUPPRESSION_ENABLED));
    }

    /**
     * Constructs an instance for testing.
     * @param contextualSearchContext The {@link ContextualSearchContext} to analyze.
     * @param isEnabled Whether or not to enable suppression.
     * @return A new heuristic with suppression explicitly enabled or disabled.
     */
    static ContextualSearchEntityHeuristic testInstance(
            ContextualSearchContext contextualSearchContext, boolean isEnabled) {
        return new ContextualSearchEntityHeuristic(contextualSearchContext, isEnabled);
    }

    /**
     * Constructs an instance of a {@link ContextualSearchHeuristic} that provides a signal for a
     * tap that is probably on a proper noun in a language that uses camel-case capitalization of
     * proper nouns.
     * @param contextualSearchContext The current {@link ContextualSearchContext} so we can detect
     *                                the language and figure out the words around what has been
     *                                tapped.
     * @param isEnabled Whether or not to enable suppression.
     */
    private ContextualSearchEntityHeuristic(
            ContextualSearchContext contextualSearchContext, boolean isEnabled) {
        mIsSuppressionEnabled = isEnabled;
        mIsContextCamelCaseForProperNouns =
                isContextCamelCaseForProperNouns(contextualSearchContext);
        // Only look at the words when the language is eligible.
        mIsProbablyEntity = mIsContextCamelCaseForProperNouns
                && isTapOnTwoCamelCaseWordsMidSentence(contextualSearchContext);
        // The suppression condition is "not an entity".
        mIsConditionSatisfied = !mIsProbablyEntity;
    }

    @Override
    protected boolean isConditionSatisfiedAndEnabled() {
        return mIsSuppressionEnabled && mIsConditionSatisfied;
    }

    @Override
    protected void logResultsSeen(boolean wasSearchContentViewSeen, boolean wasActivatedByTap) {
        if (wasActivatedByTap) {
            // The second argument is "was this a Tap on an entity", i.e. the negated condition.
            ContextualSearchUma.logTapOnEntitySeen(
                    wasSearchContentViewSeen, !mIsConditionSatisfied);
        }
    }

    @Override
    protected void logRankerTapSuppression(ContextualSearchInteractionRecorder logger) {
        logger.logFeature(ContextualSearchInteractionRecorder.Feature.IS_ENTITY, mIsProbablyEntity);
        // Also record eligibility so a negative IS_ENTITY can be told apart from "not eligible".
        logger.logFeature(ContextualSearchInteractionRecorder.Feature.IS_ENTITY_ELIGIBLE,
                mIsContextCamelCaseForProperNouns);
    }

    /**
     * @return Whether the Tap was classified as probably being on an entity.
     */
    @VisibleForTesting
    protected boolean isProbablyEntityBasedOnCamelCase() {
        return mIsProbablyEntity;
    }

    /**
     * Checks whether the tapped word is either the first or the second word of a two-word
     * capitalized sequence that does not start a sentence.
     * @param contextualSearchContext The {@link ContextualSearchContext} for the current Tap.
     * @return Whether the tap is on a proper noun, based on two camel-case words mid-sentence.
     */
    private static boolean isTapOnTwoCamelCaseWordsMidSentence(
            ContextualSearchContext contextualSearchContext) {
        // Check common cases that we can quickly reject.
        String tappedWord = contextualSearchContext.getWordTapped();
        if (TextUtils.isEmpty(tappedWord) || !isCapitalizedCamelCase(tappedWord)) {
            return false;
        }

        // Check if the tapped word is the first word of a two-word entity.
        if (isTwoWordCamelCaseSpaceSeparatedEntity(contextualSearchContext, tappedWord,
                    contextualSearchContext.getWordTappedOffset(),
                    contextualSearchContext.getWordFollowingTap(),
                    contextualSearchContext.getWordFollowingTapOffset())) {
            return true;
        }

        // Otherwise the tapped word needs to be the second word of a two-word entity.
        return isTwoWordCamelCaseSpaceSeparatedEntity(contextualSearchContext,
                contextualSearchContext.getWordPreviousToTap(),
                contextualSearchContext.getWordPreviousToTapOffset(), tappedWord,
                contextualSearchContext.getWordTappedOffset());
    }

    /**
     * Considers whether the given words at the given offsets are probably a two-word entity based
     * on our simple rules for capitalization: both camel-case, separated by exactly one whitespace
     * character, and with the first word not preceded by whitespace that follows a character that
     * commonly ends a sentence.
     * @param contextualSearchContext The {@link ContextualSearchContext} that the words came from.
     * @param firstWord The first word of a possible entity. May be {@code null} or empty, in which
     *                  case the result is {@code false}.
     * @param firstWordOffset The offset of the first word.
     * @param secondWord The second word of a possible entity. May be {@code null} or empty, in
     *                   which case the result is {@code false}.
     * @param secondWordOffset The offset of the second word.
     * @return Whether the words are probably an entity.
     */
    private static boolean isTwoWordCamelCaseSpaceSeparatedEntity(
            ContextualSearchContext contextualSearchContext, String firstWord, int firstWordOffset,
            String secondWord, int secondWordOffset) {
        if (!isCapitalizedCamelCase(firstWord) || !isCapitalizedCamelCase(secondWord)) return false;

        // The first word must not start a sentence.
        if (!isWhitespaceThatDoesntEndASentence(contextualSearchContext, firstWordOffset - 1)) {
            return false;
        }

        // Check that there's just one separator character.
        if (firstWordOffset + firstWord.length() + 1 != secondWordOffset) return false;

        // Check that it's whitespace.
        return isWhitespaceAtOffset(contextualSearchContext, secondWordOffset - 1);
    }

    /**
     * Scans backwards over whitespace starting from the given offset in the given context, and
     * inspects the first non-whitespace character found.
     * <p>
     * Note that the character at the given offset is not required to be whitespace: if it is not,
     * that character itself is the one inspected. Index 0 of the surrounding text is never
     * inspected, so a scan that reaches the very start of the text is conservatively treated as
     * being at the start of a sentence.
     * @param contextualSearchContext The {@link ContextualSearchContext} providing the text.
     * @param offset The offset to start scanning from, typically the index just before a word.
     *               Offsets outside of the surrounding text yield {@code false}.
     * @return Whether the text leading up to the given offset does not end a sentence.
     */
    private static boolean isWhitespaceThatDoesntEndASentence(
            ContextualSearchContext contextualSearchContext, int offset) {
        CharSequence surroundingText = contextualSearchContext.getSurroundingText();
        // Guard the charAt() below: an offset beyond the text can't tell us anything.
        if (TextUtils.isEmpty(surroundingText) || offset >= surroundingText.length()) return false;

        int whitespaceScanOffset = offset;
        while (whitespaceScanOffset > 0
                && isWhitespaceAtOffset(contextualSearchContext, whitespaceScanOffset)) {
            --whitespaceScanOffset;
        }
        return whitespaceScanOffset > 0
                && !isEndOfSentenceChar(surroundingText.charAt(whitespaceScanOffset));
    }

    /**
     * Determines if the given character is used as an end-of-sentence character when followed by
     * whitespace.
     * Warning! This functionality has not been verified in languages other than
     * English, even though we do apply it for a wide range of languages from our white-list for ML
     * purposes only.
     * @param c The character to classify.
     * @return Whether the given character is often used to end a sentence when followed by
     *         whitespace.
     */
    private static boolean isEndOfSentenceChar(char c) {
        return c == '.' || c == '?' || c == '!' || c == ':';
    }

    /**
     * @param word The word to check; may be {@code null}.
     * @return {@code true} if the word has at least two characters, starts with an upper-case
     *         letter, and changes when converted to upper-case (i.e. is not already all
     *         upper-case).
     */
    private static boolean isCapitalizedCamelCase(String word) {
        if (TextUtils.isEmpty(word) || word.length() <= 1) return false;
        char firstChar = word.charAt(0);
        return Character.isUpperCase(firstChar)
                && !word.toUpperCase(Locale.getDefault()).equals(word);
    }

    /**
     * Safely checks for whitespace in the surrounding text. Callers pass computed positions such
     * as {@code wordOffset - 1}, which a comparison against the invalid-offset sentinel alone
     * cannot catch, so any offset outside of the surrounding text is treated as "not whitespace"
     * instead of throwing.
     * @param contextualSearchContext The {@link ContextualSearchContext} providing the text.
     * @param offset The offset into the surrounding text to check.
     * @return Whether the surrounding text has a whitespace character at the given offset.
     */
    private static boolean isWhitespaceAtOffset(
            ContextualSearchContext contextualSearchContext, int offset) {
        if (offset == INVALID_OFFSET || offset < 0) return false;
        CharSequence surroundingText = contextualSearchContext.getSurroundingText();
        if (TextUtils.isEmpty(surroundingText) || offset >= surroundingText.length()) return false;
        return Character.isWhitespace(surroundingText.charAt(offset));
    }

    /**
     * Detects the language of the Context and returns whether that language uses camel-case for
     * proper nouns.
     * @param contextualSearchContext The {@link ContextualSearchContext} whose detected language
     *                                is looked up in the white-list.
     * @return Whether the language of the Context uses "camel" case (mixed upper and lower case)
     *         for proper nouns.
     */
    private static boolean isContextCamelCaseForProperNouns(
            ContextualSearchContext contextualSearchContext) {
        return ROMAN_CAMEL_CASE_PROPER_NOUN_LANGUAGES.contains(
                contextualSearchContext.getDetectedLanguage());
    }
}