| // Copyright 2017 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| package org.chromium.chrome.browser.contextualsearch; |
| |
| import android.text.TextUtils; |
| |
| import org.chromium.base.CollectionUtil; |
| import org.chromium.base.VisibleForTesting; |
| import org.chromium.chrome.browser.contextualsearch.ContextualSearchFieldTrial.ContextualSearchSwitch; |
| |
| import java.util.HashSet; |
| import java.util.Locale; |
| |
/**
 * Implements a simple first-cut heuristic for whether a Tap is on an entity or not.
 * This is intended to be a proof-of-concept that entities are worth tapping upon.
 * This implementation only recognizes one simple pattern that we recognize as a proper noun: two
 * camel-case words that are not at the beginning of a sentence, in a page that we think is in a
 * Roman language that uses camel-case for proper nouns.
 * <p>
 * This is not a robust implementation -- it's only really suitable as a strong-positive signal
 * that can sometimes be extracted from the page content. This implementation uses the CLD to
 * determine if the page is really in a white-list of languages that use camel-case for proper
 * nouns, excluding German because it capitalizes all nouns, and excluding languages that do not
 * have clear word-breaks. Leveraging an on-device entity recognizer is another natural extension
 * to this idea.
 * <p>
 * The current algorithm is designed to have relatively high precision at the expense of very low
 * recall (lots of false-negatives, but patterns that are "recognized" should usually be entities).
 * To handle the low recall we pass a separate signal to Ranker to let it know whether the page was
 * even eligible for this kind of entity-recognition.
 * <p>
 * We implement suppression, but only really apply that for testing and interactive demo purposes.
 */
| class ContextualSearchEntityHeuristic extends ContextualSearchHeuristic { |
| private static final int INVALID_OFFSET = ContextualSearchContext.INVALID_OFFSET; |
| |
| // Languages are ordered by popularity, and vetted with a simple web search for capitalization |
| // rules. |
| // Do not add German! |
| private static final HashSet<String> ROMAN_CAMEL_CASE_PROPER_NOUN_LANGUAGES = |
| CollectionUtil.newHashSet("es", // Spanish |
| "en", // English, |
| "pt", // Portuguese |
| "ru", // Russian |
| "fr", // French, |
| "it" // Italian |
| ); |
| |
| private final boolean mIsSuppressionEnabled; |
| private final boolean mIsConditionSatisfied; |
| private final boolean mIsContextCamelCaseForProperNouns; |
| private final boolean mIsProbablyEntity; |
| |
| /** |
| * Constructs a heuristic to determine if the current Tap looks like it was on a name or not. |
| * @param contextualSearchContext The current {@link ContextualSearchContext} so we can figure |
| * out the words around what has been tapped. |
| */ |
| ContextualSearchEntityHeuristic(ContextualSearchContext contextualSearchContext) { |
| this(contextualSearchContext, |
| ContextualSearchFieldTrial.getSwitch( |
| ContextualSearchSwitch.IS_NOT_AN_ENTITY_SUPPRESSION_ENABLED)); |
| } |
| |
| /** |
| * Constructs an instance for testing. |
| */ |
| static ContextualSearchEntityHeuristic testInstance( |
| ContextualSearchContext contextualSearchContext, boolean isEnabled) { |
| return new ContextualSearchEntityHeuristic(contextualSearchContext, isEnabled); |
| } |
| |
| /** |
| * Constructs an instance of a {@link ContextualSearchHeuristic} that provides a signal for a |
| * tap that is probably on a proper noun in a language that uses camel-case capitalization of |
| * proper nouns. |
| * @param contextualSearchContext The current {@link ContextualSearchContext} so we can detect |
| * the language and figure out the words around what has been |
| * tapped. |
| * @param isEnabled Whether or not to enable suppression. |
| */ |
| private ContextualSearchEntityHeuristic( |
| ContextualSearchContext contextualSearchContext, boolean isEnabled) { |
| mIsSuppressionEnabled = isEnabled; |
| mIsContextCamelCaseForProperNouns = |
| isContextCamelCaseForProperNouns(contextualSearchContext); |
| mIsProbablyEntity = mIsContextCamelCaseForProperNouns |
| && isTapOnTwoCamelCaseWordsMidSentence(contextualSearchContext); |
| mIsConditionSatisfied = !mIsProbablyEntity; |
| } |
| |
| @Override |
| protected boolean isConditionSatisfiedAndEnabled() { |
| return mIsSuppressionEnabled && mIsConditionSatisfied; |
| } |
| |
| @Override |
| protected void logResultsSeen(boolean wasSearchContentViewSeen, boolean wasActivatedByTap) { |
| if (wasActivatedByTap) { |
| ContextualSearchUma.logTapOnEntitySeen( |
| wasSearchContentViewSeen, !mIsConditionSatisfied); |
| } |
| } |
| |
| @Override |
| protected void logRankerTapSuppression(ContextualSearchInteractionRecorder logger) { |
| logger.logFeature(ContextualSearchInteractionRecorder.Feature.IS_ENTITY, mIsProbablyEntity); |
| logger.logFeature(ContextualSearchInteractionRecorder.Feature.IS_ENTITY_ELIGIBLE, |
| mIsContextCamelCaseForProperNouns); |
| } |
| |
| @VisibleForTesting |
| protected boolean isProbablyEntityBasedOnCamelCase() { |
| return mIsProbablyEntity; |
| } |
| |
| /** |
| * @return Whether the tap is on a proper noun, based on two camel-case words mid-sentence. |
| */ |
| private boolean isTapOnTwoCamelCaseWordsMidSentence( |
| ContextualSearchContext contextualSearchContext) { |
| // Check common cases that we can quickly reject. |
| String tappedWord = contextualSearchContext.getWordTapped(); |
| if (TextUtils.isEmpty(tappedWord) |
| || !isCapitalizedCamelCase(tappedWord)) { |
| return false; |
| } |
| |
| // Check if the tapped word is the first word of a two-word entity. |
| if (isTwoWordCamelCaseSpaceSeparatedEntity(contextualSearchContext, tappedWord, |
| contextualSearchContext.getWordTappedOffset(), |
| contextualSearchContext.getWordFollowingTap(), |
| contextualSearchContext.getWordFollowingTapOffset())) { |
| return true; |
| } |
| |
| // Otherwise the tapped word needs to be the second word of a two-word entity. |
| return isTwoWordCamelCaseSpaceSeparatedEntity(contextualSearchContext, |
| contextualSearchContext.getWordPreviousToTap(), |
| contextualSearchContext.getWordPreviousToTapOffset(), tappedWord, |
| contextualSearchContext.getWordTappedOffset()); |
| } |
| |
| /** |
| * Considers whether the given words at the given offsets are probably a two-word entity based |
| * on our simple rules for capitalization: both camel-case and separated by just a single space |
| * and not following whitespace that precedes a character that commonly ends a sentence. |
| * @param contextualSearchContext The {@link ContextualSearchContext} that the words came from. |
| * @param firstWord The first word of a possible entity. |
| * @param firstWordOffset The offset of the first word. |
| * @param secondWord The second word of a possible entity. |
| * @param secondWordOffset The offset of the second word. |
| * @return Whether the words are probably an entity. |
| */ |
| private boolean isTwoWordCamelCaseSpaceSeparatedEntity( |
| ContextualSearchContext contextualSearchContext, String firstWord, int firstWordOffset, |
| String secondWord, int secondWordOffset) { |
| if (!isCapitalizedCamelCase(firstWord) || !isCapitalizedCamelCase(secondWord)) return false; |
| |
| if (!isWhitespaceThatDoesntEndASentence(contextualSearchContext, firstWordOffset - 1)) { |
| return false; |
| } |
| |
| // Check that there's just one separator character. |
| if (firstWordOffset + firstWord.length() + 1 != secondWordOffset) return false; |
| |
| // Check that it's whitespace. |
| return isWhitespaceAtOffset(contextualSearchContext, secondWordOffset - 1); |
| } |
| |
| /** |
| * Scans previous characters starting from the given offset in the given context. |
| * @return Whether there is whitespace that doesn't end a sentence at the given offset. |
| */ |
| private boolean isWhitespaceThatDoesntEndASentence( |
| ContextualSearchContext contextualSearchContext, int offset) { |
| int whitespaceScanOffset = offset; |
| while (whitespaceScanOffset > 0 |
| && isWhitespaceAtOffset(contextualSearchContext, whitespaceScanOffset)) { |
| --whitespaceScanOffset; |
| } |
| return whitespaceScanOffset > 0 |
| && !isEndOfSentenceChar(contextualSearchContext.getSurroundingText().charAt( |
| whitespaceScanOffset)); |
| } |
| |
| /** |
| * Determines if the given character is used as an end-of-sentence character when followed by |
| * whitespace. |
| * Warning! This functionality has not been verified in languages other than |
| * English, even though we do apply it for a wide range of languages from our white-list for ML |
| * purposes only. |
| * @return Whether the given character is often used to end a sentence when followed by |
| * whitespace. |
| */ |
| private boolean isEndOfSentenceChar(char c) { |
| return c == '.' || c == '?' || c == '!' || c == ':'; |
| } |
| |
| /** |
| * @return {@code true} if the word starts with an upper-case letter and has at least one letter |
| * that is not considered upper-case. |
| */ |
| private boolean isCapitalizedCamelCase(String word) { |
| if (TextUtils.isEmpty(word) || word.length() <= 1) return false; |
| |
| Character firstChar = word.charAt(0); |
| return Character.isUpperCase(firstChar) |
| && !word.toUpperCase(Locale.getDefault()).equals(word); |
| } |
| |
| /** |
| * @return Whether the surrounding text has a whitespace character at the given offset. |
| */ |
| private boolean isWhitespaceAtOffset( |
| ContextualSearchContext contextualSearchContext, int offset) { |
| if (offset == INVALID_OFFSET) return false; |
| |
| Character charAtOffset = contextualSearchContext.getSurroundingText().charAt(offset); |
| return Character.isWhitespace(charAtOffset); |
| } |
| |
| /** |
| * Detects the language of the Context and returns whether that language uses camel-case for |
| * proper nouns. |
| * @return Whether the language of the Context uses "camel" case (mixed upper and lower case) |
| * for proper nouns. |
| */ |
| private boolean isContextCamelCaseForProperNouns( |
| ContextualSearchContext contextualSearchContext) { |
| return ROMAN_CAMEL_CASE_PROPER_NOUN_LANGUAGES.contains( |
| contextualSearchContext.getDetectedLanguage()); |
| } |
| } |