content/renderer/accessibility/ax_image_stopwords.h - chromium/src - Git at Google

 // Copyright 2020 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #ifndef CONTENT_RENDERER_ACCESSIBILITY_AX_IMAGE_STOPWORDS_H_
 #define CONTENT_RENDERER_ACCESSIBILITY_AX_IMAGE_STOPWORDS_H_

 #include "base/containers/flat_set.h"
 #include "base/no_destructor.h"
 #include "base/stl_util.h"
 #include "base/strings/string_piece.h"
 #include "content/common/content_export.h"

 namespace content {

 // Maintains a set of image stopwords and provides a function to check
 // whether or not a given word is an image stopword.
 //
 // A stopword in general is a word that's filtered out before doing
 // natural language processing. In English, common stopwords include
 // "the" or "of" - they are words that are part of grammatically correct
 // sentences but don't add any useful semantics themselves.
 //
 // This set is used as part of an algorithm to determine whether the
 // accessible label for an image (including the "alt" attribute and
 // other attributes) contains a useful description or not. For this
 // application, the set includes not only common stopwords like "the"
 // but also image-related words like "image" and "photo", because an
 // image that's just labeled with the word "photo" is essentially
 // unlabeled.
 //
 // Stopwords from all supported languages are grouped together, because
 // it's simpler to just have one set rather than to try to split by the
 // element language (which is sometimes wrong). This leads to a small
 // but acceptable number of false positives if a stopword in one language
 // is a meaningful word in another language.
 //
 // The set of supported languages should include all of the languages
 // that we can generate automatic image descriptions for. This will grow
 // over time.
 //
 // Words consisting of just one or two characters made up of letters from
 // Latin alphabets are always considered stopwords, but that doesn't
 // generalize to all languages / character sets.
 //
 // The set of stopwords was obtained by extracting the alt text of images
 // from billions of web pages, tokenizing, counting, and then manually
 // categorizing the top words, with the help of dictionaries and language
 // experts. More details in this (Google-internal) design doc:
 // http://goto.google.com/augment-existing-image-descriptions
 class CONTENT_EXPORT AXImageStopwords {
  public:
   static AXImageStopwords& GetInstance();

   // The input should be a word, after already splitting by punctuation and
   // whitespace. Returns true if the word is an image stopword.
   // Case-insensitive and language-neutral (includes words from all
   // languages).
   bool IsImageStopword(const char* utf8_string) const;

  private:
   friend base::NoDestructor<AXImageStopwords>;

   AXImageStopwords();
   ~AXImageStopwords();

   base::flat_set<base::StringPiece> stopword_set_;
 };

 }  // namespace content

 #endif  // CONTENT_RENDERER_ACCESSIBILITY_AX_IMAGE_STOPWORDS_H_
	// Copyright 2020 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#ifndef CONTENT_RENDERER_ACCESSIBILITY_AX_IMAGE_STOPWORDS_H_
	#define CONTENT_RENDERER_ACCESSIBILITY_AX_IMAGE_STOPWORDS_H_

	#include "base/containers/flat_set.h"
	#include "base/no_destructor.h"
	#include "base/stl_util.h"
	#include "base/strings/string_piece.h"
	#include "content/common/content_export.h"

	namespace content {

	// Maintains a set of image stopwords and provides a function to check
	// whether or not a given word is an image stopword.
	//
	// A stopword in general is a word that's filtered out before doing
	// natural language processing. In English, common stopwords include
	// "the" or "of" - they are words that are part of grammatically correct
	// sentences but don't add any useful semantics themselves.
	//
	// This set is used as part of an algorithm to determine whether the
	// accessible label for an image (including the "alt" attribute and
	// other attributes) contains a useful description or not. For this
	// application, the set includes not only common stopwords like "the"
	// but also image-related words like "image" and "photo", because an
	// image that's just labeled with the word "photo" is essentially
	// unlabeled.
	//
	// Stopwords from all supported languages are grouped together, because
	// it's simpler to just have one set rather than to try to split by the
	// element language (which is sometimes wrong). This leads to a small
	// but acceptable number of false positives if a stopword in one language
	// is a meaningful word in another language.
	//
	// The set of supported languages should include all of the languages
	// that we can generate automatic image descriptions for. This will grow
	// over time.
	//
	// Words consisting of just one or two characters made up of letters from
	// Latin alphabets are always considered stopwords, but that doesn't
	// generalize to all languages / character sets.
	//
	// The set of stopwords was obtained by extracting the alt text of images
	// from billions of web pages, tokenizing, counting, and then manually
	// categorizing the top words, with the help of dictionaries and language
	// experts. More details in this (Google-internal) design doc:
	// http://goto.google.com/augment-existing-image-descriptions
	class CONTENT_EXPORT AXImageStopwords {
	public:
	static AXImageStopwords& GetInstance();

	// The input should be a word, after already splitting by punctuation and
	// whitespace. Returns true if the word is an image stopword.
	// Case-insensitive and language-neutral (includes words from all
	// languages).
	bool IsImageStopword(const char* utf8_string) const;

	private:
	friend base::NoDestructor<AXImageStopwords>;

	AXImageStopwords();
	~AXImageStopwords();

	base::flat_set<base::StringPiece> stopword_set_;
	};

	} // namespace content

	#endif // CONTENT_RENDERER_ACCESSIBILITY_AX_IMAGE_STOPWORDS_H_