components/dom_distiller/core/page_features.cc - chromium/src - Git at Google

 // Copyright 2015 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "components/dom_distiller/core/page_features.h"

 #include <stddef.h>

 #include <memory>
 #include <string>

 #include "base/json/json_reader.h"
 #include "third_party/re2/src/re2/re2.h"
 #include "url/gurl.h"

 namespace dom_distiller {
 /* This code needs to derive features in the same way and order in which they
  * are derived when training the model. Parts of that code are reproduced in the
  * comments below.
  */

 namespace {

 std::string GetLastSegment(const std::string& path) {
   // return re.search('[^/]*\/?$', path).group(0)
   if (path.size() == 0)
     return "";
   if (path.size() == 1) {
     DCHECK(path[0] == '/');
     return path;
   }
   size_t start = path.rfind("/", path.size() - 2);
   return start == std::string::npos ? "" : path.substr(start + 1);
 }

 int CountMatches(const std::string& s, const std::string& p) {
   // return len(re.findall(p, s))
   re2::StringPiece sp(s);
   re2::RE2 regexp(p);
   int count = 0;
   while (re2::RE2::FindAndConsume(&sp, regexp))
     count++;
   return count;
 }

 int GetWordCount(const std::string& s) {
   return CountMatches(s, "\\w+");
 }

 bool Contains(const std::string& n, const std::string& h) {
   return h.find(n) != std::string::npos;
 }

 bool EndsWith(const std::string& t, const std::string& s) {
   return s.size() >= t.size() &&
          s.compare(s.size() - t.size(), std::string::npos, t) == 0;
 }

 }  // namespace

 int kDerivedFeaturesCount = 29;

 std::vector<double> CalculateDerivedFeatures(bool isOGArticle,
                                              const GURL& url,
                                              double numElements,
                                              double numAnchors,
                                              double numForms,
                                              const std::string& innerText,
                                              const std::string& textContent,
                                              const std::string& innerHTML) {
   // In the training pipeline, the strings are explicitly encoded in utf-8 (as
   // they are here).
   const std::string& path = url.path();
   int innerTextWords = GetWordCount(innerText);
   int textContentWords = GetWordCount(textContent);
   int innerHTMLWords = GetWordCount(innerHTML);
   std::vector<double> features;
   // 'opengraph', opengraph,
   features.push_back(isOGArticle);
   // 'forum', 'forum' in path,
   features.push_back(Contains("forum", path));
   // 'index', 'index' in path,
   features.push_back(Contains("index", path));
   // 'view', 'view' in path,
   features.push_back(Contains("view", path));
   // 'asp', '.asp' in path,
   features.push_back(Contains(".asp", path));
   // 'phpbb', 'phpbb' in path,
   features.push_back(Contains("phpbb", path));
   // 'php', path.endswith('.php'),
   features.push_back(EndsWith(".php", path));
   // 'pathlength', len(path),
   features.push_back(path.size());
   // 'domain', len(path) < 2,
   features.push_back(path.size() < 2);
   // 'pathcomponents', CountMatches(path, r'\/.'),
   features.push_back(CountMatches(path, "\\/."));
   // 'slugdetector', CountMatches(path, r'[^\w/]'),
   features.push_back(CountMatches(path, "[^\\w/]"));
   // 'pathnumbers', CountMatches(path, r'\d+'),
   features.push_back(CountMatches(path, "\\d+"));
   // 'lastSegmentLength', len(GetLastSegment(path)),
   features.push_back(GetLastSegment(path).size());
   // 'formcount', numForms,
   features.push_back(numForms);
   // 'anchorcount', numAnchors,
   features.push_back(numAnchors);
   // 'elementcount', numElements,
   features.push_back(numElements);
   // 'anchorratio', float(numAnchors) / max(1, numElements),
   features.push_back(double(numAnchors) / std::max<double>(1, numElements));
   // 'innertextlength', len(innerText),
   features.push_back(innerText.size());
   // 'textcontentlength', len(textContent),
   features.push_back(textContent.size());
   // 'innerhtmllength', len(innerHTML),
   features.push_back(innerHTML.size());
   // 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)),
   features.push_back(double(innerText.size()) /
                      std::max<double>(1.0, innerHTML.size()));
   // 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)),
   features.push_back(double(textContent.size()) /
                      std::max<double>(1.0, innerHTML.size()));
   // 'innertexttextcontentlengthratio',
   // float(len(innerText)) / max(1, len(textContent)),
   features.push_back(double(innerText.size()) /
                      std::max<double>(1.0, textContent.size()));
   // 'innertextwordcount', innerTextWords,
   features.push_back(innerTextWords);
   // 'textcontentwordcount', textContentWords,
   features.push_back(textContentWords);
   // 'innerhtmlwordcount', innerHTMLWords,
   features.push_back(innerHTMLWords);
   // 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
   features.push_back(double(innerTextWords) /
                      std::max<int>(1.0, innerHTMLWords));
   // 'textcontentwordcountratio',
   // float(textContentWords) / max(1, innerHTMLWords),
   features.push_back(double(textContentWords) /
                      std::max<int>(1.0, innerHTMLWords));
   // 'innertexttextcontentwordcountratio',
   // float(innerTextWords) / max(1, textContentWords),
   features.push_back(double(innerTextWords) /
                      std::max<int>(1.0, textContentWords));
   return features;
 }

 std::vector<double> CalculateDerivedFeaturesFromJSON(
     const base::Value* stringified_json) {
   std::string stringified;
   if (!stringified_json->GetAsString(&stringified)) {
     return std::vector<double>();
   }

   std::unique_ptr<base::Value> json = base::JSONReader::Read(stringified);
   if (!json) {
     return std::vector<double>();
   }

   const base::DictionaryValue* dict;
   if (!json->GetAsDictionary(&dict)) {
     return std::vector<double>();
   }

   bool isOGArticle = false;
   std::string url, innerText, textContent, innerHTML;
   double numElements = 0.0, numAnchors = 0.0, numForms = 0.0;

   if (!(dict->GetBoolean("opengraph", &isOGArticle) &&
         dict->GetString("url", &url) &&
         dict->GetDouble("numElements", &numElements) &&
         dict->GetDouble("numAnchors", &numAnchors) &&
         dict->GetDouble("numForms", &numForms) &&
         dict->GetString("innerText", &innerText) &&
         dict->GetString("textContent", &textContent) &&
         dict->GetString("innerHTML", &innerHTML))) {
     return std::vector<double>();
   }

   GURL parsed_url(url);
   if (!parsed_url.is_valid()) {
     return std::vector<double>();
   }

   return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements,
                                   numAnchors, numForms, innerText, textContent,
                                   innerHTML);
 }

 std::vector<double> CalculateDerivedFeatures(
     bool openGraph,
     const GURL& url,
     unsigned elementCount,
     unsigned anchorCount,
     unsigned formCount,
     double mozScore,
     double mozScoreAllSqrt,
     double mozScoreAllLinear) {
   const std::string& path = url.path();
   std::vector<double> features;
   // 'opengraph', opengraph,
   features.push_back(openGraph);
   // 'forum', 'forum' in path,
   features.push_back(Contains("forum", path));
   // 'index', 'index' in path,
   features.push_back(Contains("index", path));
   // 'search', 'search' in path,
   features.push_back(Contains("search", path));
   // 'view', 'view' in path,
   features.push_back(Contains("view", path));
   // 'archive', 'archive' in path,
   features.push_back(Contains("archive", path));
   // 'asp', '.asp' in path,
   features.push_back(Contains(".asp", path));
   // 'phpbb', 'phpbb' in path,
   features.push_back(Contains("phpbb", path));
   // 'php', path.endswith('.php'),
   features.push_back(EndsWith(".php", path));
   // 'pathLength', len(path),
   features.push_back(path.size());
   // 'domain', len(path) < 2,
   features.push_back(path.size() < 2);
   // 'pathComponents', CountMatches(path, r'\/.'),
   features.push_back(CountMatches(path, "\\/."));
   // 'slugDetector', CountMatches(path, r'[^\w/]'),
   features.push_back(CountMatches(path, "[^\\w/]"));
   // 'pathNumbers', CountMatches(path, r'\d+'),
   features.push_back(CountMatches(path, "\\d+"));
   // 'lastSegmentLength', len(GetLastSegment(path)),
   features.push_back(GetLastSegment(path).size());
   // 'formCount', numForms,
   features.push_back(formCount);
   // 'anchorCount', numAnchors,
   features.push_back(anchorCount);
   // 'elementCount', numElements,
   features.push_back(elementCount);
   // 'anchorRatio', float(numAnchors) / max(1, numElements),
   features.push_back(
       double(anchorCount) / std::max<double>(1, elementCount));
   // 'mozScore'
   features.push_back(mozScore);
   // 'mozScoreAllSqrt'
   features.push_back(mozScoreAllSqrt);
   // 'mozScoreAllLinear'
   features.push_back(mozScoreAllLinear);

   return features;
 }

 }  // namespace dom_distiller
	// Copyright 2015 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "components/dom_distiller/core/page_features.h"

	#include <stddef.h>

	#include <memory>
	#include <string>

	#include "base/json/json_reader.h"
	#include "third_party/re2/src/re2/re2.h"
	#include "url/gurl.h"

	namespace dom_distiller {
	/* This code needs to derive features in the same way and order in which they
	* are derived when training the model. Parts of that code are reproduced in the
	* comments below.
	*/

	namespace {

	std::string GetLastSegment(const std::string& path) {
	// return re.search('[^/]*\/?$', path).group(0)
	if (path.size() == 0)
	return "";
	if (path.size() == 1) {
	DCHECK(path[0] == '/');
	return path;
	}
	size_t start = path.rfind("/", path.size() - 2);
	return start == std::string::npos ? "" : path.substr(start + 1);
	}

	int CountMatches(const std::string& s, const std::string& p) {
	// return len(re.findall(p, s))
	re2::StringPiece sp(s);
	re2::RE2 regexp(p);
	int count = 0;
	while (re2::RE2::FindAndConsume(&sp, regexp))
	count++;
	return count;
	}

	int GetWordCount(const std::string& s) {
	return CountMatches(s, "\\w+");
	}

	bool Contains(const std::string& n, const std::string& h) {
	return h.find(n) != std::string::npos;
	}

	bool EndsWith(const std::string& t, const std::string& s) {
	return s.size() >= t.size() &&
	s.compare(s.size() - t.size(), std::string::npos, t) == 0;
	}

	} // namespace

	int kDerivedFeaturesCount = 29;

	std::vector<double> CalculateDerivedFeatures(bool isOGArticle,
	const GURL& url,
	double numElements,
	double numAnchors,
	double numForms,
	const std::string& innerText,
	const std::string& textContent,
	const std::string& innerHTML) {
	// In the training pipeline, the strings are explicitly encoded in utf-8 (as
	// they are here).
	const std::string& path = url.path();
	int innerTextWords = GetWordCount(innerText);
	int textContentWords = GetWordCount(textContent);
	int innerHTMLWords = GetWordCount(innerHTML);
	std::vector<double> features;
	// 'opengraph', opengraph,
	features.push_back(isOGArticle);
	// 'forum', 'forum' in path,
	features.push_back(Contains("forum", path));
	// 'index', 'index' in path,
	features.push_back(Contains("index", path));
	// 'view', 'view' in path,
	features.push_back(Contains("view", path));
	// 'asp', '.asp' in path,
	features.push_back(Contains(".asp", path));
	// 'phpbb', 'phpbb' in path,
	features.push_back(Contains("phpbb", path));
	// 'php', path.endswith('.php'),
	features.push_back(EndsWith(".php", path));
	// 'pathlength', len(path),
	features.push_back(path.size());
	// 'domain', len(path) < 2,
	features.push_back(path.size() < 2);
	// 'pathcomponents', CountMatches(path, r'\/.'),
	features.push_back(CountMatches(path, "\\/."));
	// 'slugdetector', CountMatches(path, r'[^\w/]'),
	features.push_back(CountMatches(path, "[^\\w/]"));
	// 'pathnumbers', CountMatches(path, r'\d+'),
	features.push_back(CountMatches(path, "\\d+"));
	// 'lastSegmentLength', len(GetLastSegment(path)),
	features.push_back(GetLastSegment(path).size());
	// 'formcount', numForms,
	features.push_back(numForms);
	// 'anchorcount', numAnchors,
	features.push_back(numAnchors);
	// 'elementcount', numElements,
	features.push_back(numElements);
	// 'anchorratio', float(numAnchors) / max(1, numElements),
	features.push_back(double(numAnchors) / std::max<double>(1, numElements));
	// 'innertextlength', len(innerText),
	features.push_back(innerText.size());
	// 'textcontentlength', len(textContent),
	features.push_back(textContent.size());
	// 'innerhtmllength', len(innerHTML),
	features.push_back(innerHTML.size());
	// 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)),
	features.push_back(double(innerText.size()) /
	std::max<double>(1.0, innerHTML.size()));
	// 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)),
	features.push_back(double(textContent.size()) /
	std::max<double>(1.0, innerHTML.size()));
	// 'innertexttextcontentlengthratio',
	// float(len(innerText)) / max(1, len(textContent)),
	features.push_back(double(innerText.size()) /
	std::max<double>(1.0, textContent.size()));
	// 'innertextwordcount', innerTextWords,
	features.push_back(innerTextWords);
	// 'textcontentwordcount', textContentWords,
	features.push_back(textContentWords);
	// 'innerhtmlwordcount', innerHTMLWords,
	features.push_back(innerHTMLWords);
	// 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
	features.push_back(double(innerTextWords) /
	std::max<int>(1.0, innerHTMLWords));
	// 'textcontentwordcountratio',
	// float(textContentWords) / max(1, innerHTMLWords),
	features.push_back(double(textContentWords) /
	std::max<int>(1.0, innerHTMLWords));
	// 'innertexttextcontentwordcountratio',
	// float(innerTextWords) / max(1, textContentWords),
	features.push_back(double(innerTextWords) /
	std::max<int>(1.0, textContentWords));
	return features;
	}

	std::vector<double> CalculateDerivedFeaturesFromJSON(
	const base::Value* stringified_json) {
	std::string stringified;
	if (!stringified_json->GetAsString(&stringified)) {
	return std::vector<double>();
	}

	std::unique_ptr<base::Value> json = base::JSONReader::Read(stringified);
	if (!json) {
	return std::vector<double>();
	}

	const base::DictionaryValue* dict;
	if (!json->GetAsDictionary(&dict)) {
	return std::vector<double>();
	}

	bool isOGArticle = false;
	std::string url, innerText, textContent, innerHTML;
	double numElements = 0.0, numAnchors = 0.0, numForms = 0.0;

	if (!(dict->GetBoolean("opengraph", &isOGArticle) &&
	dict->GetString("url", &url) &&
	dict->GetDouble("numElements", &numElements) &&
	dict->GetDouble("numAnchors", &numAnchors) &&
	dict->GetDouble("numForms", &numForms) &&
	dict->GetString("innerText", &innerText) &&
	dict->GetString("textContent", &textContent) &&
	dict->GetString("innerHTML", &innerHTML))) {
	return std::vector<double>();
	}

	GURL parsed_url(url);
	if (!parsed_url.is_valid()) {
	return std::vector<double>();
	}

	return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements,
	numAnchors, numForms, innerText, textContent,
	innerHTML);
	}

	std::vector<double> CalculateDerivedFeatures(
	bool openGraph,
	const GURL& url,
	unsigned elementCount,
	unsigned anchorCount,
	unsigned formCount,
	double mozScore,
	double mozScoreAllSqrt,
	double mozScoreAllLinear) {
	const std::string& path = url.path();
	std::vector<double> features;
	// 'opengraph', opengraph,
	features.push_back(openGraph);
	// 'forum', 'forum' in path,
	features.push_back(Contains("forum", path));
	// 'index', 'index' in path,
	features.push_back(Contains("index", path));
	// 'search', 'search' in path,
	features.push_back(Contains("search", path));
	// 'view', 'view' in path,
	features.push_back(Contains("view", path));
	// 'archive', 'archive' in path,
	features.push_back(Contains("archive", path));
	// 'asp', '.asp' in path,
	features.push_back(Contains(".asp", path));
	// 'phpbb', 'phpbb' in path,
	features.push_back(Contains("phpbb", path));
	// 'php', path.endswith('.php'),
	features.push_back(EndsWith(".php", path));
	// 'pathLength', len(path),
	features.push_back(path.size());
	// 'domain', len(path) < 2,
	features.push_back(path.size() < 2);
	// 'pathComponents', CountMatches(path, r'\/.'),
	features.push_back(CountMatches(path, "\\/."));
	// 'slugDetector', CountMatches(path, r'[^\w/]'),
	features.push_back(CountMatches(path, "[^\\w/]"));
	// 'pathNumbers', CountMatches(path, r'\d+'),
	features.push_back(CountMatches(path, "\\d+"));
	// 'lastSegmentLength', len(GetLastSegment(path)),
	features.push_back(GetLastSegment(path).size());
	// 'formCount', numForms,
	features.push_back(formCount);
	// 'anchorCount', numAnchors,
	features.push_back(anchorCount);
	// 'elementCount', numElements,
	features.push_back(elementCount);
	// 'anchorRatio', float(numAnchors) / max(1, numElements),
	features.push_back(
	double(anchorCount) / std::max<double>(1, elementCount));
	// 'mozScore'
	features.push_back(mozScore);
	// 'mozScoreAllSqrt'
	features.push_back(mozScoreAllSqrt);
	// 'mozScoreAllLinear'
	features.push_back(mozScoreAllLinear);

	return features;
	}

	} // namespace dom_distiller