blob: a222bd3f2d6bd395640fbb018a01d5b7cba39ddf [file] [log] [blame]
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/dom_distiller/core/page_features.h"
#include <stddef.h>
#include <memory>
#include <string>
#include "base/json/json_reader.h"
#include "third_party/re2/src/re2/re2.h"
#include "url/gurl.h"
namespace dom_distiller {
/* This code needs to derive features in the same way and order in which they
* are derived when training the model. Parts of that code are reproduced in the
* comments below.
*/
namespace {
// Returns the last segment of |path|, keeping a trailing '/' attached to it.
// Mirrors the training pipeline's
//   return re.search('[^/]*\/?$', path).group(0)
// Examples: "/a/b" -> "b", "/a/b/" -> "b/", "/" -> "/", "" -> "",
// "abc" -> "abc".
std::string GetLastSegment(const std::string& path) {
  // An empty or single-character path is its own last segment. Handling it
  // here also keeps |path.size() - 2| below from wrapping around.
  if (path.size() < 2)
    return path;
  // Start the backwards search one character before the end so that a
  // trailing '/' is kept as part of the segment rather than treated as the
  // separator in front of it.
  const size_t separator = path.rfind('/', path.size() - 2);
  // When no earlier '/' exists the reference regex matches the entire path,
  // so return the path itself instead of an empty string.
  return separator == std::string::npos ? path : path.substr(separator + 1);
}
int CountMatches(const std::string& s, const std::string& p) {
// return len(re.findall(p, s))
re2::StringPiece sp(s);
re2::RE2 regexp(p);
int count = 0;
while (re2::RE2::FindAndConsume(&sp, regexp))
count++;
return count;
}
// Returns the number of words in |s|, where a word is one match of the
// regular expression \w+.
int GetWordCount(const std::string& s) {
  static const char kWordPattern[] = "\\w+";
  return CountMatches(s, kWordPattern);
}
// Returns true when |n| (the needle) occurs anywhere inside |h| (the
// haystack); the counterpart of Python's `n in h`.
bool Contains(const std::string& n, const std::string& h) {
  const std::string::size_type position = h.find(n);
  return position != std::string::npos;
}
// Returns true when |s| finishes with the suffix |t|; the counterpart of
// Python's `s.endswith(t)`.
bool EndsWith(const std::string& t, const std::string& s) {
  // A suffix longer than the string can never match.
  if (t.size() > s.size())
    return false;
  const std::string::size_type suffix_start = s.size() - t.size();
  return s.compare(suffix_start, t.size(), t) == 0;
}
} // namespace
// Number of entries in the vector built by the innerText / textContent /
// innerHTML overload of CalculateDerivedFeatures() (one per push_back there).
// NOTE(review): this is a mutable global despite the k prefix; presumably it
// matches a non-const declaration in the header -- confirm before changing.
int kDerivedFeaturesCount = 29;
// Builds the 29-entry feature vector from a page's URL, element counts and
// text dumps.
//
// The entries are appended in the same order in which they are derived when
// training the model, so entries must not be reordered, inserted or removed.
// The quoted `'name', expression,` comments reproduce the training code for
// each entry.
//
// |isOGArticle|  value of the 'opengraph' feature.
// |url|          only its path() is inspected.
// |numElements|, |numAnchors|, |numForms|
//                counts reported as 'elementcount', 'anchorcount' and
//                'formcount'.
// |innerText|, |textContent|, |innerHTML|
//                text whose byte lengths and \w+ word counts are reported.
std::vector<double> CalculateDerivedFeatures(bool isOGArticle,
                                             const GURL& url,
                                             double numElements,
                                             double numAnchors,
                                             double numForms,
                                             const std::string& innerText,
                                             const std::string& textContent,
                                             const std::string& innerHTML) {
  // In the training pipeline, the strings are explicitly encoded in utf-8 (as
  // they are here).
  const std::string& path = url.path();

  // float(numerator) / max(1, denominator). Clamping the denominator to at
  // least 1 avoids dividing by zero for empty inputs. Everything is done in
  // double so that all ratio features share one code path.
  const auto ratio = [](double numerator, double denominator) {
    return numerator / std::max(1.0, denominator);
  };

  const double innerTextLength = static_cast<double>(innerText.size());
  const double textContentLength = static_cast<double>(textContent.size());
  const double innerHTMLLength = static_cast<double>(innerHTML.size());

  const int innerTextWords = GetWordCount(innerText);
  const int textContentWords = GetWordCount(textContent);
  const int innerHTMLWords = GetWordCount(innerHTML);

  std::vector<double> features;

  // 'opengraph', opengraph,
  features.push_back(isOGArticle);
  // 'forum', 'forum' in path,
  features.push_back(Contains("forum", path));
  // 'index', 'index' in path,
  features.push_back(Contains("index", path));
  // 'view', 'view' in path,
  features.push_back(Contains("view", path));
  // 'asp', '.asp' in path,
  features.push_back(Contains(".asp", path));
  // 'phpbb', 'phpbb' in path,
  features.push_back(Contains("phpbb", path));
  // 'php', path.endswith('.php'),
  features.push_back(EndsWith(".php", path));
  // 'pathlength', len(path),
  features.push_back(static_cast<double>(path.size()));
  // 'domain', len(path) < 2,
  features.push_back(path.size() < 2);
  // 'pathcomponents', CountMatches(path, r'\/.'),
  features.push_back(CountMatches(path, "\\/."));
  // 'slugdetector', CountMatches(path, r'[^\w/]'),
  features.push_back(CountMatches(path, "[^\\w/]"));
  // 'pathnumbers', CountMatches(path, r'\d+'),
  features.push_back(CountMatches(path, "\\d+"));
  // 'lastSegmentLength', len(GetLastSegment(path)),
  features.push_back(static_cast<double>(GetLastSegment(path).size()));
  // 'formcount', numForms,
  features.push_back(numForms);
  // 'anchorcount', numAnchors,
  features.push_back(numAnchors);
  // 'elementcount', numElements,
  features.push_back(numElements);
  // 'anchorratio', float(numAnchors) / max(1, numElements),
  features.push_back(ratio(numAnchors, numElements));
  // 'innertextlength', len(innerText),
  features.push_back(innerTextLength);
  // 'textcontentlength', len(textContent),
  features.push_back(textContentLength);
  // 'innerhtmllength', len(innerHTML),
  features.push_back(innerHTMLLength);
  // 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)),
  features.push_back(ratio(innerTextLength, innerHTMLLength));
  // 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)),
  features.push_back(ratio(textContentLength, innerHTMLLength));
  // 'innertexttextcontentlengthratio',
  //     float(len(innerText)) / max(1, len(textContent)),
  features.push_back(ratio(innerTextLength, textContentLength));
  // 'innertextwordcount', innerTextWords,
  features.push_back(innerTextWords);
  // 'textcontentwordcount', textContentWords,
  features.push_back(textContentWords);
  // 'innerhtmlwordcount', innerHTMLWords,
  features.push_back(innerHTMLWords);
  // 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
  features.push_back(ratio(innerTextWords, innerHTMLWords));
  // 'textcontentwordcountratio',
  //     float(textContentWords) / max(1, innerHTMLWords),
  features.push_back(ratio(textContentWords, innerHTMLWords));
  // 'innertexttextcontentwordcountratio',
  //     float(innerTextWords) / max(1, textContentWords),
  features.push_back(ratio(innerTextWords, textContentWords));

  return features;
}
// Computes the derived features from |stringified_json|, a string value that
// holds a serialized JSON dictionary with the keys "opengraph", "url",
// "numElements", "numAnchors", "numForms", "innerText", "textContent" and
// "innerHTML".
//
// Returns an empty vector when the value is not a string, the string is not
// JSON with a dictionary at its root, any key is missing or has the wrong
// type, or the URL is invalid.
std::vector<double> CalculateDerivedFeaturesFromJSON(
    const base::Value* stringified_json) {
  // Step 1: the incoming value has to be a string.
  std::string serialized;
  if (!stringified_json->GetAsString(&serialized))
    return std::vector<double>();

  // Step 2: that string has to parse as JSON with a dictionary at the root.
  std::unique_ptr<base::Value> parsed = base::JSONReader::Read(serialized);
  const base::DictionaryValue* fields = nullptr;
  if (!parsed || !parsed->GetAsDictionary(&fields))
    return std::vector<double>();

  // Step 3: every expected key has to be present with the expected type.
  bool open_graph = false;
  std::string page_url;
  std::string inner_text;
  std::string text_content;
  std::string inner_html;
  double element_count = 0.0;
  double anchor_count = 0.0;
  double form_count = 0.0;
  const bool has_all_fields =
      fields->GetBoolean("opengraph", &open_graph) &&
      fields->GetString("url", &page_url) &&
      fields->GetDouble("numElements", &element_count) &&
      fields->GetDouble("numAnchors", &anchor_count) &&
      fields->GetDouble("numForms", &form_count) &&
      fields->GetString("innerText", &inner_text) &&
      fields->GetString("textContent", &text_content) &&
      fields->GetString("innerHTML", &inner_html);
  if (!has_all_fields)
    return std::vector<double>();

  // Step 4: the URL has to be valid before its path is analysed.
  GURL parsed_url(page_url);
  if (!parsed_url.is_valid())
    return std::vector<double>();

  return CalculateDerivedFeatures(open_graph, parsed_url, element_count,
                                  anchor_count, form_count, inner_text,
                                  text_content, inner_html);
}
std::vector<double> CalculateDerivedFeatures(
bool openGraph,
const GURL& url,
unsigned elementCount,
unsigned anchorCount,
unsigned formCount,
double mozScore,
double mozScoreAllSqrt,
double mozScoreAllLinear) {
const std::string& path = url.path();
std::vector<double> features;
// 'opengraph', opengraph,
features.push_back(openGraph);
// 'forum', 'forum' in path,
features.push_back(Contains("forum", path));
// 'index', 'index' in path,
features.push_back(Contains("index", path));
// 'search', 'search' in path,
features.push_back(Contains("search", path));
// 'view', 'view' in path,
features.push_back(Contains("view", path));
// 'archive', 'archive' in path,
features.push_back(Contains("archive", path));
// 'asp', '.asp' in path,
features.push_back(Contains(".asp", path));
// 'phpbb', 'phpbb' in path,
features.push_back(Contains("phpbb", path));
// 'php', path.endswith('.php'),
features.push_back(EndsWith(".php", path));
// 'pathLength', len(path),
features.push_back(path.size());
// 'domain', len(path) < 2,
features.push_back(path.size() < 2);
// 'pathComponents', CountMatches(path, r'\/.'),
features.push_back(CountMatches(path, "\\/."));
// 'slugDetector', CountMatches(path, r'[^\w/]'),
features.push_back(CountMatches(path, "[^\\w/]"));
// 'pathNumbers', CountMatches(path, r'\d+'),
features.push_back(CountMatches(path, "\\d+"));
// 'lastSegmentLength', len(GetLastSegment(path)),
features.push_back(GetLastSegment(path).size());
// 'formCount', numForms,
features.push_back(formCount);
// 'anchorCount', numAnchors,
features.push_back(anchorCount);
// 'elementCount', numElements,
features.push_back(elementCount);
// 'anchorRatio', float(numAnchors) / max(1, numElements),
features.push_back(
double(anchorCount) / std::max<double>(1, elementCount));
// 'mozScore'
features.push_back(mozScore);
// 'mozScoreAllSqrt'
features.push_back(mozScoreAllSqrt);
// 'mozScoreAllLinear'
features.push_back(mozScoreAllLinear);
return features;
}
} // namespace dom_distiller