// blob: 478c658fb253d0b00d23c39172678cd2c97955bc [file] [log] [blame]
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/dom_distiller/content/renderer/distillability_agent.h"

#include <cmath>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "base/json/json_writer.h"
#include "base/metrics/histogram_macros.h"
#include "base/stl_util.h"
#include "base/strings/string_util.h"
#include "components/dom_distiller/content/common/mojom/distillability_service.mojom.h"
#include "components/dom_distiller/core/distillable_page_detector.h"
#include "components/dom_distiller/core/experiments.h"
#include "components/dom_distiller/core/page_features.h"
#include "components/dom_distiller/core/url_utils.h"
#include "content/public/renderer/render_frame.h"
#include "mojo/public/cpp/bindings/remote.h"
#include "third_party/blink/public/common/browser_interface_broker_proxy.h"
#include "third_party/blink/public/platform/web_distillability.h"
#include "third_party/blink/public/web/web_document.h"
#include "third_party/blink/public/web/web_element.h"
#include "third_party/blink/public/web/web_local_frame.h"

namespace dom_distiller {
namespace {
// Hosts that are never reported as distillable; consulted by IsFiltered().
constexpr const char* kFilterlist[] = {
    "www.reddit.com",
    "tools.usps.com",
    "old.reddit.com",
};
// Samples recorded to the "DomDistiller.DistillabilityRejection" histogram.
// The numeric values are what gets logged, so they are spelled out explicitly;
// REJECTION_BUCKET_BOUNDARY is the exclusive upper bound passed to the
// histogram macro and is not itself a sample.
enum RejectionBuckets {
  NOT_ARTICLE = 0,
  MOBILE_FRIENDLY = 1,
  FILTERED = 2,
  TOO_SHORT = 3,
  NOT_REJECTED = 4,
  REJECTION_BUCKET_BOUNDARY = 5
};
// Returns whether it is necessary to send updates back to the browser.
// The number of updates can be from 0 to 2. See the tests in
// "distillable_page_utils_browsertest.cc".
// Most heuristics types only require one update after parsing.
// Adaboost-based heuristics are the only ones doing the second update,
// which is after loading.
bool NeedToUpdate(bool is_loaded) {
switch (GetDistillerHeuristicsType()) {
case DistillerHeuristicsType::ALWAYS_TRUE:
return !is_loaded;
case DistillerHeuristicsType::OG_ARTICLE:
return !is_loaded;
case DistillerHeuristicsType::ADABOOST_MODEL:
case DistillerHeuristicsType::ALL_ARTICLES:
return true;
case DistillerHeuristicsType::NONE:
default:
return false;
}
}
// Reports whether the update for the current milestone is the final one for
// the page. The two-update heuristics (ADABOOST_MODEL, ALL_ARTICLES) finish
// only once the page has loaded; for every other heuristics type any update
// is the last.
bool IsLast(bool is_loaded) {
  switch (GetDistillerHeuristicsType()) {
    case DistillerHeuristicsType::ADABOOST_MODEL:
    case DistillerHeuristicsType::ALL_ARTICLES:
      return is_loaded;
    default:
      return true;
  }
}
// Returns true when the host of |url| matches one of the entries in
// kFilterlist, as judged by base::LowerCaseEqualsASCII().
bool IsFiltered(const GURL& url) {
  // The host does not change between iterations, so fetch it once instead of
  // re-evaluating url.host() for every filter entry.
  const auto host = url.host();
  for (const char* filter : kFilterlist) {
    if (base::LowerCaseEqualsASCII(host, filter))
      return true;
  }
  return false;
}
// Logs the classifier inputs and outputs to the console of |render_frame|.
//
// The message is "adaboost_classification = " followed by a pretty-printed
// JSON dictionary with these keys:
//   "features"         - the raw values read from |features|
//   "derived_features" - the entries of |derived|, in order
//   "score", "distillable", "long_score", "long_page", "filtered"
//                      - the corresponding arguments
// Boolean arguments are written with SetInteger(), matching the existing
// output format.
void DumpDistillability(content::RenderFrame* render_frame,
                        const blink::WebDistillabilityFeatures& features,
                        const std::vector<double>& derived,
                        double score,
                        bool distillable,
                        double long_score,
                        bool long_page,
                        bool filtered) {
  base::DictionaryValue dict;

  auto raw_features = std::make_unique<base::DictionaryValue>();
  raw_features->SetInteger("is_mobile_friendly", features.is_mobile_friendly);
  raw_features->SetInteger("open_graph", features.open_graph);
  raw_features->SetInteger("element_count", features.element_count);
  raw_features->SetInteger("anchor_count", features.anchor_count);
  raw_features->SetInteger("form_count", features.form_count);
  raw_features->SetInteger("text_input_count", features.text_input_count);
  raw_features->SetInteger("password_input_count",
                           features.password_input_count);
  raw_features->SetInteger("p_count", features.p_count);
  raw_features->SetInteger("pre_count", features.pre_count);
  raw_features->SetDouble("moz_score", features.moz_score);
  raw_features->SetDouble("moz_score_all_sqrt", features.moz_score_all_sqrt);
  raw_features->SetDouble("moz_score_all_linear",
                          features.moz_score_all_linear);
  dict.Set("features", std::move(raw_features));

  auto derived_features = std::make_unique<base::ListValue>();
  for (double value : derived)
    derived_features->AppendDouble(value);
  dict.Set("derived_features", std::move(derived_features));

  dict.SetDouble("score", score);
  dict.SetInteger("distillable", distillable);
  dict.SetDouble("long_score", long_score);
  dict.SetInteger("long_page", long_page);
  dict.SetInteger("filtered", filtered);

  std::string msg;
  base::JSONWriter::WriteWithOptions(
      dict, base::JSONWriter::OPTIONS_PRETTY_PRINT, &msg);
  msg = "adaboost_classification = " + msg;
  render_frame->AddMessageToConsole(blink::mojom::ConsoleMessageLevel::kVerbose,
                                    msg);
}
// Classifies |doc| with the two supplied models and records UMA metrics.
//
// |detector| decides whether the page is an article and |long_page| decides
// whether it is long enough; each score is measured relative to its model's
// threshold, so a positive value means "yes". The page is reported as
// distillable only when both scores are positive and its host is not in the
// filter list.
//
// |is_mobile_friendly| is an out-parameter filled from the document's
// distillability features. It is left untouched when the document URL is
// invalid, in which case the function returns false without recording any
// metrics. |is_last| selects the after-loading histogram instead of the
// after-parsing ones. When |dump_info| is set the inputs and scores are
// also logged to the console of |render_frame|.
bool IsDistillablePageAdaboost(blink::WebDocument& doc,
                               const DistillablePageDetector* detector,
                               const DistillablePageDetector* long_page,
                               bool is_last,
                               bool& is_mobile_friendly,
                               content::RenderFrame* render_frame,
                               bool dump_info) {
  const GURL page_url(doc.Url());
  if (!page_url.is_valid())
    return false;

  blink::WebDistillabilityFeatures features = doc.DistillabilityFeatures();
  is_mobile_friendly = features.is_mobile_friendly;

  const std::vector<double> derived = CalculateDerivedFeatures(
      features.open_graph, page_url, features.element_count,
      features.anchor_count, features.form_count, features.moz_score,
      features.moz_score_all_sqrt, features.moz_score_all_linear);

  // Scores are expressed as the margin above each model's threshold.
  const double score = detector->Score(derived) - detector->GetThreshold();
  const double long_score =
      long_page->Score(derived) - long_page->GetThreshold();
  const bool distillable = score > 0;
  const bool long_article = long_score > 0;
  const bool filtered = IsFiltered(page_url);

  if (dump_info) {
    DumpDistillability(render_frame, features, derived, score, distillable,
                       long_score, long_article, filtered);
  }

  // Score distributions are only recorded for non-mobile-friendly pages.
  // Each histogram name needs its own macro invocation, so the positive and
  // negative cases stay as separate call sites.
  if (!features.is_mobile_friendly) {
    const int score_percent = static_cast<int>(std::round(score * 100));
    if (distillable) {
      UMA_HISTOGRAM_COUNTS_1000("DomDistiller.DistillabilityScoreNMF.Positive",
                                score_percent);

      // The long-article model is trained with pages that are
      // non-mobile-friendly, and distillable (deemed by the first model), so
      // only record on that type of pages.
      const int long_score_percent =
          static_cast<int>(std::round(long_score * 100));
      if (long_article) {
        UMA_HISTOGRAM_COUNTS_1000("DomDistiller.LongArticleScoreNMF.Positive",
                                  long_score_percent);
      } else {
        UMA_HISTOGRAM_COUNTS_1000("DomDistiller.LongArticleScoreNMF.Negative",
                                  -long_score_percent);
      }
    } else {
      UMA_HISTOGRAM_COUNTS_1000("DomDistiller.DistillabilityScoreNMF.Negative",
                                -score_percent);
    }
  }

  // Two-bit bucket: bit 0 = mobile friendly, bit 1 = distillable.
  int bucket = static_cast<unsigned>(features.is_mobile_friendly) |
               (static_cast<unsigned>(distillable) << 1);
  if (is_last) {
    UMA_HISTOGRAM_ENUMERATION("DomDistiller.PageDistillableAfterLoading",
                              bucket, 4);
  } else {
    UMA_HISTOGRAM_ENUMERATION("DomDistiller.PageDistillableAfterParsing",
                              bucket, 4);

    // Record the first reason, in priority order, for which the page would
    // not be offered for distillation.
    RejectionBuckets rejection = NOT_REJECTED;
    if (!distillable) {
      rejection = NOT_ARTICLE;
    } else if (features.is_mobile_friendly) {
      rejection = MOBILE_FRIENDLY;
    } else if (filtered) {
      rejection = FILTERED;
    } else if (!long_article) {
      rejection = TOO_SHORT;
    }
    UMA_HISTOGRAM_ENUMERATION("DomDistiller.DistillabilityRejection",
                              rejection, REJECTION_BUCKET_BOUNDARY);
  }

  return !filtered && distillable && long_article;
}
// Evaluates |doc| with whichever heuristics type is currently configured:
//  - ALWAYS_TRUE: every page is distillable.
//  - OG_ARTICLE: distillable when the document's open_graph feature is set.
//  - ADABOOST_MODEL / ALL_ARTICLES: defers to IsDistillablePageAdaboost(),
//    the only path that writes |is_mobile_friendly| or uses |is_last|,
//    |render_frame| and |dump_info|.
//  - NONE or anything else: never distillable.
bool IsDistillablePage(blink::WebDocument& doc,
                       bool is_last,
                       bool& is_mobile_friendly,
                       content::RenderFrame* render_frame,
                       bool dump_info) {
  const DistillerHeuristicsType heuristics = GetDistillerHeuristicsType();

  if (heuristics == DistillerHeuristicsType::ALWAYS_TRUE)
    return true;

  if (heuristics == DistillerHeuristicsType::OG_ARTICLE)
    return doc.DistillabilityFeatures().open_graph;

  if (heuristics == DistillerHeuristicsType::ADABOOST_MODEL ||
      heuristics == DistillerHeuristicsType::ALL_ARTICLES) {
    const DistillablePageDetector* article_model =
        DistillablePageDetector::GetNewModel();
    const DistillablePageDetector* long_page_model =
        DistillablePageDetector::GetLongPageModel();
    return IsDistillablePageAdaboost(doc, article_model, long_page_model,
                                     is_last, is_mobile_friendly, render_frame,
                                     dump_info);
  }

  return false;
}
} // namespace
// Registers the agent as an observer of |render_frame| and stores
// |dump_info|, which is later forwarded to the distillability check to
// enable console dumps of the classifier data.
DistillabilityAgent::DistillabilityAgent(content::RenderFrame* render_frame,
                                         bool dump_info)
    : RenderFrameObserver(render_frame),
      dump_info_(dump_info) {
}
// Runs the distillability check for the main frame and reports the result to
// the browser through the DistillabilityService Mojo interface.
//
// Only the finished-parsing and finished-loading milestones are handled.
// Nothing is sent when the frame is not the main frame, the document or its
// body is missing, the URL is not distillable, or the configured heuristics
// do not call for an update at this milestone.
void DistillabilityAgent::DidMeaningfulLayout(
    blink::WebMeaningfulLayout layout_type) {
  const bool is_parsed =
      layout_type == blink::WebMeaningfulLayout::kFinishedParsing;
  const bool is_loaded =
      layout_type == blink::WebMeaningfulLayout::kFinishedLoading;
  if (!is_parsed && !is_loaded)
    return;

  content::RenderFrame* const frame = render_frame();
  DCHECK(frame);
  if (!frame->IsMainFrame())
    return;

  DCHECK(frame->GetWebFrame());
  blink::WebDocument doc = frame->GetWebFrame()->GetDocument();
  const bool has_body = !doc.IsNull() && !doc.Body().IsNull();
  if (!has_body || !url_utils::IsUrlDistillable(doc.Url()))
    return;

  if (!NeedToUpdate(is_loaded))
    return;
  const bool is_last = IsLast(is_loaded);

  // Connect to Mojo service on browser to notify page distillability.
  mojo::Remote<mojom::DistillabilityService> service;
  frame->GetBrowserInterfaceBroker()->GetInterface(
      service.BindNewPipeAndPassReceiver());
  if (!service.is_bound())
    return;

  // Stays false unless the heuristics in use fill it in.
  bool is_mobile_friendly = false;
  const bool is_distillable =
      IsDistillablePage(doc, is_last, is_mobile_friendly, frame, dump_info_);
  service->NotifyIsDistillable(is_distillable, is_last, is_mobile_friendly);
}
// Nothing to release explicitly; members clean up after themselves.
DistillabilityAgent::~DistillabilityAgent() = default;
// Destroys this agent. Because the object deletes itself here, it must have
// been allocated with new, and no member may be accessed once this returns.
// NOTE(review): presumably this is the RenderFrameObserver teardown hook,
// invoked when the observed frame goes away - confirm against the header.
void DistillabilityAgent::OnDestruct() {
delete this;
}
} // namespace dom_distiller