blob: 4a652e9aef55b76db1db698800678951434cd8c9 [file] [log] [blame]
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "core/dom/DocumentStatisticsCollector.h"

#include <cmath>
#include <memory>

#include "core/dom/Document.h"
#include "core/frame/LocalFrameView.h"
#include "core/html/HTMLHeadElement.h"
#include "core/html/HTMLLinkElement.h"
#include "core/testing/PageTestBase.h"
#include "platform/wtf/text/StringBuilder.h"
#include "public/platform/WebDistillability.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace blink {
// Paragraph length, in characters, at which the tests below expect scoring to
// saturate: a longer paragraph is expected to score as if it were exactly this
// long.
// NOTE(review): presumably mirrors a constant in the collector implementation;
// confirm the two are kept in sync.
constexpr unsigned kTextContentLengthSaturation = 1000u;
// Length threshold for P elements, roughly two English sentences. The tests
// below expect short paragraphs to add nothing to moz_score, and longer ones to
// be scored on the length in excess of this threshold.
constexpr unsigned kParagraphLengthThreshold = 140u;
class DocumentStatisticsCollectorTest : public PageTestBase {
protected:
void TearDown() override { ThreadState::Current()->CollectAllGarbage(); }
void SetHtmlInnerHTML(const String&);
};
void DocumentStatisticsCollectorTest::SetHtmlInnerHTML(
const String& html_content) {
GetDocument().documentElement()->SetInnerHTMLFromString((html_content));
}
// Verifies that a page declaring the Open Graph type "article" is reported as
// an Open Graph article.
TEST_F(DocumentStatisticsCollectorTest, HasOpenGraphArticle) {
  // The content value is deliberately mixed-case: the word "article" is
  // expected to be matched case-insensitively.
  const char kArticleMarkup[] =
      "<head>"
      " <meta property='og:type' content='arTiclE' />"
      "</head>";
  SetHtmlInnerHTML(kArticleMarkup);

  const WebDistillabilityFeatures stats =
      DocumentStatisticsCollector::CollectStatistics(GetDocument());

  EXPECT_TRUE(stats.open_graph);
}
// Verifies that an Open Graph type other than "article" does not set the
// open_graph feature.
TEST_F(DocumentStatisticsCollectorTest, NoOpenGraphArticle) {
  // The raw string is kept flush-left so its contents stay exactly as written.
  const char kMovieMarkup[] = R"HTML(
<head>
<meta property='og:type' content='movie' />
</head>
)HTML";
  SetHtmlInnerHTML(kMovieMarkup);

  const WebDistillabilityFeatures stats =
      DocumentStatisticsCollector::CollectStatistics(GetDocument());

  EXPECT_FALSE(stats.open_graph);
}
// Verifies the per-element-type counters reported by the collector.
TEST_F(DocumentStatisticsCollectorTest, CountElements) {
  // The raw string is kept flush-left so its contents stay exactly as written.
  const char kMarkup[] = R"HTML(
<form>
<input type='text'>
<input type='password'>
</form>
<pre></pre>
<p><a> </a></p>
<ul><li><p><a> </a></p></li></ul>
)HTML";
  SetHtmlInnerHTML(kMarkup);

  const WebDistillabilityFeatures stats =
      DocumentStatisticsCollector::CollectStatistics(GetDocument());

  EXPECT_FALSE(stats.open_graph);

  // The markup holds ten elements: form, two inputs, pre, two p, two a, ul
  // and li.
  EXPECT_EQ(10u, stats.element_count);
  EXPECT_EQ(2u, stats.anchor_count);
  EXPECT_EQ(1u, stats.form_count);
  EXPECT_EQ(1u, stats.text_input_count);
  EXPECT_EQ(1u, stats.password_input_count);
  EXPECT_EQ(2u, stats.p_count);
  EXPECT_EQ(1u, stats.pre_count);
}
// Verifies the three paragraph-based scores (moz_score, moz_score_all_sqrt and
// moz_score_all_linear) on markup that mixes counted and skipped paragraphs.
TEST_F(DocumentStatisticsCollectorTest, CountScore) {
  SetHtmlInnerHTML(
      // textContentLength = 1. Counted even though the class is 'menu';
      // presumably the 'article' id takes precedence (confirm against the
      // collector).
      "<p class='menu' id='article'>1</p>"
      // textContentLength = 2, skipped because it is under an li.
      "<ul><li><p>12</p></li></ul>"
      // textContentLength = 3, skipped because it is an unlikely candidate.
      "<p class='menu'>123</p>"
      // textContentLength = 144, the only paragraph above
      // kParagraphLengthThreshold.
      "<p>"
      "12345678901234567890123456789012345678901234567890"
      "12345678901234567890123456789012345678901234567890"
      "12345678901234567890123456789012345678901234"
      "</p>"
      // textContentLength = 5, skipped because it is invisible.
      "<p style='display:none'>12345</p>"
      // textContentLength = 6, skipped because its parent is invisible.
      "<div style='display:none'><p>123456</p></div>"
      // textContentLength = 7, skipped because its parent is invisible.
      "<div style='visibility:hidden'><p>1234567</p></div>"
      // textContentLength = 8, skipped because it is invisible.
      "<p style='opacity:0'>12345678</p>"
      // textContentLength = 9; text inside the child a and b elements counts.
      "<p><a href='#'>1234 </a>6 <b> 9</b></p>"
      // textContentLength = 12; the p is a sibling of the li, not under it.
      "<ul><li></li><p>123456789012</p></ul>");

  const WebDistillabilityFeatures stats =
      DocumentStatisticsCollector::CollectStatistics(GetDocument());

  // Expected value first, matching the EXPECT_EQ calls in CountElements.
  // Only the 144-character paragraph contributes to moz_score.
  EXPECT_DOUBLE_EQ(std::sqrt(144 - kParagraphLengthThreshold),
                   stats.moz_score);
  // The "all" scores add up every counted paragraph: lengths 1, 144, 9, 12.
  EXPECT_DOUBLE_EQ(1 + std::sqrt(144) + std::sqrt(9) + std::sqrt(12),
                   stats.moz_score_all_sqrt);
  EXPECT_DOUBLE_EQ(1 + 144 + 9 + 12, stats.moz_score_all_linear);
}
// Verifies that the scores saturate: each paragraph is far longer than
// kTextContentLengthSaturation, and ten paragraphs are supplied, yet the
// expected totals equal those of six paragraphs at the saturation length.
TEST_F(DocumentStatisticsCollectorTest, CountScoreSaturation) {
  constexpr int kParagraphCount = 10;
  // Each chunk is 10 characters, so every paragraph holds 10,000 characters.
  constexpr int kChunksPerParagraph = 1000;
  // Number of saturated paragraphs the expected totals correspond to. The cap
  // itself is applied by the collector, which is not shown in this file.
  constexpr int kSaturatedParagraphCount = 6;
  constexpr double kTolerance = 1e-5;

  StringBuilder html;
  for (int paragraph = 0; paragraph < kParagraphCount; ++paragraph) {
    html.Append("<p>");
    for (int chunk = 0; chunk < kChunksPerParagraph; ++chunk) {
      html.Append("0123456789");
    }
    html.Append("</p>");
  }
  SetHtmlInnerHTML(html.ToString());

  const WebDistillabilityFeatures stats =
      DocumentStatisticsCollector::CollectStatistics(GetDocument());

  // Expected value first, matching the EXPECT_EQ calls in CountElements.
  EXPECT_NEAR(
      kSaturatedParagraphCount *
          std::sqrt(kTextContentLengthSaturation - kParagraphLengthThreshold),
      stats.moz_score, kTolerance);
  EXPECT_NEAR(kSaturatedParagraphCount * std::sqrt(kTextContentLengthSaturation),
              stats.moz_score_all_sqrt, kTolerance);
  EXPECT_NEAR(kSaturatedParagraphCount * kTextContentLengthSaturation,
              stats.moz_score_all_linear, kTolerance);
}
} // namespace blink