blob: ec5e253dd09f3e20493de2728016fe7477770cac [file] [log] [blame]
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/renderer/safe_browsing/phishing_term_feature_extractor.h"
#include <string>
#include "base/bind.h"
#include "base/callback.h"
#include "base/containers/hash_tables.h"
#include "base/memory/scoped_ptr.h"
#include "base/message_loop/message_loop.h"
#include "base/strings/string16.h"
#include "base/strings/stringprintf.h"
#include "base/strings/utf_string_conversions.h"
#include "base/time/time.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "chrome/renderer/safe_browsing/mock_feature_extractor_clock.h"
#include "chrome/renderer/safe_browsing/murmurhash3_util.h"
#include "chrome/renderer/safe_browsing/test_utils.h"
#include "crypto/sha2.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
using ::testing::Return;
namespace safe_browsing {
class PhishingTermFeatureExtractorTest : public ::testing::Test {
protected:
virtual void SetUp() {
base::hash_set<std::string> terms;
terms.insert("one");
terms.insert("one one");
terms.insert("two");
terms.insert("multi word test");
terms.insert("capitalization");
terms.insert("space");
terms.insert("separator");
terms.insert("punctuation");
// Chinese (translation of "hello")
terms.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
// Chinese (translation of "goodbye")
terms.insert("\xe5\x86\x8d\xe8\xa7\x81");
for (base::hash_set<std::string>::iterator it = terms.begin();
it != terms.end(); ++it) {
term_hashes_.insert(crypto::SHA256HashString(*it));
}
base::hash_set<std::string> words;
words.insert("one");
words.insert("two");
words.insert("multi");
words.insert("word");
words.insert("test");
words.insert("capitalization");
words.insert("space");
words.insert("separator");
words.insert("punctuation");
words.insert("\xe4\xbd\xa0\xe5\xa5\xbd");
words.insert("\xe5\x86\x8d\xe8\xa7\x81");
static const uint32 kMurmurHash3Seed = 2777808611U;
for (base::hash_set<std::string>::iterator it = words.begin();
it != words.end(); ++it) {
word_hashes_.insert(MurmurHash3String(*it, kMurmurHash3Seed));
}
extractor_.reset(new PhishingTermFeatureExtractor(
&term_hashes_,
&word_hashes_,
3 /* max_words_per_term */,
kMurmurHash3Seed,
&clock_));
}
// Runs the TermFeatureExtractor on |page_text|, waiting for the
// completion callback. Returns the success boolean from the callback.
bool ExtractFeatures(const string16* page_text, FeatureMap* features) {
success_ = false;
extractor_->ExtractFeatures(
page_text,
features,
base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
base::Unretained(this)));
msg_loop_.Run();
return success_;
}
void PartialExtractFeatures(const string16* page_text, FeatureMap* features) {
extractor_->ExtractFeatures(
page_text,
features,
base::Bind(&PhishingTermFeatureExtractorTest::ExtractionDone,
base::Unretained(this)));
msg_loop_.PostTask(
FROM_HERE,
base::Bind(&PhishingTermFeatureExtractorTest::QuitExtraction,
base::Unretained(this)));
msg_loop_.RunUntilIdle();
}
// Completion callback for feature extraction.
void ExtractionDone(bool success) {
success_ = success;
msg_loop_.Quit();
}
void QuitExtraction() {
extractor_->CancelPendingExtraction();
msg_loop_.Quit();
}
base::MessageLoop msg_loop_;
MockFeatureExtractorClock clock_;
scoped_ptr<PhishingTermFeatureExtractor> extractor_;
base::hash_set<std::string> term_hashes_;
base::hash_set<uint32> word_hashes_;
bool success_; // holds the success value from ExtractFeatures
};
TEST_F(PhishingTermFeatureExtractorTest, ExtractFeatures) {
// This test doesn't exercise the extraction timing.
EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
string16 page_text = ASCIIToUTF16("blah");
FeatureMap expected_features; // initially empty
FeatureMap features;
ASSERT_TRUE(ExtractFeatures(&page_text, &features));
ExpectFeatureMapsAreEqual(features, expected_features);
page_text = ASCIIToUTF16("one one");
expected_features.Clear();
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("one"));
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("one one"));
features.Clear();
ASSERT_TRUE(ExtractFeatures(&page_text, &features));
ExpectFeatureMapsAreEqual(features, expected_features);
page_text = ASCIIToUTF16("bla bla multi word test bla");
expected_features.Clear();
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("multi word test"));
features.Clear();
ASSERT_TRUE(ExtractFeatures(&page_text, &features));
ExpectFeatureMapsAreEqual(features, expected_features);
// This text has all of the words for one of the terms, but they are
// not in the correct order.
page_text = ASCIIToUTF16("bla bla test word multi bla");
expected_features.Clear();
features.Clear();
ASSERT_TRUE(ExtractFeatures(&page_text, &features));
ExpectFeatureMapsAreEqual(features, expected_features);
page_text = ASCIIToUTF16("Capitalization plus non-space\n"
"separator... punctuation!");
expected_features.Clear();
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("capitalization"));
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("space"));
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("separator"));
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("punctuation"));
features.Clear();
ASSERT_TRUE(ExtractFeatures(&page_text, &features));
ExpectFeatureMapsAreEqual(features, expected_features);
// Test with empty page text.
page_text = string16();
expected_features.Clear();
features.Clear();
ASSERT_TRUE(ExtractFeatures(&page_text, &features));
ExpectFeatureMapsAreEqual(features, expected_features);
// Chinese translation of the phrase "hello goodbye". This tests that
// we can correctly separate terms in languages that don't use spaces.
page_text = UTF8ToUTF16("\xe4\xbd\xa0\xe5\xa5\xbd\xe5\x86\x8d\xe8\xa7\x81");
expected_features.Clear();
expected_features.AddBooleanFeature(
features::kPageTerm + std::string("\xe4\xbd\xa0\xe5\xa5\xbd"));
expected_features.AddBooleanFeature(
features::kPageTerm + std::string("\xe5\x86\x8d\xe8\xa7\x81"));
features.Clear();
ASSERT_TRUE(ExtractFeatures(&page_text, &features));
ExpectFeatureMapsAreEqual(features, expected_features);
}
TEST_F(PhishingTermFeatureExtractorTest, Continuation) {
// For this test, we'll cause the feature extraction to run multiple
// iterations by incrementing the clock.
// This page has a total of 30 words. For the features to be computed
// correctly, the extractor has to process the entire string of text.
string16 page_text(ASCIIToUTF16("one "));
for (int i = 0; i < 28; ++i) {
page_text.append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
}
page_text.append(ASCIIToUTF16("two"));
// Advance the clock 3 ms every 5 words processed, 10 ms between chunks.
// Note that this assumes kClockCheckGranularity = 5 and
// kMaxTimePerChunkMs = 10.
base::TimeTicks now = base::TimeTicks::Now();
EXPECT_CALL(clock_, Now())
// Time check at the start of extraction.
.WillOnce(Return(now))
// Time check at the start of the first chunk of work.
.WillOnce(Return(now))
// Time check after the first 5 words.
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(3)))
// Time check after the next 5 words.
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(6)))
// Time check after the next 5 words.
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(9)))
// Time check after the next 5 words. This is over the chunk
// time limit, so a continuation task will be posted.
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(12)))
// Time check at the start of the second chunk of work.
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(22)))
// Time check after the next 5 words.
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(25)))
// Time check after the next 5 words.
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(28)))
// A final check for the histograms.
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(30)));
FeatureMap expected_features;
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("one"));
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("two"));
FeatureMap features;
ASSERT_TRUE(ExtractFeatures(&page_text, &features));
ExpectFeatureMapsAreEqual(features, expected_features);
// Make sure none of the mock expectations carry over to the next test.
::testing::Mock::VerifyAndClearExpectations(&clock_);
// Now repeat the test with the same text, but advance the clock faster so
// that the extraction time exceeds the maximum total time for the feature
// extractor. Extraction should fail. Note that this assumes
// kMaxTotalTimeMs = 500.
EXPECT_CALL(clock_, Now())
// Time check at the start of extraction.
.WillOnce(Return(now))
// Time check at the start of the first chunk of work.
.WillOnce(Return(now))
// Time check after the first 5 words,
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(300)))
// Time check at the start of the second chunk of work.
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(350)))
// Time check after the next 5 words. This is over the limit.
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(600)))
// A final time check for the histograms.
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(620)));
features.Clear();
EXPECT_FALSE(ExtractFeatures(&page_text, &features));
}
TEST_F(PhishingTermFeatureExtractorTest, PartialExtractionTest) {
scoped_ptr<string16> page_text(new string16(ASCIIToUTF16("one ")));
for (int i = 0; i < 28; ++i) {
page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
}
base::TimeTicks now = base::TimeTicks::Now();
EXPECT_CALL(clock_, Now())
// Time check at the start of extraction.
.WillOnce(Return(now))
// Time check at the start of the first chunk of work.
.WillOnce(Return(now))
// Time check after the first 5 words.
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(7)))
// Time check after the next 5 words. This should be greater than
// kMaxTimePerChunkMs so that we stop and schedule extraction for later.
.WillOnce(Return(now + base::TimeDelta::FromMilliseconds(14)));
FeatureMap features;
// Extract first 10 words then stop.
PartialExtractFeatures(page_text.get(), &features);
page_text.reset(new string16());
for (int i = 30; i < 58; ++i) {
page_text->append(ASCIIToUTF16(base::StringPrintf("%d ", i)));
}
page_text->append(ASCIIToUTF16("multi word test "));
features.Clear();
// This part doesn't exercise the extraction timing.
EXPECT_CALL(clock_, Now()).WillRepeatedly(Return(base::TimeTicks::Now()));
// Now extract normally and make sure nothing breaks.
EXPECT_TRUE(ExtractFeatures(page_text.get(), &features));
FeatureMap expected_features;
expected_features.AddBooleanFeature(features::kPageTerm +
std::string("multi word test"));
ExpectFeatureMapsAreEqual(features, expected_features);
}
} // namespace safe_browsing