// blob: c87ab3daf88a827fffa3f921f0b6206499ba8f83 [file] [log] [blame]
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/renderer/safe_browsing/phishing_url_feature_extractor.h"
#include <string>
#include <vector>
#include "base/format_macros.h"
#include "base/strings/stringprintf.h"
#include "chrome/renderer/safe_browsing/features.h"
#include "chrome/renderer/safe_browsing/test_utils.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "url/gurl.h"
using ::testing::ElementsAre;
namespace safe_browsing {
class PhishingUrlFeatureExtractorTest : public ::testing::Test {
protected:
PhishingUrlFeatureExtractor extractor_;
void SplitStringIntoLongAlphanumTokens(const std::string& full,
std::vector<std::string>* tokens) {
PhishingUrlFeatureExtractor::SplitStringIntoLongAlphanumTokens(full,
tokens);
}
void FillFeatureMap(size_t count, FeatureMap* features) {
for (size_t i = 0; i < count; ++i) {
EXPECT_TRUE(
features->AddBooleanFeature(base::StringPrintf("Feature%" PRIuS, i)));
}
}
};
// Exercises ExtractFeatures() on a series of URLs. For each accepted URL the
// test checks the exact set of extracted features, and also checks that
// extraction reports failure when the destination FeatureMap has been
// pre-filled so that fewer free slots remain than the URL has features.
// Finally, it checks a list of URLs for which extraction must fail outright.
TEST_F(PhishingUrlFeatureExtractorTest, ExtractFeatures) {
  std::string url = "http://123.0.0.1/mydocuments/a.file.html";
  FeatureMap features;

  // If feature map is already full, features cannot be extracted.
  FillFeatureMap(FeatureMap::kMaxFeatureMapSize, &features);
  ASSERT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
  features.Clear();

  FeatureMap expected_features;
  expected_features.AddBooleanFeature(features::kUrlHostIsIpAddress);
  expected_features.AddBooleanFeature(features::kUrlPathToken +
                                      std::string("mydocuments"));
  expected_features.AddBooleanFeature(features::kUrlPathToken +
                                      std::string("file"));
  expected_features.AddBooleanFeature(features::kUrlPathToken +
                                      std::string("html"));
  ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
  ExpectFeatureMapsAreEqual(features, expected_features);

  // With only one free slot left, the four features of this URL do not fit,
  // so extraction must fail.
  features.Clear();
  FillFeatureMap(FeatureMap::kMaxFeatureMapSize - 1, &features);
  ASSERT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
  features.Clear();

  // The query string must not contribute any feature, and the repeated
  // "www" / "sports" components are each expected only once.
  url = "http://www.www.cnn.co.uk/sports/sports/index.html?shouldnotappear";
  expected_features.Clear();
  expected_features.AddBooleanFeature(features::kUrlTldToken +
                                      std::string("co.uk"));
  expected_features.AddBooleanFeature(features::kUrlDomainToken +
                                      std::string("cnn"));
  expected_features.AddBooleanFeature(features::kUrlOtherHostToken +
                                      std::string("www"));
  expected_features.AddBooleanFeature(features::kUrlNumOtherHostTokensGTOne);
  expected_features.AddBooleanFeature(features::kUrlPathToken +
                                      std::string("sports"));
  expected_features.AddBooleanFeature(features::kUrlPathToken +
                                      std::string("index"));
  expected_features.AddBooleanFeature(features::kUrlPathToken +
                                      std::string("html"));
  features.Clear();
  ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
  ExpectFeatureMapsAreEqual(features, expected_features);

  // Five free slots are not enough for the seven features of this URL.
  features.Clear();
  FillFeatureMap(FeatureMap::kMaxFeatureMapSize - 5, &features);
  ASSERT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));

  url = "http://justadomain.com/";
  expected_features.Clear();
  expected_features.AddBooleanFeature(features::kUrlTldToken +
                                      std::string("com"));
  expected_features.AddBooleanFeature(features::kUrlDomainToken +
                                      std::string("justadomain"));
  features.Clear();
  ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
  ExpectFeatureMapsAreEqual(features, expected_features);

  // One free slot is not enough for the two features of this URL.
  features.Clear();
  FillFeatureMap(FeatureMap::kMaxFeatureMapSize - 1, &features);
  ASSERT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));

  // The fragment ("#abc") must not contribute any feature.
  url = "http://witharef.com/#abc";
  expected_features.Clear();
  expected_features.AddBooleanFeature(features::kUrlTldToken +
                                      std::string("com"));
  expected_features.AddBooleanFeature(features::kUrlDomainToken +
                                      std::string("witharef"));
  features.Clear();
  ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
  ExpectFeatureMapsAreEqual(features, expected_features);

  // Leading, trailing and repeated dots in the host must be tolerated.
  url = "http://...www..lotsodots....com./";
  expected_features.Clear();
  expected_features.AddBooleanFeature(features::kUrlTldToken +
                                      std::string("com"));
  expected_features.AddBooleanFeature(features::kUrlDomainToken +
                                      std::string("lotsodots"));
  expected_features.AddBooleanFeature(features::kUrlOtherHostToken +
                                      std::string("www"));
  features.Clear();
  ASSERT_TRUE(extractor_.ExtractFeatures(GURL(url), &features));
  ExpectFeatureMapsAreEqual(features, expected_features);

  // Two free slots are not enough for the three features of this URL.
  features.Clear();
  FillFeatureMap(FeatureMap::kMaxFeatureMapSize - 2, &features);
  ASSERT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));

  // Start the rejected-URL checks from an empty map. Otherwise they would run
  // against the nearly full map left over from the check above, and a failure
  // caused by lack of space could be mistaken for rejection of the URL.
  features.Clear();

  url = "http://unrecognized.tld/";
  EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));

  url = "http://com/123";
  EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));

  url = "http://.co.uk/";
  EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));

  url = "file:///nohost.txt";
  EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));

  url = "not:valid:at:all";
  EXPECT_FALSE(extractor_.ExtractFeatures(GURL(url), &features));
}
// Checks the tokenizer on two inputs: one where several alphanumeric runs
// are long enough to be kept, and one where every run is only one or two
// characters long and is therefore expected to be dropped.
TEST_F(PhishingUrlFeatureExtractorTest, SplitStringIntoLongAlphanumTokens) {
  {
    // "is" and "a" are expected to be filtered out; the rest are returned in
    // their original order.
    const std::string input = "This.is/a_pretty\\unusual-!path,indeed";
    std::vector<std::string> tokens;
    SplitStringIntoLongAlphanumTokens(input, &tokens);
    EXPECT_THAT(tokens,
                ElementsAre("This", "pretty", "unusual", "path", "indeed"));
  }

  {
    // Every piece between separators is at most two characters long, so no
    // token is expected at all.
    const std::string input = "...i-am_re/al&ly\\b,r,o|k=e:n///up%20";
    std::vector<std::string> tokens;
    SplitStringIntoLongAlphanumTokens(input, &tokens);
    EXPECT_THAT(tokens, ElementsAre());
  }
}
} // namespace safe_browsing