blob: 423161032d33d77805f2b4bca9be9d385156b50b [file] [log] [blame]
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "content/common/cross_site_document_classifier.h"
#include "base/basictypes.h"
#include "base/command_line.h"
#include "base/lazy_instance.h"
#include "base/logging.h"
#include "base/metrics/histogram.h"
#include "base/strings/string_util.h"
#include "content/public/common/content_switches.h"
#include "content/public/common/resource_response_info.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
#include "net/http/http_response_headers.h"
using base::StringPiece;
namespace content {
namespace {
// MIME types
const char kTextHtml[] = "text/html";
const char kTextXml[] = "text/xml";
const char kAppRssXml[] = "application/rss+xml";
const char kAppXml[] = "application/xml";
const char kAppJson[] = "application/json";
const char kTextJson[] = "text/json";
const char kTextXjson[] = "text/x-json";
const char kTextPlain[] = "text/plain";
bool MatchesSignature(StringPiece data,
const StringPiece signatures[],
size_t arr_size) {
size_t offset = data.find_first_not_of(" \t\r\n");
// There is no not-whitespace character in this document.
if (offset == base::StringPiece::npos)
return false;
for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) {
if (base::StartsWith(data, signatures[sig_index],
return true;
return false;
} // namespace
CrossSiteDocumentMimeType CrossSiteDocumentClassifier::GetCanonicalMimeType(
const std::string& mime_type) {
if (base::LowerCaseEqualsASCII(mime_type, kTextHtml)) {
if (base::LowerCaseEqualsASCII(mime_type, kTextPlain)) {
if (base::LowerCaseEqualsASCII(mime_type, kAppJson) ||
base::LowerCaseEqualsASCII(mime_type, kTextJson) ||
base::LowerCaseEqualsASCII(mime_type, kTextXjson)) {
if (base::LowerCaseEqualsASCII(mime_type, kTextXml) ||
base::LowerCaseEqualsASCII(mime_type, kAppRssXml) ||
base::LowerCaseEqualsASCII(mime_type, kAppXml)) {
bool CrossSiteDocumentClassifier::IsBlockableScheme(const GURL& url) {
// We exclude ftp:// from here. FTP doesn't provide a Content-Type
// header which our policy depends on, so we cannot protect any
// document from FTP servers.
return url.SchemeIs(url::kHttpScheme) || url.SchemeIs(url::kHttpsScheme);
bool CrossSiteDocumentClassifier::IsSameSite(const GURL& frame_origin,
const GURL& response_url) {
if (!frame_origin.is_valid() || !response_url.is_valid())
return false;
if (frame_origin.scheme() != response_url.scheme())
return false;
// SameDomainOrHost() extracts the effective domains (public suffix plus one)
// from the two URLs and compare them.
return net::registry_controlled_domains::SameDomainOrHost(
frame_origin, response_url,
// We don't use Webkit's existing CORS policy implementation since
// their policy works in terms of origins, not sites. For example,
// when frame is and it is not allowed to access a document
// with But under Site Isolation, it's allowed.
bool CrossSiteDocumentClassifier::IsValidCorsHeaderSet(
const GURL& frame_origin,
const GURL& website_origin,
const std::string& access_control_origin) {
// Many websites are sending back "\"*\"" instead of "*". This is
// non-standard practice, and not supported by Chrome. Refer to
// CrossOriginAccessControl::passesAccessControlCheck().
// TODO(dsjang): * is not allowed for the response from a request
// with cookies. This allows for more than what the renderer will
// eventually be able to receive, so we won't see illegal cross-site
// documents allowed by this. We have to find a way to see if this
// response is from a cookie-tagged request or not in the future.
if (access_control_origin == "*")
return true;
// TODO(dsjang): The CORS spec only treats a fully specified URL, except for
// "*", but many websites are using just a domain for access_control_origin,
// and this is blocked by Webkit's CORS logic here :
// CrossOriginAccessControl::passesAccessControlCheck(). GURL is set
// is_valid() to false when it is created from a URL containing * in the
// domain part.
GURL cors_origin(access_control_origin);
return IsSameSite(frame_origin, cors_origin);
// This function is a slight modification of |net::SniffForHTML|.
bool CrossSiteDocumentClassifier::SniffForHTML(StringPiece data) {
// The content sniffer used by Chrome and Firefox are using "<!--"
// as one of the HTML signatures, but it also appears in valid
// JavaScript, considered as well-formed JS by the browser. Since
// we do not want to block any JS, we exclude it from our HTML
// signatures. This can weaken our document block policy, but we can
// break less websites.
// TODO(dsjang): parameterize |net::SniffForHTML| with an option
// that decides whether to include <!-- or not, so that we can
// remove this function.
// TODO(dsjang): Once CrossSiteDocumentClassifier is moved into the browser
// process, we should do single-thread checking here for the static
// initializer.
static const StringPiece kHtmlSignatures[] = {
StringPiece("<!doctype html"), // HTML5 spec
StringPiece("<script"), // HTML5 spec, Mozilla
StringPiece("<html"), // HTML5 spec, Mozilla
StringPiece("<head"), // HTML5 spec, Mozilla
StringPiece("<iframe"), // Mozilla
StringPiece("<h1"), // Mozilla
StringPiece("<div"), // Mozilla
StringPiece("<font"), // Mozilla
StringPiece("<table"), // Mozilla
StringPiece("<a"), // Mozilla
StringPiece("<style"), // Mozilla
StringPiece("<title"), // Mozilla
StringPiece("<b"), // Mozilla
StringPiece("<body"), // Mozilla
StringPiece("<br"), // Mozilla
StringPiece("<p") // Mozilla
while (data.length() > 0) {
if (MatchesSignature(data, kHtmlSignatures, arraysize(kHtmlSignatures)))
return true;
// If we cannot find "<!--", we fail sniffing this as HTML.
static const StringPiece kCommentBegins[] = {StringPiece("<!--")};
if (!MatchesSignature(data, kCommentBegins, arraysize(kCommentBegins)))
// Search for --> and do SniffForHTML after that. If we can find the
// comment's end, we start HTML sniffing from there again.
static const char kEndComment[] = "-->";
size_t offset = data.find(kEndComment);
if (offset == base::StringPiece::npos)
// Proceed to the index next to the ending comment (-->).
data.remove_prefix(offset + strlen(kEndComment));
return false;
bool CrossSiteDocumentClassifier::SniffForXML(base::StringPiece data) {
// TODO(dsjang): Chrome's mime_sniffer is using strncasecmp() for
// this signature. However, XML is case-sensitive. Don't we have to
// be more lenient only to block documents starting with the exact
// string <?xml rather than <?XML ?
// TODO(dsjang): Once CrossSiteDocumentClassifier is moved into the browser
// process, we should do single-thread checking here for the static
// initializer.
static const StringPiece kXmlSignatures[] = {StringPiece("<?xml")};
return MatchesSignature(data, kXmlSignatures, arraysize(kXmlSignatures));
bool CrossSiteDocumentClassifier::SniffForJSON(base::StringPiece data) {
// TODO(dsjang): We have to come up with a better way to sniff
// JSON. However, even RE cannot help us that much due to the fact
// that we don't do full parsing. This DFA starts with state 0, and
// finds {, "/' and : in that order. We're avoiding adding a
// dependency on a regular expression library.
enum {
} state = kStartState;
size_t length = data.length();
for (size_t i = 0; i < length && state < kColonState; ++i) {
const char c = data[i];
if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
switch (state) {
case kStartState:
if (c == '{')
state = kLeftBraceState;
state = kTerminalState;
case kLeftBraceState:
if (c == '\"' || c == '\'')
state = kLeftQuoteState;
state = kTerminalState;
case kLeftQuoteState:
if (c == ':')
state = kColonState;
case kColonState:
case kTerminalState:
return state == kColonState;
} // namespace content