content/common/cross_site_document_classifier.cc - chromium/src - Git at Google

 // Copyright 2015 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "content/common/cross_site_document_classifier.h"

 #include "base/command_line.h"
 #include "base/lazy_instance.h"
 #include "base/logging.h"
 #include "base/macros.h"
 #include "base/metrics/histogram_macros.h"
 #include "base/strings/string_util.h"
 #include "content/public/common/content_switches.h"
 #include "content/public/common/resource_response_info.h"
 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
 #include "net/http/http_response_headers.h"

 using base::StringPiece;

 namespace content {

 namespace {

 // MIME types
 const char kTextHtml[] = "text/html";
 const char kTextXml[] = "text/xml";
 const char kAppRssXml[] = "application/rss+xml";
 const char kAppXml[] = "application/xml";
 const char kAppJson[] = "application/json";
 const char kTextJson[] = "text/json";
 const char kTextXjson[] = "text/x-json";
 const char kTextPlain[] = "text/plain";

 bool MatchesSignature(StringPiece data,
                       const StringPiece signatures[],
                       size_t arr_size) {
   size_t offset = data.find_first_not_of(" \t\r\n");
   // There is no not-whitespace character in this document.
   if (offset == base::StringPiece::npos)
     return false;

   data.remove_prefix(offset);
   for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) {
     if (base::StartsWith(data, signatures[sig_index],
                          base::CompareCase::INSENSITIVE_ASCII))
       return true;
   }
   return false;
 }

 }  // namespace

 CrossSiteDocumentMimeType CrossSiteDocumentClassifier::GetCanonicalMimeType(
     const std::string& mime_type) {
   if (base::LowerCaseEqualsASCII(mime_type, kTextHtml)) {
     return CROSS_SITE_DOCUMENT_MIME_TYPE_HTML;
   }

   if (base::LowerCaseEqualsASCII(mime_type, kTextPlain)) {
     return CROSS_SITE_DOCUMENT_MIME_TYPE_PLAIN;
   }

   if (base::LowerCaseEqualsASCII(mime_type, kAppJson) ||
       base::LowerCaseEqualsASCII(mime_type, kTextJson) ||
       base::LowerCaseEqualsASCII(mime_type, kTextXjson)) {
     return CROSS_SITE_DOCUMENT_MIME_TYPE_JSON;
   }

   if (base::LowerCaseEqualsASCII(mime_type, kTextXml) ||
       base::LowerCaseEqualsASCII(mime_type, kAppRssXml) ||
       base::LowerCaseEqualsASCII(mime_type, kAppXml)) {
     return CROSS_SITE_DOCUMENT_MIME_TYPE_XML;
   }

   return CROSS_SITE_DOCUMENT_MIME_TYPE_OTHERS;
 }

 bool CrossSiteDocumentClassifier::IsBlockableScheme(const GURL& url) {
   // We exclude ftp:// from here. FTP doesn't provide a Content-Type
   // header which our policy depends on, so we cannot protect any
   // document from FTP servers.
   return url.SchemeIs(url::kHttpScheme) || url.SchemeIs(url::kHttpsScheme);
 }

 bool CrossSiteDocumentClassifier::IsSameSite(const GURL& frame_origin,
                                              const GURL& response_url) {
   if (!frame_origin.is_valid() || !response_url.is_valid())
     return false;

   if (frame_origin.scheme() != response_url.scheme())
     return false;

   // SameDomainOrHost() extracts the effective domains (public suffix plus one)
   // from the two URLs and compare them.
   return net::registry_controlled_domains::SameDomainOrHost(
       frame_origin, response_url,
       net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
 }

 // We don't use Webkit's existing CORS policy implementation since
 // their policy works in terms of origins, not sites. For example,
 // when frame is sub.a.com and it is not allowed to access a document
 // with sub1.a.com. But under Site Isolation, it's allowed.
 bool CrossSiteDocumentClassifier::IsValidCorsHeaderSet(
     const GURL& frame_origin,
     const GURL& website_origin,
     const std::string& access_control_origin) {
   // Many websites are sending back "\"*\"" instead of "*". This is
   // non-standard practice, and not supported by Chrome. Refer to
   // CrossOriginAccessControl::passesAccessControlCheck().

   // TODO(dsjang): * is not allowed for the response from a request
   // with cookies. This allows for more than what the renderer will
   // eventually be able to receive, so we won't see illegal cross-site
   // documents allowed by this. We have to find a way to see if this
   // response is from a cookie-tagged request or not in the future.
   if (access_control_origin == "*")
     return true;

   // TODO(dsjang): The CORS spec only treats a fully specified URL, except for
   // "*", but many websites are using just a domain for access_control_origin,
   // and this is blocked by Webkit's CORS logic here :
   // CrossOriginAccessControl::passesAccessControlCheck(). GURL is set
   // is_valid() to false when it is created from a URL containing * in the
   // domain part.

   GURL cors_origin(access_control_origin);
   return IsSameSite(frame_origin, cors_origin);
 }

 // This function is a slight modification of |net::SniffForHTML|.
 bool CrossSiteDocumentClassifier::SniffForHTML(StringPiece data) {
   // The content sniffer used by Chrome and Firefox are using "<!--"
   // as one of the HTML signatures, but it also appears in valid
   // JavaScript, considered as well-formed JS by the browser.  Since
   // we do not want to block any JS, we exclude it from our HTML
   // signatures. This can weaken our document block policy, but we can
   // break less websites.
   // TODO(dsjang): parameterize |net::SniffForHTML| with an option
 #include <stddef.h>

   // that decides whether to include <!-- or not, so that we can
   // remove this function.
   // TODO(dsjang): Once CrossSiteDocumentClassifier is moved into the browser
   // process, we should do single-thread checking here for the static
   // initializer.
   static const StringPiece kHtmlSignatures[] = {
       StringPiece("<!doctype html"),  // HTML5 spec
       StringPiece("<script"),         // HTML5 spec, Mozilla
       StringPiece("<html"),           // HTML5 spec, Mozilla
       StringPiece("<head"),           // HTML5 spec, Mozilla
       StringPiece("<iframe"),         // Mozilla
       StringPiece("<h1"),             // Mozilla
       StringPiece("<div"),            // Mozilla
       StringPiece("<font"),           // Mozilla
       StringPiece("<table"),          // Mozilla
       StringPiece("<a"),              // Mozilla
       StringPiece("<style"),          // Mozilla
       StringPiece("<title"),          // Mozilla
       StringPiece("<b"),              // Mozilla
       StringPiece("<body"),           // Mozilla
       StringPiece("<br"),             // Mozilla
       StringPiece("<p")               // Mozilla
   };

   while (data.length() > 0) {
     if (MatchesSignature(data, kHtmlSignatures, arraysize(kHtmlSignatures)))
       return true;

     // If we cannot find "<!--", we fail sniffing this as HTML.
     static const StringPiece kCommentBegins[] = {StringPiece("<!--")};
     if (!MatchesSignature(data, kCommentBegins, arraysize(kCommentBegins)))
       break;

     // Search for --> and do SniffForHTML after that. If we can find the
     // comment's end, we start HTML sniffing from there again.
     static const char kEndComment[] = "-->";
     size_t offset = data.find(kEndComment);
     if (offset == base::StringPiece::npos)
       break;

     // Proceed to the index next to the ending comment (-->).
     data.remove_prefix(offset + strlen(kEndComment));
   }

   return false;
 }

 bool CrossSiteDocumentClassifier::SniffForXML(base::StringPiece data) {
   // TODO(dsjang): Chrome's mime_sniffer is using strncasecmp() for
   // this signature. However, XML is case-sensitive. Don't we have to
   // be more lenient only to block documents starting with the exact
   // string <?xml rather than <?XML ?
   // TODO(dsjang): Once CrossSiteDocumentClassifier is moved into the browser
   // process, we should do single-thread checking here for the static
   // initializer.
   static const StringPiece kXmlSignatures[] = {StringPiece("<?xml")};
   return MatchesSignature(data, kXmlSignatures, arraysize(kXmlSignatures));
 }

 bool CrossSiteDocumentClassifier::SniffForJSON(base::StringPiece data) {
   // TODO(dsjang): We have to come up with a better way to sniff
   // JSON. However, even RE cannot help us that much due to the fact
   // that we don't do full parsing.  This DFA starts with state 0, and
   // finds {, "/' and : in that order. We're avoiding adding a
   // dependency on a regular expression library.
   enum {
     kStartState,
     kLeftBraceState,
     kLeftQuoteState,
     kColonState,
     kTerminalState,
   } state = kStartState;

   size_t length = data.length();
   for (size_t i = 0; i < length && state < kColonState; ++i) {
     const char c = data[i];
     if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
       continue;

     switch (state) {
       case kStartState:
         if (c == '{')
           state = kLeftBraceState;
         else
           state = kTerminalState;
         break;
       case kLeftBraceState:
         if (c == '\"' || c == '\'')
           state = kLeftQuoteState;
         else
           state = kTerminalState;
         break;
       case kLeftQuoteState:
         if (c == ':')
           state = kColonState;
         break;
       case kColonState:
       case kTerminalState:
         NOTREACHED();
         break;
     }
   }
   return state == kColonState;
 }

 }  // namespace content
	// Copyright 2015 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "content/common/cross_site_document_classifier.h"

	#include "base/command_line.h"
	#include "base/lazy_instance.h"
	#include "base/logging.h"
	#include "base/macros.h"
	#include "base/metrics/histogram_macros.h"
	#include "base/strings/string_util.h"
	#include "content/public/common/content_switches.h"
	#include "content/public/common/resource_response_info.h"
	#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
	#include "net/http/http_response_headers.h"

	using base::StringPiece;

	namespace content {

	namespace {

	// MIME types
	const char kTextHtml[] = "text/html";
	const char kTextXml[] = "text/xml";
	const char kAppRssXml[] = "application/rss+xml";
	const char kAppXml[] = "application/xml";
	const char kAppJson[] = "application/json";
	const char kTextJson[] = "text/json";
	const char kTextXjson[] = "text/x-json";
	const char kTextPlain[] = "text/plain";

	bool MatchesSignature(StringPiece data,
	const StringPiece signatures[],
	size_t arr_size) {
	size_t offset = data.find_first_not_of(" \t\r\n");
	// There is no not-whitespace character in this document.
	if (offset == base::StringPiece::npos)
	return false;

	data.remove_prefix(offset);
	for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) {
	if (base::StartsWith(data, signatures[sig_index],
	base::CompareCase::INSENSITIVE_ASCII))
	return true;
	}
	return false;
	}

	} // namespace

	CrossSiteDocumentMimeType CrossSiteDocumentClassifier::GetCanonicalMimeType(
	const std::string& mime_type) {
	if (base::LowerCaseEqualsASCII(mime_type, kTextHtml)) {
	return CROSS_SITE_DOCUMENT_MIME_TYPE_HTML;
	}

	if (base::LowerCaseEqualsASCII(mime_type, kTextPlain)) {
	return CROSS_SITE_DOCUMENT_MIME_TYPE_PLAIN;
	}

	if (base::LowerCaseEqualsASCII(mime_type, kAppJson) \|\|
	base::LowerCaseEqualsASCII(mime_type, kTextJson) \|\|
	base::LowerCaseEqualsASCII(mime_type, kTextXjson)) {
	return CROSS_SITE_DOCUMENT_MIME_TYPE_JSON;
	}

	if (base::LowerCaseEqualsASCII(mime_type, kTextXml) \|\|
	base::LowerCaseEqualsASCII(mime_type, kAppRssXml) \|\|
	base::LowerCaseEqualsASCII(mime_type, kAppXml)) {
	return CROSS_SITE_DOCUMENT_MIME_TYPE_XML;
	}

	return CROSS_SITE_DOCUMENT_MIME_TYPE_OTHERS;
	}

	bool CrossSiteDocumentClassifier::IsBlockableScheme(const GURL& url) {
	// We exclude ftp:// from here. FTP doesn't provide a Content-Type
	// header which our policy depends on, so we cannot protect any
	// document from FTP servers.
	return url.SchemeIs(url::kHttpScheme) \|\| url.SchemeIs(url::kHttpsScheme);
	}

	bool CrossSiteDocumentClassifier::IsSameSite(const GURL& frame_origin,
	const GURL& response_url) {
	if (!frame_origin.is_valid() \|\| !response_url.is_valid())
	return false;

	if (frame_origin.scheme() != response_url.scheme())
	return false;

	// SameDomainOrHost() extracts the effective domains (public suffix plus one)
	// from the two URLs and compare them.
	return net::registry_controlled_domains::SameDomainOrHost(
	frame_origin, response_url,
	net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
	}

	// We don't use Webkit's existing CORS policy implementation since
	// their policy works in terms of origins, not sites. For example,
	// when frame is sub.a.com and it is not allowed to access a document
	// with sub1.a.com. But under Site Isolation, it's allowed.
	bool CrossSiteDocumentClassifier::IsValidCorsHeaderSet(
	const GURL& frame_origin,
	const GURL& website_origin,
	const std::string& access_control_origin) {
	// Many websites are sending back "\"\"" instead of "". This is
	// non-standard practice, and not supported by Chrome. Refer to
	// CrossOriginAccessControl::passesAccessControlCheck().

	// TODO(dsjang): * is not allowed for the response from a request
	// with cookies. This allows for more than what the renderer will
	// eventually be able to receive, so we won't see illegal cross-site
	// documents allowed by this. We have to find a way to see if this
	// response is from a cookie-tagged request or not in the future.
	if (access_control_origin == "*")
	return true;

	// TODO(dsjang): The CORS spec only treats a fully specified URL, except for
	// "*", but many websites are using just a domain for access_control_origin,
	// and this is blocked by Webkit's CORS logic here :
	// CrossOriginAccessControl::passesAccessControlCheck(). GURL is set
	// is_valid() to false when it is created from a URL containing * in the
	// domain part.

	GURL cors_origin(access_control_origin);
	return IsSameSite(frame_origin, cors_origin);
	}

	// This function is a slight modification of \|net::SniffForHTML\|.
	bool CrossSiteDocumentClassifier::SniffForHTML(StringPiece data) {
	// The content sniffer used by Chrome and Firefox are using "<!--"
	// as one of the HTML signatures, but it also appears in valid
	// JavaScript, considered as well-formed JS by the browser. Since
	// we do not want to block any JS, we exclude it from our HTML
	// signatures. This can weaken our document block policy, but we can
	// break less websites.
	// TODO(dsjang): parameterize \|net::SniffForHTML\| with an option
	#include <stddef.h>

	// that decides whether to include <!-- or not, so that we can
	// remove this function.
	// TODO(dsjang): Once CrossSiteDocumentClassifier is moved into the browser
	// process, we should do single-thread checking here for the static
	// initializer.
	static const StringPiece kHtmlSignatures[] = {
	StringPiece("<!doctype html"), // HTML5 spec
	StringPiece("<script"), // HTML5 spec, Mozilla
	StringPiece("<html"), // HTML5 spec, Mozilla
	StringPiece("<head"), // HTML5 spec, Mozilla
	StringPiece("<iframe"), // Mozilla
	StringPiece("<h1"), // Mozilla
	StringPiece("<div"), // Mozilla
	StringPiece("<font"), // Mozilla
	StringPiece("<table"), // Mozilla
	StringPiece("<a"), // Mozilla
	StringPiece("<style"), // Mozilla
	StringPiece("<title"), // Mozilla
	StringPiece("<b"), // Mozilla
	StringPiece("<body"), // Mozilla
	StringPiece("<br"), // Mozilla
	StringPiece("<p") // Mozilla
	};

	while (data.length() > 0) {
	if (MatchesSignature(data, kHtmlSignatures, arraysize(kHtmlSignatures)))
	return true;

	// If we cannot find "<!--", we fail sniffing this as HTML.
	static const StringPiece kCommentBegins[] = {StringPiece("<!--")};
	if (!MatchesSignature(data, kCommentBegins, arraysize(kCommentBegins)))
	break;

	// Search for --> and do SniffForHTML after that. If we can find the
	// comment's end, we start HTML sniffing from there again.
	static const char kEndComment[] = "-->";
	size_t offset = data.find(kEndComment);
	if (offset == base::StringPiece::npos)
	break;

	// Proceed to the index next to the ending comment (-->).
	data.remove_prefix(offset + strlen(kEndComment));
	}

	return false;
	}

	bool CrossSiteDocumentClassifier::SniffForXML(base::StringPiece data) {
	// TODO(dsjang): Chrome's mime_sniffer is using strncasecmp() for
	// this signature. However, XML is case-sensitive. Don't we have to
	// be more lenient only to block documents starting with the exact
	// string <?xml rather than <?XML ?
	// TODO(dsjang): Once CrossSiteDocumentClassifier is moved into the browser
	// process, we should do single-thread checking here for the static
	// initializer.
	static const StringPiece kXmlSignatures[] = {StringPiece("<?xml")};
	return MatchesSignature(data, kXmlSignatures, arraysize(kXmlSignatures));
	}

	bool CrossSiteDocumentClassifier::SniffForJSON(base::StringPiece data) {
	// TODO(dsjang): We have to come up with a better way to sniff
	// JSON. However, even RE cannot help us that much due to the fact
	// that we don't do full parsing. This DFA starts with state 0, and
	// finds {, "/' and : in that order. We're avoiding adding a
	// dependency on a regular expression library.
	enum {
	kStartState,
	kLeftBraceState,
	kLeftQuoteState,
	kColonState,
	kTerminalState,
	} state = kStartState;

	size_t length = data.length();
	for (size_t i = 0; i < length && state < kColonState; ++i) {
	const char c = data[i];
	if (c == ' ' \|\| c == '\t' \|\| c == '\r' \|\| c == '\n')
	continue;

	switch (state) {
	case kStartState:
	if (c == '{')
	state = kLeftBraceState;
	else
	state = kTerminalState;
	break;
	case kLeftBraceState:
	if (c == '\"' \|\| c == '\'')
	state = kLeftQuoteState;
	else
	state = kTerminalState;
	break;
	case kLeftQuoteState:
	if (c == ':')
	state = kColonState;
	break;
	case kColonState:
	case kTerminalState:
	NOTREACHED();
	break;
	}
	}
	return state == kColonState;
	}

	} // namespace content