// blob: 994ff032653e5d954e02dbb2228a7572adc2f15d [file] [log] [blame]
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/policy/core/browser/url_util.h"
#include <string>
#include "base/logging.h"
#include "base/macros.h"
#include "base/no_destructor.h"
#include "components/google/core/common/google_util.h"
#include "components/url_formatter/url_fixer.h"
#include "net/base/escape.h"
#include "net/base/url_util.h"
#include "third_party/re2/src/re2/re2.h"
#include "url/gurl.h"
namespace policy {
namespace url_util {
namespace {
// Google AMP Cache URLs: the domain they are served from, and a regex for
// their path. In the path regex the first capture group is the optional "s/"
// marker and the second is the embedded host and path.
// Format reference:
// https://developers.google.com/amp/cache/overview#amp-cache-url-format
constexpr char kGoogleAmpCacheHost[] = "cdn.ampproject.org";
constexpr char kGoogleAmpCachePathPattern[] = "/[a-z]/(s/)?(.*)";

// Google AMP Viewer URLs: regex for the path, with the same two capture
// groups as the AMP Cache path regex.
constexpr char kGoogleAmpViewerPathPattern[] = "/amp/(s/)?(.*)";

// Google web cache URLs: exact host, required path prefix, and a regex for
// the value of the "q" query parameter. The regex captures, in order, an
// optional 12-character fingerprint followed by ':', an optional "http://" or
// "https://" scheme, the embedded URL (no spaces or colons), and optional
// trailing text introduced by a space.
constexpr char kGoogleWebCacheHost[] = "webcache.googleusercontent.com";
constexpr char kGoogleWebCachePathPrefix[] = "/search";
constexpr char kGoogleWebCacheQueryPattern[] =
    "cache:(.{12}:)?(https?://)?([^ :]*)( [^:]*)?";

// Google Translate URLs: the host prefix used on Google domains, and the
// alternate host that is matched exactly.
constexpr char kGoogleTranslateSubdomain[] = "translate.";
constexpr char kAlternateGoogleTranslateHost[] =
    "translate.googleusercontent.com";
// Assembles "<scheme>://<host_and_path>" into a GURL, where the scheme is
// "https" when |is_https| is true and "http" otherwise.
GURL BuildURL(bool is_https, const std::string& host_and_path) {
  std::string spec(is_https ? url::kHttpsScheme : url::kHttpScheme);
  spec += "://";
  spec += host_and_path;
  return GURL(spec);
}
// Helper class that extracts the URL embedded in Google AMP Cache, AMP Viewer,
// web cache and translate URLs by testing against precompiled regexes. This is
// a singleton so the regexes are only compiled once.
class EmbeddedURLExtractor {
 public:
  // Returns the shared instance, which is held in a base::NoDestructor and
  // constructed on first use.
  static EmbeddedURLExtractor* GetInstance() {
    static base::NoDestructor<EmbeddedURLExtractor> instance;
    return instance.get();
  }

  // Implements url_util::GetEmbeddedURL(). Returns the URL embedded in |url|,
  // or an empty GURL if no embedded URL could be extracted. The checks below
  // run in order and the first format that matches determines the result,
  // even if the URL built from the match turns out not to be valid.
  GURL GetEmbeddedURL(const GURL& url) const {
    // Check for "*.cdn.ampproject.org" URLs.
    if (url.DomainIs(kGoogleAmpCacheHost)) {
      // Non-empty iff the path contained the "s/" marker, which selects
      // "https" for the embedded URL.
      std::string secure_marker;
      std::string embedded;
      if (re2::RE2::FullMatch(url.path(), google_amp_cache_path_regex_,
                              &secure_marker, &embedded)) {
        // The query of the cache URL is carried over to the embedded URL.
        if (url.has_query())
          embedded += "?" + url.query();
        return BuildURL(!secure_marker.empty(), embedded);
      }
    }

    // Check for "www.google.TLD/amp/" URLs.
    if (google_util::IsGoogleDomainUrl(
            url, google_util::DISALLOW_SUBDOMAIN,
            google_util::DISALLOW_NON_STANDARD_PORTS)) {
      std::string secure_marker;
      std::string embedded;
      if (re2::RE2::FullMatch(url.path(), google_amp_viewer_path_regex_,
                              &secure_marker, &embedded)) {
        // The embedded URL may be percent-encoded. Undo that.
        embedded = net::UnescapeURLComponent(
            embedded,
            net::UnescapeRule::SPACES | net::UnescapeRule::PATH_SEPARATORS |
                net::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);
        return BuildURL(!secure_marker.empty(), embedded);
      }
    }

    // Check for Google web cache URLs
    // ("webcache.googleusercontent.com/search?q=cache:...").
    std::string query;
    if (url.host_piece() == kGoogleWebCacheHost &&
        url.path_piece().starts_with(kGoogleWebCachePathPrefix) &&
        net::GetValueForKeyInQuery(url, "q", &query)) {
      // |fingerprint| only receives the first capture group; its value is
      // not used.
      std::string fingerprint;
      std::string scheme;
      std::string embedded;
      if (re2::RE2::FullMatch(query, google_web_cache_query_regex_,
                              &fingerprint, &scheme, &embedded)) {
        // A missing scheme is treated as "http".
        return BuildURL(scheme == "https://", embedded);
      }
    }

    // Check for Google translate URLs ("translate.google.TLD/...?...&u=URL" or
    // "translate.googleusercontent.com/...?...&u=URL").
    bool is_translate = false;
    if (base::StartsWith(url.host_piece(), kGoogleTranslateSubdomain,
                         base::CompareCase::SENSITIVE)) {
      // Remove the "translate." prefix. Its length is known at compile time:
      // the array size minus the terminating NUL.
      GURL::Replacements replace;
      replace.SetHostStr(
          url.host_piece().substr(sizeof(kGoogleTranslateSubdomain) - 1));
      GURL trimmed = url.ReplaceComponents(replace);
      // Check that the remainder is a Google URL. Note: IsGoogleDomainUrl
      // checks for [www.]google.TLD, but we don't want the "www.", so
      // explicitly exclude that.
      // TODO(treib,pam): Instead of excluding "www." manually, teach
      // IsGoogleDomainUrl a mode that doesn't allow it.
      is_translate = google_util::IsGoogleDomainUrl(
                         trimmed, google_util::DISALLOW_SUBDOMAIN,
                         google_util::DISALLOW_NON_STANDARD_PORTS) &&
                     !base::StartsWith(trimmed.host_piece(), "www.",
                                       base::CompareCase::SENSITIVE);
    }
    bool is_alternate_translate =
        url.host_piece() == kAlternateGoogleTranslateHost;
    if (is_translate || is_alternate_translate) {
      std::string embedded;
      if (net::GetValueForKeyInQuery(url, "u", &embedded)) {
        // The embedded URL may or may not include a scheme. Fix it if
        // necessary.
        return url_formatter::FixupURL(embedded, /*desired_tld=*/std::string());
      }
    }

    return GURL();
  }

 private:
  friend class base::NoDestructor<EmbeddedURLExtractor>;

  // Compiles the three regexes and DCHECKs that each pattern is valid.
  EmbeddedURLExtractor()
      : google_amp_cache_path_regex_(kGoogleAmpCachePathPattern),
        google_amp_viewer_path_regex_(kGoogleAmpViewerPathPattern),
        google_web_cache_query_regex_(kGoogleWebCacheQueryPattern) {
    DCHECK(google_amp_cache_path_regex_.ok());
    DCHECK(google_amp_viewer_path_regex_.ok());
    DCHECK(google_web_cache_query_regex_.ok());
  }

  ~EmbeddedURLExtractor() = default;

  // Precompiled regexes, one per pattern constant of the matching name.
  const re2::RE2 google_amp_cache_path_regex_;
  const re2::RE2 google_amp_viewer_path_regex_;
  const re2::RE2 google_web_cache_query_regex_;

  DISALLOW_COPY_AND_ASSIGN(EmbeddedURLExtractor);
};
} // namespace
// Returns a copy of |url| with the username, password, query, and ref
// components removed. All other components are left as they are.
GURL Normalize(const GURL& url) {
  GURL::Replacements replacements;
  // Strip username, password, query, and ref.
  replacements.ClearUsername();
  replacements.ClearPassword();
  replacements.ClearQuery();
  replacements.ClearRef();
  return url.ReplaceComponents(replacements);
}
// Returns the URL embedded in |url| by delegating to the shared
// EmbeddedURLExtractor instance, which owns the precompiled regexes.
GURL GetEmbeddedURL(const GURL& url) {
  EmbeddedURLExtractor* const extractor = EmbeddedURLExtractor::GetInstance();
  return extractor->GetEmbeddedURL(url);
}
} // namespace url_util
} // namespace policy