// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/precache/core/precache_fetcher.h"
#include <algorithm>
#include <limits>
#include <string>
#include <utility>
#include <vector>
#include "base/bind.h"
#include "base/bind_helpers.h"
#include "base/callback.h"
#include "base/command_line.h"
#include "base/compiler_specific.h"
#include "base/containers/hash_tables.h"
#include "base/location.h"
#include "base/logging.h"
#include "base/memory/ptr_util.h"
#include "base/memory/ref_counted.h"
#include "base/metrics/histogram_macros.h"
#include "components/precache/core/precache_switches.h"
#include "components/precache/core/proto/precache.pb.h"
#include "components/precache/core/proto/unfinished_work.pb.h"
#include "net/base/completion_callback.h"
#include "net/base/escape.h"
#include "net/base/io_buffer.h"
#include "net/base/load_flags.h"
#include "net/base/net_errors.h"
#include "net/http/http_response_headers.h"
#include "net/url_request/url_fetcher_response_writer.h"
#include "net/url_request/url_request_context_getter.h"
#include "net/url_request/url_request_status.h"
namespace precache {
// The following flags are for privacy reasons. For example, if a user clears
// their cookies, but a tracking beacon is prefetched and the beacon specifies
// its source URL in a URL param, the beacon site would be able to rebuild a
// profile of the user. All three flags should occur together, or not at all,
// per
// https://groups.google.com/a/chromium.org/d/topic/net-dev/vvcodRV6SdM/discussion.
const int kNoTracking =
net::LOAD_DO_NOT_SAVE_COOKIES | net::LOAD_DO_NOT_SEND_COOKIES |
net::LOAD_DO_NOT_SEND_AUTH_DATA;
namespace {
// The maximum number of URLFetcher requests that can be in flight in parallel.
const int kMaxParallelFetches = 10;
// The maximum for the Precache.Fetch.ResponseBytes.* histograms. This is set
// to a value we expect to be roughly the 99th percentile of the recorded
// distribution.
const int kMaxResponseBytes = 500 * 1024 * 1024;
GURL GetDefaultConfigURL() {
const base::CommandLine& command_line =
*base::CommandLine::ForCurrentProcess();
if (command_line.HasSwitch(switches::kPrecacheConfigSettingsURL)) {
return GURL(
command_line.GetSwitchValueASCII(switches::kPrecacheConfigSettingsURL));
}
#if defined(PRECACHE_CONFIG_SETTINGS_URL)
return GURL(PRECACHE_CONFIG_SETTINGS_URL);
#else
// The precache config settings URL could not be determined, so return an
// empty, invalid GURL.
return GURL();
#endif
}
std::string GetDefaultManifestURLPrefix() {
const base::CommandLine& command_line =
*base::CommandLine::ForCurrentProcess();
if (command_line.HasSwitch(switches::kPrecacheManifestURLPrefix)) {
return command_line.GetSwitchValueASCII(
switches::kPrecacheManifestURLPrefix);
}
#if defined(PRECACHE_MANIFEST_URL_PREFIX)
return PRECACHE_MANIFEST_URL_PREFIX;
#else
// The precache manifest URL prefix could not be determined, so return an
// empty string.
return std::string();
#endif
}
// Construct the URL of the precache manifest for the given name (either host or
// URL). The server is expecting a request for a URL consisting of the manifest
// URL prefix followed by the doubly escaped name.
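// For example (illustrative values only): with a prefix of
// "https://example.com/pm/" and a name of "https://www.google.com/", one
// escape pass yields "https%3A%2F%2Fwww.google.com%2F", a second pass yields
// "https%253A%252F%252Fwww.google.com%252F", and the constructed manifest URL
// is "https://example.com/pm/https%253A%252F%252Fwww.google.com%252F".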
std::string ConstructManifestURL(const std::string& prefix,
const std::string& name) {
return prefix + net::EscapeQueryParamValue(
net::EscapeQueryParamValue(name, false), false);
}
// Attempts to parse a protobuf message from the response string of a
// URLFetcher. If parsing is successful, the message parameter will contain the
// parsed protobuf and this function will return true. Otherwise, returns false.
bool ParseProtoFromFetchResponse(const net::URLFetcher& source,
::google::protobuf::MessageLite* message) {
std::string response_string;
if (!source.GetStatus().is_success()) {
DLOG(WARNING) << "Fetch failed: " << source.GetOriginalURL().spec();
return false;
}
if (!source.GetResponseAsString(&response_string)) {
DLOG(WARNING) << "No response string present: "
<< source.GetOriginalURL().spec();
return false;
}
if (!message->ParseFromString(response_string)) {
DLOG(WARNING) << "Unable to parse proto served from "
<< source.GetOriginalURL().spec();
return false;
}
return true;
}
// Returns the resource selection bitset from the |manifest| for the given
// |experiment_id|. If the experiment group is not found, all resources are
// selected by default.
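// For example (illustrative), a bitset of 0x5 (binary 101) selects the
// resources at indices 0 and 2 of the manifest; see the bit test in
// OnManifestFetchComplete().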
uint64_t GetResourceBitset(const PrecacheManifest& manifest,
uint32_t experiment_id) {
if (manifest.has_experiments()) {
const auto& resource_bitset_map =
manifest.experiments().resources_by_experiment_group();
const auto& resource_bitset_it = resource_bitset_map.find(experiment_id);
if (resource_bitset_it != resource_bitset_map.end())
return resource_bitset_it->second.bitset();
}
return ~0ULL;
}
// URLFetcherResponseWriter that discards the response body, in order to avoid
// unnecessary memory usage. Use it rather than the default writer when the
// response body does not need to be parsed. We use it below for requests that
// are issued only to populate the cache with the requested resources.
class URLFetcherNullWriter : public net::URLFetcherResponseWriter {
public:
int Initialize(const net::CompletionCallback& callback) override {
return net::OK;
}
int Write(net::IOBuffer* buffer,
int num_bytes,
const net::CompletionCallback& callback) override {
return num_bytes;
}
int Finish(const net::CompletionCallback& callback) override {
return net::OK;
}
};
void AppendManifestURLIfValidAndNew(
const std::string& prefix,
const std::string& name,
base::hash_set<std::string>* seen_manifest_urls,
std::list<GURL>* unique_manifest_urls) {
const std::string manifest_url = ConstructManifestURL(prefix, name);
bool first_seen = seen_manifest_urls->insert(manifest_url).second;
if (first_seen) {
GURL url(manifest_url);
if (url.is_valid())
unique_manifest_urls->push_back(url);
}
}
} // namespace
PrecacheFetcher::Fetcher::Fetcher(
net::URLRequestContextGetter* request_context,
const GURL& url,
const base::Callback<void(const Fetcher&)>& callback,
bool is_resource_request,
size_t max_bytes)
: request_context_(request_context),
url_(url),
callback_(callback),
is_resource_request_(is_resource_request),
max_bytes_(max_bytes),
response_bytes_(0),
network_response_bytes_(0) {
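  // Resource requests probe the cache first; OnURLFetchComplete() falls back
  // to a network fetch on a cache miss, or issues a revalidation when the
  // cached response carries validators. Config and manifest requests go
  // directly to the network.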
if (is_resource_request_)
LoadFromCache();
else
LoadFromNetwork();
}
PrecacheFetcher::Fetcher::~Fetcher() {}
void PrecacheFetcher::Fetcher::LoadFromCache() {
fetch_stage_ = FetchStage::CACHE;
cache_url_fetcher_ =
net::URLFetcher::Create(url_, net::URLFetcher::GET, this);
cache_url_fetcher_->SetRequestContext(request_context_);
cache_url_fetcher_->SetLoadFlags(net::LOAD_ONLY_FROM_CACHE | kNoTracking);
std::unique_ptr<URLFetcherNullWriter> null_writer(new URLFetcherNullWriter);
cache_url_fetcher_->SaveResponseWithWriter(std::move(null_writer));
cache_url_fetcher_->Start();
}
void PrecacheFetcher::Fetcher::LoadFromNetwork() {
fetch_stage_ = FetchStage::NETWORK;
network_url_fetcher_ =
net::URLFetcher::Create(url_, net::URLFetcher::GET, this);
network_url_fetcher_->SetRequestContext(request_context_);
if (is_resource_request_) {
// LOAD_VALIDATE_CACHE allows us to refresh Date headers for resources
// already in the cache. The Date headers are updated from 304s as well as
// 200s.
network_url_fetcher_->SetLoadFlags(net::LOAD_VALIDATE_CACHE | kNoTracking);
// We don't need a copy of the response body for resource requests. The
// request is issued only to populate the browser cache.
std::unique_ptr<URLFetcherNullWriter> null_writer(new URLFetcherNullWriter);
network_url_fetcher_->SaveResponseWithWriter(std::move(null_writer));
} else {
// Config and manifest requests do not need to be revalidated. It's okay if
// they expire from the cache minutes after we request them.
network_url_fetcher_->SetLoadFlags(kNoTracking);
}
network_url_fetcher_->Start();
}
void PrecacheFetcher::Fetcher::OnURLFetchDownloadProgress(
const net::URLFetcher* source,
int64_t current,
int64_t total) {
  // If the fetch is going over the per-resource download cap, cancel it.
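  // Note: |total| may be -1 when the total response size is not known, so
  // std::max(current, total) uses whichever of the known content length or the
  // bytes received so far is larger.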
if (fetch_stage_ == FetchStage::NETWORK &&
// |current| is guaranteed to be non-negative, so this cast is safe.
static_cast<size_t>(std::max(current, total)) > max_bytes_) {
VLOG(1) << "Cancelling " << url_ << ": (" << current << "/" << total
<< ") is over " << max_bytes_;
// Cancel the download.
network_url_fetcher_.reset();
// Call the completion callback, to attempt the next download, or to trigger
// cleanup in precache_delegate_->OnDone().
response_bytes_ = network_response_bytes_ = current;
callback_.Run(*this);
}
}
void PrecacheFetcher::Fetcher::OnURLFetchComplete(
const net::URLFetcher* source) {
CHECK(source);
if (fetch_stage_ == FetchStage::CACHE &&
(source->GetStatus().error() == net::ERR_CACHE_MISS ||
(source->GetResponseHeaders() &&
source->GetResponseHeaders()->HasValidators()))) {
// If the resource was not found in the cache, request it from the
// network.
//
// If the resource was found in the cache, but contains validators,
// request a refresh. The presence of validators increases the chance that
// we get a 304 response rather than a full one, thus allowing us to
// refresh the cache with minimal network load.
LoadFromNetwork();
return;
}
  // If any of the following is true:
  // - The request was for a config or manifest.
  // - The resource was a cache hit without validators.
  // - The response came from the network.
  // then the Fetcher is done with this URL and can return control to the
  // caller.
response_bytes_ = source->GetReceivedResponseContentLength();
network_response_bytes_ = source->GetTotalReceivedBytes();
callback_.Run(*this);
}
// static
void PrecacheFetcher::RecordCompletionStatistics(
const PrecacheUnfinishedWork& unfinished_work,
size_t remaining_manifest_urls_to_fetch,
size_t remaining_resource_urls_to_fetch) {
// These may be unset in tests.
if (!unfinished_work.has_start_time())
return;
base::TimeDelta time_to_fetch =
base::Time::Now() -
base::Time::FromInternalValue(unfinished_work.start_time());
UMA_HISTOGRAM_CUSTOM_TIMES("Precache.Fetch.TimeToComplete", time_to_fetch,
base::TimeDelta::FromSeconds(1),
base::TimeDelta::FromHours(4), 50);
// Number of manifests for which we have downloaded all resources.
int manifests_completed =
unfinished_work.num_manifest_urls() - remaining_manifest_urls_to_fetch;
// If there are resource URLs left to fetch, the last manifest is not yet
// completed.
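  // For example (illustrative numbers): with 10 manifest URLs in total and 4
  // still to fetch, 6 manifests have been started; if resource URLs are still
  // pending, only 5 count as completed, so 50% is recorded below.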
if (remaining_resource_urls_to_fetch > 0)
--manifests_completed;
DCHECK_GE(manifests_completed, 0);
int percent_completed = unfinished_work.num_manifest_urls() == 0
? 0
: (static_cast<double>(manifests_completed) /
unfinished_work.num_manifest_urls() * 100);
UMA_HISTOGRAM_PERCENTAGE("Precache.Fetch.PercentCompleted",
percent_completed);
UMA_HISTOGRAM_CUSTOM_COUNTS("Precache.Fetch.ResponseBytes.Total",
unfinished_work.total_bytes(),
1, kMaxResponseBytes, 100);
UMA_HISTOGRAM_CUSTOM_COUNTS("Precache.Fetch.ResponseBytes.Network",
unfinished_work.network_bytes(),
1, kMaxResponseBytes,
100);
}
PrecacheFetcher::PrecacheFetcher(
net::URLRequestContextGetter* request_context,
const GURL& config_url,
const std::string& manifest_url_prefix,
std::unique_ptr<PrecacheUnfinishedWork> unfinished_work,
uint32_t experiment_id,
PrecacheFetcher::PrecacheDelegate* precache_delegate)
: request_context_(request_context),
config_url_(config_url),
manifest_url_prefix_(manifest_url_prefix),
precache_delegate_(precache_delegate),
pool_(kMaxParallelFetches),
experiment_id_(experiment_id) {
DCHECK(request_context_.get()); // Request context must be non-NULL.
DCHECK(precache_delegate_); // Precache delegate must be non-NULL.
DCHECK_NE(GURL(), GetDefaultConfigURL())
<< "Could not determine the precache config settings URL.";
DCHECK_NE(std::string(), GetDefaultManifestURLPrefix())
<< "Could not determine the default precache manifest URL prefix.";
DCHECK(unfinished_work);
// Copy manifests and resources to member variables as a convenience.
// TODO(bengr): Consider accessing these directly from the proto.
for (const auto& manifest : unfinished_work->manifest()) {
if (manifest.has_url())
manifest_urls_to_fetch_.push_back(GURL(manifest.url()));
}
for (const auto& resource : unfinished_work->resource()) {
if (resource.has_url())
resource_urls_to_fetch_.push_back(GURL(resource.url()));
}
unfinished_work_ = std::move(unfinished_work);
}
PrecacheFetcher::~PrecacheFetcher() {
}
std::unique_ptr<PrecacheUnfinishedWork> PrecacheFetcher::CancelPrecaching() {
// This could get called multiple times, and it should be handled gracefully.
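  // Move every URL that has not finished fetching, including those still in
  // the fetcher pool, back into |unfinished_work_| so that a later precache
  // session can resume where this one left off.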
if (!unfinished_work_)
return nullptr;
unfinished_work_->clear_manifest();
unfinished_work_->clear_resource();
for (const auto& manifest : manifest_urls_to_fetch_)
unfinished_work_->add_manifest()->set_url(manifest.spec());
for (const auto& resource : resource_urls_to_fetch_)
unfinished_work_->add_resource()->set_url(resource.spec());
for (const auto& it : pool_.elements()) {
const Fetcher* fetcher = it.first;
if (fetcher->is_resource_request())
unfinished_work_->add_resource()->set_url(fetcher->url().spec());
else if (fetcher->url() != config_url_)
unfinished_work_->add_manifest()->set_url(fetcher->url().spec());
}
manifest_urls_to_fetch_.clear();
resource_urls_to_fetch_.clear();
pool_.DeleteAll();
return std::move(unfinished_work_);
}
void PrecacheFetcher::Start() {
if (unfinished_work_->has_config_settings()) {
DCHECK(unfinished_work_->has_start_time());
DetermineManifests();
return;
}
GURL config_url =
config_url_.is_empty() ? GetDefaultConfigURL() : config_url_;
DCHECK(config_url.is_valid()) << "Config URL not valid: "
<< config_url.possibly_invalid_spec();
// Fetch the precache configuration settings from the server.
DCHECK(pool_.IsEmpty()) << "All parallel requests should be available";
VLOG(3) << "Fetching " << config_url;
pool_.Add(base::WrapUnique(new Fetcher(
request_context_.get(), config_url,
base::Bind(&PrecacheFetcher::OnConfigFetchComplete,
base::Unretained(this)),
false /* is_resource_request */, std::numeric_limits<int32_t>::max())));
}
void PrecacheFetcher::StartNextResourceFetch() {
DCHECK(unfinished_work_->has_config_settings());
while (!resource_urls_to_fetch_.empty() && pool_.IsAvailable()) {
const size_t max_bytes =
std::min(unfinished_work_->config_settings().max_bytes_per_resource(),
unfinished_work_->config_settings().max_bytes_total() -
unfinished_work_->total_bytes());
VLOG(3) << "Fetching " << resource_urls_to_fetch_.front();
pool_.Add(base::WrapUnique(
new Fetcher(request_context_.get(), resource_urls_to_fetch_.front(),
base::Bind(&PrecacheFetcher::OnResourceFetchComplete,
base::Unretained(this)),
true /* is_resource_request */, max_bytes)));
resource_urls_to_fetch_.pop_front();
}
}
void PrecacheFetcher::StartNextManifestFetch() {
if (manifest_urls_to_fetch_.empty() || !pool_.IsAvailable())
return;
// We only fetch one manifest at a time to keep the size of
// resource_urls_to_fetch_ as small as possible.
VLOG(3) << "Fetching " << manifest_urls_to_fetch_.front();
pool_.Add(base::WrapUnique(new Fetcher(
request_context_.get(), manifest_urls_to_fetch_.front(),
base::Bind(&PrecacheFetcher::OnManifestFetchComplete,
base::Unretained(this)),
false /* is_resource_request */, std::numeric_limits<int32_t>::max())));
manifest_urls_to_fetch_.pop_front();
}
void PrecacheFetcher::NotifyDone(
size_t remaining_manifest_urls_to_fetch,
size_t remaining_resource_urls_to_fetch) {
RecordCompletionStatistics(*unfinished_work_,
remaining_manifest_urls_to_fetch,
remaining_resource_urls_to_fetch);
precache_delegate_->OnDone();
}
void PrecacheFetcher::StartNextFetch() {
DCHECK(unfinished_work_->has_config_settings());
// If over the precache total size cap, then stop prefetching.
if (unfinished_work_->total_bytes() >
unfinished_work_->config_settings().max_bytes_total()) {
size_t pending_manifests_in_pool = 0;
size_t pending_resources_in_pool = 0;
for (const auto& element_pair : pool_.elements()) {
const Fetcher* fetcher = element_pair.first;
if (fetcher->is_resource_request())
pending_resources_in_pool++;
else if (fetcher->url() != config_url_)
pending_manifests_in_pool++;
}
pool_.DeleteAll();
NotifyDone(manifest_urls_to_fetch_.size() + pending_manifests_in_pool,
resource_urls_to_fetch_.size() + pending_resources_in_pool);
return;
}
StartNextResourceFetch();
StartNextManifestFetch();
if (pool_.IsEmpty()) {
// There are no more URLs to fetch, so end the precache cycle.
NotifyDone(0, 0);
// OnDone may have deleted this PrecacheFetcher, so don't do anything after
// it is called.
}
}
void PrecacheFetcher::OnConfigFetchComplete(const Fetcher& source) {
UpdateStats(source.response_bytes(), source.network_response_bytes());
if (source.network_url_fetcher() == nullptr) {
pool_.DeleteAll(); // Cancel any other ongoing request.
} else {
// Attempt to parse the config proto. On failure, continue on with the
// default configuration.
ParseProtoFromFetchResponse(
*source.network_url_fetcher(),
unfinished_work_->mutable_config_settings());
pool_.Delete(source);
DetermineManifests();
}
}
void PrecacheFetcher::DetermineManifests() {
DCHECK(unfinished_work_->has_config_settings());
std::string prefix = manifest_url_prefix_.empty()
? GetDefaultManifestURLPrefix()
: manifest_url_prefix_;
DCHECK_NE(std::string(), prefix)
<< "Could not determine the precache manifest URL prefix.";
// Keep track of manifest URLs that are being fetched, in order to elide
// duplicates.
base::hash_set<std::string> seen_manifest_urls;
// Attempt to fetch manifests for starting hosts up to the maximum top sites
// count. If a manifest does not exist for a particular starting host, then
// the fetch will fail, and that starting host will be ignored. Starting
// hosts are not added if this is a continuation from a previous precache
// session.
if (manifest_urls_to_fetch_.empty() &&
resource_urls_to_fetch_.empty()) {
int64_t rank = 0;
for (const auto& host : unfinished_work_->top_host()) {
++rank;
if (rank > unfinished_work_->config_settings().top_sites_count())
break;
AppendManifestURLIfValidAndNew(prefix, host.hostname(),
&seen_manifest_urls,
&manifest_urls_to_fetch_);
}
for (const std::string& host
: unfinished_work_->config_settings().forced_site()) {
AppendManifestURLIfValidAndNew(prefix, host, &seen_manifest_urls,
&manifest_urls_to_fetch_);
}
}
unfinished_work_->set_num_manifest_urls(manifest_urls_to_fetch_.size());
StartNextFetch();
}
void PrecacheFetcher::OnManifestFetchComplete(const Fetcher& source) {
DCHECK(unfinished_work_->has_config_settings());
UpdateStats(source.response_bytes(), source.network_response_bytes());
if (source.network_url_fetcher() == nullptr) {
pool_.DeleteAll(); // Cancel any other ongoing request.
} else {
PrecacheManifest manifest;
if (ParseProtoFromFetchResponse(*source.network_url_fetcher(), &manifest)) {
const int32_t len =
std::min(manifest.resource_size(),
unfinished_work_->config_settings().top_resources_count());
const uint64_t resource_bitset =
GetResourceBitset(manifest, experiment_id_);
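      // The bitset is 64 bits wide, so it can describe the selection of at
      // most the first 64 resources in the manifest; any bit that is set
      // causes the corresponding resource to be fetched.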
for (int i = 0; i < len; ++i) {
if (((0x1ULL << i) & resource_bitset) &&
manifest.resource(i).has_url()) {
GURL url(manifest.resource(i).url());
if (url.is_valid())
resource_urls_to_fetch_.push_back(url);
}
}
}
}
pool_.Delete(source);
StartNextFetch();
}
void PrecacheFetcher::OnResourceFetchComplete(const Fetcher& source) {
UpdateStats(source.response_bytes(), source.network_response_bytes());
pool_.Delete(source);
// The resource has already been put in the cache during the fetch process, so
// nothing more needs to be done for the resource.
StartNextFetch();
}
void PrecacheFetcher::UpdateStats(int64_t response_bytes,
int64_t network_response_bytes) {
unfinished_work_->set_total_bytes(
unfinished_work_->total_bytes() + response_bytes);
unfinished_work_->set_network_bytes(
unfinished_work_->network_bytes() + network_response_bytes);
}
} // namespace precache