blob: 3f45d288a8bd756c2ff49cf76643ff73612ffc69 [file] [log] [blame]
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "ios/chrome/browser/reading_list/url_downloader.h"
#include <string>
#include <vector>
#include "base/base64.h"
#include "base/bind.h"
#include "base/containers/contains.h"
#include "base/files/file_path.h"
#include "base/files/file_util.h"
#include "base/json/json_writer.h"
#include "base/memory/ptr_util.h"
#include "base/metrics/histogram_macros.h"
#include "base/path_service.h"
#include "base/strings/string_util.h"
#include "base/strings/stringprintf.h"
#include "base/task/thread_pool.h"
#include "components/reading_list/core/offline_url_utils.h"
#include "ios/chrome/browser/chrome_paths.h"
#include "ios/chrome/browser/dom_distiller/distiller_viewer.h"
#include "ios/chrome/browser/reading_list/reading_list_distiller_page.h"
#include "ios/chrome/browser/reading_list/reading_list_distiller_page_factory.h"
#include "net/base/load_flags.h"
#include "net/base/mime_sniffer.h"
#include "net/http/http_response_headers.h"
#include "services/network/public/cpp/resource_request.h"
#include "services/network/public/cpp/shared_url_loader_factory.h"
#include "services/network/public/cpp/simple_url_loader.h"
#include "services/network/public/mojom/url_response_head.mojom.h"
#include "url/gurl.h"
#if !defined(__has_feature) || !__has_feature(objc_arc)
#error "This file requires ARC support."
#endif
namespace {

// Script template appended to saved pages to turn off the long-press context
// menu on <img> elements. Saved pages are served from file:// URLs, and the
// menu offered for those URLs cannot be opened. "$1" is the placeholder for
// the CSP nonce.
constexpr char kDisableImageContextMenuScript[] =
    "<script nonce=\"$1\">"
    "document.addEventListener('DOMContentLoaded', function (event) {"
    " var imgMenuDisabler = document.createElement('style');"
    " imgMenuDisabler.innerHTML = 'img { -webkit-touch-callout: none; }';"
    " document.head.appendChild(imgMenuDisabler);"
    "}, false);"
    "</script>";

// Script template that swaps the source of every downloaded image for a data
// URI. "$1" is the placeholder for the CSP nonce and "$2" for the JavaScript
// statements that fill |imgData|.
constexpr char kReplaceDownloadedImagesScript[] =
    "<script nonce=\"$1\">"
    "document.addEventListener('DOMContentLoaded', function (event) {"
    " var imgData = {};"
    " $2"
    " var imgTags = document.getElementsByTagName(\"img\");"
    " for(image of imgTags) {"
    " image.src = imgData[image.src] || image.src;"
    " }"
    "}, false);"
    "</script>";

// Upper bound, in bytes, for a distilled page. The check sums the sizes of the
// individual resources, so the processed page written to disk can end up
// slightly larger than this value.
constexpr int kMaximumTotalPageSize = 10 * 1024 * 1024;

// Upper bound, in bytes, for one raw image. A page containing a bigger image
// is not distilled (it stays available online only).
constexpr int kMaximumImageSize = 1024 * 1024;

}  // namespace
// URLDownloader
// Builds a downloader that is idle (|working_| is false) and has no queued
// task.
// |distiller_factory|, |distiller_page_factory| and |prefs| are stored as raw
// pointers and used later to create the distiller for each download.
// |chrome_profile_path| is the base directory under which offline content is
// stored.
// |url_loader_factory| is used to fetch PDF files.
// |download_completion| and |delete_completion| are run when a download or a
// delete task finishes.
// File operations are posted to a sequenced, best-effort task runner that may
// block and is skipped on shutdown.
URLDownloader::URLDownloader(
    dom_distiller::DistillerFactory* distiller_factory,
    reading_list::ReadingListDistillerPageFactory* distiller_page_factory,
    PrefService* prefs,
    base::FilePath chrome_profile_path,
    scoped_refptr<network::SharedURLLoaderFactory> url_loader_factory,
    const DownloadCompletion& download_completion,
    const SuccessCompletion& delete_completion)
    : distiller_page_factory_(distiller_page_factory),
      distiller_factory_(distiller_factory),
      pref_service_(prefs),
      download_completion_(download_completion),
      delete_completion_(delete_completion),
      working_(false),
      // |chrome_profile_path| is taken by value, so move it instead of making
      // a second copy of the path.
      base_directory_(std::move(chrome_profile_path)),
      mime_type_(),
      url_loader_factory_(std::move(url_loader_factory)),
      task_runner_(base::ThreadPool::CreateSequencedTaskRunner(
          {base::MayBlock(), base::TaskPriority::BEST_EFFORT,
           base::TaskShutdownBehavior::SKIP_ON_SHUTDOWN})),
      task_tracker_() {}
// Asks |task_tracker_| to cancel every task it is still tracking.
// NOTE(review): presumably needed because the tracked tasks and replies are
// bound with base::Unretained(this) and must not run after destruction.
URLDownloader::~URLDownloader() {
task_tracker_.TryCancelAll();
}
// Checks on |task_runner_| whether |path| exists and hands the boolean result
// to |callback|. The task is tracked by |task_tracker_|.
void URLDownloader::OfflinePathExists(const base::FilePath& path,
                                      base::OnceCallback<void(bool)> callback) {
  auto path_exists_check = base::BindOnce(&base::PathExists, path);
  task_tracker_.PostTaskAndReplyWithResult(task_runner_.get(), FROM_HERE,
                                           std::move(path_exists_check),
                                           std::move(callback));
}
// Queues the removal of the offline content stored for |url| and starts
// processing the queue if the downloader is idle.
void URLDownloader::RemoveOfflineURL(const GURL& url) {
  // A download still waiting in the queue for this URL would be pointless
  // work, so drop it before queuing the deletion.
  CancelDownloadOfflineURL(url);

  tasks_.emplace_back(DELETE, url);
  HandleNextTask();
}
// Queues a download of |url| unless the same download is already waiting in
// |tasks_|, then starts processing the queue if the downloader is idle.
void URLDownloader::DownloadOfflineURL(const GURL& url) {
  const auto download_task = std::make_pair(DOWNLOAD, url);
  if (base::Contains(tasks_, download_task)) {
    return;
  }
  tasks_.push_back(download_task);
  HandleNextTask();
}
// Removes every queued download task for |url| from |tasks_|. Only tasks still
// waiting in the queue are affected.
void URLDownloader::CancelDownloadOfflineURL(const GURL& url) {
  const auto download_task = std::make_pair(DOWNLOAD, url);
  auto first_cancelled =
      std::remove(tasks_.begin(), tasks_.end(), download_task);
  tasks_.erase(first_cancelled, tasks_.end());
}
// Finishes the current download task for |url|: reports |success|, |title|,
// |offline_path|, |distilled_url_| and |saved_size_| to
// |download_completion_|, releases the distiller and starts the next queued
// task. When |success| is ERROR, the offline directory of |url| is deleted on
// |task_runner_| first and the report happens in the reply.
void URLDownloader::DownloadCompletionHandler(
    const GURL& url,
    const std::string& title,
    const base::FilePath& offline_path,
    SuccessState success) {
  DCHECK(working_);

  // Notifies the client, then frees the downloader for the next task.
  auto finish_download = base::BindOnce(
      [](URLDownloader* downloader, const GURL& finished_url,
         const std::string& page_title, const base::FilePath& saved_path,
         SuccessState state) {
        downloader->download_completion_.Run(
            finished_url, downloader->distilled_url_, state, saved_path,
            downloader->saved_size_, page_title);
        downloader->distiller_.reset();
        downloader->working_ = false;
        downloader->HandleNextTask();
      },
      base::Unretained(this), url, title, offline_path, success);

  if (success != ERROR) {
    std::move(finish_download).Run();
    return;
  }

  // The download failed: remove whatever was partially written before
  // reporting the failure.
  task_tracker_.PostTaskAndReply(
      task_runner_.get(), FROM_HERE,
      base::BindOnce(
          [](const base::FilePath& partial_download_directory) {
            base::DeletePathRecursively(partial_download_directory);
          },
          reading_list::OfflineURLDirectoryAbsolutePath(base_directory_, url)),
      std::move(finish_download));
}
// Finishes the current delete task: reports |url| and |success| to
// |delete_completion_|, marks the downloader as idle and starts the next
// queued task, if any.
void URLDownloader::DeleteCompletionHandler(const GURL& url, bool success) {
DCHECK(working_);
delete_completion_.Run(url, success);
// Clear the flag before HandleNextTask(), which returns early while
// |working_| is set.
working_ = false;
HandleNextTask();
}
void URLDownloader::HandleNextTask() {
if (working_ || tasks_.empty()) {
return;
}
working_ = true;
Task task = tasks_.front();
tasks_.pop_front();
GURL url = task.second;
base::FilePath directory_path =
reading_list::OfflineURLDirectoryAbsolutePath(base_directory_, url);
if (task.first == DELETE) {
task_tracker_.PostTaskAndReplyWithResult(
task_runner_.get(), FROM_HERE,
base::BindOnce(&base::DeletePathRecursively, directory_path),
base::BindOnce(&URLDownloader::DeleteCompletionHandler,
base::Unretained(this), url));
} else if (task.first == DOWNLOAD) {
DCHECK(!distiller_);
OfflinePathExists(directory_path,
base::BindOnce(&URLDownloader::DownloadURL,
base::Unretained(this), url));
}
}
void URLDownloader::DownloadURL(const GURL& url, bool offline_url_exists) {
if (offline_url_exists) {
DownloadCompletionHandler(url, std::string(), base::FilePath(),
DOWNLOAD_EXISTS);
return;
}
original_url_ = url;
distilled_url_ = url;
saved_size_ = 0;
std::unique_ptr<reading_list::ReadingListDistillerPage>
reading_list_distiller_page =
distiller_page_factory_->CreateReadingListDistillerPage(url, this);
distiller_.reset(new dom_distiller::DistillerViewer(
distiller_factory_, std::move(reading_list_distiller_page), pref_service_,
url,
base::BindRepeating(&URLDownloader::DistillerCallback,
base::Unretained(this))));
}
// Records |redirected_url| as the URL that is actually distilled.
// |page_url| is expected to be the URL of the download in progress
// (|original_url_|).
void URLDownloader::DistilledPageRedirectedToURL(const GURL& page_url,
const GURL& redirected_url) {
DCHECK(original_url_ == page_url);
distilled_url_ = redirected_url;
}
// Records |mime_type| for the download in progress. DistillerCallback() reads
// it to decide whether the content can be saved as a PDF.
// |original_url| is expected to be the URL of the download in progress
// (|original_url_|).
void URLDownloader::DistilledPageHasMimeType(const GURL& original_url,
const std::string& mime_type) {
DCHECK(original_url_ == original_url);
mime_type_ = mime_type;
}
void URLDownloader::OnURLLoadComplete(const GURL& original_url,
base::FilePath response_path) {
// At the moment, only pdf files are downloaded using URLFetcher.
DCHECK(mime_type_ == "application/pdf");
base::FilePath path = reading_list::OfflinePagePath(
original_url_, reading_list::OFFLINE_TYPE_PDF);
std::string mime_type;
if (url_loader_->ResponseInfo()) {
mime_type = url_loader_->ResponseInfo()->mime_type;
}
if (response_path.empty() || mime_type != mime_type_) {
return DownloadCompletionHandler(original_url_, "", path, ERROR);
}
task_tracker_.PostTaskAndReplyWithResult(
task_runner_.get(), FROM_HERE,
base::BindOnce(&URLDownloader::SavePDFFile, base::Unretained(this),
response_path),
base::BindOnce(&URLDownloader::DownloadCompletionHandler,
base::Unretained(this), original_url, "", path));
url_loader_.reset();
}
// Asks |task_tracker_| to cancel every tracked task and releases the
// distiller.
// NOTE(review): |working_| is not cleared here; confirm that callers do not
// expect the queue to resume after a cancellation.
void URLDownloader::CancelTask() {
task_tracker_.TryCancelAll();
distiller_.reset();
}
// Downloads the PDF of the current task to a temporary file. The request
// targets |distilled_url_| when it is valid and |original_url_| otherwise.
// OnURLLoadComplete() receives the fetched URL and the temporary file path.
void URLDownloader::FetchPDFFile() {
  const GURL& pdf_url =
      distilled_url_.is_valid() ? distilled_url_ : original_url_;

  auto request = std::make_unique<network::ResourceRequest>();
  request->load_flags = net::LOAD_SKIP_CACHE_VALIDATION;
  request->url = pdf_url;

  auto on_load_complete = base::BindOnce(&URLDownloader::OnURLLoadComplete,
                                         base::Unretained(this), pdf_url);
  url_loader_ = network::SimpleURLLoader::Create(std::move(request),
                                                 NO_TRAFFIC_ANNOTATION_YET);
  url_loader_->DownloadToTempFile(url_loader_factory_.get(),
                                  std::move(on_load_complete));
}
// Moves the PDF stored at |temporary_path| into the offline directory of
// |original_url_| and adds its size to |saved_size_|.
// Returns DOWNLOAD_SUCCESS when the file was moved, and ERROR when the
// directory cannot be created or the move fails.
URLDownloader::SuccessState URLDownloader::SavePDFFile(
    const base::FilePath& temporary_path) {
  if (!CreateOfflineURLDirectory(original_url_)) {
    return ERROR;
  }

  const base::FilePath path = reading_list::OfflinePagePath(
      original_url_, reading_list::OFFLINE_TYPE_PDF);
  const base::FilePath absolute_path =
      reading_list::OfflineURLAbsolutePathFromRelativePath(base_directory_,
                                                           path);
  if (!base::Move(temporary_path, absolute_path)) {
    return ERROR;
  }

  // The file is saved at this point. Only account for its size when the size
  // could be read, so an uninitialized value is never added to |saved_size_|.
  int64_t pdf_file_size = 0;
  if (base::GetFileSize(absolute_path, &pdf_file_size)) {
    saved_size_ += pdf_file_size;
  }
  return DOWNLOAD_SUCCESS;
}
// Receives the result of the distillation of |page_url|.
// A non-empty |html| is saved together with |images| on |task_runner_| and the
// outcome, with |title|, is forwarded to DownloadCompletionHandler().
// An empty |html| means the page may not be HTML: a PDF (according to
// |mime_type_|) is fetched as a file, anything else completes with ERROR.
void URLDownloader::DistillerCallback(
    const GURL& page_url,
    const std::string& html,
    const std::vector<dom_distiller::DistillerViewerInterface::ImageInfo>&
        images,
    const std::string& title) {
  if (!html.empty()) {
    const base::FilePath offline_html_path =
        reading_list::OfflinePagePath(page_url, reading_list::OFFLINE_TYPE_HTML);
    task_tracker_.PostTaskAndReplyWithResult(
        task_runner_.get(), FROM_HERE,
        base::BindOnce(&URLDownloader::SaveDistilledHTML,
                       base::Unretained(this), page_url, images, html),
        base::BindOnce(&URLDownloader::DownloadCompletionHandler,
                       base::Unretained(this), page_url, title,
                       offline_html_path));
    return;
  }

  // No HTML was produced. Check the mime type to see whether another handler
  // can save offline content.
  if (mime_type_ == "application/pdf") {
    // The PDF handler simply downloads the PDF file.
    FetchPDFFile();
    return;
  }

  // This content cannot be processed; report an error to the client.
  DownloadCompletionHandler(page_url, std::string(), base::FilePath(), ERROR);
}
// Saves the distilled |html| of |url|, with |images| inlined, in the offline
// directory of |url|.
// Returns PERMANENT_ERROR when one image is bigger than kMaximumImageSize or
// when the estimated page size is bigger than kMaximumTotalPageSize; each
// case is recorded in its own histogram. Returns ERROR when the directory or
// the file cannot be written, and DOWNLOAD_SUCCESS otherwise.
URLDownloader::SuccessState URLDownloader::SaveDistilledHTML(
    const GURL& url,
    const std::vector<dom_distiller::DistillerViewerInterface::ImageInfo>&
        images,
    const std::string& html) {
  // Sizes are accumulated in size_t: narrowing them to int could overflow for
  // a page with a large number of images.
  size_t total_size = html.size();
  for (const auto& image : images) {
    const size_t image_size = image.data.size();
    if (image_size > static_cast<size_t>(kMaximumImageSize)) {
      UMA_HISTOGRAM_MEMORY_KB("IOS.ReadingList.ImageTooLargeFailure",
                              static_cast<int>(image_size / 1024));
      return PERMANENT_ERROR;
    }
    // The image will be base64 encoded, which grows it by about a third.
    total_size += 4 * image_size / 3;
  }

  if (total_size > static_cast<size_t>(kMaximumTotalPageSize)) {
    UMA_HISTOGRAM_MEMORY_KB("IOS.ReadingList.PageTooLargeFailure",
                            static_cast<int>(total_size / 1024));
    return PERMANENT_ERROR;
  }

  if (!CreateOfflineURLDirectory(url)) {
    return ERROR;
  }
  return SaveHTMLForURL(ReplaceImagesInHTML(url, html, images), url)
             ? DOWNLOAD_SUCCESS
             : ERROR;
}
bool URLDownloader::CreateOfflineURLDirectory(const GURL& url) {
base::FilePath directory_path =
reading_list::OfflineURLDirectoryAbsolutePath(base_directory_, url);
if (!DirectoryExists(directory_path)) {
return CreateDirectoryAndGetError(directory_path, nil);
}
return true;
}
std::string URLDownloader::ReplaceImagesInHTML(
const GURL& url,
const std::string& html,
const std::vector<dom_distiller::DistillerViewerInterface::ImageInfo>&
images) {
std::string mutable_html = html;
std::string image_js;
bool local_images_found = false;
for (size_t i = 0; i < images.size(); i++) {
if (images[i].url.SchemeIs(url::kDataScheme)) {
// Data URI, the data part of the image is empty, no need to store it.
continue;
}
std::string local_image_name;
// Mixed content is HTTP images on HTTPS pages.
bool image_is_mixed_content = distilled_url_.SchemeIsCryptographic() &&
!images[i].url.SchemeIsCryptographic();
// Only inline images if it is not mixed content and image data is valid.
if (image_is_mixed_content || !images[i].url.is_valid() ||
images[i].data.empty()) {
continue;
}
// Try to detect the mime-type from the bytes so an arbitrary page cannot
// be included. Returned mime-type must start with "image/".
std::string sniffed_type;
if (!net::SniffMimeTypeFromLocalData(images[i].data, &sniffed_type)) {
continue;
}
if (!base::StartsWith(sniffed_type, "image/")) {
continue;
}
std::string image_url;
std::string image_data;
base::Value value(images[i].url.spec());
base::JSONWriter::Write(value, &image_url);
base::Base64Encode(images[i].data, &image_data);
std::string src_with_data =
base::StringPrintf("data:image/png;base64,%s", image_data.c_str());
image_js += "imgData[" + image_url + "] = \"" + src_with_data + "\";";
local_images_found = true;
}
if (local_images_found) {
std::vector<std::string> substitutions;
substitutions.push_back(distiller_->GetCspNonce());
mutable_html += base::ReplaceStringPlaceholders(
kDisableImageContextMenuScript, substitutions, nullptr);
substitutions.push_back(image_js);
mutable_html += base::ReplaceStringPlaceholders(
kReplaceDownloadedImagesScript, substitutions, nullptr);
}
return mutable_html;
}
bool URLDownloader::SaveHTMLForURL(std::string html, const GURL& url) {
if (html.empty()) {
return false;
}
base::FilePath path = reading_list::OfflineURLAbsolutePathFromRelativePath(
base_directory_,
reading_list::OfflinePagePath(url, reading_list::OFFLINE_TYPE_HTML));
int written = base::WriteFile(path, html.c_str(), html.length());
if (written <= 0) {
return false;
}
saved_size_ += written;
return true;
}