blob: c3385f8f3da16589bef118b6f905ab2330f1b353 [file] [log] [blame]
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CONTENT_BROWSER_DOWNLOAD_SAVE_PACKAGE_H_
#define CONTENT_BROWSER_DOWNLOAD_SAVE_PACKAGE_H_
#include <stddef.h>
#include <stdint.h>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
#include "base/containers/circular_deque.h"
#include "base/files/file_path.h"
#include "base/gtest_prod_util.h"
#include "base/macros.h"
#include "base/memory/ref_counted.h"
#include "base/memory/weak_ptr.h"
#include "base/time/time.h"
#include "components/download/public/common/download_item.h"
#include "content/browser/download/save_types.h"
#include "content/common/content_export.h"
#include "content/public/browser/download_manager_delegate.h"
#include "content/public/browser/save_page_type.h"
#include "content/public/browser/web_contents_observer.h"
#include "content/public/common/referrer.h"
#include "net/base/net_errors.h"
#include "services/metrics/public/cpp/ukm_source_id.h"
#include "url/gurl.h"
// NOTE(review): "url/gurl.h" is already included above, so this forward
// declaration is presumably redundant (harmless, but could be dropped).
class GURL;
// Forward declarations. This header only names the types below in
// declarations (pointers, references, smart-pointer / container elements), so
// their full definitions are not pulled in here.
namespace download {
class DownloadItemImpl;
}
namespace content {
class DownloadManagerImpl;
class FrameTreeNode;
class RenderFrameHostImpl;
struct SavableSubframe;
class SaveFileManager;
class SaveItem;
class SavePackage;
class WebContents;
// SavePackage manages the process of saving a page as only-HTML, complete-HTML
// or MHTML and provides status information about the job.
// - only-html: the web page is saved to a single HTML file excluding
// sub-resources and sub-frames
// - complete-html: the web page's main frame HTML is saved to the user selected
// file and a directory for the auxiliary files such as all sub-frame html
// files, image files, css files and js files is created
// - MHTML: the main frame and all auxiliary files are stored in a single text
// file using the MHTML format.
//
// Each page saving job may include one or multiple files which need to be
// saved. Each file is represented by a SaveItem, and all SaveItems are owned
// by the SavePackage. SaveItems are created when a user initiates a page
// saving job, and exist for the duration of one contents's lifetime.
class CONTENT_EXPORT SavePackage
    : public base::RefCountedThreadSafe<SavePackage>,
      public WebContentsObserver,
      public base::SupportsWeakPtr<SavePackage> {
 public:
  // Phases of a save job. |wait_state_| records which of these the
  // SavePackage is currently in.
  enum WaitState {
    // State when created but not initialized.
    INITIALIZE = 0,
    // State after initializing, but not yet saving.
    START_PROCESS,
    // Waiting on a list of savable resources from the backend.
    RESOURCES_LIST,
    // Waiting for data sent from net IO or from file system.
    NET_FILES,
    // Waiting for html DOM data sent from render process.
    HTML_DATA,
    // Saving page finished successfully.
    SUCCESSFUL,
    // Failed to save page.
    FAILED
  };

  static const base::FilePath::CharType kDefaultHtmlExtension[];

  // Constructor for user initiated page saving. This constructor results in a
  // SavePackage that will generate and sanitize a suggested name for the user
  // in the "Save As" dialog box.
  explicit SavePackage(WebContents* web_contents);

  // Initializes the SavePackage. Returns true if it initializes properly.
  // Must be called on the UI thread because using g_browser_process on a
  // non-UI thread can cause crashes during shutdown.
  // |cb| will be called when the download::DownloadItem is created, before data
  // is written to disk.
  bool Init(const SavePackageDownloadCreatedCallback& cb);

  // Cancels all in-progress requests; might be triggered by the user or by an
  // internal error. |cancel_download_item| defaults to true.
  void Cancel(bool user_action, bool cancel_download_item = true);

  // NOTE(review): no contract is documented here; presumably completes the
  // save job - confirm against the implementation.
  void Finish();

  // Notifications sent from the download sequence to the UI thread.
  void StartSave(const SaveFileCreateInfo* info);
  bool UpdateSaveProgress(SaveItemId save_item_id,
                          int64_t size,
                          bool write_success);

  // Called for updating end state.
  void SaveFinished(SaveItemId save_item_id, int64_t size, bool is_success);
  void SaveCanceled(const SaveItem* save_item);

  // Calculates the percentage of the whole save page job.
  // Rough percent complete, -1 means we don't know (since we didn't receive a
  // total size).
  int PercentComplete();

  // True if the job was canceled by the user or a disk error occurred.
  bool canceled() const { return user_canceled_ || disk_error_occurred_; }
  bool finished() const { return finished_; }
  SavePageType save_type() const { return save_type_; }
  SavePackageId id() const { return unique_id_; }

  // NOTE(review): undocumented in the original; presumably starts gathering
  // the save path / type (see ContinueGetSaveInfo and OnPathPicked below) -
  // confirm against the implementation.
  void GetSaveInfo();

 private:
  friend class base::RefCountedThreadSafe<SavePackage>;

  // Friends for testing. Needed for accessing the test-only constructor below.
  friend class SavePackageTest;
  friend class WebContentsImpl;
  FRIEND_TEST_ALL_PREFIXES(SavePackageTest, TestSuggestedSaveNames);
  FRIEND_TEST_ALL_PREFIXES(SavePackageTest, TestLongSafePureFilename);
  FRIEND_TEST_ALL_PREFIXES(SavePackageBrowserTest, ImplicitCancel);
  FRIEND_TEST_ALL_PREFIXES(SavePackageBrowserTest, ExplicitCancel);
  FRIEND_TEST_ALL_PREFIXES(SavePackageBrowserTest, DownloadItemDestroyed);

  // Map from SaveItem::id() (aka save_item_id) into a SaveItem.
  using SaveItemIdMap = std::
      unordered_map<SaveItemId, std::unique_ptr<SaveItem>, SaveItemId::Hasher>;

  // Set of file names ordered by a comparison function supplied at
  // construction time.
  using FileNameSet = std::set<base::FilePath::StringType,
                               bool (*)(base::FilePath::StringPieceType,
                                        base::FilePath::StringPieceType)>;

  // Map from a file name to a counter (see |file_name_count_map_|).
  using FileNameCountMap =
      std::unordered_map<base::FilePath::StringType, uint32_t>;

  // Used only for testing. Bypasses the file and directory name generation /
  // sanitization by providing well known paths better suited for tests.
  SavePackage(WebContents* web_contents,
              SavePageType save_type,
              const base::FilePath& file_full_path,
              const base::FilePath& directory_full_path);

  // Private: instances are ref-counted and destroyed through
  // base::RefCountedThreadSafe (a friend above).
  ~SavePackage() override;

  void InitWithDownloadItem(
      const SavePackageDownloadCreatedCallback& download_created_callback,
      download::DownloadItemImpl* item);

  // Callback for WebContents::GenerateMHTML().
  void OnMHTMLGenerated(int64_t size);

  // Notes from Init() above apply here as well.
  void InternalInit();

  void Stop(bool cancel_download_item);
  void CheckFinish();

  // Initiates a saving job of a specific URL. We send the request to
  // SaveFileManager, which will dispatch it to different approaches according
  // to the save source. |process_all_remainder_items| indicates whether we
  // need to save all remaining items.
  void SaveNextFile(bool process_all_remainder_items);

  // Continues processing the save page job after one SaveItem has been
  // finished.
  void DoSavingProcess();

  // WebContentsObserver implementation.
  bool OnMessageReceived(const IPC::Message& message,
                         RenderFrameHost* render_frame_host) override;

  // Updates the download history of this item upon completion.
  void FinalizeDownloadEntry();

  // Returns the max length of a path for a specific base directory.
  // This is needed on POSIX, which restricts the length of file names in
  // addition to the restriction on the length of path names.
  // |base_dir| is assumed to be a directory name with no trailing slash.
  static uint32_t GetMaxPathLengthForDirectory(const base::FilePath& base_dir);

  // Truncates a filename to fit length constraints.
  //
  // |directory|      : Directory containing target file.
  // |extension|      : Extension.
  // |max_path_len|   : Maximum size allowed for |len(directory + base_name +
  //                    extension)|.
  // |base_name|      : Variable portion. The length of this component will be
  //                    adjusted to fit the length constraints described at
  //                    |max_path_len| above.
  //
  // Returns true if |base_name| could be successfully adjusted to fit the
  // aforementioned constraints, or false otherwise.
  // TODO(asanka): This function is wrong. |base_name| cannot be truncated
  // without knowing its encoding and truncation has to be performed on
  // character boundaries. Also the implementation doesn't look up the actual
  // path constraints and instead uses hard coded constants. crbug.com/618737
  static bool TruncateBaseNameToFitPathConstraints(
      const base::FilePath& directory,
      const base::FilePath::StringType& extension,
      uint32_t max_path_len,
      base::FilePath::StringType* base_name);

  // Creates a file name based on the response from the server. The result is
  // written to |generated_name|.
  bool GenerateFileName(const std::string& disposition,
                        const GURL& url,
                        bool need_html_ext,
                        base::FilePath::StringType* generated_name);

  // Main routine that initiates asking all frames for their savable resources.
  //
  // Responses are received asynchronously by OnSavableResourceLinks... methods
  // and pending responses are counted/tracked by
  // CompleteSavableResourceLinksResponse.
  //
  // OnSavableResourceLinksResponse creates SaveItems for each savable resource
  // and each subframe - these SaveItems get enqueued into |waiting_item_queue_|
  // with the help of CreatePendingSaveItem, EnqueueSavableResource,
  // EnqueueFrame.
  void GetSavableResourceLinks();

  // Response from |sender| frame to GetSavableResourceLinks request.
  void OnSavableResourceLinksResponse(
      RenderFrameHostImpl* sender,
      const std::vector<GURL>& resources_list,
      const Referrer& referrer,
      const std::vector<SavableSubframe>& subframes);

  // Helper for finding or creating a SaveItem with the given parameters.
  SaveItem* CreatePendingSaveItem(
      int container_frame_tree_node_id,
      int save_item_frame_tree_node_id,
      const GURL& url,
      const Referrer& referrer,
      SaveFileCreateInfo::SaveFileSource save_source);

  // Helper for finding a SaveItem with the given url, or falling back to
  // creating a SaveItem with the given parameters.
  void CreatePendingSaveItemDeduplicatingByUrl(
      int container_frame_tree_node_id,
      int save_item_frame_tree_node_id,
      const GURL& url,
      const Referrer& referrer,
      SaveFileCreateInfo::SaveFileSource save_source);

  // Helper to enqueue a savable resource reported by GetSavableResourceLinks.
  void EnqueueSavableResource(int container_frame_tree_node_id,
                              const GURL& url,
                              const Referrer& referrer);

  // Helper to enqueue a subframe reported by GetSavableResourceLinks.
  void EnqueueFrame(int container_frame_tree_node_id,
                    int frame_tree_node_id,
                    const GURL& frame_original_url);

  // Response to GetSavableResourceLinks that indicates an error when processing
  // the frame associated with |sender|.
  void OnSavableResourceLinksError(RenderFrameHostImpl* sender);

  // Helper tracking how many |number_of_frames_pending_response_| we have
  // left and kicking off the next phase after we got all the
  // OnSavableResourceLinksResponse messages we were waiting for.
  void CompleteSavableResourceLinksResponse();

  // For each frame in the current page, asks the renderer process associated
  // with that frame to serialize that frame into html.
  void GetSerializedHtmlWithLocalLinks();

  // Asks the renderer process to serialize |target_tree_node| into html data
  // with resource links replaced with a link to a locally saved copy.
  void GetSerializedHtmlWithLocalLinksForFrame(FrameTreeNode* target_tree_node);

  // Routes html data (sent by renderer process in response to
  // GetSerializedHtmlWithLocalLinksForFrame above) to the associated local file
  // (and also keeps track of when all frames have been completed).
  void OnSerializedHtmlWithLocalLinksResponse(RenderFrameHostImpl* sender,
                                              const std::string& data,
                                              bool end_of_data);

  // Looks up a SaveItem by save item id in the in-progress map.
  SaveItem* LookupInProgressSaveItem(SaveItemId save_item_id);

  // Removes a SaveItem from the in-progress map and puts it into a saved map.
  void PutInProgressItemToSavedMap(SaveItem* save_item);

  // Retrieves the URL to be saved from the WebContents.
  static GURL GetUrlToBeSaved(WebContents* web_contents);

  // NOTE(review): undocumented in the original; judging by the name this runs
  // on a file thread and returns a path - confirm against the implementation.
  static base::FilePath CreateDirectoryOnFileThread(
      const base::string16& title,
      const GURL& page_url,
      bool can_save_as_complete,
      const std::string& mime_type,
      const base::FilePath& website_save_dir,
      const base::FilePath& download_save_dir,
      bool skip_dir_check);
  void ContinueGetSaveInfo(bool can_save_as_complete,
                           const base::FilePath& suggested_path);
  void OnPathPicked(
      const base::FilePath& final_name,
      SavePageType type,
      const SavePackageDownloadCreatedCallback& cb);

  // The number of in-process SaveItems.
  int in_process_count() const {
    return static_cast<int>(in_progress_items_.size());
  }

  // The number of all SaveItems which have completed, including successful
  // items and failed items.
  int completed_count() const {
    return static_cast<int>(saved_success_items_.size() +
                            saved_failed_items_.size());
  }

  // The current speed in files per second. This is used to update the
  // download::DownloadItem associated to this SavePackage. The files per second
  // is presented by the download::DownloadItem to the UI as bytes per second,
  // which is not correct but matches the way the total and received number of
  // files is presented as the total and received bytes.
  int64_t CurrentSpeed() const;

  // A queue for items we are about to start saving.
  base::circular_deque<std::unique_ptr<SaveItem>> waiting_item_queue_;

  // Map of all saving jobs in the in-progress state.
  SaveItemIdMap in_progress_items_;

  // Map of all saving jobs which have failed.
  SaveItemIdMap saved_failed_items_;

  // Used to de-dupe urls that are being gathered into |waiting_item_queue_|
  // and also to find SaveItems to associate with a containing frame.
  // Note that |url_to_save_item_| does NOT own SaveItems - they
  // remain owned by waiting_item_queue_, in_progress_items_, etc.
  std::map<GURL, SaveItem*> url_to_save_item_;

  // Map used to route responses from a given subframe (i.e.
  // OnSerializedHtmlWithLocalLinksResponse) to the right SaveItem.
  // Note that |frame_tree_node_id_to_save_item_| does NOT own SaveItems - they
  // remain owned by waiting_item_queue_, in_progress_items_, etc.
  std::unordered_map<int, SaveItem*> frame_tree_node_id_to_save_item_;

  // Used to limit which local paths get exposed to which frames
  // (i.e. to prevent information disclosure to oop frames).
  // Note that |frame_tree_node_id_to_contained_save_items_| does NOT own
  // SaveItems - they remain owned by waiting_item_queue_, in_progress_items_,
  // etc.
  std::unordered_map<int, std::vector<SaveItem*>>
      frame_tree_node_id_to_contained_save_items_;

  // Number of frames that we still need to get a response from.
  int number_of_frames_pending_response_ = 0;

  // Map of all saving jobs which were successfully saved.
  SaveItemIdMap saved_success_items_;

  // Non-owning pointer for handling file writing on the download sequence.
  SaveFileManager* file_manager_ = nullptr;

  // DownloadManager owns the download::DownloadItem and handles history and UI.
  DownloadManagerImpl* download_manager_ = nullptr;
  download::DownloadItemImpl* download_ = nullptr;

  // The URL of the page the user wants to save.
  const GURL page_url_;
  base::FilePath saved_main_file_path_;
  base::FilePath saved_main_directory_path_;

  // The title of the page the user wants to save.
  const base::string16 title_;

  // Used to calculate package download speed (in files per second).
  const base::TimeTicks start_tick_;

  // Indicates whether the actual saving job is finishing or not.
  bool finished_ = false;

  // Indicates whether the user canceled the saving job.
  bool user_canceled_ = false;

  // Indicates whether a disk error occurred.
  bool disk_error_occurred_ = false;

  // Variables to record errors that happened so we can record them via
  // UMA statistics.
  bool wrote_to_completed_file_ = false;
  bool wrote_to_failed_file_ = false;

  // How the page is being saved (e.g. only-html or complete-html).
  SavePageType save_type_ = SAVE_PAGE_TYPE_UNKNOWN;

  // Total number of resources that need to be saved.
  size_t all_save_items_count_ = 0;

  // This set is used to eliminate duplicated file names in saving directory.
  FileNameSet file_name_set_;

  // This map is used to track serial number for specified filename.
  FileNameCountMap file_name_count_map_;

  // Indicates the current waiting state when SavePackage is waiting for
  // something from outside.
  WaitState wait_state_ = INITIALIZE;

  // Unique ID for this SavePackage.
  const SavePackageId unique_id_;

  // UKM IDs for reporting. Given explicit defaults, like every other scalar
  // member of this class, so that they never hold indeterminate values on a
  // path that does not assign them.
  // NOTE(review): presumably the real values are assigned during
  // initialization in the .cc - confirm.
  ukm::SourceId ukm_source_id_ = ukm::kInvalidSourceId;
  uint64_t ukm_download_id_ = 0;

  DISALLOW_COPY_AND_ASSIGN(SavePackage);
};
} // namespace content
#endif // CONTENT_BROWSER_DOWNLOAD_SAVE_PACKAGE_H_