| // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "pdf/document_loader.h" |
| |
| #include <stddef.h> |
| #include <stdint.h> |
| |
| #include "base/logging.h" |
| #include "base/strings/string_util.h" |
| #include "net/http/http_util.h" |
| #include "ppapi/c/pp_errors.h" |
| #include "ppapi/cpp/url_loader.h" |
| #include "ppapi/cpp/url_request_info.h" |
| #include "ppapi/cpp/url_response_info.h" |
| |
| namespace chrome_pdf { |
| |
| namespace { |
| |
| // If the headers have a byte-range response, writes the start and end |
| // positions and returns true if at least the start position was parsed. |
| // The end position will be set to 0 if it was not found or parsed from the |
| // response. |
| // Returns false if not even a start position could be parsed. |
| bool GetByteRange(const std::string& headers, uint32_t* start, uint32_t* end) { |
| net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n"); |
| while (it.GetNext()) { |
| if (base::LowerCaseEqualsASCII(it.name(), "content-range")) { |
| std::string range = it.values().c_str(); |
| if (base::StartsWith(range, "bytes", |
| base::CompareCase::INSENSITIVE_ASCII)) { |
| range = range.substr(strlen("bytes")); |
| std::string::size_type pos = range.find('-'); |
| std::string range_end; |
| if (pos != std::string::npos) |
| range_end = range.substr(pos + 1); |
| base::TrimWhitespaceASCII(range, base::TRIM_LEADING, &range); |
| base::TrimWhitespaceASCII(range_end, base::TRIM_LEADING, &range_end); |
| *start = atoi(range.c_str()); |
| *end = atoi(range_end.c_str()); |
| return true; |
| } |
| } |
| } |
| return false; |
| } |
| |
| // If the headers have a multi-part response, returns the boundary name. |
| // Otherwise returns an empty string. |
| std::string GetMultiPartBoundary(const std::string& headers) { |
| net::HttpUtil::HeadersIterator it(headers.begin(), headers.end(), "\n"); |
| while (it.GetNext()) { |
| if (base::LowerCaseEqualsASCII(it.name(), "content-type")) { |
| std::string type = base::ToLowerASCII(it.values()); |
| if (base::StartsWith(type, "multipart/", base::CompareCase::SENSITIVE)) { |
| const char* boundary = strstr(type.c_str(), "boundary="); |
| if (!boundary) { |
| NOTREACHED(); |
| break; |
| } |
| |
| return std::string(boundary + 9); |
| } |
| } |
| } |
| return std::string(); |
| } |
| |
| bool IsValidContentType(const std::string& type) { |
| return (base::EndsWith(type, "/pdf", base::CompareCase::INSENSITIVE_ASCII) || |
| base::EndsWith(type, ".pdf", base::CompareCase::INSENSITIVE_ASCII) || |
| base::EndsWith(type, "/x-pdf", |
| base::CompareCase::INSENSITIVE_ASCII) || |
| base::EndsWith(type, "/*", base::CompareCase::INSENSITIVE_ASCII) || |
| base::EndsWith(type, "/acrobat", |
| base::CompareCase::INSENSITIVE_ASCII) || |
| base::EndsWith(type, "/unknown", |
| base::CompareCase::INSENSITIVE_ASCII)); |
| } |
| |
| } // namespace |
| |
| DocumentLoader::Client::~Client() { |
| } |
| |
| DocumentLoader::DocumentLoader(Client* client) |
| : client_(client), partial_document_(false), request_pending_(false), |
| current_pos_(0), current_chunk_size_(0), current_chunk_read_(0), |
| document_size_(0), header_request_(true), is_multipart_(false) { |
| loader_factory_.Initialize(this); |
| } |
| |
| DocumentLoader::~DocumentLoader() { |
| } |
| |
| bool DocumentLoader::Init(const pp::URLLoader& loader, |
| const std::string& url, |
| const std::string& headers) { |
| DCHECK(url_.empty()); |
| url_ = url; |
| loader_ = loader; |
| |
| std::string response_headers; |
| if (!headers.empty()) { |
| response_headers = headers; |
| } else { |
| pp::URLResponseInfo response = loader_.GetResponseInfo(); |
| pp::Var headers_var = response.GetHeaders(); |
| |
| if (headers_var.is_string()) { |
| response_headers = headers_var.AsString(); |
| } |
| } |
| |
| bool accept_ranges_bytes = false; |
| bool content_encoded = false; |
| uint32_t content_length = 0; |
| std::string type; |
| std::string disposition; |
| |
| // This happens for PDFs not loaded from http(s) sources. |
| if (response_headers == "Content-Type: text/plain") { |
| if (!base::StartsWith(url, "http://", |
| base::CompareCase::INSENSITIVE_ASCII) && |
| !base::StartsWith(url, "https://", |
| base::CompareCase::INSENSITIVE_ASCII)) { |
| type = "application/pdf"; |
| } |
| } |
| if (type.empty() && !response_headers.empty()) { |
| net::HttpUtil::HeadersIterator it(response_headers.begin(), |
| response_headers.end(), "\n"); |
| while (it.GetNext()) { |
| if (base::LowerCaseEqualsASCII(it.name(), "content-length")) { |
| content_length = atoi(it.values().c_str()); |
| } else if (base::LowerCaseEqualsASCII(it.name(), "accept-ranges")) { |
| accept_ranges_bytes = base::LowerCaseEqualsASCII(it.values(), "bytes"); |
| } else if (base::LowerCaseEqualsASCII(it.name(), "content-encoding")) { |
| content_encoded = true; |
| } else if (base::LowerCaseEqualsASCII(it.name(), "content-type")) { |
| type = it.values(); |
| size_t semi_colon_pos = type.find(';'); |
| if (semi_colon_pos != std::string::npos) { |
| type = type.substr(0, semi_colon_pos); |
| } |
| TrimWhitespaceASCII(type, base::TRIM_ALL, &type); |
| } else if (base::LowerCaseEqualsASCII(it.name(), "content-disposition")) { |
| disposition = it.values(); |
| } |
| } |
| } |
| if (!type.empty() && !IsValidContentType(type)) |
| return false; |
| if (base::StartsWith(disposition, "attachment", |
| base::CompareCase::INSENSITIVE_ASCII)) |
| return false; |
| |
| if (content_length > 0) |
| chunk_stream_.Preallocate(content_length); |
| |
| document_size_ = content_length; |
| requests_count_ = 0; |
| |
| // Enable partial loading only if file size is above the threshold. |
| // It will allow avoiding latency for multiple requests. |
| if (content_length > kMinFileSize && |
| accept_ranges_bytes && |
| !content_encoded) { |
| LoadPartialDocument(); |
| } else { |
| LoadFullDocument(); |
| } |
| return true; |
| } |
| |
| void DocumentLoader::LoadPartialDocument() { |
| // The current request is a full request (not a range request) so it starts at |
| // 0 and ends at |document_size_|. |
| current_chunk_size_ = document_size_; |
| current_pos_ = 0; |
| current_request_offset_ = 0; |
| current_request_size_ = 0; |
| current_request_extended_size_ = document_size_; |
| request_pending_ = true; |
| |
| partial_document_ = true; |
| header_request_ = true; |
| ReadMore(); |
| } |
| |
| void DocumentLoader::LoadFullDocument() { |
| partial_document_ = false; |
| chunk_buffer_.clear(); |
| ReadMore(); |
| } |
| |
| bool DocumentLoader::IsDocumentComplete() const { |
| if (document_size_ == 0) // Document size unknown. |
| return false; |
| return IsDataAvailable(0, document_size_); |
| } |
| |
| uint32_t DocumentLoader::GetAvailableData() const { |
| if (document_size_ == 0) { // If document size is unknown. |
| return current_pos_; |
| } |
| |
| std::vector<std::pair<size_t, size_t> > ranges; |
| chunk_stream_.GetMissedRanges(0, document_size_, &ranges); |
| uint32_t available = document_size_; |
| for (const auto& range : ranges) |
| available -= range.second; |
| return available; |
| } |
| |
| void DocumentLoader::ClearPendingRequests() { |
| pending_requests_.erase(pending_requests_.begin(), |
| pending_requests_.end()); |
| } |
| |
| bool DocumentLoader::GetBlock(uint32_t position, |
| uint32_t size, |
| void* buf) const { |
| return chunk_stream_.ReadData(position, size, buf); |
| } |
| |
| bool DocumentLoader::IsDataAvailable(uint32_t position, uint32_t size) const { |
| return chunk_stream_.IsRangeAvailable(position, size); |
| } |
| |
| void DocumentLoader::RequestData(uint32_t position, uint32_t size) { |
| DCHECK(partial_document_); |
| |
| // We have some artefact request from |
| // PDFiumEngine::OnDocumentComplete() -> FPDFAvail_IsPageAvail after |
| // document is complete. |
| // We need this fix in PDFIum. Adding this as a work around. |
| // Bug: http://code.google.com/p/chromium/issues/detail?id=79996 |
| // Test url: |
| // http://www.icann.org/en/correspondence/holtzman-to-jeffrey-02mar11-en.pdf |
| if (IsDocumentComplete()) |
| return; |
| |
| pending_requests_.push_back(std::pair<size_t, size_t>(position, size)); |
| DownloadPendingRequests(); |
| } |
| |
| void DocumentLoader::RemoveCompletedRanges() { |
| // Split every request that has been partially downloaded already into smaller |
| // requests. |
| std::vector<std::pair<size_t, size_t> > ranges; |
| auto it = pending_requests_.begin(); |
| while (it != pending_requests_.end()) { |
| chunk_stream_.GetMissedRanges(it->first, it->second, &ranges); |
| pending_requests_.insert(it, ranges.begin(), ranges.end()); |
| ranges.clear(); |
| pending_requests_.erase(it++); |
| } |
| } |
| |
| void DocumentLoader::DownloadPendingRequests() { |
| if (request_pending_) |
| return; |
| |
| uint32_t pos; |
| uint32_t size; |
| if (pending_requests_.empty()) { |
| // If the document is not complete and we have no outstanding requests, |
| // download what's left for as long as no other request gets added to |
| // |pending_requests_|. |
| pos = chunk_stream_.GetFirstMissingByte(); |
| if (pos >= document_size_) { |
| // We're done downloading the document. |
| return; |
| } |
| // Start with size 0, we'll set |current_request_extended_size_| to > 0. |
| // This way this request will get cancelled as soon as the renderer wants |
| // another portion of the document. |
| size = 0; |
| } else { |
| RemoveCompletedRanges(); |
| |
| pos = pending_requests_.front().first; |
| size = pending_requests_.front().second; |
| if (IsDataAvailable(pos, size)) { |
| ReadComplete(); |
| return; |
| } |
| } |
| |
| size_t last_byte_before = chunk_stream_.GetFirstMissingByteInInterval(pos); |
| if (size < kDefaultRequestSize) { |
| // Try to extend before pos, up to size |kDefaultRequestSize|. |
| if (pos + size - last_byte_before > kDefaultRequestSize) { |
| pos += size - kDefaultRequestSize; |
| size = kDefaultRequestSize; |
| } else { |
| size += pos - last_byte_before; |
| pos = last_byte_before; |
| } |
| } |
| if (pos - last_byte_before < kDefaultRequestSize) { |
| // Don't leave a gap smaller than |kDefaultRequestSize|. |
| size += pos - last_byte_before; |
| pos = last_byte_before; |
| } |
| |
| current_request_offset_ = pos; |
| current_request_size_ = size; |
| |
| // Extend the request until the next downloaded byte or the end of the |
| // document. |
| size_t last_missing_byte = |
| chunk_stream_.GetLastMissingByteInInterval(pos + size - 1); |
| current_request_extended_size_ = last_missing_byte - pos + 1; |
| |
| request_pending_ = true; |
| |
| // Start downloading first pending request. |
| loader_.Close(); |
| loader_ = client_->CreateURLLoader(); |
| pp::CompletionCallback callback = |
| loader_factory_.NewCallback(&DocumentLoader::DidOpen); |
| pp::URLRequestInfo request = GetRequest(pos, current_request_extended_size_); |
| requests_count_++; |
| int rv = loader_.Open(request, callback); |
| if (rv != PP_OK_COMPLETIONPENDING) |
| callback.Run(rv); |
| } |
| |
| pp::URLRequestInfo DocumentLoader::GetRequest(uint32_t position, |
| uint32_t size) const { |
| pp::URLRequestInfo request(client_->GetPluginInstance()); |
| request.SetURL(url_); |
| request.SetMethod("GET"); |
| request.SetFollowRedirects(true); |
| request.SetCustomReferrerURL(url_); |
| |
| const size_t kBufSize = 100; |
| char buf[kBufSize]; |
| // According to rfc2616, byte range specifies position of the first and last |
| // bytes in the requested range inclusively. Therefore we should subtract 1 |
| // from the position + size, to get index of the last byte that needs to be |
| // downloaded. |
| base::snprintf(buf, kBufSize, "Range: bytes=%d-%d", position, |
| position + size - 1); |
| pp::Var header(buf); |
| request.SetHeaders(header); |
| |
| return request; |
| } |
| |
| void DocumentLoader::DidOpen(int32_t result) { |
| if (result != PP_OK) { |
| NOTREACHED(); |
| return; |
| } |
| |
| int32_t http_code = loader_.GetResponseInfo().GetStatusCode(); |
| if (http_code >= 400 && http_code < 500) { |
| // Error accessing resource. 4xx error indicate subsequent requests |
| // will fail too. |
| // E.g. resource has been removed from the server while loading it. |
| // https://code.google.com/p/chromium/issues/detail?id=414827 |
| return; |
| } |
| |
| is_multipart_ = false; |
| current_chunk_size_ = 0; |
| current_chunk_read_ = 0; |
| |
| pp::Var headers_var = loader_.GetResponseInfo().GetHeaders(); |
| std::string headers; |
| if (headers_var.is_string()) |
| headers = headers_var.AsString(); |
| |
| std::string boundary = GetMultiPartBoundary(headers); |
| if (!boundary.empty()) { |
| // Leave position untouched for now, when we read the data we'll get it. |
| is_multipart_ = true; |
| multipart_boundary_ = boundary; |
| } else { |
| // Need to make sure that the server returned a byte-range, since it's |
| // possible for a server to just ignore our byte-range request and just |
| // return the entire document even if it supports byte-range requests. |
| // i.e. sniff response to |
| // http://www.act.org/compass/sample/pdf/geometry.pdf |
| current_pos_ = 0; |
| uint32_t start_pos, end_pos; |
| if (GetByteRange(headers, &start_pos, &end_pos)) { |
| current_pos_ = start_pos; |
| if (end_pos && end_pos > start_pos) |
| current_chunk_size_ = end_pos - start_pos + 1; |
| } else { |
| partial_document_ = false; |
| } |
| } |
| |
| ReadMore(); |
| } |
| |
| void DocumentLoader::ReadMore() { |
| pp::CompletionCallback callback = |
| loader_factory_.NewCallback(&DocumentLoader::DidRead); |
| int rv = loader_.ReadResponseBody(buffer_, sizeof(buffer_), callback); |
| if (rv != PP_OK_COMPLETIONPENDING) |
| callback.Run(rv); |
| } |
| |
| void DocumentLoader::DidRead(int32_t result) { |
| if (result > 0) { |
| char* start = buffer_; |
| size_t length = result; |
| if (is_multipart_ && result > 2) { |
| for (int i = 2; i < result; ++i) { |
| if ((buffer_[i - 1] == '\n' && buffer_[i - 2] == '\n') || |
| (i >= 4 && |
| buffer_[i - 1] == '\n' && buffer_[i - 2] == '\r' && |
| buffer_[i - 3] == '\n' && buffer_[i - 4] == '\r')) { |
| uint32_t start_pos, end_pos; |
| if (GetByteRange(std::string(buffer_, i), &start_pos, &end_pos)) { |
| current_pos_ = start_pos; |
| start += i; |
| length -= i; |
| if (end_pos && end_pos > start_pos) |
| current_chunk_size_ = end_pos - start_pos + 1; |
| } |
| break; |
| } |
| } |
| |
| // Reset this flag so we don't look inside the buffer in future calls of |
| // DidRead for this response. Note that this code DOES NOT handle multi- |
| // part responses with more than one part (we don't issue them at the |
| // moment, so they shouldn't arrive). |
| is_multipart_ = false; |
| } |
| |
| if (current_chunk_size_ && |
| current_chunk_read_ + length > current_chunk_size_) |
| length = current_chunk_size_ - current_chunk_read_; |
| |
| if (length) { |
| if (document_size_ > 0) { |
| chunk_stream_.WriteData(current_pos_, start, length); |
| } else { |
| // If we did not get content-length in the response, we can't |
| // preallocate buffer for the entire document. Resizing array causing |
| // memory fragmentation issues on the large files and OOM exceptions. |
| // To fix this, we collect all chunks of the file to the list and |
| // concatenate them together after request is complete. |
| std::vector<unsigned char> buf(length); |
| memcpy(buf.data(), start, length); |
| chunk_buffer_.push_back(std::move(buf)); |
| } |
| current_pos_ += length; |
| current_chunk_read_ += length; |
| client_->OnNewDataAvailable(); |
| } |
| |
| // Only call the renderer if we allow partial loading. |
| if (!partial_document_) { |
| ReadMore(); |
| return; |
| } |
| |
| UpdateRendering(); |
| RemoveCompletedRanges(); |
| |
| if (!pending_requests_.empty()) { |
| // If there are pending requests and the current content we're downloading |
| // doesn't satisfy any of these requests, cancel the current request to |
| // fullfill those more important requests. |
| bool satisfying_pending_request = |
| SatisfyingRequest(current_request_offset_, current_request_size_); |
| for (const auto& pending_request : pending_requests_) { |
| if (SatisfyingRequest(pending_request.first, pending_request.second)) { |
| satisfying_pending_request = true; |
| break; |
| } |
| } |
| // Cancel the request as it's not satisfying any request from the |
| // renderer, unless the current request is finished in which case we let |
| // it finish cleanly. |
| if (!satisfying_pending_request && |
| current_pos_ < current_request_offset_ + |
| current_request_extended_size_) { |
| loader_.Close(); |
| } |
| } |
| |
| ReadMore(); |
| } else if (result == PP_OK || result == PP_ERROR_ABORTED) { |
| ReadComplete(); |
| } else { |
| NOTREACHED(); |
| } |
| } |
| |
| bool DocumentLoader::SatisfyingRequest(size_t offset, size_t size) const { |
| return offset <= current_pos_ + kDefaultRequestSize && |
| current_pos_ < offset + size; |
| } |
| |
| void DocumentLoader::ReadComplete() { |
| if (!partial_document_) { |
| if (document_size_ == 0) { |
| // For the document with no 'content-length" specified we've collected all |
| // the chunks already. Let's allocate final document buffer and copy them |
| // over. |
| chunk_stream_.Preallocate(current_pos_); |
| uint32_t pos = 0; |
| for (auto& chunk : chunk_buffer_) { |
| chunk_stream_.WriteData(pos, chunk.data(), chunk.size()); |
| pos += chunk.size(); |
| } |
| chunk_buffer_.clear(); |
| } |
| document_size_ = current_pos_; |
| client_->OnDocumentComplete(); |
| return; |
| } |
| |
| request_pending_ = false; |
| |
| if (IsDocumentComplete()) { |
| client_->OnDocumentComplete(); |
| return; |
| } |
| |
| UpdateRendering(); |
| DownloadPendingRequests(); |
| } |
| |
| void DocumentLoader::UpdateRendering() { |
| if (header_request_) |
| client_->OnPartialDocumentLoaded(); |
| else |
| client_->OnPendingRequestComplete(); |
| header_request_ = false; |
| } |
| |
| } // namespace chrome_pdf |