blob: 5c2b3d8cb8a8deb6978992f56783672ddc27238b [file] [log] [blame]
// Copyright 2017 The Goma Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "jarfile_reader.h"
#include <cstring>
#include "absl/strings/match.h"
#include "basictypes.h"
#include "glog/logging.h"
namespace {
static uint16_t ToUInt16(const char* ptr) {
return static_cast<uint8_t>(ptr[0]) | (static_cast<uint8_t>(ptr[1]) << 8);
}
static uint32_t ToUInt32(const char* ptr) {
return static_cast<uint8_t>(ptr[0]) | (static_cast<uint8_t>(ptr[1]) << 8) |
(static_cast<uint8_t>(ptr[2]) << 16) |
(static_cast<uint8_t>(ptr[3]) << 24);
}
} // namespace
namespace devtools_goma {
/* static */
std::unique_ptr<FileReader> JarFileReader::Create(const std::string& filename) {
if (!CanHandle(filename)) {
return nullptr;
}
std::unique_ptr<JarFileReader> file_reader(new JarFileReader(filename));
if (!file_reader->valid() || file_reader->detected_zip_normalized_time()) {
return nullptr;
}
// TODO: vlog if too chatty.
// Since the number of jar files should not be large, and we see the message
// once compiler_proxy read the file. I guess it not so chatty.
LOG(INFO) << "JarFileReader is used. filename=" << filename;
return std::move(file_reader);
}
/* static */
bool JarFileReader::CanHandle(const std::string& filename) {
return absl::EndsWith(filename, ".jar");
}
JarFileReader::JarFileReader(const std::string& filename)
: FileReader(filename),
buffer_head_pos_(0),
last_normalized_absolute_pos_(0),
is_buffer_normalized_(false),
is_central_directory_started_(false),
is_valid_(false),
detected_zip_normalized_time_(false),
offset_(0),
input_filename_(filename) {
buffer_.resize(0x30);
if (FileReader::Read(&buffer_[0], buffer_.size()) != buffer_.size()) {
return;
}
// If the file looks like ZIP archive, it might be ok to normalize.
// Some jar files used by Android build seems not be valid jar file but
// we allow jarfile reader to normalize it if it looks like zip file.
// (b/38329025)
if (absl::StartsWith(buffer_, "PK\x03\x04")) {
is_valid_ = true;
}
// Checks the Jar file magic string (0xcafe) existence.
// I am not confident we can normalize a broken jar file, and ease of
// finding such a file, let me log.
LOG_IF(WARNING, ToUInt16(buffer_.data() + 0x27) != 0xcafe)
<< "JarFileReader: the file seems not have jar file magic:"
<< "expect 0xcafe (little endian) but " << std::hex
<< ToUInt16(buffer_.data() + 0x27)
<< " input_filename=" << input_filename_;
// If ziptime has already been applied, we do not need to normalize.
//
// See also:
// https://android.googlesource.com/platform/build/+/master/tools/ziptime/ZipEntry.cpp
// kZipTimeStaticDate come from ziptime code above, and it is 2008-01-01.
// date format: (year - 1980) << 9 | month << 5 | day.
static uint16_t kZipTimeStaticDate = (2008 - 1980) << 9 | 1 << 5 | 1;
static uint16_t kZipTimeStaticTime = 0;
if (ToUInt16(buffer_.data() + 0x0a) == kZipTimeStaticTime &&
ToUInt16(buffer_.data() + 0x0c) == kZipTimeStaticDate) {
LOG(INFO) << "JarFileReader won't normalize jar file that has already been"
<< " normalized with ziptime."
<< " input_filename=" << input_filename_;
detected_zip_normalized_time_ = true;
return;
}
// TODO: skip normalize prebuilt jar files.
// Currently, we also normalizes prebuilt library jar files.
// Since such files are also stored in output directory, it is difficult to
// distinguish.
NormalizeBuffer();
}
ssize_t JarFileReader::ReadDataToBuffer(size_t size) {
size_t orig_size = buffer_.size();
buffer_.resize(orig_size + size);
ssize_t read_bytes = FileReader::Read(&buffer_[orig_size], size);
if (read_bytes >= 0) {
buffer_.resize(orig_size + read_bytes);
}
VLOG(2) << "input_filename=" << input_filename_
<< " read buffer_.size()=" << buffer_.size() << " size=" << size
<< " read_bytes=" << read_bytes;
return read_bytes;
}
// NormalizeBuffer normalizes a timestamp in the header.
//
// How it works?
// 1. find "PK".
// 2. signature starting from "PK" let us know what header is there, and
// normalize a timestamp in it.
//
// Serious way of parsing .jar file is uncompressing each ZIP entry until
// the end of compressed data. This is what the original jar command do.
// However, as far as I am inspired by zlib/contrib/minizip/unzip.c,
// just skipping to the signature seems to usually work.
//
// See Also: https://en.wikipedia.org/wiki/Zip_(file_format)#File_headers
// Note that header structure is the same between ZIP and ZIP64.
void JarFileReader::NormalizeBuffer() {
// TODO: return error instead?
DCHECK_LE(buffer_head_pos_, last_normalized_absolute_pos_)
<< "buffer_head_pos must be smaller than or equals to "
<< "last_normalized_absolute_pos_"
<< " input_filename=" << input_filename_
<< " buffer_head_pos=" << buffer_head_pos_
<< " last_normalized_absolute_pos=" << last_normalized_absolute_pos_;
// Normalize the buffer from the last normalized position.
size_t cur = last_normalized_absolute_pos_ - buffer_head_pos_;
is_buffer_normalized_ = true;
for (;;) {
cur = buffer_.find("PK", cur);
if (cur == string::npos) {
// 'K' may come just after 'P'. Let me mark this not normalized.
if (!buffer_.empty() && buffer_[buffer_.size() - 1] == 'P') {
is_buffer_normalized_ = false;
} else {
last_normalized_absolute_pos_ = buffer_head_pos_ + buffer_.size();
}
return;
}
if (cur + 4 > buffer_.size()) {
// Will cause buffer overrun.
VLOG(1) << "would cause buffer overrun."
<< " input_filename=" << input_filename_ << " cur=" << cur
<< " buffer_head_pos=" << buffer_head_pos_
<< " buffer_.size()=" << buffer_.size();
is_buffer_normalized_ = false;
return;
}
ssize_t offset = GetTimestampOffset(&buffer_[cur]);
VLOG(3) << "offset:" << offset;
if (offset < 0) {
cur += 4;
continue;
}
if (cur + offset + 4 > buffer_.size()) {
// Will cause buffer overrun.
VLOG(1) << "would cause buffer overrun."
<< " input_filename=" << input_filename_ << " cur=" << cur
<< " buffer_head_pos=" << buffer_head_pos_ << " offset=" << offset
<< " buffer_.size()=" << buffer_.size();
is_buffer_normalized_ = false;
return;
}
// Set timestamp to the epoch time. 1980-01-01T00:00:00
// Note that all 0 represents 1980-00-00T00:00:00, which could be invalid.
buffer_[cur + offset + 0] = 0;
buffer_[cur + offset + 1] = 0;
buffer_[cur + offset + 2] = 0x21;
buffer_[cur + offset + 3] = 0;
// offset from the head of the header + timestamp (4bytes) to go to just
// next to timestamp.
cur += offset + 4;
last_normalized_absolute_pos_ = buffer_head_pos_ + cur;
}
}
ssize_t JarFileReader::GetTimestampOffset(const char* signature) {
// Please see also:
// https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT
static const uint32_t kLocalFileHeaderSignature = 0x04034b50;
static const uint32_t kCentralFileHeaderSignature = 0x02014b50;
uint32_t u32_signature = ToUInt32(signature);
VLOG(3) << "signature:" << std::hex << u32_signature
<< " input_filename=" << input_filename_
<< " buffer_head_pos=" << buffer_head_pos_
<< " last_normalized_absolute_pos=" << last_normalized_absolute_pos_
<< " offset=" << offset_ << " buffer_.size()=" << buffer_.size();
if (u32_signature == kLocalFileHeaderSignature) {
DCHECK(!is_central_directory_started_)
<< "Local file descriptor signature comes after central directory "
<< "entry."
<< " input_filename=" << input_filename_
<< " buffer_head_pos=" << buffer_head_pos_
<< " last_normalized_absolute_pos=" << last_normalized_absolute_pos_
<< " offset_=" << offset_;
return 10;
}
if (u32_signature == kCentralFileHeaderSignature) {
if (!is_central_directory_started_) {
is_central_directory_started_ = true;
}
return 12;
}
return -1;
}
ssize_t JarFileReader::Read(void* ptr, size_t len) {
// TODO: increase kBufSize when it works fine.
// small buffer size is good for checking code but not good for real world.
static const size_t kBufSize = 128;
// https://en.wikipedia.org/wiki/Zip_(file_format)
// Central directory file header should be the largest.
static const size_t kMaxHeaderSize = 46;
COMPILE_ASSERT(kBufSize > kMaxHeaderSize,
"Buffer size should be larger than ZIP header size.");
off_t buffer_head_pos_at_beginning = buffer_head_pos_;
if (is_buffer_normalized_) {
buffer_head_pos_ += FileReader::FlushDataInBuffer(&buffer_, &ptr, &len);
}
while (len > 0) {
ssize_t read_bytes = ReadDataToBuffer(kBufSize);
if (read_bytes < 0) { // Return error soon.
return read_bytes;
}
if (read_bytes != kBufSize) { // Should be end of the file.
VLOG(1) << "input_filename=" << input_filename_
<< " buffer_head_pos=" << buffer_head_pos_
<< " buffer_.size()=" << buffer_.size();
NormalizeBuffer();
// No more data. i.e. no possibility that next buffer may contain the
// data that need to be normalized.
buffer_head_pos_ += FileReader::FlushDataInBuffer(&buffer_, &ptr, &len);
break;
}
VLOG(1) << "input_filename=" << input_filename_
<< " buffer_head_pos=" << buffer_head_pos_
<< " buffer_.size()=" << buffer_.size();
NormalizeBuffer();
if (is_buffer_normalized_) {
buffer_head_pos_ += FileReader::FlushDataInBuffer(&buffer_, &ptr, &len);
}
}
size_t read_bytes = buffer_head_pos_ - buffer_head_pos_at_beginning;
offset_ += read_bytes;
VLOG(1) << "input_filename=" << input_filename_
<< " read_bytes=" << read_bytes << " offset_=" << offset_
<< " buffer_head_pos=" << buffer_head_pos_
<< " buffer_.size()=" << buffer_.size()
<< " is_buffer_normalized=" << is_buffer_normalized_;
return read_bytes;
}
off_t JarFileReader::Seek(off_t offset, ScopedFd::Whence whence) const {
CHECK_EQ(whence, ScopedFd::SeekAbsolute)
<< "Sorry, this function only support seek absolute";
CHECK_EQ(offset, offset_)
<< "Sorry, this function expects the user set just next position.";
return offset;
}
} // namespace devtools_goma