blob: d732b35db941ba4fea698529b5dcdb010b1b2a51 [file] [log] [blame]
/*
* Copyright 2017 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.turbine.zip;
import static java.nio.charset.StandardCharsets.UTF_8;
import com.google.common.io.ByteStreams;
import com.google.common.primitives.UnsignedInts;
import java.io.ByteArrayInputStream;
import java.io.Closeable;
import java.io.IOError;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
import java.util.zip.ZipException;
/**
* A fast, minimal, and somewhat garbage zip implementation. This exists because graal <a
* href="http://mail.openjdk.java.net/pipermail/graal-dev/2017-August/005039.html">doesn't yet
* support</a> {@link java.util.zip.ZipFile}, and {@link java.util.zip.ZipInputStream} doesn't have
* the performance we'd like (*). If you're reading this, you almost certainly want {@code ZipFile}
* instead.
*
* <p>If you're reading this because you're fixing a bug, sorry.
*
* <p>(*) A benchmark that iterates over all of the entries in rt.jar takes 6.97ms to run with this
* implementation and 202.99ms with ZipInputStream. (Those are averages across 100 reps, and I
* verified they're doing the same work.) This is likely largely due to ZipInputStream reading the
* entire file from the beginning to scan the local headers, whereas this implementation (and
* ZipFile) only read the central directory. Iterating over the entries (but not reading the data)
* is an interesting benchmark because we typically only read ~10% of the compile-time classpath, so
* most time is spent just scanning entry names. And rt.jar is an interesting test case because
* every compilation has to read it, and it dominates the size of the classpath for small
* compilations.
*
* <p>Implementation notes:
*
* <ul>
* <li>Leading garbage may be supported, since the archive is read backwards using the central
* directory. Archives modified with zip -A may not be supported. Trailing garbage is not
* supported.
* <li>UTF-8 is the only supported encoding.
* <li>STORED and DEFLATE are the only supported compression methods.
* <li>Zip files larger than Integer.MAX_VALUE bytes are not supported.
 * <li>The only supported ZIP64 field is ENDTOT. This implementation assumes that the ZIP64 end
 * header is present only if the ENDTOT field in the EOCD header is 0xFFFF.
* </ul>
*/
public final class Zip {
// Record signatures and fixed record sizes, in bytes.
static final int ZIP64_ENDSIG = 0x06064b50;
static final int ZIP64_LOCSIG = 0x07064b50;
static final int LOCHDR = 30; // LOC header size
static final int CENHDR = 46; // CEN header size
static final int ENDHDR = 22; // END header size
static final int ZIP64_LOCHDR = 20; // ZIP64 end locator header size
static final int ZIP64_ENDHDR = 56; // ZIP64 end header size
// Field offsets within the end-of-central-directory (END/EOCD) record.
static final int ENDTOT = 10; // total number of entries
static final int ENDSIZ = 12; // central directory size in bytes
static final int ENDCOM = 20; // zip file comment length
// Field offsets within a central directory (CEN) entry.
static final int CENHOW = 10; // compression method
static final int CENLEN = 24; // uncompressed size
static final int CENSIZ = 20; // compressed size
static final int CENNAM = 28; // filename length
static final int CENEXT = 30; // extra field length
static final int CENCOM = 32; // comment length
static final int CENOFF = 42; // LOC header offset
// Field offset within a local file (LOC) header.
static final int LOCEXT = 28; // extra field length
// Field offset within the ZIP64 EOCD record, and the count that signals its presence.
static final int ZIP64_ENDSIZ = 40; // central directory size in bytes
static final int ZIP64_MAGICCOUNT = 0xFFFF;
/** Iterates over a zip archive. */
/** Iterates over a zip archive by walking its central directory. */
static class ZipIterator implements Iterator<Entry> {
  /** A reader for the backing storage. */
  private final FileChannel chan;

  private final Path path;
  // Offset of the current CEN entry within the central directory buffer.
  private int cdindex = 0;
  private final MappedByteBuffer cd;
  private final CharsetDecoder decoder = UTF_8.newDecoder();

  ZipIterator(Path path, FileChannel chan, MappedByteBuffer cd) {
    this.path = path;
    this.chan = chan;
    this.cd = cd;
  }

  @Override
  public boolean hasNext() {
    return cdindex < cd.limit();
  }

  /**
   * Returns a {@link Entry} for the current CEN entry and advances past it.
   *
   * @throws NoSuchElementException if the central directory is exhausted
   */
  @Override
  public Entry next() {
    if (!hasNext()) {
      // Required by the Iterator contract.
      throw new NoSuchElementException();
    }
    checkSignature(path, cd, cdindex, 1, 2, "CENSIG");
    int nameLength = cd.getChar(cdindex + CENNAM);
    int extLength = cd.getChar(cdindex + CENEXT);
    int commentLength = cd.getChar(cdindex + CENCOM);
    Entry entry = new Entry(path, chan, string(cd, cdindex + CENHDR, nameLength), cd, cdindex);
    // Advance past the fixed header and the three variable-length trailing fields.
    cdindex += CENHDR + nameLength + extLength + commentLength;
    return entry;
  }

  /** Decodes {@code length} bytes of {@code buf} starting at {@code offset} as UTF-8. */
  public String string(ByteBuffer buf, int offset, int length) {
    buf = buf.duplicate();
    buf.position(offset);
    buf.limit(offset + length);
    decoder.reset();
    try {
      return decoder.decode(buf).toString();
    } catch (CharacterCodingException e) {
      // UTF-8 is the only supported encoding (see class javadoc), so a malformed
      // name is treated as unrecoverable.
      throw new IOError(e);
    }
  }
}
/** Provides an {@link Iterable} of {@link Entry} over a zip archive. */
/** Provides an {@link Iterable} of {@link Entry} over a zip archive. */
public static class ZipIterable implements Iterable<Entry>, Closeable {
  private final Path path;
  private final FileChannel chan;
  // Read-only, little-endian mapping of the central directory.
  private final MappedByteBuffer cd;

  /**
   * Opens the archive at {@code path} and maps in its central directory.
   *
   * <p>The EOCD record is located by first probing the fixed position at the very end of the
   * file (the common case of no zip file comment), then, failing that, scanning backwards for
   * the EOCD signature. If ENDTOT is 0xFFFF, a ZIP64 EOCD record is also searched for.
   *
   * @throws IOException if the file cannot be opened or read
   * @throws ZipException if the archive is too small or the EOCD cannot be validated
   */
  public ZipIterable(Path path) throws IOException {
    this.path = path;
    this.chan = FileChannel.open(path, StandardOpenOption.READ);
    // Locate the EOCD
    long size = chan.size();
    if (size < ENDHDR) {
      throw new ZipException("invalid zip archive");
    }
    long eocdOffset = size - ENDHDR;
    MappedByteBuffer eocd = chan.map(MapMode.READ_ONLY, eocdOffset, ENDHDR);
    eocd.order(ByteOrder.LITTLE_ENDIAN);
    int index = 0;
    int commentSize = 0;
    // 5, 6 are the two signature bytes after the leading "PK".
    if (!isSignature(eocd, 0, 5, 6)) {
      // The archive may contain a zip file comment; keep looking for the EOCD.
      // The comment length field is 16 bits, so scan back at most 0xFFFF bytes.
      long start = Math.max(0, size - ENDHDR - 0xFFFF);
      eocd = chan.map(MapMode.READ_ONLY, start, (size - start));
      eocd.order(ByteOrder.LITTLE_ENDIAN);
      index = (int) ((size - start) - ENDHDR);
      while (index > 0) {
        index--;
        eocd.position(index);
        if (isSignature(eocd, index, 5, 6)) {
          // Found a candidate EOCD; everything after its fixed-size record is the comment.
          commentSize = (int) ((size - start) - ENDHDR) - index;
          eocdOffset = start + index;
          break;
        }
      }
    }
    checkSignature(path, eocd, index, 5, 6, "ENDSIG");
    int totalEntries = eocd.getChar(index + ENDTOT);
    long cdsize = UnsignedInts.toLong(eocd.getInt(index + ENDSIZ));
    int actualCommentSize = eocd.getChar(index + ENDCOM);
    // Reject false matches, e.g. the signature bytes occurring inside the comment itself.
    if (commentSize != actualCommentSize) {
      throw new ZipException(
          String.format(
              "zip file comment length was %d, expected %d", commentSize, actualCommentSize));
    }
    // If the number of entries is 0xffff, check if the archive has a zip64 EOCD locator.
    if (totalEntries == ZIP64_MAGICCOUNT) {
      // Assume the zip64 EOCD has the usual size; we don't support zip64 extensible data sectors.
      long zip64eocdOffset = size - ENDHDR - ZIP64_LOCHDR - ZIP64_ENDHDR;
      // Note that zip reading is necessarily best-effort, since an archive could contain 0xFFFF
      // entries and the last entry's data could contain a ZIP64_ENDSIG. Some implementations
      // read the full EOCD records and compare them.
      long zip64cdsize = zip64cdsize(chan, zip64eocdOffset);
      if (zip64cdsize != -1) {
        eocdOffset = zip64eocdOffset;
        cdsize = zip64cdsize;
      } else {
        // If we couldn't find a zip64 EOCD at a fixed offset, either it doesn't exist
        // or there was a zip64 extensible data sector, so try going through the
        // locator. This approach doesn't work if data was prepended to the archive
        // without updating the offset in the locator.
        MappedByteBuffer zip64loc =
            chan.map(MapMode.READ_ONLY, size - ENDHDR - ZIP64_LOCHDR, ZIP64_LOCHDR);
        zip64loc.order(ByteOrder.LITTLE_ENDIAN);
        if (zip64loc.getInt(0) == ZIP64_LOCSIG) {
          zip64eocdOffset = zip64loc.getLong(8);
          zip64cdsize = zip64cdsize(chan, zip64eocdOffset);
          if (zip64cdsize != -1) {
            eocdOffset = zip64eocdOffset;
            cdsize = zip64cdsize;
          }
        }
      }
    }
    // The central directory ends immediately before the EOCD record.
    this.cd = chan.map(MapMode.READ_ONLY, eocdOffset - cdsize, cdsize);
    cd.order(ByteOrder.LITTLE_ENDIAN);
  }

  /**
   * Returns the central directory size recorded in a ZIP64 EOCD record at {@code eocdOffset},
   * or -1 if no ZIP64 end signature is present at that offset.
   */
  static long zip64cdsize(FileChannel chan, long eocdOffset) throws IOException {
    MappedByteBuffer zip64eocd = chan.map(MapMode.READ_ONLY, eocdOffset, ZIP64_ENDHDR);
    zip64eocd.order(ByteOrder.LITTLE_ENDIAN);
    if (zip64eocd.getInt(0) == ZIP64_ENDSIG) {
      return zip64eocd.getLong(ZIP64_ENDSIZ);
    }
    return -1;
  }

  @Override
  public Iterator<Entry> iterator() {
    return new ZipIterator(path, chan, cd);
  }

  /** Closes the underlying file channel. */
  @Override
  public void close() throws IOException {
    chan.close();
  }
}
/** An entry in a zip archive. */
/** An entry in a zip archive. */
public static class Entry {
  private final Path path;
  private final FileChannel chan;
  private final String name;
  // The central directory buffer, and the offset of this entry's CEN header within it.
  private final ByteBuffer cd;
  private final int cdindex;

  public Entry(Path path, FileChannel chan, String name, ByteBuffer cd, int cdindex) {
    this.path = path;
    this.chan = chan;
    this.name = name;
    this.cd = cd;
    this.cdindex = cdindex;
  }

  /** The entry name. */
  public String name() {
    return name;
  }

  /**
   * The entry data, inflated if necessary.
   *
   * <p>Only STORED (0x0) and DEFLATE (0x8) compression methods are supported; anything else
   * results in an {@link AssertionError}.
   */
  public byte[] data() {
    // Read the offset and variable lengths from the central directory and then try to map in the
    // data section in one shot.
    long offset = UnsignedInts.toLong(cd.getInt(cdindex + CENOFF));
    int nameLength = cd.getChar(cdindex + CENNAM);
    int extLength = cd.getChar(cdindex + CENEXT);
    int compression = cd.getChar(cdindex + CENHOW);
    switch (compression) {
      case 0x8:
        // DEFLATE: read CENSIZ (compressed size) bytes and inflate them.
        return getBytes(
            offset,
            nameLength,
            extLength,
            UnsignedInts.toLong(cd.getInt(cdindex + CENSIZ)),
            /*deflate=*/ true);
      case 0x0:
        // STORED: read CENLEN (uncompressed size) bytes verbatim.
        return getBytes(
            offset,
            nameLength,
            extLength,
            UnsignedInts.toLong(cd.getInt(cdindex + CENLEN)),
            /*deflate=*/ false);
      default:
        throw new AssertionError(
            String.format("unsupported compression mode: 0x%x", compression));
    }
  }

  /**
   * Number of extra bytes to read for each file, to avoid re-mapping the data if the local header
   * reports more extra field data than the central directory.
   */
  static final int EXTRA_FIELD_SLACK = 128;

  /**
   * Maps this entry's data section into memory and returns it as a byte array.
   *
   * @param offset file offset of the entry's LOC header (from CENOFF)
   * @param nameLength file name length from the central directory
   * @param cenExtLength extra field length from the central directory; the local header may
   *     report a different value, hence EXTRA_FIELD_SLACK
   * @param size data size in bytes (compressed size for DEFLATE entries)
   * @param deflate whether the mapped bytes must be inflated before returning
   */
  private byte[] getBytes(
      long offset, int nameLength, int cenExtLength, long size, boolean deflate) {
    if (size > Integer.MAX_VALUE) {
      throw new IllegalArgumentException("unsupported zip entry size: " + size);
    }
    try {
      // Map the LOC header, name, extra fields, data, and slack — clamped to the file size.
      MappedByteBuffer fc =
          chan.map(
              MapMode.READ_ONLY,
              offset,
              Math.min(
                  LOCHDR + nameLength + cenExtLength + size + EXTRA_FIELD_SLACK,
                  chan.size() - offset));
      fc.order(ByteOrder.LITTLE_ENDIAN);
      checkSignature(path, fc, /* index= */ 0, 3, 4, "LOCSIG");
      int locExtLength = fc.getChar(LOCEXT);
      if (locExtLength > cenExtLength + EXTRA_FIELD_SLACK) {
        // If the local header's extra fields don't match the central directory and we didn't
        // leave enough slack, re-map the data section with the correct extra field length.
        fc = chan.map(MapMode.READ_ONLY, offset + LOCHDR + nameLength + locExtLength, size);
        fc.order(ByteOrder.LITTLE_ENDIAN);
      } else {
        // Otherwise seek past the local header, name, and extra fields to the data.
        fc.position(LOCHDR + nameLength + locExtLength);
        fc.limit((int) (LOCHDR + nameLength + locExtLength + size));
      }
      byte[] bytes = new byte[(int) size];
      fc.get(bytes);
      if (deflate) {
        // nowrap=true: entry data is a raw deflate stream without a zlib header/trailer.
        bytes =
            ByteStreams.toByteArray(
                new InflaterInputStream(
                    new ByteArrayInputStream(bytes), new Inflater(/*nowrap=*/ true)));
      }
      return bytes;
    } catch (IOException e) {
      throw new IOError(e);
    }
  }
}
/**
 * Checks that the four bytes at {@code index} are the record signature {@code 'P'}, {@code 'K'},
 * {@code i}, {@code j}, throwing an {@link AssertionError} with a diagnostic message otherwise.
 */
static void checkSignature(
    Path path, MappedByteBuffer buf, int index, int i, int j, String name) {
  if (isSignature(buf, index, i, j)) {
    return;
  }
  // Signatures are little-endian, so the expected value prints as j, i, 'K', 'P'.
  String message =
      String.format(
          "%s: bad %s (expected: 0x%02x%02x%02x%02x, actual: 0x%08x)",
          path, name, i, j, (int) 'K', (int) 'P', buf.getInt(index));
  throw new AssertionError(message);
}
/**
 * Returns true if the four bytes at {@code index} are the record signature {@code 'P'},
 * {@code 'K'}, {@code i}, {@code j}.
 */
static boolean isSignature(MappedByteBuffer buf, int index, int i, int j) {
  // All zip record signatures begin with the magic bytes "PK".
  if (buf.get(index) != 'P' || buf.get(index + 1) != 'K') {
    return false;
  }
  // The remaining two bytes distinguish the record type.
  return buf.get(index + 2) == i && buf.get(index + 3) == j;
}
private Zip() {}
}