// blob: 51638ee4e8b167e8e4d47c7d776dbf6f2803db9a [file] [log] [blame]
/*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License
*/
package java.nio.charset;
import java.io.UTFDataFormatException;
/**
 * Encoding and decoding methods for Modified UTF-8.
 *
 * <p>Modified UTF-8 is a simple variation of UTF-8 in which U+0000 is encoded as the two bytes
 * 0xc0 0x80. This avoids the presence of zero bytes in the output. Every {@code char} at or
 * above U+0800 (surrogates included) is encoded on its own as a three-byte sequence.
 *
 * @hide
 */
public class ModifiedUtf8 {
    /**
     * Counts the number of bytes in the modified UTF-8 representation of {@code s}.
     *
     * @param s the string to measure
     * @param shortLength if true, the encoded size must fit in an unsigned 16-bit value
     * @return the number of bytes {@link #encode(byte[], int, String)} writes for {@code s}
     * @throws UTFDataFormatException if {@code shortLength} is true and the size cannot be
     *         represented in an (unsigned) java short
     */
    public static long countBytes(String s, boolean shortLength) throws UTFDataFormatException {
        long counter = 0;
        int strLen = s.length();
        for (int i = 0; i < strLen; i++) {
            char c = s.charAt(i);
            if (c < '\u0080') {
                counter++;
                // U+0000 takes the two-byte form 0xc0 0x80, so it costs one extra byte.
                if (c == '\u0000') {
                    counter++;
                }
            } else if (c < '\u0800') {
                counter += 2;
            } else {
                counter += 3;
            }
        }
        // Allow up to the maximum value of an unsigned short (as the value is known to be
        // unsigned).
        if (shortLength && counter > 0xffff) {
            throw new UTFDataFormatException(
                    "Size of the encoded string doesn't fit in two bytes");
        }
        return counter;
    }

    /**
     * Encodes {@code s} into {@code dst} starting at offset {@code offset}.
     *
     * <p>The output buffer is assumed to have enough space; no capacity check is made here, so
     * a buffer that is too small fails with a standard
     * {@code ArrayIndexOutOfBoundsException}.
     *
     * @param dst the destination buffer
     * @param offset index in {@code dst} of the first byte to write
     * @param s the string to encode
     */
    public static void encode(byte[] dst, int offset, String s) {
        int strLen = s.length();
        for (int i = 0; i < strLen; i++) {
            char c = s.charAt(i);
            if (c < '\u0080') {
                if (c == 0) {
                    // U+0000 is never written as a zero byte.
                    dst[offset++] = (byte) 0xc0;
                    dst[offset++] = (byte) 0x80;
                } else {
                    dst[offset++] = (byte) c;
                }
            } else if (c < '\u0800') {
                // 110xxxxx 10xxxxxx
                dst[offset++] = (byte) ((c >>> 6) | 0xc0);
                dst[offset++] = (byte) ((c & 0x3f) | 0x80);
            } else {
                // 1110xxxx 10xxxxxx 10xxxxxx
                dst[offset++] = (byte) ((c >>> 12) | 0xe0);
                dst[offset++] = (byte) (((c >>> 6) & 0x3f) | 0x80);
                dst[offset++] = (byte) ((c & 0x3f) | 0x80);
            }
        }
    }

    /**
     * Encodes {@code s} into a newly allocated buffer with the following format:
     *
     * <p>- the first two bytes of the buffer are the length of the modified-utf8 output
     * (as a big endian short).
     *
     * <p>- the remainder of the buffer contains the modified-utf8 output (equivalent to
     * {@code encode(buf, 2, s)}).
     *
     * @param s the string to encode
     * @return the length-prefixed encoding of {@code s}
     * @throws UTFDataFormatException if the encoded size cannot be represented as an unsigned
     *         short
     */
    public static byte[] encode(String s) throws UTFDataFormatException {
        // countBytes(s, true) guarantees size <= 0xffff, so the int cast below is lossless.
        long size = countBytes(s, true);
        byte[] output = new byte[(int) size + 2];
        encode(output, 2, s);
        output[0] = (byte) (size >>> 8);
        output[1] = (byte) size;
        return output;
    }

    /**
     * Decodes {@code length} modified UTF-8 bytes from {@code in} starting at offset
     * {@code offset} to {@code out}.
     *
     * <p>A maximum of {@code length} chars are written to the output starting at offset 0.
     * {@code out} is assumed to have enough space for the output (a standard
     * {@code ArrayIndexOutOfBoundsException} is thrown otherwise). The range is not checked
     * against {@code in.length} either; reading past the end of {@code in} fails the same way.
     *
     * <p>If a '0' byte is encountered, it is converted to U+0000.
     *
     * @param in the buffer holding the encoded bytes
     * @param out scratch buffer that receives the decoded chars
     * @param offset index in {@code in} of the first byte to decode
     * @param length number of bytes to decode
     * @return a string made of the decoded chars
     * @throws IllegalArgumentException if {@code offset} or {@code length} is negative
     * @throws UTFDataFormatException if the input is truncated in the middle of a sequence,
     *         a continuation byte does not match 10xxxxxx, or a sequence starts with a byte
     *         matching 10xxxxxx or 1111xxxx
     */
    public static String decode(byte[] in, char[] out, int offset, int length)
            throws UTFDataFormatException {
        if (offset < 0 || length < 0) {
            throw new IllegalArgumentException("Illegal arguments: offset " + offset
                    + ". Length: " + length);
        }
        int outputIndex = 0;
        int limitIndex = offset + length;
        while (offset < limitIndex) {
            int i = in[offset] & 0xff;
            offset++;
            if (i < 0x80) {
                // Single-byte sequence (a raw zero byte is accepted and yields U+0000).
                out[outputIndex] = (char) i;
                outputIndex++;
                continue;
            }
            if (0xc0 <= i && i < 0xe0) {
                // Two-byte sequence. This branch covers the case U+0000 = 0xc0 0x80.
                // The result is: 5 least-significant bits of i + 6 l-s bits of next input byte.
                i = (i & 0x1f) << 6;
                if (offset == limitIndex) {
                    throw new UTFDataFormatException("unexpected end of input");
                }
                // Include 6 least-significant bits of the input byte.
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad second byte at " + offset);
                }
                out[outputIndex] = (char) (i | (in[offset] & 0x3f));
                offset++;
                outputIndex++;
            } else if (0xe0 <= i && i < 0xf0) {
                // Three-byte sequence. The lower bound matters: without it a bare
                // continuation byte (0x80..0xbf) would be mistaken for a lead byte.
                // The result is: 4 least-significant bits of i + 6 l-s bits of next input byte
                // + 6 l-s bits of next to next input byte.
                i = (i & 0x0f) << 12;
                // Make sure there are at least two bytes left.
                if (offset + 1 >= limitIndex) {
                    throw new UTFDataFormatException("unexpected end of input");
                }
                // Include 6 least-significant bits of the input byte, with 6 bits of room
                // for the next byte.
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad second byte at " + offset);
                }
                i = i | (in[offset] & 0x3f) << 6;
                offset++;
                // Include 6 least-significant bits of the input byte.
                if ((in[offset] & 0xc0) != 0x80) {
                    throw new UTFDataFormatException("bad third byte at " + offset);
                }
                out[outputIndex] = (char) (i | (in[offset] & 0x3f));
                offset++;
                outputIndex++;
            } else {
                // Lead byte is 10xxxxxx (stray continuation byte) or 1111xxxx; neither can
                // start a modified UTF-8 sequence.
                throw new UTFDataFormatException("Invalid UTF8 byte "
                        + i + " at position " + (offset - 1));
            }
        }
        return String.valueOf(out, 0, outputIndex);
    }
}