blob: 9a1182724e0ebf175cb226479d966ebfe0bedbe2 [file] [log] [blame]
// Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
'use strict';
// TODO(davidben): When the string encoding API is implemented,
// replace this with the native in-browser implementation.
//
// https://wiki.whatwg.org/wiki/StringEncoding
// https://encoding.spec.whatwg.org/
/**
 * Streaming UTF-8 decoder.
 *
 * An instance remembers an unfinished multi-byte sequence between calls to
 * decode(), so the input may be split at arbitrary byte boundaries.
 */
lib.UTF8Decoder = function() {
  // Count of continuation bytes still expected for the pending sequence;
  // 0 means no sequence is in progress.
  this.bytesLeft = 0;
  // Code point bits gathered so far (meaningful only while bytesLeft > 0).
  this.codePoint = 0;
  // Smallest code point the pending sequence is allowed to produce; a
  // smaller result means the value was encoded with more bytes than it
  // needs (meaningful only while bytesLeft > 0).
  this.lowerBound = 0;
};

/**
 * Decodes a chunk of UTF-8 data, continuing from whatever state previous
 * chunks left in this decoder.
 *
 * Each of the following produces a single U+FFFD in the output: a byte that
 * cannot start a sequence, a sequence interrupted by a non-continuation
 * byte, an overlong encoding, an encoded surrogate (U+D800..U+DFFF), and a
 * value above U+10FFFF.  A sequence that is still incomplete when the chunk
 * ends produces nothing yet; it is resumed by the next call.
 *
 * @param {String} str The bytes to decode, as a JavaScript String in which
 *     every code unit stands for one byte (0x00 to 0xFF).
 * @return {String} The decoded text as a JavaScript UTF-16 string.
 */
lib.UTF8Decoder.prototype.decode = function(str) {
  var REPLACEMENT = '\ufffd';
  var out = [];
  var pos = 0;

  while (pos < str.length) {
    var unit = str.charCodeAt(pos);

    if (this.bytesLeft === 0) {
      // No sequence pending: the byte is ASCII, a lead byte, or invalid.
      if (unit <= 0x7F) {
        out.push(str.charAt(pos));
      } else if (unit < 0xC0 || unit > 0xFD) {
        // Stray continuation byte, or a value no lead byte can have.
        out.push(REPLACEMENT);
      } else if (unit < 0xE0) {
        // 110xxxxx: one continuation byte follows.
        this.bytesLeft = 1;
        this.lowerBound = 0x80;
        this.codePoint = unit & 0x1F;
      } else if (unit < 0xF0) {
        // 1110xxxx: two continuation bytes follow.
        this.bytesLeft = 2;
        this.lowerBound = 0x800;
        this.codePoint = unit & 0x0F;
      } else if (unit < 0xF8) {
        // 11110xxx: three continuation bytes follow.
        this.bytesLeft = 3;
        this.lowerBound = 0x10000;
        this.codePoint = unit & 0x07;
      } else if (unit < 0xFC) {
        // 111110xx: four continuation bytes follow.  The result always
        // exceeds U+10FFFF or is overlong, so it is rejected on completion.
        this.bytesLeft = 4;
        this.lowerBound = 0x200000;
        this.codePoint = unit & 0x03;
      } else {
        // 1111110x: five continuation bytes follow (rejected on completion
        // for the same reason as the four-byte tail above).
        this.bytesLeft = 5;
        this.lowerBound = 0x4000000;
        this.codePoint = unit & 0x01;
      }
      pos++;
      continue;
    }

    if (unit < 0x80 || unit > 0xBF) {
      // The pending sequence was cut short.  Report it and leave the cursor
      // where it is so this byte is re-read as the start of a new character.
      out.push(REPLACEMENT);
      this.bytesLeft = 0;
      continue;
    }

    // Continuation byte (10xxxxxx): append its six payload bits.
    this.codePoint = (this.codePoint << 6) | (unit & 0x3F);
    this.bytesLeft--;
    pos++;
    if (this.bytesLeft !== 0) {
      continue;
    }

    // The sequence is complete; validate it before emitting.
    var cp = this.codePoint;
    var isValid = cp >= this.lowerBound &&
        (cp < 0xD800 || cp > 0xDFFF) &&
        cp <= 0x10FFFF;
    if (!isValid) {
      out.push(REPLACEMENT);
    } else if (cp < 0x10000) {
      out.push(String.fromCharCode(cp));
    } else {
      // Outside the BMP: emit a UTF-16 surrogate pair.
      var offset = cp - 0x10000;
      out.push(String.fromCharCode(
          0xD800 | (offset >>> 10),
          0xDC00 | (offset & 0x3FF)));
    }
  }

  return out.join('');
};
/**
 * Decodes a complete buffer of UTF-8 data in one call. This is a
 * convenience function for when all the data is already known.
 *
 * Because no further data can follow, a multi-byte sequence that is still
 * waiting for continuation bytes at the end of the input is truncated; it
 * is reported as a trailing U+FFFD rather than being dropped.
 *
 * @param {String} utf8 The data to decode, represented as a JavaScript
 *     String with each code unit representing a byte between 0x00 and
 *     0xFF.
 * @return {String} The data decoded into a JavaScript UTF-16 string.
 */
lib.decodeUTF8 = function(utf8) {
  var decoder = new lib.UTF8Decoder();
  var ret = decoder.decode(utf8);
  // The streaming decoder keeps an unfinished sequence as pending state and
  // emits nothing for it.  Here the input is final, so flush that state as
  // a replacement character.
  if (decoder.bytesLeft > 0) {
    ret += '\ufffd';
  }
  return ret;
};
/**
 * Converts a JavaScript (UTF-16) string to UTF-8.
 *
 * A well-formed surrogate pair is combined into one code point and written
 * as a four-byte sequence.  A surrogate that is not part of such a pair is
 * encoded as U+FFFD.
 *
 * TODO(davidben): Do we need a stateful version of this that can
 * handle a surrogate pair split in two calls? What happens if a
 * keypress event would have contained a character outside the BMP?
 *
 * @param {String} str The string to encode.
 * @return {String} The UTF-8 encoding of str, as a JavaScript string in
 *     which every code unit stands for one byte (0x00 to 0xFF).
 */
lib.encodeUTF8 = function(str) {
  var out = [];
  var len = str.length;

  for (var idx = 0; idx < len; idx++) {
    // Work out the code point that starts at idx.
    var unit = str.charCodeAt(idx);
    var cp = unit;
    if (unit >= 0xD800 && unit <= 0xDFFF) {
      // Assume the surrogate is unpaired until a valid pair is found.
      cp = 0xFFFD;
      if (unit <= 0xDBFF && idx + 1 < len) {
        var low = str.charCodeAt(idx + 1);
        if (low >= 0xDC00 && low <= 0xDFFF) {
          // High + low surrogate: combine them and consume both units.
          cp = 0x10000 + ((unit & 0x3FF) << 10) + (low & 0x3FF);
          idx++;
        }
      }
    }

    // ASCII is copied through as-is.
    if (cp <= 0x7F) {
      out.push(str.charAt(idx));
      continue;
    }

    // Pick the lead byte and the number of continuation bytes.
    var lead;
    var trailing;
    if (cp <= 0x7FF) {
      lead = 0xC0 | (cp >>> 6);
      trailing = 1;
    } else if (cp <= 0xFFFF) {
      lead = 0xE0 | (cp >>> 12);
      trailing = 2;
    } else {
      // cp <= 0x10FFFF
      lead = 0xF0 | (cp >>> 18);
      trailing = 3;
    }

    // Continuation bytes carry six bits each, most significant group first.
    var bytes = [lead];
    for (var shift = 6 * (trailing - 1); shift >= 0; shift -= 6) {
      bytes.push(0x80 | ((cp >>> shift) & 0x3F));
    }
    out.push(String.fromCharCode.apply(null, bytes));
  }

  return out.join('');
};