libdot/js/lib_utf8.js - apps/libapps - Git at Google

 // Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 'use strict';

 // TODO(davidben): When the string encoding API is implemented,
 // replace this with the native in-browser implementation.
 //
 // https://wiki.whatwg.org/wiki/StringEncoding
 // https://encoding.spec.whatwg.org/

 /**
  * A stateful, streaming UTF-8 decoder.
  *
  * The fields below hold a partially decoded multi-byte sequence between
  * decode() calls, so one sequence may be split across several calls.
  */
 lib.UTF8Decoder = function() {
   // Count of continuation bytes the sequence in progress still needs.
   this.bytesLeft = 0;
   // Code point bits gathered so far for the sequence in progress.
   this.codePoint = 0;
   // Smallest code point the sequence in progress may encode; a smaller
   // result means the sequence was overlong.
   this.lowerBound = 0;
 };

 /**
  * Decodes a chunk of UTF-8 data, resuming any sequence left unfinished by
  * an earlier call on this decoder.
  *
  * Each of the following yields a single U+FFFD in the output: a stray
  * continuation byte, a code unit of 0xFE or above, a sequence cut short by
  * a non-continuation byte, an overlong encoding, an encoded surrogate, and
  * any value above U+10FFFF (which includes every 5- and 6-byte form).
  * A sequence still incomplete when str ends produces no output; it stays
  * in the decoder state until a later call supplies the rest.
  *
  * @param {String} str data to decode, represented as a JavaScript
  *     String with each code unit representing a byte between 0x00 to
  *     0xFF.
  * @return {String} The data decoded into a JavaScript UTF-16 string.
  */
 lib.UTF8Decoder.prototype.decode = function(str) {
   var REPLACEMENT = '\ufffd';
   var pieces = [];
   var pos = 0;

   while (pos < str.length) {
     var unit = str.charCodeAt(pos);

     if (this.bytesLeft !== 0) {
       // In the middle of a multi-byte sequence.
       if (unit < 0x80 || unit > 0xBF) {
         // The sequence was cut short. Report it and do not advance, so
         // this same unit is examined again as the start of something new.
         pieces.push(REPLACEMENT);
         this.bytesLeft = 0;
         continue;
       }

       this.bytesLeft--;
       this.codePoint = (this.codePoint << 6) | (unit & 0x3F);

       if (this.bytesLeft === 0) {
         // Sequence complete: validate it, then emit it as UTF-16.
         var cp = this.codePoint;
         var isSurrogate = cp >= 0xD800 && cp <= 0xDFFF;
         if (cp < this.lowerBound || isSurrogate || cp > 0x10FFFF) {
           pieces.push(REPLACEMENT);
         } else if (cp < 0x10000) {
           pieces.push(String.fromCharCode(cp));
         } else {
           // Outside the BMP, so it needs a surrogate pair.
           var offset = cp - 0x10000;
           pieces.push(String.fromCharCode(
               0xD800 | (offset >> 10),
               0xDC00 | (offset & 0x3FF)));
         }
       }

       pos++;
       continue;
     }

     // Not inside a sequence: unit is ASCII, a lead byte, or invalid.
     if (unit <= 0x7F) {
       pieces.push(str.charAt(pos));
     } else if (unit < 0xC0 || unit > 0xFD) {
       // Stray continuation byte, or a value that can never start a
       // sequence.
       pieces.push(REPLACEMENT);
     } else if (unit <= 0xDF) {
       this.codePoint = unit & 0x1F;
       this.bytesLeft = 1;
       this.lowerBound = 0x80;
     } else if (unit <= 0xEF) {
       this.codePoint = unit & 0x0F;
       this.bytesLeft = 2;
       this.lowerBound = 0x800;
     } else if (unit <= 0xF7) {
       this.codePoint = unit & 0x07;
       this.bytesLeft = 3;
       this.lowerBound = 0x10000;
     } else if (unit <= 0xFB) {
       this.codePoint = unit & 0x03;
       this.bytesLeft = 4;
       this.lowerBound = 0x200000;
     } else {
       this.codePoint = unit & 0x01;
       this.bytesLeft = 5;
       this.lowerBound = 0x4000000;
     }
     pos++;
   }

   return pieces.join('');
 };

 /**
  * One-shot UTF-8 decode, for callers that already hold all of the data.
  *
  * A fresh lib.UTF8Decoder is created for every call, so no decoder state
  * carries over from one call to the next.
  *
  * @param {String} utf8 data to decode, represented as a JavaScript
  *     String with each code unit representing a byte between 0x00 to
  *     0xFF.
  * @return {String} The data decoded into a JavaScript UTF-16 string.
  */
 lib.decodeUTF8 = function(utf8) {
   var decoder = new lib.UTF8Decoder();
   return decoder.decode(utf8);
 };

 /**
  * Encodes a UTF-16 string into UTF-8.
  *
  * A well-formed surrogate pair becomes one four-byte sequence. A surrogate
  * without its partner (a lone low surrogate, or a high surrogate not
  * followed by a low one) is encoded as U+FFFD.
  *
  * TODO(davidben): Do we need a stateful version of this that can
  * handle a surrogate pair split in two calls? What happens if a
  * keypress event would have contained a character outside the BMP?
  *
  * @param {String} str The string to encode.
  * @return {String} The string encoded as UTF-8, as a JavaScript
  *     string with bytes represented as code units from 0x00 to 0xFF.
  */
 lib.encodeUTF8 = function(str) {
   var out = '';
   var len = str.length;

   for (var idx = 0; idx < len; idx++) {
     // Step 1: turn one or two UTF-16 code units into a code point.
     var cp = str.charCodeAt(idx);
     var isHigh = cp >= 0xD800 && cp <= 0xDBFF;
     var isLow = cp >= 0xDC00 && cp <= 0xDFFF;

     if (isHigh || isLow) {
       // Only a high surrogate may look ahead for its partner; NaN fails
       // the range test below and so means "no usable partner".
       var next = (isHigh && idx + 1 < len) ? str.charCodeAt(idx + 1) : NaN;
       if (next >= 0xDC00 && next <= 0xDFFF) {
         cp = 0x10000 + ((cp & 0x3FF) << 10) + (next & 0x3FF);
         idx++;  // The low surrogate has been consumed too.
       } else {
         cp = 0xFFFD;
       }
     }

     // Step 2: write the code point out as 1 to 4 UTF-8 bytes.
     if (cp <= 0x7F) {
       out += String.fromCharCode(cp);
     } else if (cp <= 0x7FF) {
       out += String.fromCharCode(
           0xC0 | (cp >> 6),
           0x80 | (cp & 0x3F));
     } else if (cp <= 0xFFFF) {
       out += String.fromCharCode(
           0xE0 | (cp >> 12),
           0x80 | ((cp >> 6) & 0x3F),
           0x80 | (cp & 0x3F));
     } else {
       // cp is at most 0x10FFFF here.
       out += String.fromCharCode(
           0xF0 | (cp >> 18),
           0x80 | ((cp >> 12) & 0x3F),
           0x80 | ((cp >> 6) & 0x3F),
           0x80 | (cp & 0x3F));
     }
   }

   return out;
 };
	// Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	'use strict';

	// TODO(davidben): When the string encoding API is implemented,
	// replace this with the native in-browser implementation.
	//
	// https://wiki.whatwg.org/wiki/StringEncoding
	// https://encoding.spec.whatwg.org/

	/**
	 * A stateful UTF-8 decoder.
	 *
	 * The state kept here lets a multi-byte sequence be split across several
	 * decode() calls.
	 */
	lib.UTF8Decoder = function() {
	  // The number of bytes left in the current sequence.
	  this.bytesLeft = 0;
	  // The in-progress code point being decoded, if bytesLeft > 0.
	  this.codePoint = 0;
	  // The lower bound on the final code point, if bytesLeft > 0. A decoded
	  // value below this bound means the sequence was an overlong encoding.
	  this.lowerBound = 0;
	};

	/**
	 * Decodes some UTF-8 data, taking into account state from previous
	 * data streamed through the decoder.
	 *
	 * Invalid input is replaced with U+FFFD: bytes that cannot start a
	 * sequence, sequences interrupted by a non-continuation byte, overlong
	 * encodings, encoded surrogates, and values above U+10FFFF. A sequence
	 * that is still incomplete at the end of str emits nothing; it is kept
	 * in the decoder state for the next call.
	 *
	 * @param {String} str data to decode, represented as a JavaScript
	 *     String with each code unit representing a byte between 0x00 to
	 *     0xFF.
	 * @return {String} The data decoded into a JavaScript UTF-16 string.
	 */
	lib.UTF8Decoder.prototype.decode = function(str) {
	  var ret = '';
	  for (var i = 0; i < str.length; i++) {
	    var c = str.charCodeAt(i);
	    if (this.bytesLeft === 0) {
	      // Expecting ASCII or the lead byte of a new sequence.
	      if (c <= 0x7F) {
	        ret += str.charAt(i);
	      } else if (0xC0 <= c && c <= 0xDF) {
	        this.codePoint = c - 0xC0;
	        this.bytesLeft = 1;
	        this.lowerBound = 0x80;
	      } else if (0xE0 <= c && c <= 0xEF) {
	        this.codePoint = c - 0xE0;
	        this.bytesLeft = 2;
	        this.lowerBound = 0x800;
	      } else if (0xF0 <= c && c <= 0xF7) {
	        this.codePoint = c - 0xF0;
	        this.bytesLeft = 3;
	        this.lowerBound = 0x10000;
	      } else if (0xF8 <= c && c <= 0xFB) {
	        // 5-byte form: consumed as a unit, then rejected by the
	        // 0x10FFFF check once complete.
	        this.codePoint = c - 0xF8;
	        this.bytesLeft = 4;
	        this.lowerBound = 0x200000;
	      } else if (0xFC <= c && c <= 0xFD) {
	        // 6-byte form: likewise always rejected once complete.
	        this.codePoint = c - 0xFC;
	        this.bytesLeft = 5;
	        this.lowerBound = 0x4000000;
	      } else {
	        // Stray continuation byte or a value that never starts a
	        // sequence.
	        ret += '\ufffd';
	      }
	    } else if (0x80 <= c && c <= 0xBF) {
	      this.bytesLeft--;
	      this.codePoint = (this.codePoint << 6) + (c - 0x80);
	      if (this.bytesLeft === 0) {
	        // Got a full sequence. Check if it's within bounds and
	        // filter out surrogates.
	        var codePoint = this.codePoint;
	        if (codePoint < this.lowerBound ||
	            (0xD800 <= codePoint && codePoint <= 0xDFFF) ||
	            codePoint > 0x10FFFF) {
	          ret += '\ufffd';
	        } else if (codePoint < 0x10000) {
	          // Encode as a single UTF-16 code unit.
	          ret += String.fromCharCode(codePoint);
	        } else {
	          // Encode as a UTF-16 surrogate pair.
	          codePoint -= 0x10000;
	          ret += String.fromCharCode(
	              0xD800 + ((codePoint >>> 10) & 0x3FF),
	              0xDC00 + (codePoint & 0x3FF));
	        }
	      }
	    } else {
	      // Too few bytes in multi-byte sequence. Rewind stream so we
	      // don't lose the next byte.
	      ret += '\ufffd';
	      this.bytesLeft = 0;
	      i--;
	    }
	  }
	  return ret;
	};

	/**
	 * Convenience wrapper that decodes a complete piece of UTF-8 data in one
	 * call.
	 *
	 * Each call builds its own lib.UTF8Decoder, so nothing is remembered
	 * between calls.
	 *
	 * @param {String} utf8 data to decode, represented as a JavaScript
	 *     String with each code unit representing a byte between 0x00 to
	 *     0xFF.
	 * @return {String} The data decoded into a JavaScript UTF-16 string.
	 */
	lib.decodeUTF8 = function(utf8) {
	  var oneShotDecoder = new lib.UTF8Decoder();
	  return oneShotDecoder.decode(utf8);
	};

	/**
	 * Encodes a UTF-16 string into UTF-8.
	 *
	 * Surrogate pairs are combined into a single code point before encoding.
	 * An unpaired surrogate is encoded as U+FFFD.
	 *
	 * TODO(davidben): Do we need a stateful version of this that can
	 * handle a surrogate pair split in two calls? What happens if a
	 * keypress event would have contained a character outside the BMP?
	 *
	 * @param {String} str The string to encode.
	 * @return {String} The string encoded as UTF-8, as a JavaScript
	 *     string with bytes represented as code units from 0x00 to 0xFF.
	 */
	lib.encodeUTF8 = function(str) {
	  var ret = '';
	  for (var i = 0; i < str.length; i++) {
	    // Get a unicode code point out of str.
	    var c = str.charCodeAt(i);
	    if (0xDC00 <= c && c <= 0xDFFF) {
	      // Low surrogate with no preceding high surrogate.
	      c = 0xFFFD;
	    } else if (0xD800 <= c && c <= 0xDBFF) {
	      if (i + 1 < str.length) {
	        var d = str.charCodeAt(i + 1);
	        if (0xDC00 <= d && d <= 0xDFFF) {
	          // Swallow a surrogate pair.
	          c = 0x10000 + ((c & 0x3FF) << 10) + (d & 0x3FF);
	          i++;
	        } else {
	          // High surrogate followed by something else; the following
	          // unit is left for the next iteration.
	          c = 0xFFFD;
	        }
	      } else {
	        // High surrogate at the very end of the string.
	        c = 0xFFFD;
	      }
	    }

	    // Encode c in UTF-8: emit the lead byte, then bytesLeft
	    // continuation bytes of six payload bits each.
	    var bytesLeft;
	    if (c <= 0x7F) {
	      ret += str.charAt(i);
	      continue;
	    } else if (c <= 0x7FF) {
	      ret += String.fromCharCode(0xC0 | (c >>> 6));
	      bytesLeft = 1;
	    } else if (c <= 0xFFFF) {
	      ret += String.fromCharCode(0xE0 | (c >>> 12));
	      bytesLeft = 2;
	    } else /* if (c <= 0x10FFFF) */ {
	      ret += String.fromCharCode(0xF0 | (c >>> 18));
	      bytesLeft = 3;
	    }

	    while (bytesLeft > 0) {
	      bytesLeft--;
	      ret += String.fromCharCode(0x80 | ((c >>> (6 * bytesLeft)) & 0x3F));
	    }
	  }
	  return ret;
	};