packages/html/lib/src/encoding_parser.dart - external/github.com/dart-lang/observatory_pub_packages - Git at Google

 library encoding_parser;

 import 'constants.dart';
 import 'inputstream.dart';

 // TODO(jmesserly): I converted StopIteration to StateError("No more elements").
 // Seems strange to throw this from outside of an iterator though.
 /// A string of bytes (one character per byte) paired with a scan position.
 ///
 /// The position starts at -1, i.e. before the first byte. Once it reaches or
 /// passes [length], reading [position] -- and every member that goes through
 /// it -- throws a [StateError]. That error stands in for Python's
 /// `StopIteration` and is how callers detect the end of the input.
 class EncodingBytes {
   /// The data being scanned.
   final String _bytes;

   /// Index of the current byte; -1 until the cursor is first moved.
   int _position = -1;

   EncodingBytes(this._bytes);

   /// The total number of bytes.
   int get length => _bytes.length;

   /// Advances the cursor by one and returns the byte it then points at.
   ///
   /// Throws a [StateError] if that moves the cursor past the last byte and a
   /// [RangeError] if the new position is negative. The cursor is updated
   /// before either check.
   String next() {
     var p = _position = _position + 1;
     if (p >= length) {
       throw new StateError("No more elements");
     } else if (p < 0) {
       throw new RangeError(p);
     }
     return _bytes[p];
   }

   /// Moves the cursor back by one and returns the byte it then points at.
   ///
   /// The bounds are checked on the position *before* stepping back: throws a
   /// [StateError] if the cursor is at or past the end and a [RangeError] if
   /// it is negative.
   String previous() {
     var p = _position;
     if (p >= length) {
       throw new StateError("No more elements");
     } else if (p < 0) {
       throw new RangeError(p);
     }
     _position = p = p - 1;
     return _bytes[p];
   }

   /// Moves the cursor to [value].
   ///
   /// The check is made against the *current* position, not [value]: a cursor
   /// that has already run off the end cannot be repositioned and throws a
   /// [StateError] instead.
   set position(int value) {
     if (_position >= length) {
       throw new StateError("No more elements");
     }
     _position = value;
   }

   /// The current cursor index, reported as 0 while the cursor is still before
   /// the first byte.
   ///
   /// Throws a [StateError] once the cursor is at or past the end.
   int get position {
     if (_position >= length) {
       throw new StateError("No more elements");
     }
     if (_position >= 0) {
       return _position;
     } else {
       return 0;
     }
   }

   /// The byte at [position].
   String get currentByte => _bytes[position];

   /// Skip past a list of characters. Defaults to skipping [isWhitespace].
   ///
   /// Leaves the cursor on, and returns, the first byte for which the
   /// predicate is false. If every remaining byte is skipped the cursor is
   /// left at [length] and null is returned.
   String skipChars([CharPreciate skipChars]) {
     if (skipChars == null) skipChars = isWhitespace;
     var p = position; // use property for the error-checking
     while (p < length) {
       var c = _bytes[p];
       if (!skipChars(c)) {
         _position = p;
         return c;
       }
       p += 1;
     }
     _position = p;
     return null;
   }

   /// Skips forward until [untilChars] accepts a byte.
   ///
   /// Leaves the cursor on, and returns, the first accepted byte. If no
   /// remaining byte is accepted the cursor is left at [length] and null is
   /// returned, so that the next read of [position] throws a [StateError].
   String skipUntil(CharPreciate untilChars) {
     var p = position; // use property for the error-checking
     while (p < length) {
       var c = _bytes[p];
       if (untilChars(c)) {
         _position = p;
         return c;
       }
       p += 1;
     }
     // Record that the input is exhausted, exactly as skipChars does. Without
     // this the cursor silently stayed where the scan started and callers
     // could not tell "nothing found" apart from "found at the start".
     _position = p;
     return null;
   }

   /// Look for a sequence of bytes at the start of a string. If the bytes
   /// are found return true and advance the position to the byte after the
   /// match. Otherwise return false and leave the position alone.
   bool matchBytes(String bytes) {
     var p = position;
     if (_bytes.length < p + bytes.length) {
       return false;
     }
     var data = _bytes.substring(p, p + bytes.length);
     if (data == bytes) {
       position += bytes.length;
       return true;
     }
     return false;
   }

   /// Look for the next sequence of bytes matching a given sequence. If
   /// a match is found advance the position to the last byte of the match
   /// and return true.
   ///
   /// Never returns false: when there is no match a [StateError] is thrown.
   bool jumpTo(String bytes) {
     var newPosition = _bytes.indexOf(bytes, position);
     if (newPosition >= 0) {
       _position = newPosition + bytes.length - 1;
       return true;
     } else {
       throw new StateError("No more elements");
     }
   }

   /// Returns the bytes from index [start] up to, but not including, [end].
   ///
   /// [end] defaults to [length]; a negative [end] counts back from the end
   /// of the data.
   String slice(int start, [int end]) {
     if (end == null) end = length;
     if (end < 0) end += length;
     // String.substring takes an end *index*, not a length.
     return _bytes.substring(start, end);
   }
 }

 /// Signature of the handlers stored in [_DispatchEntry]: no arguments, and a
 /// bool result that [EncodingParser.getEncoding] reads as "keep parsing".
 typedef bool _MethodHandler();

 /// Pairs a literal prefix with the handler to run when the input matches it.
 class _DispatchEntry {
   /// Text that has to appear at the current position for [handler] to run.
   final String pattern;

   /// Callback invoked once [pattern] has been matched.
   final _MethodHandler handler;

   _DispatchEntry(String pattern, _MethodHandler handler)
       : pattern = pattern,
         handler = handler;
 }

 /// Mini parser for detecting character encoding from meta elements.
 ///
 /// Scans the input for `<meta` tags and stops at the first `charset`
 /// attribute, or `charset=` inside a `content` attribute, for which
 /// `codecName` returns a non-null name. Running out of input is signalled
 /// internally by a [StateError] and absorbed in [getEncoding].
 class EncodingParser {
   /// The lower-cased input together with its scan cursor.
   final EncodingBytes data;

   /// The detected encoding; only assigned by [handleMeta].
   String encoding;

   /// [bytes] - the data to work on for encoding detection.
   EncodingParser(List<int> bytes)
       // Note: this is intentionally interpreting bytes as codepoints.
       : data = new EncodingBytes(new String.fromCharCodes(bytes).toLowerCase());

   /// Runs the scan and returns [encoding], which is whatever [handleMeta]
   /// stored before the scan stopped or the input ran out.
   String getEncoding() {
     // Order matters: the first matching pattern wins, so a longer prefix has
     // to be listed before any shorter prefix it starts with.
     final methodDispatch = [
       new _DispatchEntry("<!--", handleComment),
       new _DispatchEntry("<meta", handleMeta),
       new _DispatchEntry("</", handlePossibleEndTag),
       new _DispatchEntry("<!", handleOther),
       new _DispatchEntry("<?", handleOther),
       new _DispatchEntry("<", handlePossibleStartTag),
     ];

     try {
       for (;;) {
         for (var dispatch in methodDispatch) {
           if (data.matchBytes(dispatch.pattern)) {
             var keepParsing = dispatch.handler();
             if (keepParsing) break;

             // We found an encoding. Stop.
             return encoding;
           }
         }
         data.position += 1;
       }
     } on StateError catch (_) {
       // Catch this here to match behavior of Python's StopIteration
       // TODO(jmesserly): refactor to not use exceptions
     }
     return encoding;
   }

   /// Skip over comments.
   bool handleComment() => data.jumpTo("-->");

   /// Examines the attributes of a `<meta` tag.
   ///
   /// Returns false after storing a recognized codec in [encoding], which
   /// tells [getEncoding] to stop; returns true to keep scanning.
   bool handleMeta() {
     if (!isWhitespace(data.currentByte)) {
       // if we have <meta not followed by a space so just keep going
       return true;
     }
     // We have a valid meta element we want to search for attributes
     while (true) {
       // Try to find the next attribute after the current position
       var attr = getAttribute();
       if (attr == null) return true;

       if (attr[0] == "charset") {
         var tentativeEncoding = attr[1];
         var codec = codecName(tentativeEncoding);
         if (codec != null) {
           encoding = codec;
           return false;
         }
       } else if (attr[0] == "content") {
         var contentParser = new ContentAttrParser(new EncodingBytes(attr[1]));
         var tentativeEncoding = contentParser.parse();
         var codec = codecName(tentativeEncoding);
         if (codec != null) {
           encoding = codec;
           return false;
         }
       }
     }
   }

   /// Handles input that matched `<`.
   bool handlePossibleStartTag() => handlePossibleTag(false);

   /// Handles input that matched `</`.
   bool handlePossibleEndTag() {
     data.next();
     return handlePossibleTag(true);
   }

   /// Skips over a tag and its attributes. Always returns true (keep parsing).
   bool handlePossibleTag(bool endTag) {
     if (!isLetter(data.currentByte)) {
       //If the next byte is not an ascii letter either ignore this
       //fragment (possible start tag case) or treat it according to
       //handleOther
       if (endTag) {
         data.previous();
         handleOther();
       }
       return true;
     }

     var c = data.skipUntil(isSpaceOrAngleBracket);
     if (c == "<") {
       // return to the first step in the overall "two step" algorithm
       // reprocessing the < byte
       data.previous();
     } else {
       //Read all attributes
       var attr = getAttribute();
       while (attr != null) {
         attr = getAttribute();
       }
     }
     return true;
   }

   /// Skips to the next `>`.
   bool handleOther() => data.jumpTo(">");

   /// Return a name,value pair for the next attribute in the stream,
   /// if one is found, or null
   ///
   /// The result is a two-element list `[name, value]`; the value is the empty
   /// string when the attribute has none. A [StateError] from [data]
   /// propagates to the caller when the input ends in mid-attribute.
   List<String> getAttribute() {
     // Step 1 (skip chars)
     var c = data.skipChars((x) => x == "/" || isWhitespace(x));
     // Step 2
     if (c == ">" || c == null) {
       return null;
     }
     // Step 3
     var attrName = <String>[];
     var attrValue = <String>[];
     // Step 4 attribute name
     while (true) {
       if (c == null) {
         return null;
       } else if (c == "=" && attrName.length > 0) {
         break;
       } else if (isWhitespace(c)) {
         // Step 6: skip the whitespace after the name. skipChars leaves the
         // cursor on, and returns, the first non-whitespace byte, which is the
         // byte step 7 has to inspect. Advancing once more here would step
         // over an "=" that follows the whitespace and lose the value.
         c = data.skipChars();
         break;
       } else if (c == "/" || c == ">") {
         return [attrName.join(), ""];
       } else if (isLetter(c)) {
         attrName.add(c.toLowerCase());
       } else {
         attrName.add(c);
       }
       // Step 5
       c = data.next();
     }
     // Step 7
     if (c != "=") {
       data.previous();
       return [attrName.join(), ""];
     }
     // Step 8
     data.next();
     // Step 9
     c = data.skipChars();
     // Step 10
     if (c == "'" || c == '"') {
       // 10.1
       var quoteChar = c;
       while (true) {
         // 10.2
         c = data.next();
         if (c == quoteChar) {
           // 10.3
           data.next();
           return [attrName.join(), attrValue.join()];
         } else if (isLetter(c)) {
           // 10.4
           attrValue.add(c.toLowerCase());
         } else {
           // 10.5
           attrValue.add(c);
         }
       }
     } else if (c == ">") {
       return [attrName.join(), ""];
     } else if (c == null) {
       return null;
     } else if (isLetter(c)) {
       attrValue.add(c.toLowerCase());
     } else {
       attrValue.add(c);
     }
     // Step 11
     while (true) {
       c = data.next();
       if (isSpaceOrAngleBracket(c)) {
         return [attrName.join(), attrValue.join()];
       } else if (c == null) {
         return null;
       } else if (isLetter(c)) {
         attrValue.add(c.toLowerCase());
       } else {
         attrValue.add(c);
       }
     }
   }
 }

 /// Pulls the `charset` value out of the text of a `content` attribute.
 class ContentAttrParser {
   /// The attribute text being scanned.
   final EncodingBytes data;

   ContentAttrParser(this.data);

   /// Returns the text that follows `charset=` in [data], or null when there
   /// is no `charset`, no `=` after it, or the scan runs off the end of the
   /// data (signalled by a [StateError]).
   String parse() {
     try {
       return _readCharset();
     } on StateError catch (_) {
       return null;
     }
   }

   /// Does the actual scan; lets any [StateError] escape to [parse].
   String _readCharset() {
     // Check if the attr name is charset, otherwise jumpTo throws.
     data.jumpTo("charset");
     data.position += 1;
     data.skipChars();
     if (data.currentByte != "=") {
       // No = sign after the name: nothing to report.
       return null;
     }
     data.position += 1;
     data.skipChars();

     var opener = data.currentByte;
     if (opener != '"' && opener != "'") {
       // Unquoted value: runs up to the next whitespace.
       var unquotedStart = data.position;
       try {
         data.skipUntil(isWhitespace);
         return data.slice(unquotedStart, data.position);
       } on StateError catch (_) {
         // Return the whole remaining value
         return data.slice(unquotedStart);
       }
     }

     // Quoted value: runs up to the matching quote mark.
     data.position += 1;
     var quotedStart = data.position;
     if (!data.jumpTo(opener)) {
       return null;
     }
     return data.slice(quotedStart, data.position);
   }
 }

 /// Whether [char] is `>`, `<`, or accepted by [isWhitespace].
 bool isSpaceOrAngleBracket(String char) {
   if (char == ">" || char == "<") return true;
   return isWhitespace(char);
 }

 /// Test applied to a single character, passed as a [String]; used by
 /// [EncodingBytes.skipChars] and [EncodingBytes.skipUntil].
 // NOTE(review): the name looks like a misspelling of "CharPredicate"; it is
 // left as is because it is a public name.
 typedef bool CharPreciate(String char);
	library encoding_parser;

	import 'constants.dart';
	import 'inputstream.dart';

	// TODO(jmesserly): I converted StopIteration to StateError("No more elements").
	// Seems strange to throw this from outside of an iterator though.
	/// A string of bytes (one character per byte) paired with a scan position.
	///
	/// The position starts at -1, i.e. before the first byte. Once it reaches or
	/// passes [length], reading [position] -- and every member that goes through
	/// it -- throws a [StateError]. That error stands in for Python's
	/// `StopIteration` and is how callers detect the end of the input.
	class EncodingBytes {
	  /// The data being scanned.
	  final String _bytes;

	  /// Index of the current byte; -1 until the cursor is first moved.
	  int _position = -1;

	  EncodingBytes(this._bytes);

	  /// The total number of bytes.
	  int get length => _bytes.length;

	  /// Advances the cursor by one and returns the byte it then points at.
	  ///
	  /// Throws a [StateError] if that moves the cursor past the last byte and a
	  /// [RangeError] if the new position is negative. The cursor is updated
	  /// before either check.
	  String next() {
	    var p = _position = _position + 1;
	    if (p >= length) {
	      throw new StateError("No more elements");
	    } else if (p < 0) {
	      throw new RangeError(p);
	    }
	    return _bytes[p];
	  }

	  /// Moves the cursor back by one and returns the byte it then points at.
	  ///
	  /// The bounds are checked on the position *before* stepping back: throws a
	  /// [StateError] if the cursor is at or past the end and a [RangeError] if
	  /// it is negative.
	  String previous() {
	    var p = _position;
	    if (p >= length) {
	      throw new StateError("No more elements");
	    } else if (p < 0) {
	      throw new RangeError(p);
	    }
	    _position = p = p - 1;
	    return _bytes[p];
	  }

	  /// Moves the cursor to [value].
	  ///
	  /// The check is made against the *current* position, not [value]: a cursor
	  /// that has already run off the end cannot be repositioned and throws a
	  /// [StateError] instead.
	  set position(int value) {
	    if (_position >= length) {
	      throw new StateError("No more elements");
	    }
	    _position = value;
	  }

	  /// The current cursor index, reported as 0 while the cursor is still before
	  /// the first byte.
	  ///
	  /// Throws a [StateError] once the cursor is at or past the end.
	  int get position {
	    if (_position >= length) {
	      throw new StateError("No more elements");
	    }
	    if (_position >= 0) {
	      return _position;
	    } else {
	      return 0;
	    }
	  }

	  /// The byte at [position].
	  String get currentByte => _bytes[position];

	  /// Skip past a list of characters. Defaults to skipping [isWhitespace].
	  ///
	  /// Leaves the cursor on, and returns, the first byte for which the
	  /// predicate is false. If every remaining byte is skipped the cursor is
	  /// left at [length] and null is returned.
	  String skipChars([CharPreciate skipChars]) {
	    if (skipChars == null) skipChars = isWhitespace;
	    var p = position; // use property for the error-checking
	    while (p < length) {
	      var c = _bytes[p];
	      if (!skipChars(c)) {
	        _position = p;
	        return c;
	      }
	      p += 1;
	    }
	    _position = p;
	    return null;
	  }

	  /// Skips forward until [untilChars] accepts a byte.
	  ///
	  /// Leaves the cursor on, and returns, the first accepted byte. If no
	  /// remaining byte is accepted the cursor is left at [length] and null is
	  /// returned, so that the next read of [position] throws a [StateError].
	  String skipUntil(CharPreciate untilChars) {
	    var p = position; // use property for the error-checking
	    while (p < length) {
	      var c = _bytes[p];
	      if (untilChars(c)) {
	        _position = p;
	        return c;
	      }
	      p += 1;
	    }
	    // Record that the input is exhausted, exactly as skipChars does. Without
	    // this the cursor silently stayed where the scan started and callers
	    // could not tell "nothing found" apart from "found at the start".
	    _position = p;
	    return null;
	  }

	  /// Look for a sequence of bytes at the start of a string. If the bytes
	  /// are found return true and advance the position to the byte after the
	  /// match. Otherwise return false and leave the position alone.
	  bool matchBytes(String bytes) {
	    var p = position;
	    if (_bytes.length < p + bytes.length) {
	      return false;
	    }
	    var data = _bytes.substring(p, p + bytes.length);
	    if (data == bytes) {
	      position += bytes.length;
	      return true;
	    }
	    return false;
	  }

	  /// Look for the next sequence of bytes matching a given sequence. If
	  /// a match is found advance the position to the last byte of the match
	  /// and return true.
	  ///
	  /// Never returns false: when there is no match a [StateError] is thrown.
	  bool jumpTo(String bytes) {
	    var newPosition = _bytes.indexOf(bytes, position);
	    if (newPosition >= 0) {
	      _position = newPosition + bytes.length - 1;
	      return true;
	    } else {
	      throw new StateError("No more elements");
	    }
	  }

	  /// Returns the bytes from index [start] up to, but not including, [end].
	  ///
	  /// [end] defaults to [length]; a negative [end] counts back from the end
	  /// of the data.
	  String slice(int start, [int end]) {
	    if (end == null) end = length;
	    if (end < 0) end += length;
	    // String.substring takes an end *index*, not a length.
	    return _bytes.substring(start, end);
	  }
	}

	/// Signature of the handlers stored in [_DispatchEntry]: no arguments, and a
	/// bool result that [EncodingParser.getEncoding] reads as "keep parsing".
	typedef bool _MethodHandler();

	/// Pairs a literal prefix with the handler to run when the input matches it.
	class _DispatchEntry {
	  /// Text that has to appear at the current position for [handler] to run.
	  final String pattern;

	  /// Callback invoked once [pattern] has been matched.
	  final _MethodHandler handler;

	  _DispatchEntry(String pattern, _MethodHandler handler)
	      : pattern = pattern,
	        handler = handler;
	}

	/// Mini parser for detecting character encoding from meta elements.
	///
	/// Scans the input for `<meta` tags and stops at the first `charset`
	/// attribute, or `charset=` inside a `content` attribute, for which
	/// `codecName` returns a non-null name. Running out of input is signalled
	/// internally by a [StateError] and absorbed in [getEncoding].
	class EncodingParser {
	  /// The lower-cased input together with its scan cursor.
	  final EncodingBytes data;

	  /// The detected encoding; only assigned by [handleMeta].
	  String encoding;

	  /// [bytes] - the data to work on for encoding detection.
	  EncodingParser(List<int> bytes)
	      // Note: this is intentionally interpreting bytes as codepoints.
	      : data = new EncodingBytes(new String.fromCharCodes(bytes).toLowerCase());

	  /// Runs the scan and returns [encoding], which is whatever [handleMeta]
	  /// stored before the scan stopped or the input ran out.
	  String getEncoding() {
	    // Order matters: the first matching pattern wins, so a longer prefix has
	    // to be listed before any shorter prefix it starts with.
	    final methodDispatch = [
	      new _DispatchEntry("<!--", handleComment),
	      new _DispatchEntry("<meta", handleMeta),
	      new _DispatchEntry("</", handlePossibleEndTag),
	      new _DispatchEntry("<!", handleOther),
	      new _DispatchEntry("<?", handleOther),
	      new _DispatchEntry("<", handlePossibleStartTag),
	    ];

	    try {
	      for (;;) {
	        for (var dispatch in methodDispatch) {
	          if (data.matchBytes(dispatch.pattern)) {
	            var keepParsing = dispatch.handler();
	            if (keepParsing) break;

	            // We found an encoding. Stop.
	            return encoding;
	          }
	        }
	        data.position += 1;
	      }
	    } on StateError catch (_) {
	      // Catch this here to match behavior of Python's StopIteration
	      // TODO(jmesserly): refactor to not use exceptions
	    }
	    return encoding;
	  }

	  /// Skip over comments.
	  bool handleComment() => data.jumpTo("-->");

	  /// Examines the attributes of a `<meta` tag.
	  ///
	  /// Returns false after storing a recognized codec in [encoding], which
	  /// tells [getEncoding] to stop; returns true to keep scanning.
	  bool handleMeta() {
	    if (!isWhitespace(data.currentByte)) {
	      // if we have <meta not followed by a space so just keep going
	      return true;
	    }
	    // We have a valid meta element we want to search for attributes
	    while (true) {
	      // Try to find the next attribute after the current position
	      var attr = getAttribute();
	      if (attr == null) return true;

	      if (attr[0] == "charset") {
	        var tentativeEncoding = attr[1];
	        var codec = codecName(tentativeEncoding);
	        if (codec != null) {
	          encoding = codec;
	          return false;
	        }
	      } else if (attr[0] == "content") {
	        var contentParser = new ContentAttrParser(new EncodingBytes(attr[1]));
	        var tentativeEncoding = contentParser.parse();
	        var codec = codecName(tentativeEncoding);
	        if (codec != null) {
	          encoding = codec;
	          return false;
	        }
	      }
	    }
	  }

	  /// Handles input that matched `<`.
	  bool handlePossibleStartTag() => handlePossibleTag(false);

	  /// Handles input that matched `</`.
	  bool handlePossibleEndTag() {
	    data.next();
	    return handlePossibleTag(true);
	  }

	  /// Skips over a tag and its attributes. Always returns true (keep parsing).
	  bool handlePossibleTag(bool endTag) {
	    if (!isLetter(data.currentByte)) {
	      //If the next byte is not an ascii letter either ignore this
	      //fragment (possible start tag case) or treat it according to
	      //handleOther
	      if (endTag) {
	        data.previous();
	        handleOther();
	      }
	      return true;
	    }

	    var c = data.skipUntil(isSpaceOrAngleBracket);
	    if (c == "<") {
	      // return to the first step in the overall "two step" algorithm
	      // reprocessing the < byte
	      data.previous();
	    } else {
	      //Read all attributes
	      var attr = getAttribute();
	      while (attr != null) {
	        attr = getAttribute();
	      }
	    }
	    return true;
	  }

	  /// Skips to the next `>`.
	  bool handleOther() => data.jumpTo(">");

	  /// Return a name,value pair for the next attribute in the stream,
	  /// if one is found, or null
	  ///
	  /// The result is a two-element list `[name, value]`; the value is the empty
	  /// string when the attribute has none. A [StateError] from [data]
	  /// propagates to the caller when the input ends in mid-attribute.
	  List<String> getAttribute() {
	    // Step 1 (skip chars)
	    var c = data.skipChars((x) => x == "/" || isWhitespace(x));
	    // Step 2
	    if (c == ">" || c == null) {
	      return null;
	    }
	    // Step 3
	    var attrName = <String>[];
	    var attrValue = <String>[];
	    // Step 4 attribute name
	    while (true) {
	      if (c == null) {
	        return null;
	      } else if (c == "=" && attrName.length > 0) {
	        break;
	      } else if (isWhitespace(c)) {
	        // Step 6: skip the whitespace after the name. skipChars leaves the
	        // cursor on, and returns, the first non-whitespace byte, which is the
	        // byte step 7 has to inspect. Advancing once more here would step
	        // over an "=" that follows the whitespace and lose the value.
	        c = data.skipChars();
	        break;
	      } else if (c == "/" || c == ">") {
	        return [attrName.join(), ""];
	      } else if (isLetter(c)) {
	        attrName.add(c.toLowerCase());
	      } else {
	        attrName.add(c);
	      }
	      // Step 5
	      c = data.next();
	    }
	    // Step 7
	    if (c != "=") {
	      data.previous();
	      return [attrName.join(), ""];
	    }
	    // Step 8
	    data.next();
	    // Step 9
	    c = data.skipChars();
	    // Step 10
	    if (c == "'" || c == '"') {
	      // 10.1
	      var quoteChar = c;
	      while (true) {
	        // 10.2
	        c = data.next();
	        if (c == quoteChar) {
	          // 10.3
	          data.next();
	          return [attrName.join(), attrValue.join()];
	        } else if (isLetter(c)) {
	          // 10.4
	          attrValue.add(c.toLowerCase());
	        } else {
	          // 10.5
	          attrValue.add(c);
	        }
	      }
	    } else if (c == ">") {
	      return [attrName.join(), ""];
	    } else if (c == null) {
	      return null;
	    } else if (isLetter(c)) {
	      attrValue.add(c.toLowerCase());
	    } else {
	      attrValue.add(c);
	    }
	    // Step 11
	    while (true) {
	      c = data.next();
	      if (isSpaceOrAngleBracket(c)) {
	        return [attrName.join(), attrValue.join()];
	      } else if (c == null) {
	        return null;
	      } else if (isLetter(c)) {
	        attrValue.add(c.toLowerCase());
	      } else {
	        attrValue.add(c);
	      }
	    }
	  }
	}

	/// Pulls the `charset` value out of the text of a `content` attribute.
	class ContentAttrParser {
	  /// The attribute text being scanned.
	  final EncodingBytes data;

	  ContentAttrParser(this.data);

	  /// Returns the text that follows `charset=` in [data], or null when there
	  /// is no `charset`, no `=` after it, or the scan runs off the end of the
	  /// data (signalled by a [StateError]).
	  String parse() {
	    try {
	      // Check if the attr name is charset
	      // otherwise return
	      data.jumpTo("charset");
	      data.position += 1;
	      data.skipChars();
	      if (data.currentByte != "=") {
	        // If there is no = sign keep looking for attrs
	        return null;
	      }
	      data.position += 1;
	      data.skipChars();
	      // Look for an encoding between matching quote marks
	      if (data.currentByte == '"' || data.currentByte == "'") {
	        var quoteMark = data.currentByte;
	        data.position += 1;
	        var oldPosition = data.position;
	        if (data.jumpTo(quoteMark)) {
	          return data.slice(oldPosition, data.position);
	        } else {
	          return null;
	        }
	      } else {
	        // Unquoted value
	        var oldPosition = data.position;
	        try {
	          data.skipUntil(isWhitespace);
	          return data.slice(oldPosition, data.position);
	        } on StateError catch (_) {
	          //Return the whole remaining value
	          return data.slice(oldPosition);
	        }
	      }
	    } on StateError catch (_) {
	      return null;
	    }
	  }
	}

	/// Whether [char] is `>`, `<`, or accepted by [isWhitespace].
	bool isSpaceOrAngleBracket(String char) {
	  return char == ">" || char == "<" || isWhitespace(char);
	}

	/// Test applied to a single character, passed as a [String]; used by
	/// [EncodingBytes.skipChars] and [EncodingBytes.skipUntil].
	// NOTE(review): the name looks like a misspelling of "CharPredicate"; it is
	// left as is because it is a public name.
	typedef bool CharPreciate(String char);