Support the latest pkg:csslib, prepare to release 0.14.0+1
diff --git a/.travis.yml b/.travis.yml
index 353f809..337ceca 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,11 +1,12 @@
language: dart
+
dart:
- - 2.0.0
- dev
+ - 2.0.0
dart_task:
- test: -p vm
- - test: -p chrome,firefox
+ - test: -p chrome
- dartanalyzer: --fatal-warnings --fatal-infos .
matrix:
@@ -15,7 +16,7 @@
# Only building master means that we don't run two builds for each pull request.
branches:
- only: [master, v0_13]
+ only: [master]
cache:
directories:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 350ac50..139ec41 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,14 @@
-## 0.13.4+2
+## 0.14.0+1
- Support `package:css` `>=0.13.2 <0.16.0`.
+## 0.14.0
+
+*BREAKING CHANGES*
+
+- Drop support for encodings other than UTF-8 and ASCII.
+- Remove the `parser_console.dart` library.
+
## 0.13.4+1
* Fixes to readme and pubspec.
diff --git a/lib/parser_console.dart b/lib/parser_console.dart
deleted file mode 100644
index 28dee14..0000000
--- a/lib/parser_console.dart
+++ /dev/null
@@ -1,42 +0,0 @@
-/// This library adds `dart:io` support to the HTML5 parser. Call
-/// [initDartIOSupport] before calling the [parse] methods and they will accept
-/// a [RandomAccessFile] as input, in addition to the other input types.
-library parser_console;
-
-import 'dart:io';
-import 'parser.dart';
-import 'src/inputstream.dart' as inputstream;
-
-/// Adds support to the [HtmlParser] for running on a console VM. In particular
-/// this means it will be able to handle `dart:io` and [RandomAccessFile]s as
-/// input to the various [parse] methods.
-void useConsole() {
- inputstream.consoleSupport = _ConsoleSupport();
-}
-
-class _ConsoleSupport extends inputstream.ConsoleSupport {
- List<int> bytesFromFile(source) {
- if (source is! RandomAccessFile) return null;
- return readAllBytesFromFile(source);
- }
-}
-
-// TODO(jmesserly): this should be `RandomAccessFile.readAllBytes`.
-/// Synchronously reads all bytes from the [file].
-List<int> readAllBytesFromFile(RandomAccessFile file) {
- int length = file.lengthSync();
- var bytes = List<int>(length);
-
- int bytesRead = 0;
- while (bytesRead < length) {
- int read = file.readIntoSync(bytes, bytesRead, length - bytesRead);
- if (read <= 0) {
- // This could happen if, for example, the file was resized while
- // we're reading. Just shrink the bytes array and move on.
- bytes = bytes.sublist(0, bytesRead);
- break;
- }
- bytesRead += read;
- }
- return bytes;
-}
diff --git a/lib/src/char_encodings.dart b/lib/src/char_encodings.dart
deleted file mode 100644
index ba10a4a..0000000
--- a/lib/src/char_encodings.dart
+++ /dev/null
@@ -1,228 +0,0 @@
-/// Decodes bytes using the correct name. See [decodeBytes].
-library char_encodings;
-
-import 'dart:collection';
-import 'package:utf/utf.dart';
-
-// TODO(jmesserly): this function is conspicuously absent from dart:utf.
-/// Returns true if the [bytes] starts with a UTF-8 byte order mark.
-/// Since UTF-8 doesn't have byte order, it's somewhat of a misnomer, but it is
-/// used in HTML to detect the UTF-
-bool hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
- int end = length != null ? offset + length : bytes.length;
- return (offset + 3) <= end &&
- bytes[offset] == 0xEF &&
- bytes[offset + 1] == 0xBB &&
- bytes[offset + 2] == 0xBF;
-}
-
-// TODO(jmesserly): it's unfortunate that this has to be one-shot on the entire
-// file, but dart:utf does not expose stream-based decoders yet.
-/// Decodes the [bytes] with the provided [encoding] and returns an iterable for
-/// the codepoints. Supports the major unicode encodings as well as ascii and
-/// and windows-1252 encodings.
-Iterable<int> decodeBytes(String encoding, List<int> bytes,
- [int offset = 0,
- int length,
- int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
- if (length == null) length = bytes.length;
- final replace = replacementCodepoint;
- switch (encoding) {
- case 'ascii':
- bytes = bytes.sublist(offset, offset + length);
- // TODO(jmesserly): this was taken from runtime/bin/string_stream.dart
- for (int byte in bytes) {
- if (byte > 127) {
- // TODO(jmesserly): ideally this would be DecoderException, like the
- // one thrown in runtime/bin/string_stream.dart, but we don't want to
- // depend on dart:io.
- throw FormatException("Illegal ASCII character $byte");
- }
- }
- return bytes;
-
- case 'windows-1252':
- case 'cp1252':
- return decodeWindows1252AsIterable(bytes, offset, length, replace);
-
- case 'utf-8':
- // NOTE: to match the behavior of the other decode functions, we eat the
- // utf-8 BOM here.
- if (hasUtf8Bom(bytes, offset, length)) {
- offset += 3;
- length -= 3;
- }
- return decodeUtf8AsIterable(bytes, offset, length, replace);
-
- case 'utf-16':
- return decodeUtf16AsIterable(bytes, offset, length, replace);
- case 'utf-16-be':
- return decodeUtf16beAsIterable(bytes, offset, length, true, replace);
- case 'utf-16-le':
- return decodeUtf16leAsIterable(bytes, offset, length, true, replace);
-
- case 'utf-32':
- return decodeUtf32AsIterable(bytes, offset, length, replace);
- case 'utf-32-be':
- return decodeUtf32beAsIterable(bytes, offset, length, true, replace);
- case 'utf-32-le':
- return decodeUtf32leAsIterable(bytes, offset, length, true, replace);
-
- default:
- throw ArgumentError('Encoding $encoding not supported');
- }
-}
-
-// TODO(jmesserly): use dart:utf once http://dartbug.com/6476 is fixed.
-/// Returns the code points for the [input]. This works like [String.charCodes]
-/// but it decodes UTF-16 surrogate pairs.
-List<int> toCodepoints(String input) {
- var newCodes = <int>[];
- for (int i = 0; i < input.length; i++) {
- var c = input.codeUnitAt(i);
- if (0xD800 <= c && c <= 0xDBFF) {
- int next = i + 1;
- if (next < input.length) {
- var d = input.codeUnitAt(next);
- if (0xDC00 <= d && d <= 0xDFFF) {
- c = 0x10000 + ((c - 0xD800) << 10) + (d - 0xDC00);
- i = next;
- }
- }
- }
- newCodes.add(c);
- }
- return newCodes;
-}
-
-/// Decodes [windows-1252](http://en.wikipedia.org/wiki/Windows-1252) bytes as
-/// an iterable. Thus, the consumer can only convert as much of the input as
-/// needed. Set the [replacementCharacter] to null to throw an [ArgumentError]
-/// rather than replace the bad value.
-IterableWindows1252Decoder decodeWindows1252AsIterable(List<int> bytes,
- [int offset = 0,
- int length,
- int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
- return IterableWindows1252Decoder(
- bytes, offset, length, replacementCodepoint);
-}
-
-/// Return type of [decodeWindows1252AsIterable] and variants. The Iterable type
-/// provides an iterator on demand and the iterator will only translate bytes
-/// as requested by the user of the iterator. (Note: results are not cached.)
-class IterableWindows1252Decoder extends IterableBase<int> {
- final List<int> bytes;
- final int offset;
- final int length;
- final int replacementCodepoint;
-
- IterableWindows1252Decoder(this.bytes,
- [this.offset = 0,
- this.length,
- this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]);
-
- Windows1252Decoder get iterator =>
- Windows1252Decoder(bytes, offset, length, replacementCodepoint);
-}
-
-/// Provides an iterator of Unicode codepoints from windows-1252 encoded bytes.
-/// The parameters can set an offset into a list of bytes (as int), limit the
-/// length of the values to be decoded, and override the default Unicode
-/// replacement character. Set the replacementCharacter to null to throw an
-/// ArgumentError rather than replace the bad value. The return value
-/// from this method can be used as an Iterable (e.g. in a for-loop).
-class Windows1252Decoder implements Iterator<int> {
- final int replacementCodepoint;
- final List<int> _bytes;
- int _offset;
- final int _length;
-
- Windows1252Decoder(List<int> bytes,
- [int offset = 0,
- int length,
- this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
- : _bytes = bytes,
- _offset = offset - 1,
- _length = length == null ? bytes.length : length;
-
- bool get _inRange => _offset >= 0 && _offset < _length;
- int get current => _inRange ? _mapChar(_bytes[_offset]) : null;
-
- bool moveNext() {
- _offset++;
- return _inRange;
- }
-
- int _mapChar(int char) {
- // TODO(jmesserly): this is duplicating entitiesWindows1252 and
- // replacementCharacters from constants.dart
- switch (char) {
- case 0x80:
- return 0x20AC; // EURO SIGN
- case 0x82:
- return 0x201A; // SINGLE LOW-9 QUOTATION MARK
- case 0x83:
- return 0x0192; // LATIN SMALL LETTER F WITH HOOK
- case 0x84:
- return 0x201E; // DOUBLE LOW-9 QUOTATION MARK
- case 0x85:
- return 0x2026; // HORIZONTAL ELLIPSIS
- case 0x86:
- return 0x2020; // DAGGER
- case 0x87:
- return 0x2021; // DOUBLE DAGGER
- case 0x88:
- return 0x02C6; // MODIFIER LETTER CIRCUMFLEX ACCENT
- case 0x89:
- return 0x2030; // PER MILLE SIGN
- case 0x8A:
- return 0x0160; // LATIN CAPITAL LETTER S WITH CARON
- case 0x8B:
- return 0x2039; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
- case 0x8C:
- return 0x0152; // LATIN CAPITAL LIGATURE OE
- case 0x8E:
- return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON
- case 0x91:
- return 0x2018; // LEFT SINGLE QUOTATION MARK
- case 0x92:
- return 0x2019; // RIGHT SINGLE QUOTATION MARK
- case 0x93:
- return 0x201C; // LEFT DOUBLE QUOTATION MARK
- case 0x94:
- return 0x201D; // RIGHT DOUBLE QUOTATION MARK
- case 0x95:
- return 0x2022; // BULLET
- case 0x96:
- return 0x2013; // EN DASH
- case 0x97:
- return 0x2014; // EM DASH
- case 0x98:
- return 0x02DC; // SMALL TILDE
- case 0x99:
- return 0x2122; // TRADE MARK SIGN
- case 0x9A:
- return 0x0161; // LATIN SMALL LETTER S WITH CARON
- case 0x9B:
- return 0x203A; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
- case 0x9C:
- return 0x0153; // LATIN SMALL LIGATURE OE
- case 0x9E:
- return 0x017E; // LATIN SMALL LETTER Z WITH CARON
- case 0x9F:
- return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS
-
- case 0x81:
- case 0x8D:
- case 0x8F:
- case 0x90:
- case 0x9D:
- if (replacementCodepoint == null) {
- throw ArgumentError(
- "Invalid windows-1252 code point $char at $_offset");
- }
- return replacementCodepoint;
- }
- return char;
- }
-}
diff --git a/lib/src/encoding_parser.dart b/lib/src/encoding_parser.dart
index d61e76a..d0f40d6 100644
--- a/lib/src/encoding_parser.dart
+++ b/lib/src/encoding_parser.dart
@@ -1,7 +1,5 @@
-library encoding_parser;
-
import 'constants.dart';
-import 'inputstream.dart';
+import 'html_input_stream.dart';
// TODO(jmesserly): I converted StopIteration to StateError("No more elements").
// Seems strange to throw this from outside of an iterator though.
@@ -10,15 +8,15 @@
/// raised.
class EncodingBytes {
final String _bytes;
- int _position = -1;
+ int __position = -1;
EncodingBytes(this._bytes);
- int get length => _bytes.length;
+ int get _length => _bytes.length;
- String next() {
- var p = _position = _position + 1;
- if (p >= length) {
+ String _next() {
+ var p = __position = __position + 1;
+ if (p >= _length) {
throw StateError("No more elements");
} else if (p < 0) {
throw RangeError(p);
@@ -26,59 +24,59 @@
return _bytes[p];
}
- String previous() {
- var p = _position;
- if (p >= length) {
+ String _previous() {
+ var p = __position;
+ if (p >= _length) {
throw StateError("No more elements");
} else if (p < 0) {
throw RangeError(p);
}
- _position = p = p - 1;
+ __position = p = p - 1;
return _bytes[p];
}
- set position(int value) {
- if (_position >= length) {
+ set _position(int value) {
+ if (__position >= _length) {
throw StateError("No more elements");
}
- _position = value;
+ __position = value;
}
- int get position {
- if (_position >= length) {
+ int get _position {
+ if (__position >= _length) {
throw StateError("No more elements");
}
- if (_position >= 0) {
- return _position;
+ if (__position >= 0) {
+ return __position;
} else {
return 0;
}
}
- String get currentByte => _bytes[position];
+ String get _currentByte => _bytes[_position];
/// Skip past a list of characters. Defaults to skipping [isWhitespace].
- String skipChars([CharPreciate skipChars]) {
+ String _skipChars([_CharPredicate skipChars]) {
if (skipChars == null) skipChars = isWhitespace;
- var p = position; // use property for the error-checking
- while (p < length) {
+ var p = _position; // use property for the error-checking
+ while (p < _length) {
var c = _bytes[p];
if (!skipChars(c)) {
- _position = p;
+ __position = p;
return c;
}
p += 1;
}
- _position = p;
+ __position = p;
return null;
}
- String skipUntil(CharPreciate untilChars) {
- var p = position;
- while (p < length) {
+ String _skipUntil(_CharPredicate untilChars) {
+ var p = _position;
+ while (p < _length) {
var c = _bytes[p];
if (untilChars(c)) {
- _position = p;
+ __position = p;
return c;
}
p += 1;
@@ -89,14 +87,14 @@
/// Look for a sequence of bytes at the start of a string. If the bytes
/// are found return true and advance the position to the byte after the
/// match. Otherwise return false and leave the position alone.
- bool matchBytes(String bytes) {
- var p = position;
+ bool _matchBytes(String bytes) {
+ var p = _position;
if (_bytes.length < p + bytes.length) {
return false;
}
var data = _bytes.substring(p, p + bytes.length);
if (data == bytes) {
- position += bytes.length;
+ _position += bytes.length;
return true;
}
return false;
@@ -104,19 +102,19 @@
/// Look for the next sequence of bytes matching a given sequence. If
/// a match is found advance the position to the last byte of the match
- bool jumpTo(String bytes) {
- var newPosition = _bytes.indexOf(bytes, position);
+ bool _jumpTo(String bytes) {
+ var newPosition = _bytes.indexOf(bytes, _position);
if (newPosition >= 0) {
- _position = newPosition + bytes.length - 1;
+ __position = newPosition + bytes.length - 1;
return true;
} else {
throw StateError("No more elements");
}
}
- String slice(int start, [int end]) {
- if (end == null) end = length;
- if (end < 0) end += length;
+ String _slice(int start, [int end]) {
+ if (end == null) end = _length;
+ if (end < 0) end += _length;
return _bytes.substring(start, end);
}
}
@@ -126,68 +124,69 @@
class _DispatchEntry {
final String pattern;
final _MethodHandler handler;
+
_DispatchEntry(this.pattern, this.handler);
}
/// Mini parser for detecting character encoding from meta elements.
class EncodingParser {
- final EncodingBytes data;
- String encoding;
+ final EncodingBytes _data;
+ String _encoding;
/// [bytes] - the data to work on for encoding detection.
EncodingParser(List<int> bytes)
// Note: this is intentionally interpreting bytes as codepoints.
- : data = EncodingBytes(String.fromCharCodes(bytes).toLowerCase());
+ : _data = EncodingBytes(String.fromCharCodes(bytes).toLowerCase());
String getEncoding() {
final methodDispatch = [
- _DispatchEntry("<!--", handleComment),
- _DispatchEntry("<meta", handleMeta),
- _DispatchEntry("</", handlePossibleEndTag),
- _DispatchEntry("<!", handleOther),
- _DispatchEntry("<?", handleOther),
- _DispatchEntry("<", handlePossibleStartTag),
+ _DispatchEntry("<!--", _handleComment),
+ _DispatchEntry("<meta", _handleMeta),
+ _DispatchEntry("</", _handlePossibleEndTag),
+ _DispatchEntry("<!", _handleOther),
+ _DispatchEntry("<?", _handleOther),
+ _DispatchEntry("<", _handlePossibleStartTag),
];
try {
for (;;) {
for (var dispatch in methodDispatch) {
- if (data.matchBytes(dispatch.pattern)) {
+ if (_data._matchBytes(dispatch.pattern)) {
var keepParsing = dispatch.handler();
if (keepParsing) break;
// We found an encoding. Stop.
- return encoding;
+ return _encoding;
}
}
- data.position += 1;
+ _data._position += 1;
}
} on StateError catch (_) {
// Catch this here to match behavior of Python's StopIteration
// TODO(jmesserly): refactor to not use exceptions
}
- return encoding;
+ return _encoding;
}
/// Skip over comments.
- bool handleComment() => data.jumpTo("-->");
+ bool _handleComment() => _data._jumpTo("-->");
- bool handleMeta() {
- if (!isWhitespace(data.currentByte)) {
+ bool _handleMeta() {
+ if (!isWhitespace(_data._currentByte)) {
// if we have <meta not followed by a space so just keep going
return true;
}
// We have a valid meta element we want to search for attributes
while (true) {
// Try to find the next attribute after the current position
- var attr = getAttribute();
+ var attr = _getAttribute();
if (attr == null) return true;
if (attr[0] == "charset") {
var tentativeEncoding = attr[1];
var codec = codecName(tentativeEncoding);
if (codec != null) {
- encoding = codec;
+ _encoding = codec;
return false;
}
} else if (attr[0] == "content") {
@@ -195,54 +194,54 @@
var tentativeEncoding = contentParser.parse();
var codec = codecName(tentativeEncoding);
if (codec != null) {
- encoding = codec;
+ _encoding = codec;
return false;
}
}
}
}
- bool handlePossibleStartTag() => handlePossibleTag(false);
+ bool _handlePossibleStartTag() => _handlePossibleTag(false);
- bool handlePossibleEndTag() {
- data.next();
- return handlePossibleTag(true);
+ bool _handlePossibleEndTag() {
+ _data._next();
+ return _handlePossibleTag(true);
}
- bool handlePossibleTag(bool endTag) {
- if (!isLetter(data.currentByte)) {
+ bool _handlePossibleTag(bool endTag) {
+ if (!isLetter(_data._currentByte)) {
//If the next byte is not an ascii letter either ignore this
//fragment (possible start tag case) or treat it according to
//handleOther
if (endTag) {
- data.previous();
- handleOther();
+ _data._previous();
+ _handleOther();
}
return true;
}
- var c = data.skipUntil(isSpaceOrAngleBracket);
+ var c = _data._skipUntil(_isSpaceOrAngleBracket);
if (c == "<") {
// return to the first step in the overall "two step" algorithm
// reprocessing the < byte
- data.previous();
+ _data._previous();
} else {
//Read all attributes
- var attr = getAttribute();
+ var attr = _getAttribute();
while (attr != null) {
- attr = getAttribute();
+ attr = _getAttribute();
}
}
return true;
}
- bool handleOther() => data.jumpTo(">");
+ bool _handleOther() => _data._jumpTo(">");
/// Return a name,value pair for the next attribute in the stream,
/// if one is found, or null
- List<String> getAttribute() {
+ List<String> _getAttribute() {
// Step 1 (skip chars)
- var c = data.skipChars((x) => x == "/" || isWhitespace(x));
+ var c = _data._skipChars((x) => x == "/" || isWhitespace(x));
// Step 2
if (c == ">" || c == null) {
return null;
@@ -258,8 +257,8 @@
break;
} else if (isWhitespace(c)) {
// Step 6!
- c = data.skipChars();
- c = data.next();
+ c = _data._skipChars();
+ c = _data._next();
break;
} else if (c == "/" || c == ">") {
return [attrName.join(), ""];
@@ -269,27 +268,27 @@
attrName.add(c);
}
// Step 5
- c = data.next();
+ c = _data._next();
}
// Step 7
if (c != "=") {
- data.previous();
+ _data._previous();
return [attrName.join(), ""];
}
// Step 8
- data.next();
+ _data._next();
// Step 9
- c = data.skipChars();
+ c = _data._skipChars();
// Step 10
if (c == "'" || c == '"') {
// 10.1
var quoteChar = c;
while (true) {
// 10.2
- c = data.next();
+ c = _data._next();
if (c == quoteChar) {
// 10.3
- data.next();
+ _data._next();
return [attrName.join(), attrValue.join()];
} else if (isLetter(c)) {
// 10.4
@@ -310,8 +309,8 @@
}
// Step 11
while (true) {
- c = data.next();
- if (isSpaceOrAngleBracket(c)) {
+ c = _data._next();
+ if (_isSpaceOrAngleBracket(c)) {
return [attrName.join(), attrValue.join()];
} else if (c == null) {
return null;
@@ -333,34 +332,34 @@
try {
// Check if the attr name is charset
// otherwise return
- data.jumpTo("charset");
- data.position += 1;
- data.skipChars();
- if (data.currentByte != "=") {
+ data._jumpTo("charset");
+ data._position += 1;
+ data._skipChars();
+ if (data._currentByte != "=") {
// If there is no = sign keep looking for attrs
return null;
}
- data.position += 1;
- data.skipChars();
+ data._position += 1;
+ data._skipChars();
// Look for an encoding between matching quote marks
- if (data.currentByte == '"' || data.currentByte == "'") {
- var quoteMark = data.currentByte;
- data.position += 1;
- var oldPosition = data.position;
- if (data.jumpTo(quoteMark)) {
- return data.slice(oldPosition, data.position);
+ if (data._currentByte == '"' || data._currentByte == "'") {
+ var quoteMark = data._currentByte;
+ data._position += 1;
+ var oldPosition = data._position;
+ if (data._jumpTo(quoteMark)) {
+ return data._slice(oldPosition, data._position);
} else {
return null;
}
} else {
// Unquoted value
- var oldPosition = data.position;
+ var oldPosition = data._position;
try {
- data.skipUntil(isWhitespace);
- return data.slice(oldPosition, data.position);
+ data._skipUntil(isWhitespace);
+ return data._slice(oldPosition, data._position);
} on StateError catch (_) {
//Return the whole remaining value
- return data.slice(oldPosition);
+ return data._slice(oldPosition);
}
}
} on StateError catch (_) {
@@ -369,8 +368,8 @@
}
}
-bool isSpaceOrAngleBracket(String char) {
+bool _isSpaceOrAngleBracket(String char) {
return char == ">" || char == "<" || isWhitespace(char);
}
-typedef CharPreciate = bool Function(String char);
+typedef _CharPredicate = bool Function(String char);
diff --git a/lib/src/inputstream.dart b/lib/src/html_input_stream.dart
similarity index 83%
rename from lib/src/inputstream.dart
rename to lib/src/html_input_stream.dart
index dbcf98b..42b1741 100644
--- a/lib/src/inputstream.dart
+++ b/lib/src/html_input_stream.dart
@@ -1,21 +1,12 @@
-library inputstream;
-
import 'dart:collection';
-import 'package:utf/utf.dart';
+import 'dart:convert' show ascii, utf8;
+
import 'package:source_span/source_span.dart';
-import 'char_encodings.dart';
+
import 'constants.dart';
import 'encoding_parser.dart';
import 'utils.dart';
-/// Hooks to call into dart:io without directly referencing it.
-class ConsoleSupport {
- List<int> bytesFromFile(source) => null;
-}
-
-// TODO(jmesserly): use lazy init here when supported.
-ConsoleSupport consoleSupport = ConsoleSupport();
-
/// Provides a unicode stream of characters to the HtmlTokenizer.
///
/// This class takes care of character encoding and removing or replacing
@@ -26,7 +17,7 @@
static const int numBytesMeta = 512;
/// Encoding to use if no other information can be found.
- static const String defaultEncoding = 'windows-1252';
+ static const String defaultEncoding = 'utf-8';
/// The name of the character encoding.
String charEncodingName;
@@ -75,24 +66,14 @@
this.sourceUrl])
: charEncodingName = codecName(encoding) {
if (source is String) {
- _rawChars = toCodepoints(source);
+ _rawChars = source.runes.toList();
charEncodingName = 'utf-8';
charEncodingCertain = true;
} else if (source is List<int>) {
_rawBytes = source;
} else {
- // TODO(jmesserly): it's unfortunate we need to read all bytes in advance,
- // but it's necessary because of how the UTF decoders work.
- _rawBytes = consoleSupport.bytesFromFile(source);
-
- if (_rawBytes == null) {
- // TODO(jmesserly): we should accept some kind of stream API too.
- // Unfortunately dart:io InputStream is async only, which won't work.
- throw ArgumentError("'source' must be a String or "
- "List<int> (of bytes). You can also pass a RandomAccessFile if you"
- "`import 'package:html/parser_console.dart'` and call "
- "`useConsole()`.");
- }
+ throw ArgumentError.value(
+ source, 'source', 'Must be a String or List<int>.');
}
// Detect encoding iff no explicit "transport level" encoding is supplied
@@ -111,7 +92,7 @@
_chars = <int>[];
if (_rawChars == null) {
- _rawChars = decodeBytes(charEncodingName, _rawBytes);
+ _rawChars = _decodeBytes(charEncodingName, _rawBytes);
}
bool skipNewline = false;
@@ -121,7 +102,7 @@
if (c == NEWLINE) continue;
}
- if (invalidUnicode(c)) errors.add('invalid-codepoint');
+ if (_invalidUnicode(c)) errors.add('invalid-codepoint');
if (0xD800 <= c && c <= 0xDFFF) {
c = 0xFFFD;
@@ -196,17 +177,9 @@
/// encoding otherwise return null.
String detectBOM() {
// Try detecting the BOM using bytes from the string
- if (hasUtf8Bom(_rawBytes)) {
+ if (_hasUtf8Bom(_rawBytes)) {
return 'utf-8';
}
- // Note: we don't need to remember whether it was big or little endian
- // because the decoder will do that later. It will also eat the BOM for us.
- if (hasUtf16Bom(_rawBytes)) {
- return 'utf-16';
- }
- if (hasUtf32Bom(_rawBytes)) {
- return 'utf-32';
- }
return null;
}
@@ -262,7 +235,7 @@
// TODO(jmesserly): the Python code used a regex to check for this. But
// Dart doesn't let you create a regexp with invalid characters.
-bool invalidUnicode(int c) {
+bool _invalidUnicode(int c) {
if (0x0001 <= c && c <= 0x0008) return true;
if (0x000E <= c && c <= 0x001F) return true;
if (0x007F <= c && c <= 0x009F) return true;
@@ -319,3 +292,32 @@
var canonicalName = encoding.replaceAll(asciiPunctuation, '').toLowerCase();
return encodings[canonicalName];
}
+
+/// Returns true if the [bytes] starts with a UTF-8 byte order mark.
+/// Since UTF-8 doesn't have byte order, it's somewhat of a misnomer, but it is
+/// used in HTML to detect the UTF-8 encoding.
+bool _hasUtf8Bom(List<int> bytes, [int offset = 0, int length]) {
+ int end = length != null ? offset + length : bytes.length;
+ return (offset + 3) <= end &&
+ bytes[offset] == 0xEF &&
+ bytes[offset + 1] == 0xBB &&
+ bytes[offset + 2] == 0xBF;
+}
+
+/// Decodes the [bytes] with the provided [encoding] and returns an iterable for
+/// the codepoints. Supports only the `ascii` and `utf-8` encodings; any other
+/// [encoding] throws an [ArgumentError].
+Iterable<int> _decodeBytes(String encoding, List<int> bytes) {
+ switch (encoding) {
+ case 'ascii':
+ return ascii.decode(bytes).runes;
+
+ case 'utf-8':
+ // NOTE: `utf8.decode` discards a leading UTF-8 BOM by default, so no
+ // explicit BOM stripping is needed here.
+ return utf8.decode(bytes).runes;
+
+ default:
+ throw ArgumentError('Encoding $encoding not supported');
+ }
+}
diff --git a/lib/src/tokenizer.dart b/lib/src/tokenizer.dart
index 48d6365..638663e 100644
--- a/lib/src/tokenizer.dart
+++ b/lib/src/tokenizer.dart
@@ -3,7 +3,7 @@
import 'dart:collection';
import 'package:html/parser.dart' show HtmlParser;
import 'constants.dart';
-import 'inputstream.dart';
+import 'html_input_stream.dart';
import 'token.dart';
import 'utils.dart';
diff --git a/pubspec.yaml b/pubspec.yaml
index e13c01a..4480d41 100644
--- a/pubspec.yaml
+++ b/pubspec.yaml
@@ -1,5 +1,5 @@
name: html
-version: 0.13.4+2
+version: 0.14.0+1
description: APIs for parsing and manipulating HTML content outside the browser.
author: Dart Team <misc@dartlang.org>
@@ -11,7 +11,6 @@
dependencies:
csslib: '>=0.13.2 <0.16.0'
source_span: '>=1.0.0 <2.0.0'
- utf: '>=0.9.0 <0.10.0'
dev_dependencies:
path: ^1.6.2
diff --git a/test/data/parser_feature/raw_file.html b/test/data/parser_feature/raw_file.html
deleted file mode 100644
index bcdbf76..0000000
--- a/test/data/parser_feature/raw_file.html
+++ /dev/null
@@ -1,6 +0,0 @@
-<!doctype html>
-<html>
-<body>
-Hello world!
-</body>
-</html>
diff --git a/test/parser_feature_test.dart b/test/parser_feature_test.dart
index 2591a2d..0889f44 100644
--- a/test/parser_feature_test.dart
+++ b/test/parser_feature_test.dart
@@ -1,13 +1,13 @@
/// Additional feature tests that aren't based on test data.
library parser_feature_test;
-import 'package:test/test.dart';
import 'package:html/dom.dart';
import 'package:html/parser.dart';
import 'package:html/src/constants.dart';
import 'package:html/src/encoding_parser.dart';
import 'package:html/src/treebuilder.dart';
import 'package:source_span/source_span.dart';
+import 'package:test/test.dart';
main() {
_testElementSpans();
diff --git a/test/parser_test.dart b/test/parser_test.dart
index 1289f61..1db1586 100644
--- a/test/parser_test.dart
+++ b/test/parser_test.dart
@@ -2,13 +2,12 @@
library parser_test;
import 'dart:convert';
-import 'dart:io';
-import 'package:path/path.dart' as pathos;
-import 'package:test/test.dart';
+
import 'package:html/dom.dart';
import 'package:html/parser.dart';
-import 'package:html/parser_console.dart' as parser_console;
-import 'package:html/src/inputstream.dart' as inputstream;
+import 'package:path/path.dart' as pathos;
+import 'package:test/test.dart';
+
import 'support.dart';
// Run the parse error checks
@@ -71,16 +70,6 @@
}
void main() {
- test('dart:io', () {
- // ensure IO support is unregistered
- expect(inputstream.consoleSupport,
- const TypeMatcher<inputstream.ConsoleSupport>());
- var file = File('$testDataDir/parser_feature/raw_file.html').openSync();
- expect(() => parse(file), throwsA(const TypeMatcher<ArgumentError>()));
- parser_console.useConsole();
- expect(parse(file).body.innerHtml.trim(), 'Hello world!');
- });
-
for (var path in getDataFiles('tree-construction')) {
if (!path.endsWith('.dat')) continue;
diff --git a/test/tokenizer_test.dart b/test/tokenizer_test.dart
index 744a496..59dd2aa 100644
--- a/test/tokenizer_test.dart
+++ b/test/tokenizer_test.dart
@@ -7,10 +7,8 @@
import 'dart:mirrors';
import 'package:path/path.dart' as pathos;
import 'package:test/test.dart';
-import 'package:html/src/char_encodings.dart';
import 'package:html/src/token.dart';
import 'package:html/src/tokenizer.dart';
-import 'package:utf/utf.dart';
import 'support.dart';
class TokenizerTestParser {
@@ -24,7 +22,7 @@
List parse(String str) {
// Note: we need to pass bytes to the tokenizer if we want it to handle BOM.
- var bytes = codepointsToUtf8(toCodepoints(str));
+ var bytes = utf8.encode(str);
var tokenizer = HtmlTokenizer(bytes, encoding: 'utf-8');
outputTokens = [];