// blob: 9da260bed305e3303b9381a0e9bd5888e50a98be [file] [log] [blame]
library encoding_parser;
import 'constants.dart';
import 'inputstream.dart';
// TODO(jmesserly): I converted StopIteration to StateError("No more elements").
// Seems strange to throw this from outside of an iterator though.
/// String-like object with an associated cursor position and various helper
/// methods for scanning.
///
/// The cursor starts just before the first byte (internally `-1`). Once the
/// cursor has reached or passed the end of the data, most members throw a
/// [StateError] ("No more elements"), which stands in for Python's
/// `StopIteration`.
class EncodingBytes {
  final String _bytes;
  int _position = -1;

  EncodingBytes(this._bytes);

  /// Number of bytes (characters) in the underlying data.
  int get length => _bytes.length;

  /// Advances the cursor by one and returns the byte it now points at.
  ///
  /// Throws a [StateError] when the cursor moves past the last byte.
  String next() {
    var p = _position = _position + 1;
    if (p >= length) {
      throw new StateError("No more elements");
    } else if (p < 0) {
      throw new RangeError(p);
    }
    return _bytes[p];
  }

  /// Moves the cursor back by one and returns the byte it now points at.
  ///
  /// Throws a [StateError] when the cursor is already at or past the end, and
  /// a [RangeError] when the cursor is before the first byte.
  String previous() {
    var p = _position;
    if (p >= length) {
      throw new StateError("No more elements");
    } else if (p < 0) {
      throw new RangeError(p);
    }
    _position = p = p - 1;
    return _bytes[p];
  }

  /// Sets the cursor to [value].
  ///
  /// The check is made against the *current* cursor, not [value]: a cursor
  /// that already ran off the end cannot be repositioned and throws a
  /// [StateError].
  set position(int value) {
    if (_position >= length) {
      throw new StateError("No more elements");
    }
    _position = value;
  }

  /// The current cursor, clamped to `0` while it is still before the first
  /// byte.
  ///
  /// Throws a [StateError] when the cursor is at or past the end.
  int get position {
    if (_position >= length) {
      throw new StateError("No more elements");
    }
    return _position >= 0 ? _position : 0;
  }

  /// The byte under the cursor. Throws like [position] at the end of data.
  String get currentByte => _bytes[position];

  /// Skip past a list of characters. Defaults to skipping [isWhitespace].
  ///
  /// Returns the first byte that is *not* skipped and leaves the cursor on
  /// it. When every remaining byte is skipped, moves the cursor to the end
  /// and returns null.
  String skipChars([CharPreciate skipChars]) {
    if (skipChars == null) skipChars = isWhitespace;
    var p = position; // use property for the error-checking
    while (p < length) {
      var c = _bytes[p];
      if (!skipChars(c)) {
        _position = p;
        return c;
      }
      p += 1;
    }
    _position = p;
    return null;
  }

  /// Skips forward until a byte accepted by [untilChars] is found.
  ///
  /// Returns that byte and leaves the cursor on it. When no remaining byte
  /// matches, moves the cursor to the end and returns null, exactly like
  /// [skipChars] does; subsequent reads of [position] then throw.
  String skipUntil(CharPreciate untilChars) {
    var p = position;
    while (p < length) {
      var c = _bytes[p];
      if (untilChars(c)) {
        _position = p;
        return c;
      }
      p += 1;
    }
    // Record that the data is exhausted so callers relying on the
    // end-of-data StateError (e.g. "take the rest of the value") see it.
    _position = p;
    return null;
  }

  /// Look for a sequence of bytes at the current position. If the bytes
  /// are found return true and advance the position to the byte after the
  /// match. Otherwise return false and leave the position alone.
  bool matchBytes(String bytes) {
    var p = position;
    // startsWith performs the "enough bytes left" check itself.
    if (!_bytes.startsWith(bytes, p)) {
      return false;
    }
    position += bytes.length;
    return true;
  }

  /// Look for the next sequence of bytes matching a given sequence. If
  /// a match is found advance the position to the last byte of the match
  /// and return true.
  ///
  /// Throws a [StateError] when [bytes] does not occur at or after the
  /// current position.
  bool jumpTo(String bytes) {
    var newPosition = _bytes.indexOf(bytes, position);
    if (newPosition < 0) {
      throw new StateError("No more elements");
    }
    _position = newPosition + bytes.length - 1;
    return true;
  }

  /// Returns the bytes from index [start] (inclusive) to [end] (exclusive).
  ///
  /// [end] defaults to [length]; a negative [end] counts from the end of the
  /// data.
  String slice(int start, [int end]) {
    if (end == null) end = length;
    if (end < 0) end += length;
    // String.substring takes an end *index*, not a length.
    return _bytes.substring(start, end);
  }
}
/// Signature of a dispatch handler: takes no arguments and reports a bool
/// result to the dispatcher.
typedef bool _MethodHandler();

/// Pairs a literal byte [pattern] with the [handler] to invoke for it.
class _DispatchEntry {
  _DispatchEntry(this.pattern, this.handler);

  /// The literal text this entry is registered for.
  final String pattern;

  /// The callback associated with [pattern].
  final _MethodHandler handler;
}
/// Mini parser for detecting character encoding from meta elements.
///
/// The input is scanned for `<meta>` tags carrying a `charset` attribute or
/// a `content` attribute with a `charset=` parameter. Comments, other markup
/// declarations and unrelated tags are skipped over.
class EncodingParser {
  /// The lower-cased input wrapped with a cursor.
  final EncodingBytes data;

  /// The detected encoding, or null while none has been found.
  String encoding;

  /// [bytes] - the data to work on for encoding detection.
  EncodingParser(List<int> bytes)
      // Note: this is intentionally interpreting bytes as codepoints.
      : data = new EncodingBytes(new String.fromCharCodes(bytes).toLowerCase());

  /// Scans [data] and returns the first recognised encoding, or null when
  /// the input ends without one.
  String getEncoding() {
    // Order matters: longer, more specific prefixes must be tried before
    // the bare "<".
    final methodDispatch = [
      new _DispatchEntry("<!--", handleComment),
      new _DispatchEntry("<meta", handleMeta),
      new _DispatchEntry("</", handlePossibleEndTag),
      new _DispatchEntry("<!", handleOther),
      new _DispatchEntry("<?", handleOther),
      new _DispatchEntry("<", handlePossibleStartTag),
    ];

    try {
      for (;;) {
        for (var dispatch in methodDispatch) {
          if (data.matchBytes(dispatch.pattern)) {
            var keepParsing = dispatch.handler();
            if (keepParsing) break;

            // We found an encoding. Stop.
            return encoding;
          }
        }
        data.position += 1;
      }
    } on StateError catch (_) {
      // Catch this here to match behavior of Python's StopIteration
      // TODO(jmesserly): refactor to not use exceptions
    }
    return encoding;
  }

  /// Skip over comments.
  ///
  /// Throws a [StateError] (via [EncodingBytes.jumpTo]) when the comment is
  /// never closed.
  bool handleComment() => data.jumpTo("-->");

  /// Inspects the attributes of a `<meta` tag for an encoding declaration.
  ///
  /// Returns false when an encoding was found and stored in [encoding], and
  /// true when scanning should continue.
  bool handleMeta() {
    if (!isWhitespace(data.currentByte)) {
      // "<meta" not followed by whitespace (e.g. "<metadata"): keep going.
      return true;
    }

    // We have a valid meta element we want to search for attributes
    while (true) {
      // Try to find the next attribute after the current position
      var attr = getAttribute();
      if (attr == null) return true;

      if (attr[0] == "charset") {
        var tentativeEncoding = attr[1];
        var codec = codecName(tentativeEncoding);
        if (codec != null) {
          encoding = codec;
          return false;
        }
      } else if (attr[0] == "content") {
        var contentParser = new ContentAttrParser(new EncodingBytes(attr[1]));
        var tentativeEncoding = contentParser.parse();
        var codec = codecName(tentativeEncoding);
        if (codec != null) {
          encoding = codec;
          return false;
        }
      }
    }
  }

  /// Handles `<` followed by something that may be a start tag name.
  bool handlePossibleStartTag() => handlePossibleTag(false);

  /// Handles `</` followed by something that may be an end tag name.
  bool handlePossibleEndTag() {
    data.next();
    return handlePossibleTag(true);
  }

  /// Skips over a tag and all of its attributes. Always returns true.
  ///
  /// [endTag] says whether the tag was introduced by `</`.
  bool handlePossibleTag(bool endTag) {
    if (!isLetter(data.currentByte)) {
      // If the next byte is not an ascii letter either ignore this
      // fragment (possible start tag case) or treat it according to
      // handleOther
      if (endTag) {
        data.previous();
        handleOther();
      }
      return true;
    }

    var c = data.skipUntil(isSpaceOrAngleBracket);
    if (c == "<") {
      // return to the first step in the overall "two step" algorithm
      // reprocessing the < byte
      data.previous();
    } else {
      // Read (and discard) all attributes
      var attr = getAttribute();
      while (attr != null) {
        attr = getAttribute();
      }
    }
    return true;
  }

  /// Skips to the next `>`.
  ///
  /// Throws a [StateError] (via [EncodingBytes.jumpTo]) when there is none.
  bool handleOther() => data.jumpTo(">");

  /// Return a name,value pair for the next attribute in the stream,
  /// if one is found, or null.
  ///
  /// Throws a [StateError] when the data runs out in the middle of an
  /// attribute.
  List<String> getAttribute() {
    // Step 1 (skip chars)
    var c = data.skipChars((x) => x == "/" || isWhitespace(x));
    // Step 2
    if (c == ">" || c == null) {
      return null;
    }
    // Step 3
    var attrName = new StringBuffer();
    var attrValue = new StringBuffer();
    // Step 4 attribute name
    // Note: c is never null inside this loop; next() throws at end of data
    // instead of returning null.
    while (true) {
      if (c == "=" && attrName.length > 0) {
        break;
      } else if (isWhitespace(c)) {
        // Step 6: skip whitespace and stay ON the first non-whitespace
        // byte, so that Step 7 can see whether it is "=". Advancing once
        // more here would step past the "=" of `name = value`.
        c = data.skipChars();
        break;
      } else if (c == "/" || c == ">") {
        return [attrName.toString(), ""];
      } else {
        attrName.write(_lower(c));
      }
      // Step 5
      c = data.next();
    }
    // Step 7
    if (c != "=") {
      // Not an assignment: hand the byte back for the next attribute.
      // (At end of data previous() throws StateError, ending the scan.)
      data.previous();
      return [attrName.toString(), ""];
    }
    // Step 8
    data.next();
    // Step 9
    c = data.skipChars();
    // Step 10
    if (c == "'" || c == '"') {
      // 10.1
      var quoteChar = c;
      while (true) {
        // 10.2
        c = data.next();
        if (c == quoteChar) {
          // 10.3
          data.next();
          return [attrName.toString(), attrValue.toString()];
        }
        // 10.4 / 10.5
        attrValue.write(_lower(c));
      }
    } else if (c == ">") {
      return [attrName.toString(), ""];
    } else if (c == null) {
      return null;
    } else {
      attrValue.write(_lower(c));
    }
    // Step 11
    while (true) {
      c = data.next();
      if (isSpaceOrAngleBracket(c)) {
        return [attrName.toString(), attrValue.toString()];
      }
      attrValue.write(_lower(c));
    }
  }

  /// Lower-cases [c] when it is a letter and returns it unchanged otherwise.
  static String _lower(String c) => isLetter(c) ? c.toLowerCase() : c;
}
/// Extracts the `charset=` parameter from the value of a `content`
/// attribute, e.g. `text/html; charset=utf-8`.
class ContentAttrParser {
  /// The attribute value being scanned.
  final EncodingBytes data;

  ContentAttrParser(this.data);

  /// Returns the text following `charset=` (quoted or unquoted), or null
  /// when [data] holds no well-formed charset parameter.
  String parse() {
    try {
      // Check if the attr name is charset
      // otherwise return
      data.jumpTo("charset");
      data.position += 1;
      data.skipChars();
      if (data.currentByte != "=") {
        // If there is no = sign keep looking for attrs
        return null;
      }
      data.position += 1;
      data.skipChars();
      // Look for an encoding between matching quote marks
      if (data.currentByte == '"' || data.currentByte == "'") {
        var quoteMark = data.currentByte;
        data.position += 1;
        var oldPosition = data.position;
        if (data.jumpTo(quoteMark)) {
          return data.slice(oldPosition, data.position);
        } else {
          return null;
        }
      } else {
        // Unquoted value
        var oldPosition = data.position;
        try {
          if (data.skipUntil(isWhitespace) == null) {
            // No terminating whitespace: the value runs to the end of the
            // data, so return the whole remaining value.
            return data.slice(oldPosition);
          }
          return data.slice(oldPosition, data.position);
        } on StateError catch (_) {
          // Return the whole remaining value
          return data.slice(oldPosition);
        }
      }
    } on StateError catch (_) {
      // Ran out of data before a complete charset parameter was seen.
      return null;
    }
  }
}
/// Whether [char] is `>`, `<`, or accepted by [isWhitespace].
bool isSpaceOrAngleBracket(String char) {
  switch (char) {
    case ">":
    case "<":
      return true;
  }
  // Neither angle bracket: defer to the whitespace test.
  return isWhitespace(char);
}
/// Signature of a test applied to a single character [char]; returns true
/// when the character is accepted. Used as the parameter type of
/// `EncodingBytes.skipChars` and `EncodingBytes.skipUntil`.
typedef bool CharPreciate(String char);