[text] Rewrite the text plist parser to be like text/template/parser

This commit overhauls the text property list parser, reducing the cost in time, memory, and overall sanity required to parse text property list documents.

Herein, support is also added for:
* UTF-16 text property lists (#26)
* Proper scanning of UTF-8 codepoints
* Encoding conversion (UTF-16{BE,LE} +/- BOM -> UTF-8)
* Empty data values, <>
* Error messages that include line and column info (#25)
* Legacy strings file format (dictionary without { }) (#27)
* Shortcut strings file format (dictionaries without values) (#27)
* Short hex/unicode/octal escapes (\x2, \u33, \0)
* Empty documents parsing as empty dictionaries
* Detection of garbage after the end of a document

The character tables have been augmented with their own characterSet type, which allows them to report on their own membership (whether a given character is in the set). All characters outside the 0-255 range will be considered "not in set" for now.

In the benchmarks below, *Step(Parse|Decode) operate on a relatively small synthetic property list that contains every property list type. BigParse operates on a ~700kb binary property list created by converting the iTunes software update catalog from XML to GNUStep or OpenStep. Pretty benchmarks include whitespace.
benchmark                             old ns/op     new ns/op     delta
BenchmarkBigGNUStepParse-4            125008990     33544860      -73.17%
BenchmarkBigPrettyGNUStepParse-4      54869160      38049063      -30.65%
BenchmarkBigOpenStepParse-4           124436480     31491614      -74.69%
BenchmarkBigPrettyOpenStepParse-4     54080760      34542446      -36.13%
BenchmarkOpenStepParse-4              20177         13894         -31.14%
BenchmarkGNUStepParse-4               18742         15087         -19.50%

benchmark                             old allocs     new allocs     delta
BenchmarkBigGNUStepParse-4            2248154        120655         -94.63%
BenchmarkBigPrettyGNUStepParse-4      969515         120655         -87.56%
BenchmarkBigOpenStepParse-4           2251448        120655         -94.64%
BenchmarkBigPrettyOpenStepParse-4     969541         120655         -87.56%
BenchmarkOpenStepParse-4              234            44             -81.20%
BenchmarkGNUStepParse-4               186            47             -74.73%

benchmark                             old bytes     new bytes     delta
BenchmarkBigGNUStepParse-4            67633657      24006777      -64.50%
BenchmarkBigPrettyGNUStepParse-4      30100843      24006784      -20.25%
BenchmarkBigOpenStepParse-4           67657126      24023625      -64.49%
BenchmarkBigPrettyOpenStepParse-4     30101001      24023619      -20.19%
BenchmarkOpenStepParse-4              15376         10192         -33.71%
BenchmarkGNUStepParse-4               14992         10320         -31.16%

Fixes #25
Fixes #26
Fixes #27

commit: c17dcc5f37b9733ee759dc22a39d5087a3325457 [log] [tgz]
author: Dustin L. Howett <dustin@howett.net> Wed Apr 19 03:55:14 2017
committer: Dustin L. Howett <dustin@howett.net> Mon Apr 24 03:19:58 2017
tree: 8c1b607c825359970de3402fded85b80cd48e892
parent: c4a68d895d5012d89897ed90cccafa265acbad0f [diff]
diff --git a/invalid_text_test.go b/invalid_text_test.go
index c7c3dc5..8b5845e 100644
--- a/invalid_text_test.go
+++ b/invalid_text_test.go

@@ -23,13 +23,19 @@
 	{"Missing Equals in Dictionary", `{"A"A;}`},
 	{"Missing Semicolon in Dictionary", `{"A"=A}`},
 	{"Invalid GNUStep type", "<*F33>"},
-	{"Invalid GNUStep type data", "(<*I>"},
+	{"Invalid GNUStep int", "(<*I>"},
+	{"Invalid GNUStep date", "<*D5>"},
+	{"Truncated GNUStep value", "<*I3"},
 	{"Invalid data", "<EQ>"},
-	{"Truncated unicode escape", `"\u231"`},
-	{"Truncated hex escape", `"\x2"`},
-	{"Truncated octal escape", `"\02"`},
+	{"Truncated unicode escape", `"\u231`},
+	{"Truncated hex escape", `"\x2`},
+	{"Truncated octal escape", `"\02`},
 	{"Truncated data", `<33`},
+	{"Uneven data", `<3>`},
 	{"Truncated block comment", `/* hello`},
+	{"Truncated quoted string", `"hi`},
+	{"Garbage after end of non-string", "<ab> cde"},
+	{"Broken UTF-16", "\xFE\xFF\x01"},
 }
 
 func TestInvalidTextPlists(t *testing.T) {

diff --git a/text_generator.go b/text_generator.go
index 31eb9d6..53078ba 100644
--- a/text_generator.go
+++ b/text_generator.go

@@ -11,7 +11,7 @@
 	writer io.Writer
 	format int
 
-	quotableTable *[4]uint64
+	quotableTable *characterSet
 
 	indent string
 	depth  int
@@ -49,7 +49,7 @@
 			s += us
 		} else {
 			c := uint8(r)
-			if (*p.quotableTable)[c/64]&(1<<(c%64)) > 0 {
+			if p.quotableTable.ContainsByte(c) {
 				quot = true
 			}
 

diff --git a/text_parser.go b/text_parser.go
index 1cf2ad5..7e49d6f 100644
--- a/text_parser.go
+++ b/text_parser.go

@@ -1,26 +1,64 @@
 package plist
 
 import (
-	"bufio"
-	"encoding/hex"
+	"encoding/binary"
 	"errors"
+	"fmt"
 	"io"
+	"io/ioutil"
 	"runtime"
 	"strings"
 	"time"
+	"unicode/utf16"
+	"unicode/utf8"
 )
 
-type byteReader interface {
-	io.Reader
-	io.ByteScanner
-	Peek(n int) ([]byte, error)
-	ReadBytes(delim byte) ([]byte, error)
+type textPlistParser struct {
+	reader io.Reader
+	format int
+
+	input string
+	start int
+	pos   int
+	width int
 }
 
-type textPlistParser struct {
-	reader             byteReader
-	whitespaceReplacer *strings.Replacer
-	format             int
+func convertU16(buffer []byte, bo binary.ByteOrder) (string, error) {
+	if len(buffer)%2 != 0 {
+		return "", errors.New("truncated utf16")
+	}
+
+	tmp := make([]uint16, len(buffer)/2)
+	for i := 0; i < len(buffer); i += 2 {
+		tmp[i/2] = bo.Uint16(buffer[i : i+2])
+	}
+	return string(utf16.Decode(tmp)), nil
+}
+
+func guessEncodingAndConvert(buffer []byte) (string, error) {
+	if len(buffer) >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF {
+		// UTF-8 BOM
+		return zeroCopy8BitString(buffer, 3, len(buffer)-3), nil
+	} else if len(buffer) >= 2 {
+		// UTF-16 guesses
+
+		switch {
+		// stream is big-endian (BOM is FE FF or head is 00 XX)
+		case (buffer[0] == 0xFE && buffer[1] == 0xFF):
+			return convertU16(buffer[2:], binary.BigEndian)
+		case (buffer[0] == 0 && buffer[1] != 0):
+			return convertU16(buffer, binary.BigEndian)
+
+		// stream is little-endian (BOM is FF FE or head is XX 00)
+		case (buffer[0] == 0xFF && buffer[1] == 0xFE):
+			return convertU16(buffer[2:], binary.LittleEndian)
+		case (buffer[0] != 0 && buffer[1] == 0):
+			return convertU16(buffer, binary.LittleEndian)
+		}
+	}
+
+	// fallback: assume ASCII (not great!)
+	return zeroCopy8BitString(buffer, 0, len(buffer)), nil
 }
 
 func (p *textPlistParser) parseDocument() (pval cfValue, parseError error) {
@@ -29,209 +67,295 @@
 			if _, ok := r.(runtime.Error); ok {
 				panic(r)
 			}
-			if _, ok := r.(invalidPlistError); ok {
-				parseError = r.(error)
-			} else {
-				// Wrap all non-invalid-plist errors.
-				parseError = plistParseError{"text", r.(error)}
-			}
+			// Wrap every recovered error; runtime errors were re-panicked above.
+			parseError = plistParseError{"text", r.(error)}
 		}
 	}()
-	pval = p.parsePlistValue()
+
+	buffer, err := ioutil.ReadAll(p.reader)
+	if err != nil {
+		panic(err)
+	}
+
+	p.input, err = guessEncodingAndConvert(buffer)
+	if err != nil {
+		panic(err)
+	}
+
+	val := p.parsePlistValue()
+
+	p.skipWhitespaceAndComments()
+	if p.peek() != eof {
+		if _, ok := val.(cfString); !ok {
+			p.error("garbage after end of document")
+		}
+
+		p.start = 0
+		p.pos = 0
+		val = p.parseDictionary(true)
+	}
+
+	pval = val
+
 	return
 }
 
-func (p *textPlistParser) chugWhitespace() {
-ws:
+const eof rune = -1
+
+func (p *textPlistParser) error(e string, args ...interface{}) {
+	line := strings.Count(p.input[:p.pos], "\n")
+	char := p.pos - strings.LastIndex(p.input[:p.pos], "\n") - 1
+	panic(fmt.Errorf("%s at line %d character %d", fmt.Sprintf(e, args...), line, char))
+}
+
+func (p *textPlistParser) next() rune {
+	if int(p.pos) >= len(p.input) {
+		p.width = 0
+		return eof
+	}
+	r, w := utf8.DecodeRuneInString(p.input[p.pos:])
+	p.width = w
+	p.pos += p.width
+	return r
+}
+
+func (p *textPlistParser) backup() {
+	p.pos -= p.width
+}
+
+func (p *textPlistParser) peek() rune {
+	r := p.next()
+	p.backup()
+	return r
+}
+
+func (p *textPlistParser) emit() string {
+	s := p.input[p.start:p.pos]
+	p.start = p.pos
+	return s
+}
+
+func (p *textPlistParser) ignore() {
+	p.start = p.pos
+}
+
+func (p *textPlistParser) empty() bool {
+	return p.start == p.pos
+}
+
+func (p *textPlistParser) scanUntil(ch rune) {
+	if x := strings.IndexRune(p.input[p.pos:], ch); x >= 0 {
+		p.pos += x
+		return
+	}
+	p.pos = len(p.input)
+}
+
+func (p *textPlistParser) scanUntilAny(chs string) {
+	if x := strings.IndexAny(p.input[p.pos:], chs); x >= 0 {
+		p.pos += x
+		return
+	}
+	p.pos = len(p.input)
+}
+
+func (p *textPlistParser) scanCharactersInSet(ch *characterSet) {
+	for ch.Contains(p.next()) {
+	}
+	p.backup()
+}
+
+func (p *textPlistParser) scanCharactersNotInSet(ch *characterSet) {
+	var r rune
 	for {
-		c, err := p.reader.ReadByte()
-		if err != nil && err != io.EOF {
-			panic(err)
-		}
-		if whitespace[c/64]&(1<<(c%64)) == 0 {
-			if c == '/' && err != io.EOF {
-				// A / at the end of the file is not the begining of a comment.
-				cs, err := p.reader.Peek(1)
-				if err != nil && err != io.EOF {
-					panic(err)
-				}
-				if err == io.EOF {
-					return
-				}
-				c = cs[0]
-				switch c {
-				case '/':
-					for {
-						c, err = p.reader.ReadByte()
-						if err != nil && err != io.EOF {
-							panic(err)
-						} else if err == io.EOF {
-							break
-						}
-						// TODO: UTF-8
-						if c == '\n' || c == '\r' {
-							break
-						}
-					}
-				case '*':
-					// Peek returned a value here, so it is safe to read.
-					_, _ = p.reader.ReadByte()
-					star := false
-					for {
-						c, err = p.reader.ReadByte()
-						if err != nil {
-							panic(err)
-						}
-						if c == '*' {
-							star = true
-						} else if c == '/' && star {
-							break
-						} else {
-							star = false
-						}
-					}
-				default:
-					p.reader.UnreadByte() // Not the beginning of a // or /* comment
-					break ws
-				}
-				continue
-			}
-			p.reader.UnreadByte()
+		r = p.next()
+		if r == eof || ch.Contains(r) {
 			break
 		}
 	}
+	p.backup()
 }
 
-func (p *textPlistParser) parseQuotedString() cfString {
-	escaping := false
-	s := ""
+func (p *textPlistParser) skipWhitespaceAndComments() {
 	for {
-		byt, err := p.reader.ReadByte()
-		// EOF here is an error: we're inside a quoted string!
-		if err != nil {
-			panic(err)
-		}
-		c := rune(byt)
-		if !escaping {
-			if c == '"' {
-				break
-			} else if c == '\\' {
-				escaping = true
-				continue
+		p.scanCharactersInSet(&whitespace)
+		if strings.HasPrefix(p.input[p.pos:], "//") {
+			p.scanCharactersNotInSet(&newlineCharacterSet)
+		} else if strings.HasPrefix(p.input[p.pos:], "/*") {
+			if x := strings.Index(p.input[p.pos:], "*/"); x >= 0 {
+				p.pos += x + 2 // skip the */ as well
+				continue       // consume more whitespace
+			} else {
+				p.error("unexpected eof in block comment")
 			}
 		} else {
-			escaping = false
-			// Everything that is not listed here passes through unharmed.
-			switch c {
-			case 'a':
-				c = '\a'
-			case 'b':
-				c = '\b'
-			case 'v':
-				c = '\v'
-			case 'f':
-				c = '\f'
-			case 't':
-				c = '\t'
-			case 'r':
-				c = '\r'
-			case 'n':
-				c = '\n'
-			case 'x', 'u', 'U': // hex and unicode
-				l := 4
-				if c == 'x' {
-					l = 2
-				}
-				hex := make([]byte, l)
-				p.reader.Read(hex)
-				newc := mustParseInt(string(hex), 16, 16)
-				c = rune(newc)
-			case '0', '1', '2', '3', '4', '5', '6', '7': // octal!
-				oct := make([]byte, 3)
-				oct[0] = uint8(c)
-				p.reader.Read(oct[1:])
-				newc := mustParseInt(string(oct), 8, 16)
-				c = rune(newc)
-			}
+			break
 		}
-		s += string(c)
 	}
-	return cfString(s)
+	p.ignore()
+}
+
+func (p *textPlistParser) parseOctalDigits(max int) uint64 {
+	var val uint64
+
+	for i := 0; i < max; i++ {
+		r := p.next()
+
+		if r >= '0' && r <= '7' {
+			val <<= 3
+			val |= uint64((r - '0'))
+		} else {
+			p.backup()
+			break
+		}
+	}
+	return val
+}
+
+func (p *textPlistParser) parseHexDigits(max int) uint64 {
+	var val uint64
+
+	for i := 0; i < max; i++ {
+		r := p.next()
+
+		if r >= 'a' && r <= 'f' {
+			val <<= 4
+			val |= 10 + uint64((r - 'a'))
+		} else if r >= 'A' && r <= 'F' {
+			val <<= 4
+			val |= 10 + uint64((r - 'A'))
+		} else if r >= '0' && r <= '9' {
+			val <<= 4
+			val |= uint64((r - '0'))
+		} else {
+			p.backup()
+			break
+		}
+	}
+	return val
+}
+
+// the \ has already been consumed
+func (p *textPlistParser) parseEscape() string {
+	var s string
+	switch p.next() {
+	case 'a':
+		s = "\a"
+	case 'b':
+		s = "\b"
+	case 'v':
+		s = "\v"
+	case 'f':
+		s = "\f"
+	case 't':
+		s = "\t"
+	case 'r':
+		s = "\r"
+	case 'n':
+		s = "\n"
+	case '\\':
+		s = `\`
+	case '"':
+		s = `"`
+	case 'x':
+		s = string(rune(p.parseHexDigits(2)))
+	case 'u', 'U':
+		s = string(rune(p.parseHexDigits(4)))
+	case '0', '1', '2', '3', '4', '5', '6', '7':
+		p.backup() // we've already consumed one of the digits
+		s = string(rune(p.parseOctalDigits(3)))
+	default:
+		p.backup() // everything else should be accepted
+	}
+	p.ignore() // skip the entire escape sequence
+	return s
+}
+
+// the " has already been consumed
+func (p *textPlistParser) parseQuotedString() cfString {
+	p.ignore() // ignore the "
+
+	slowPath := false
+	s := ""
+
+	for {
+		p.scanUntilAny(`"\`)
+		switch p.peek() {
+		case eof:
+			p.error("unexpected eof in quoted string")
+		case '"':
+			section := p.emit()
+			p.pos++ // skip "
+			if !slowPath {
+				return cfString(section)
+			} else {
+				s += section
+				return cfString(s)
+			}
+		case '\\':
+			slowPath = true
+			s += p.emit()
+			p.next() // consume \
+			s += p.parseEscape()
+		}
+	}
 }
 
 func (p *textPlistParser) parseUnquotedString() cfString {
-	s := ""
-	for {
-		c, err := p.reader.ReadByte()
-		if err != nil {
-			if err == io.EOF {
-				break
-			}
-			panic(err)
-		}
-		// if we encounter a character that must be quoted, we're done.
-		// the GNUStep quote table is more lax here, so we use it instead of the OpenStep one.
-		if gsQuotable[c/64]&(1<<(c%64)) > 0 {
-			p.reader.UnreadByte()
-			break
-		}
-		s += string(c)
-	}
-
+	p.scanCharactersNotInSet(&gsQuotable)
+	s := p.emit()
 	if s == "" {
-		panic(errors.New("invalid unquoted string (found an unquoted character that should be quoted?)"))
+		p.error("invalid unquoted string (found an unquoted character that should be quoted?)")
 	}
 
 	return cfString(s)
 }
 
-func (p *textPlistParser) parseDictionary() *cfDictionary {
+// the { has already been consumed
+func (p *textPlistParser) parseDictionary(ignoreEof bool) *cfDictionary {
+	//p.ignore() // ignore the {
 	var keypv cfValue
 	keys := make([]string, 0, 32)
 	values := make([]cfValue, 0, 32)
+outer:
 	for {
-		p.chugWhitespace()
+		p.skipWhitespaceAndComments()
 
-		c, err := p.reader.ReadByte()
-		// EOF here is an error: we're inside a dictionary!
-		if err != nil {
-			panic(err)
-		}
-
-		if c == '}' {
-			break
-		} else if c == '"' {
+		switch p.next() {
+		case eof:
+			if !ignoreEof {
+				p.error("unexpected eof in dictionary")
+			}
+			fallthrough
+		case '}':
+			break outer
+		case '"':
 			keypv = p.parseQuotedString()
-		} else {
-			p.reader.UnreadByte() // Whoops, ate part of the string
+		default:
+			p.backup()
 			keypv = p.parseUnquotedString()
 		}
-		if keypv == nil {
-			// TODO better error
-			panic(errors.New("missing dictionary key"))
-		}
 
-		p.chugWhitespace()
-		c, err = p.reader.ReadByte()
-		if err != nil {
-			panic(err)
-		}
+		// INVARIANT: key can't be nil; parseQuoted and parseUnquoted
+		// will panic out before they return nil.
 
-		if c != '=' {
-			panic(errors.New("missing = in dictionary"))
-		}
+		p.skipWhitespaceAndComments()
 
-		// whitespace is guzzled within
-		val := p.parsePlistValue()
+		var val cfValue
+		n := p.next()
+		if n == ';' {
+			val = keypv
+		} else if n == '=' {
+			// whitespace is consumed within
+			val = p.parsePlistValue()
 
-		p.chugWhitespace()
-		c, err = p.reader.ReadByte()
-		if err != nil {
-			panic(err)
-		}
+			p.skipWhitespaceAndComments()
 
-		if c != ';' {
-			panic(errors.New("missing ; in dictionary"))
+			if p.next() != ';' {
+				p.error("missing ; in dictionary")
+			}
+		} else {
+			p.error("missing = in dictionary")
 		}
 
 		keys = append(keys, string(keypv.(cfString)))
@@ -241,23 +365,26 @@
 	return &cfDictionary{keys: keys, values: values}
 }
 
+// the ( has already been consumed
 func (p *textPlistParser) parseArray() *cfArray {
+	//p.ignore() // ignore the (
 	values := make([]cfValue, 0, 32)
+outer:
 	for {
-		c, err := p.reader.ReadByte()
-		// EOF here is an error: we're inside an array!
-		if err != nil {
-			panic(err)
+		p.skipWhitespaceAndComments()
+
+		switch p.next() {
+		case eof:
+			p.error("unexpected eof in array")
+		case ')':
+			break outer // done here
+		case ',':
+			continue // restart; ,) is valid and we don't want to blow it
+		default:
+			p.backup()
 		}
 
-		if c == ')' {
-			break
-		} else if c == ',' {
-			continue
-		}
-
-		p.reader.UnreadByte()
-		pval := p.parsePlistValue()
+		pval := p.parsePlistValue() // whitespace is consumed within
 		if str, ok := pval.(cfString); ok && string(str) == "" {
 			// Empty strings in arrays are apparently skipped?
 			// TODO: Figure out why this was implemented.
@@ -268,95 +395,121 @@
 	return &cfArray{values}
 }
 
-func (p *textPlistParser) parseGNUStepValue(v []byte) cfValue {
-	if len(v) < 3 {
-		panic(errors.New("invalid GNUStep extended value"))
+// the <* have already been consumed
+func (p *textPlistParser) parseGNUStepValue() cfValue {
+	typ := p.next()
+	p.ignore()
+	p.scanUntil('>')
+
+	if typ == eof || typ == '>' || p.empty() || p.peek() == eof {
+		p.error("invalid GNUStep extended value")
 	}
-	typ := v[1]
-	v = v[2:]
+
+	v := p.emit()
+	p.next() // consume the >
+
 	switch typ {
 	case 'I':
 		if v[0] == '-' {
-			n := mustParseInt(string(v), 10, 64)
+			n := mustParseInt(v, 10, 64)
 			return &cfNumber{signed: true, value: uint64(n)}
 		} else {
-			n := mustParseUint(string(v), 10, 64)
+			n := mustParseUint(v, 10, 64)
 			return &cfNumber{signed: false, value: n}
 		}
 	case 'R':
-		n := mustParseFloat(string(v), 64)
+		n := mustParseFloat(v, 64)
 		return &cfReal{wide: true, value: n} // TODO(DH) 32/64
 	case 'B':
 		b := v[0] == 'Y'
 		return cfBoolean(b)
 	case 'D':
-		t, err := time.Parse(textPlistTimeLayout, string(v))
+		t, err := time.Parse(textPlistTimeLayout, v)
 		if err != nil {
-			panic(err)
+			p.error(err.Error())
 		}
 
 		return cfDate(t.In(time.UTC))
 	}
-	panic(errors.New("invalid GNUStep type " + string(typ)))
+	p.error("invalid GNUStep type " + string(typ))
+	return nil
+}
+
+// The < has already been consumed
+func (p *textPlistParser) parseHexData() cfData {
+	buf := make([]byte, 256)
+	i := 0
+	c := 0
+
+	for {
+		r := p.next()
+		switch r {
+		case eof:
+			p.error("unexpected eof in data")
+		case '>':
+			if c&1 == 1 {
+				p.error("uneven number of hex digits in data")
+			}
+			p.ignore()
+			return cfData(buf[:i])
+		case ' ', '\t', '\n', '\r', '\u2028', '\u2029': // more lax than apple here: skip spaces
+			continue
+		}
+
+		buf[i] <<= 4
+		if r >= 'a' && r <= 'f' {
+			buf[i] |= 10 + byte((r - 'a'))
+		} else if r >= 'A' && r <= 'F' {
+			buf[i] |= 10 + byte((r - 'A'))
+		} else if r >= '0' && r <= '9' {
+			buf[i] |= byte((r - '0'))
+		} else {
+			p.error("unexpected hex digit `%c'", r)
+		}
+
+		c++
+		if c&1 == 0 {
+			i++
+			if i >= len(buf) {
+				realloc := make([]byte, len(buf)*2)
+				copy(realloc, buf)
+				buf = realloc
+			}
+		}
+	}
 }
 
 func (p *textPlistParser) parsePlistValue() cfValue {
 	for {
-		p.chugWhitespace()
+		p.skipWhitespaceAndComments()
 
-		c, err := p.reader.ReadByte()
-		if err != nil && err != io.EOF {
-			panic(err)
-		}
-		switch c {
+		switch p.next() {
+		case eof:
+			return &cfDictionary{}
 		case '<':
-			bytes, err := p.reader.ReadBytes('>')
-			if err != nil {
-				panic(err)
-			}
-			bytes = bytes[:len(bytes)-1]
-
-			if len(bytes) == 0 {
-				panic(errors.New("invalid empty angle-bracketed element"))
-			}
-
-			if bytes[0] == '*' {
+			if p.next() == '*' {
 				p.format = GNUStepFormat
-				return p.parseGNUStepValue(bytes)
-			} else {
-				s := p.whitespaceReplacer.Replace(string(bytes))
-				data, err := hex.DecodeString(s)
-				if err != nil {
-					panic(err)
-				}
-				return cfData(data)
+				return p.parseGNUStepValue()
 			}
+
+			p.backup()
+			return p.parseHexData()
 		case '"':
 			return p.parseQuotedString()
 		case '{':
-			return p.parseDictionary()
+			return p.parseDictionary(false)
 		case '(':
 			return p.parseArray()
 		default:
-			if gsQuotable[c/64]&(1<<(c%64)) > 0 {
-				panic(errors.New("unexpected non-quotable character at root level"))
-			}
-			p.reader.UnreadByte() // Place back in buffer for parseUnquotedString
+			p.backup()
 			return p.parseUnquotedString()
 		}
 	}
 }
 
 func newTextPlistParser(r io.Reader) *textPlistParser {
-	var reader byteReader
-	if rd, ok := r.(byteReader); ok {
-		reader = rd
-	} else {
-		reader = bufio.NewReader(r)
-	}
 	return &textPlistParser{
-		reader:             reader,
-		whitespaceReplacer: strings.NewReplacer("\t", "", "\n", "", " ", "", "\r", ""),
-		format:             OpenStepFormat,
+		reader: r,
+		format: OpenStepFormat,
 	}
 }

diff --git a/text_tables.go b/text_tables.go
index ec6586b..319c55c 100644
--- a/text_tables.go
+++ b/text_tables.go

@@ -1,9 +1,19 @@
 package plist
 
+type characterSet [4]uint64
+
+func (s *characterSet) Contains(ch rune) bool {
+	return ch >= 0 && ch <= 255 && s.ContainsByte(byte(ch))
+}
+
+func (s *characterSet) ContainsByte(ch byte) bool {
+	return (s[ch/64]&(1<<(ch%64)) > 0)
+}
+
 // Bitmap of characters that must be inside a quoted string
 // when written to an old-style property list
 // Low bits represent lower characters, and each uint64 represents 64 characters.
-var gsQuotable = [4]uint64{
+var gsQuotable = characterSet{
 	0x78001385ffffffff,
 	0xa800000138000000,
 	0xffffffffffffffff,
@@ -11,16 +21,23 @@
 }
 
 // 7f instead of 3f in the top line: CFOldStylePlist.c says . is valid, but they quote it.
-var osQuotable = [4]uint64{
+var osQuotable = characterSet{
 	0xf4007f6fffffffff,
 	0xf8000001f8000001,
 	0xffffffffffffffff,
 	0xffffffffffffffff,
 }
 
-var whitespace = [4]uint64{
+var whitespace = characterSet{
 	0x0000000100003f00,
 	0x0000000000000000,
 	0x0000000000000000,
 	0x0000000000000000,
 }
+
+var newlineCharacterSet = characterSet{
+	0x0000000000002400,
+	0x0000000000000000,
+	0x0000000000000000,
+	0x0000000000000000,
+}

diff --git a/text_test.go b/text_test.go
index c654037..95e87cf 100644
--- a/text_test.go
+++ b/text_test.go

@@ -61,12 +61,13 @@
 	{
 		Name: "Escapes",
 		Data: struct {
-			A, B, V, F, T, R, N, Hex1, Unicode1, Unicode2, Octal1 string
+			W, A, B, V, F, T, R, N, Hex1, Unicode1, Unicode2, Octal1 string
 		}{
-			"\a", "\b", "\v", "\f", "\t", "\r", "\n", "\u00ab", "\u00ac", "\u00ad", "\033",
+			"w", "\a", "\b", "\v", "\f", "\t", "\r", "\n", "\u00ab", "\u00ac", "\u00ad", "\033",
 		},
 		Expected: map[int][]byte{
 			OpenStepFormat: []byte(`{
+				W="\w";
 				A="\a";
 				B="\b";
 				V="\v";
@@ -168,6 +169,22 @@
 		SkipEncode: map[int]bool{OpenStepFormat: true},
 	},
 	{
+		Name: "Various Truncated Escapes",
+		Data: "\x01\x02\x03\x04\x057",
+		Expected: map[int][]byte{
+			OpenStepFormat: []byte(`"\x1\u02\U003\4\0057"`),
+		},
+		SkipEncode: map[int]bool{OpenStepFormat: true},
+	},
+	{
+		Name: "Various Case-Insensitive Escapes",
+		Data: "\u00AB\uCDEF",
+		Expected: map[int][]byte{
+			OpenStepFormat: []byte(`"\xaB\uCdEf"`),
+		},
+		SkipEncode: map[int]bool{OpenStepFormat: true},
+	},
+	{
 		Name: "Data long enough to trigger implementation-specific reallocation", // this is for coverage :(
 		Data: []byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01},
 		Expected: map[int][]byte{
@@ -175,6 +192,22 @@
 		},
 		SkipEncode: map[int]bool{OpenStepFormat: true},
 	},
+	{
+		Name: "Empty Document",
+		Data: map[string]interface{}{}, // Defined to be an empty dictionary
+		Expected: map[int][]byte{
+			OpenStepFormat: []byte{},
+		},
+		SkipEncode: map[int]bool{OpenStepFormat: true},
+	},
+	{
+		Name: "Document consisting of only whitespace",
+		Data: map[string]interface{}{}, // Defined to be an empty dictionary
+		Expected: map[int][]byte{
+			OpenStepFormat: []byte(" \n\t"),
+		},
+		SkipEncode: map[int]bool{OpenStepFormat: true},
+	},
 }
 
 func TestTextDecode(t *testing.T) {

diff --git a/zerocopy.go b/zerocopy.go
index 025c908..999f401 100644
--- a/zerocopy.go
+++ b/zerocopy.go

@@ -8,6 +8,10 @@
 )
 
 func zeroCopy8BitString(buf []byte, off int, len int) string {
+	if len == 0 {
+		return ""
+	}
+
 	var s string
 	hdr := (*reflect.StringHeader)(unsafe.Pointer(&s))
 	hdr.Data = uintptr(unsafe.Pointer(&buf[off]))
commit	c17dcc5f37b9733ee759dc22a39d5087a3325457	[log] [tgz]
author	Dustin L. Howett <dustin@howett.net>	Wed Apr 19 03:55:14 2017
committer	Dustin L. Howett <dustin@howett.net>	Mon Apr 24 03:19:58 2017
tree	8c1b607c825359970de3402fded85b80cd48e892
parent	c4a68d895d5012d89897ed90cccafa265acbad0f [diff]