[text] Rewrite the text plist parser to be like text/template/parse
This commit overhauls the text property list parser, reducing the cost
in time and memory and overall sanity required to parse text property
list documents.
Herein, support is also added for:
* UTF-16 text property lists (#26)
* Proper scanning of UTF-8 codepoints
* Encoding conversion (UTF-16{BE,LE} +- BOM -> UTF-8)
* Empty data values, <>
* Error messages that include line and column info (#25)
* Legacy strings file format (dictionary without { }) (#27)
* Shortcut strings file format (dictionaries without values) (#27)
* Short hex/unicode/octal escapes (\x2, \u33, \0)
* Empty documents parsing as empty dictionaries
* Detection of garbage after the end of a document
The character tables have been augmented with their own characterSet
type, which lets each table answer membership queries itself. All characters
outside the 0-255 range will be considered "not in set" for now.
In the benchmarks below, *Step(Parse|Decode) operate on a relatively
small synthetic property list that contains every property list type.
BigParse operates on a ~700 kB text property list created by converting
the iTunes software update catalog from XML to GNUStep or OpenStep.
Pretty benchmarks include whitespace.
benchmark old ns/op new ns/op delta
BenchmarkBigGNUStepParse-4 125008990 33544860 -73.17%
BenchmarkBigPrettyGNUStepParse-4 54869160 38049063 -30.65%
BenchmarkBigOpenStepParse-4 124436480 31491614 -74.69%
BenchmarkBigPrettyOpenStepParse-4 54080760 34542446 -36.13%
BenchmarkOpenStepParse-4 20177 13894 -31.14%
BenchmarkGNUStepParse-4 18742 15087 -19.50%
benchmark old allocs new allocs delta
BenchmarkBigGNUStepParse-4 2248154 120655 -94.63%
BenchmarkBigPrettyGNUStepParse-4 969515 120655 -87.56%
BenchmarkBigOpenStepParse-4 2251448 120655 -94.64%
BenchmarkBigPrettyOpenStepParse-4 969541 120655 -87.56%
BenchmarkOpenStepParse-4 234 44 -81.20%
BenchmarkGNUStepParse-4 186 47 -74.73%
benchmark old bytes new bytes delta
BenchmarkBigGNUStepParse-4 67633657 24006777 -64.50%
BenchmarkBigPrettyGNUStepParse-4 30100843 24006784 -20.25%
BenchmarkBigOpenStepParse-4 67657126 24023625 -64.49%
BenchmarkBigPrettyOpenStepParse-4 30101001 24023619 -20.19%
BenchmarkOpenStepParse-4 15376 10192 -33.71%
BenchmarkGNUStepParse-4 14992 10320 -31.16%
Fixes #25
Fixes #26
Fixes #27
diff --git a/invalid_text_test.go b/invalid_text_test.go
index c7c3dc5..8b5845e 100644
--- a/invalid_text_test.go
+++ b/invalid_text_test.go
@@ -23,13 +23,19 @@
{"Missing Equals in Dictionary", `{"A"A;}`},
{"Missing Semicolon in Dictionary", `{"A"=A}`},
{"Invalid GNUStep type", "<*F33>"},
- {"Invalid GNUStep type data", "(<*I>"},
+ {"Invalid GNUStep int", "(<*I>"},
+ {"Invalid GNUStep date", "<*D5>"},
+ {"Truncated GNUStep value", "<*I3"},
{"Invalid data", "<EQ>"},
- {"Truncated unicode escape", `"\u231"`},
- {"Truncated hex escape", `"\x2"`},
- {"Truncated octal escape", `"\02"`},
+ {"Truncated unicode escape", `"\u231`},
+ {"Truncated hex escape", `"\x2`},
+ {"Truncated octal escape", `"\02`},
{"Truncated data", `<33`},
+ {"Uneven data", `<3>`},
{"Truncated block comment", `/* hello`},
+ {"Truncated quoted string", `"hi`},
+ {"Garbage after end of non-string", "<ab> cde"},
+ {"Broken UTF-16", "\xFE\xFF\x01"},
}
func TestInvalidTextPlists(t *testing.T) {
diff --git a/text_generator.go b/text_generator.go
index 31eb9d6..53078ba 100644
--- a/text_generator.go
+++ b/text_generator.go
@@ -11,7 +11,7 @@
writer io.Writer
format int
- quotableTable *[4]uint64
+ quotableTable *characterSet
indent string
depth int
@@ -49,7 +49,7 @@
s += us
} else {
c := uint8(r)
- if (*p.quotableTable)[c/64]&(1<<(c%64)) > 0 {
+ if p.quotableTable.ContainsByte(c) {
quot = true
}
diff --git a/text_parser.go b/text_parser.go
index 1cf2ad5..7e49d6f 100644
--- a/text_parser.go
+++ b/text_parser.go
@@ -1,26 +1,64 @@
package plist
import (
- "bufio"
- "encoding/hex"
+ "encoding/binary"
"errors"
+ "fmt"
"io"
+ "io/ioutil"
"runtime"
"strings"
"time"
+ "unicode/utf16"
+ "unicode/utf8"
)
-type byteReader interface {
- io.Reader
- io.ByteScanner
- Peek(n int) ([]byte, error)
- ReadBytes(delim byte) ([]byte, error)
+type textPlistParser struct {
+ reader io.Reader
+ format int
+
+ input string
+ start int
+ pos int
+ width int
}
-type textPlistParser struct {
- reader byteReader
- whitespaceReplacer *strings.Replacer
- format int
+func convertU16(buffer []byte, bo binary.ByteOrder) (string, error) {
+ if len(buffer)%2 != 0 {
+ return "", errors.New("truncated utf16")
+ }
+
+ tmp := make([]uint16, len(buffer)/2)
+ for i := 0; i < len(buffer); i += 2 {
+ tmp[i/2] = bo.Uint16(buffer[i : i+2])
+ }
+ return string(utf16.Decode(tmp)), nil
+}
+
+func guessEncodingAndConvert(buffer []byte) (string, error) {
+ if len(buffer) >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF {
+ // UTF-8 BOM
+ return zeroCopy8BitString(buffer, 3, len(buffer)-3), nil
+ } else if len(buffer) >= 2 {
+ // UTF-16 guesses
+
+ switch {
+ // stream is big-endian (BOM is FE FF or head is 00 XX)
+ case (buffer[0] == 0xFE && buffer[1] == 0xFF):
+ return convertU16(buffer[2:], binary.BigEndian)
+ case (buffer[0] == 0 && buffer[1] != 0):
+ return convertU16(buffer, binary.BigEndian)
+
+ // stream is little-endian (BOM is FF FE or head is XX 00)
+ case (buffer[0] == 0xFF && buffer[1] == 0xFE):
+ return convertU16(buffer[2:], binary.LittleEndian)
+ case (buffer[0] != 0 && buffer[1] == 0):
+ return convertU16(buffer, binary.LittleEndian)
+ }
+ }
+
+ // fallback: assume ASCII (not great!)
+ return zeroCopy8BitString(buffer, 0, len(buffer)), nil
}
func (p *textPlistParser) parseDocument() (pval cfValue, parseError error) {
@@ -29,209 +67,295 @@
if _, ok := r.(runtime.Error); ok {
panic(r)
}
- if _, ok := r.(invalidPlistError); ok {
- parseError = r.(error)
- } else {
- // Wrap all non-invalid-plist errors.
- parseError = plistParseError{"text", r.(error)}
- }
+ // Wrap all non-runtime errors as text plist parse errors.
+ parseError = plistParseError{"text", r.(error)}
}
}()
- pval = p.parsePlistValue()
+
+ buffer, err := ioutil.ReadAll(p.reader)
+ if err != nil {
+ panic(err)
+ }
+
+ p.input, err = guessEncodingAndConvert(buffer)
+ if err != nil {
+ panic(err)
+ }
+
+ val := p.parsePlistValue()
+
+ p.skipWhitespaceAndComments()
+ if p.peek() != eof {
+ if _, ok := val.(cfString); !ok {
+ p.error("garbage after end of document")
+ }
+
+ p.start = 0
+ p.pos = 0
+ val = p.parseDictionary(true)
+ }
+
+ pval = val
+
return
}
-func (p *textPlistParser) chugWhitespace() {
-ws:
+const eof rune = -1
+
+func (p *textPlistParser) error(e string, args ...interface{}) {
+ line := strings.Count(p.input[:p.pos], "\n")
+ char := p.pos - strings.LastIndex(p.input[:p.pos], "\n") - 1
+ panic(fmt.Errorf("%s at line %d character %d", fmt.Sprintf(e, args...), line, char))
+}
+
+func (p *textPlistParser) next() rune {
+ if int(p.pos) >= len(p.input) {
+ p.width = 0
+ return eof
+ }
+ r, w := utf8.DecodeRuneInString(p.input[p.pos:])
+ p.width = w
+ p.pos += p.width
+ return r
+}
+
+func (p *textPlistParser) backup() {
+ p.pos -= p.width
+}
+
+func (p *textPlistParser) peek() rune {
+ r := p.next()
+ p.backup()
+ return r
+}
+
+func (p *textPlistParser) emit() string {
+ s := p.input[p.start:p.pos]
+ p.start = p.pos
+ return s
+}
+
+func (p *textPlistParser) ignore() {
+ p.start = p.pos
+}
+
+func (p *textPlistParser) empty() bool {
+ return p.start == p.pos
+}
+
+func (p *textPlistParser) scanUntil(ch rune) {
+ if x := strings.IndexRune(p.input[p.pos:], ch); x >= 0 {
+ p.pos += x
+ return
+ }
+ p.pos = len(p.input)
+}
+
+func (p *textPlistParser) scanUntilAny(chs string) {
+ if x := strings.IndexAny(p.input[p.pos:], chs); x >= 0 {
+ p.pos += x
+ return
+ }
+ p.pos = len(p.input)
+}
+
+func (p *textPlistParser) scanCharactersInSet(ch *characterSet) {
+ for ch.Contains(p.next()) {
+ }
+ p.backup()
+}
+
+func (p *textPlistParser) scanCharactersNotInSet(ch *characterSet) {
+ var r rune
for {
- c, err := p.reader.ReadByte()
- if err != nil && err != io.EOF {
- panic(err)
- }
- if whitespace[c/64]&(1<<(c%64)) == 0 {
- if c == '/' && err != io.EOF {
- // A / at the end of the file is not the begining of a comment.
- cs, err := p.reader.Peek(1)
- if err != nil && err != io.EOF {
- panic(err)
- }
- if err == io.EOF {
- return
- }
- c = cs[0]
- switch c {
- case '/':
- for {
- c, err = p.reader.ReadByte()
- if err != nil && err != io.EOF {
- panic(err)
- } else if err == io.EOF {
- break
- }
- // TODO: UTF-8
- if c == '\n' || c == '\r' {
- break
- }
- }
- case '*':
- // Peek returned a value here, so it is safe to read.
- _, _ = p.reader.ReadByte()
- star := false
- for {
- c, err = p.reader.ReadByte()
- if err != nil {
- panic(err)
- }
- if c == '*' {
- star = true
- } else if c == '/' && star {
- break
- } else {
- star = false
- }
- }
- default:
- p.reader.UnreadByte() // Not the beginning of a // or /* comment
- break ws
- }
- continue
- }
- p.reader.UnreadByte()
+ r = p.next()
+ if r == eof || ch.Contains(r) {
break
}
}
+ p.backup()
}
-func (p *textPlistParser) parseQuotedString() cfString {
- escaping := false
- s := ""
+func (p *textPlistParser) skipWhitespaceAndComments() {
for {
- byt, err := p.reader.ReadByte()
- // EOF here is an error: we're inside a quoted string!
- if err != nil {
- panic(err)
- }
- c := rune(byt)
- if !escaping {
- if c == '"' {
- break
- } else if c == '\\' {
- escaping = true
- continue
+ p.scanCharactersInSet(&whitespace)
+ if strings.HasPrefix(p.input[p.pos:], "//") {
+ p.scanCharactersNotInSet(&newlineCharacterSet)
+ } else if strings.HasPrefix(p.input[p.pos:], "/*") {
+ if x := strings.Index(p.input[p.pos:], "*/"); x >= 0 {
+ p.pos += x + 2 // skip the */ as well
+ continue // consume more whitespace
+ } else {
+ p.error("unexpected eof in block comment")
}
} else {
- escaping = false
- // Everything that is not listed here passes through unharmed.
- switch c {
- case 'a':
- c = '\a'
- case 'b':
- c = '\b'
- case 'v':
- c = '\v'
- case 'f':
- c = '\f'
- case 't':
- c = '\t'
- case 'r':
- c = '\r'
- case 'n':
- c = '\n'
- case 'x', 'u', 'U': // hex and unicode
- l := 4
- if c == 'x' {
- l = 2
- }
- hex := make([]byte, l)
- p.reader.Read(hex)
- newc := mustParseInt(string(hex), 16, 16)
- c = rune(newc)
- case '0', '1', '2', '3', '4', '5', '6', '7': // octal!
- oct := make([]byte, 3)
- oct[0] = uint8(c)
- p.reader.Read(oct[1:])
- newc := mustParseInt(string(oct), 8, 16)
- c = rune(newc)
- }
+ break
}
- s += string(c)
}
- return cfString(s)
+ p.ignore()
+}
+
+func (p *textPlistParser) parseOctalDigits(max int) uint64 {
+ var val uint64
+
+ for i := 0; i < max; i++ {
+ r := p.next()
+
+ if r >= '0' && r <= '7' {
+ val <<= 3
+ val |= uint64((r - '0'))
+ } else {
+ p.backup()
+ break
+ }
+ }
+ return val
+}
+
+func (p *textPlistParser) parseHexDigits(max int) uint64 {
+ var val uint64
+
+ for i := 0; i < max; i++ {
+ r := p.next()
+
+ if r >= 'a' && r <= 'f' {
+ val <<= 4
+ val |= 10 + uint64((r - 'a'))
+ } else if r >= 'A' && r <= 'F' {
+ val <<= 4
+ val |= 10 + uint64((r - 'A'))
+ } else if r >= '0' && r <= '9' {
+ val <<= 4
+ val |= uint64((r - '0'))
+ } else {
+ p.backup()
+ break
+ }
+ }
+ return val
+}
+
+// the \ has already been consumed
+func (p *textPlistParser) parseEscape() string {
+ var s string
+ switch p.next() {
+ case 'a':
+ s = "\a"
+ case 'b':
+ s = "\b"
+ case 'v':
+ s = "\v"
+ case 'f':
+ s = "\f"
+ case 't':
+ s = "\t"
+ case 'r':
+ s = "\r"
+ case 'n':
+ s = "\n"
+ case '\\':
+ s = `\`
+ case '"':
+ s = `"`
+ case 'x':
+ s = string(rune(p.parseHexDigits(2)))
+ case 'u', 'U':
+ s = string(rune(p.parseHexDigits(4)))
+ case '0', '1', '2', '3', '4', '5', '6', '7':
+ p.backup() // we've already consumed one of the digits
+ s = string(rune(p.parseOctalDigits(3)))
+ default:
+ p.backup() // everything else should be accepted
+ }
+ p.ignore() // skip the entire escape sequence
+ return s
+}
+
+// the " has already been consumed
+func (p *textPlistParser) parseQuotedString() cfString {
+ p.ignore() // ignore the "
+
+ slowPath := false
+ s := ""
+
+ for {
+ p.scanUntilAny(`"\`)
+ switch p.peek() {
+ case eof:
+ p.error("unexpected eof in quoted string")
+ case '"':
+ section := p.emit()
+ p.pos++ // skip "
+ if !slowPath {
+ return cfString(section)
+ } else {
+ s += section
+ return cfString(s)
+ }
+ case '\\':
+ slowPath = true
+ s += p.emit()
+ p.next() // consume \
+ s += p.parseEscape()
+ }
+ }
}
func (p *textPlistParser) parseUnquotedString() cfString {
- s := ""
- for {
- c, err := p.reader.ReadByte()
- if err != nil {
- if err == io.EOF {
- break
- }
- panic(err)
- }
- // if we encounter a character that must be quoted, we're done.
- // the GNUStep quote table is more lax here, so we use it instead of the OpenStep one.
- if gsQuotable[c/64]&(1<<(c%64)) > 0 {
- p.reader.UnreadByte()
- break
- }
- s += string(c)
- }
-
+ p.scanCharactersNotInSet(&gsQuotable)
+ s := p.emit()
if s == "" {
- panic(errors.New("invalid unquoted string (found an unquoted character that should be quoted?)"))
+ p.error("invalid unquoted string (found an unquoted character that should be quoted?)")
}
return cfString(s)
}
-func (p *textPlistParser) parseDictionary() *cfDictionary {
+// the { has already been consumed
+func (p *textPlistParser) parseDictionary(ignoreEof bool) *cfDictionary {
+ //p.ignore() // ignore the {
var keypv cfValue
keys := make([]string, 0, 32)
values := make([]cfValue, 0, 32)
+outer:
for {
- p.chugWhitespace()
+ p.skipWhitespaceAndComments()
- c, err := p.reader.ReadByte()
- // EOF here is an error: we're inside a dictionary!
- if err != nil {
- panic(err)
- }
-
- if c == '}' {
- break
- } else if c == '"' {
+ switch p.next() {
+ case eof:
+ if !ignoreEof {
+ p.error("unexpected eof in dictionary")
+ }
+ fallthrough
+ case '}':
+ break outer
+ case '"':
keypv = p.parseQuotedString()
- } else {
- p.reader.UnreadByte() // Whoops, ate part of the string
+ default:
+ p.backup()
keypv = p.parseUnquotedString()
}
- if keypv == nil {
- // TODO better error
- panic(errors.New("missing dictionary key"))
- }
- p.chugWhitespace()
- c, err = p.reader.ReadByte()
- if err != nil {
- panic(err)
- }
+ // INVARIANT: key can't be nil; parseQuoted and parseUnquoted
+ // will panic out before they return nil.
- if c != '=' {
- panic(errors.New("missing = in dictionary"))
- }
+ p.skipWhitespaceAndComments()
- // whitespace is guzzled within
- val := p.parsePlistValue()
+ var val cfValue
+ n := p.next()
+ if n == ';' {
+ val = keypv
+ } else if n == '=' {
+ // whitespace is consumed within
+ val = p.parsePlistValue()
- p.chugWhitespace()
- c, err = p.reader.ReadByte()
- if err != nil {
- panic(err)
- }
+ p.skipWhitespaceAndComments()
- if c != ';' {
- panic(errors.New("missing ; in dictionary"))
+ if p.next() != ';' {
+ p.error("missing ; in dictionary")
+ }
+ } else {
+ p.error("missing = in dictionary")
}
keys = append(keys, string(keypv.(cfString)))
@@ -241,23 +365,26 @@
return &cfDictionary{keys: keys, values: values}
}
+// the ( has already been consumed
func (p *textPlistParser) parseArray() *cfArray {
+ //p.ignore() // ignore the (
values := make([]cfValue, 0, 32)
+outer:
for {
- c, err := p.reader.ReadByte()
- // EOF here is an error: we're inside an array!
- if err != nil {
- panic(err)
+ p.skipWhitespaceAndComments()
+
+ switch p.next() {
+ case eof:
+ p.error("unexpected eof in array")
+ case ')':
+ break outer // done here
+ case ',':
+ continue // restart; ,) is valid and we don't want to blow it
+ default:
+ p.backup()
}
- if c == ')' {
- break
- } else if c == ',' {
- continue
- }
-
- p.reader.UnreadByte()
- pval := p.parsePlistValue()
+ pval := p.parsePlistValue() // whitespace is consumed within
if str, ok := pval.(cfString); ok && string(str) == "" {
// Empty strings in arrays are apparently skipped?
// TODO: Figure out why this was implemented.
@@ -268,95 +395,121 @@
return &cfArray{values}
}
-func (p *textPlistParser) parseGNUStepValue(v []byte) cfValue {
- if len(v) < 3 {
- panic(errors.New("invalid GNUStep extended value"))
+// the <* have already been consumed
+func (p *textPlistParser) parseGNUStepValue() cfValue {
+ typ := p.next()
+ p.ignore()
+ p.scanUntil('>')
+
+ if typ == eof || typ == '>' || p.empty() || p.peek() == eof {
+ p.error("invalid GNUStep extended value")
}
- typ := v[1]
- v = v[2:]
+
+ v := p.emit()
+ p.next() // consume the >
+
switch typ {
case 'I':
if v[0] == '-' {
- n := mustParseInt(string(v), 10, 64)
+ n := mustParseInt(v, 10, 64)
return &cfNumber{signed: true, value: uint64(n)}
} else {
- n := mustParseUint(string(v), 10, 64)
+ n := mustParseUint(v, 10, 64)
return &cfNumber{signed: false, value: n}
}
case 'R':
- n := mustParseFloat(string(v), 64)
+ n := mustParseFloat(v, 64)
return &cfReal{wide: true, value: n} // TODO(DH) 32/64
case 'B':
b := v[0] == 'Y'
return cfBoolean(b)
case 'D':
- t, err := time.Parse(textPlistTimeLayout, string(v))
+ t, err := time.Parse(textPlistTimeLayout, v)
if err != nil {
- panic(err)
+ p.error(err.Error())
}
return cfDate(t.In(time.UTC))
}
- panic(errors.New("invalid GNUStep type " + string(typ)))
+ p.error("invalid GNUStep type " + string(typ))
+ return nil
+}
+
+// The < has already been consumed
+func (p *textPlistParser) parseHexData() cfData {
+ buf := make([]byte, 256)
+ i := 0
+ c := 0
+
+ for {
+ r := p.next()
+ switch r {
+ case eof:
+ p.error("unexpected eof in data")
+ case '>':
+ if c&1 == 1 {
+ p.error("uneven number of hex digits in data")
+ }
+ p.ignore()
+ return cfData(buf[:i])
+ case ' ', '\t', '\n', '\r', '\u2028', '\u2029': // more lax than apple here: skip spaces
+ continue
+ }
+
+ buf[i] <<= 4
+ if r >= 'a' && r <= 'f' {
+ buf[i] |= 10 + byte((r - 'a'))
+ } else if r >= 'A' && r <= 'F' {
+ buf[i] |= 10 + byte((r - 'A'))
+ } else if r >= '0' && r <= '9' {
+ buf[i] |= byte((r - '0'))
+ } else {
+ p.error("unexpected hex digit `%c'", r)
+ }
+
+ c++
+ if c&1 == 0 {
+ i++
+ if i >= len(buf) {
+ realloc := make([]byte, len(buf)*2)
+ copy(realloc, buf)
+ buf = realloc
+ }
+ }
+ }
}
func (p *textPlistParser) parsePlistValue() cfValue {
for {
- p.chugWhitespace()
+ p.skipWhitespaceAndComments()
- c, err := p.reader.ReadByte()
- if err != nil && err != io.EOF {
- panic(err)
- }
- switch c {
+ switch p.next() {
+ case eof:
+ return &cfDictionary{}
case '<':
- bytes, err := p.reader.ReadBytes('>')
- if err != nil {
- panic(err)
- }
- bytes = bytes[:len(bytes)-1]
-
- if len(bytes) == 0 {
- panic(errors.New("invalid empty angle-bracketed element"))
- }
-
- if bytes[0] == '*' {
+ if p.next() == '*' {
p.format = GNUStepFormat
- return p.parseGNUStepValue(bytes)
- } else {
- s := p.whitespaceReplacer.Replace(string(bytes))
- data, err := hex.DecodeString(s)
- if err != nil {
- panic(err)
- }
- return cfData(data)
+ return p.parseGNUStepValue()
}
+
+ p.backup()
+ return p.parseHexData()
case '"':
return p.parseQuotedString()
case '{':
- return p.parseDictionary()
+ return p.parseDictionary(false)
case '(':
return p.parseArray()
default:
- if gsQuotable[c/64]&(1<<(c%64)) > 0 {
- panic(errors.New("unexpected non-quotable character at root level"))
- }
- p.reader.UnreadByte() // Place back in buffer for parseUnquotedString
+ p.backup()
return p.parseUnquotedString()
}
}
}
func newTextPlistParser(r io.Reader) *textPlistParser {
- var reader byteReader
- if rd, ok := r.(byteReader); ok {
- reader = rd
- } else {
- reader = bufio.NewReader(r)
- }
return &textPlistParser{
- reader: reader,
- whitespaceReplacer: strings.NewReplacer("\t", "", "\n", "", " ", "", "\r", ""),
- format: OpenStepFormat,
+ reader: r,
+ format: OpenStepFormat,
}
}
diff --git a/text_tables.go b/text_tables.go
index ec6586b..319c55c 100644
--- a/text_tables.go
+++ b/text_tables.go
@@ -1,9 +1,19 @@
package plist
+type characterSet [4]uint64
+
+func (s *characterSet) Contains(ch rune) bool {
+ return ch >= 0 && ch <= 255 && s.ContainsByte(byte(ch))
+}
+
+func (s *characterSet) ContainsByte(ch byte) bool {
+ return (s[ch/64]&(1<<(ch%64)) > 0)
+}
+
// Bitmap of characters that must be inside a quoted string
// when written to an old-style property list
// Low bits represent lower characters, and each uint64 represents 64 characters.
-var gsQuotable = [4]uint64{
+var gsQuotable = characterSet{
0x78001385ffffffff,
0xa800000138000000,
0xffffffffffffffff,
@@ -11,16 +21,23 @@
}
// 7f instead of 3f in the top line: CFOldStylePlist.c says . is valid, but they quote it.
-var osQuotable = [4]uint64{
+var osQuotable = characterSet{
0xf4007f6fffffffff,
0xf8000001f8000001,
0xffffffffffffffff,
0xffffffffffffffff,
}
-var whitespace = [4]uint64{
+var whitespace = characterSet{
0x0000000100003f00,
0x0000000000000000,
0x0000000000000000,
0x0000000000000000,
}
+
+var newlineCharacterSet = characterSet{
+ 0x0000000000002400,
+ 0x0000000000000000,
+ 0x0000000000000000,
+ 0x0000000000000000,
+}
diff --git a/text_test.go b/text_test.go
index c654037..95e87cf 100644
--- a/text_test.go
+++ b/text_test.go
@@ -61,12 +61,13 @@
{
Name: "Escapes",
Data: struct {
- A, B, V, F, T, R, N, Hex1, Unicode1, Unicode2, Octal1 string
+ W, A, B, V, F, T, R, N, Hex1, Unicode1, Unicode2, Octal1 string
}{
- "\a", "\b", "\v", "\f", "\t", "\r", "\n", "\u00ab", "\u00ac", "\u00ad", "\033",
+ "w", "\a", "\b", "\v", "\f", "\t", "\r", "\n", "\u00ab", "\u00ac", "\u00ad", "\033",
},
Expected: map[int][]byte{
OpenStepFormat: []byte(`{
+ W="\w";
A="\a";
B="\b";
V="\v";
@@ -168,6 +169,22 @@
SkipEncode: map[int]bool{OpenStepFormat: true},
},
{
+ Name: "Various Truncated Escapes",
+ Data: "\x01\x02\x03\x04\x057",
+ Expected: map[int][]byte{
+ OpenStepFormat: []byte(`"\x1\u02\U003\4\0057"`),
+ },
+ SkipEncode: map[int]bool{OpenStepFormat: true},
+ },
+ {
+ Name: "Various Case-Insensitive Escapes",
+ Data: "\u00AB\uCDEF",
+ Expected: map[int][]byte{
+ OpenStepFormat: []byte(`"\xaB\uCdEf"`),
+ },
+ SkipEncode: map[int]bool{OpenStepFormat: true},
+ },
+ {
Name: "Data long enough to trigger implementation-specific reallocation", // this is for coverage :(
Data: []byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01},
Expected: map[int][]byte{
@@ -175,6 +192,22 @@
},
SkipEncode: map[int]bool{OpenStepFormat: true},
},
+ {
+ Name: "Empty Document",
+ Data: map[string]interface{}{}, // Defined to be an empty dictionary
+ Expected: map[int][]byte{
+ OpenStepFormat: []byte{},
+ },
+ SkipEncode: map[int]bool{OpenStepFormat: true},
+ },
+ {
+ Name: "Document consisting of only whitespace",
+ Data: map[string]interface{}{}, // Defined to be an empty dictionary
+ Expected: map[int][]byte{
+ OpenStepFormat: []byte(" \n\t"),
+ },
+ SkipEncode: map[int]bool{OpenStepFormat: true},
+ },
}
func TestTextDecode(t *testing.T) {
diff --git a/zerocopy.go b/zerocopy.go
index 025c908..999f401 100644
--- a/zerocopy.go
+++ b/zerocopy.go
@@ -8,6 +8,10 @@
)
func zeroCopy8BitString(buf []byte, off int, len int) string {
+ if len == 0 {
+ return ""
+ }
+
var s string
hdr := (*reflect.StringHeader)(unsafe.Pointer(&s))
hdr.Data = uintptr(unsafe.Pointer(&buf[off]))