| """Implementation of JSONDecoder |
| """ |
| import re |
| import sys |
| import struct |
| |
| from simplejson.scanner import make_scanner |
| def _import_c_scanstring(): |
| try: |
| from simplejson._speedups import scanstring |
| return scanstring |
| except ImportError: |
| return None |
| c_scanstring = _import_c_scanstring() |
| |
| __all__ = ['JSONDecoder'] |
| |
# Regex flags shared by the patterns compiled in this module.
FLAGS = re.DOTALL | re.MULTILINE | re.VERBOSE
| |
| def _floatconstants(): |
| _BYTES = '7FF80000000000007FF0000000000000'.decode('hex') |
| # The struct module in Python 2.4 would get frexp() out of range here |
| # when an endian is specified in the format string. Fixed in Python 2.5+ |
| if sys.byteorder != 'big': |
| _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1] |
| nan, inf = struct.unpack('dd', _BYTES) |
| return nan, inf, -inf |
| |
| NaN, PosInf, NegInf = _floatconstants() |
| |
| |
class JSONDecodeError(ValueError):
    """Subclass of ValueError with the following additional properties:

    msg: The unformatted error message
    doc: The JSON document being parsed
    pos: The start index of doc where parsing failed
    end: The end index of doc where parsing failed (may be None)
    lineno: The line corresponding to pos
    colno: The column corresponding to pos
    endlineno: The line corresponding to end (may be None)
    endcolno: The column corresponding to end (may be None)

    The exception's own message (``str(exc)``) is the formatted text
    produced by ``errmsg(msg, doc, pos, end=end)``.

    """
    def __init__(self, msg, doc, pos, end=None):
        ValueError.__init__(self, errmsg(msg, doc, pos, end=end))
        self.msg = msg
        self.doc = doc
        self.pos = pos
        self.end = end
        self.lineno, self.colno = linecol(doc, pos)
        if end is not None:
            self.endlineno, self.endcolno = linecol(doc, end)
        else:
            self.endlineno, self.endcolno = None, None

    def __reduce__(self):
        # The inherited implementation rebuilds the exception from
        # ``self.args``, which only holds the single formatted message and
        # therefore cannot satisfy __init__'s (msg, doc, pos) signature.
        # Rebuild from the original constructor arguments instead so that
        # pickle and copy round-trips work.
        return self.__class__, (self.msg, self.doc, self.pos, self.end)
| |
| |
def linecol(doc, pos):
    """Translate a character offset into a ``(lineno, colno)`` pair.

    Both numbers are 1-based.  Lines are separated by ``'\\n'``.

    doc: The document the offset refers to
    pos: The character index into doc
    """
    lineno = doc.count('\n', 0, pos) + 1
    # rfind returns -1 when there is no preceding newline, so the first
    # line gets ``pos + 1`` and is numbered from 1 just like every later
    # line (previously the first line alone was numbered from 0).
    colno = pos - doc.rfind('\n', 0, pos)
    return lineno, colno
| |
| |
def errmsg(msg, doc, pos, end=None):
    """Format *msg* together with the location it refers to in *doc*.

    The location is given as line/column (computed by ``linecol``) plus the
    raw character index.  When *end* is not None the message describes the
    range from *pos* to *end* instead of a single position.
    """
    # Note that this function is called from _speedups
    start_line, start_col = linecol(doc, pos)
    if end is not None:
        stop_line, stop_col = linecol(doc, end)
        template = '%s: line %d column %d - line %d column %d (char %d - %d)'
        return template % (
            msg, start_line, start_col, stop_line, stop_col, pos, end)
    template = '%s: line %d column %d (char %d)'
    return template % (msg, start_line, start_col, pos)
| |
| |
# Maps the constant names to their float values; its __getitem__ is the
# default ``parse_constant`` of JSONDecoder.
_CONSTANTS = dict([
    ('NaN', NaN),
    ('Infinity', PosInf),
    ('-Infinity', NegInf),
])
| |
# Matches a (possibly empty) run of characters, captured as group 1, up to
# the first double quote, backslash or control character in the range
# 0x00-0x1f, which is captured as group 2.
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)

# Character following a backslash -> the character that escape stands for.
BACKSLASH = {
    '"': u'"',
    '\\': u'\\',
    '/': u'/',
    'b': u'\b',
    'f': u'\f',
    'n': u'\n',
    'r': u'\r',
    't': u'\t',
}

# Encoding py_scanstring falls back to when it is given ``encoding=None``.
DEFAULT_ENCODING = "utf-8"
| |
_HEXDIGITS = '0123456789abcdefABCDEF'


def _hex4(s, start):
    """Return the value of the four hex digits at ``s[start:start + 4]``.

    Returns ``None`` when fewer than four characters are available or when
    any of them is not a hexadecimal digit.  ``int(x, 16)`` on its own is
    too lenient for this job: it also accepts a sign, surrounding
    whitespace and a ``0x`` prefix, none of which may appear in a JSON
    ``\\uXXXX`` escape.
    """
    digits = s[start:start + 4]
    if len(digits) != 4:
        return None
    for digit in digits:
        if digit not in _HEXDIGITS:
            return None
    return int(digits, 16)


def py_scanstring(s, end, encoding=None, strict=True,
        _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Scan the string s for a JSON string. End is the index of the
    character in s after the quote that started the JSON string.
    Unescapes all valid JSON string escape sequences and raises
    JSONDecodeError (a ValueError subclass) on attempt to decode an invalid
    string. If strict is False then literal control characters are allowed
    in the string.

    encoding is used to decode chunks of s that are not already unicode;
    None selects DEFAULT_ENCODING.

    Returns a tuple of the decoded string and the index of the character in s
    after the end quote."""
    if encoding is None:
        encoding = DEFAULT_ENCODING
    chunks = []
    _append = chunks.append
    # Index of the opening quote, used when reporting unterminated strings
    begin = end - 1
    while 1:
        chunk = _m(s, end)
        if chunk is None:
            raise JSONDecodeError(
                "Unterminated string starting at", s, begin)
        end = chunk.end()
        content, terminator = chunk.groups()
        # Content contains zero or more unescaped string characters
        if content:
            if not isinstance(content, unicode):
                content = unicode(content, encoding)
            _append(content)
        # Terminator is the end of string, a literal control character,
        # or a backslash denoting that an escape sequence follows
        if terminator == '"':
            break
        elif terminator != '\\':
            if strict:
                msg = "Invalid control character %r at" % (terminator,)
                raise JSONDecodeError(msg, s, end)
            else:
                _append(terminator)
                continue
        try:
            esc = s[end]
        except IndexError:
            raise JSONDecodeError(
                "Unterminated string starting at", s, begin)
        # If not a unicode escape sequence, must be in the lookup table
        if esc != 'u':
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: " + repr(esc)
                raise JSONDecodeError(msg, s, end)
            end += 1
        else:
            # Unicode escape sequence: exactly four hex digits after the 'u'
            uni = _hex4(s, end + 1)
            if uni is None:
                msg = "Invalid \\uXXXX escape"
                raise JSONDecodeError(msg, s, end)
            next_end = end + 5
            # Check for surrogate pair on UCS-4 systems
            if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
                if not s[end + 5:end + 7] == '\\u':
                    raise JSONDecodeError(msg, s, end)
                uni2 = _hex4(s, end + 7)
                # The second half must be a low surrogate; combining the
                # high surrogate with anything else would produce a bogus
                # code point.
                if uni2 is None or not 0xdc00 <= uni2 <= 0xdfff:
                    raise JSONDecodeError(msg, s, end)
                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
                next_end += 6
            char = unichr(uni)
            end = next_end
        # Append the unescaped character
        _append(char)
    return u''.join(chunks), end
| |
| |
# Prefer the C implementation of scanstring when it could be imported,
# otherwise use the pure-Python one.
if c_scanstring:
    scanstring = c_scanstring
else:
    scanstring = py_scanstring

# The whitespace characters skipped between tokens, as a string for
# membership tests and as a compiled pattern matching a run of them.
WHITESPACE_STR = ' \t\n\r'
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
| |
def JSONObject(state, encoding, strict, scan_once, object_hook,
        object_pairs_hook, memo=None,
        _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the members of a JSON object and return ``(result, end)``.

    state: A ``(s, end)`` pair; ``s`` is the document and ``end`` the index
        at which scanning starts (presumably just past the opening ``{`` --
        confirm against the scanner that calls this).
    encoding, strict: Passed through to ``scanstring`` for every key.
    scan_once: Callable ``scan_once(s, idx)`` returning ``(value, end)``
        or raising StopIteration when no value starts at ``idx``.
    object_hook: If not None, called with the finished dict; its return
        value is used as the result.
    object_pairs_hook: If not None, called with the list of
        ``(key, value)`` pairs; takes priority over object_hook.
    memo: Optional dict used to share one object between equal keys.

    Returns the decoded object and the index just past the closing ``}``.
    Raises JSONDecodeError when the text is not a well-formed object.
    """
    # A single positional parameter unpacked here replaces the former
    # ``(s, end)`` tuple parameter, which is a syntax error on Python 3.
    # Callers pass the same two-item sequence as before.
    s, end = state
    # Backwards compatibility
    if memo is None:
        memo = {}
    memo_get = memo.setdefault
    pairs = []
    # Use a slice to prevent IndexError from being raised, the following
    # check will raise a more specific ValueError if the string is empty
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object
        if nextchar == '}':
            if object_pairs_hook is not None:
                result = object_pairs_hook(pairs)
                return result, end + 1
            pairs = {}
            if object_hook is not None:
                pairs = object_hook(pairs)
            return pairs, end + 1
        elif nextchar != '"':
            raise JSONDecodeError(
                "Expecting property name enclosed in double quotes",
                s, end)
    end += 1
    while True:
        key, end = scanstring(s, end, encoding, strict)
        # Reuse the first object seen for each distinct key
        key = memo_get(key, key)

        # To skip some function call overhead we optimize the fast paths where
        # the JSON key separator is ": " or just ":".
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise JSONDecodeError("Expecting ':' delimiter", s, end)

        end += 1

        # Skip whitespace before the value; running off the end of the
        # document is left for scan_once to report.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise JSONDecodeError("Expecting object", s, end)
        pairs.append((key, value))

        try:
            nextchar = s[end]
            if nextchar in _ws:
                end = _w(s, end + 1).end()
                nextchar = s[end]
        except IndexError:
            nextchar = ''
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            raise JSONDecodeError("Expecting ',' delimiter", s, end - 1)

        try:
            nextchar = s[end]
            if nextchar in _ws:
                end += 1
                nextchar = s[end]
                if nextchar in _ws:
                    end = _w(s, end + 1).end()
                    nextchar = s[end]
        except IndexError:
            nextchar = ''

        end += 1
        if nextchar != '"':
            raise JSONDecodeError(
                "Expecting property name enclosed in double quotes",
                s, end - 1)

    if object_pairs_hook is not None:
        result = object_pairs_hook(pairs)
        return result, end
    pairs = dict(pairs)
    if object_hook is not None:
        pairs = object_hook(pairs)
    return pairs, end
| |
def JSONArray(state, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse the elements of a JSON array and return ``(values, end)``.

    state: A ``(s, end)`` pair; ``s`` is the document and ``end`` the index
        at which scanning starts (presumably just past the opening ``[`` --
        confirm against the scanner that calls this).
    scan_once: Callable ``scan_once(s, idx)`` returning ``(value, end)``
        or raising StopIteration when no value starts at ``idx``.

    Returns the list of decoded values and the index just past the closing
    ``]``.  Raises JSONDecodeError when the text is not a well-formed array.
    """
    # A single positional parameter unpacked here replaces the former
    # ``(s, end)`` tuple parameter, which is a syntax error on Python 3.
    # Callers pass the same two-item sequence as before.
    s, end = state
    values = []
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end + 1).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration:
            raise JSONDecodeError("Expecting object", s, end)
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end + 1).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            # end has already been advanced past nextchar, so step back one
            # to report the offending character itself (this matches the
            # position JSONObject reports for the same error).
            raise JSONDecodeError("Expecting ',' delimiter", s, end - 1)

        # Skip whitespace before the next element; running off the end of
        # the document is left for scan_once to report.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end + 1).end()
        except IndexError:
            pass

    return values, end
| |
class JSONDecoder(object):
    """Decoder for JSON <http://json.org> documents.

    Unless customised through the constructor arguments, JSON values are
    turned into the following Python values:

    +---------------+-------------------+
    | JSON          | Python            |
    +===============+===================+
    | object        | dict              |
    +---------------+-------------------+
    | array         | list              |
    +---------------+-------------------+
    | string        | unicode           |
    +---------------+-------------------+
    | number (int)  | int, long         |
    +---------------+-------------------+
    | number (real) | float             |
    +---------------+-------------------+
    | true          | True              |
    +---------------+-------------------+
    | false         | False             |
    +---------------+-------------------+
    | null          | None              |
    +---------------+-------------------+

    As an extension to the JSON spec, ``NaN``, ``Infinity`` and
    ``-Infinity`` are also accepted and decoded to the matching ``float``
    values.

    """

    def __init__(self, encoding=None, object_hook=None, parse_float=None,
            parse_int=None, parse_constant=None, strict=True,
            object_pairs_hook=None):
        """
        *encoding* is the encoding used to interpret :class:`str` input
        (``'utf-8'`` by default); it is ignored for :class:`unicode` input.
        Only encodings that are a superset of ASCII currently work; text in
        any other encoding should be handed over as :class:`unicode`.

        *object_hook*, when given, is called with each decoded JSON object
        (a :class:`dict`) and whatever it returns is used in place of that
        dict.  This allows custom deserialization, e.g. JSON-RPC class
        hinting.

        *object_pairs_hook*, when given, is called with the ordered list of
        key/value pairs of each decoded object literal, and its return
        value is used instead of the :class:`dict`.  Decoders that care
        about the order in which pairs appear can be built on it (for
        example :func:`collections.OrderedDict` remembers insertion order).
        It takes priority over *object_hook* when both are supplied.

        *parse_float*, when given, is called with the string of every JSON
        float to decode.  The default behaves like ``float(num_str)``; pass
        e.g. :class:`decimal.Decimal` to get a different type.

        *parse_int*, when given, is called with the string of every JSON
        int to decode.  The default behaves like ``int(num_str)``; pass
        e.g. :class:`float` to get a different type.

        *parse_constant*, when given, is called with one of the strings
        ``'-Infinity'``, ``'Infinity'`` or ``'NaN'``.  It can be used to
        raise an exception when such non-standard numbers are encountered.

        *strict* decides what happens when a string contains an unescaped
        control character: with ``True`` (the default) that is a parse
        error, with ``False`` the character is accepted.

        """
        self.encoding = encoding
        self.strict = strict
        self.object_hook = object_hook
        self.object_pairs_hook = object_pairs_hook
        # Any falsy value for a parse_* argument selects the default parser.
        self.parse_int = parse_int or int
        self.parse_float = parse_float or float
        self.parse_constant = parse_constant or _CONSTANTS.__getitem__
        self.parse_string = scanstring
        self.parse_array = JSONArray
        self.parse_object = JSONObject
        self.memo = {}
        # Built last: the scanner is created from this (by then fully
        # configured) decoder instance.
        self.scan_once = make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Decode ``s`` (a ``str`` or ``unicode`` instance holding a JSON
        document) and return the resulting Python object.

        Raises JSONDecodeError if anything other than whitespace follows
        the document.

        """
        value, stop = self.raw_decode(s)
        stop = _w(s, stop).end()
        if stop == len(s):
            return value
        raise JSONDecodeError("Extra data", s, stop, len(s))

    def raw_decode(self, s, idx=0, _w=WHITESPACE.match):
        """Decode the JSON document that ``s`` (a ``str`` or ``unicode``)
        starts with and return a 2-tuple: the Python object and the index
        in ``s`` at which the document ended.  ``idx`` optionally gives the
        offset in ``s`` where the document begins.

        Unlike ``decode`` this tolerates extraneous data after the
        document.

        """
        start = _w(s, idx).end()
        try:
            scanned = self.scan_once(s, idx=start)
        except StopIteration:
            raise JSONDecodeError("No JSON object could be decoded", s, idx)
        value, stop = scanned
        return value, stop