[Python-checkins] r84505 - in python/branches/py3k: Lib/json/decoder.py Lib/json/scanner.py Lib/json/tests/test_decode.py Misc/NEWS Modules/_json.c

antoine.pitrou python-checkins at python.org
Sat Sep 4 22:16:54 CEST 2010


Author: antoine.pitrou
Date: Sat Sep 4 22:16:53 2010
New Revision: 84505
Log:
Issue #7451: Improve decoding performance of JSON objects, and reduce
the memory consumption of said decoded objects when they use the same
strings as keys.
Modified:
 python/branches/py3k/Lib/json/decoder.py
 python/branches/py3k/Lib/json/scanner.py
 python/branches/py3k/Lib/json/tests/test_decode.py
 python/branches/py3k/Misc/NEWS
 python/branches/py3k/Modules/_json.c
Modified: python/branches/py3k/Lib/json/decoder.py
==============================================================================
--- python/branches/py3k/Lib/json/decoder.py	(original)
+++ python/branches/py3k/Lib/json/decoder.py	Sat Sep 4 22:16:53 2010
@@ -147,10 +147,14 @@
 
 
 def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
- _w=WHITESPACE.match, _ws=WHITESPACE_STR):
+ memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
 s, end = s_and_end
 pairs = []
 pairs_append = pairs.append
+ # Backwards compatibility
+ if memo is None:
+ memo = {}
+ memo_get = memo.setdefault
 # Use a slice to prevent IndexError from being raised, the following
 # check will raise a more specific ValueError if the string is empty
 nextchar = s[end:end + 1]
@@ -167,6 +171,7 @@
 end += 1
 while True:
 key, end = scanstring(s, end, strict)
+ key = memo_get(key, key)
 # To skip some function call overhead we optimize the fast paths where
 # the JSON key separator is ": " or just ":".
 if s[end:end + 1] != ':':
@@ -214,7 +219,7 @@
 pairs = object_hook(pairs)
 return pairs, end
 
-def JSONArray(s_and_end, scan_once, context, _w=WHITESPACE.match):
+def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
 s, end = s_and_end
 values = []
 nextchar = s[end:end + 1]
@@ -314,6 +319,7 @@
 self.parse_object = JSONObject
 self.parse_array = JSONArray
 self.parse_string = scanstring
+ self.memo = {}
 self.scan_once = make_scanner(self)
 
 
Modified: python/branches/py3k/Lib/json/scanner.py
==============================================================================
--- python/branches/py3k/Lib/json/scanner.py	(original)
+++ python/branches/py3k/Lib/json/scanner.py	Sat Sep 4 22:16:53 2010
@@ -22,6 +22,8 @@
 parse_int = context.parse_int
 parse_constant = context.parse_constant
 object_hook = context.object_hook
+ object_pairs_hook = context.object_pairs_hook
+ memo = context.memo
 
 def _scan_once(string, idx):
 try:
@@ -33,7 +35,7 @@
 return parse_string(string, idx + 1, strict)
 elif nextchar == '{':
 return parse_object((string, idx + 1), strict,
- _scan_once, object_hook, object_pairs_hook)
+ _scan_once, object_hook, object_pairs_hook, memo)
 elif nextchar == '[':
 return parse_array((string, idx + 1), _scan_once)
 elif nextchar == 'n' and string[idx:idx + 4] == 'null':
@@ -60,6 +62,12 @@
 else:
 raise StopIteration
 
+ def scan_once(string, idx):
+ try:
+ return _scan_once(string, idx)
+ finally:
+ memo.clear()
+
 return _scan_once
 
 make_scanner = c_make_scanner or py_make_scanner
Modified: python/branches/py3k/Lib/json/tests/test_decode.py
==============================================================================
--- python/branches/py3k/Lib/json/tests/test_decode.py	(original)
+++ python/branches/py3k/Lib/json/tests/test_decode.py	Sat Sep 4 22:16:53 2010
@@ -1,10 +1,25 @@
 import decimal
 from unittest import TestCase
 from io import StringIO
+from contextlib import contextmanager
 
 import json
+import json.decoder
+import json.scanner
 from collections import OrderedDict
 
+
+@contextmanager
+def use_python_scanner():
+ py_scanner = json.scanner.py_make_scanner
+ old_scanner = json.decoder.make_scanner
+ json.decoder.make_scanner = py_scanner
+ try:
+ yield
+ finally:
+ json.decoder.make_scanner = old_scanner
+
+
 class TestDecode(TestCase):
 def test_decimal(self):
 rval = json.loads('1.1', parse_float=decimal.Decimal)
@@ -39,3 +54,16 @@
 # exercise the uncommon cases. The array cases are already covered.
 rval = json.loads('{ "key" : "value" , "k":"v" }')
 self.assertEquals(rval, {"key":"value", "k":"v"})
+
+ def check_keys_reuse(self, source, loads):
+ rval = loads(source)
+ (a, b), (c, d) = sorted(rval[0]), sorted(rval[1])
+ self.assertIs(a, c)
+ self.assertIs(b, d)
+
+ def test_keys_reuse(self):
+ s = '[{"a_key": 1, "b_\xe9": 2}, {"a_key": 3, "b_\xe9": 4}]'
+ self.check_keys_reuse(s, json.loads)
+ # Disabled: the pure Python version of json simply doesn't work
+ with use_python_scanner():
+ self.check_keys_reuse(s, json.decoder.JSONDecoder().decode)
Modified: python/branches/py3k/Misc/NEWS
==============================================================================
--- python/branches/py3k/Misc/NEWS	(original)
+++ python/branches/py3k/Misc/NEWS	Sat Sep 4 22:16:53 2010
@@ -165,6 +165,10 @@
 Library
 -------
 
+- Issue #7451: Improve decoding performance of JSON objects, and reduce
+ the memory consumption of said decoded objects when they use the same
+ strings as keys.
+
 - Issue #1100562: Fix deep-copying of objects derived from the list and
 dict types. Patch by Michele Orrù and Björn Lindqvist.
 
Modified: python/branches/py3k/Modules/_json.c
==============================================================================
--- python/branches/py3k/Modules/_json.c	(original)
+++ python/branches/py3k/Modules/_json.c	Sat Sep 4 22:16:53 2010
@@ -36,6 +36,7 @@
 PyObject *parse_float;
 PyObject *parse_int;
 PyObject *parse_constant;
+ PyObject *memo;
 } PyScannerObject;
 
 static PyMemberDef scanner_members[] = {
@@ -305,6 +306,21 @@
 return tpl;
 }
 
+#define APPEND_OLD_CHUNK \
+ if (chunk != NULL) { \
+ if (chunks == NULL) { \
+ chunks = PyList_New(0); \
+ if (chunks == NULL) { \
+ goto bail; \
+ } \
+ } \
+ if (PyList_Append(chunks, chunk)) { \
+ Py_DECREF(chunk); \
+ goto bail; \
+ } \
+ Py_CLEAR(chunk); \
+ }
+
 static PyObject *
 scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next_end_ptr)
 {
@@ -316,15 +332,14 @@
 
 Return value is a new PyUnicode
 */
- PyObject *rval;
+ PyObject *rval = NULL;
 Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
 Py_ssize_t begin = end - 1;
 Py_ssize_t next = begin;
 const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
- PyObject *chunks = PyList_New(0);
- if (chunks == NULL) {
- goto bail;
- }
+ PyObject *chunks = NULL;
+ PyObject *chunk = NULL;
+
 if (end < 0 || len <= end) {
 PyErr_SetString(PyExc_ValueError, "end is out of bounds");
 goto bail;
@@ -332,7 +347,6 @@
 while (1) {
 /* Find the end of the string or the next escape */
 Py_UNICODE c = 0;
- PyObject *chunk = NULL;
 for (next = end; next < len; next++) {
 c = buf[next];
 if (c == '"' || c == '\\') {
@@ -349,15 +363,11 @@
 }
 /* Pick up this chunk if it's not zero length */
 if (next != end) {
+ APPEND_OLD_CHUNK
 chunk = PyUnicode_FromUnicode(&buf[end], next - end);
 if (chunk == NULL) {
 goto bail;
 }
- if (PyList_Append(chunks, chunk)) {
- Py_DECREF(chunk);
- goto bail;
- }
- Py_DECREF(chunk);
 }
 next++;
 if (c == '"') {
@@ -459,27 +469,34 @@
 }
 #endif
 }
+ APPEND_OLD_CHUNK
 chunk = PyUnicode_FromUnicode(&c, 1);
 if (chunk == NULL) {
 goto bail;
 }
- if (PyList_Append(chunks, chunk)) {
- Py_DECREF(chunk);
+ }
+
+ if (chunks == NULL) {
+ if (chunk != NULL)
+ rval = chunk;
+ else
+ rval = PyUnicode_FromStringAndSize("", 0);
+ }
+ else {
+ APPEND_OLD_CHUNK
+ rval = join_list_unicode(chunks);
+ if (rval == NULL) {
 goto bail;
 }
- Py_DECREF(chunk);
+ Py_CLEAR(chunks);
 }
 
- rval = join_list_unicode(chunks);
- if (rval == NULL) {
- goto bail;
- }
- Py_DECREF(chunks);
 *next_end_ptr = end;
 return rval;
 bail:
 *next_end_ptr = -1;
 Py_XDECREF(chunks);
+ Py_XDECREF(chunk);
 return NULL;
 }
 
@@ -578,6 +595,7 @@
 Py_CLEAR(s->parse_float);
 Py_CLEAR(s->parse_int);
 Py_CLEAR(s->parse_constant);
+ Py_CLEAR(s->memo);
 return 0;
 }
 
@@ -593,10 +611,16 @@
 Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr);
 Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1;
 PyObject *val = NULL;
- PyObject *rval = PyList_New(0);
+ PyObject *rval = NULL;
 PyObject *key = NULL;
 int strict = PyObject_IsTrue(s->strict);
+ int has_pairs_hook = (s->object_pairs_hook != Py_None);
 Py_ssize_t next_idx;
+
+ if (has_pairs_hook)
+ rval = PyList_New(0);
+ else
+ rval = PyDict_New();
 if (rval == NULL)
 return NULL;
 
@@ -606,6 +630,8 @@
 /* only loop if the object is non-empty */
 if (idx <= end_idx && str[idx] != '}') {
 while (idx <= end_idx) {
+ PyObject *memokey;
+
 /* read key */
 if (str[idx] != '"') {
 raise_errmsg("Expecting property name", pystr, idx);
@@ -614,6 +640,16 @@
 key = scanstring_unicode(pystr, idx + 1, strict, &next_idx);
 if (key == NULL)
 goto bail;
+ memokey = PyDict_GetItem(s->memo, key);
+ if (memokey != NULL) {
+ Py_INCREF(memokey);
+ Py_DECREF(key);
+ key = memokey;
+ }
+ else {
+ if (PyDict_SetItem(s->memo, key, key) < 0)
+ goto bail;
+ }
 idx = next_idx;
 
 /* skip whitespace between key and : delimiter, read :, skip whitespace */
@@ -630,19 +666,24 @@
 if (val == NULL)
 goto bail;
 
- {
- PyObject *tuple = PyTuple_Pack(2, key, val);
- if (tuple == NULL)
+ if (has_pairs_hook) {
+ PyObject *item = PyTuple_Pack(2, key, val);
+ if (item == NULL)
 goto bail;
- if (PyList_Append(rval, tuple) == -1) {
- Py_DECREF(tuple);
+ Py_CLEAR(key);
+ Py_CLEAR(val);
+ if (PyList_Append(rval, item) == -1) {
+ Py_DECREF(item);
 goto bail;
 }
- Py_DECREF(tuple);
+ Py_DECREF(item);
+ }
+ else {
+ if (PyDict_SetItem(rval, key, val) < 0)
+ goto bail;
+ Py_CLEAR(key);
+ Py_CLEAR(val);
 }
-
- Py_CLEAR(key);
- Py_CLEAR(val);
 idx = next_idx;
 
 /* skip whitespace before } or , */
@@ -672,36 +713,23 @@
 
 *next_idx_ptr = idx + 1;
 
- if (s->object_pairs_hook != Py_None) {
+ if (has_pairs_hook) {
 val = PyObject_CallFunctionObjArgs(s->object_pairs_hook, rval, NULL);
- if (val == NULL)
- goto bail;
 Py_DECREF(rval);
 return val;
 }
 
- val = PyDict_New();
- if (val == NULL)
- goto bail;
- if (PyDict_MergeFromSeq2(val, rval, 1) == -1)
- goto bail;
- Py_DECREF(rval);
- rval = val;
-
 /* if object_hook is not None: rval = object_hook(rval) */
 if (s->object_hook != Py_None) {
 val = PyObject_CallFunctionObjArgs(s->object_hook, rval, NULL);
- if (val == NULL)
- goto bail;
 Py_DECREF(rval);
- rval = val;
- val = NULL;
+ return val;
 }
 return rval;
 bail:
 Py_XDECREF(key);
 Py_XDECREF(val);
- Py_DECREF(rval);
+ Py_XDECREF(rval);
 return NULL;
 }
 
@@ -988,6 +1016,9 @@
 Py_TYPE(pystr)->tp_name);
 return NULL;
 }
+ PyDict_Clear(s->memo);
+ if (rval == NULL)
+ return NULL;
 return _build_rval_index_tuple(rval, next_idx);
 }
 
@@ -1021,6 +1052,12 @@
 if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:make_scanner", kwlist, &ctx))
 return -1;
 
+ if (s->memo == NULL) {
+ s->memo = PyDict_New();
+ if (s->memo == NULL)
+ goto bail;
+ }
+
 /* All of these will fail "gracefully" so we don't need to verify them */
 s->strict = PyObject_GetAttrString(ctx, "strict");
 if (s->strict == NULL)


More information about the Python-checkins mailing list

AltStyle によって変換されたページ (->オリジナル) /