[Python-checkins] cpython: Issue #12778: Reduce memory consumption when JSON-encoding a large container of many small objects

antoine.pitrou python-checkins at python.org
Fri Aug 19 18:05:54 CEST 2011


http://hg.python.org/cpython/rev/47176e8d7060
changeset: 71967:47176e8d7060
user: Antoine Pitrou <solipsis at pitrou.net>
date: Fri Aug 19 18:03:14 2011 +0200
summary:
 Issue #12778: Reduce memory consumption when JSON-encoding a large container of many small objects.
files:
  Lib/test/json_tests/test_dump.py |  19 +-
  Misc/NEWS                        |   3 +
  Modules/_json.c                  | 203 ++++++++++++++----
  3 files changed, 175 insertions(+), 50 deletions(-)
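
For context, the change targets calls like the one below, where json.dumps()
is handed a container whose elements each encode to a tiny text fragment, so
the C encoder used to build millions of short str objects before joining them
at the end. A rough illustration (the list size is arbitrary and the exact
savings depend on the platform allocator; this snippet is not part of the
patch):

    import json

    # A large container of many small objects: every element encodes to a
    # few characters ("1", ", "), so the encoder produces one tiny fragment
    # per element before joining them into the final result.
    data = [1] * (10 * 1024 * 1024)   # illustrative size only
    encoded = json.dumps(data)
    print(len(encoded))               # about 3 characters per element
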
diff --git a/Lib/test/json_tests/test_dump.py b/Lib/test/json_tests/test_dump.py
--- a/Lib/test/json_tests/test_dump.py
+++ b/Lib/test/json_tests/test_dump.py
@@ -1,6 +1,7 @@
 from io import StringIO
 from test.json_tests import PyTest, CTest
 
+from test.support import precisionbigmemtest, _1G
 
 class TestDump:
     def test_dump(self):
@@ -21,4 +22,20 @@
 
 
 class TestPyDump(TestDump, PyTest): pass
-class TestCDump(TestDump, CTest): pass
+
+class TestCDump(TestDump, CTest):
+
+    # The size requirement here is hopefully over-estimated (actual
+    # memory consumption depending on implementation details, and also
+    # system memory management, since this may allocate a lot of
+    # small objects).
+
+    @precisionbigmemtest(size=_1G, memuse=1)
+    def test_large_list(self, size):
+        N = int(30 * 1024 * 1024 * (size / _1G))
+        l = [1] * N
+        encoded = self.dumps(l)
+        self.assertEqual(len(encoded), N * 3)
+        self.assertEqual(encoded[:1], "[")
+        self.assertEqual(encoded[-2:], "1]")
+        self.assertEqual(encoded[1:-2], "1, " * (N - 1))
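
The arithmetic the new test relies on can be checked with a tiny N instead of
the big-memory value (a quick sanity check, not part of the patch):

    import json

    # json.dumps([1] * N) == "[" + "1, " * (N - 1) + "1]", whose length is
    # 1 + 3 * (N - 1) + 2 == 3 * N, matching the assertions above.
    N = 5
    encoded = json.dumps([1] * N)
    assert encoded == "[" + "1, " * (N - 1) + "1]"
    assert len(encoded) == N * 3
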
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -262,6 +262,9 @@
 Library
 -------
 
+- Issue #12778: Reduce memory consumption when JSON-encoding a large
+  container of many small objects.
+
 - Issue #12650: Fix a race condition where a subprocess.Popen could leak
   resources (FD/zombie) when killed at the wrong time.
 
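The _json.c changes below introduce a small two-level accumulator: encoded
fragments are appended to a "small" list, and once 100000 of them are pending
they are joined into one string and moved to a "large" list, which is what the
C encoder's _iterencode finally returns. A rough Python analogue of the idea
(the class and method names are only illustrative; the 100000 threshold and
the large/small split mirror the C code):

    class Accumulator:
        # Collect small strings; once FLUSH_THRESHOLD of them are pending,
        # join them into one big string and move it to `large`.  At roughly
        # 64 bytes of per-object overhead on 64-bit builds, 100000 pending
        # fragments would otherwise waste more than 6 MB compared to a
        # single concatenated string.
        FLUSH_THRESHOLD = 100000

        def __init__(self):
            self.large = []   # previously accumulated large strings
            self.small = []   # pending small strings

        def accumulate(self, s):
            self.small.append(s)
            if len(self.small) >= self.FLUSH_THRESHOLD:
                self._flush()

        def _flush(self):
            if self.small:
                self.large.append("".join(self.small))
                del self.small[:]   # like PyList_SetSlice(small, 0, n, NULL)

        def finish(self):
            # Counterpart of finish_accumulator(): flush what is pending and
            # hand back the list of large chunks.
            self._flush()
            result, self.large = self.large, []
            return result
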
diff --git a/Modules/_json.c b/Modules/_json.c
--- a/Modules/_json.c
+++ b/Modules/_json.c
@@ -75,6 +75,121 @@
     {NULL}
 };
 
+/*
+ * A two-level accumulator of unicode objects that avoids both the overhead
+ * of keeping a huge number of small separate objects, and the quadratic
+ * behaviour of using a naive repeated concatenation scheme.
+ */
+
+typedef struct {
+    PyObject *large;  /* A list of previously accumulated large strings */
+    PyObject *small;  /* Pending small strings */
+} accumulator;
+
+static PyObject *
+join_list_unicode(PyObject *lst)
+{
+    /* return u''.join(lst) */
+    static PyObject *sep = NULL;
+    if (sep == NULL) {
+        sep = PyUnicode_FromStringAndSize("", 0);
+        if (sep == NULL)
+            return NULL;
+    }
+    return PyUnicode_Join(sep, lst);
+}
+
+static int
+init_accumulator(accumulator *acc)
+{
+    acc->large = PyList_New(0);
+    if (acc->large == NULL)
+        return -1;
+    acc->small = PyList_New(0);
+    if (acc->small == NULL) {
+        Py_CLEAR(acc->large);
+        return -1;
+    }
+    return 0;
+}
+
+static int
+flush_accumulator(accumulator *acc)
+{
+    Py_ssize_t nsmall = PyList_GET_SIZE(acc->small);
+    if (nsmall) {
+        int ret;
+        PyObject *joined = join_list_unicode(acc->small);
+        if (joined == NULL)
+            return -1;
+        if (PyList_SetSlice(acc->small, 0, nsmall, NULL)) {
+            Py_DECREF(joined);
+            return -1;
+        }
+        ret = PyList_Append(acc->large, joined);
+        Py_DECREF(joined);
+        return ret;
+    }
+    return 0;
+}
+
+static int
+accumulate_unicode(accumulator *acc, PyObject *obj)
+{
+    int ret;
+    Py_ssize_t nsmall;
+    assert(PyUnicode_Check(obj));
+
+    if (PyList_Append(acc->small, obj))
+        return -1;
+    nsmall = PyList_GET_SIZE(acc->small);
+    /* Each item in a list of unicode objects has an overhead (in 64-bit
+     * builds) of:
+     * - 8 bytes for the list slot
+     * - 56 bytes for the header of the unicode object
+     * that is, 64 bytes. 100000 such objects waste more than 6MB
+     * compared to a single concatenated string.
+     */
+    if (nsmall < 100000)
+        return 0;
+    PyObject *joined = join_list_unicode(acc->small);
+    if (joined == NULL)
+        return -1;
+    if (PyList_SetSlice(acc->small, 0, nsmall, NULL)) {
+        Py_DECREF(joined);
+        return -1;
+    }
+    ret = PyList_Append(acc->large, joined);
+    Py_DECREF(joined);
+    return ret;
+}
+
+static PyObject *
+finish_accumulator(accumulator *acc)
+{
+    int ret;
+    PyObject *res;
+
+    ret = flush_accumulator(acc);
+    Py_CLEAR(acc->small);
+    if (ret) {
+        Py_CLEAR(acc->large);
+        return NULL;
+    }
+    res = acc->large;
+    acc->large = NULL;
+    return res;
+}
+
+static void
+destroy_accumulator(accumulator *acc)
+{
+    Py_CLEAR(acc->small);
+    Py_CLEAR(acc->large);
+}
+
+/* Forward decls */
+
 static PyObject *
 ascii_escape_unicode(PyObject *pystr);
 static PyObject *
@@ -101,11 +216,11 @@
 static int
 encoder_clear(PyObject *self);
 static int
-encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ssize_t indent_level);
+encoder_listencode_list(PyEncoderObject *s, accumulator *acc, PyObject *seq, Py_ssize_t indent_level);
 static int
-encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssize_t indent_level);
+encoder_listencode_obj(PyEncoderObject *s, accumulator *acc, PyObject *obj, Py_ssize_t indent_level);
 static int
-encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ssize_t indent_level);
+encoder_listencode_dict(PyEncoderObject *s, accumulator *acc, PyObject *dct, Py_ssize_t indent_level);
 static PyObject *
 _encoded_const(PyObject *obj);
 static void
@@ -267,19 +382,6 @@
 }
 
 static PyObject *
-join_list_unicode(PyObject *lst)
-{
-    /* return u''.join(lst) */
-    static PyObject *sep = NULL;
-    if (sep == NULL) {
-        sep = PyUnicode_FromStringAndSize("", 0);
-        if (sep == NULL)
-            return NULL;
-    }
-    return PyUnicode_Join(sep, lst);
-}
-
-static PyObject *
 _build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) {
     /* return (rval, idx) tuple, stealing reference to rval */
     PyObject *tpl;
@@ -1226,22 +1328,22 @@
     /* Python callable interface to encode_listencode_obj */
     static char *kwlist[] = {"obj", "_current_indent_level", NULL};
     PyObject *obj;
-    PyObject *rval;
     Py_ssize_t indent_level;
     PyEncoderObject *s;
+    accumulator acc;
+
     assert(PyEncoder_Check(self));
     s = (PyEncoderObject *)self;
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO&:_iterencode", kwlist,
         &obj, _convertPyInt_AsSsize_t, &indent_level))
         return NULL;
-    rval = PyList_New(0);
-    if (rval == NULL)
+    if (init_accumulator(&acc))
         return NULL;
-    if (encoder_listencode_obj(s, rval, obj, indent_level)) {
-        Py_DECREF(rval);
+    if (encoder_listencode_obj(s, &acc, obj, indent_level)) {
+        destroy_accumulator(&acc);
         return NULL;
     }
-    return rval;
+    return finish_accumulator(&acc);
 }
 
 static PyObject *
@@ -1313,18 +1415,19 @@
 }
 
 static int
-_steal_list_append(PyObject *lst, PyObject *stolen)
+_steal_accumulate(accumulator *acc, PyObject *stolen)
 {
     /* Append stolen and then decrement its reference count */
-    int rval = PyList_Append(lst, stolen);
+    int rval = accumulate_unicode(acc, stolen);
     Py_DECREF(stolen);
     return rval;
 }
 
 static int
-encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssize_t indent_level)
+encoder_listencode_obj(PyEncoderObject *s, accumulator *acc,
+                       PyObject *obj, Py_ssize_t indent_level)
 {
-    /* Encode Python object obj to a JSON term, rval is a PyList */
+    /* Encode Python object obj to a JSON term */
     PyObject *newobj;
     int rv;
 
@@ -1332,38 +1435,38 @@
         PyObject *cstr = _encoded_const(obj);
         if (cstr == NULL)
             return -1;
-        return _steal_list_append(rval, cstr);
+        return _steal_accumulate(acc, cstr);
     }
     else if (PyUnicode_Check(obj))
     {
         PyObject *encoded = encoder_encode_string(s, obj);
         if (encoded == NULL)
             return -1;
-        return _steal_list_append(rval, encoded);
+        return _steal_accumulate(acc, encoded);
     }
     else if (PyLong_Check(obj)) {
         PyObject *encoded = PyObject_Str(obj);
         if (encoded == NULL)
             return -1;
-        return _steal_list_append(rval, encoded);
+        return _steal_accumulate(acc, encoded);
     }
     else if (PyFloat_Check(obj)) {
         PyObject *encoded = encoder_encode_float(s, obj);
         if (encoded == NULL)
             return -1;
-        return _steal_list_append(rval, encoded);
+        return _steal_accumulate(acc, encoded);
     }
     else if (PyList_Check(obj) || PyTuple_Check(obj)) {
         if (Py_EnterRecursiveCall(" while encoding a JSON object"))
             return -1;
-        rv = encoder_listencode_list(s, rval, obj, indent_level);
+        rv = encoder_listencode_list(s, acc, obj, indent_level);
         Py_LeaveRecursiveCall();
         return rv;
     }
     else if (PyDict_Check(obj)) {
         if (Py_EnterRecursiveCall(" while encoding a JSON object"))
             return -1;
-        rv = encoder_listencode_dict(s, rval, obj, indent_level);
+        rv = encoder_listencode_dict(s, acc, obj, indent_level);
         Py_LeaveRecursiveCall();
         return rv;
     }
@@ -1394,7 +1497,7 @@
 
         if (Py_EnterRecursiveCall(" while encoding a JSON object"))
             return -1;
-        rv = encoder_listencode_obj(s, rval, newobj, indent_level);
+        rv = encoder_listencode_obj(s, acc, newobj, indent_level);
         Py_LeaveRecursiveCall();

         Py_DECREF(newobj);
@@ -1414,9 +1517,10 @@
 }
 
 static int
-encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ssize_t indent_level)
+encoder_listencode_dict(PyEncoderObject *s, accumulator *acc,
+                        PyObject *dct, Py_ssize_t indent_level)
 {
-    /* Encode Python dict dct a JSON term, rval is a PyList */
+    /* Encode Python dict dct a JSON term */
     static PyObject *open_dict = NULL;
     static PyObject *close_dict = NULL;
     static PyObject *empty_dict = NULL;
@@ -1436,7 +1540,7 @@
             return -1;
     }
     if (Py_SIZE(dct) == 0)
-        return PyList_Append(rval, empty_dict);
+        return accumulate_unicode(acc, empty_dict);

     if (s->markers != Py_None) {
         int has_key;
@@ -1454,7 +1558,7 @@
         }
     }

-    if (PyList_Append(rval, open_dict))
+    if (accumulate_unicode(acc, open_dict))
         goto bail;

     if (s->indent != Py_None) {
@@ -1541,7 +1645,7 @@
         }

         if (idx) {
-            if (PyList_Append(rval, s->item_separator))
+            if (accumulate_unicode(acc, s->item_separator))
                 goto bail;
         }
 
@@ -1549,16 +1653,16 @@
         Py_CLEAR(kstr);
         if (encoded == NULL)
             goto bail;
-        if (PyList_Append(rval, encoded)) {
+        if (accumulate_unicode(acc, encoded)) {
             Py_DECREF(encoded);
             goto bail;
         }
         Py_DECREF(encoded);
-        if (PyList_Append(rval, s->key_separator))
+        if (accumulate_unicode(acc, s->key_separator))
             goto bail;

         value = PyTuple_GET_ITEM(item, 1);
-        if (encoder_listencode_obj(s, rval, value, indent_level))
+        if (encoder_listencode_obj(s, acc, value, indent_level))
             goto bail;
         idx += 1;
         Py_DECREF(item);
@@ -1578,7 +1682,7 @@
 
         yield '\n' + (' ' * (_indent * _current_indent_level))
     }*/
-    if (PyList_Append(rval, close_dict))
+    if (accumulate_unicode(acc, close_dict))
         goto bail;
     return 0;
 
@@ -1592,9 +1696,10 @@
 
 
 static int
-encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ssize_t indent_level)
+encoder_listencode_list(PyEncoderObject *s, accumulator *acc,
+                        PyObject *seq, Py_ssize_t indent_level)
 {
-    /* Encode Python list seq to a JSON term, rval is a PyList */
+    /* Encode Python list seq to a JSON term */
     static PyObject *open_array = NULL;
     static PyObject *close_array = NULL;
     static PyObject *empty_array = NULL;
@@ -1618,7 +1723,7 @@
     num_items = PySequence_Fast_GET_SIZE(s_fast);
     if (num_items == 0) {
         Py_DECREF(s_fast);
-        return PyList_Append(rval, empty_array);
+        return accumulate_unicode(acc, empty_array);
     }

     if (s->markers != Py_None) {
@@ -1638,7 +1743,7 @@
     }

     seq_items = PySequence_Fast_ITEMS(s_fast);
-    if (PyList_Append(rval, open_array))
+    if (accumulate_unicode(acc, open_array))
         goto bail;
     if (s->indent != Py_None) {
         /* TODO: DOES NOT RUN */
@@ -1652,10 +1757,10 @@
     for (i = 0; i < num_items; i++) {
         PyObject *obj = seq_items[i];
         if (i) {
-            if (PyList_Append(rval, s->item_separator))
+            if (accumulate_unicode(acc, s->item_separator))
                 goto bail;
         }
-        if (encoder_listencode_obj(s, rval, obj, indent_level))
+        if (encoder_listencode_obj(s, acc, obj, indent_level))
             goto bail;
     }
     if (ident != NULL) {
@@ -1670,7 +1775,7 @@
 
         yield '\n' + (' ' * (_indent * _current_indent_level))
     }*/
-    if (PyList_Append(rval, close_array))
+    if (accumulate_unicode(acc, close_array))
         goto bail;
     Py_DECREF(s_fast);
     return 0;
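
As the comment on the new accumulator notes, the point of batching fragments
and joining them is to avoid both per-object overhead and the quadratic cost
of naive repeated concatenation. A small illustration of that second point
(sizes here are illustrative, not measurements from this patch):

    fragments = ["1, "] * 100000

    # Naive concatenation may copy the growing result on every step,
    # which is O(n**2) in the worst case.
    quadratic = ""
    for frag in fragments:
        quadratic += frag

    # Joining a batch of fragments is a single O(n) pass.
    linear = "".join(fragments)
    assert quadratic == linear
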
-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list
