diff -r 39ddcc5c7fb9 Modules/_pickle.c
--- a/Modules/_pickle.c	Sat Feb 25 19:26:39 2012 +0200
+++ b/Modules/_pickle.c	Sat Feb 25 20:32:56 2012 +0100
@@ -338,6 +338,8 @@
     int fast_nesting;
     int fix_imports;            /* Indicate whether Pickler should fix
                                    the name of globals for Python 2.x. */
+    int bytestr;                /* Indicate whether Bytes should be stored as
+                                   Python 2.x str */
     PyObject *fast_memo;
 } PicklerObject;
 
@@ -791,10 +793,10 @@
 
 static int
 _Pickler_SetProtocol(PicklerObject *self, PyObject *proto_obj,
-                     PyObject *fix_imports_obj)
+                     PyObject *fix_imports_obj, PyObject *bytestr_obj)
 {
     long proto = 0;
-    int fix_imports;
+    int fix_imports, bytestr;
 
     if (proto_obj == NULL || proto_obj == Py_None)
         proto = DEFAULT_PROTOCOL;
@@ -813,11 +815,14 @@
     fix_imports = PyObject_IsTrue(fix_imports_obj);
     if (fix_imports == -1)
         return -1;
+    bytestr = PyObject_IsTrue(bytestr_obj);
+    if (bytestr == -1)
+        return -1;
 
     self->proto = proto;
     self->bin = proto > 0;
     self->fix_imports = fix_imports && proto < 3;
-
+    self->bytestr = bytestr && proto < 3;
     return 0;
 }
 
@@ -1703,103 +1708,196 @@
 }
 
 static int
+save_bytes_compat(PicklerObject *self, PyObject *obj)
+{
+    /* Older pickle protocols do not have an opcode for pickling bytes
+       objects. Therefore, we need to fake the copy protocol (i.e.,
+       the __reduce__ method) to permit bytes object unpickling.
+
+       Here we use a hack to be compatible with Python 2. Since in Python
+       2 'bytes' is just an alias for 'str' (which has different
+       parameters than the actual bytes object), we use codecs.encode
+       to create the appropriate 'str' object when unpickled using
+       Python 2 *and* the appropriate 'bytes' object when unpickled
+       using Python 3. Again this is a hack and we don't need to do this
+       with newer protocols.
+
+       Returns the status of save_reduce(), or -1 if the reduce value
+       cannot be built. */
+    static PyObject *codecs_encode = NULL;
+    PyObject *reduce_value = NULL;
+    int status;
+
+    if (codecs_encode == NULL) {
+        PyObject *codecs_module = PyImport_ImportModule("codecs");
+        if (codecs_module == NULL) {
+            return -1;
+        }
+        codecs_encode = PyObject_GetAttrString(codecs_module, "encode");
+        Py_DECREF(codecs_module);
+        if (codecs_encode == NULL) {
+            return -1;
+        }
+    }
+
+    if (PyBytes_GET_SIZE(obj) == 0) {
+        reduce_value = Py_BuildValue("(O())", (PyObject*)&PyBytes_Type);
+    }
+    else {
+        static PyObject *latin1 = NULL;
+        PyObject *unicode_str =
+            PyUnicode_DecodeLatin1(PyBytes_AS_STRING(obj),
+                                   PyBytes_GET_SIZE(obj),
+                                   "strict");
+        if (unicode_str == NULL)
+            return -1;
+        if (latin1 == NULL) {
+            latin1 = PyUnicode_InternFromString("latin1");
+            if (latin1 == NULL) {
+                /* Do not leak the decoded string. */
+                Py_DECREF(unicode_str);
+                return -1;
+            }
+        }
+        reduce_value = Py_BuildValue("(O(OO))",
+                                     codecs_encode, unicode_str, latin1);
+        Py_DECREF(unicode_str);
+    }
+
+    if (reduce_value == NULL)
+        return -1;
+
+    /* save_reduce() will memoize the object automatically. */
+    status = save_reduce(self, reduce_value, obj);
+    Py_DECREF(reduce_value);
+    return status;
+}
+
+static int
+save_bytes_STRING(PicklerObject *self, PyObject *obj)
+{
+    /* Dumps a bytes object as a protocol 0 STRING opcode: the opcode
+       byte, the ASCII repr() of the object without its leading 'b',
+       and a newline.  The object is then added to the memo.
+       Returns 0 on success and -1 on failure. */
+    const char string_op = STRING;
+    PyObject *str_repr = NULL;
+    PyObject *bytes_repr = NULL;
+    Py_ssize_t len;
+    char *char_repr;
+    int retval = -1;
+
+    /* Both references start out NULL so that the Py_XDECREF() calls at
+       'done' are safe whichever step fails. */
+    str_repr = PyObject_ASCII(obj);
+    if (str_repr == NULL)
+        goto done;
+
+    bytes_repr = PyUnicode_AsASCIIString(str_repr);
+    if (bytes_repr == NULL)
+        goto done;
+
+    len = PyBytes_Size(bytes_repr);
+    if (len < 0)
+        goto done;
+
+    char_repr = PyBytes_AsString(bytes_repr);
+    if (char_repr == NULL)
+        goto done;
+
+    /* Strip the leading 'b' so that only the quoted literal remains. */
+    char_repr += 1;
+    len -= 1;
+
+    if (_Pickler_Write(self, &string_op, 1) < 0)
+        goto done;
+
+    if (_Pickler_Write(self, char_repr, len) < 0)
+        goto done;
+
+    if (_Pickler_Write(self, "\n", 1) < 0)
+        goto done;
+
+    if (memo_put(self, obj) < 0)
+        goto done;
+
+    retval = 0;
+done:
+    Py_XDECREF(bytes_repr);
+    Py_XDECREF(str_repr);
+    return retval;
+}
+
+static int
+save_bytes_BYTES(PicklerObject *self, PyObject *obj)
+{
+    /* Dumps a bytes object with a binary opcode and memoizes it:
+       (SHORT_)BINSTRING when self->bytestr is set, (SHORT_)BINBYTES
+       otherwise.  Returns 0 on success and -1 on failure. */
+    Py_ssize_t size;
+    char header[5];
+    Py_ssize_t len;
+
+    size = PyBytes_GET_SIZE(obj);
+    if (size < 0)
+        return -1;
+    else if (size < 256) {
+        header[0] = self->bytestr ? SHORT_BINSTRING : SHORT_BINBYTES;
+        header[1] = (unsigned char)size;
+        len = 2;
+    }
+    else if (self->bytestr && size > 0x7fffffffL) {
+        /* BINSTRING carries a signed 4-byte length, so a larger object
+           could not be read back. */
+        PyErr_SetString(PyExc_OverflowError,
+                        "cannot serialize a bytes object larger than 2GB "
+                        "as a Python 2.x str");
+        return -1;
+    }
+    else if (size <= 0xffffffffL) {
+        header[0] = self->bytestr ? BINSTRING : BINBYTES;
+        header[1] = (unsigned char)(size & 0xff);
+        header[2] = (unsigned char)((size >> 8) & 0xff);
+        header[3] = (unsigned char)((size >> 16) & 0xff);
+        header[4] = (unsigned char)((size >> 24) & 0xff);
+        len = 5;
+    }
+    else {
+        PyErr_SetString(PyExc_OverflowError,
+                        "cannot serialize a bytes object larger than 4GB");
+        return -1;          /* string too large */
+    }
+
+    if (_Pickler_Write(self, header, len) < 0)
+        return -1;
+
+    if (_Pickler_Write(self, PyBytes_AS_STRING(obj), size) < 0)
+        return -1;
+
+    if (memo_put(self, obj) < 0)
+        return -1;
+
+    return 0;
+}
+
+/* Pickle a bytes object.  When self->bytestr is set (which requires a
+   protocol below 3) the Python 2.x str opcodes are used.  Otherwise
+   protocols below 3 go through the codecs.encode reduce hack and newer
+   protocols use the bytes opcodes. */
+static int
 save_bytes(PicklerObject *self, PyObject *obj)
 {
-    if (self->proto < 3) {
-        /* Older pickle protocols do not have an opcode for pickling bytes
-           objects. Therefore, we need to fake the copy protocol (i.e.,
-           the __reduce__ method) to permit bytes object unpickling.
-
-           Here we use a hack to be compatible with Python 2. Since in Python
-           2 'bytes' is just an alias for 'str' (which has different
-           parameters than the actual bytes object), we use codecs.encode
-           to create the appropriate 'str' object when unpickled using
-           Python 2 *and* the appropriate 'bytes' object when unpickled
-           using Python 3. Again this is a hack and we don't need to do this
-           with newer protocols. */
-        static PyObject *codecs_encode = NULL;
-        PyObject *reduce_value = NULL;
-        int status;
-
-        if (codecs_encode == NULL) {
-            PyObject *codecs_module = PyImport_ImportModule("codecs");
-            if (codecs_module == NULL) {
-                return -1;
-            }
-            codecs_encode = PyObject_GetAttrString(codecs_module, "encode");
-            Py_DECREF(codecs_module);
-            if (codecs_encode == NULL) {
-                return -1;
-            }
-        }
-
-        if (PyBytes_GET_SIZE(obj) == 0) {
-            reduce_value = Py_BuildValue("(O())", (PyObject*)&PyBytes_Type);
-        }
-        else {
-            static PyObject *latin1 = NULL;
-            PyObject *unicode_str =
-                PyUnicode_DecodeLatin1(PyBytes_AS_STRING(obj),
-                                       PyBytes_GET_SIZE(obj),
-                                       "strict");
-            if (unicode_str == NULL)
-                return -1;
-            if (latin1 == NULL) {
-                latin1 = PyUnicode_InternFromString("latin1");
-                if (latin1 == NULL)
-                    return -1;
-            }
-            reduce_value = Py_BuildValue("(O(OO))",
-                                         codecs_encode, unicode_str, latin1);
-            Py_DECREF(unicode_str);
-        }
-
-        if (reduce_value == NULL)
-            return -1;
-
-        /* save_reduce() will memoize the object automatically. */
-        status = save_reduce(self, reduce_value, obj);
-        Py_DECREF(reduce_value);
-        return status;
+    if (self->bytestr) {
+        if (self->proto == 0)
+            return save_bytes_STRING(self, obj);
+        else
+            return save_bytes_BYTES(self, obj);
     }
     else {
-        Py_ssize_t size;
-        char header[5];
-        Py_ssize_t len;
-
-        size = PyBytes_GET_SIZE(obj);
-        if (size < 0)
-            return -1;
-
-        if (size < 256) {
-            header[0] = SHORT_BINBYTES;
-            header[1] = (unsigned char)size;
-            len = 2;
-        }
-        else if (size <= 0xffffffffL) {
-            header[0] = BINBYTES;
-            header[1] = (unsigned char)(size & 0xff);
-            header[2] = (unsigned char)((size >> 8) & 0xff);
-            header[3] = (unsigned char)((size >> 16) & 0xff);
-            header[4] = (unsigned char)((size >> 24) & 0xff);
-            len = 5;
-        }
-        else {
-            PyErr_SetString(PyExc_OverflowError,
-                            "cannot serialize a bytes object larger than 4GB");
-            return -1;          /* string too large */
-        }
-
-        if (_Pickler_Write(self, header, len) < 0)
-            return -1;
-
-        if (_Pickler_Write(self, PyBytes_AS_STRING(obj), size) < 0)
-            return -1;
-
-        if (memo_put(self, obj) < 0)
-            return -1;
-
-        return 0;
+        if (self->proto < 3)
+            return save_bytes_compat(self, obj);
+        else
+            return save_bytes_BYTES(self, obj);
     }
 }
 
@@ -3417,26 +3515,30 @@
 "\n"
 "If fix_imports is True and protocol is less than 3, pickle will try to\n"
 "map the new Python 3.x names to the old module names used in Python\n"
-"2.x, so that the pickle data stream is readable with Python 2.x.\n");
+"2.x, so that the pickle data stream is readable with Python 2.x.\n"
+"\n"
+"If bytestr is True and protocol is less than 3, bytes will be stored as\n"
+"8-bit string instead of as bytes object.\n");
 
 static int
 Pickler_init(PicklerObject *self, PyObject *args, PyObject *kwds)
 {
-    static char *kwlist[] = {"file", "protocol", "fix_imports", 0};
+    static char *kwlist[] = {"file", "protocol", "fix_imports", "bytestr", 0};
     PyObject *file;
     PyObject *proto_obj = NULL;
     PyObject *fix_imports = Py_True;
+    PyObject *bytestr = Py_False;
     _Py_IDENTIFIER(persistent_id);
 
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OO:Pickler",
-                                     kwlist, &file, &proto_obj, &fix_imports))
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|OOO:Pickler",
+                                     kwlist, &file, &proto_obj, &fix_imports, &bytestr))
         return -1;
 
     /* In case of multiple __init__() calls, clear previous content. */
     if (self->write != NULL)
         (void)Pickler_clear(self);
 
-    if (_Pickler_SetProtocol(self, proto_obj, fix_imports) < 0)
+    if (_Pickler_SetProtocol(self, proto_obj, fix_imports, bytestr) < 0)
         return -1;
 
     if (_Pickler_SetOutputStream(self, file) < 0)
@@ -6020,6 +6122,7 @@
     PyObject *file;
     PyObject *proto = NULL;
     PyObject *fix_imports = Py_True;
+    PyObject *bytestr = Py_False;   /* NOTE(review): never parsed from args */
     PicklerObject *pickler;
 
     /* fix_imports is a keyword-only argument.  */
@@ -6038,7 +6141,7 @@
     if (pickler == NULL)
         return NULL;
 
-    if (_Pickler_SetProtocol(pickler, proto, fix_imports) < 0)
+    if (_Pickler_SetProtocol(pickler, proto, fix_imports, bytestr) < 0)
         goto error;
 
     if (_Pickler_SetOutputStream(pickler, file) < 0)
@@ -6084,6 +6187,7 @@
     PyObject *proto = NULL;
     PyObject *result;
     PyObject *fix_imports = Py_True;
+    PyObject *bytestr = Py_False;   /* NOTE(review): never parsed from args */
     PicklerObject *pickler;
 
     /* fix_imports is a keyword-only argument.  */
@@ -6102,7 +6206,7 @@
     if (pickler == NULL)
         return NULL;
 
-    if (_Pickler_SetProtocol(pickler, proto, fix_imports) < 0)
+    if (_Pickler_SetProtocol(pickler, proto, fix_imports, bytestr) < 0)
         goto error;
 
     if (dump(pickler, obj) < 0)