[Python-checkins] cpython (merge 3.3 -> default): Issue #10156: In the interpreter's initialization phase, unicode globals are now initialized dynamically as needed.

Sat Jan 26 11:21:56 CET 2013

http://hg.python.org/cpython/rev/cb12d642eed2
changeset: 81751:cb12d642eed2
parent: 81747:1f57fb5e1e8e
parent: 81750:01d4dd412581
user: Serhiy Storchaka <storchaka at gmail.com>
date: Sat Jan 26 12:18:17 2013 +0200
summary:
 Issue #10156: In the interpreter's initialization phase, unicode globals
are now initialized dynamically as needed.
files:
 Misc/NEWS | 3 +
 Objects/unicodeobject.c | 201 ++++++++++++---------------
 2 files changed, 90 insertions(+), 114 deletions(-)

diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,9 @@
 Core and Builtins
 -----------------
 
+- Issue #10156: In the interpreter's initialization phase, unicode globals
+ are now initialized dynamically as needed.
+
 - Issue #16980: Fix processing of escaped non-ascii bytes in the
 unicode-escape-decode decoder.
 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -49,8 +49,9 @@
 
 /* --- Globals ------------------------------------------------------------
 
- The globals are initialized by the _PyUnicode_Init() API and should
- not be used before calling that API.
+NOTE: In the interpreter's initialization phase, some globals are currently
+ initialized dynamically as needed. In the process Unicode objects may
+ be created before the Unicode type is ready.
 
 */
 
@@ -171,17 +172,36 @@
 Another way to look at this is that to say that the actual reference
 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
 */
-static PyObject *interned;
+static PyObject *interned = NULL;
 
 /* The empty Unicode object is shared to improve performance. */
-static PyObject *unicode_empty;
+static PyObject *unicode_empty = NULL;
+
+#define _Py_INCREF_UNICODE_EMPTY() \
+ do { \
+ if (unicode_empty != NULL) \
+ Py_INCREF(unicode_empty); \
+ else { \
+ unicode_empty = PyUnicode_New(0, 0); \
+ if (unicode_empty != NULL) { \
+ Py_INCREF(unicode_empty); \
+ assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
+ } \
+ } \
+ } while (0)
+
+#define _Py_RETURN_UNICODE_EMPTY() \
+ do { \
+ _Py_INCREF_UNICODE_EMPTY(); \
+ return unicode_empty; \
+ } while (0)
 
 /* List of static strings. */
-static _Py_Identifier *static_strings;
+static _Py_Identifier *static_strings = NULL;
 
 /* Single character Unicode strings in the Latin-1 range are being
 shared as well. */
-static PyObject *unicode_latin1[256];
+static PyObject *unicode_latin1[256] = {NULL};
 
 /* Fast detection of the most frequent whitespace characters */
 const unsigned char _Py_ascii_whitespace[] = {
@@ -406,9 +426,8 @@
 
 len = _PyUnicode_WSTR_LENGTH(unicode);
 if (len == 0) {
- Py_INCREF(unicode_empty);
 Py_DECREF(unicode);
- return unicode_empty;
+ _Py_RETURN_UNICODE_EMPTY();
 }
 
 if (len == 1) {
@@ -442,8 +461,8 @@
 length = PyUnicode_GET_LENGTH(unicode);
 if (length == 0) {
 if (unicode != unicode_empty) {
- Py_INCREF(unicode_empty);
 Py_DECREF(unicode);
+ _Py_RETURN_UNICODE_EMPTY();
 }
 return unicode_empty;
 }
@@ -520,7 +539,7 @@
 
 #define BLOOM_MASK unsigned long
 
-static BLOOM_MASK bloom_linebreak;
+static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
 
 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
@@ -1602,9 +1621,11 @@
 return 0;
 
 if (length == 0) {
+ _Py_INCREF_UNICODE_EMPTY();
+ if (!unicode_empty)
+ return -1;
 Py_DECREF(*p_unicode);
 *p_unicode = unicode_empty;
- Py_INCREF(*p_unicode);
 return 0;
 }
 
@@ -1727,10 +1748,8 @@
 some optimizations which share commonly used objects. */
 
 /* Optimization for empty strings */
- if (size == 0 && unicode_empty != NULL) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
+ if (size == 0)
+ _Py_RETURN_UNICODE_EMPTY();
 
 /* Single character Unicode objects in the Latin-1 range are
 shared when using this constructor */
@@ -1889,10 +1908,8 @@
 PyObject *res;
 unsigned char max_char;
 
- if (size == 0) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
+ if (size == 0)
+ _Py_RETURN_UNICODE_EMPTY();
 assert(size > 0);
 if (size == 1)
 return get_latin1_char(u[0]);
@@ -1912,10 +1929,8 @@
 PyObject *res;
 Py_UCS2 max_char;
 
- if (size == 0) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
+ if (size == 0)
+ _Py_RETURN_UNICODE_EMPTY();
 assert(size > 0);
 if (size == 1) {
 Py_UCS4 ch = u[0];
@@ -1950,10 +1965,8 @@
 PyObject *res;
 Py_UCS4 max_char;
 
- if (size == 0) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
+ if (size == 0)
+ _Py_RETURN_UNICODE_EMPTY();
 assert(size > 0);
 if (size == 1) {
 Py_UCS4 ch = u[0];
@@ -2245,10 +2258,8 @@
 PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
 {
 if (w == NULL) {
- if (size == 0) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
+ if (size == 0)
+ _Py_RETURN_UNICODE_EMPTY();
 PyErr_BadInternalCall();
 return NULL;
 }
@@ -2825,15 +2836,11 @@
 
 /* Decoding bytes objects is the most common case and should be fast */
 if (PyBytes_Check(obj)) {
- if (PyBytes_GET_SIZE(obj) == 0) {
- Py_INCREF(unicode_empty);
- v = unicode_empty;
- }
- else {
- v = PyUnicode_Decode(
- PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
- encoding, errors);
- }
+ if (PyBytes_GET_SIZE(obj) == 0)
+ _Py_RETURN_UNICODE_EMPTY();
+ v = PyUnicode_Decode(
+ PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
+ encoding, errors);
 return v;
 }
 
@@ -2853,12 +2860,11 @@
 }
 
 if (buffer.len == 0) {
- Py_INCREF(unicode_empty);
- v = unicode_empty;
- }
- else
- v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
-
+ PyBuffer_Release(&buffer);
+ _Py_RETURN_UNICODE_EMPTY();
+ }
+
+ v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
 PyBuffer_Release(&buffer);
 return v;
 }
@@ -4201,8 +4207,7 @@
 if (size == 0) {
 if (consumed)
 *consumed = 0;
- Py_INCREF(unicode_empty);
- return unicode_empty;
+ _Py_RETURN_UNICODE_EMPTY();
 }
 
 /* Start off assuming it's all ASCII. Widen later as necessary. */
@@ -4609,8 +4614,7 @@
 if (size == 0) {
 if (consumed)
 *consumed = 0;
- Py_INCREF(unicode_empty);
- return unicode_empty;
+ _Py_RETURN_UNICODE_EMPTY();
 }
 
 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
@@ -4868,8 +4872,7 @@
 if (q == e) {
 if (consumed)
 *consumed = size;
- Py_INCREF(unicode_empty);
- return unicode_empty;
+ _Py_RETURN_UNICODE_EMPTY();
 }
 
 #ifdef WORDS_BIGENDIAN
@@ -5108,8 +5111,7 @@
 if (q == e) {
 if (consumed)
 *consumed = size;
- Py_INCREF(unicode_empty);
- return unicode_empty;
+ _Py_RETURN_UNICODE_EMPTY();
 }
 
 #if PY_LITTLE_ENDIAN
@@ -5386,10 +5388,8 @@
 Py_ssize_t len;
 
 len = length_of_escaped_ascii_string(s, size);
- if (len == 0) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
+ if (len == 0)
+ _Py_RETURN_UNICODE_EMPTY();
 
 /* After length_of_escaped_ascii_string() there are two alternatives,
 either the string is pure ASCII with named escapes like \n, etc.
@@ -5781,10 +5781,8 @@
 PyObject *errorHandler = NULL;
 PyObject *exc = NULL;
 
- if (size == 0) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
+ if (size == 0)
+ _Py_RETURN_UNICODE_EMPTY();
 
 /* Escaped strings will always be longer than the resulting
 Unicode string, so we start with size here and then reduce the
@@ -5988,10 +5986,8 @@
 1))
 return NULL;
 
- if (size == 0) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
+ if (size == 0)
+ _Py_RETURN_UNICODE_EMPTY();
 
 /* XXX overflow detection missing */
 _PyUnicodeWriter_Init(&writer, 0);
@@ -6439,10 +6435,8 @@
 PyObject *errorHandler = NULL;
 PyObject *exc = NULL;
 
- if (size == 0) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
+ if (size == 0)
+ _Py_RETURN_UNICODE_EMPTY();
 
 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
 if (size == 1 && (unsigned char)s[0] < 128)
@@ -6820,8 +6814,7 @@
 if (chunk_size == 0 && done) {
 if (v != NULL)
 break;
- Py_INCREF(unicode_empty);
- return unicode_empty;
+ _Py_RETURN_UNICODE_EMPTY();
 }
 
 
@@ -7298,10 +7291,8 @@
 if (mapping == NULL)
 return PyUnicode_DecodeLatin1(s, size, errors);
 
- if (size == 0) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
+ if (size == 0)
+ _Py_RETURN_UNICODE_EMPTY();
 _PyUnicodeWriter_Init(&writer, 0);
 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
 goto onError;
@@ -9354,9 +9345,7 @@
 /* If empty sequence, return u"". */
 if (seqlen == 0) {
 Py_DECREF(fseq);
- Py_INCREF(unicode_empty);
- res = unicode_empty;
- return res;
+ _Py_RETURN_UNICODE_EMPTY();
 }
 
 /* If singleton sequence with an exact Unicode, return that. */
@@ -10056,7 +10045,9 @@
 }
 new_size = slen + n * (len2 - len1);
 if (new_size == 0) {
- Py_INCREF(unicode_empty);
+ _Py_INCREF_UNICODE_EMPTY();
+ if (!unicode_empty)
+ goto error;
 u = unicode_empty;
 goto done;
 }
@@ -11559,10 +11550,8 @@
 PyErr_SetString(PyExc_IndexError, "string index out of range");
 return NULL;
 }
- if (start >= length || end < start) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
+ if (start >= length || end < start)
+ _Py_RETURN_UNICODE_EMPTY();
 
 length = end - start;
 if (PyUnicode_IS_ASCII(self)) {
@@ -11689,10 +11678,8 @@
 PyObject *u;
 Py_ssize_t nchars, n;
 
- if (len < 1) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
+ if (len < 1)
+ _Py_RETURN_UNICODE_EMPTY();
 
 /* no repeat, return original string */
 if (len == 1)
@@ -12832,8 +12819,7 @@
 {
 if (writer->pos == 0) {
 Py_XDECREF(writer->buffer);
- Py_INCREF(unicode_empty);
- return unicode_empty;
+ _Py_RETURN_UNICODE_EMPTY();
 }
 if (writer->readonly) {
 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
@@ -13051,8 +13037,7 @@
 }
 
 if (slicelength <= 0) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
+ _Py_RETURN_UNICODE_EMPTY();
 } else if (start == 0 && step == 1 &&
 slicelength == PyUnicode_GET_LENGTH(self)) {
 return unicode_result_unchanged(self);
@@ -14056,10 +14041,8 @@
 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
 kwlist, &x, &encoding, &errors))
 return NULL;
- if (x == NULL) {
- Py_INCREF(unicode_empty);
- return unicode_empty;
- }
+ if (x == NULL)
+ _Py_RETURN_UNICODE_EMPTY();
 if (encoding == NULL && errors == NULL)
 return PyObject_Str(x);
 else
@@ -14228,8 +14211,6 @@
 
 int _PyUnicode_Init(void)
 {
- int i;
-
 /* XXX - move this array to unicodectype.c ? */
 Py_UCS2 linebreak[] = {
 0x000A, /* LINE FEED */
@@ -14243,13 +14224,11 @@
 };
 
 /* Init the implementation */
- unicode_empty = PyUnicode_New(0, 0);
+ _Py_INCREF_UNICODE_EMPTY();
 if (!unicode_empty)
 Py_FatalError("Can't create empty string");
- assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
-
- for (i = 0; i < 256; i++)
- unicode_latin1[i] = NULL;
+ Py_DECREF(unicode_empty);
+
 if (PyType_Ready(&PyUnicode_Type) < 0)
 Py_FatalError("Can't initialize 'unicode'");
 
@@ -14289,15 +14268,10 @@
 {
 int i;
 
- Py_XDECREF(unicode_empty);
- unicode_empty = NULL;
-
- for (i = 0; i < 256; i++) {
- if (unicode_latin1[i]) {
- Py_DECREF(unicode_latin1[i]);
- unicode_latin1[i] = NULL;
- }
- }
+ Py_CLEAR(unicode_empty);
+
+ for (i = 0; i < 256; i++)
+ Py_CLEAR(unicode_latin1[i]);
 _PyUnicode_ClearStaticStrings();
 (void)PyUnicode_ClearFreeList();
 }
@@ -14426,8 +14400,7 @@
 "mortal/immortal\n", mortal_size, immortal_size);
 Py_DECREF(keys);
 PyDict_Clear(interned);
- Py_DECREF(interned);
- interned = NULL;
+ Py_CLEAR(interned);
 }
 
 
-- 
Repository URL: http://hg.python.org/cpython