[Python-checkins] CVS: python/dist/src/Objects unicodeobject.c,2.86,2.87

2001年4月23日 07:44:23 -0700

Update of /cvsroot/python/python/dist/src/Objects
In directory usw-pr-cvs1:/tmp/cvs-serv1593/Objects
Modified Files:
	unicodeobject.c 
Log Message:
This patch originated from an idea by Martin v. Loewis who submitted a
patch for sharing single character Unicode objects. 
Martin's patch had to be reworked in a number of ways to take Unicode
resizing into consideration as well. Here's what the updated patch
implements:
* Single character Unicode strings in the Latin-1 range are shared
 (not only ASCII chars as in Martin's original patch).
* The ASCII and Latin-1 codecs make use of this optimization,
 providing a noticeable speedup for single character strings. Most
 Unicode methods can use the optimization as well (by virtue
 of using PyUnicode_FromUnicode()).
* Some code cleanup was done (replacing memcpy with Py_UNICODE_COPY)
* PyUnicode_Resize() can now also handle the case of resizing
 unicode_empty, which previously resulted in an error.
* Modified the internal API _PyUnicode_Resize() and
 the public PyUnicode_Resize() API to handle references to
 shared objects correctly. The _PyUnicode_Resize() signature
 changed due to this.
* Callers of PyUnicode_FromUnicode() may now only modify the
 contents of the returned Unicode object if they called the API
 with NULL as the content template.
Note that even though this patch passes the regression tests, there
may still be subtle bugs in the sharing code.
Index: unicodeobject.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v
retrieving revision 2.86
retrieving revision 2.87
diff -C2 -r2.86 -r2.87
*** unicodeobject.c	2001年04月21日 02:46:11	2.86
--- unicodeobject.c	2001年04月23日 14:44:21	2.87
***************
*** 84,94 ****
 */
 
- /* The empty Unicode object */
- static PyUnicodeObject *unicode_empty;
- 
 /* Free list for Unicode objects */
 static PyUnicodeObject *unicode_freelist;
 static int unicode_freelist_size;
 
 /* Default encoding to use and assume when NULL is passed as encoding
 parameter; it is initialized by _PyUnicode_Init().
--- 84,98 ----
 */
 
 /* Free list for Unicode objects */
 static PyUnicodeObject *unicode_freelist;
 static int unicode_freelist_size;
 
+ /* The empty Unicode object is shared to improve performance. */
+ static PyUnicodeObject *unicode_empty;
+ 
+ /* Single character Unicode strings in the Latin-1 range are being
+ shared as well. */
+ static PyUnicodeObject *unicode_latin1[256];
+ 
 /* Default encoding to use and assume when NULL is passed as encoding
 parameter; it is initialized by _PyUnicode_Init().
***************
*** 98,102 ****
 
 */
- 
 static char unicode_default_encoding[100];
 
--- 102,105 ----
***************
*** 104,108 ****
 
 static
! int _PyUnicode_Resize(register PyUnicodeObject *unicode,
 int length)
 {
--- 107,111 ----
 
 static
! int unicode_resize(register PyUnicodeObject *unicode,
 int length)
 {
***************
*** 113,120 ****
 	goto reset;
 
! /* Resizing unicode_empty is not allowed. */
! if (unicode == unicode_empty) {
 PyErr_SetString(PyExc_SystemError,
! "can't resize empty unicode object");
 return -1;
 }
--- 116,128 ----
 	goto reset;
 
! /* Resizing shared object (unicode_empty or single character
! objects) in-place is not allowed. Use PyUnicode_Resize()
! instead ! */
! if (unicode == unicode_empty || 
! 	(unicode->length == 1 && 
! 	 unicode->str[0] < 256 &&
! 	 unicode_latin1[unicode->str[0]] == unicode)) {
 PyErr_SetString(PyExc_SystemError,
! "can't resize shared unicode objects");
 return -1;
 }
***************
*** 143,163 ****
 }
 
- int PyUnicode_Resize(PyObject **unicode,
- 		 int length)
- {
- PyUnicodeObject *v;
- 
- if (unicode == NULL) {
- 	PyErr_BadInternalCall();
- 	return -1;
- }
- v = (PyUnicodeObject *)*unicode;
- if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
- 	PyErr_BadInternalCall();
- 	return -1;
- }
- return _PyUnicode_Resize(v, length);
- }
- 
 /* We allocate one more byte to make sure the string is
 Ux0000 terminated -- XXX is this needed ? 
--- 151,154 ----
***************
*** 188,192 ****
 	 never downsize it. */
 	 if ((unicode->length < length) &&
! 		_PyUnicode_Resize(unicode, length)) {
 		PyMem_DEL(unicode->str);
 		goto onError;
--- 179,183 ----
 	 never downsize it. */
 	 if ((unicode->length < length) &&
! 		unicode_resize(unicode, length)) {
 		PyMem_DEL(unicode->str);
 		goto onError;
***************
*** 247,250 ****
--- 238,280 ----
 }
 
+ int PyUnicode_Resize(PyObject **unicode,
+ 		 int length)
+ {
+ register PyUnicodeObject *v;
+ 
+ /* Argument checks */
+ if (unicode == NULL) {
+ 	PyErr_BadInternalCall();
+ 	return -1;
+ }
+ v = (PyUnicodeObject *)*unicode;
+ if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
+ 	PyErr_BadInternalCall();
+ 	return -1;
+ }
+ 
+ /* Resizing unicode_empty and single character objects is not
+ possible since these are being shared. We simply return a fresh
+ copy with the same Unicode content. */
+ if (v->length != length && 
+ 	(v == unicode_empty || v->length == 1)) {
+ 	PyUnicodeObject *w = _PyUnicode_New(length);
+ 	if (w == NULL)
+ 	 return -1;
+ 	Py_UNICODE_COPY(w->str, v->str,
+ 			length < v->length ? length : v->length);
+ 	*unicode = (PyObject *)w;
+ 	return 0;
+ }
+ 
+ /* Note that we don't have to modify *unicode for unshared Unicode
+ objects, since we can modify them in-place. */
+ return unicode_resize(v, length);
+ }
+ 
+ /* Internal API for use in unicodeobject.c only ! */
+ #define _PyUnicode_Resize(unicodevar, length) \
+ PyUnicode_Resize(((PyObject **)(unicodevar)), length)
+ 
 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 				int size)
***************
*** 252,255 ****
--- 282,311 ----
 PyUnicodeObject *unicode;
 
+ /* If the Unicode data is known at construction time, we can apply
+ some optimizations which share commonly used objects. */
+ if (u != NULL) {
+ 
+ 	/* Optimization for empty strings */
+ 	if (size == 0 && unicode_empty != NULL) {
+ 	 Py_INCREF(unicode_empty);
+ 	 return (PyObject *)unicode_empty;
+ 	}
+ 
+ 	/* Single character Unicode objects in the Latin-1 range are
+ 	 shared when using this constructor */
+ 	if (size == 1 && *u < 256) {
+ 	 unicode = unicode_latin1[*u];
+ 	 if (!unicode) {
+ 		unicode = _PyUnicode_New(1);
+ 		unicode->str[0] = *u;
+ 		if (!unicode)
+ 		 return NULL;
+ 		unicode_latin1[*u] = unicode;
+ 	 }
+ 	 Py_INCREF(unicode);
+ 	 return (PyObject *)unicode;
+ 	}
+ }
+ 
 unicode = _PyUnicode_New(size);
 if (!unicode)
***************
*** 258,262 ****
 /* Copy the Unicode data into the new object */
 if (u != NULL)
! 	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
 
 return (PyObject *)unicode;
--- 314,318 ----
 /* Copy the Unicode data into the new object */
 if (u != NULL)
! 	Py_UNICODE_COPY(unicode->str, u, size);
 
 return (PyObject *)unicode;
***************
*** 749,753 ****
 
 /* Adjust length */
! if (_PyUnicode_Resize(unicode, p - unicode->str))
 goto onError;
 
--- 805,809 ----
 
 /* Adjust length */
! if (_PyUnicode_Resize(&unicode, p - unicode->str))
 goto onError;
 
***************
*** 1009,1013 ****
 
 /* Adjust length */
! if (_PyUnicode_Resize(unicode, p - unicode->str))
 goto onError;
 
--- 1065,1069 ----
 
 /* Adjust length */
! if (_PyUnicode_Resize(&unicode, p - unicode->str))
 goto onError;
 
***************
*** 1049,1053 ****
 #endif
 	)
! 	memcpy(p, s, size * sizeof(Py_UNICODE));
 else
 	while (size-- > 0) {
--- 1105,1109 ----
 #endif
 	)
! 	Py_UNICODE_COPY(p, s, size);
 else
 	while (size-- > 0) {
***************
*** 1264,1268 ****
 }
 }
! if (_PyUnicode_Resize(v, (int)(p - buf)))
 		goto onError;
 return (PyObject *)v;
--- 1320,1324 ----
 }
 }
! if (_PyUnicode_Resize(&v, (int)(p - buf)))
 		goto onError;
 return (PyObject *)v;
***************
*** 1452,1456 ****
 	*p++ = x;
 }
! if (_PyUnicode_Resize(v, (int)(p - buf)))
 	goto onError;
 return (PyObject *)v;
--- 1508,1512 ----
 	*p++ = x;
 }
! if (_PyUnicode_Resize(&v, (int)(p - buf)))
 	goto onError;
 return (PyObject *)v;
***************
*** 1523,1526 ****
--- 1579,1587 ----
 
 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
+ if (size == 1 && *(unsigned char*)s < 256) {
+ 	Py_UNICODE r = *(unsigned char*)s;
+ 	return PyUnicode_FromUnicode(&r, 1);
+ }
+ 
 v = _PyUnicode_New(size);
 if (v == NULL)
***************
*** 1655,1658 ****
--- 1716,1724 ----
 
 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
+ if (size == 1 && *(unsigned char*)s < 128) {
+ 	Py_UNICODE r = *(unsigned char*)s;
+ 	return PyUnicode_FromUnicode(&r, 1);
+ }
+ 
 v = _PyUnicode_New(size);
 if (v == NULL)
***************
*** 1672,1676 ****
 }
 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
! 	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
 	 goto onError;
 return (PyObject *)v;
--- 1738,1742 ----
 }
 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
! 	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
 	 goto onError;
 return (PyObject *)v;
***************
*** 1927,1931 ****
 			 (targetsize << 2);
 		 extrachars += needed;
! 		 if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
 			Py_DECREF(x);
 			goto onError;
--- 1993,1998 ----
 			 (targetsize << 2);
 		 extrachars += needed;
! 		 if (_PyUnicode_Resize(&v, 
! 					 PyUnicode_GET_SIZE(v) + needed)) {
 			Py_DECREF(x);
 			goto onError;
***************
*** 1951,1955 ****
 }
 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
! 	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
 	 goto onError;
 return (PyObject *)v;
--- 2018,2022 ----
 }
 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
! 	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
 	 goto onError;
 return (PyObject *)v;
***************
*** 2069,2075 ****
 		 s = PyString_AS_STRING(v) + oldpos;
 		}
! 		memcpy(s,
! 		 PyString_AS_STRING(x),
! 		 targetsize);
 		s += targetsize;
 		extrachars -= targetsize;
--- 2136,2140 ----
 		 s = PyString_AS_STRING(v) + oldpos;
 		}
! 		memcpy(s, PyString_AS_STRING(x), targetsize);
 		s += targetsize;
 		extrachars -= targetsize;
***************
*** 2210,2214 ****
 }
 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
! 	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
 	 goto onError;
 
--- 2275,2279 ----
 }
 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
! 	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
 	 goto onError;
 
***************
*** 2507,2514 ****
 PyUnicodeObject *u;
 
! u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
! 						 self->length);
 if (u == NULL)
 	return NULL;
 if (!fixfct(u)) {
 	/* fixfct should return TRUE if it modified the buffer. If
--- 2572,2581 ----
 PyUnicodeObject *u;
 
! u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
 if (u == NULL)
 	return NULL;
+ 
+ Py_UNICODE_COPY(u->str, self->str, self->length);
+ 
 if (!fixfct(u)) {
 	/* fixfct should return TRUE if it modified the buffer. If
***************
*** 2699,2703 ****
 	itemlen = PyUnicode_GET_SIZE(item);
 	while (reslen + itemlen + seplen >= sz) {
! 	 if (_PyUnicode_Resize(res, sz*2))
 		goto onError;
 	 sz *= 2;
--- 2766,2770 ----
 	itemlen = PyUnicode_GET_SIZE(item);
 	while (reslen + itemlen + seplen >= sz) {
! 	 if (_PyUnicode_Resize(&res, sz*2))
 		goto onError;
 	 sz *= 2;
***************
*** 2705,2718 ****
 	}
 	if (i > 0) {
! 	 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
 	 p += seplen;
 	 reslen += seplen;
 	}
! 	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
 	p += itemlen;
 	reslen += itemlen;
 	Py_DECREF(item);
 }
! if (_PyUnicode_Resize(res, reslen))
 	goto onError;
 
--- 2772,2785 ----
 	}
 	if (i > 0) {
! 	 Py_UNICODE_COPY(p, sep, seplen);
 	 p += seplen;
 	 reslen += seplen;
 	}
! 	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
 	p += itemlen;
 	reslen += itemlen;
 	Py_DECREF(item);
 }
! if (_PyUnicode_Resize(&res, reslen))
 	goto onError;
 
***************
*** 3002,3009 ****
 	 
 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
! self->str,
 self->length
 );
! if (u)
 for (i = 0; i < u->length; i++)
 if (u->str[i] == u1) {
--- 3069,3078 ----
 	 
 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
! NULL,
 self->length
 );
! if (u != NULL) {
! 		Py_UNICODE_COPY(u->str, self->str, 
! 				self->length);
 for (i = 0; i < u->length; i++)
 if (u->str[i] == u1) {
***************
*** 3013,3016 ****
--- 3082,3086 ----
 }
 }
+ }
 
 } else {
***************
*** 4779,4783 ****
 		rescnt = fmtcnt + 100;
 		reslen += rescnt;
! 		if (_PyUnicode_Resize(result, reslen) < 0)
 		 return NULL;
 		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
--- 4849,4853 ----
 		rescnt = fmtcnt + 100;
 		reslen += rescnt;
! 		if (_PyUnicode_Resize(&result, reslen) < 0)
 		 return NULL;
 		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
***************
*** 5070,5074 ****
 		rescnt = width + fmtcnt + 100;
 		reslen += rescnt;
! 		if (_PyUnicode_Resize(result, reslen) < 0)
 		 return NULL;
 		res = PyUnicode_AS_UNICODE(result)
--- 5140,5144 ----
 		rescnt = width + fmtcnt + 100;
 		reslen += rescnt;
! 		if (_PyUnicode_Resize(&result, reslen) < 0)
 		 return NULL;
 		res = PyUnicode_AS_UNICODE(result)
***************
*** 5111,5115 ****
 		}
 	 }
! 	 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
 	 res += len;
 	 rescnt -= len;
--- 5181,5185 ----
 		}
 	 }
! 	 Py_UNICODE_COPY(res, pbuf, len);
 	 res += len;
 	 rescnt -= len;
***************
*** 5136,5140 ****
 }
 Py_DECREF(uformat);
! if (_PyUnicode_Resize(result, reslen - rescnt))
 	goto onError;
 return (PyObject *)result;
--- 5206,5210 ----
 }
 Py_DECREF(uformat);
! if (_PyUnicode_Resize(&result, reslen - rescnt))
 	goto onError;
 return (PyObject *)result;
***************
*** 5185,5188 ****
--- 5255,5260 ----
 void _PyUnicode_Init(void)
 {
+ int i;
+ 
 /* Doublecheck the configuration... */
 if (sizeof(Py_UNICODE) != 2)
***************
*** 5195,5198 ****
--- 5267,5272 ----
 unicode_empty = _PyUnicode_New(0);
 strcpy(unicode_default_encoding, "ascii");
+ for (i = 0; i < 256; i++)
+ 	unicode_latin1[i] = NULL;
 }
 
***************
*** 5203,5209 ****
--- 5277,5291 ----
 {
 PyUnicodeObject *u;
+ int i;
 
 Py_XDECREF(unicode_empty);
 unicode_empty = NULL;
+ 
+ for (i = 0; i < 256; i++) {
+ 	if (unicode_latin1[i]) {
+ 	 Py_DECREF(unicode_latin1[i]);
+ 	 unicode_latin1[i] = NULL;
+ 	}
+ }
 
 for (u = unicode_freelist; u != NULL;) {