[Python-checkins] bpo-39087: Optimize PyUnicode_AsUTF8AndSize() (GH-18327)

Wed Feb 26 23:49:04 EST 2020

https://github.com/python/cpython/commit/02a4d57263a9846de35b0db12763ff9e7326f62c
commit: 02a4d57263a9846de35b0db12763ff9e7326f62c
branch: master
author: Inada Naoki <songofacandy at gmail.com>
committer: GitHub <noreply at github.com>
date: 2020-02-27T13:48:59+09:00
summary:
bpo-39087: Optimize PyUnicode_AsUTF8AndSize() (GH-18327)
Avoid using temporary bytes object.
files:
A Misc/NEWS.d/next/Core and Builtins/2020-02-03-21-12-39.bpo-39087.YnbUpL.rst
M Objects/stringlib/codecs.h
M Objects/unicodeobject.c

diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-02-03-21-12-39.bpo-39087.YnbUpL.rst b/Misc/NEWS.d/next/Core and Builtins/2020-02-03-21-12-39.bpo-39087.YnbUpL.rst
new file mode 100644
index 0000000000000..847f78f5b182e
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2020-02-03-21-12-39.bpo-39087.YnbUpL.rst	
@@ -0,0 +1,2 @@
+Optimize :c:func:`PyUnicode_AsUTF8` and :c:func:`PyUnicode_AsUTF8AndSize`
+slightly when they need to create internal UTF-8 cache.
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
index 269a5581f7005..eb42e071751d7 100644
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@@ -256,8 +256,9 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
 /* UTF-8 encoder specialized for a Unicode kind to avoid the slow
 PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
 UCS-1 strings don't need to handle surrogates for example. */
-Py_LOCAL_INLINE(PyObject *)
-STRINGLIB(utf8_encoder)(PyObject *unicode,
+Py_LOCAL_INLINE(char *)
+STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
+ PyObject *unicode,
 STRINGLIB_CHAR *data,
 Py_ssize_t size,
 _Py_error_handler error_handler,
@@ -277,17 +278,16 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
 #else /* STRINGLIB_SIZEOF_CHAR == 4 */
 const Py_ssize_t max_char_size = 4;
 #endif
- _PyBytesWriter writer;
 
 assert(size >= 0);
- _PyBytesWriter_Init(&writer);
-
 if (size > PY_SSIZE_T_MAX / max_char_size) {
 /* integer overflow */
- return PyErr_NoMemory();
+ PyErr_NoMemory();
+ return NULL;
 }
 
- p = _PyBytesWriter_Alloc(&writer, size * max_char_size);
+ _PyBytesWriter_Init(writer);
+ p = _PyBytesWriter_Alloc(writer, size * max_char_size);
 if (p == NULL)
 return NULL;
 
@@ -323,7 +323,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
 endpos++;
 
 /* Only overallocate the buffer if it's not the last write */
- writer.overallocate = (endpos < size);
+ writer->overallocate = (endpos < size);
 
 switch (error_handler)
 {
@@ -347,8 +347,8 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
 
 case _Py_ERROR_BACKSLASHREPLACE:
 /* subtract preallocated bytes */
- writer.min_size -= max_char_size * (endpos - startpos);
- p = backslashreplace(&writer, p,
+ writer->min_size -= max_char_size * (endpos - startpos);
+ p = backslashreplace(writer, p,
 unicode, startpos, endpos);
 if (p == NULL)
 goto error;
@@ -357,8 +357,8 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
 
 case _Py_ERROR_XMLCHARREFREPLACE:
 /* subtract preallocated bytes */
- writer.min_size -= max_char_size * (endpos - startpos);
- p = xmlcharrefreplace(&writer, p,
+ writer->min_size -= max_char_size * (endpos - startpos);
+ p = xmlcharrefreplace(writer, p,
 unicode, startpos, endpos);
 if (p == NULL)
 goto error;
@@ -387,10 +387,10 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
 goto error;
 
 /* subtract preallocated bytes */
- writer.min_size -= max_char_size * (newpos - startpos);
+ writer->min_size -= max_char_size * (newpos - startpos);
 
 if (PyBytes_Check(rep)) {
- p = _PyBytesWriter_WriteBytes(&writer, p,
+ p = _PyBytesWriter_WriteBytes(writer, p,
 PyBytes_AS_STRING(rep),
 PyBytes_GET_SIZE(rep));
 }
@@ -406,7 +406,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
 goto error;
 }
 
- p = _PyBytesWriter_WriteBytes(&writer, p,
+ p = _PyBytesWriter_WriteBytes(writer, p,
 PyUnicode_DATA(rep),
 PyUnicode_GET_LENGTH(rep));
 }
@@ -420,7 +420,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
 
 /* If overallocation was disabled, ensure that it was the last
 write. Otherwise, we missed an optimization */
- assert(writer.overallocate || i == size);
+ assert(writer->overallocate || i == size);
 }
 else
 #if STRINGLIB_SIZEOF_CHAR > 2
@@ -449,14 +449,13 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
 Py_XDECREF(error_handler_obj);
 Py_XDECREF(exc);
 #endif
- return _PyBytesWriter_Finish(&writer, p);
+ return p;
 
 #if STRINGLIB_SIZEOF_CHAR > 1
 error:
 Py_XDECREF(rep);
 Py_XDECREF(error_handler_obj);
 Py_XDECREF(exc);
- _PyBytesWriter_Dealloc(&writer);
 return NULL;
 #endif
 }
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index ee6d3dfd3945b..e0a666f70da36 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3991,11 +3991,11 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
 }
 
 
+static int unicode_fill_utf8(PyObject *unicode);
+
 const char *
 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
 {
- PyObject *bytes;
-
 if (!PyUnicode_Check(unicode)) {
 PyErr_BadArgument();
 return NULL;
@@ -4004,21 +4004,9 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
 return NULL;
 
 if (PyUnicode_UTF8(unicode) == NULL) {
- assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
- bytes = _PyUnicode_AsUTF8String(unicode, NULL);
- if (bytes == NULL)
- return NULL;
- _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
- if (_PyUnicode_UTF8(unicode) == NULL) {
- PyErr_NoMemory();
- Py_DECREF(bytes);
+ if (unicode_fill_utf8(unicode) == -1) {
 return NULL;
 }
- _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
- memcpy(_PyUnicode_UTF8(unicode),
- PyBytes_AS_STRING(bytes),
- _PyUnicode_UTF8_LENGTH(unicode) + 1);
- Py_DECREF(bytes);
 }
 
 if (psize)
@@ -5381,10 +5369,6 @@ static PyObject *
 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
 const char *errors)
 {
- enum PyUnicode_Kind kind;
- void *data;
- Py_ssize_t size;
-
 if (!PyUnicode_Check(unicode)) {
 PyErr_BadArgument();
 return NULL;
@@ -5397,9 +5381,12 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
 PyUnicode_UTF8_LENGTH(unicode));
 
- kind = PyUnicode_KIND(unicode);
- data = PyUnicode_DATA(unicode);
- size = PyUnicode_GET_LENGTH(unicode);
+ enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
+ void *data = PyUnicode_DATA(unicode);
+ Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
+
+ _PyBytesWriter writer;
+ char *end;
 
 switch (kind) {
 default:
@@ -5407,12 +5394,73 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
 case PyUnicode_1BYTE_KIND:
 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
 assert(!PyUnicode_IS_ASCII(unicode));
- return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
+ end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
+ break;
+ case PyUnicode_2BYTE_KIND:
+ end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
+ break;
+ case PyUnicode_4BYTE_KIND:
+ end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
+ break;
+ }
+
+ if (end == NULL) {
+ _PyBytesWriter_Dealloc(&writer);
+ return NULL;
+ }
+ return _PyBytesWriter_Finish(&writer, end);
+}
+
+static int
+unicode_fill_utf8(PyObject *unicode)
+{
+ /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
+ assert(!PyUnicode_IS_ASCII(unicode));
+
+ enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
+ void *data = PyUnicode_DATA(unicode);
+ Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
+
+ _PyBytesWriter writer;
+ char *end;
+
+ switch (kind) {
+ default:
+ Py_UNREACHABLE();
+ case PyUnicode_1BYTE_KIND:
+ end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
+ _Py_ERROR_STRICT, NULL);
+ break;
 case PyUnicode_2BYTE_KIND:
- return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
+ end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
+ _Py_ERROR_STRICT, NULL);
+ break;
 case PyUnicode_4BYTE_KIND:
- return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
+ end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
+ _Py_ERROR_STRICT, NULL);
+ break;
+ }
+ if (end == NULL) {
+ _PyBytesWriter_Dealloc(&writer);
+ return -1;
+ }
+
+ char *start = writer.use_small_buffer ? writer.small_buffer :
+ PyBytes_AS_STRING(writer.buffer);
+ Py_ssize_t len = end - start;
+
+ char *cache = PyObject_MALLOC(len + 1);
+ if (cache == NULL) {
+ _PyBytesWriter_Dealloc(&writer);
+ PyErr_NoMemory();
+ return -1;
 }
+ _PyUnicode_UTF8(unicode) = cache;
+ _PyUnicode_UTF8_LENGTH(unicode) = len;
+ memcpy(cache, start, len);
+ cache[len] = '\0';
+ _PyBytesWriter_Dealloc(&writer);
+ return 0;
 }
 
 PyObject *