changeset: 80064:9badfe3a31a7
user: Victor Stinner <victor.stinner@gmail.com>
date: Tue Oct 30 23:12:47 2012 +0100
files: Doc/whatsnew/3.4.rst Misc/NEWS Objects/unicodeobject.c
description:
Close #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster
Patch written by Serhiy Storchaka.
diff -r 43d87cdf9457 -r 9badfe3a31a7 Doc/whatsnew/3.4.rst
--- a/Doc/whatsnew/3.4.rst	Tue Oct 30 22:43:19 2012 +0100
+++ b/Doc/whatsnew/3.4.rst	Tue Oct 30 23:12:47 2012 +0100
@@ -157,7 +157,7 @@
 
 Major performance enhancements have been added:
 
-* None yet.
+* The UTF-32 decoder is now 3x to 4x faster.
 
 
 Build and C API Changes
diff -r 43d87cdf9457 -r 9badfe3a31a7 Misc/NEWS
--- a/Misc/NEWS	Tue Oct 30 22:43:19 2012 +0100
+++ b/Misc/NEWS	Tue Oct 30 23:12:47 2012 +0100
@@ -10,6 +10,9 @@
 Core and Builtins
 -----------------
 
+- Issue #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster. Patch
+ written by Serhiy Storchaka.
+
 - Issue #16197: Update winreg docstrings and documentation to match code.
 Patch by Zachary Ware.
 
diff -r 43d87cdf9457 -r 9badfe3a31a7 Objects/unicodeobject.c
--- a/Objects/unicodeobject.c	Tue Oct 30 22:43:19 2012 +0100
+++ b/Objects/unicodeobject.c	Tue Oct 30 23:12:47 2012 +0100
@@ -4804,14 +4804,8 @@
 Py_ssize_t outpos;
 PyObject *unicode;
 const unsigned char *q, *e;
- int bo = 0; /* assume native ordering by default */
+ int le, bo = 0; /* assume native ordering by default */
 const char *errmsg = "";
- /* Offsets from q for retrieving bytes in the right order. */
-#if PY_LITTLE_ENDIAN
- int iorder[] = {0, 1, 2, 3};
-#else
- int iorder[] = {3, 2, 1, 0};
-#endif
 PyObject *errorHandler = NULL;
 PyObject *exc = NULL;
 
@@ -4825,83 +4819,88 @@
 byte order setting accordingly. In native mode, the leading BOM
 mark is skipped, in all other modes, it is copied to the output
 stream as-is (giving a ZWNBSP character). */
- if (bo == 0) {
- if (size>= 4) {
- const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
- (q[iorder[1]] << 8) | q[iorder[0]];
-#if PY_LITTLE_ENDIAN
- if (bom == 0x0000FEFF) {
- q += 4;
- bo = -1;
- }
- else if (bom == 0xFFFE0000) {
- q += 4;
- bo = 1;
- }
+ if (bo == 0 && size>= 4) {
+ Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
+ if (bom == 0x0000FEFF) {
+ bo = -1;
+ q += 4;
+ }
+ else if (bom == 0xFFFE0000) {
+ bo = 1;
+ q += 4;
+ }
+ if (byteorder)
+ *byteorder = bo;
+ }
+
+ if (q == e) {
+ if (consumed)
+ *consumed = size;
+ Py_INCREF(unicode_empty);
+ return unicode_empty;
+ }
+
+#ifdef WORDS_BIGENDIAN
+ le = bo < 0;
 #else
- if (bom == 0x0000FEFF) {
- q += 4;
- bo = 1;
- }
- else if (bom == 0xFFFE0000) {
+ le = bo <= 0;
+#endif
+
+ unicode = PyUnicode_New((e - q + 3) / 4, 127);
+ if (!unicode)
+ return NULL;
+
+ outpos = 0;
+ while (1) {
+ Py_UCS4 ch = 0;
+ Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(unicode);
+
+ if (e - q>= 4) {
+ enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
+ void *data = PyUnicode_DATA(unicode);
+ const unsigned char *last = e - 4;
+ if (le) {
+ do {
+ ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
+ if (ch> maxch)
+ break;
+ PyUnicode_WRITE(kind, data, outpos++, ch);
+ q += 4;
+ } while (q <= last);
+ }
+ else {
+ do {
+ ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
+ if (ch> maxch)
+ break;
+ PyUnicode_WRITE(kind, data, outpos++, ch);
+ q += 4;
+ } while (q <= last);
+ }
+ }
+
+ if (ch <= maxch) {
+ if (q == e || consumed)
+ break;
+ /* remaining bytes at the end? (size should be divisible by 4) */
+ errmsg = "truncated data";
+ startinpos = ((const char *)q) - starts;
+ endinpos = ((const char *)e) - starts;
+ }
+ else {
+ if (ch < 0x110000) {
+ if (unicode_putchar(&unicode, &outpos, ch) < 0)
+ goto onError;
 q += 4;
- bo = -1;
- }
-#endif
- }
- }
-
- if (bo == -1) {
- /* force LE */
- iorder[0] = 0;
- iorder[1] = 1;
- iorder[2] = 2;
- iorder[3] = 3;
- }
- else if (bo == 1) {
- /* force BE */
- iorder[0] = 3;
- iorder[1] = 2;
- iorder[2] = 1;
- iorder[3] = 0;
- }
-
- /* This might be one to much, because of a BOM */
- unicode = PyUnicode_New((size+3)/4, 127);
- if (!unicode)
- return NULL;
- if (size == 0)
- return unicode;
- outpos = 0;
-
- while (q < e) {
- Py_UCS4 ch;
- /* remaining bytes at the end? (size should be divisible by 4) */
- if (e-q<4) {
- if (consumed)
- break;
- errmsg = "truncated data";
- startinpos = ((const char *)q)-starts;
- endinpos = ((const char *)e)-starts;
- goto utf32Error;
- /* The remaining input chars are ignored if the callback
- chooses to skip the input */
- }
- ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
- (q[iorder[1]] << 8) | q[iorder[0]];
-
- if (ch>= 0x110000)
- {
+ continue;
+ }
 errmsg = "codepoint not in range(0x110000)";
- startinpos = ((const char *)q)-starts;
- endinpos = startinpos+4;
- goto utf32Error;
- }
- if (unicode_putchar(&unicode, &outpos, ch) < 0)
- goto onError;
- q += 4;
- continue;
- utf32Error:
+ startinpos = ((const char *)q) - starts;
+ endinpos = startinpos + 4;
+ }
+
+ /* The remaining input chars are ignored if the callback
+ chooses to skip the input */
 if (unicode_decode_call_errorhandler(
 errors, &errorHandler,
 "utf32", errmsg,
@@ -4910,9 +4909,6 @@
 goto onError;
 }
 
- if (byteorder)
- *byteorder = bo;
-
 if (consumed)
 *consumed = (const char *)q-starts;
 
</div><div class="naked_ctrl">
<form action="/index.cgi/contrast" method="get" name="gate">
<p><a href="http://altstyle.alfasado.net">AltStyle</a> によって変換されたページ <a href="https://hg.python.org/cpython/rev/9badfe3a31a7">(-&gt;オリジナル)</a>
/ <label>アドレス: <input type="text" name="naked_post_url" value="https://hg.python.org/cpython/rev/9badfe3a31a7" size="22" /></label> <label>モード: <select name="naked_post_mode">
<option value="default">デフォルト</option>
<option value="speech">音声ブラウザ</option>
<option value="ruby">ルビ付き</option>
<option value="contrast" selected="selected">配色反転</option>
<option value="larger-text">文字拡大</option>
<option value="mobile">モバイル</option>
</select>
<input type="submit" value="表示" />
</p>
</form>
</div>