[Python-checkins] r54127 - sandbox/trunk/pep3101/test_simpleformat.py sandbox/trunk/pep3101/unicodeformat.c

Mon Mar 5 00:37:38 CET 2007

Author: eric.smith
Date: Mon Mar 5 00:37:37 2007
New Revision: 54127
Modified:
 sandbox/trunk/pep3101/test_simpleformat.py
 sandbox/trunk/pep3101/unicodeformat.c
Log:
Decimal ('d') unicode formatting complete. Modified test suite to test for both Unicode and string versions, for some tests.
Modified: sandbox/trunk/pep3101/test_simpleformat.py
==============================================================================

--- sandbox/trunk/pep3101/test_simpleformat.py	(original)
+++ sandbox/trunk/pep3101/test_simpleformat.py	Mon Mar 5 00:37:37 2007
@@ -20,6 +20,16 @@
 val = pep3101.format(text, *args, **kwargs)
 self.assertEquals(val, result)
 
+ def formatEqualsWithUnicode(self, result, text, *args, **kwargs):
+ text = str(text)
+ result = str(result)
+ val = pep3101.format(text, *args, **kwargs)
+ self.assertEquals(val, result)
+
+ # a quick check for unicode version
+ val = pep3101.format(unicode(text), *args, **kwargs)
+ self.assertEquals(val, unicode(result))
+
 def formatRaises(self, exc, text, *args, **kwargs):
 exc = exc or Exception #StringFormat.FormatError
 text = str(text)
@@ -100,64 +110,64 @@
 self.formatEquals("0.1515_.0%", "{0:.0%}", .1515)
 
 def test_string_specifiers(self):
- self.formatEquals("abc", "{0:.3s}", "abc")
+ self.formatEqualsWithUnicode("abc", "{0:.3s}", "abc")
 
- self.formatEquals("ab", "{0:.3s}", "ab")
+ self.formatEqualsWithUnicode("ab", "{0:.3s}", "ab")
 
- self.formatEquals("abc", "{0:.3s}", "abcdef")
- self.formatEquals("resultx", "{0:x<7s}", "result")
- self.formatEquals("resultxx", "{0:x<8s}", "result")
- self.formatEquals("result ", "{0: <7s}", "result")
- self.formatEquals("result ", "{0:<7s}", "result")
- self.formatEquals(" result", "{0:>7s}", "result")
- self.formatEquals(" result", "{0:>8s}", "result")
+ self.formatEqualsWithUnicode("abc", "{0:.3s}", "abcdef")
+ self.formatEqualsWithUnicode("resultx", "{0:x<7s}", "result")
+ self.formatEqualsWithUnicode("resultxx", "{0:x<8s}", "result")
+ self.formatEqualsWithUnicode("result ", "{0: <7s}", "result")
+ self.formatEqualsWithUnicode("result ", "{0:<7s}", "result")
+ self.formatEqualsWithUnicode(" result", "{0:>7s}", "result")
+ self.formatEqualsWithUnicode(" result", "{0:>8s}", "result")
 
 def test_repr_specifiers(self):
- self.formatEquals("3", "{0:r}", 3)
- self.formatEquals("3.141", "{0:5r}", 3.141592654)
+ self.formatEqualsWithUnicode("3", "{0:r}", 3)
+ self.formatEqualsWithUnicode("3.141", "{0:5r}", 3.141592654)
 
 # I'm not sure this is a good test, since the quoting might change
- self.formatEquals("'abcdefg'", "{0:r}", "abcdefg")
- self.formatEquals("'abcdefg", "{0:8r}", "abcdefg")
+ self.formatEqualsWithUnicode("'abcdefg'", "{0:r}", "abcdefg")
+ self.formatEqualsWithUnicode("'abcdefg", "{0:8r}", "abcdefg")
 
 def test_decimal_specifiers(self):
 self.assertRaises(TypeError, "{0:d}", "non-number")
 
- self.formatEquals("0", "{0:d}", 0)
- self.formatEquals("123", "{0:d}", 123)
- self.formatEquals("-123", "{0:d}", -123)
- self.formatEquals("+123", "{0:+d}", 123)
- self.formatEquals("-123", "{0:+d}", -123)
- self.formatEquals("123", "{0:-d}", 123)
- self.formatEquals("-123", "{0:-d}", -123)
- self.formatEquals("123", "{0:()d}", 123)
- self.formatEquals("(123)", "{0:()d}", -123)
+ self.formatEqualsWithUnicode("0", "{0:d}", 0)
+ self.formatEqualsWithUnicode("123", "{0:d}", 123)
+ self.formatEqualsWithUnicode("-123", "{0:d}", -123)
+ self.formatEqualsWithUnicode("+123", "{0:+d}", 123)
+ self.formatEqualsWithUnicode("-123", "{0:+d}", -123)
+ self.formatEqualsWithUnicode("123", "{0:-d}", 123)
+ self.formatEqualsWithUnicode("-123", "{0:-d}", -123)
+ self.formatEqualsWithUnicode("123", "{0:()d}", 123)
+ self.formatEqualsWithUnicode("(123)", "{0:()d}", -123)
 
 # need a long padding to force a reallocation (and hopefully a
 # memory move) in 'd' handling
- self.formatEquals(" " * 997 + "100", "{0:1000d}", 100)
+ self.formatEqualsWithUnicode(" " * 997 + "100", "{0:1000d}", 100)
 
 # now test with the 3 kinds of padding
- self.formatEquals("0 ", "{0:<10d}", 0)
- self.formatEquals("123 ", "{0:<10d}", 123)
- self.formatEquals("-123 ", "{0:<10d}", -123)
- self.formatEquals(" 123", "{0:>10d}", 123)
- self.formatEquals(" -123", "{0:>10d}", -123)
- self.formatEquals(" 123", "{0:=10d}", 123)
- self.formatEquals("+ 123", "{0:=+10d}", 123)
- self.formatEquals("- 123", "{0:=10d}", -123)
- self.formatEquals("- 123", "{0:=+10d}", -123)
- self.formatEquals(" 123", "{0:=()10d}", 123)
+ self.formatEqualsWithUnicode("0 ", "{0:<10d}", 0)
+ self.formatEqualsWithUnicode("123 ", "{0:<10d}", 123)
+ self.formatEqualsWithUnicode("-123 ", "{0:<10d}", -123)
+ self.formatEqualsWithUnicode(" 123", "{0:>10d}", 123)
+ self.formatEqualsWithUnicode(" -123", "{0:>10d}", -123)
+ self.formatEqualsWithUnicode(" 123", "{0:=10d}", 123)
+ self.formatEqualsWithUnicode("+ 123", "{0:=+10d}", 123)
+ self.formatEqualsWithUnicode("- 123", "{0:=10d}", -123)
+ self.formatEqualsWithUnicode("- 123", "{0:=+10d}", -123)
+ self.formatEqualsWithUnicode(" 123", "{0:=()10d}", 123)
 
 # XXX I'm not sure this is correct, maybe it should be " (123)"
- self.formatEquals("( 123)", "{0:=()10d}", -123)
+ self.formatEqualsWithUnicode("( 123)", "{0:=()10d}", -123)
 
- self.formatEquals("1" + "0" * 100, "{0:d}", 10**100)
- self.formatEquals("-1" + "0" * 100, "{0:d}", -10**100)
- self.formatEquals("+1" + "0" * 100, "{0:+d}", 10**100)
- self.formatEquals("(1" + "0" * 100 + ")", "{0:()d}", -10**100)
- self.formatEquals("( 1" + "0" * 100 + ")", "{0:()110d}", -10**100)
- self.formatEquals("( 1" + "0" * 100 + ")", "{0:()110d}", -10**100)
+ self.formatEqualsWithUnicode("1" + "0" * 100, "{0:d}", 10**100)
+ self.formatEqualsWithUnicode("-1" + "0" * 100, "{0:d}", -10**100)
+ self.formatEqualsWithUnicode("+1" + "0" * 100, "{0:+d}", 10**100)
+ self.formatEqualsWithUnicode("(1" + "0" * 100 + ")", "{0:()d}", -10**100)
+ self.formatEqualsWithUnicode("( 1" + "0" * 100 + ")", "{0:()110d}", -10**100)
+ self.formatEqualsWithUnicode("( 1" + "0" * 100 + ")", "{0:()110d}", -10**100)
 
 def test_char_specifiers(self):
 self.formatEquals("A", "{0:c}", "A")
Modified: sandbox/trunk/pep3101/unicodeformat.c
==============================================================================
--- sandbox/trunk/pep3101/unicodeformat.c	(original)
+++ sandbox/trunk/pep3101/unicodeformat.c	Mon Mar 5 00:37:37 2007
@@ -901,15 +901,40 @@
 }
 
 
+#if C_UNICODE
+/* taken from unicodeobject.c */
+/* note that since we work backward, the ranges can overlap */
+static Py_ssize_t
+strtounicode(Py_UNICODE *buffer, const char *charbuffer, Py_ssize_t len)
+{
+ register Py_ssize_t i;
+
+ /* don't know the length, calculate it */
+ if (len == -1)
+ len = strlen(charbuffer);
+ for (i = len - 1; i >= 0; i--)
+	buffer[i] = (Py_UNICODE) charbuffer[i];
+
+ return len;
+}
+#endif
+
 /* code liberally borrowed from stringobject.c's formatint() */
 /* into the output buffer, put <sign><number>. the caller will
 justify as needed */
-/* return the total number of bytes written, or -1 for error
- sets pbuf to point to the output buffer */
+/* this code internally uses 8-bit chars, even when formatting
+ unicode. that's because we use PyOS_snprintf() from both 8-bit and
+ unicode. that means we need to cast the allocated pointer, which
+ is always in units of CH_TYPE */
+/* when this function returns, the result will be in char or unicode
+ (CH_TYPE), as needed */
+/* return the total number of characters written, or -1 for error sets
+ pbuf to point to the output buffer */
 static Py_ssize_t
-_format_int(PyObject* v, FmtState *fs, CH_TYPE type, CH_TYPE **pbuf)
+_format_int(PyObject* v, FmtState *fs, char type, CH_TYPE **pbuf)
 {
- CH_TYPE *ptr;
+ char *ptr;
+ char *start;
 Py_ssize_t buflen = MAXLEN_INT_STRING;
 Py_ssize_t len;
 long x;
@@ -923,16 +948,17 @@
 return -1;
 }
 
- if (output_allocate(fs, MAXLEN_INT_STRING, pbuf) == 0) {
+ /* allocate as much space as we'll ever possibly need. note that
+ if we're doing unicode, we allocate as bytes, format as bytes,
+ but convert to unicode, all in the same buffer. */
+ if (output_allocate(fs, MAXLEN_INT_STRING, pbuf) == 0)
 return -1;
- }
 
- /* remember the start of the string */
- ptr = *pbuf;
+ /* remember the start of the string, as 8-bit chars */
+ start = ptr = (char*)*pbuf;
 
 if (x < 0 && (type == 'x' || type == 'X' || type == 'o')) {
- **pbuf = '-';
- *pbuf++;
+ *ptr++ = '-';
 buflen--;
 x = -x;
 }
@@ -942,12 +968,17 @@
 format[1] = type;
 format[2] = '\0';
 
- PyOS_snprintf(*pbuf, buflen, format, x);
+ PyOS_snprintf(ptr, buflen, format, x);
+
 
+ /* convert from chars to unicode, if needed */
+#if C_UNICODE
+ len = strtounicode(*pbuf, start, -1);
+#else
 /* compute the length. I believe this is done because the return value from
 snprintf above is unreliable */
-
- len = strlen(ptr);
+ len = strlen(start);
+#endif
 
 /* shrink the buffer down to how many characters we actually
 wrote. this is cheap, just pointer arithmetic */
@@ -956,11 +987,49 @@
 return len;
 }
 
+static Py_ssize_t
+_format_long(PyObject* v, FmtState *fs, char type, CH_TYPE **pbuf)
+{
+ char* p_charbuf;
+ Py_ssize_t n_allocated;
+ CH_TYPE* ptr;
+ int ok;
+ /* XXX len should probably be Py_ssize_t, but that's not how the
+ function is declared in stringobject.c */
+ int len;
+ PyObject *strobj;
+
+ strobj = _PyString_FormatLong(v, 0, 0, type, &p_charbuf, &len);
+ if (!strobj)
+ return -1;
+
+ n_allocated = len;
+ *pbuf = fs->outstr.ptr;
+
+#if C_UNICODE
+ /* allocate space in the output string, as CH_TYPE */
+ ok = output_allocate(fs, n_allocated, &ptr);
+ if (ok != 0) {
+ strtounicode(ptr, p_charbuf, n_allocated);
+ }
+#else
+ ok = output_data(fs, STROBJ_AS_PTR(strobj), n_allocated);
+#endif
+
+ /* we're done with the string representation */
+ Py_DECREF(strobj);
+
+ if (!ok)
+ return -1;
+ return n_allocated;
+}
+
 static int
 format_decimal(PyObject *fieldobj, FmtState *fs,
 const InternalFormatSpec *format)
 {
 Py_ssize_t width;
+ char* p_abuf;
 CH_TYPE align = format->align;
 CH_TYPE *p_buf;
 CH_TYPE *p_digits; /* pointer to the digits we have */
@@ -981,58 +1050,20 @@
 Py_ssize_t ofs_digits;
 CH_TYPE *tmp;
 
- /*************************************************************************/
- /* first, do everything as ascii *****************************************/
- if (PyLong_Check(fieldobj)) {
- /* a long integer */
-
- /* XXX this should probably be Py_ssize_t, but that's not how
- the function is declared */
- int len;
- int ok;
- PyObject *strobj = _PyString_FormatLong(fieldobj, 0,
- 0, format->type, &p_buf, &len);
+ /* n_allocated includes the total number of characters written,
+ including the sign, if any */
+ /* note that we're potentially converting format->type from
+ Unicode to char, but that's okay because we know what the valid
+ values can be */
 
- if (!strobj)
- return 0;
-
- n_allocated = STROBJ_GET_SIZE(strobj);
- p_buf = fs->outstr.ptr;
-
- /* allocate space in the output, and copy the data */
- ok = output_data(fs, STROBJ_AS_PTR(strobj), n_allocated);
-
- /* we're done with the string representation */
- Py_DECREF(strobj);
-
- if (ok == 0)
- return 0;
+ if (PyLong_Check(fieldobj)) {
+ n_allocated = _format_long(fieldobj, fs, (char)format->type, &p_buf);
 } else {
 /* a regular integer, we can be quicker in this case */
-
- /* n_allocated includes the total number of characters
- written, including the sign, if any */
- n_allocated = _format_int(fieldobj, fs, format->type, &p_buf);
- if (n_allocated < 0)
- return 0;
+ n_allocated = _format_int(fieldobj, fs, (char)format->type, &p_buf);
 }
-
- /* if needed, convert from asci to unicode */
-#if C_UNICODE
- /* taken from unicodeobject.c's strtounicode() */
-#if 0
-strtounicode(Py_UNICODE *buffer, const char *charbuffer)
-{
- register Py_ssize_t i;
- Py_ssize_t len = strlen(charbuffer);
- for (i = len - 1; i >= 0; i--)
-	buffer[i] = (Py_UNICODE) charbuffer[i];
-
- return len;
-}
-#endif
-#endif
- /* end ascii conversion **************************************************/
+ if (n_allocated < 0)
+ return 0;
 
 /* determine if a sign was written, and how many digits we wrote */
 n_digits = n_allocated;