[Python-checkins] CVS: python/dist/src/Objects unicodeobject.c,2.40,2.41

M.-A. Lemburg python-dev@python.org
Fri, 7 Jul 2000 10:51:10 -0700


Update of /cvsroot/python/python/dist/src/Objects
In directory slayer.i.sourceforge.net:/tmp/cvs-serv27530/Objects
Modified Files:
	unicodeobject.c 
Log Message:
New surrogate support in the UTF-8 codec. By Bill Tutt.
Index: unicodeobject.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v
retrieving revision 2.40
retrieving revision 2.41
diff -C2 -r2.40 -r2.41
*** unicodeobject.c	2000/07/07 13:46:42	2.40
--- unicodeobject.c	2000/07/07 17:51:08	2.41
***************
*** 658,665 ****
 
 while (s < e) {
! register Py_UNICODE ch = (unsigned char)*s;
 
 if (ch < 0x80) {
! *p++ = ch;
 s++;
 continue;
--- 658,665 ----
 
 while (s < e) {
! Py_UCS4 ch = (unsigned char)*s;
 
 if (ch < 0x80) {
! *p++ = (Py_UNICODE)ch;
 s++;
 continue;
***************
*** 688,692 ****
 UTF8_ERROR("illegal encoding");
 	 else
! 		*p++ = ch;
 break;
 
--- 688,692 ----
 UTF8_ERROR("illegal encoding");
 	 else
! 				*p++ = (Py_UNICODE)ch;
 break;
 
***************
*** 699,703 ****
 UTF8_ERROR("illegal encoding");
 	 else
! 		*p++ = ch;
 break;
 
--- 699,726 ----
 UTF8_ERROR("illegal encoding");
 	 else
! 				*p++ = (Py_UNICODE)ch;
! break;
! 
! case 4:
! if ((s[1] & 0xc0) != 0x80 ||
! (s[2] & 0xc0) != 0x80 ||
! (s[3] & 0xc0) != 0x80)
! UTF8_ERROR("invalid data");
! ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
! ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
! /* validate and convert to UTF-16 */
! if ((ch < 0x10000) || /* minimum value allowed for 4 byte encoding */
! (ch > 0x10ffff)) /* maximum value allowed for UTF-16 */
! UTF8_ERROR("illegal encoding");
! /* compute and append the two surrogates: */
! 
! /* translate from 10000..10FFFF to 0..FFFF */
! ch -= 0x10000;
! 
! /* high surrogate = top 10 bits added to D800 */
! *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
! 
! /* low surrogate = bottom 10 bits added to DC00 */
! *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
 break;
 
***************
*** 759,764 ****
 char *p;
 char *q;
 
! v = PyString_FromStringAndSize(NULL, 3 * size);
 if (v == NULL)
 return NULL;
--- 782,791 ----
 char *p;
 char *q;
+ Py_UCS4 ch2;
+ unsigned int cbAllocated = 3 * size;
+ unsigned int cbWritten = 0;
+ int i = 0;
 
! v = PyString_FromStringAndSize(NULL, cbAllocated);
 if (v == NULL)
 return NULL;
***************
*** 767,788 ****
 
 p = q = PyString_AS_STRING(v);
! while (size-- > 0) {
! Py_UNICODE ch = *s++;
! if (ch < 0x80)
 *p++ = (char) ch;
 else if (ch < 0x0800) {
 *p++ = 0xc0 | (ch >> 6);
 *p++ = 0x80 | (ch & 0x3f);
! 	} else if (0xD800 <= ch && ch <= 0xDFFF) {
! 	 /* These byte ranges are reserved for UTF-16 surrogate
! 	 bytes which the Python implementation currently does
! 	 not support. */
! 	 if (utf8_encoding_error(&s, &p, errors, 
! 				 "unsupported code range"))
 		goto onError;
! } else {
! *p++ = 0xe0 | (ch >> 12);
! *p++ = 0x80 | ((ch >> 6) & 0x3f);
! *p++ = 0x80 | (ch & 0x3f);
 }
 }
--- 794,839 ----
 
 p = q = PyString_AS_STRING(v);
! while (i < size) {
! Py_UCS4 ch = s[i++];
! if (ch < 0x80) {
 *p++ = (char) ch;
+ cbWritten++;
+ }
 else if (ch < 0x0800) {
 *p++ = 0xc0 | (ch >> 6);
 *p++ = 0x80 | (ch & 0x3f);
! cbWritten += 2;
! }
! else {
! /* Check for high surrogate */
! if (0xD800 <= ch && ch <= 0xDBFF) {
! if (i != size) {
! ch2 = s[i];
! if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
! 
! if (cbWritten >= (cbAllocated - 4)) {
! 			 /* Provide enough room for some more
! 			 surrogates */
! 			 cbAllocated += 4*10;
! if (_PyString_Resize(&v, cbAllocated))
 		goto onError;
! }
! 
! /* combine the two values */
! ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
! 
! *p++ = (char)((ch >> 18) | 0xf0);
! *p++ = (char)(0x80 | (ch >> 12) & 0x3f);
! i++;
! cbWritten += 4;
! }
! }
! }
! else {
! *p++ = (char)(0xe0 | (ch >> 12));
! cbWritten += 3;
! }
! *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
! *p++ = (char)(0x80 | (ch & 0x3f));
 }
 }
***************
*** 1218,1222 ****
 const char *start = s + 1;
 const char *endBrace = start;
! unsigned int uiValue;
 unsigned long j;
 
--- 1269,1273 ----
 const char *start = s + 1;
 const char *endBrace = start;
! Py_UCS4 value;
 unsigned long j;
 
***************
*** 1249,1258 ****
 goto ucnFallthrough;
 }
! uiValue = ((_Py_UnicodeCharacterName *)
! (pucnHash->getValue(j)))->uiValue;
! if (uiValue < 1<<16)
 {
 /* In UCS-2 range, easy solution.. */
! *p++ = uiValue;
 }
 else
--- 1300,1309 ----
 goto ucnFallthrough;
 }
! value = ((_Py_UnicodeCharacterName *)
! (pucnHash->getValue(j)))->value;
! if (value < 1<<16)
 {
 /* In UCS-2 range, easy solution.. */
! *p++ = value;
 }
 else
***************
*** 1261,1271 ****
 /* compute and append the two surrogates: */
 /* translate from 10000..10FFFF to 0..FFFFF */
! uiValue -= 0x10000;
 
 /* high surrogate = top 10 bits added to D800 */
! *p++ = 0xD800 + (uiValue >> 10);
 
 /* low surrogate = bottom 10 bits added to DC00 */
! *p++ = 0xDC00 + (uiValue & ~0xFC00);
 }
 s = endBrace + 1;
--- 1312,1322 ----
 /* compute and append the two surrogates: */
 /* translate from 10000..10FFFF to 0..FFFFF */
! value -= 0x10000;
 
 /* high surrogate = top 10 bits added to D800 */
! *p++ = 0xD800 + (value >> 10);
 
 /* low surrogate = bottom 10 bits added to DC00 */
! *p++ = 0xDC00 + (value & ~0xFC00);
 }
 s = endBrace + 1;
***************
*** 3092,3101 ****
 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
 
! static unsigned long utf16Fixup[32] =
 {
 0, 0, 0, 0, 0, 0, 0, 0, 
 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 
! 0, 0, 0, 0x2000, 0xf800, 0xf800, 0xf800, 0xf800
 };
 
--- 3143,3152 ----
 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
 
! static short utf16Fixup[32] =
 {
 0, 0, 0, 0, 0, 0, 0, 0, 
 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 
! 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
 };
 
***************
*** 3112,3116 ****
 
 while (len1 > 0 && len2 > 0) {
! 	unsigned long c1, c2;
 	long diff;
 
--- 3163,3167 ----
 
 while (len1 > 0 && len2 > 0) {
! Py_UNICODE c1, c2; 
 	long diff;
 

AltStyle によって変換されたページ (->オリジナル) /