Commit a3ce2f7

authored

gh-55531: Implement normalize_encoding in C (#136643)

1 parent 6826166 commit a3ce2f7 (Copy full SHA for a3ce2f7)

File tree

6 files changed

+123

-22

lines changed

Lib/encodings
- __init__.py
Misc/NEWS.d/next/Library
- 2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst
Modules
- _codecsmodule.c
- clinic
  - _codecsmodule.c.h
Objects
- unicodeobject.c
Python
- fileutils.c

6 files changed

+123

-22

lines changed

`‎Lib/encodings/__init__.py‎`

Lines changed: 2 additions & 12 deletions

Original file line number	Diff line number	Diff line change
`@@ -30,6 +30,7 @@`
`30`	`30`
`31`	`31`	`import codecs`
`32`	`32`	`import sys`
	`33`	`+from _codecs import _normalize_encoding`
`33`	`34`	`from . import aliases`
`34`	`35`
`35`	`36`	`_cache = {}`
`@@ -55,18 +56,7 @@ def normalize_encoding(encoding):`
`55`	`56`	`if isinstance(encoding, bytes):`
`56`	`57`	`encoding = str(encoding, "ascii")`
`57`	`58`
`58`		`- chars = []`
`59`		`- punct = False`
`60`		`- for c in encoding:`
`61`		`- if c.isalnum() or c == '.':`
`62`		`- if punct and chars:`
`63`		`- chars.append('_')`
`64`		`- if c.isascii():`
`65`		`- chars.append(c)`
`66`		`- punct = False`
`67`		`- else:`
`68`		`- punct = True`
`69`		`- return ''.join(chars)`
	`59`	`+ return _normalize_encoding(encoding)`
`70`	`60`
`71`	`61`	`def search_function(encoding):`
`72`	`62`

`‎Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst‎`

Lines changed: 4 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,4 @@`
	`1`	+:mod:`encodings`: Improve :func:`~encodings.normalize_encoding` performance
	`2`	`+by implementing the function in C using the private`
	`3`	+``_Py_normalize_encoding`` which has been modified to make lowercase
	`4`	`+conversion optional.`

`‎Modules/_codecsmodule.c‎`

Lines changed: 42 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -1018,6 +1018,47 @@ _codecs_lookup_error_impl(PyObject *module, const char *name)`
`1018`	`1018`	`return PyCodec_LookupError(name);`
`1019`	`1019`	`}`
`1020`	`1020`
	`1021`	`+extern int _Py_normalize_encoding(const char *, char *, size_t, int);`
	`1022`	`+`
	`1023`	`+/*[clinic input]`
	`1024`	`+_codecs._normalize_encoding`
	`1025`	`+ encoding: unicode`
	`1026`	`+`
	`1027`	`+Normalize an encoding name *encoding*.`
	`1028`	`+`
	`1029`	`+Used for encodings.normalize_encoding. Does not convert to lower case.`
	`1030`	`+[clinic start generated code]*/`
	`1031`	`+`
	`1032`	`+static PyObject *`
	`1033`	`+_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding)`
	`1034`	`+/*[clinic end generated code: output=d27465d81e361f8e input=3ff3f4d64995b988]*/`
	`1035`	`+{`
	`1036`	`+ Py_ssize_t len;`
	`1037`	`+ const char *cstr = PyUnicode_AsUTF8AndSize(encoding, &len);`
	`1038`	`+ if (cstr == NULL) {`
	`1039`	`+ return NULL;`
	`1040`	`+ }`
	`1041`	`+`
	`1042`	`+ if (len > PY_SSIZE_T_MAX) {`
	`1043`	`+ PyErr_SetString(PyExc_OverflowError, "encoding is too large");`
	`1044`	`+ return NULL;`
	`1045`	`+ }`
	`1046`	`+`
	`1047`	`+ char *normalized = PyMem_Malloc(len + 1);`
	`1048`	`+ if (normalized == NULL) {`
	`1049`	`+ return PyErr_NoMemory();`
	`1050`	`+ }`
	`1051`	`+`
	`1052`	`+ if (!_Py_normalize_encoding(cstr, normalized, len + 1, 0)) {`
	`1053`	`+ PyMem_Free(normalized);`
	`1054`	`+ return NULL;`
	`1055`	`+ }`
	`1056`	`+`
	`1057`	`+ PyObject *result = PyUnicode_FromString(normalized);`
	`1058`	`+ PyMem_Free(normalized);`
	`1059`	`+ return result;`
	`1060`	`+}`
	`1061`	`+`
`1021`	`1062`	`/* --- Module API --------------------------------------------------------- */`
`1022`	`1063`
`1023`	`1064`	`static PyMethodDef _codecs_functions[] = {`
`@@ -1067,6 +1108,7 @@ static PyMethodDef _codecs_functions[] = {`
`1067`	`1108`	`_CODECS_REGISTER_ERROR_METHODDEF`
`1068`	`1109`	`_CODECS__UNREGISTER_ERROR_METHODDEF`
`1069`	`1110`	`_CODECS_LOOKUP_ERROR_METHODDEF`
	`1111`	`+ _CODECS__NORMALIZE_ENCODING_METHODDEF`
`1070`	`1112`	`{NULL, NULL} /* sentinel */`
`1071`	`1113`	`};`
`1072`	`1114`

`‎Modules/clinic/_codecsmodule.c.h‎`

Lines changed: 65 additions & 1 deletion

Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

`‎Objects/unicodeobject.c‎`

Lines changed: 8 additions & 7 deletions

Original file line number	Diff line number	Diff line change
`@@ -3449,13 +3449,14 @@ PyUnicode_FromEncodedObject(PyObject *obj,`
`3449`	`3449`	`return v;`
`3450`	`3450`	`}`
`3451`	`3451`
`3452`		`-/* Normalize an encoding name: similar to encodings.normalize_encoding(), but`
`3453`		`- also convert to lowercase. Return 1 on success, or 0 on error (encoding is`
`3454`		`- longer than lower_len-1). */`
	`3452`	`+/* Normalize an encoding name like encodings.normalize_encoding()`
	`3453`	`+ but allow to convert to lowercase if to_lower is true.`
	`3454`	`+ Return 1 on success, or 0 on error (encoding is longer than lower_len-1). */`
`3455`	`3455`	`int`
`3456`	`3456`	`_Py_normalize_encoding(const char *encoding,`
`3457`	`3457`	`char *lower,`
`3458`		`- size_t lower_len)`
	`3458`	`+ size_t lower_len,`
	`3459`	`+ int to_lower)`
`3459`	`3460`	`{`
`3460`	`3461`	`const char *e;`
`3461`	`3462`	`char *l;`
`@@ -3486,7 +3487,7 @@ _Py_normalize_encoding(const char *encoding,`
`3486`	`3487`	`if (l == l_end) {`
`3487`	`3488`	`return 0;`
`3488`	`3489`	`}`
`3489`		`- *l++ = Py_TOLOWER(c);`
	`3490`	`+ *l++ = to_lower ? Py_TOLOWER(c) : c;`
`3490`	`3491`	`}`
`3491`	`3492`	`else {`
`3492`	`3493`	`punct = 1;`
`@@ -3521,7 +3522,7 @@ PyUnicode_Decode(const char *s,`
`3521`	`3522`	`}`
`3522`	`3523`
`3523`	`3524`	`/* Shortcuts for common default encodings */`
`3524`		`- if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {`
	`3525`	`+ if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {`
`3525`	`3526`	`char *lower = buflower;`
`3526`	`3527`
`3527`	`3528`	`/* Fast paths */`
`@@ -3778,7 +3779,7 @@ PyUnicode_AsEncodedString(PyObject *unicode,`
`3778`	`3779`	`}`
`3779`	`3780`
`3780`	`3781`	`/* Shortcuts for common default encodings */`
`3781`		`- if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {`
	`3782`	`+ if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {`
`3782`	`3783`	`char *lower = buflower;`
`3783`	`3784`
`3784`	`3785`	`/* Fast paths */`

`‎Python/fileutils.c‎`

Lines changed: 2 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -178,7 +178,7 @@ _Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)`
`178`	`178`
`179`	`179`	`#define USE_FORCE_ASCII`
`180`	`180`
`181`		`-extern int _Py_normalize_encoding(const char *, char *, size_t);`
	`181`	`+extern int _Py_normalize_encoding(const char *, char *, size_t, int);`
`182`	`182`
`183`	`183`	`/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale`
`184`	`184`	`and POSIX locale. nl_langinfo(CODESET) announces an alias of the`
`@@ -229,7 +229,7 @@ check_force_ascii(void)`
`229`	`229`	`}`
`230`	`230`
`231`	`231`	`char encoding[20]; /* longest name: "iso_646.irv_1991\0" */`
`232`		`- if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) {`
	`232`	`+ if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding), 1)) {`
`233`	`233`	`goto error;`
`234`	`234`	`}`
`235`	`235`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

Commit a3ce2f7

File tree

6 files changed

6 files changed

`‎Lib/encodings/__init__.py‎`

`‎Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst‎`

`‎Modules/_codecsmodule.c‎`

`‎Modules/clinic/_codecsmodule.c.h‎`

`‎Objects/unicodeobject.c‎`

`‎Python/fileutils.c‎`

0 commit comments

Uh oh!

File tree

6 files changed

6 files changed

‎Lib/encodings/__init__.py‎

‎Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst‎

‎Modules/_codecsmodule.c‎

‎Modules/clinic/_codecsmodule.c.h‎

‎Objects/unicodeobject.c‎

‎Python/fileutils.c‎

0 commit comments

`‎Lib/encodings/__init__.py‎`

`‎Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst‎`

`‎Modules/_codecsmodule.c‎`

`‎Modules/clinic/_codecsmodule.c.h‎`

`‎Objects/unicodeobject.c‎`

`‎Python/fileutils.c‎`