Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit a3ce2f7

Browse files
gh-55531: Implement normalize_encoding in C (#136643)
Closes gh-55531
1 parent 6826166 commit a3ce2f7

File tree

6 files changed

+123
-22
lines changed

6 files changed

+123
-22
lines changed

‎Lib/encodings/__init__.py‎

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030

3131
import codecs
3232
import sys
33+
from _codecs import _normalize_encoding
3334
from . import aliases
3435

3536
_cache = {}
@@ -55,18 +56,7 @@ def normalize_encoding(encoding):
5556
if isinstance(encoding, bytes):
5657
encoding = str(encoding, "ascii")
5758

58-
chars = []
59-
punct = False
60-
for c in encoding:
61-
if c.isalnum() or c == '.':
62-
if punct and chars:
63-
chars.append('_')
64-
if c.isascii():
65-
chars.append(c)
66-
punct = False
67-
else:
68-
punct = True
69-
return ''.join(chars)
59+
return _normalize_encoding(encoding)
7060

7161
def search_function(encoding):
7262

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
:mod:`encodings`: Improve :func:`~encodings.normalize_encoding` performance
2+
by implementing the function in C using the private
3+
``_Py_normalize_encoding`` which has been modified to make lowercase
4+
conversion optional.

‎Modules/_codecsmodule.c‎

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1018,6 +1018,47 @@ _codecs_lookup_error_impl(PyObject *module, const char *name)
10181018
return PyCodec_LookupError(name);
10191019
}
10201020

1021+
extern int _Py_normalize_encoding(const char *, char *, size_t, int);
1022+
1023+
/*[clinic input]
1024+
_codecs._normalize_encoding
1025+
encoding: unicode
1026+
1027+
Normalize an encoding name *encoding*.
1028+
1029+
Used for encodings.normalize_encoding. Does not convert to lower case.
1030+
[clinic start generated code]*/
1031+
1032+
static PyObject *
_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding)
/*[clinic end generated code: output=d27465d81e361f8e input=3ff3f4d64995b988]*/
{
    /* Normalize the encoding name *encoding* (a str) and return the result
       as a new str.  The work is delegated to _Py_normalize_encoding() with
       to_lower=0, so the case of the characters is preserved.

       Returns a new reference on success, or NULL with an exception set on
       failure (UTF-8 conversion error, out of memory, or an internal
       normalization failure). */
    Py_ssize_t len;
    const char *cstr = PyUnicode_AsUTF8AndSize(encoding, &len);
    if (cstr == NULL) {
        return NULL;
    }

    /* NOTE(review): _Py_normalize_encoding() receives no input length, so it
       presumably stops at the first NUL byte; a str with an embedded null
       character would then be normalized only up to that point -- confirm
       whether such input should be rejected instead. */

    /* The output buffer needs len + 1 bytes.  A Py_ssize_t can never be
       greater than PY_SSIZE_T_MAX, so the only length for which the
       addition could overflow is PY_SSIZE_T_MAX itself. */
    if (len == PY_SSIZE_T_MAX) {
        PyErr_SetString(PyExc_OverflowError, "encoding is too large");
        return NULL;
    }

    size_t size = (size_t)len + 1;
    char *normalized = PyMem_Malloc(size);
    if (normalized == NULL) {
        return PyErr_NoMemory();
    }

    if (!_Py_normalize_encoding(cstr, normalized, size, 0)) {
        /* _Py_normalize_encoding() reports failure only through its return
           value (output buffer too small), so set an exception here rather
           than returning NULL with no error indicator. */
        PyMem_Free(normalized);
        PyErr_SetString(PyExc_SystemError,
                        "failed to normalize the encoding name");
        return NULL;
    }

    PyObject *result = PyUnicode_FromString(normalized);
    PyMem_Free(normalized);
    return result;
}
1061+
10211062
/* --- Module API --------------------------------------------------------- */
10221063

10231064
static PyMethodDef _codecs_functions[] = {
@@ -1067,6 +1108,7 @@ static PyMethodDef _codecs_functions[] = {
10671108
_CODECS_REGISTER_ERROR_METHODDEF
10681109
_CODECS__UNREGISTER_ERROR_METHODDEF
10691110
_CODECS_LOOKUP_ERROR_METHODDEF
1111+
_CODECS__NORMALIZE_ENCODING_METHODDEF
10701112
{NULL, NULL} /* sentinel */
10711113
};
10721114

‎Modules/clinic/_codecsmodule.c.h‎

Lines changed: 65 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎Objects/unicodeobject.c‎

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3449,13 +3449,14 @@ PyUnicode_FromEncodedObject(PyObject *obj,
34493449
return v;
34503450
}
34513451

3452-
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
3453-
also convert to lowercase. Return 1 on success, or 0 on error (encoding is
3454-
longer than lower_len-1). */
3452+
/* Normalize an encoding name like encodings.normalize_encoding()
3453+
but also convert to lowercase if *to_lower* is true.
3454+
Return 1 on success, or 0 on error (encoding is longer than lower_len-1). */
34553455
int
34563456
_Py_normalize_encoding(const char *encoding,
34573457
char *lower,
3458-
size_t lower_len)
3458+
size_t lower_len,
3459+
int to_lower)
34593460
{
34603461
const char *e;
34613462
char *l;
@@ -3486,7 +3487,7 @@ _Py_normalize_encoding(const char *encoding,
34863487
if (l == l_end) {
34873488
return 0;
34883489
}
3489-
*l++ = Py_TOLOWER(c);
3490+
*l++ = to_lower ? Py_TOLOWER(c) : c;
34903491
}
34913492
else {
34923493
punct = 1;
@@ -3521,7 +3522,7 @@ PyUnicode_Decode(const char *s,
35213522
}
35223523

35233524
/* Shortcuts for common default encodings */
3524-
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3525+
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
35253526
char *lower = buflower;
35263527

35273528
/* Fast paths */
@@ -3778,7 +3779,7 @@ PyUnicode_AsEncodedString(PyObject *unicode,
37783779
}
37793780

37803781
/* Shortcuts for common default encodings */
3781-
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3782+
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
37823783
char *lower = buflower;
37833784

37843785
/* Fast paths */

‎Python/fileutils.c‎

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ _Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)
178178

179179
#define USE_FORCE_ASCII
180180

181-
extern int _Py_normalize_encoding(const char *, char *, size_t);
181+
extern int _Py_normalize_encoding(const char *, char *, size_t, int);
182182

183183
/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale
184184
and POSIX locale. nl_langinfo(CODESET) announces an alias of the
@@ -229,7 +229,7 @@ check_force_ascii(void)
229229
}
230230

231231
char encoding[20]; /* longest name: "iso_646.irv_1991\0" */
232-
if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) {
232+
if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding), 1)) {
233233
goto error;
234234
}
235235

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /