diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -1144,6 +1144,13 @@ particular, the following variants typic | | | see also | | | | :mod:`encodings.idna` | +--------------------+---------+---------------------------+ +| locale | | Current locale encoding. | +| | | Only support ``'strict'`` | +| | | and ``'surrogateescape'`` | +| | | error handlers. | +| | | | +| | | .. versionadded:: 3.3 | ++--------------------+---------+---------------------------+ | mbcs | dbcs | Windows only: Encode | | | | operand according to the | | | | ANSI codepage (CP_ACP) | diff --git a/Doc/whatsnew/3.3.rst b/Doc/whatsnew/3.3.rst --- a/Doc/whatsnew/3.3.rst +++ b/Doc/whatsnew/3.3.rst @@ -301,6 +301,12 @@ The :mod:`~encodings.mbcs` codec has be :mod:`~encodings.mbcs` codec is now supporting all error handlers, instead of only ``replace`` to encode and ``ignore`` to decode. +A new codec has been added: ``locale`` (:issue:`13619`), the current locale +encoding. In most cases, it should be the same encoding as +:func:`sys.getfilesystemencoding`. Both encodings can be different if the +:data:`~locale.LC_ALL` or :data:`~locale.LC_CTYPE` locale is changed with +:func:`locale.setlocale` to use another locale encoding. + A new Windows-only codec has been added: ``cp65001`` (:issue:`13216`). It is the Windows code page 65001 (Windows UTF-8, ``CP_UTF8``). For example, it is used by ``sys.stdout`` if the console output code page is set to cp65001 (e.g. diff --git a/Lib/encodings/locale.py b/Lib/encodings/locale.py new file mode 100644 --- /dev/null +++ b/Lib/encodings/locale.py @@ -0,0 +1,32 @@ +""" +Codec of the current locale encoding. 
+""" + +import codecs + +class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + return codecs.locale_encode(input, self.errors)[0] + +class IncrementalDecoder(codecs.IncrementalDecoder): + def decode(self, input, final=False): + return codecs.locale_decode(input, self.errors)[0] + +class StreamWriter(codecs.StreamWriter): + encode = codecs.locale_encode + +class StreamReader(codecs.StreamReader): + decode = codecs.locale_decode + +### encodings module API + +def getregentry(): + return codecs.CodecInfo( + name='locale', + encode=codecs.locale_encode, + decode=codecs.locale_decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamreader=StreamReader, + streamwriter=StreamWriter, + ) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -2,6 +2,7 @@ import _testcapi import codecs import io import locale +import re import sys import unittest import warnings @@ -2033,6 +2034,51 @@ class CodePageTest(unittest.TestCase): self.assertEqual(decoded, ('abc', 3)) +class LocaleEncodingTest(unittest.TestCase): + def test_ascii(self): + self.assertEqual('abc'.encode('locale'), b'abc') + self.assertEqual(b'abc'.decode('locale'), 'abc') + + def test_errors(self): + old_locale = locale.getlocale(locale.LC_CTYPE) + try: + locale.setlocale(locale.LC_CTYPE, 'C') + + for ch in '\u20AC\uD800\U0010FFFF': + text = '[%s]' % ch + errmsg = ("'locale' codec can't encode character %a " + "in position 1: " % ch) + errmsg = '^' + re.escape(errmsg) + self.assertRaisesRegex( + UnicodeEncodeError, + errmsg, + text.encode, 'locale') + + self.assertEqual( + '\udc80\udce9\udcff'.encode('locale', 'surrogateescape'), + b'\x80\xe9\xff') + + try: + b'\xff'.decode('locale') + except UnicodeDecodeError: + data = b'[\x80\xe9\xff]' + self.assertRaisesRegex( + UnicodeDecodeError, + "'locale' codec can't decode byte 0x80 in position 2:", + data.decode, 
'locale') + + self.assertEqual( + b'\x80\xe9\xff'.decode('locale', 'surrogateescape'), + '\udc80\udce9\udcff') + else: + # On FreeBSD, Solaris and Mac OS X, b'\xff' can be decoded in + # the C locale. The C locale is something like ISO-8859-1, not + # 7-bit ASCII. + pass + finally: + locale.setlocale(locale.LC_CTYPE, old_locale) + + def test_main(): support.run_unittest( UTF32Test, @@ -2063,6 +2109,7 @@ def test_main(): BomTest, TransformCodecTest, CodePageTest, + LocaleEncodingTest, ) diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -645,6 +645,28 @@ code_page_decode(PyObject *self, #endif /* HAVE_MBCS */ +static PyObject * +locale_decode(PyObject *self, PyObject *args) +{ + Py_buffer pbuf; + const char *errors = NULL; + Py_ssize_t consumed; + PyObject *decoded = NULL; + + if (!PyArg_ParseTuple(args, "y*|z:locale_decode", + &pbuf, &errors)) + return NULL; + consumed = pbuf.len; + + decoded = PyUnicode_DecodeLocaleAndSize(pbuf.buf, pbuf.len, + errors); + PyBuffer_Release(&pbuf); + if (decoded == NULL) + return NULL; + return codec_tuple(decoded, consumed); +} + + /* --- Encoder ------------------------------------------------------------ */ static PyObject * @@ -1073,6 +1095,27 @@ code_page_encode(PyObject *self, #endif /* HAVE_MBCS */ +static PyObject * +locale_encode(PyObject *self, PyObject *args) +{ + PyObject *str, *v; + const char *errors = NULL; + + if (!PyArg_ParseTuple(args, "O|z:locale_encode", + &str, &errors)) + return NULL; + + str = PyUnicode_FromObject(str); + if (str == NULL || PyUnicode_READY(str) < 0) { + Py_XDECREF(str); + return NULL; + } + v = codec_tuple(PyUnicode_EncodeLocale(str, errors), + PyUnicode_GET_LENGTH(str)); + Py_DECREF(str); + return v; +} + /* --- Error handler registry --------------------------------------------- */ PyDoc_STRVAR(register_error__doc__, @@ -1164,6 +1207,8 @@ static PyMethodDef _codecs_functions[] = {"code_page_encode", code_page_encode, 
METH_VARARGS}, {"code_page_decode", code_page_decode, METH_VARARGS}, #endif + {"locale_encode", locale_encode, METH_VARARGS}, + {"locale_decode", locale_decode, METH_VARARGS}, {"register_error", register_error, METH_VARARGS, register_error__doc__}, {"lookup_error", lookup_error, METH_VARARGS,

AltStyle によって変換されたページ (->オリジナル) /