Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 48a82a8

Browse files
fix(cmd): improve character encoding detection for sub-commands
1 parent 3123fec commit 48a82a8

File tree

4 files changed

+77
-3
lines changed

4 files changed

+77
-3
lines changed

‎commitizen/cmd.py‎

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
from charset_normalizer import from_bytes
55

6+
from commitizen.exceptions import CharacterSetDecodeError
7+
68

79
class Command(NamedTuple):
810
out: str
@@ -12,6 +14,19 @@ class Command(NamedTuple):
1214
return_code: int
1315

1416

17+
def _try_decode(bytes_: bytes) -> str:
    """Decode raw bytes to text, preferring UTF-8.

    UTF-8 is attempted first. If that fails, the encoding is guessed with
    ``charset_normalizer.from_bytes`` and the bytes are decoded with the
    best match.

    Raises:
        CharacterSetDecodeError: If no encoding could be guessed, or if
            decoding with the guessed encoding fails as well.
    """
    try:
        text = bytes_.decode("utf-8")
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to the detector's best guess.
        best_guess = from_bytes(bytes_).best()
        if best_guess is None:
            raise CharacterSetDecodeError()
        try:
            text = bytes_.decode(best_guess.encoding)
        except UnicodeDecodeError as e:
            raise CharacterSetDecodeError() from e
    return text
28+
29+
1530
def run(cmd: str) -> Command:
1631
process = subprocess.Popen(
1732
cmd,
@@ -23,8 +38,8 @@ def run(cmd: str) -> Command:
2338
stdout, stderr = process.communicate()
2439
return_code = process.returncode
2540
return Command(
26-
str(from_bytes(stdout).best()),
27-
str(from_bytes(stderr).best()),
41+
_try_decode(stdout),
42+
_try_decode(stderr),
2843
stdout,
2944
stderr,
3045
return_code,

‎commitizen/exceptions.py‎

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ class ExitCode(enum.IntEnum):
2626
INVALID_CONFIGURATION = 19
2727
NOT_ALLOWED = 20
2828
NO_INCREMENT = 21
29+
UNRECOGNIZED_CHARACTERSET_ENCODING = 22
2930

3031

3132
class CommitizenException(Exception):
@@ -148,3 +149,7 @@ class InvalidConfigurationError(CommitizenException):
148149

149150
class NotAllowed(CommitizenException):
    """Error reported with the ``NOT_ALLOWED`` exit code."""

    exit_code = ExitCode.NOT_ALLOWED
152+
153+
154+
class CharacterSetDecodeError(CommitizenException):
    """Raised when command output cannot be decoded with any detected encoding."""

    exit_code = ExitCode.UNRECOGNIZED_CHARACTERSET_ENCODING

‎docs/exit_codes.md‎

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,5 @@ These exit codes can be found in `commitizen/exceptions.py::ExitCode`.
2828
| InvalidCommandArgumentError | 18 | The argument provided to the command is invalid (e.g. `cz check -commit-msg-file filename --rev-range master..`) |
2929
| InvalidConfigurationError | 19 | An error was found in the Commitizen Configuration, such as duplicates in `change_type_order` |
3030
| NotAllowed | 20 | `--incremental` cannot be combined with a `rev_range` |
31-
| NoneIncrementExit | 21 | The commits found are not elegible to be bumped |
31+
| NoneIncrementExit | 21 | The commits found are not eligible to be bumped |
32+
| CharacterSetDecodeError | 22 | The character encoding of the command output could not be determined |

‎tests/test_cmd.py‎

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import pytest
2+
3+
from commitizen import cmd
4+
from commitizen.exceptions import CharacterSetDecodeError
5+
6+
7+
# https://docs.python.org/3/howto/unicode.html
8+
def test_valid_utf8_encoded_strings():
    """UTF-8 encoded text must come back from ``_try_decode`` unchanged."""
    samples = (
        "",
        "ascii",
        "🤦🏻‍♂️",
        "﷽",
        "\u0000",
    )
    for expected in samples:
        encoded = expected.encode("utf-8")
        assert expected == cmd._try_decode(encoded)
17+
18+
19+
# A word of caution: just because an encoding can be guessed for a given
20+
# sequence of bytes and because that guessed encoding may yield a decoded
21+
# string, does not mean that that string was the original! For more, see:
22+
# https://docs.python.org/3/library/codecs.html#standard-encodings
23+
24+
25+
# Pick a random, non-utf8 encoding to test.
26+
def test_valid_cp1250_encoded_strings():
    """cp1250-encoded text must decode to *some* string without raising.

    The guessed encoding is not guaranteed to be the one the bytes were
    written in, so the decoded text is deliberately not compared with the
    original; only successful decoding to ``str`` is checked.
    """
    valid_strings = (
        "",
        "ascii",
        "äöüß",
        "ça va",
        "jak se máte",
    )
    for s in valid_strings:
        # An ``isinstance`` check (rather than truthiness) also accepts the
        # empty-string result while still being able to fail.
        assert isinstance(cmd._try_decode(s.encode("cp1250")), str)
36+
37+
38+
def test_invalid_bytes():
    """Byte sequences expected to be undecodable must raise the decode error."""
    undecodable_samples = (b"\x73\xe2\x9d\xff\x00",)
    for raw in undecodable_samples:
        with pytest.raises(CharacterSetDecodeError):
            cmd._try_decode(raw)
43+
44+
45+
def test_always_fail_decode():
    """A failure in every decode attempt surfaces as ``CharacterSetDecodeError``."""

    class _AlwaysFailingBytes(bytes):
        # Every decode attempt fails, whatever encoding is requested.
        def decode(self, encoding="utf-8", errors="strict"):
            reason = "Failing intentionally for testing"
            raise UnicodeDecodeError(encoding, self, 0, 0, reason)

    payload = _AlwaysFailingBytes()
    with pytest.raises(CharacterSetDecodeError):
        cmd._try_decode(payload)

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /