Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 67c1a0e

Browse files
committed
text byte
1 parent 403fc7d commit 67c1a0e

File tree

7 files changed

+183
-0
lines changed

7 files changed

+183
-0
lines changed

‎04-text-byte/default_encodings.py‎

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# -*- coding: utf-8 -*-
"""Print the default encodings in effect for this Python process.

Evaluates each expression listed in ``expressions`` and prints it next
to its value, showing how the locale's preferred encoding, the encoding
of an opened text file, and the standard-stream encodings relate.
"""

import locale
import sys

# One expression per line; the loop below splits on whitespace, so the
# layout of this string is purely cosmetic.
expressions = """
locale.getpreferredencoding()
type(my_file)
my_file.encoding
sys.stdout.isatty()
sys.stdout.encoding
sys.stdin.isatty()
sys.stdin.encoding
sys.stderr.isatty()
sys.stderr.encoding
sys.getdefaultencoding()
sys.getfilesystemencoding()
"""

# Opened without an explicit encoding on purpose: the demo inspects the
# default `my_file.encoding` chosen by the platform.
my_file = open('dummy', 'w')

for expression in expressions.split():
    # eval() is safe here: the expressions are the fixed literals above,
    # never user input.
    value = eval(expression)
    print(expression.rjust(30), '->', repr(value))

my_file.close()  # fix: release the file handle opened for the demo

‎04-text-byte/dummy‎

Whitespace-only changes.

‎04-text-byte/normeq.py‎

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Utility functions for normalized Unicode string comparison.
4+
Using Normal Form C, case sensitive:
5+
>>> s1 = 'café'
6+
>>> s2 = 'cafe\u0301'
7+
>>> s1 == s2
8+
False
9+
>>> nfc_equal(s1, s2)
10+
True
11+
>>> nfc_equal('A', 'a')
12+
False
13+
Using Normal Form C with case folding:
14+
>>> s3 = 'Straße'
15+
>>> s4 = 'strasse'
16+
>>> s3 == s4
17+
False
18+
>>> nfc_equal(s3, s4)
19+
False
20+
>>> fold_equal(s3, s4)
21+
True
22+
>>> fold_equal(s1, s2)
23+
True
24+
>>> fold_equal('A', 'a')
25+
True
26+
"""
27+
28+
from unicodedata import normalize
29+
30+
31+
def nfc_equal(str1, str2):
    """Case-sensitive equality of *str1* and *str2* after NFC normalization."""
    left = normalize('NFC', str1)
    right = normalize('NFC', str2)
    return left == right
33+
34+
35+
def fold_equal(str1, str2):
    """Case-insensitive equality after NFC normalization and casefolding."""
    def canonical(s):
        return normalize('NFC', s).casefold()

    return canonical(str1) == canonical(str2)

‎04-text-byte/numerics_demo.py‎

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# -*- coding: utf-8 -*-
"""Show how the regex digit class, str.isdigit() and str.isnumeric()
disagree for a sample of Unicode numeric characters."""

import unicodedata
import re

re_digit = re.compile(r'\d')

# ASCII '1', Latin-1 fractions/superscript, Devanagari, Ethiopic,
# Roman numeral, circled and parenthesized digits, circled ideograph.
sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'

for char in sample:
    columns = [
        f'U+{ord(char):04x}',                        # <1>
        char.center(6),                              # <2>
        're_dig' if re_digit.match(char) else '-',   # <3>
        'isdig' if char.isdigit() else '-',          # <4>
        'isnum' if char.isnumeric() else '-',        # <5>
        f'{unicodedata.numeric(char):5.2f}',         # <6>
        unicodedata.name(char),                      # <7>
    ]
    print(*columns, sep='\t')

‎04-text-byte/ola.py‎

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# -*- coding: utf-8 -*-
# NOTE(review): the book's original file is *saved* in cp1252 and declares
# `# -*- coding: cp1252 -*-` to demonstrate coding declarations. This copy
# is UTF-8 text, so keeping the cp1252 declaration would make the parser
# mis-decode 'Olá' below; the declaration now matches the actual encoding.

print('Olá, Mundo!')

‎04-text-byte/ramanujan.py‎

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# -*- coding: utf-8 -*-
"""Compare how str and bytes regular expressions treat digits and word
characters: str patterns match Unicode digits (e.g. Tamil numerals),
bytes patterns match ASCII only."""

import re

re_numbers_str = re.compile(r'\d+')     # str pattern: Unicode digits
re_words_str = re.compile(r'\w+')       # str pattern: Unicode word chars
re_numbers_bytes = re.compile(rb'\d+')  # bytes pattern: ASCII digits only
re_words_bytes = re.compile(rb'\w+')    # bytes pattern: ASCII word chars only

# Tamil digits for 1729, then the taxicab identity with superscript threes.
# (Restored: the scraped copy had the superscripts flattened to plain
# digits and stray ruby text "たす" inserted by the page converter.)
text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"
            " as 1729 = 1³ + 12³ = 9³ + 10³.")

text_bytes = text_str.encode('utf-8')

print('Text', repr(text_str), sep='\n  ')
print('Numbers')
print('  str  :', re_numbers_str.findall(text_str))
print('  bytes:', re_numbers_bytes.findall(text_bytes))
print('Words')
print('  str  :', re_words_str.findall(text_str))
print('  bytes:', re_words_bytes.findall(text_bytes))

‎04-text-byte/sanitize.py‎

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
"""
2+
3+
Radical folding and text sanitizing.
4+
5+
Handling a string with `cp1252` symbols:
6+
>>> order = '"Herr Voß: • 1⁄2 cup of ŒtkerTM caffè latte • bowl of açaí."'
7+
>>> shave_marks(order)
8+
'"Herr Voß: • 1⁄2 cup of ŒtkerTM caffe latte • bowl of acai."'
9+
>>> shave_marks_latin(order)
10+
'"Herr Voß: • 1⁄2 cup of ŒtkerTM caffe latte • bowl of acai."'
11+
>>> dewinize(order)
12+
'"Herr Voß: - 1⁄2 cup of OEtker(TM) caffè latte - bowl of açaí."'
13+
>>> asciize(order)
14+
'"Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai."'
15+
16+
Handling a string with Greek and Latin accented characters:
17+
>>> greek = 'Ζέφυρος, Zéfiro'
18+
>>> shave_marks(greek)
19+
'Ζεφυρος, Zefiro'
20+
>>> shave_marks_latin(greek)
21+
'Ζέφυρος, Zefiro'
22+
>>> dewinize(greek)
23+
'Ζέφυρος, Zéfiro'
24+
>>> asciize(greek)
25+
'Ζέφυρος, Zefiro'
26+
27+
"""
28+
29+
import unicodedata
30+
import string
31+
32+
33+
def shave_marks(txt):
    """Strip every combining diacritic mark, regardless of base character."""
    decomposed = unicodedata.normalize('NFD', txt)
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return unicodedata.normalize('NFC', ''.join(base_chars))
38+
39+
40+
def shave_marks_latin(txt):
    """Strip diacritic marks, but only those attached to Latin base characters."""
    decomposed = unicodedata.normalize('NFD', txt)
    kept = []
    following_latin = False  # did the most recent base char come from ASCII letters?
    for ch in decomposed:
        is_mark = unicodedata.combining(ch)
        if is_mark and following_latin:
            # Drop a diacritic that modifies a Latin base char.
            continue
        kept.append(ch)
        if not is_mark:
            # A non-combining char starts a new base; remember whether
            # it is Latin so we know what to do with its diacritics.
            following_latin = ch in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(kept))
54+
55+
56+
# Map cp1252 "smart" punctuation to one-character ASCII replacements.
# Restored from the book source: the scraped copy had the curly quotes,
# the small tilde, the horizontal ellipsis and the trademark sign
# mangled, which made both maketrans() calls raise ValueError
# (length mismatch below; multi-character dict keys in multi_map).
single_map = str.maketrans("""‚ƒ„†ˆ‹‘’“”•–—˜›""",
                           """'f"*^<''""---~>""")

# Multi-character replacements for symbols with no one-char equivalent.
multi_map = str.maketrans({  # <2>
    '€': '<euro>',
    '…': '...',
    'Œ': 'OE',
    '™': '(TM)',
    'œ': 'oe',
    '‰': '<per mille>',
    '‡': '**',
})

# Merge: multi_map now handles both the single- and multi-char cases.
multi_map.update(single_map)
70+
71+
72+
def dewinize(txt):
    """Translate cp1252 curly quotes, bullets and symbols in *txt* to
    ASCII characters or short ASCII sequences."""
    cleaned = txt.translate(multi_map)
    return cleaned
75+
76+
77+
def asciize(txt):
    """Return *txt* with cp1252 symbols replaced, diacritics removed from
    Latin characters, 'ß' expanded to 'ss', and the result NFKC-composed
    so compatibility characters become ASCII where possible."""
    no_symbols = dewinize(txt)
    no_marks = shave_marks_latin(no_symbols).replace('ß', 'ss')
    return unicodedata.normalize('NFKC', no_marks)

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /