Commit 67c1a0e

committed

text byte

1 parent 403fc7d commit 67c1a0eCopy full SHA for 67c1a0e

File tree

7 files changed

+183

-0

lines changed

04-text-byte

7 files changed

+183

-0

lines changed

`‎04-text-byte/default_encodings.py‎`

Lines changed: 25 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,25 @@`
	`1`	`+# -- coding: utf-8 --`
	`2`	`+`
	`3`	`+import locale`
	`4`	`+import sys`
	`5`	`+`
	`6`	`+`
	`7`	`+expressions = """`
	`8`	`+ locale.getpreferredencoding()`
	`9`	`+ type(my_file)`
	`10`	`+ my_file.encoding`
	`11`	`+ sys.stdout.isatty()`
	`12`	`+ sys.stdout.encoding`
	`13`	`+ sys.stdin.isatty()`
	`14`	`+ sys.stdin.encoding`
	`15`	`+ sys.stderr.isatty()`
	`16`	`+ sys.stderr.encoding`
	`17`	`+ sys.getdefaultencoding()`
	`18`	`+ sys.getfilesystemencoding()`
	`19`	`+"""`
	`20`	`+`
	`21`	`+my_file = open('dummy', 'w')`
	`22`	`+`
	`23`	`+for expression in expressions.split():`
	`24`	`+ value = eval(expression)`
	`25`	`+ print(expression.rjust(30), '->', repr(value))`

`‎04-text-byte/dummy‎`

Whitespace-only changes.

`‎04-text-byte/normeq.py‎`

Lines changed: 37 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,37 @@`
	`1`	`+# -- coding: utf-8 --`
	`2`	`+"""`
	`3`	`+Utility functions for normalized Unicode string comparison.`
	`4`	`+Using Normal Form C, case sensitive:`
	`5`	`+ >>> s1 = 'café'`
	`6`	`+ >>> s2 = 'cafe\u0301'`
	`7`	`+ >>> s1 == s2`
	`8`	`+ False`
	`9`	`+ >>> nfc_equal(s1, s2)`
	`10`	`+ True`
	`11`	`+ >>> nfc_equal('A', 'a')`
	`12`	`+ False`
	`13`	`+Using Normal Form C with case folding:`
	`14`	`+ >>> s3 = 'Straße'`
	`15`	`+ >>> s4 = 'strasse'`
	`16`	`+ >>> s3 == s4`
	`17`	`+ False`
	`18`	`+ >>> nfc_equal(s3, s4)`
	`19`	`+ False`
	`20`	`+ >>> fold_equal(s3, s4)`
	`21`	`+ True`
	`22`	`+ >>> fold_equal(s1, s2)`
	`23`	`+ True`
	`24`	`+ >>> fold_equal('A', 'a')`
	`25`	`+ True`
	`26`	`+"""`
	`27`	`+`
	`28`	`+from unicodedata import normalize`
	`29`	`+`
	`30`	`+`
	`31`	`+def nfc_equal(str1, str2):`
	`32`	`+ return normalize('NFC', str1) == normalize('NFC', str2)`
	`33`	`+`
	`34`	`+`
	`35`	`+def fold_equal(str1, str2):`
	`36`	`+ return (normalize('NFC', str1).casefold() ==`
	`37`	`+ normalize('NFC', str2).casefold())`

`‎04-text-byte/numerics_demo.py‎`

Lines changed: 18 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,18 @@`
	`1`	`+# -- coding: utf-8 --`
	`2`	`+`
	`3`	`+import unicodedata`
	`4`	`+import re`
	`5`	`+`
	`6`	`+re_digit = re.compile(r'\d')`
	`7`	`+`
	`8`	`+sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'`
	`9`	`+`
	`10`	`+for char in sample:`
	`11`	`+ print('U+%04x' % ord(char), # <1>`
	`12`	`+ char.center(6), # <2>`
	`13`	`+ 're_dig' if re_digit.match(char) else '-', # <3>`
	`14`	`+ 'isdig' if char.isdigit() else '-', # <4>`
	`15`	`+ 'isnum' if char.isnumeric() else '-', # <5>`
	`16`	`+ format(unicodedata.numeric(char), '5.2f'), # <6>`
	`17`	`+ unicodedata.name(char), # <7>`
	`18`	`+ sep='\t')`

`‎04-text-byte/ola.py‎`

Lines changed: 3 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# -- coding: cp1252 --`
	`2`	`+`
# Print a greeting that contains a non-ASCII character; this file declares
# its source encoding as cp1252 on its first line.
print('Olá, Mundo!')

`‎04-text-byte/ramanujan.py‎`

Lines changed: 20 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,20 @@`
	`1`	`+# -- coding: utf-8 --`
	`2`	`+`
	`3`	`+import re`
	`4`	`+`
	`5`	`+re_numbers_str = re.compile(r'\d+')`
	`6`	`+re_words_str = re.compile(r'\w+')`
	`7`	`+re_numbers_bytes = re.compile(rb'\d+')`
	`8`	`+re_words_bytes = re.compile(rb'\w+')`
	`9`	`+`
	`10`	`+text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"`
	`11`	`+ " as 1729 =わ 13 +たす 123 =わ 93 +たす 103.")`
	`12`	`+text_bytes = text_str.encode('utf-8')`
	`13`	`+`
	`14`	`+print('Text', repr(text_str), sep='\n ')`
	`15`	`+print('Numbers')`
	`16`	`+print(' str :', re_numbers_str.findall(text_str))`
	`17`	`+print(' bytes:', re_numbers_bytes.findall(text_bytes))`
	`18`	`+print('Words')`
	`19`	`+print(' str :', re_words_str.findall(text_str))`
	`20`	`+print(' bytes:', re_words_bytes.findall(text_bytes))`

`‎04-text-byte/sanitize.py‎`

Lines changed: 80 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,80 @@`
	`1`	`+"""`
	`2`	`+`
	`3`	`+Radical folding and text sanitizing.`
	`4`	`+`
	`5`	+Handling a string with `cp1252` symbols:
	`6`	`+ >>> order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'`
	`7`	`+ >>> shave_marks(order)`
	`8`	`+ '“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'`
	`9`	`+ >>> shave_marks_latin(order)`
	`10`	`+ '“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'`
	`11`	`+ >>> dewinize(order)`
	`12`	`+ '"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."'`
	`13`	`+ >>> asciize(order)`
	`14`	`+ '"Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai."'`
	`15`	`+`
	`16`	`+Handling a string with Greek and Latin accented characters:`
	`17`	`+ >>> greek = 'Ζέφυρος, Zéfiro'`
	`18`	`+ >>> shave_marks(greek)`
	`19`	`+ 'Ζεφυρος, Zefiro'`
	`20`	`+ >>> shave_marks_latin(greek)`
	`21`	`+ 'Ζέφυρος, Zefiro'`
	`22`	`+ >>> dewinize(greek)`
	`23`	`+ 'Ζέφυρος, Zéfiro'`
	`24`	`+ >>> asciize(greek)`
	`25`	`+ 'Ζέφυρος, Zefiro'`
	`26`	`+`
	`27`	`+"""`
	`28`	`+`
	`29`	`+import unicodedata`
	`30`	`+import string`
	`31`	`+`
	`32`	`+`
	`33`	`+def shave_marks(txt):`
	`34`	`+ """Remove all diacritic marks"""`
	`35`	`+ norm_txt = unicodedata.normalize('NFD', txt)`
	`36`	`+ shaved = ''.join(c for c in norm_txt if not unicodedata.combining(c))`
	`37`	`+ return unicodedata.normalize('NFC', shaved)`
	`38`	`+`
	`39`	`+`
	`40`	`+def shave_marks_latin(txt):`
	`41`	`+ """Remove all diacritic marks from Latin base characters"""`
	`42`	`+ norm_txt = unicodedata.normalize('NFD', txt)`
	`43`	`+ latin_base = False`
	`44`	`+ keepers = []`
	`45`	`+ for c in norm_txt:`
	`46`	`+ if unicodedata.combining(c) and latin_base:`
	`47`	`+ continue # ignore diacritic on Latin base char`
	`48`	`+ keepers.append(c)`
	`49`	`+ # if it isn't combining char, it's a new base char`
	`50`	`+ if not unicodedata.combining(c):`
	`51`	`+ latin_base = c in string.ascii_letters`
	`52`	`+ shaved = ''.join(keepers)`
	`53`	`+ return unicodedata.normalize('NFC', shaved)`
	`54`	`+`
	`55`	`+`
	`56`	`+single_map = str.maketrans("""‚ƒ„†ˆ‹‘’""•–— ̃›""",`
	`57`	`+ """'f"*^<''""---~>""")`
	`58`	`+`
	`59`	`+multi_map = str.maketrans({ # <2>`
	`60`	`+ '€': '<euro>',`
	`61`	`+ '...': '...',`
	`62`	`+ 'Œ': 'OE',`
	`63`	`+ 'TM': '(TM)',`
	`64`	`+ 'œ': 'oe',`
	`65`	`+ '‰': '<per mille>',`
	`66`	`+ '‡': '**',`
	`67`	`+})`
	`68`	`+`
	`69`	`+multi_map.update(single_map)`
	`70`	`+`
	`71`	`+`
	`72`	`+def dewinize(txt):`
	`73`	`+ """Replace Win1252 symbols with ASCII chars or sequences"""`
	`74`	`+ return txt.translate(multi_map)`
	`75`	`+`
	`76`	`+`
	`77`	`+def asciize(txt):`
	`78`	`+ no_marks = shave_marks_latin(dewinize(txt))`
	`79`	`+ no_marks = no_marks.replace('ß', 'ss')`
	`80`	`+ return unicodedata.normalize('NFKC', no_marks)`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 67c1a0e

File tree

7 files changed

7 files changed

`‎04-text-byte/default_encodings.py‎`

`‎04-text-byte/dummy‎`

`‎04-text-byte/normeq.py‎`

`‎04-text-byte/numerics_demo.py‎`

`‎04-text-byte/ola.py‎`

`‎04-text-byte/ramanujan.py‎`

`‎04-text-byte/sanitize.py‎`

0 commit comments