You've already got an answer, but here's a way to unscramble UTF-8-like Unicode sequences that is less likely to decode latin-1 Unicode sequences in error. The re.sub function:
- Matches Unicode characters < U+0100 that resemble valid UTF-8 sequences (ref: RFC 3629 RFC 3629).
- Encodes the Unicode sequence into its equivalent latin-1 byte sequence.
- Decodes the sequence using UTF-8 back into Unicode.
- Replaces the original UTF-8-like sequence with the matching Unicode character.
Note this could still match a Unicode sequence if just the right characters appear next to each other, but it is much less likely.
import re
# your example
a = u'\u0420\u0443\u0441\u0441\u043a\u0438\u0439 \xd0\xb5\xd0\xba'
# printable Unicode characters < 256.
a += ''.join(chr(n) for n in range(32,256)).decode('latin1')
# a few UTF-8 characters decoded as latin1.
a += ''.join(unichr(n) for n in [2**7-1,2**7,2**11-1,2**11]).encode('utf8').decode('latin1')
# Some non-BMP characters
a += u'\U00010000\U0010FFFF'.encode('utf8').decode('latin1')
print repr(a)
# Unicode codepoint sequences that resemble UTF-8 sequences.
p = re.compile(ur'''(?x)
\xF0[\x90-\xBF][\x80-\xBF]{2} | # Valid 4-byte sequences
[\xF1-\xF3][\x80-\xBF]{3} |
\xF4[\x80-\x8F][\x80-\xBF]{2} |
\xE0[\xA0-\xBF][\x80-\xBF] | # Valid 3-byte sequences
[\xE1-\xEC][\x80-\xBF]{2} |
\xED[\x80-\x9F][\x80-\xBF] |
[\xEE-\xEF][\x80-\xBF]{2} |
[\xC2-\xDF][\x80-\xBF] # Valid 2-byte sequences
''')
def replace(m):
return m.group(0).encode('latin1').decode('utf8')
print
print repr(p.sub(replace,a))
###Output
u'\u0420\u0443\u0441\u0441\u043a\u0438\u0439 \xd0\xb5\xd0\xba !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\x7f\xc2\x80\xdf\xbf\xe0\xa0\x80\xf0\x90\x80\x80\xf4\x8f\xbf\xbf'
u'\u0420\u0443\u0441\u0441\u043a\u0438\u0439 \u0435\u043a !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\x7f\x80\u07ff\u0800\U00010000\U0010ffff'
You've already got an answer, but here's a way to unscramble UTF-8-like Unicode sequences that is less likely to decode latin-1 Unicode sequences in error. The re.sub function:
- Matches Unicode characters < U+0100 that resemble valid UTF-8 sequences (ref: RFC 3629).
- Encodes the Unicode sequence into its equivalent latin-1 byte sequence.
- Decodes the sequence using UTF-8 back into Unicode.
- Replaces the original UTF-8-like sequence with the matching Unicode character.
Note this could still match a Unicode sequence if just the right characters appear next to each other, but it is much less likely.
import re
# your example
a = u'\u0420\u0443\u0441\u0441\u043a\u0438\u0439 \xd0\xb5\xd0\xba'
# printable Unicode characters < 256.
a += ''.join(chr(n) for n in range(32,256)).decode('latin1')
# a few UTF-8 characters decoded as latin1.
a += ''.join(unichr(n) for n in [2**7-1,2**7,2**11-1,2**11]).encode('utf8').decode('latin1')
# Some non-BMP characters
a += u'\U00010000\U0010FFFF'.encode('utf8').decode('latin1')
print repr(a)
# Unicode codepoint sequences that resemble UTF-8 sequences.
p = re.compile(ur'''(?x)
\xF0[\x90-\xBF][\x80-\xBF]{2} | # Valid 4-byte sequences
[\xF1-\xF3][\x80-\xBF]{3} |
\xF4[\x80-\x8F][\x80-\xBF]{2} |
\xE0[\xA0-\xBF][\x80-\xBF] | # Valid 3-byte sequences
[\xE1-\xEC][\x80-\xBF]{2} |
\xED[\x80-\x9F][\x80-\xBF] |
[\xEE-\xEF][\x80-\xBF]{2} |
[\xC2-\xDF][\x80-\xBF] # Valid 2-byte sequences
''')
def replace(m):
return m.group(0).encode('latin1').decode('utf8')
print
print repr(p.sub(replace,a))
###Output
u'\u0420\u0443\u0441\u0441\u043a\u0438\u0439 \xd0\xb5\xd0\xba !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\x7f\xc2\x80\xdf\xbf\xe0\xa0\x80\xf0\x90\x80\x80\xf4\x8f\xbf\xbf'
u'\u0420\u0443\u0441\u0441\u043a\u0438\u0439 \u0435\u043a !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\x7f\x80\u07ff\u0800\U00010000\U0010ffff'
You've already got an answer, but here's a way to unscramble UTF-8-like Unicode sequences that is less likely to decode latin-1 Unicode sequences in error. The re.sub function:
- Matches Unicode characters < U+0100 that resemble valid UTF-8 sequences (ref: RFC 3629).
- Encodes the Unicode sequence into its equivalent latin-1 byte sequence.
- Decodes the sequence using UTF-8 back into Unicode.
- Replaces the original UTF-8-like sequence with the matching Unicode character.
Note this could still match a Unicode sequence if just the right characters appear next to each other, but it is much less likely.
import re
# your example
a = u'\u0420\u0443\u0441\u0441\u043a\u0438\u0439 \xd0\xb5\xd0\xba'
# printable Unicode characters < 256.
a += ''.join(chr(n) for n in range(32,256)).decode('latin1')
# a few UTF-8 characters decoded as latin1.
a += ''.join(unichr(n) for n in [2**7-1,2**7,2**11-1,2**11]).encode('utf8').decode('latin1')
# Some non-BMP characters
a += u'\U00010000\U0010FFFF'.encode('utf8').decode('latin1')
print repr(a)
# Unicode codepoint sequences that resemble UTF-8 sequences.
p = re.compile(ur'''(?x)
\xF0[\x90-\xBF][\x80-\xBF]{2} | # Valid 4-byte sequences
[\xF1-\xF3][\x80-\xBF]{3} |
\xF4[\x80-\x8F][\x80-\xBF]{2} |
\xE0[\xA0-\xBF][\x80-\xBF] | # Valid 3-byte sequences
[\xE1-\xEC][\x80-\xBF]{2} |
\xED[\x80-\x9F][\x80-\xBF] |
[\xEE-\xEF][\x80-\xBF]{2} |
[\xC2-\xDF][\x80-\xBF] # Valid 2-byte sequences
''')
def replace(m):
return m.group(0).encode('latin1').decode('utf8')
print
print repr(p.sub(replace,a))
###Output
u'\u0420\u0443\u0441\u0441\u043a\u0438\u0439 \xd0\xb5\xd0\xba !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\x7f\xc2\x80\xdf\xbf\xe0\xa0\x80\xf0\x90\x80\x80\xf4\x8f\xbf\xbf'
u'\u0420\u0443\u0441\u0441\u043a\u0438\u0439 \u0435\u043a !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\x7f\x80\u07ff\u0800\U00010000\U0010ffff'
You've already got an answer, but here's a way to unscramble UTF-8-like Unicode sequences that is less likely to decode latin-1 Unicode sequences in error. The re.sub function:
- Matches Unicode characters < U+0100 that resemble valid UTF-8 sequences (ref: RFC 3629).
- Encodes the Unicode sequence into its equivalent latin-1 byte sequence.
- Decodes the sequence using UTF-8 back into Unicode.
- Replaces the original UTF-8-like sequence with the matching Unicode character.
Note this could still match a Unicode sequence if just the right characters appear next to each other, but it is much less likely.
import re
# your example
a = u'\u0420\u0443\u0441\u0441\u043a\u0438\u0439 \xd0\xb5\xd0\xba'
# printable Unicode characters < 256.
a += ''.join(chr(n) for n in range(32,256)).decode('latin1')
# a few UTF-8 characters decoded as latin1.
a += ''.join(unichr(n) for n in [2**7-1,2**7,2**11-1,2**11]).encode('utf8').decode('latin1')
# Some non-BMP characters
a += u'\U00010000\U0010FFFF'.encode('utf8').decode('latin1')
print repr(a)
# Unicode codepoint sequences that resemble UTF-8 sequences.
p = re.compile(ur'''(?x)
\xF0[\x90-\xBF][\x80-\xBF]{2} | # Valid 4-byte sequences
[\xF1-\xF3][\x80-\xBF]{3} |
\xF4[\x80-\x8F][\x80-\xBF]{2} |
\xE0[\xA0-\xBF][\x80-\xBF] | # Valid 3-byte sequences
[\xE1-\xEC][\x80-\xBF]{2} |
\xED[\x80-\x9F][\x80-\xBF] |
[\xEE-\xEF][\x80-\xBF]{2} |
[\xC2-\xDF][\x80-\xBF] # Valid 2-byte sequences
''')
def replace(m):
return m.group(0).encode('latin1').decode('utf8')
print
print repr(p.sub(replace,a))
###Output
u'\u0420\u0443\u0441\u0441\u043a\u0438\u0439 \xd0\xb5\xd0\xba !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\x7f\xc2\x80\xdf\xbf\xe0\xa0\x80\xf0\x90\x80\x80\xf4\x8f\xbf\xbf'
u'\u0420\u0443\u0441\u0441\u043a\u0438\u0439 \u0435\u043a !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\x7f\x80\u07ff\u0800\U00010000\U0010ffff'