Try this:
import re
try:
    _unichr = unichr  # Python 2: chr() there only covers 0-255
except NameError:
    _unichr = chr  # Python 3: unichr() was folded into chr()


def _callback(matches):
    """Return the character for one matched numeric character reference.

    ``matches`` is a regex match whose group 1 holds the decimal code
    point.  If that number cannot be turned into a character, the matched
    text is returned unchanged instead of being reduced to bare digits.
    """
    code_point = matches.group(1)
    try:
        return _unichr(int(code_point))
    except (ValueError, OverflowError):
        # Out-of-range code point: leave the reference as it was written.
        return matches.group(0)


def html_unidefdecode_unicode_references(data):
    """Replace decimal ``&#NNN;`` references in ``data`` with characters.

    A reference counts when it ends in ``;`` or is directly followed by
    whitespace; in the latter case the whitespace itself is kept.
    """
    return re.sub(r"&#(\d+)(;|(?=\s))", _callback, data)
# Sample headline; the curly quotes are written as numeric character
# references so the call below actually has something to decode.
data = "U.S. Adviser&#8217;s Blunt Memo on Iraq: Time &#8216;to Go Home&#8217;"
# Parenthesized form prints the same under Python 2 and Python 3.
print(html_unidefdecode_unicode_references(data))
Try this:
import re
try:
    _unichr = unichr  # Python 2: chr() there only covers 0-255
except NameError:
    _unichr = chr  # Python 3: unichr() was folded into chr()


def _callback(matches):
    """Return the character for one matched numeric character reference.

    ``matches`` is a regex match whose group 1 holds the decimal code
    point.  If that number cannot be turned into a character, the matched
    text is returned unchanged instead of being reduced to bare digits.
    """
    code_point = matches.group(1)
    try:
        return _unichr(int(code_point))
    except (ValueError, OverflowError):
        # Out-of-range code point: leave the reference as it was written.
        return matches.group(0)


def html_unidef(data):
    """Replace decimal ``&#NNN;`` references in ``data`` with characters.

    A reference counts when it ends in ``;`` or is directly followed by
    whitespace; in the latter case the whitespace itself is kept.
    """
    return re.sub(r"&#(\d+)(;|(?=\s))", _callback, data)
# Sample headline; the curly quotes are written as numeric character
# references so the call below actually has something to decode.
data = "U.S. Adviser&#8217;s Blunt Memo on Iraq: Time &#8216;to Go Home&#8217;"
# Parenthesized form prints the same under Python 2 and Python 3.
print(html_unidef(data))
Try this:
import re
try:
    _unichr = unichr  # Python 2: chr() there only covers 0-255
except NameError:
    _unichr = chr  # Python 3: unichr() was folded into chr()


def _callback(matches):
    """Return the character for one matched numeric character reference.

    ``matches`` is a regex match whose group 1 holds the decimal code
    point.  If that number cannot be turned into a character, the matched
    text is returned unchanged instead of being reduced to bare digits.
    """
    code_point = matches.group(1)
    try:
        return _unichr(int(code_point))
    except (ValueError, OverflowError):
        # Out-of-range code point: leave the reference as it was written.
        return matches.group(0)


def decode_unicode_references(data):
    """Replace decimal ``&#NNN;`` references in ``data`` with characters.

    A reference counts when it ends in ``;`` or is directly followed by
    whitespace; in the latter case the whitespace itself is kept.
    """
    return re.sub(r"&#(\d+)(;|(?=\s))", _callback, data)
# Sample headline; the curly quotes are written as numeric character
# references so the call below actually has something to decode.
data = "U.S. Adviser&#8217;s Blunt Memo on Iraq: Time &#8216;to Go Home&#8217;"
# Parenthesized form prints the same under Python 2 and Python 3.
print(decode_unicode_references(data))
Try this:
import re
try:
    _unichr = unichr  # Python 2: chr() there only covers 0-255
except NameError:
    _unichr = chr  # Python 3: unichr() was folded into chr()


def _callback(matches):
    """Return the character for one matched numeric character reference.

    ``matches`` is a regex match whose group 1 holds the decimal code
    point.  If that number cannot be turned into a character, the matched
    text is returned unchanged.
    """
    code_point = matches.group(1)
    try:
        return _unichr(int(code_point))
    except (ValueError, OverflowError):
        # Out-of-range code point: leave the reference as it was written.
        return matches.group(0)


def html_unidef(data):
    """Replace decimal ``&#NNN;`` references in ``data`` with characters.

    A reference counts when it ends in ``;`` or is directly followed by
    whitespace; in the latter case the whitespace itself is kept.
    """
    return re.sub(r"&#(\d+)(;|(?=\s))", _callback, data)


# As per your example:
data = "U.S. Adviser&#8217;s Blunt Memo on Iraq: Time &#8216;to Go Home&#8217;"
# Parenthesized form prints the same under Python 2 and Python 3.
print(html_unidef(data))
# U.S. Adviser’s Blunt Memo on Iraq: Time ‘to Go Home’
Try this:
import re
try:
    _unichr = unichr  # Python 2: chr() there only covers 0-255
except NameError:
    _unichr = chr  # Python 3: unichr() was folded into chr()


def html_unicode_decode(data):
    """Replace numeric character references in ``data`` with characters.

    Decimal (``&#8217;``) and hexadecimal (``&#x2019;``) references are
    decoded.  A reference counts when it ends in ``;`` or is directly
    followed by whitespace; that whitespace is kept in the result.
    Anything that does not name a valid code point is left untouched
    rather than raising.
    """

    def _decode(match):
        ref = match.group(1)
        try:
            if ref[:1] in ("x", "X"):
                return _unichr(int(ref[1:], 16))
            return _unichr(int(ref))
        except (ValueError, OverflowError):
            # Not a number, or outside the valid code point range.
            return match.group(0)

    # Lookahead instead of a consuming ``\s`` so the space after an
    # unterminated reference is not swallowed by the substitution.
    return re.sub(r"&#(\w+)(?:;|(?=\s))", _decode, data)
As per your example:
# Sample headline; the curly quotes are written as numeric character
# references so the call below actually has something to decode.
data = "U.S. Adviser&#8217;s Blunt Memo on Iraq: Time &#8216;to Go Home&#8217;"
# Parenthesized form prints the same under Python 2 and Python 3.
print(html_unicode_decode(data))
# U.S. Adviser’s Blunt Memo on Iraq: Time ‘to Go Home’
Try this:
import re
try:
    _unichr = unichr  # Python 2: chr() there only covers 0-255
except NameError:
    _unichr = chr  # Python 3: unichr() was folded into chr()


def _callback(matches):
    """Return the character for one matched numeric character reference.

    ``matches`` is a regex match whose group 1 holds the decimal code
    point.  If that number cannot be turned into a character, the matched
    text is returned unchanged instead of being reduced to bare digits.
    """
    code_point = matches.group(1)
    try:
        return _unichr(int(code_point))
    except (ValueError, OverflowError):
        # Out-of-range code point: leave the reference as it was written.
        return matches.group(0)


def html_unidef(data):
    """Replace decimal ``&#NNN;`` references in ``data`` with characters.

    A reference counts when it ends in ``;`` or is directly followed by
    whitespace; in the latter case the whitespace itself is kept.
    """
    return re.sub(r"&#(\d+)(;|(?=\s))", _callback, data)
# Sample headline; the curly quotes are written as numeric character
# references so the call below actually has something to decode.
data = "U.S. Adviser&#8217;s Blunt Memo on Iraq: Time &#8216;to Go Home&#8217;"
# Parenthesized form prints the same under Python 2 and Python 3.
print(html_unidef(data))
added 91 characters in body; added 148 characters in body; added 60 characters in body
Evan Fosmark
- 102.5k
- 36
- 110
- 118
Have you looked at htmlentitydefs? Try this:
import re
try:
    _unichr = unichr  # Python 2: chr() there only covers 0-255
except NameError:
    _unichr = chr  # Python 3: unichr() was folded into chr()


def html_unicode_decode(data):
    """Replace numeric character references in ``data`` with characters.

    Decimal (``&#8217;``) and hexadecimal (``&#x2019;``) references are
    decoded.  A reference counts when it ends in ``;`` or is directly
    followed by whitespace; that whitespace is kept in the result.
    Anything that does not name a valid code point is left untouched
    rather than raising.
    """

    def _decode(match):
        ref = match.group(1)
        try:
            if ref[:1] in ("x", "X"):
                return _unichr(int(ref[1:], 16))
            return _unichr(int(ref))
        except (ValueError, OverflowError):
            # Not a number, or outside the valid code point range.
            return match.group(0)

    # Lookahead instead of a consuming ``\s`` so the space after an
    # unterminated reference is not swallowed by the substitution.
    return re.sub(r"&#(\w+)(?:;|(?=\s))", _decode, data)
As per your example:
# Sample headline; the curly quotes are written as numeric character
# references so the call below actually has something to decode.
data = "U.S. Adviser&#8217;s Blunt Memo on Iraq: Time &#8216;to Go Home&#8217;"
# Parenthesized form prints the same under Python 2 and Python 3.
print(html_unicode_decode(data))
# U.S. Adviser’s Blunt Memo on Iraq: Time ‘to Go Home’
Have you looked at htmlentitydefs?
Try this:
import re
try:
    _unichr = unichr  # Python 2: chr() there only covers 0-255
except NameError:
    _unichr = chr  # Python 3: unichr() was folded into chr()


def html_unicode_decode(data):
    """Replace numeric character references in ``data`` with characters.

    Decimal (``&#8217;``) and hexadecimal (``&#x2019;``) references are
    decoded.  A reference counts when it ends in ``;`` or is directly
    followed by whitespace; that whitespace is kept in the result.
    Anything that does not name a valid code point is left untouched
    rather than raising.
    """

    def _decode(match):
        ref = match.group(1)
        try:
            if ref[:1] in ("x", "X"):
                return _unichr(int(ref[1:], 16))
            return _unichr(int(ref))
        except (ValueError, OverflowError):
            # Not a number, or outside the valid code point range.
            return match.group(0)

    # Lookahead instead of a consuming ``\s`` so the space after an
    # unterminated reference is not swallowed by the substitution.
    return re.sub(r"&#(\w+)(?:;|(?=\s))", _decode, data)
As per your example:
# Sample headline; the curly quotes are written as numeric character
# references so the call below actually has something to decode.
data = "U.S. Adviser&#8217;s Blunt Memo on Iraq: Time &#8216;to Go Home&#8217;"
# Parenthesized form prints the same under Python 2 and Python 3.
print(html_unicode_decode(data))
# U.S. Adviser’s Blunt Memo on Iraq: Time ‘to Go Home’
Loading
lang-py