The aim of this module is to extract the title from possibly broken hypertext, without using any awesome third-party modules like bs4 or lxml.
#-*-coding:utf8;-*-
#qpy:3
#qpy:console
'''
Extract the title from a web page using
the standard lib.
'''
from html.parser import HTMLParser
from urllib.request import urlopen
import urllib


def error_callback(*_, **__):
    pass


def is_string(data):
    return isinstance(data, str)


def is_bytes(data):
    return isinstance(data, bytes)


def to_ascii(data):
    if is_string(data):
        data = data.encode('ascii', errors='ignore')
    elif is_bytes(data):
        data = data.decode('ascii', errors='ignore')
    else:
        data = str(data).encode('ascii', errors='ignore')
    return data


class Parser(HTMLParser):
    def __init__(self, url):
        self.title = None
        self.rec = False
        HTMLParser.__init__(self)
        try:
            self.feed(to_ascii(urlopen(url).read()))
        except urllib.error.HTTPError:
            return
        except urllib.error.URLError:
            return
        except ValueError:
            return
        self.rec = False
        self.error = error_callback

    def handle_starttag(self, tag, attrs):
        if tag == 'title':
            self.rec = True

    def handle_data(self, data):
        if self.rec:
            self.title = data

    def handle_endtag(self, tag):
        if tag == 'title':
            self.rec = False


def get_title(url):
    return Parser(url).title


print(get_title('http://www.google.com'))
2 Answers
Looks pretty clean to me, good job! Here's what I recommend:
The to_ascii function is small and readable, but in my opinion the is_string and is_bytes functions are overkill:

def to_ascii(data):
    if isinstance(data, str):
        ...
    elif isinstance(data, bytes):
        ...
    else:
        ...
We can make to_ascii less complex by returning immediately:

def to_ascii(data):
    if isinstance(data, str):
        return data.encode("ascii", errors="ignore")
    elif isinstance(data, bytes):
        return data.decode("ascii", errors="ignore")
We can then leave out the else clause:

def to_ascii(data):
    if isinstance(data, str):
        return data.encode("ascii", errors="ignore")
    elif isinstance(data, bytes):
        return data.decode("ascii", errors="ignore")
    return str(data).encode("ascii", errors="ignore")
error_callback is pretty much useless. Maybe you planned on adding more functionality to the Parser class, but as is, you can just leave it out.

I don't expect a parser's constructor to automatically parse the data I pass. How about:
class Parser(HTMLParser):
    def __init__(self, data):
        self._data = data
        ...

    def parse(self):
        self._parsed = self.feed(
            ...
        )
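That way the caller decides when parsing happens. A rough usage sketch of the outline above (html_text is a placeholder for the already-fetched page, and the title attribute is assumed to be filled in by handlers like yours):

# Usage sketch only: construct, parse on demand, then read the result.
parser = Parser(html_text)
parser.parse()
print(parser.title)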
Your Parser class both defines the rules of the parser you're building and processes the data on its own. This is too much: you should only define the parser in this class and handle data retrieval (URL reading) in another function.
That being said, keep the habit of using super() instead of calling the base class method directly; this will help you when you get into trickier inheritance schemes.
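In your class that is a one-line change in the constructor (a sketch of the constructor only, the rest stays as you wrote it):

from html.parser import HTMLParser

class Parser(HTMLParser):
    def __init__(self, url):
        # Cooperative call: resolved through the MRO, so it keeps working
        # if the class hierarchy ever becomes more involved.
        super().__init__()
        # ...rather than naming the base class explicitly:
        # HTMLParser.__init__(self)
        self.title = None
        self.rec = False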
Your to_ascii function is harmful, as the feed method of HTMLParser explicitly expects a str. Your to_ascii returns a str if data is of bytes type and a bytes otherwise. Don't.
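For instance, with the to_ascii above in scope, a quick interpreter session shows the asymmetry:

>>> to_ascii('café')            # str in  -> bytes out
b'caf'
>>> to_ascii('café'.encode())   # bytes in -> str out
'caf'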
Instead, you could either extend the feed behaviour to allow for any type in your subclass, or explicitly convert the result of urlopen(..).read() that you know to be of bytes type.
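The second option is the lighter one. A minimal sketch, reusing the URL from your script (the ascii/ignore choice mirrors your to_ascii, though utf-8 is usually the safer guess):

from urllib.request import urlopen

# read() returns bytes, so convert explicitly before parsing
raw = urlopen('http://www.google.com').read()
text = raw.decode('ascii', errors='ignore')   # now a str, which feed() accepts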
And speaking of urlopen(..).read(), you should use the result of the urlopen call as the context manager it is supposed to be, to ease resource management.
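For example (sketch only):

from urllib.request import urlopen

# The with-statement closes the underlying connection for you,
# even if read() raises.
with urlopen('http://www.google.com') as response:
    raw = response.read()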
Your error handling seems off, as you don't do anything on errors. I, personally, would be happier to see why a request failed with a whole traceback rather than getting a title that is None (i.e. I'm losing information here). Besides, HTTPError being a subclass of URLError, you don't need both in your excepts.
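If you do decide to catch, a single except clause is enough. A sketch (the fetch name is just for illustration; whether to re-raise, log, or return None as in the rewrite below is your call):

from urllib.error import URLError
from urllib.request import urlopen


def fetch(url):
    try:
        with urlopen(url) as response:
            return response.read()
    except URLError as exc:      # HTTPError is a subclass, so it is caught too
        print('request failed:', exc)
        raise                    # keep the traceback instead of hiding the cause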
Lastly, use an if __name__ == '__main__': guard so you can more easily import your script for testing purposes.
#!/usr/bin/env python3
#-*-coding:utf8;-*-
#qpy:3
#qpy:console
'''
Extract the title from a web page using
the standard lib.
'''
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib.error import URLError


class Parser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.title = ''
        self._in_title_tag = False

    def handle_starttag(self, tag, attrs):
        if tag == 'title':
            self._in_title_tag = True

    def handle_data(self, data):
        if self._in_title_tag:
            self.title += data

    def handle_endtag(self, tag):
        if tag == 'title':
            self._in_title_tag = False


def get_title(url):
    try:
        with urlopen(url) as stream:
            data = stream.read()
    except URLError:
        return
    parser = Parser()
    parser.feed(data.decode('utf-8', errors='ignore'))
    return parser.title


if __name__ == '__main__':
    print(get_title('http://www.google.com'))
- to_ascii() will always return an ASCII encoded str – Ricky Wilson, Dec 19, 2017 at 15:55
- @RickyWilson Sure... >>> to_ascii('test') b'test' – 301_Moved_Permanently, Dec 19, 2017 at 15:58
- How can this be fixed? – Ricky Wilson, Dec 19, 2017 at 16:45
- error_callback?
- error_callback() is used to overwrite HTMLParser's error method.