import tornado.web
from selenium import webdriver
import sys
import psycopg2
from selenium.common.exceptions import NoSuchElementException
class LevelHandler(tornado.web.RequestHandler):
def __init__(self, application, request, **kwargs):
super().__init__(application, request, **kwargs)
self.conn_string = "credentials_here"
self.conn = psycopg2.connect(self.conn_string)
self.cursor = self.conn.cursor()
def get(self, word):
driver = webdriver.PhantomJS(executable_path=r'bin/phantomjs')
driver.get(url="http://dictionary.cambridge.org/dictionary/english/%s" % word)
is_word_cached = self.check_cache(word)
if is_word_cached:
response = {'level': is_word_cached[0][0]}
elif self.check_word_404(driver):
response = {'level': "This word wasn't found"}
else:
try:
level = driver.find_element_by_xpath(xpath="//span[@class='def-info']/span[contains(@title,'A1-C2')]")
level = level.text
except NoSuchElementException:
level = "The word level isn't known"
self.write_cache(word, level)
response = {'level': level}
self.write(response)
def check_cache(self, word):
self.cursor.execute("SELECT level FROM eng_level WHERE word = '{0}'".format(word))
records = self.cursor.fetchall()
return records
def write_cache(self, word, level):
self.cursor.execute("INSERT INTO eng_level (word, level) values (%s, %s)", (word, level,))
self.cursor.execute("COMMIT")
def check_word_404(self, driver):
try:
return driver.find_element_by_xpath(xpath="//h1[contains(text(),'404. Page not found.')]")
except NoSuchElementException:
return False
application = tornado.web.Application([
(r"/([A-Za-z]+)", LevelHandler)
])
if __name__ == "__main__":
application.listen(str(sys.argv[1]))
tornado.ioloop.IOLoop.instance().start()
The idea is to fetch the word complexity from a dictionary. I use Selenium and XPath to do this, but once a word is fetched from the external HTML I store it in the database as a cache.
Questions:
- Is everything asynchronous here?
- I'm only 1.5 times faster: the average response from the external website is
4s
and from Postgres cache2.5s
. Can I do better?
-
1\$\begingroup\$ The dictionary website isn't using much JavaScript, so even with it disabled I see content - do you even need Selenium? I'd assume that without it and just fetching and parsing raw HTML you should be faster (and consume fewer resources). \$\endgroup\$ferada– ferada2016年08月25日 22:04:22 +00:00Commented Aug 25, 2016 at 22:04
1 Answer 1
1.This handler isn't asynchronous.
The easiest way to accomplish this with tornado is to make a coroutine like so:
@gen.coroutine
def get(self, word):
driver = webdriver.PhantomJS(executable_path=r'bin/phantomjs')
yield driver.get(url="http://dictionary.cambridge.org/dictionary/english/%s" % word)
is_word_cached = self.check_cache(word)
if is_word_cached:
response = {'level': is_word_cached[0][0]}
elif self.check_word_404(driver):
response = {'level': "This word wasn't found"}
else:
try:
level = driver.find_element_by_xpath(xpath="//span[@class='def-info']/span[contains(@title,'A1-C2')]")
level = level.text
except NoSuchElementException:
level = "The word level isn't known"
self.write_cache(word, level)
response = {'level': level}
self.write(response)
The main thing is to decorate the get method with @gen.coroutine and then have a yield statement in the blocking line, in your case the fetching of the data. Documentation here
- You can store the cache in a in-memory database like Redis, Memcached, in plain Python or in a pickle.
-
\$\begingroup\$ You marked
get
method with@gen.asynchronous
annotation. In post you are referring to@gen.coroutine
. I assume@gen.asynchronous
is a typo, isn't it? \$\endgroup\$Rudziankoŭ– Rudziankoŭ2016年09月29日 11:53:31 +00:00Commented Sep 29, 2016 at 11:53 -
\$\begingroup\$ You're right that was a typo. It should be @gen.coroutine \$\endgroup\$Darko Kolev– Darko Kolev2016年09月30日 09:38:50 +00:00Commented Sep 30, 2016 at 9:38