2
\$\begingroup\$
import tornado.web
from selenium import webdriver
import sys
import psycopg2
from selenium.common.exceptions import NoSuchElementException
class LevelHandler(tornado.web.RequestHandler):
 def __init__(self, application, request, **kwargs):
 super().__init__(application, request, **kwargs)
 self.conn_string = "credentials_here"
 self.conn = psycopg2.connect(self.conn_string)
 self.cursor = self.conn.cursor()
 def get(self, word):
 driver = webdriver.PhantomJS(executable_path=r'bin/phantomjs')
 driver.get(url="http://dictionary.cambridge.org/dictionary/english/%s" % word)
 is_word_cached = self.check_cache(word)
 if is_word_cached:
 response = {'level': is_word_cached[0][0]}
 elif self.check_word_404(driver):
 response = {'level': "This word wasn't found"}
 else:
 try:
 level = driver.find_element_by_xpath(xpath="//span[@class='def-info']/span[contains(@title,'A1-C2')]")
 level = level.text
 except NoSuchElementException:
 level = "The word level isn't known"
 self.write_cache(word, level)
 response = {'level': level}
 self.write(response)
 def check_cache(self, word):
 self.cursor.execute("SELECT level FROM eng_level WHERE word = '{0}'".format(word))
 records = self.cursor.fetchall()
 return records
 def write_cache(self, word, level):
 self.cursor.execute("INSERT INTO eng_level (word, level) values (%s, %s)", (word, level,))
 self.cursor.execute("COMMIT")
 def check_word_404(self, driver):
 try:
 return driver.find_element_by_xpath(xpath="//h1[contains(text(),'404. Page not found.')]")
 except NoSuchElementException:
 return False
application = tornado.web.Application([
 (r"/([A-Za-z]+)", LevelHandler)
])
if __name__ == "__main__":
 application.listen(str(sys.argv[1]))
 tornado.ioloop.IOLoop.instance().start()

The idea is to fetch the word complexity from a dictionary. I use Selenium and XPath to do this, but once a word is fetched from the external HTML I store it in the database as a cache.

Questions:

  • Is everything asynchronous here?
  • I'm only 1.5 times faster: the average response from the external website is 4s and from Postgres cache 2.5s. Can I do better?
ferada
11.4k25 silver badges65 bronze badges
asked Aug 25, 2016 at 14:12
\$\endgroup\$
1
  • 1
    \$\begingroup\$ The dictionary website isn't using much JavaScript, so even with it disabled I see content - do you even need Selenium? I'd assume that without it and just fetching and parsing raw HTML you should be faster (and consume fewer resources). \$\endgroup\$ Commented Aug 25, 2016 at 22:04

1 Answer 1

4
\$\begingroup\$

1.This handler isn't asynchronous.

The easiest way to accomplish this with tornado is to make a coroutine like so:

@gen.coroutine
def get(self, word):
 driver = webdriver.PhantomJS(executable_path=r'bin/phantomjs')
 yield driver.get(url="http://dictionary.cambridge.org/dictionary/english/%s" % word)
 is_word_cached = self.check_cache(word)
 if is_word_cached:
 response = {'level': is_word_cached[0][0]}
 elif self.check_word_404(driver):
 response = {'level': "This word wasn't found"}
 else:
 try:
 level = driver.find_element_by_xpath(xpath="//span[@class='def-info']/span[contains(@title,'A1-C2')]")
 level = level.text
 except NoSuchElementException:
 level = "The word level isn't known"
 self.write_cache(word, level)
 response = {'level': level}
 self.write(response)

The main thing is to decorate the get method with @gen.coroutine and then have a yield statement in the blocking line, in your case the fetching of the data. Documentation here

  1. You can store the cache in a in-memory database like Redis, Memcached, in plain Python or in a pickle.
answered Sep 20, 2016 at 13:03
\$\endgroup\$
2
  • \$\begingroup\$ You marked get method with @gen.asynchronous annotation. In post you are referring to @gen.coroutine. I assume @gen.asynchronous is a typo, isn't it? \$\endgroup\$ Commented Sep 29, 2016 at 11:53
  • \$\begingroup\$ You're right that was a typo. It should be @gen.coroutine \$\endgroup\$ Commented Sep 30, 2016 at 9:38

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.