Tornado: async calls and DB caching

Question 1

import tornado.web
from selenium import webdriver
import sys
import psycopg2
from selenium.common.exceptions import NoSuchElementException
class LevelHandler(tornado.web.RequestHandler):
 def __init__(self, application, request, **kwargs):
 super().__init__(application, request, **kwargs)
 self.conn_string = "credentials_here"
 self.conn = psycopg2.connect(self.conn_string)
 self.cursor = self.conn.cursor()
 def get(self, word):
 driver = webdriver.PhantomJS(executable_path=r'bin/phantomjs')
 driver.get(url="http://dictionary.cambridge.org/dictionary/english/%s" % word)
 is_word_cached = self.check_cache(word)
 if is_word_cached:
 response = {'level': is_word_cached[0][0]}
 elif self.check_word_404(driver):
 response = {'level': "This word wasn't found"}
 else:
 try:
 level = driver.find_element_by_xpath(xpath="//span[@class='def-info']/span[contains(@title,'A1-C2')]")
 level = level.text
 except NoSuchElementException:
 level = "The word level isn't known"
 self.write_cache(word, level)
 response = {'level': level}
 self.write(response)
 def check_cache(self, word):
 self.cursor.execute("SELECT level FROM eng_level WHERE word = '{0}'".format(word))
 records = self.cursor.fetchall()
 return records
 def write_cache(self, word, level):
 self.cursor.execute("INSERT INTO eng_level (word, level) values (%s, %s)", (word, level,))
 self.cursor.execute("COMMIT")
 def check_word_404(self, driver):
 try:
 return driver.find_element_by_xpath(xpath="//h1[contains(text(),'404. Page not found.')]")
 except NoSuchElementException:
 return False
application = tornado.web.Application([
 (r"/([A-Za-z]+)", LevelHandler)
])
if __name__ == "__main__":
 application.listen(str(sys.argv[1]))
 tornado.ioloop.IOLoop.instance().start()

The idea is to fetch the word complexity from a dictionary. I use Selenium and XPath to do this, but once a word is fetched from the external HTML I store it in the database as a cache.

Questions:

Is everything asynchronous here?
I'm only 1.5 times faster: the average response from the external website is 4s and from Postgres cache 2.5s. Can I do better?

Question 2

The dictionary website isn't using much JavaScript, so even with it disabled I see content - do you even need Selenium? I'd assume that without it and just fetching and parsing raw HTML you should be faster (and consume fewer resources).

Question 3

1.This handler isn't asynchronous.

The easiest way to accomplish this with tornado is to make a coroutine like so:

@gen.coroutine
def get(self, word):
 driver = webdriver.PhantomJS(executable_path=r'bin/phantomjs')
 yield driver.get(url="http://dictionary.cambridge.org/dictionary/english/%s" % word)
 is_word_cached = self.check_cache(word)
 if is_word_cached:
 response = {'level': is_word_cached[0][0]}
 elif self.check_word_404(driver):
 response = {'level': "This word wasn't found"}
 else:
 try:
 level = driver.find_element_by_xpath(xpath="//span[@class='def-info']/span[contains(@title,'A1-C2')]")
 level = level.text
 except NoSuchElementException:
 level = "The word level isn't known"
 self.write_cache(word, level)
 response = {'level': level}
 self.write(response)

The main thing is to decorate the get method with @gen.coroutine and then have a yield statement in the blocking line, in your case the fetching of the data. Documentation here

You can store the cache in a in-memory database like Redis, Memcached, in plain Python or in a pickle.

Question 4

You marked get method with @gen.asynchronous annotation. In post you are referring to @gen.coroutine. I assume @gen.asynchronous is a typo, isn't it?

Question 5

You're right that was a typo. It should be @gen.coroutine

Darko Kolev Darko Kolev 562 bronze badges · Accepted Answer · 2016-09-20 13:03:17Z

1.This handler isn't asynchronous.

The easiest way to accomplish this with tornado is to make a coroutine like so:

@gen.coroutine
def get(self, word):
 driver = webdriver.PhantomJS(executable_path=r'bin/phantomjs')
 yield driver.get(url="http://dictionary.cambridge.org/dictionary/english/%s" % word)
 is_word_cached = self.check_cache(word)
 if is_word_cached:
 response = {'level': is_word_cached[0][0]}
 elif self.check_word_404(driver):
 response = {'level': "This word wasn't found"}
 else:
 try:
 level = driver.find_element_by_xpath(xpath="//span[@class='def-info']/span[contains(@title,'A1-C2')]")
 level = level.text
 except NoSuchElementException:
 level = "The word level isn't known"
 self.write_cache(word, level)
 response = {'level': level}
 self.write(response)

The main thing is to decorate the get method with @gen.coroutine and then have a yield statement in the blocking line, in your case the fetching of the data. Documentation here

You can store the cache in a in-memory database like Redis, Memcached, in plain Python or in a pickle.

You marked get method with @gen.asynchronous annotation. In post you are referring to @gen.coroutine. I assume @gen.asynchronous is a typo, isn't it?

Stack Exchange Network

Tornado: async calls and DB caching

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Tornado: async calls and DB caching

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions