Return to Question

Tweeted twitter.com/StackCodeReview/status/797462692537700353

occurred Nov 12, 2016 at 15:34

removed API key and user ID

Source Link

edited Nov 7, 2016 at 18:07

Graipher

edited Nov 7, 2016 at 18:07

Graipher

41.6k
7
70
134

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""
A script for backing up your Goodreads library. Writes a backup to
`goodreads_backup.json`, or you can provide an alternative name as the
first command-line argument.
See the README for more details.
"""
from datetime import datetime, timezone
import json
import sys
import xml.etree.ElementTree as ET
import keyring
import requests
# Goodreads user ID.
# NOTE(review): the literal is a redaction placeholder, not a usable ID;
# the trailing comment presumably shows where the real value comes from.
USER_ID = '**redacted**'  # keyring.get_password('goodreads', 'user_id')
# Goodreads API key. Obtain one from https://www.goodreads.com/api
# NOTE(review): likewise a placeholder -- never commit the real key.
API_KEY = '**redacted**'  # keyring.get_password('goodreads', 'api_key')
class TagDescriptor(object):
    """
    Read-only descriptor exposing one XML tag of a review as an attribute.

    Used internally by the Review class.  The owning instance must have a
    ``_data`` attribute holding an ETree element representing an XML
    document of the form:

        <review>
          <tag1>value1</tag1>
          <tag2>value2</tag2>
          <book>
            <book_tag1>book_value1</book_tag1>
            <book_tag2>book_value2</book_tag2>
          </book>
        </review>

    :param tag_name: Name of the tag in the XML.  If prefixed with 'book/',
        reads the tag from within <book>.
    :param factory: Applied to the tag's text before returning, if supplied.
        For example, passing ``factory=int`` would cast the value to an
        integer.  Otherwise the text is passed through as a string.  The
        factory is skipped, and ``None`` returned, when the tag is absent
        or has no text.
    """

    def __init__(self, tag_name, factory=None):
        self.tag_name = tag_name
        self.factory = factory if factory is not None else (lambda x: x)

    def __get__(self, obj, owner=None):
        # Accessed on the class rather than on an instance: hand back the
        # descriptor itself instead of failing on ``None._data``.
        if obj is None:
            return self
        # Element.find() understands simple paths, so 'book/title' resolves
        # to the <title> child of <book> without splitting the name by hand.
        element = obj._data.find(self.tag_name)
        # An empty tag has ``text is None``; factories such as ``int`` would
        # raise on that, so report the value as missing instead.
        if element is None or element.text is None:
            return None
        return self.factory(element.text)
def date_factory(date_str):
    """
    Convert a date string returned by the Goodreads API into an
    ISO-8601 UTC string.

    :param date_str: Timestamp of the form "Mon Oct 24 12:26:31 -0700 2016",
        or ``None``/blank when there is no value for the field.
    :returns: The same instant in UTC as produced by
        ``datetime.isoformat()`` (e.g. "2016-10-24T19:26:31+00:00"), or
        ``None`` if there was nothing to convert.
    :raises ValueError: If a non-blank string does not match the format.
    """
    # We may get ``None`` if the API is missing any information for a
    # particular date field -- for example, the ``date_read`` field,
    # presumably when no read date has been recorded.
    if date_str is None or not date_str.strip():
        return None
    # In the API responses, dates are returned in the form
    # "Mon Oct 24 12:26:31 -0700 2016".
    # NOTE: %a and %b match day/month names in the current locale.
    date_obj = datetime.strptime(date_str.strip(), '%a %b %d %H:%M:%S %z %Y')
    # isoformat() emits the 'T' separator that ISO-8601 calls for; str()
    # would put a space between the date and the time.
    return date_obj.astimezone(timezone.utc).isoformat()
def _optional_int(text):
    """Cast *text* to ``int``, mapping missing or blank text to ``None``."""
    if text is None or not text.strip():
        return None
    return int(text)


def _stripped(text):
    """Strip surrounding whitespace, passing ``None`` through unchanged."""
    return None if text is None else text.strip()


class Review(object):
    """
    Wrapper class around an xml.etree.ElementTree.Element object that
    contains the data from a review.  Does all the messy handling of
    getting the interesting bits out of the XML response.

    Values are strings unless a factory says otherwise, and ``None`` where
    the corresponding tag has no text.

    :param data: The <review> element to wrap.
    """

    def __init__(self, data):
        self._data = data

    # Fields read from the nested <book> element.
    book_id = TagDescriptor('book/id')
    title = TagDescriptor('book/title')
    isbn = TagDescriptor('book/isbn')
    isbn13 = TagDescriptor('book/isbn13')
    average_rating = TagDescriptor('book/average_rating')
    publisher = TagDescriptor('book/publisher')
    binding = TagDescriptor('book/format')
    # A bare ``int`` raises TypeError on an empty <num_pages/> tag, so use
    # a factory that tolerates missing text.
    page_count = TagDescriptor('book/num_pages', _optional_int)
    year_published = TagDescriptor('book/publication_year')
    orig_year_published = TagDescriptor('book/published')

    # Fields read from the <review> element itself.
    date_read = TagDescriptor('read_at', date_factory)
    date_added = TagDescriptor('date_added', date_factory)
    # The body may be empty; ``None.strip()`` would raise, hence the
    # None-tolerant factory.
    review = TagDescriptor('body', _stripped)

    @property
    def authors(self):
        """List of author names, in document order (empty if none)."""
        return [
            author.find('name').text
            for author in self._data.findall('book/authors/author')
        ]

    @property
    def my_rating(self):
        """The user's rating as a string, or ``None`` if unrated."""
        # The Goodreads API returns '0' to indicate an unrated book; make
        # this a proper null type.
        rating = self._data.find('rating').text
        return None if rating == '0' else rating

    @property
    def bookshelves(self):
        """List of the names of the shelves this review is filed under."""
        return [
            shelf.attrib['name']
            for shelf in self._data.findall('shelves/shelf')
        ]
def asdict(obj):
    """
    Given an object, return a dictionary of its public attributes suitable
    for JSON serialisation.

    "Public" means every name reported by ``dir(obj)`` that does not start
    with an underscore, so class-level descriptors and properties are
    included alongside instance attributes.  Callables (i.e. public
    methods) are left out because ``json.dumps`` cannot serialise them.

    :param obj: Any object.
    :returns: A ``{name: value}`` dict; each attribute is read once.
    """
    result = {}
    for key in dir(obj):
        if key.startswith('_'):
            continue
        value = getattr(obj, key)
        if not callable(value):
            result[key] = value
    return result
def get_reviews(page_no=1):
    """
    Generate all the reviews associated with a Goodreads account.

    Pages are fetched lazily, one HTTP request per page, starting at
    ``page_no`` and continuing until the response's ``end`` attribute
    reaches its ``total`` attribute.

    :param page_no: (optional) API results are paginated (the original
        note says in batches of 200 -- TODO confirm, no page size is
        requested here); which page to start on.
    :returns: A generator of Review objects.

    If the API answers with anything other than HTTP 200, the error is
    printed to stderr and the process exits with status 1.
    """
    # Loop rather than recurse so that a long library does not build a
    # chain of nested generators, one per page.
    while True:
        # reviews.list (https://www.goodreads.com/api/index#reviews.list)
        # gets all the books on somebody's shelf.
        resp = requests.get(
            'https://www.goodreads.com/review/list.xml',
            params={
                'v': '2',
                'key': API_KEY,
                'id': USER_ID,
                'page': str(page_no),
            },
            # requests waits indefinitely by default; bound the wait so an
            # unattended run cannot hang on a stalled connection.
            timeout=60,
        )
        if resp.status_code != 200:
            print(
                "Unexpected error code from Goodreads API: %s\n"
                "Error message: %r" % (resp.status_code, resp.text),
                file=sys.stderr
            )
            sys.exit(1)

        reviews = ET.fromstring(resp.text).find('reviews')
        page = reviews.findall('review')
        for review in page:
            yield Review(review)

        # Do we need to get the next page?
        # TODO: Have enough books to test this step!
        total = int(reviews.attrib['total'])
        end = int(reviews.attrib['end'])
        # An empty page also ends the loop: otherwise inconsistent counters
        # from the server would keep us requesting pages forever.
        if end >= total or not page:
            return
        page_no += 1
def main():
    """
    Entry point: fetch every review and write them out as one JSON file.

    The output path is the first command-line argument, falling back to
    ``goodreads_backup.json`` when none is given.
    """
    args = sys.argv[1:]
    path = args[0] if args else 'goodreads_backup.json'

    # The whole document is built in memory first; the output file is only
    # opened once serialisation has finished.
    reviews = [asdict(rev) for rev in get_reviews()]
    json_str = json.dumps(reviews, indent=2, sort_keys=True)

    with open(path, 'w', encoding='utf-8') as out:
        out.write(json_str)
    print('Written backup to %s' % path)


if __name__ == '__main__':
    main()

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""
A script for backing up your Goodreads library. Writes a backup to
`goodreads_backup.json`, or you can provide an alternative name as the
first command-line argument.
See the README for more details.
"""
from datetime import datetime, timezone
import json
import sys
import xml.etree.ElementTree as ET
import keyring
import requests
# Goodreads user ID
# NOTE(review): hard-coded in this revision; the trailing comment suggests
# the value is meant to come from the system keyring -- confirm.
USER_ID = '60854850' # keyring.get_password('goodreads', 'user_id')
# Goodreads API key. Obtain one from https://www.goodreads.com/api
# NOTE(review): credentials should not live in source; presumably the
# keyring lookup in the trailing comment is the intended source.
API_KEY = 'GRabcdef1234xyz' # keyring.get_password('goodreads', 'api_key')
class TagDescriptor(object):
    """
    Read-only descriptor exposing one XML tag of a review as an attribute.

    Used internally by the Review class.  The owning instance must have a
    ``_data`` attribute holding an ETree element representing an XML
    document of the form:

        <review>
          <tag1>value1</tag1>
          <tag2>value2</tag2>
          <book>
            <book_tag1>book_value1</book_tag1>
            <book_tag2>book_value2</book_tag2>
          </book>
        </review>

    :param tag_name: Name of the tag in the XML.  If prefixed with 'book/',
        reads the tag from within <book>.
    :param factory: Applied to the tag's text before returning, if supplied.
        For example, passing ``factory=int`` would cast the value to an
        integer.  Otherwise the text is passed through as a string.  The
        factory is skipped, and ``None`` returned, when the tag is absent
        or has no text.
    """

    def __init__(self, tag_name, factory=None):
        self.tag_name = tag_name
        self.factory = factory if factory is not None else (lambda x: x)

    def __get__(self, obj, owner=None):
        # Accessed on the class rather than on an instance: hand back the
        # descriptor itself instead of failing on ``None._data``.
        if obj is None:
            return self
        # Element.find() understands simple paths, so 'book/title' resolves
        # to the <title> child of <book> without splitting the name by hand.
        element = obj._data.find(self.tag_name)
        # An empty tag has ``text is None``; factories such as ``int`` would
        # raise on that, so report the value as missing instead.
        if element is None or element.text is None:
            return None
        return self.factory(element.text)
def date_factory(date_str):
    """
    Convert a date string returned by the Goodreads API into an
    ISO-8601 UTC string.

    :param date_str: Timestamp of the form "Mon Oct 24 12:26:31 -0700 2016",
        or ``None``/blank when there is no value for the field.
    :returns: The same instant in UTC as produced by
        ``datetime.isoformat()`` (e.g. "2016-10-24T19:26:31+00:00"), or
        ``None`` if there was nothing to convert.
    :raises ValueError: If a non-blank string does not match the format.
    """
    # We may get ``None`` if the API is missing any information for a
    # particular date field -- for example, the ``date_read`` field,
    # presumably when no read date has been recorded.
    if date_str is None or not date_str.strip():
        return None
    # In the API responses, dates are returned in the form
    # "Mon Oct 24 12:26:31 -0700 2016".
    # NOTE: %a and %b match day/month names in the current locale.
    date_obj = datetime.strptime(date_str.strip(), '%a %b %d %H:%M:%S %z %Y')
    # isoformat() emits the 'T' separator that ISO-8601 calls for; str()
    # would put a space between the date and the time.
    return date_obj.astimezone(timezone.utc).isoformat()
def _optional_int(text):
    """Cast *text* to ``int``, mapping missing or blank text to ``None``."""
    if text is None or not text.strip():
        return None
    return int(text)


def _stripped(text):
    """Strip surrounding whitespace, passing ``None`` through unchanged."""
    return None if text is None else text.strip()


class Review(object):
    """
    Wrapper class around an xml.etree.ElementTree.Element object that
    contains the data from a review.  Does all the messy handling of
    getting the interesting bits out of the XML response.

    Values are strings unless a factory says otherwise, and ``None`` where
    the corresponding tag has no text.

    :param data: The <review> element to wrap.
    """

    def __init__(self, data):
        self._data = data

    # Fields read from the nested <book> element.
    book_id = TagDescriptor('book/id')
    title = TagDescriptor('book/title')
    isbn = TagDescriptor('book/isbn')
    isbn13 = TagDescriptor('book/isbn13')
    average_rating = TagDescriptor('book/average_rating')
    publisher = TagDescriptor('book/publisher')
    binding = TagDescriptor('book/format')
    # A bare ``int`` raises TypeError on an empty <num_pages/> tag, so use
    # a factory that tolerates missing text.
    page_count = TagDescriptor('book/num_pages', _optional_int)
    year_published = TagDescriptor('book/publication_year')
    orig_year_published = TagDescriptor('book/published')

    # Fields read from the <review> element itself.
    date_read = TagDescriptor('read_at', date_factory)
    date_added = TagDescriptor('date_added', date_factory)
    # The body may be empty; ``None.strip()`` would raise, hence the
    # None-tolerant factory.
    review = TagDescriptor('body', _stripped)

    @property
    def authors(self):
        """List of author names, in document order (empty if none)."""
        return [
            author.find('name').text
            for author in self._data.findall('book/authors/author')
        ]

    @property
    def my_rating(self):
        """The user's rating as a string, or ``None`` if unrated."""
        # The Goodreads API returns '0' to indicate an unrated book; make
        # this a proper null type.
        rating = self._data.find('rating').text
        return None if rating == '0' else rating

    @property
    def bookshelves(self):
        """List of the names of the shelves this review is filed under."""
        return [
            shelf.attrib['name']
            for shelf in self._data.findall('shelves/shelf')
        ]
def asdict(obj):
    """
    Given an object, return a dictionary of its public attributes suitable
    for JSON serialisation.

    "Public" means every name reported by ``dir(obj)`` that does not start
    with an underscore, so class-level descriptors and properties are
    included alongside instance attributes.  Callables (i.e. public
    methods) are left out because ``json.dumps`` cannot serialise them.

    :param obj: Any object.
    :returns: A ``{name: value}`` dict; each attribute is read once.
    """
    result = {}
    for key in dir(obj):
        if key.startswith('_'):
            continue
        value = getattr(obj, key)
        if not callable(value):
            result[key] = value
    return result
def get_reviews(page_no=1):
    """
    Generate all the reviews associated with a Goodreads account.

    Pages are fetched lazily, one HTTP request per page, starting at
    ``page_no`` and continuing until the response's ``end`` attribute
    reaches its ``total`` attribute.

    :param page_no: (optional) API results are paginated (the original
        note says in batches of 200 -- TODO confirm, no page size is
        requested here); which page to start on.
    :returns: A generator of Review objects.

    If the API answers with anything other than HTTP 200, the error is
    printed to stderr and the process exits with status 1.
    """
    # Loop rather than recurse so that a long library does not build a
    # chain of nested generators, one per page.
    while True:
        # reviews.list (https://www.goodreads.com/api/index#reviews.list)
        # gets all the books on somebody's shelf.
        resp = requests.get(
            'https://www.goodreads.com/review/list.xml',
            params={
                'v': '2',
                'key': API_KEY,
                'id': USER_ID,
                'page': str(page_no),
            },
            # requests waits indefinitely by default; bound the wait so an
            # unattended run cannot hang on a stalled connection.
            timeout=60,
        )
        if resp.status_code != 200:
            print(
                "Unexpected error code from Goodreads API: %s\n"
                "Error message: %r" % (resp.status_code, resp.text),
                file=sys.stderr
            )
            sys.exit(1)

        reviews = ET.fromstring(resp.text).find('reviews')
        page = reviews.findall('review')
        for review in page:
            yield Review(review)

        # Do we need to get the next page?
        # TODO: Have enough books to test this step!
        total = int(reviews.attrib['total'])
        end = int(reviews.attrib['end'])
        # An empty page also ends the loop: otherwise inconsistent counters
        # from the server would keep us requesting pages forever.
        if end >= total or not page:
            return
        page_no += 1
def main():
    """
    Entry point: fetch every review and write them out as one JSON file.

    The output path is the first command-line argument, falling back to
    ``goodreads_backup.json`` when none is given.
    """
    args = sys.argv[1:]
    path = args[0] if args else 'goodreads_backup.json'

    # The whole document is built in memory first; the output file is only
    # opened once serialisation has finished.
    reviews = [asdict(rev) for rev in get_reviews()]
    json_str = json.dumps(reviews, indent=2, sort_keys=True)

    with open(path, 'w', encoding='utf-8') as out:
        out.write(json_str)
    print('Written backup to %s' % path)


if __name__ == '__main__':
    main()

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""
A script for backing up your Goodreads library. Writes a backup to
`goodreads_backup.json`, or you can provide an alternative name as the
first command-line argument.
See the README for more details.
"""
from datetime import datetime, timezone
import json
import sys
import xml.etree.ElementTree as ET
import keyring
import requests
# Goodreads user ID
# NOTE(review): '**redacted**' is a placeholder, not a usable ID; the
# trailing comment presumably shows where the real value comes from.
USER_ID = '**redacted**' # keyring.get_password('goodreads', 'user_id')
# Goodreads API key. Obtain one from https://www.goodreads.com/api
# NOTE(review): likewise a placeholder; requests made with it are
# expected to be rejected by the API -- confirm before running.
API_KEY = '**redacted**' # keyring.get_password('goodreads', 'api_key')
class TagDescriptor(object):
    """
    Read-only descriptor exposing one XML tag of a review as an attribute.

    Used internally by the Review class.  The owning instance must have a
    ``_data`` attribute holding an ETree element representing an XML
    document of the form:

        <review>
          <tag1>value1</tag1>
          <tag2>value2</tag2>
          <book>
            <book_tag1>book_value1</book_tag1>
            <book_tag2>book_value2</book_tag2>
          </book>
        </review>

    :param tag_name: Name of the tag in the XML.  If prefixed with 'book/',
        reads the tag from within <book>.
    :param factory: Applied to the tag's text before returning, if supplied.
        For example, passing ``factory=int`` would cast the value to an
        integer.  Otherwise the text is passed through as a string.  The
        factory is skipped, and ``None`` returned, when the tag is absent
        or has no text.
    """

    def __init__(self, tag_name, factory=None):
        self.tag_name = tag_name
        self.factory = factory if factory is not None else (lambda x: x)

    def __get__(self, obj, owner=None):
        # Accessed on the class rather than on an instance: hand back the
        # descriptor itself instead of failing on ``None._data``.
        if obj is None:
            return self
        # Element.find() understands simple paths, so 'book/title' resolves
        # to the <title> child of <book> without splitting the name by hand.
        element = obj._data.find(self.tag_name)
        # An empty tag has ``text is None``; factories such as ``int`` would
        # raise on that, so report the value as missing instead.
        if element is None or element.text is None:
            return None
        return self.factory(element.text)
def date_factory(date_str):
    """
    Convert a date string returned by the Goodreads API into an
    ISO-8601 UTC string.

    :param date_str: Timestamp of the form "Mon Oct 24 12:26:31 -0700 2016",
        or ``None``/blank when there is no value for the field.
    :returns: The same instant in UTC as produced by
        ``datetime.isoformat()`` (e.g. "2016-10-24T19:26:31+00:00"), or
        ``None`` if there was nothing to convert.
    :raises ValueError: If a non-blank string does not match the format.
    """
    # We may get ``None`` if the API is missing any information for a
    # particular date field -- for example, the ``date_read`` field,
    # presumably when no read date has been recorded.
    if date_str is None or not date_str.strip():
        return None
    # In the API responses, dates are returned in the form
    # "Mon Oct 24 12:26:31 -0700 2016".
    # NOTE: %a and %b match day/month names in the current locale.
    date_obj = datetime.strptime(date_str.strip(), '%a %b %d %H:%M:%S %z %Y')
    # isoformat() emits the 'T' separator that ISO-8601 calls for; str()
    # would put a space between the date and the time.
    return date_obj.astimezone(timezone.utc).isoformat()
def _optional_int(text):
    """Cast *text* to ``int``, mapping missing or blank text to ``None``."""
    if text is None or not text.strip():
        return None
    return int(text)


def _stripped(text):
    """Strip surrounding whitespace, passing ``None`` through unchanged."""
    return None if text is None else text.strip()


class Review(object):
    """
    Wrapper class around an xml.etree.ElementTree.Element object that
    contains the data from a review.  Does all the messy handling of
    getting the interesting bits out of the XML response.

    Values are strings unless a factory says otherwise, and ``None`` where
    the corresponding tag has no text.

    :param data: The <review> element to wrap.
    """

    def __init__(self, data):
        self._data = data

    # Fields read from the nested <book> element.
    book_id = TagDescriptor('book/id')
    title = TagDescriptor('book/title')
    isbn = TagDescriptor('book/isbn')
    isbn13 = TagDescriptor('book/isbn13')
    average_rating = TagDescriptor('book/average_rating')
    publisher = TagDescriptor('book/publisher')
    binding = TagDescriptor('book/format')
    # A bare ``int`` raises TypeError on an empty <num_pages/> tag, so use
    # a factory that tolerates missing text.
    page_count = TagDescriptor('book/num_pages', _optional_int)
    year_published = TagDescriptor('book/publication_year')
    orig_year_published = TagDescriptor('book/published')

    # Fields read from the <review> element itself.
    date_read = TagDescriptor('read_at', date_factory)
    date_added = TagDescriptor('date_added', date_factory)
    # The body may be empty; ``None.strip()`` would raise, hence the
    # None-tolerant factory.
    review = TagDescriptor('body', _stripped)

    @property
    def authors(self):
        """List of author names, in document order (empty if none)."""
        return [
            author.find('name').text
            for author in self._data.findall('book/authors/author')
        ]

    @property
    def my_rating(self):
        """The user's rating as a string, or ``None`` if unrated."""
        # The Goodreads API returns '0' to indicate an unrated book; make
        # this a proper null type.
        rating = self._data.find('rating').text
        return None if rating == '0' else rating

    @property
    def bookshelves(self):
        """List of the names of the shelves this review is filed under."""
        return [
            shelf.attrib['name']
            for shelf in self._data.findall('shelves/shelf')
        ]
def asdict(obj):
    """
    Given an object, return a dictionary of its public attributes suitable
    for JSON serialisation.

    "Public" means every name reported by ``dir(obj)`` that does not start
    with an underscore, so class-level descriptors and properties are
    included alongside instance attributes.  Callables (i.e. public
    methods) are left out because ``json.dumps`` cannot serialise them.

    :param obj: Any object.
    :returns: A ``{name: value}`` dict; each attribute is read once.
    """
    result = {}
    for key in dir(obj):
        if key.startswith('_'):
            continue
        value = getattr(obj, key)
        if not callable(value):
            result[key] = value
    return result
def get_reviews(page_no=1):
    """
    Generate all the reviews associated with a Goodreads account.

    Pages are fetched lazily, one HTTP request per page, starting at
    ``page_no`` and continuing until the response's ``end`` attribute
    reaches its ``total`` attribute.

    :param page_no: (optional) API results are paginated (the original
        note says in batches of 200 -- TODO confirm, no page size is
        requested here); which page to start on.
    :returns: A generator of Review objects.

    If the API answers with anything other than HTTP 200, the error is
    printed to stderr and the process exits with status 1.
    """
    # Loop rather than recurse so that a long library does not build a
    # chain of nested generators, one per page.
    while True:
        # reviews.list (https://www.goodreads.com/api/index#reviews.list)
        # gets all the books on somebody's shelf.
        resp = requests.get(
            'https://www.goodreads.com/review/list.xml',
            params={
                'v': '2',
                'key': API_KEY,
                'id': USER_ID,
                'page': str(page_no),
            },
            # requests waits indefinitely by default; bound the wait so an
            # unattended run cannot hang on a stalled connection.
            timeout=60,
        )
        if resp.status_code != 200:
            print(
                "Unexpected error code from Goodreads API: %s\n"
                "Error message: %r" % (resp.status_code, resp.text),
                file=sys.stderr
            )
            sys.exit(1)

        reviews = ET.fromstring(resp.text).find('reviews')
        page = reviews.findall('review')
        for review in page:
            yield Review(review)

        # Do we need to get the next page?
        # TODO: Have enough books to test this step!
        total = int(reviews.attrib['total'])
        end = int(reviews.attrib['end'])
        # An empty page also ends the loop: otherwise inconsistent counters
        # from the server would keep us requesting pages forever.
        if end >= total or not page:
            return
        page_no += 1
def main():
    """
    Entry point: fetch every review and write them out as one JSON file.

    The output path is the first command-line argument, falling back to
    ``goodreads_backup.json`` when none is given.
    """
    args = sys.argv[1:]
    path = args[0] if args else 'goodreads_backup.json'

    # The whole document is built in memory first; the output file is only
    # opened once serialisation has finished.
    reviews = [asdict(rev) for rev in get_reviews()]
    json_str = json.dumps(reviews, indent=2, sort_keys=True)

    with open(path, 'w', encoding='utf-8') as out:
        out.write(json_str)
    print('Written backup to %s' % path)


if __name__ == '__main__':
    main()

Source Link

asked Nov 7, 2016 at 18:05

alexwlchan

asked Nov 7, 2016 at 18:05

alexwlchan

8.7k
1
23
55

Python script for backing up reading information from Goodreads

I’ve written a script for automatically backing up my Goodreads data. This mimics the Export function on the Goodreads website, which returns a CSV file – but this doesn’t require going through a website, and provides JSON instead of CSV. The idea is that I could run this on a cron job, and have regular backups of my user data.

The script can be invoked without any arguments, or with one argument to specify the filename to write the backup to:

$ python3 backup_goodreads.py
$ python3 backup_goodreads.py my_backup.json

I’d be interested in any feedback, but particularly around:

My use of xml.etree. The Goodreads API mostly serves XML, but I’m not very familiar with this library, so I might not be making the best use of it. For ease of reviewing, I’ve posted an API response in a Gist so you can see what the API responses look like.
Is the code easy to follow? It makes a lot of sense to me, but I wrote it! I’m not sure whether it’s easily comprehensible to somebody else.
Uncaught errors. Are there any obvious error cases I’m failing to handle correctly? (Assume I trust that if I get a successful response, the XML will have the right structure.)

Here’s the code:

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""
A script for backing up your Goodreads library. Writes a backup to
`goodreads_backup.json`, or you can provide an alternative name as the
first command-line argument.
See the README for more details.
"""
from datetime import datetime, timezone
import json
import sys
import xml.etree.ElementTree as ET
import keyring
import requests
# Goodreads user ID
# NOTE(review): hard-coded in this revision; the trailing comment suggests
# the value is meant to come from the system keyring -- confirm.
USER_ID = '60854850' # keyring.get_password('goodreads', 'user_id')
# Goodreads API key. Obtain one from https://www.goodreads.com/api
# NOTE(review): credentials should not live in source; presumably the
# keyring lookup in the trailing comment is the intended source.
API_KEY = 'GRabcdef1234xyz' # keyring.get_password('goodreads', 'api_key')
class TagDescriptor(object):
    """
    Read-only descriptor exposing one XML tag of a review as an attribute.

    Used internally by the Review class.  The owning instance must have a
    ``_data`` attribute holding an ETree element representing an XML
    document of the form:

        <review>
          <tag1>value1</tag1>
          <tag2>value2</tag2>
          <book>
            <book_tag1>book_value1</book_tag1>
            <book_tag2>book_value2</book_tag2>
          </book>
        </review>

    :param tag_name: Name of the tag in the XML.  If prefixed with 'book/',
        reads the tag from within <book>.
    :param factory: Applied to the tag's text before returning, if supplied.
        For example, passing ``factory=int`` would cast the value to an
        integer.  Otherwise the text is passed through as a string.  The
        factory is skipped, and ``None`` returned, when the tag is absent
        or has no text.
    """

    def __init__(self, tag_name, factory=None):
        self.tag_name = tag_name
        self.factory = factory if factory is not None else (lambda x: x)

    def __get__(self, obj, owner=None):
        # Accessed on the class rather than on an instance: hand back the
        # descriptor itself instead of failing on ``None._data``.
        if obj is None:
            return self
        # Element.find() understands simple paths, so 'book/title' resolves
        # to the <title> child of <book> without splitting the name by hand.
        element = obj._data.find(self.tag_name)
        # An empty tag has ``text is None``; factories such as ``int`` would
        # raise on that, so report the value as missing instead.
        if element is None or element.text is None:
            return None
        return self.factory(element.text)
def date_factory(date_str):
    """
    Convert a date string returned by the Goodreads API into an
    ISO-8601 UTC string.

    :param date_str: Timestamp of the form "Mon Oct 24 12:26:31 -0700 2016",
        or ``None``/blank when there is no value for the field.
    :returns: The same instant in UTC as produced by
        ``datetime.isoformat()`` (e.g. "2016-10-24T19:26:31+00:00"), or
        ``None`` if there was nothing to convert.
    :raises ValueError: If a non-blank string does not match the format.
    """
    # We may get ``None`` if the API is missing any information for a
    # particular date field -- for example, the ``date_read`` field,
    # presumably when no read date has been recorded.
    if date_str is None or not date_str.strip():
        return None
    # In the API responses, dates are returned in the form
    # "Mon Oct 24 12:26:31 -0700 2016".
    # NOTE: %a and %b match day/month names in the current locale.
    date_obj = datetime.strptime(date_str.strip(), '%a %b %d %H:%M:%S %z %Y')
    # isoformat() emits the 'T' separator that ISO-8601 calls for; str()
    # would put a space between the date and the time.
    return date_obj.astimezone(timezone.utc).isoformat()
def _optional_int(text):
    """Cast *text* to ``int``, mapping missing or blank text to ``None``."""
    if text is None or not text.strip():
        return None
    return int(text)


def _stripped(text):
    """Strip surrounding whitespace, passing ``None`` through unchanged."""
    return None if text is None else text.strip()


class Review(object):
    """
    Wrapper class around an xml.etree.ElementTree.Element object that
    contains the data from a review.  Does all the messy handling of
    getting the interesting bits out of the XML response.

    Values are strings unless a factory says otherwise, and ``None`` where
    the corresponding tag has no text.

    :param data: The <review> element to wrap.
    """

    def __init__(self, data):
        self._data = data

    # Fields read from the nested <book> element.
    book_id = TagDescriptor('book/id')
    title = TagDescriptor('book/title')
    isbn = TagDescriptor('book/isbn')
    isbn13 = TagDescriptor('book/isbn13')
    average_rating = TagDescriptor('book/average_rating')
    publisher = TagDescriptor('book/publisher')
    binding = TagDescriptor('book/format')
    # A bare ``int`` raises TypeError on an empty <num_pages/> tag, so use
    # a factory that tolerates missing text.
    page_count = TagDescriptor('book/num_pages', _optional_int)
    year_published = TagDescriptor('book/publication_year')
    orig_year_published = TagDescriptor('book/published')

    # Fields read from the <review> element itself.
    date_read = TagDescriptor('read_at', date_factory)
    date_added = TagDescriptor('date_added', date_factory)
    # The body may be empty; ``None.strip()`` would raise, hence the
    # None-tolerant factory.
    review = TagDescriptor('body', _stripped)

    @property
    def authors(self):
        """List of author names, in document order (empty if none)."""
        return [
            author.find('name').text
            for author in self._data.findall('book/authors/author')
        ]

    @property
    def my_rating(self):
        """The user's rating as a string, or ``None`` if unrated."""
        # The Goodreads API returns '0' to indicate an unrated book; make
        # this a proper null type.
        rating = self._data.find('rating').text
        return None if rating == '0' else rating

    @property
    def bookshelves(self):
        """List of the names of the shelves this review is filed under."""
        return [
            shelf.attrib['name']
            for shelf in self._data.findall('shelves/shelf')
        ]
def asdict(obj):
    """
    Given an object, return a dictionary of its public attributes suitable
    for JSON serialisation.

    "Public" means every name reported by ``dir(obj)`` that does not start
    with an underscore, so class-level descriptors and properties are
    included alongside instance attributes.  Callables (i.e. public
    methods) are left out because ``json.dumps`` cannot serialise them.

    :param obj: Any object.
    :returns: A ``{name: value}`` dict; each attribute is read once.
    """
    result = {}
    for key in dir(obj):
        if key.startswith('_'):
            continue
        value = getattr(obj, key)
        if not callable(value):
            result[key] = value
    return result
def get_reviews(page_no=1):
    """
    Generate all the reviews associated with a Goodreads account.

    Pages are fetched lazily, one HTTP request per page, starting at
    ``page_no`` and continuing until the response's ``end`` attribute
    reaches its ``total`` attribute.

    :param page_no: (optional) API results are paginated (the original
        note says in batches of 200 -- TODO confirm, no page size is
        requested here); which page to start on.
    :returns: A generator of Review objects.

    If the API answers with anything other than HTTP 200, the error is
    printed to stderr and the process exits with status 1.
    """
    # Loop rather than recurse so that a long library does not build a
    # chain of nested generators, one per page.
    while True:
        # reviews.list (https://www.goodreads.com/api/index#reviews.list)
        # gets all the books on somebody's shelf.
        resp = requests.get(
            'https://www.goodreads.com/review/list.xml',
            params={
                'v': '2',
                'key': API_KEY,
                'id': USER_ID,
                'page': str(page_no),
            },
            # requests waits indefinitely by default; bound the wait so an
            # unattended run cannot hang on a stalled connection.
            timeout=60,
        )
        if resp.status_code != 200:
            print(
                "Unexpected error code from Goodreads API: %s\n"
                "Error message: %r" % (resp.status_code, resp.text),
                file=sys.stderr
            )
            sys.exit(1)

        reviews = ET.fromstring(resp.text).find('reviews')
        page = reviews.findall('review')
        for review in page:
            yield Review(review)

        # Do we need to get the next page?
        # TODO: Have enough books to test this step!
        total = int(reviews.attrib['total'])
        end = int(reviews.attrib['end'])
        # An empty page also ends the loop: otherwise inconsistent counters
        # from the server would keep us requesting pages forever.
        if end >= total or not page:
            return
        page_no += 1
def main():
    """
    Entry point: fetch every review and write them out as one JSON file.

    The output path is the first command-line argument, falling back to
    ``goodreads_backup.json`` when none is given.
    """
    args = sys.argv[1:]
    path = args[0] if args else 'goodreads_backup.json'

    # The whole document is built in memory first; the output file is only
    # opened once serialisation has finished.
    reviews = [asdict(rev) for rev in get_reviews()]
    json_str = json.dumps(reviews, indent=2, sort_keys=True)

    with open(path, 'w', encoding='utf-8') as out:
        out.write(json_str)
    print('Written backup to %s' % path)


if __name__ == '__main__':
    main()

The master version of this code, along with a README and a requirements.txt, is on GitHub.

python python-3.x

lang-py