#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""
A script for backing up your Goodreads library. Writes a backup to
`goodreads_backup.json`, or you can provide an alternative name as the
first command-line argument.
See the README for more details.
"""
from datetime import datetime, timezone
import json
import sys
import xml.etree.ElementTree as ET
import keyring
import requests
# Goodreads user ID
# NOTE(review): the two adjacent string literals below are concatenated by
# Python into the single value '60854850**redacted**'. This looks like a
# redaction artifact -- confirm the intended ID before running.
# NOTE(review): the value is hard-coded in source; the trailing comment shows
# the keyring lookup it appears intended to come from.
USER_ID = '60854850''**redacted**' # keyring.get_password('goodreads', 'user_id')
# Goodreads API key. Obtain one from https://www.goodreads.com/api
# NOTE(review): same adjacent-literal concatenation as USER_ID above
# ('GRabcdef1234xyz**redacted**'); avoid committing a real key.
API_KEY = 'GRabcdef1234xyz''**redacted**' # keyring.get_password('goodreads', 'api_key')
class TagDescriptor(object):
    """
    Read-only descriptor that exposes one XML tag as an attribute.

    Used internally by the Review class. The owning object keeps an ETree
    element in ``_data`` representing an XML document of the form:

        <review>
          <tag1>value1</tag1>
          <tag2>value2</tag2>
          <book>
            <book_tag1>book_value1</book_tag1>
            <book_tag2>book_value2</book_tag2>
          </book>
        </review>

    :param tag_name: Path of the tag relative to the root element, in
        ElementTree path syntax. ``'tag1'`` reads a direct child;
        ``'book/book_tag1'`` reads a tag from within <book>.
    :param factory: Applied to the tag's text before returning, if
        supplied. For example, passing ``factory=int`` would cast the
        value to an integer. Otherwise the text is passed through as a
        string. The factory is not called when the tag is missing or
        empty; ``None`` is returned instead.
    """

    def __init__(self, tag_name, factory=None):
        self.tag_name = tag_name
        self.factory = factory if factory is not None else (lambda x: x)

    def __get__(self, obj, type=None):
        # ``type`` shadows the builtin, but the parameter name is kept so
        # the signature stays compatible with any existing caller.
        if obj is None:
            # Accessed on the class rather than on an instance: follow the
            # usual descriptor convention and hand back the descriptor.
            return self
        # ElementTree resolves 'book/title'-style paths itself, so there is
        # no need to split the tag name and chain two find() calls.
        element = obj._data.find(self.tag_name)
        if element is None or element.text is None:
            # Missing or empty tag: report "no value" instead of passing
            # None to a factory such as int() that cannot handle it.
            return None
        return self.factory(element.text)
def date_factory(date_str):
    """
    Convert a date string returned by the Goodreads API into a UTC
    timestamp string.

    The result is ``str()`` of the timezone-aware datetime, e.g.
    ``'2016-10-24 19:26:31+00:00'``. ``None`` is passed straight through.
    """
    # The API may leave a date field empty (``date_read`` is one such
    # field), in which case there is nothing to convert.
    if date_str is not None:
        # Dates arrive in the form "Mon Oct 24 12:26:31 -0700 2016".
        parsed = datetime.strptime(date_str, '%a %b %d %H:%M:%S %z %Y')
        in_utc = parsed.astimezone(timezone.utc)
        return str(in_utc)
    return None
class Review(object):
    """
    Wrapper class around an xml.etree.ElementTree.Element object that
    contains the data from a review. Does all the messy handling of
    getting the interesting bits out of the XML response.

    Every attribute whose name does not start with an underscore is picked
    up by ``asdict`` and ends up in the backup, so helpers added to this
    class must be private.

    :param data: The ``<review>`` element to wrap.
    """

    def __init__(self, data):
        self._data = data

    book_id = TagDescriptor('book/id')
    title = TagDescriptor('book/title')
    isbn = TagDescriptor('book/isbn')
    isbn13 = TagDescriptor('book/isbn13')
    average_rating = TagDescriptor('book/average_rating')
    publisher = TagDescriptor('book/publisher')
    binding = TagDescriptor('book/format')
    # An empty <num_pages> tag has no text; report None rather than letting
    # int() raise on it.
    page_count = TagDescriptor(
        'book/num_pages',
        lambda x: int(x) if x and x.strip() else None
    )
    year_published = TagDescriptor('book/publication_year')
    orig_year_published = TagDescriptor('book/published')
    date_read = TagDescriptor('read_at', date_factory)
    date_added = TagDescriptor('date_added', date_factory)
    # An empty <body> tag has no text to strip.
    review = TagDescriptor(
        'body',
        lambda x: x.strip() if x is not None else None
    )

    @property
    def authors(self):
        """List of author names, in document order."""
        return [
            author.find('name').text
            for author in self._data.findall('book/authors/author')
        ]

    @property
    def my_rating(self):
        """The user's rating as a string, or ``None`` if unrated."""
        # The Goodreads API returns '0' to indicate an unrated book; make
        # this a proper null type.
        rating = self._data.find('rating').text
        return None if rating == '0' else rating

    @property
    def bookshelves(self):
        """List of the names of the shelves this review is on."""
        return [
            shelf.attrib['name']
            for shelf in self._data.findall('shelves/shelf')
        ]
def asdict(obj):
    """
    Collect the public attributes of ``obj`` into a dictionary.

    An attribute counts as public when its name, as reported by ``dir()``,
    does not begin with an underscore. The result is intended for JSON
    serialisation.
    """
    public_names = [name for name in dir(obj) if not name.startswith('_')]
    result = {}
    for name in public_names:
        result[name] = getattr(obj, name)
    return result
def get_reviews(page_no=1):
    """
    Generate all the reviews associated with a Goodreads account.

    Pages are fetched one at a time in a loop until the response reports
    that the last review has been reached. On a network failure or a
    non-200 response, an error is printed to stderr and the process exits
    with status 1.

    :param page_no: (optional) API results are paginated, which page to
        start on.
    """
    while True:
        # reviews.list (https://www.goodreads.com/api/index#reviews.list)
        # gets all the books on somebody's shelf.
        try:
            req = requests.get(
                'https://www.goodreads.com/review/list.xml',
                params={
                    'v': '2',
                    'key': API_KEY,
                    'id': USER_ID,
                    'page': str(page_no)
                },
                # Without a timeout an unresponsive server would hang the
                # script forever, e.g. when run unattended.
                timeout=60
            )
        except requests.RequestException as err:
            print(
                "Error contacting the Goodreads API: %s" % err,
                file=sys.stderr
            )
            sys.exit(1)

        if req.status_code != 200:
            print(
                "Unexpected error code from Goodreads API: %s\n"
                "Error message: %r" % (req.status_code, req.text),
                file=sys.stderr
            )
            sys.exit(1)

        reviews = ET.fromstring(req.text).find('reviews')
        page = reviews.findall('review')
        for review in page:
            yield Review(review)

        # Do we need to get the next page?
        total = int(reviews.attrib['total'])
        end = int(reviews.attrib['end'])
        # An empty page also stops the loop, so an inconsistent
        # 'end'/'total' pair cannot make it request pages forever.
        if end >= total or not page:
            return
        page_no += 1
def main():
    """
    Fetch every review and write them to a JSON file.

    The output path is the first command-line argument if one is given,
    otherwise ``goodreads_backup.json``.
    """
    path = sys.argv[1] if len(sys.argv) > 1 else 'goodreads_backup.json'

    # All reviews are fetched and serialised before the file is opened, so
    # the file is not touched if fetching stops the script part-way.
    reviews = [asdict(rev) for rev in get_reviews()]
    json_str = json.dumps(reviews, indent=2, sort_keys=True)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(json_str)
    print('Written backup to %s' % path)


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""
A script for backing up your Goodreads library. Writes a backup to
`goodreads_backup.json`, or you can provide an alternative name as the
first command-line argument.
See the README for more details.
"""
from datetime import datetime, timezone
import json
import sys
import xml.etree.ElementTree as ET
import keyring
import requests
# Goodreads user ID
# NOTE(review): the value is hard-coded in source; the trailing comment shows
# the keyring lookup it appears intended to come from.
USER_ID = '60854850' # keyring.get_password('goodreads', 'user_id')
# Goodreads API key. Obtain one from https://www.goodreads.com/api
# NOTE(review): hard-coded as well; avoid committing a real key.
API_KEY = 'GRabcdef1234xyz' # keyring.get_password('goodreads', 'api_key')
class TagDescriptor(object):
    """
    Read-only descriptor that exposes one XML tag as an attribute.

    Used internally by the Review class. The owning object keeps an ETree
    element in ``_data`` representing an XML document of the form:

        <review>
          <tag1>value1</tag1>
          <tag2>value2</tag2>
          <book>
            <book_tag1>book_value1</book_tag1>
            <book_tag2>book_value2</book_tag2>
          </book>
        </review>

    :param tag_name: Path of the tag relative to the root element, in
        ElementTree path syntax. ``'tag1'`` reads a direct child;
        ``'book/book_tag1'`` reads a tag from within <book>.
    :param factory: Applied to the tag's text before returning, if
        supplied. For example, passing ``factory=int`` would cast the
        value to an integer. Otherwise the text is passed through as a
        string. The factory is not called when the tag is missing or
        empty; ``None`` is returned instead.
    """

    def __init__(self, tag_name, factory=None):
        self.tag_name = tag_name
        self.factory = factory if factory is not None else (lambda x: x)

    def __get__(self, obj, type=None):
        # ``type`` shadows the builtin, but the parameter name is kept so
        # the signature stays compatible with any existing caller.
        if obj is None:
            # Accessed on the class rather than on an instance: follow the
            # usual descriptor convention and hand back the descriptor.
            return self
        # ElementTree resolves 'book/title'-style paths itself, so there is
        # no need to split the tag name and chain two find() calls.
        element = obj._data.find(self.tag_name)
        if element is None or element.text is None:
            # Missing or empty tag: report "no value" instead of passing
            # None to a factory such as int() that cannot handle it.
            return None
        return self.factory(element.text)
def date_factory(date_str):
    """
    Convert a date string returned by the Goodreads API into a UTC
    timestamp string.

    The result is ``str()`` of the timezone-aware datetime, e.g.
    ``'2016-10-24 19:26:31+00:00'``. ``None`` is passed straight through.
    """
    # The API may leave a date field empty (``date_read`` is one such
    # field), in which case there is nothing to convert.
    if date_str is not None:
        # Dates arrive in the form "Mon Oct 24 12:26:31 -0700 2016".
        parsed = datetime.strptime(date_str, '%a %b %d %H:%M:%S %z %Y')
        in_utc = parsed.astimezone(timezone.utc)
        return str(in_utc)
    return None
class Review(object):
    """
    Wrapper class around an xml.etree.ElementTree.Element object that
    contains the data from a review. Does all the messy handling of
    getting the interesting bits out of the XML response.

    Every attribute whose name does not start with an underscore is picked
    up by ``asdict`` and ends up in the backup, so helpers added to this
    class must be private.

    :param data: The ``<review>`` element to wrap.
    """

    def __init__(self, data):
        self._data = data

    book_id = TagDescriptor('book/id')
    title = TagDescriptor('book/title')
    isbn = TagDescriptor('book/isbn')
    isbn13 = TagDescriptor('book/isbn13')
    average_rating = TagDescriptor('book/average_rating')
    publisher = TagDescriptor('book/publisher')
    binding = TagDescriptor('book/format')
    # An empty <num_pages> tag has no text; report None rather than letting
    # int() raise on it.
    page_count = TagDescriptor(
        'book/num_pages',
        lambda x: int(x) if x and x.strip() else None
    )
    year_published = TagDescriptor('book/publication_year')
    orig_year_published = TagDescriptor('book/published')
    date_read = TagDescriptor('read_at', date_factory)
    date_added = TagDescriptor('date_added', date_factory)
    # An empty <body> tag has no text to strip.
    review = TagDescriptor(
        'body',
        lambda x: x.strip() if x is not None else None
    )

    @property
    def authors(self):
        """List of author names, in document order."""
        return [
            author.find('name').text
            for author in self._data.findall('book/authors/author')
        ]

    @property
    def my_rating(self):
        """The user's rating as a string, or ``None`` if unrated."""
        # The Goodreads API returns '0' to indicate an unrated book; make
        # this a proper null type.
        rating = self._data.find('rating').text
        return None if rating == '0' else rating

    @property
    def bookshelves(self):
        """List of the names of the shelves this review is on."""
        return [
            shelf.attrib['name']
            for shelf in self._data.findall('shelves/shelf')
        ]
def asdict(obj):
    """
    Collect the public attributes of ``obj`` into a dictionary.

    An attribute counts as public when its name, as reported by ``dir()``,
    does not begin with an underscore. The result is intended for JSON
    serialisation.
    """
    public_names = [name for name in dir(obj) if not name.startswith('_')]
    result = {}
    for name in public_names:
        result[name] = getattr(obj, name)
    return result
def get_reviews(page_no=1):
    """
    Generate all the reviews associated with a Goodreads account.

    Pages are fetched one at a time in a loop until the response reports
    that the last review has been reached. On a network failure or a
    non-200 response, an error is printed to stderr and the process exits
    with status 1.

    :param page_no: (optional) API results are paginated, which page to
        start on.
    """
    while True:
        # reviews.list (https://www.goodreads.com/api/index#reviews.list)
        # gets all the books on somebody's shelf.
        try:
            req = requests.get(
                'https://www.goodreads.com/review/list.xml',
                params={
                    'v': '2',
                    'key': API_KEY,
                    'id': USER_ID,
                    'page': str(page_no)
                },
                # Without a timeout an unresponsive server would hang the
                # script forever, e.g. when run unattended.
                timeout=60
            )
        except requests.RequestException as err:
            print(
                "Error contacting the Goodreads API: %s" % err,
                file=sys.stderr
            )
            sys.exit(1)

        if req.status_code != 200:
            print(
                "Unexpected error code from Goodreads API: %s\n"
                "Error message: %r" % (req.status_code, req.text),
                file=sys.stderr
            )
            sys.exit(1)

        reviews = ET.fromstring(req.text).find('reviews')
        page = reviews.findall('review')
        for review in page:
            yield Review(review)

        # Do we need to get the next page?
        total = int(reviews.attrib['total'])
        end = int(reviews.attrib['end'])
        # An empty page also stops the loop, so an inconsistent
        # 'end'/'total' pair cannot make it request pages forever.
        if end >= total or not page:
            return
        page_no += 1
def main():
    """
    Fetch every review and write them to a JSON file.

    The output path is the first command-line argument if one is given,
    otherwise ``goodreads_backup.json``.
    """
    path = sys.argv[1] if len(sys.argv) > 1 else 'goodreads_backup.json'

    # All reviews are fetched and serialised before the file is opened, so
    # the file is not touched if fetching stops the script part-way.
    reviews = [asdict(rev) for rev in get_reviews()]
    json_str = json.dumps(reviews, indent=2, sort_keys=True)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(json_str)
    print('Written backup to %s' % path)


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""
A script for backing up your Goodreads library. Writes a backup to
`goodreads_backup.json`, or you can provide an alternative name as the
first command-line argument.
See the README for more details.
"""
from datetime import datetime, timezone
import json
import sys
import xml.etree.ElementTree as ET
import keyring
import requests
# Goodreads user ID
# NOTE(review): '**redacted**' is a placeholder, not a usable ID; the
# trailing comment shows the keyring lookup the value appears intended to
# come from.
USER_ID = '**redacted**' # keyring.get_password('goodreads', 'user_id')
# Goodreads API key. Obtain one from https://www.goodreads.com/api
# NOTE(review): placeholder value as well; avoid committing a real key.
API_KEY = '**redacted**' # keyring.get_password('goodreads', 'api_key')
class TagDescriptor(object):
    """
    Read-only descriptor that exposes one XML tag as an attribute.

    Used internally by the Review class. The owning object keeps an ETree
    element in ``_data`` representing an XML document of the form:

        <review>
          <tag1>value1</tag1>
          <tag2>value2</tag2>
          <book>
            <book_tag1>book_value1</book_tag1>
            <book_tag2>book_value2</book_tag2>
          </book>
        </review>

    :param tag_name: Path of the tag relative to the root element, in
        ElementTree path syntax. ``'tag1'`` reads a direct child;
        ``'book/book_tag1'`` reads a tag from within <book>.
    :param factory: Applied to the tag's text before returning, if
        supplied. For example, passing ``factory=int`` would cast the
        value to an integer. Otherwise the text is passed through as a
        string. The factory is not called when the tag is missing or
        empty; ``None`` is returned instead.
    """

    def __init__(self, tag_name, factory=None):
        self.tag_name = tag_name
        self.factory = factory if factory is not None else (lambda x: x)

    def __get__(self, obj, type=None):
        # ``type`` shadows the builtin, but the parameter name is kept so
        # the signature stays compatible with any existing caller.
        if obj is None:
            # Accessed on the class rather than on an instance: follow the
            # usual descriptor convention and hand back the descriptor.
            return self
        # ElementTree resolves 'book/title'-style paths itself, so there is
        # no need to split the tag name and chain two find() calls.
        element = obj._data.find(self.tag_name)
        if element is None or element.text is None:
            # Missing or empty tag: report "no value" instead of passing
            # None to a factory such as int() that cannot handle it.
            return None
        return self.factory(element.text)
def date_factory(date_str):
    """
    Convert a date string returned by the Goodreads API into a UTC
    timestamp string.

    The result is ``str()`` of the timezone-aware datetime, e.g.
    ``'2016-10-24 19:26:31+00:00'``. ``None`` is passed straight through.
    """
    # The API may leave a date field empty (``date_read`` is one such
    # field), in which case there is nothing to convert.
    if date_str is not None:
        # Dates arrive in the form "Mon Oct 24 12:26:31 -0700 2016".
        parsed = datetime.strptime(date_str, '%a %b %d %H:%M:%S %z %Y')
        in_utc = parsed.astimezone(timezone.utc)
        return str(in_utc)
    return None
class Review(object):
    """
    Wrapper class around an xml.etree.ElementTree.Element object that
    contains the data from a review. Does all the messy handling of
    getting the interesting bits out of the XML response.

    Every attribute whose name does not start with an underscore is picked
    up by ``asdict`` and ends up in the backup, so helpers added to this
    class must be private.

    :param data: The ``<review>`` element to wrap.
    """

    def __init__(self, data):
        self._data = data

    book_id = TagDescriptor('book/id')
    title = TagDescriptor('book/title')
    isbn = TagDescriptor('book/isbn')
    isbn13 = TagDescriptor('book/isbn13')
    average_rating = TagDescriptor('book/average_rating')
    publisher = TagDescriptor('book/publisher')
    binding = TagDescriptor('book/format')
    # An empty <num_pages> tag has no text; report None rather than letting
    # int() raise on it.
    page_count = TagDescriptor(
        'book/num_pages',
        lambda x: int(x) if x and x.strip() else None
    )
    year_published = TagDescriptor('book/publication_year')
    orig_year_published = TagDescriptor('book/published')
    date_read = TagDescriptor('read_at', date_factory)
    date_added = TagDescriptor('date_added', date_factory)
    # An empty <body> tag has no text to strip.
    review = TagDescriptor(
        'body',
        lambda x: x.strip() if x is not None else None
    )

    @property
    def authors(self):
        """List of author names, in document order."""
        return [
            author.find('name').text
            for author in self._data.findall('book/authors/author')
        ]

    @property
    def my_rating(self):
        """The user's rating as a string, or ``None`` if unrated."""
        # The Goodreads API returns '0' to indicate an unrated book; make
        # this a proper null type.
        rating = self._data.find('rating').text
        return None if rating == '0' else rating

    @property
    def bookshelves(self):
        """List of the names of the shelves this review is on."""
        return [
            shelf.attrib['name']
            for shelf in self._data.findall('shelves/shelf')
        ]
def asdict(obj):
    """
    Collect the public attributes of ``obj`` into a dictionary.

    An attribute counts as public when its name, as reported by ``dir()``,
    does not begin with an underscore. The result is intended for JSON
    serialisation.
    """
    public_names = [name for name in dir(obj) if not name.startswith('_')]
    result = {}
    for name in public_names:
        result[name] = getattr(obj, name)
    return result
def get_reviews(page_no=1):
    """
    Generate all the reviews associated with a Goodreads account.

    Pages are fetched one at a time in a loop until the response reports
    that the last review has been reached. On a network failure or a
    non-200 response, an error is printed to stderr and the process exits
    with status 1.

    :param page_no: (optional) API results are paginated, which page to
        start on.
    """
    while True:
        # reviews.list (https://www.goodreads.com/api/index#reviews.list)
        # gets all the books on somebody's shelf.
        try:
            req = requests.get(
                'https://www.goodreads.com/review/list.xml',
                params={
                    'v': '2',
                    'key': API_KEY,
                    'id': USER_ID,
                    'page': str(page_no)
                },
                # Without a timeout an unresponsive server would hang the
                # script forever, e.g. when run unattended.
                timeout=60
            )
        except requests.RequestException as err:
            print(
                "Error contacting the Goodreads API: %s" % err,
                file=sys.stderr
            )
            sys.exit(1)

        if req.status_code != 200:
            print(
                "Unexpected error code from Goodreads API: %s\n"
                "Error message: %r" % (req.status_code, req.text),
                file=sys.stderr
            )
            sys.exit(1)

        reviews = ET.fromstring(req.text).find('reviews')
        page = reviews.findall('review')
        for review in page:
            yield Review(review)

        # Do we need to get the next page?
        total = int(reviews.attrib['total'])
        end = int(reviews.attrib['end'])
        # An empty page also stops the loop, so an inconsistent
        # 'end'/'total' pair cannot make it request pages forever.
        if end >= total or not page:
            return
        page_no += 1
def main():
    """
    Fetch every review and write them to a JSON file.

    The output path is the first command-line argument if one is given,
    otherwise ``goodreads_backup.json``.
    """
    path = sys.argv[1] if len(sys.argv) > 1 else 'goodreads_backup.json'

    # All reviews are fetched and serialised before the file is opened, so
    # the file is not touched if fetching stops the script part-way.
    reviews = [asdict(rev) for rev in get_reviews()]
    json_str = json.dumps(reviews, indent=2, sort_keys=True)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(json_str)
    print('Written backup to %s' % path)


if __name__ == '__main__':
    main()
Python script for backing up reading information from Goodreads
I’ve written a script for automatically backing up my Goodreads data. This mimics the Export function on the Goodreads website, which returns a CSV file – but this doesn’t require going through a website, and provides JSON instead of CSV. The idea is that I could run this on a cron job, and have regular backups of my user data.
The script can be invoked without any arguments, or with one argument to specify the filename to write the backup to:
$ python3 backup_goodreads.py
$ python3 backup_goodreads.py my_backup.json
I’d be interested in any feedback, but particularly around:
My use of xml.etree. The Goodreads API mostly serves XML, but I’m not very familiar with this library, so I might not be making the best use of it. For ease of reviewing, I’ve posted an API response in a Gist so you can see what the API responses look like.
Is the code easy to follow? It makes a lot of sense to me, but I wrote it! I’m not sure whether it’s easily comprehensible to somebody else.
Uncaught errors. Are there any obvious error cases I’m failing to handle correctly? (Assume I trust that if I get a successful response, the XML will have the right structure.)
Here’s the code:
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""
A script for backing up your Goodreads library. Writes a backup to
`goodreads_backup.json`, or you can provide an alternative name as the
first command-line argument.
See the README for more details.
"""
from datetime import datetime, timezone
import json
import sys
import xml.etree.ElementTree as ET
import keyring
import requests
# Goodreads user ID
# NOTE(review): the value is hard-coded in source; the trailing comment shows
# the keyring lookup it appears intended to come from.
USER_ID = '60854850' # keyring.get_password('goodreads', 'user_id')
# Goodreads API key. Obtain one from https://www.goodreads.com/api
# NOTE(review): hard-coded as well; avoid committing a real key.
API_KEY = 'GRabcdef1234xyz' # keyring.get_password('goodreads', 'api_key')
class TagDescriptor(object):
    """
    Read-only descriptor that exposes one XML tag as an attribute.

    Used internally by the Review class. The owning object keeps an ETree
    element in ``_data`` representing an XML document of the form:

        <review>
          <tag1>value1</tag1>
          <tag2>value2</tag2>
          <book>
            <book_tag1>book_value1</book_tag1>
            <book_tag2>book_value2</book_tag2>
          </book>
        </review>

    :param tag_name: Path of the tag relative to the root element, in
        ElementTree path syntax. ``'tag1'`` reads a direct child;
        ``'book/book_tag1'`` reads a tag from within <book>.
    :param factory: Applied to the tag's text before returning, if
        supplied. For example, passing ``factory=int`` would cast the
        value to an integer. Otherwise the text is passed through as a
        string. The factory is not called when the tag is missing or
        empty; ``None`` is returned instead.
    """

    def __init__(self, tag_name, factory=None):
        self.tag_name = tag_name
        self.factory = factory if factory is not None else (lambda x: x)

    def __get__(self, obj, type=None):
        # ``type`` shadows the builtin, but the parameter name is kept so
        # the signature stays compatible with any existing caller.
        if obj is None:
            # Accessed on the class rather than on an instance: follow the
            # usual descriptor convention and hand back the descriptor.
            return self
        # ElementTree resolves 'book/title'-style paths itself, so there is
        # no need to split the tag name and chain two find() calls.
        element = obj._data.find(self.tag_name)
        if element is None or element.text is None:
            # Missing or empty tag: report "no value" instead of passing
            # None to a factory such as int() that cannot handle it.
            return None
        return self.factory(element.text)
def date_factory(date_str):
    """
    Convert a date string returned by the Goodreads API into a UTC
    timestamp string.

    The result is ``str()`` of the timezone-aware datetime, e.g.
    ``'2016-10-24 19:26:31+00:00'``. ``None`` is passed straight through.
    """
    # The API may leave a date field empty (``date_read`` is one such
    # field), in which case there is nothing to convert.
    if date_str is not None:
        # Dates arrive in the form "Mon Oct 24 12:26:31 -0700 2016".
        parsed = datetime.strptime(date_str, '%a %b %d %H:%M:%S %z %Y')
        in_utc = parsed.astimezone(timezone.utc)
        return str(in_utc)
    return None
class Review(object):
    """
    Wrapper class around an xml.etree.ElementTree.Element object that
    contains the data from a review. Does all the messy handling of
    getting the interesting bits out of the XML response.

    Every attribute whose name does not start with an underscore is picked
    up by ``asdict`` and ends up in the backup, so helpers added to this
    class must be private.

    :param data: The ``<review>`` element to wrap.
    """

    def __init__(self, data):
        self._data = data

    book_id = TagDescriptor('book/id')
    title = TagDescriptor('book/title')
    isbn = TagDescriptor('book/isbn')
    isbn13 = TagDescriptor('book/isbn13')
    average_rating = TagDescriptor('book/average_rating')
    publisher = TagDescriptor('book/publisher')
    binding = TagDescriptor('book/format')
    # An empty <num_pages> tag has no text; report None rather than letting
    # int() raise on it.
    page_count = TagDescriptor(
        'book/num_pages',
        lambda x: int(x) if x and x.strip() else None
    )
    year_published = TagDescriptor('book/publication_year')
    orig_year_published = TagDescriptor('book/published')
    date_read = TagDescriptor('read_at', date_factory)
    date_added = TagDescriptor('date_added', date_factory)
    # An empty <body> tag has no text to strip.
    review = TagDescriptor(
        'body',
        lambda x: x.strip() if x is not None else None
    )

    @property
    def authors(self):
        """List of author names, in document order."""
        return [
            author.find('name').text
            for author in self._data.findall('book/authors/author')
        ]

    @property
    def my_rating(self):
        """The user's rating as a string, or ``None`` if unrated."""
        # The Goodreads API returns '0' to indicate an unrated book; make
        # this a proper null type.
        rating = self._data.find('rating').text
        return None if rating == '0' else rating

    @property
    def bookshelves(self):
        """List of the names of the shelves this review is on."""
        return [
            shelf.attrib['name']
            for shelf in self._data.findall('shelves/shelf')
        ]
def asdict(obj):
    """
    Collect the public attributes of ``obj`` into a dictionary.

    An attribute counts as public when its name, as reported by ``dir()``,
    does not begin with an underscore. The result is intended for JSON
    serialisation.
    """
    public_names = [name for name in dir(obj) if not name.startswith('_')]
    result = {}
    for name in public_names:
        result[name] = getattr(obj, name)
    return result
def get_reviews(page_no=1):
    """
    Generate all the reviews associated with a Goodreads account.

    Pages are fetched one at a time in a loop until the response reports
    that the last review has been reached. On a network failure or a
    non-200 response, an error is printed to stderr and the process exits
    with status 1.

    :param page_no: (optional) API results are paginated, which page to
        start on.
    """
    while True:
        # reviews.list (https://www.goodreads.com/api/index#reviews.list)
        # gets all the books on somebody's shelf.
        try:
            req = requests.get(
                'https://www.goodreads.com/review/list.xml',
                params={
                    'v': '2',
                    'key': API_KEY,
                    'id': USER_ID,
                    'page': str(page_no)
                },
                # Without a timeout an unresponsive server would hang the
                # script forever, e.g. when run unattended.
                timeout=60
            )
        except requests.RequestException as err:
            print(
                "Error contacting the Goodreads API: %s" % err,
                file=sys.stderr
            )
            sys.exit(1)

        if req.status_code != 200:
            print(
                "Unexpected error code from Goodreads API: %s\n"
                "Error message: %r" % (req.status_code, req.text),
                file=sys.stderr
            )
            sys.exit(1)

        reviews = ET.fromstring(req.text).find('reviews')
        page = reviews.findall('review')
        for review in page:
            yield Review(review)

        # Do we need to get the next page?
        total = int(reviews.attrib['total'])
        end = int(reviews.attrib['end'])
        # An empty page also stops the loop, so an inconsistent
        # 'end'/'total' pair cannot make it request pages forever.
        if end >= total or not page:
            return
        page_no += 1
def main():
    """
    Fetch every review and write them to a JSON file.

    The output path is the first command-line argument if one is given,
    otherwise ``goodreads_backup.json``.
    """
    path = sys.argv[1] if len(sys.argv) > 1 else 'goodreads_backup.json'

    # All reviews are fetched and serialised before the file is opened, so
    # the file is not touched if fetching stops the script part-way.
    reviews = [asdict(rev) for rev in get_reviews()]
    json_str = json.dumps(reviews, indent=2, sort_keys=True)

    with open(path, 'w', encoding='utf-8') as f:
        f.write(json_str)
    print('Written backup to %s' % path)


if __name__ == '__main__':
    main()
The master version of this code, along with a README and a requirements.txt, is on GitHub.