Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit cca592b

Browse files
Adding Xkcdcomic Scraper
1 parent 2f18733 commit cca592b

File tree

1 file changed

+59
-0
lines changed

1 file changed

+59
-0
lines changed

‎Web-Scraping-Projects/downloadXkcd.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
#! python3
# downloadXkcd.py - Downloads every single XKCD comic.
#
# Starting at the xkcd front page, the script saves the comic image found on
# each page into ./Web-Scraping-Projects/xkcd_comics and then follows the
# page's "Prev" link, until that link points at '#'.

# Importing all the necessary libraries
import os
from urllib.parse import urljoin

import bs4
import requests

# Seconds to wait on any single HTTP request, so a stalled connection cannot
# hang the whole run.
REQUEST_TIMEOUT = 30

url = 'http://xkcd.com'  # starting url
directory = 'xkcd_comics'
# Built with os.path.join rather than a literal ".\..." string: "\W" is an
# invalid escape sequence and the backslash is only a separator on Windows.
parent_dir = os.path.join('.', 'Web-Scraping-Projects')
path = os.path.join(parent_dir, directory)
try:
    os.makedirs(path, exist_ok=True)  # store comics in <parent_dir>/xkcd_comics
except OSError as error:
    print(f"Directory {path} can not be created: {error}")
    # Nothing can be saved without the target directory, so stop here
    # instead of failing later on the first open().
    raise SystemExit(1)
else:
    print(f"Directory {directory} created successfully.")

while not url.endswith('#'):
    # Download the page. A failed page download stops the program: without
    # the page there is no "Prev" link to continue from.
    try:
        res = requests.get(url, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f'There was a problem: {err}')
        break

    # Parse the page; the parser is named explicitly so the result does not
    # depend on which optional parsers happen to be installed.
    xkcdsoup = bs4.BeautifulSoup(res.text, 'html.parser')

    # Find the URL of the comic image: the first <img> inside the element
    # whose id is "comic".
    comic_elems = xkcdsoup.select('#comic img')
    comic_src = comic_elems[0].get('src') if comic_elems else None
    if not comic_src:
        print('Could not find the comic image.')
    else:
        # urljoin resolves protocol-relative ("//host/...") and relative
        # ("/path/...") src values against the current page URL.
        comic_url = urljoin(url, comic_src)
        print(f"Downloading the image {comic_url}")
        try:
            image_res = requests.get(comic_url, timeout=REQUEST_TIMEOUT)
            image_res.raise_for_status()
        except requests.exceptions.RequestException as err:
            # Skip this comic but keep walking back through the archive.
            print(f'Skipping {comic_url}: {err}')
        else:
            # Save the image under its original file name.
            image_path = os.path.join(path, os.path.basename(comic_url))
            with open(image_path, 'wb') as image_file:
                for chunk in image_res.iter_content(100000):
                    image_file.write(chunk)

    # Get the Prev button's url.
    prev_links = xkcdsoup.select('a[rel="prev"]')
    prev_href = prev_links[0].get('href') if prev_links else None
    if not prev_href:
        print('Could not find the Prev link.')
        break
    url = 'http://xkcd.com' + prev_href

print('Done.')

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /