I went about this the way I did because Selenium is slow and inconvenient (it opens a browser), and I was unable to find the href attribute of the link element using BeautifulSoup. The HTML is included below. On the downside, this approach only ever seems to find 25 images.
#! python3
# Saves photos to file from flickr.com using specified search term
import bs4
import logging
import os
import re
import requests
import shutil
import sys
import time


def find_link(element):
    """Finds link using a regular expression"""
    link_regex = re.compile(r"//c\d+.staticflickr.com/\d+/\d+/\w+\.jpg")
    # dictionary of element attributes
    element_attr_dict = element.attrs
    # get list of element attribute values wherein image link lies
    attr_list = [element_attr_dict[key] for key in element_attr_dict.keys()]
    attr_string = ""
    # link exists within a string list element
    for element in attr_list:
        if type(element) == str:
            attr_string += element
    match = link_regex.search(attr_string)
    if match:
        link = "https:" + match.group()
    else:
        link = None
    return link


def main():
    """Downloads specified type/number of 'flickr' images to folder

    Takes three command line arguments: "filename, search_term,
    number_images". Saves images to folder. Number of images saved is
    based upon the number requested by user or the number found during
    search, whichever is lower.

    Arguments:
    search_term -- search image site using this term
    number_images -- maximum number of images to save to folder
    """
    try:
        search_term, number_images = sys.argv[1:]
        number_images = int(number_images)
    except ValueError:
        print("Something went wrong. Command line input must be of \
format: 'filename searchterm numberimages'")
        return

    links = []

    # make folder to store photos and name using search term
    html_path = r"C:\Users\Dave\Desktop\2016\Coding\AutomateBoring" + \
                r"\11-WebScraping\flickrhtml.txt"
    path = \
        r"C:\Users\Dave\Desktop\2016\Coding\AutomateBoring" + \
        r"\11-WebScraping\gimages\requests"
    folder_path = os.path.join(path, search_term)
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
    os.makedirs(folder_path)

    print("Finding photos...")
    # get links to photos
    res = requests.get("https://www.flickr.com/search/?text=" + search_term)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    found_elems = soup.select(".photo-list-photo-view")
    # in case number of found images < requested
    number_save_images = min(number_images, len(found_elems))
    print("Found {} images".format(number_save_images))
    for found_elem in found_elems[:number_save_images]:
        link = find_link(found_elem)
        links.append(link)

    # write images to file
    print("Writing images to folder...")
    for image_link in links:
        basename = os.path.basename(image_link)
        save_file_name = os.path.join(folder_path, basename)
        res = requests.get(image_link)
        res.raise_for_status()
        with open(save_file_name, "wb") as f:
            for chunk in res.iter_content(100000):
                f.write(chunk)
    print("Images saved at: {}".format(folder_path))
    print("*****Done*****")


if __name__ == "__main__":
    main()
3 Answers
According to PEP 8, standard-library imports should come first (before third-party ones such as bs4 and requests).

Function find_link(element):

- The docstring of the function find_link should probably say something about the argument type.
- Why is the argument named element (and not element_attributes) when it looks like all you use is element.attrs?
- The list comprehension could be written as:

      attr_list = list(element_attr_dict.values())

  But you don't need it at all. You can simply write for element in element_attr_dict.values().
To summarize, you can delete most of the lines at the beginning. Instead, start with:

def find_link(element_attributes):
    link_regex = re.compile(r"//c\d+.staticflickr.com/\d+/\d+/\w+\.jpg")
    attr_string = ""
    for element in element_attributes.values():
        ...
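Filled out along those lines, the whole function might look like this (a sketch; it keeps the original regex and swaps type(x) == str for the more idiomatic isinstance check):

def find_link(element_attributes):
    """Return the image URL found in an element's attrs dict, or None.

    element_attributes -- the .attrs dict of a BeautifulSoup element
    """
    link_regex = re.compile(r"//c\d+.staticflickr.com/\d+/\d+/\w+\.jpg")
    attr_string = ""
    # the link resides in one of the string attribute values
    for element in element_attributes.values():
        if isinstance(element, str):
            attr_string += element
    match = link_regex.search(attr_string)
    return "https:" + match.group() if match else None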
As for the main function:

- Investigate argparse.
- Use logging instead of print.
- Split the main function into sub-functions. At the very least, saving an image from a link should probably be one.
- The for loop where you append a link to a list can be expressed as a list comprehension (see the sketch below).
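For instance, the link-collecting loop could become a single comprehension (same names as in the question):

links = [find_link(found_elem)
         for found_elem in found_elems[:number_save_images]]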
I hope that helps!
-
Thanks, I appreciate it! I've sorted the list comprehension in the main function, and I do use logging (I removed it before posting). All my print statements are deliberate. E.g. while catching a ValueError, it's just a neater (IMO) way of reminding myself of the format to use at the command line. Let me know if that isn't a legitimate use! I'll get on the rest tomorrow! – Michael Johnson, Nov 11, 2016 at 15:36
-
Welcome to Code Review, your first answer looks good, enjoy your stay! – ferada, Nov 11, 2016 at 16:40
-
Argparse is great. It solves a problem I had with my own command-line handling: optional arguments. – Michael Johnson, Nov 14, 2016 at 13:57
Scraping is better than using Selenium, but you've still missed the easiest, most maintainable way, and the one least likely to get you IP-banned: the Flickr API. There's a search method and a method that returns the URLs of the variously-sized versions of an image. And of course, there are many different wrappers already written to make querying the API easy in Python.
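For illustration, a rough sketch using the third-party flickrapi wrapper (the key and secret are placeholders you would obtain from Flickr; the parameter choices are mine, not from the original post):

import flickrapi

# placeholder credentials -- get real ones from Flickr's App Garden
flickr = flickrapi.FlickrAPI("YOUR_API_KEY", "YOUR_API_SECRET",
                             format="parsed-json")

# search for photos, asking for a medium-size URL with each result
result = flickr.photos.search(text="sunset", per_page=10, extras="url_m")
for photo in result["photos"]["photo"]:
    print(photo.get("url_m"))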
Aside from that, the most obvious thing that stands out to me is the lack of any argument parsing. kaidokuuppa already mentioned argparse, which is nice because it's in the standard library. I personally prefer the approach of docopt (Python implementation here), since it encourages writing good, useful help messages.
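With docopt, the usage message itself is the parser. A minimal sketch for this script (the file name and docstring below are illustrative):

"""Saves photos from a flickr.com search to a folder.

Usage:
    flickr_images.py <search_term> <number_images>
"""
from docopt import docopt

args = docopt(__doc__)
search_term = args["<search_term>"]
number_images = int(args["<number_images>"])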
Somewhat still on that subject, I favor doing the command-line parsing separately from the rest of your main(); this allows someone else (perhaps you!) who wants to use that function as a library to import it and pass the appropriate parameters in.
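Concretely, that separation might look like this (the function names here are illustrative):

def download_images(search_term, number_images):
    """Does the real work; importable and callable from other code."""
    ...

def main():
    # only the entry point touches the command line
    search_term, number_images = parse_args()
    download_images(search_term, number_images)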
-
Appreciate it! This was just a challenge in a beginners' book, so I was operating with a very low level of scraping knowledge, but I will certainly investigate your suggestions. – Michael Johnson, Nov 14, 2016 at 12:46
Hope I've covered everything you guys suggested, with the exception of the Flickr API, and that it's all been implemented correctly (I'm unsure regarding argparse; I only completed a short tutorial). Revised code:
#! python3
# Saves photos to file from flickr.com using specified search term
import argparse
import logging
import os
import re
import shutil
import sys
import time

import bs4
import requests


def handle_input():
    """Parses user input and provides guidance as to correct input format"""
    parser = argparse.ArgumentParser()
    parser.add_argument("search_term",
                        help="The term to use in the 'flickr' search")
    parser.add_argument("number_images", type=int,
                        help="The maximum number of images to save to folder.")
    args = parser.parse_args()
    return args.search_term, args.number_images


def find_link(element_attr_dict):
    """Finds link using a regular expression

    Arguments:
    element_attr_dict -- attrs dict of an html element; one of its
    values contains a link to the image src
    """
    link_regex = re.compile(r"//c\d+.staticflickr.com/\d+/\d+/\w+\.jpg")
    # link resides within a string. Get all string dict values and search
    attr_string = ""
    for value in element_attr_dict.values():
        if type(value) == str:
            attr_string += value
    match = link_regex.search(attr_string)
    if match:
        link = "https:" + match.group()
    else:
        link = None
    return link


def save_images(link_list, path):
    """Saves images to 'path' using the basename of each link as filename

    Arguments:
    link_list -- a list of src links which are used to download images
    path -- the absolute path in which to save the images
    """
    print("Writing images to folder...")
    for image_link in link_list:
        basename = os.path.basename(image_link)
        save_file_name = os.path.join(path, basename)
        res = requests.get(image_link)
        res.raise_for_status()
        with open(save_file_name, "wb") as f:
            for chunk in res.iter_content(100000):
                f.write(chunk)
    print("Images saved at: {}".format(path))


def main():
    """Downloads specified type/number of 'flickr' images to folder

    Takes two command line arguments: "search_term, number_images".
    Saves images to folder. Number of images saved is based upon the
    number requested by user or the number found during search,
    whichever is lower.

    Arguments:
    search_term -- search image site using this term
    number_images -- maximum number of images to save to folder
    """
    search_term, number_images = handle_input()

    # make folder to store photos and name using search term
    path = \
        r"C:\Users\Dave\Desktop\2016\Coding\AutomateBoring" + \
        r"\11-WebScraping\gimages\requests"
    folder_path = os.path.join(path, search_term)
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
    os.makedirs(folder_path)

    print("Finding photos...")
    # get links to photos
    res = requests.get("https://www.flickr.com/search/?text=" + search_term)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    found_elems = soup.select(".photo-list-photo-view")
    # in case number of found images < requested
    number_save_images = min(number_images, len(found_elems))
    print("Found {} images".format(number_save_images))
    links = [find_link(found_elem.attrs)
             for found_elem in found_elems[:number_save_images]]

    # save images to folder
    save_images(links, folder_path)
    print("*****Done*****")


if __name__ == "__main__":
    main()
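If the revised script is saved as, say, flickr_images.py (the file name is illustrative), it would be run as:

python flickr_images.py sunset 10

and argparse automatically generates a -h/--help message from the help strings passed to add_argument.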