This was my first attempt at scraping approximately 1,800 open datasets from the web. The first run takes roughly an hour to complete; subsequent runs take only about 4 seconds, since files that already exist on disk are skipped. I suspect there are much better ways to implement the logic I used for the filename structure, and to improve overall efficiency.
import os
import re
import time
import urllib.parse
from datetime import datetime
from urllib.error import HTTPError
from urllib.request import urlopen, urlretrieve

from bs4 import BeautifulSoup


def scraper(url, downloadDirectory):
    start = time.time()
    baseUrl = r"{some_url}"
    html = urlopen(url)
    bsObj = BeautifulSoup(html.read())
    table = bsObj.findAll("table")[0]
    links = table.findAll("a")
    count = 0
    broken_links = []
    for link in links:
        try:
            count += 1
            # Split the anchor tag on double quotes; element 1 is the href.
            link = str(link).split("\"")
            if len(link) > 1:
                print(link)
                link = link[1]
                linkBreak = link.split("_")
            else:
                if link[0] == "<a></a>":
                    print("Skipping")
                    continue
                else:
                    print(link)
                    linkBreak = link.split("_")
            # Derive the dataset title from the tokenised link.
            title = re.findall(r"[\w']+", str(linkBreak))[9].strip("'")
            if title == "nyc":
                title = re.findall(r"[\w']+", str(linkBreak))[10].strip("'")
            print("# " + str(count), "Title: " + str(title))
            dir_path = os.path.join(downloadDirectory, title)
            if os.path.isdir(dir_path) == False:
                print("Creating directory: " + str(os.path.join(downloadDirectory, title)))
                os.mkdir(dir_path)
            file_path = urllib.parse.urljoin(baseUrl, link)
            print("File Path: " + str(file_path), "\n" + "Directory Path: " + str(dir_path))
            print("Split array and length: ", linkBreak, len(linkBreak))
            # Build the local filename from the underscore-separated pieces of
            # the link, skipping anything downloaded on a previous run.
            if len(linkBreak) == 1:
                if os.path.isfile(os.path.join(dir_path, str(linkBreak[0]).split("/")[7])):
                    print("Skipping")
                    continue
                else:
                    print("Result: " + str(os.path.join(dir_path, str(linkBreak[0]).split("/")[7])))
                    urlretrieve(file_path, os.path.join(dir_path, str(linkBreak[0]).split("/")[7]))
            elif len(linkBreak) == 2:
                if os.path.isfile(os.path.join(dir_path, title + "_" + linkBreak[1])):
                    print("Skipping")
                    continue
                elif str(os.path.join(dir_path, title + "_" + linkBreak[1])).endswith(".zip") == False:
                    if os.path.isfile(os.path.join(dir_path, title + "_" + linkBreak[1] + ".zip")):
                        print("Skipping")
                        continue
                    else:
                        print("Result: " + str(os.path.join(dir_path, title + "_" + linkBreak[1] + ".zip")))
                        urlretrieve(file_path, os.path.join(dir_path, title + "_" + linkBreak[1] + ".zip"))
                else:
                    print("Result: " + str(os.path.join(dir_path, title + "_" + linkBreak[1])))
                    urlretrieve(file_path, os.path.join(dir_path, title + "_" + linkBreak[1]))
            elif len(linkBreak) == 3:
                if "?" in linkBreak[2]:
                    # Drop the query string from the final piece.
                    linkBreak[2] = linkBreak[2].split("?", 1)[0]
                    if os.path.isfile(os.path.join(dir_path, title + "_" + linkBreak[2])):
                        print("Skipping")
                        continue
                    else:
                        print("Result: " + str(os.path.join(dir_path, title + "_" + linkBreak[2])))
                        urlretrieve(file_path, os.path.join(dir_path, title + "_" + linkBreak[2]))
                if title == "sidewalkcafe":
                    linkBreak[2] = str(linkBreak[1]) + str(linkBreak[2])
                    if os.path.isfile(os.path.join(dir_path, title + linkBreak[2])):
                        print("Skipping")
                        continue
                    else:
                        print("Result: " + str(os.path.join(dir_path, title + linkBreak[2])))
                        urlretrieve(file_path, os.path.join(dir_path, title + linkBreak[2]))
                else:
                    if os.path.isfile(os.path.join(dir_path, title + "_" + linkBreak[2])):
                        print("Skipping")
                        continue
                    else:
                        print("Result: " + str(os.path.join(dir_path, title + "_" + linkBreak[2])))
                        urlretrieve(file_path, os.path.join(dir_path, title + "_" + linkBreak[2]))
            elif len(linkBreak) == 4:
                if "?" in linkBreak[3]:
                    linkBreak[3] = linkBreak[3].split("?", 1)[0]
                    linkBreak[2] = str(linkBreak[2]) + "_" + str(linkBreak[3])
                    if os.path.isfile(os.path.join(dir_path, title + "_" + linkBreak[2])):
                        print("Skipping")
                        continue
                    else:
                        print("Result: " + str(os.path.join(dir_path, title + "_" + linkBreak[2])))
                        urlretrieve(file_path, os.path.join(dir_path, title + "_" + linkBreak[2]))
                else:
                    if os.path.isfile(os.path.join(dir_path, title + "_" + linkBreak[2])):
                        print("Skipping")
                        continue
                    else:
                        print("Result: " + str(os.path.join(dir_path, title + "_" + linkBreak[2])))
                        urlretrieve(file_path, os.path.join(dir_path, title + "_" + linkBreak[2]))
        except HTTPError as e:
            if e.code == 404:
                print(e)
                print(count, "__________")
                broken_links.append([count, title, link])
                continue
            else:
                raise
    end = time.time()
    # Record any 404s so they can be reviewed later.
    fp = os.path.join(downloadDirectory, "BrokenLinks.txt")
    file = open(fp, "w+")
    for link in broken_links:
        file.write(str(link) + "\n")
    file.write(str(datetime.now()))
    file.close()
    return("Script completed in: " + str(end - start) + " seconds.")


archURL = {some_url}
archDownloadDirectory = {some_localpath}
scraper(archURL, archDownloadDirectory)
1 Answer
- Whereas your code might work now, I strongly recommend using `requests` instead of `urlopen`. Requests is awesome, and makes a lot of complicated stuff simple, especially for web scrapers.
- I believe `BeautifulSoup` will complain unless you pass another parameter to its constructor, `'html.parser'`.
- You have everything in one function. This is very difficult to read, understand and debug. I recommend subdividing this into smaller functions called from your top-level function.
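For those first three points, here is a minimal sketch of the fetch-and-parse step pulled out into its own small function, using `requests` and an explicit parser. The function name `get_dataset_links` is made up for illustration and is not part of the original code.

import requests
from bs4 import BeautifulSoup

def get_dataset_links(url):
    # Fetch the page and let requests surface HTTP errors early.
    response = requests.get(url)
    response.raise_for_status()
    # Parse with an explicit parser so BeautifulSoup does not have to guess.
    soup = BeautifulSoup(response.text, 'html.parser')
    # Same lookup as the original: first table, then every anchor in it.
    return soup.find('table').find_all('a')

Your top-level scraper would then just loop over get_dataset_links(url) and hand each link off to smaller helpers.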
- You have a bunch of repeated code - for instance, your `if isfile` logic. Again, factor this out into functions where possible.
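Every branch of the length check repeats "skip if the file already exists, otherwise print and download". That could live in one helper; a minimal sketch, where `download_if_missing` is a made-up name rather than anything in the original code:

import os
from urllib.request import urlretrieve

def download_if_missing(file_url, target_path):
    # Files fetched on a previous run are skipped, which is why
    # the second pass of the script finishes in seconds.
    if os.path.isfile(target_path):
        print('Skipping %s' % target_path)
        return False
    print('Result: %s' % target_path)
    urlretrieve(file_url, target_path)
    return True

Each `elif` branch then only has to build the target path and make one call.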
- You do not need to call `str` on `file_path`. It's already a str.
- Rather than calling `file.close()`, you can put this in a `with` block. Read about them here - https://docs.python.org/3/reference/compound_stmts.html#with
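Applied to the BrokenLinks.txt block near the end of `scraper`, that might look roughly like this. This is only a sketch of that fragment and assumes the same `downloadDirectory` and `broken_links` variables are in scope:

fp = os.path.join(downloadDirectory, "BrokenLinks.txt")
with open(fp, "w+") as report:
    # The file is closed automatically when the block exits,
    # even if one of the writes raises.
    for link in broken_links:
        report.write(str(link) + "\n")
    report.write(str(datetime.now()))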
print("File Path: " + str(file_path), "\n" + "Directory Path: " + str(dir_path))
Is better written as:
print('File Path: %s' % file_path)
print('Directory Path: %s' % dir_path)
You get the idea. It's easier to read if you have one print
statement per line. Also,
title + "_" + linkBreak[2]
is more easily read as
'%s_%s' % (title, linkBreak[2])
With that expression in particular, you reuse it many times throughout the code, so it should be assigned to a variable.
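As a small sketch of that, inside the loop you could compute the path once and branch on it; the variable name `target_path` is made up for illustration:

target_path = os.path.join(dir_path, '%s_%s' % (title, linkBreak[2]))
if os.path.isfile(target_path):
    print('Skipping %s' % target_path)
else:
    print('Result: %s' % target_path)
    urlretrieve(file_path, target_path)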
- Here:

linkBreak[3] = linkBreak[3].split("?",1)[0]

You're dropping the query parameters from the URL. You probably shouldn't be doing this parsing yourself. You should be calling `urlparse`.
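A minimal sketch with the standard library; the example href is made up for illustration:

from urllib.parse import urlparse

parts = urlparse('nyc_sidewalkcafe_2018.zip?download=true')
filename = parts.path    # 'nyc_sidewalkcafe_2018.zip' - the query string is already split off
query = parts.query      # 'download=true', still available if you ever need it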
- `downloadDirectory`, pythonically, should use snake_case, i.e. `download_dir`. Same with other identifiers like `archDownloadDirectory`.
Comments:
- (Installing and) using `'lxml'` instead of `'html.parser'` will give you a nice speed boost (but will add an external dependency). – Graipher, Nov 19, 2018 at 17:05
- Thank you so much for all of the suggestions! – MapPeddler, Nov 19, 2018 at 20:46