I have the following code:
items = set() # use a set to not have duplicate items
a = requests.get(f"{BASE_URL}?{BASE_QUERIES}&cursor=*")
amount = a.json()["totalResults"] # in the range of 30 million
items.update(item["guid"].split("?")[0] for item in a.json()["items"]) # we only want the url before the query strings
cursor = a.json()["nextCursor"] # we have a cursor as each request only returns 100 items, the cursor shows us where to start
while len(items) < amount:  # ensure we get all the items
    try:
        a = requests.get(f"{BASE_URL}?{BASE_QUERIES}&cursor={cursor}")
        items.update(item["guid"].split("?")[0] for item in a.json()["items"])
    except:
        continue
    try:
        cursor = urllib.parse.quote(a.json()["nextCursor"])
    except KeyError:
        if len(items) == amount:  # when we reach the final iteration the cursor will not be there
            break

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
}
for link in items:  # iterate over each item
    for _ in range(3):
        response = requests.get(link, headers=headers, stream=True)
        response.raw.decode_content = True
        parser = etree.HTMLParser()
        tree = etree.parse(response.raw, parser)
        setting = tree.xpath(
            "/html/body/div[1]/div/div/main/div/div[2]/div/div[2]/div/div/div[1]/a/span[1]/text()"
        )[0].strip()
        try:
            image = tree.xpath(
                "/html/body/div[1]/div/div/main/div/div[2]/div/div[1]/div[1]/div/div/a/@href"
            )[0]  # most of the time this works
        except:
            try:
                image = tree.xpath(
                    "/html/body/div[1]/div/div/main/div/div[2]/div/div[1]/div[1]/div[1]/a/@href"
                )[0]  # about 1 in 10 times the above fails and this works
            except:
                print(f"Image not found for link: {link}")
                break
        title = tree.findtext(".//title")
        title_for_file = (
            fr"{os.getcwd()}\IMAGES\IMAGE - " + "".join(c for c in title if c in valid_chars) + ".jpeg"
        )  # sometimes the file contains characters which aren't allowed in file names
        description = "".join(
            tree.xpath(
                "/html/body/div[1]/div/div/main/div/div[3]/div[1]/div/div/div/div/p/text()"
            )
        )
        if setting not in [x, y]:  # we only want items which have a specific setting
            break
        try:
            image_req = requests.get(image)
            with open(title_for_file, "wb") as f:
                f.write(image_req.content)  # write the image to the directory
            img = pyexiv2.Image(title_for_file)
            metadata = {"Xmp.xmp.Title": title, "Xmp.xmp.Description": description}  # edit the metadata
            img.modify_xmp(metadata)
            img.close()
            break
        except Exception as e:
            print(
                f"ERROR parsing image: {title_for_file}, it is most likely corrupt, "
                f"retrying ({link}), "
                f"error: {e}"
            )
            os.remove(title_for_file)
    else:
        print("ERROR more than 3 times moving to next")
The code has fairly detailed comments to explain what it does. My issue is that it takes about 7 hours to do all the requests that collect the links in items, and then another (I'm not 100% sure, but I think about 1500 hours) to check each image and add the metadata. Is there anything in this code that can be sped up or refined to speed up this process?
Similarly, is there anywhere I could reduce memory usage? I suspect this will use a lot of memory with the amount of data it is parsing.
EDIT
A possible consideration would be to use Threading to download multiple images at once. How would one find the optimal number of threads to run at once? Or should I perhaps start each Thread with a small delay, say 0.5 seconds?
1 Answer
Before you can start thinking about doing fancy things like parallelizing this code, you should start by putting your code into functions that do one thing each.
def get_items(response):
    # we only want the url before the query strings
    # use a set to not have duplicate items
    return {item["guid"].split("?")[0] for item in response["items"]}


def get_all_items():
    url = f"{BASE_URL}?{BASE_QUERIES}"
    response = requests.get(url, params={"cursor": "*"}).json()
    amount = response["totalResults"]  # in the range of 30 million
    items = get_items(response)
    while cursor := response.get("nextCursor"):
        try:
            response = requests.get(url, params={"cursor": cursor}).json()
            items.update(get_items(response))
        except Exception:
            continue
    if len(items) != amount:
        raise ValueError("Did not get all items")
    return items
Note that I changed your bare except to an except Exception; otherwise the user pressing Ctrl+C would also be ignored. I also used the new walrus operator to continue iterating as long as a nextCursor is given in the response.
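As a quick standalone illustration (my own demonstration, not part of the refactored scraper): KeyboardInterrupt derives from BaseException rather than Exception, which is exactly why a bare except swallows Ctrl+C while except Exception lets it propagate.

# Demonstration only: except Exception does not catch KeyboardInterrupt,
# so the interrupt reaches the outer handler (or the user) as intended.
try:
    raise KeyboardInterrupt  # stand-in for the user pressing Ctrl+C
except Exception:
    print("never reached")
except BaseException as e:
    print(f"propagated past 'except Exception': {type(e).__name__}")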
HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
}
ALLOWED_SETTINGS = {x, y}


def handle_link(link, retries=3):
    for _ in range(retries):
        if parse(link, ALLOWED_SETTINGS):
            break
def parse(link, allowed_settings):
    response = requests.get(link, headers=HEADERS, stream=True)
    response.raw.decode_content = True
    parser = etree.HTMLParser()
    tree = etree.parse(response.raw, parser)
    setting = get_setting(tree)
    if setting not in allowed_settings:  # we only want items which have a specific setting
        return False
    image = get_image(tree, link)  # pass the link so the "not found" message can name it
    if image is None:
        return False
    title = get_title(tree)
    file_name = get_file_name(title)
    description = get_description(tree)
    return save_image(image, file_name,
                      {"Xmp.xmp.Title": title, "Xmp.xmp.Description": description})
Note that I abort as soon as possible: there is no need to continue parsing if the setting already tells you that you will throw the result away.
def get_setting(tree):
    return tree.xpath(
        "/html/body/div[1]/div/div/main/div/div[2]/div/div[2]/div/div/div[1]/a/span[1]/text()"
    )[0].strip()
def get_image(tree, link):
    xpaths = ["/html/body/div[1]/div/div/main/div/div[2]/div/div[1]/div[1]/div/div/a/@href",
              "/html/body/div[1]/div/div/main/div/div[2]/div/div[1]/div[1]/div[1]/a/@href"]
    for xpath in xpaths:
        try:
            return tree.xpath(xpath)[0]
        except Exception:
            continue
    print(f"Image not found for link: {link}")
def get_title(tree):
    return tree.findtext(".//title")


def get_file_name(title):
    return (
        fr"{os.getcwd()}\IMAGES\IMAGE - " + "".join(c for c in title if c in valid_chars) + ".jpeg"
    )  # sometimes the file contains characters which aren't allowed in file names


def get_description(tree):
    return "".join(
        tree.xpath(
            "/html/body/div[1]/div/div/main/div/div[3]/div[1]/div/div/div/div/p/text()"
        )
    )


def save_image(url, file_name, metadata):
    try:
        image_req = requests.get(url)
        with open(file_name, "wb") as f:
            f.write(image_req.content)  # write the image to the directory
        img = pyexiv2.Image(file_name)
        img.modify_xmp(metadata)
        img.close()
        return True
    except Exception as e:
        print(
            f"ERROR parsing image: {file_name}, it is most likely corrupt, "
            f"error: {e}"
        )
        os.remove(file_name)
        return False
With this you could now start thinking about parallelization, since you actually have an encapsulated target function to execute.
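A rough sketch of what that could look like (my own illustration, not part of the code above; the choice of ThreadPoolExecutor and the max_workers value of 8 are assumptions to be tuned, not recommendations), since each handle_link call spends most of its time waiting on the network:

from concurrent.futures import ThreadPoolExecutor

def handle_all_links(links, max_workers=8):
    # max_workers=8 is an arbitrary starting point; the right number depends on
    # your bandwidth and on how much load the server tolerates, so benchmark a few values.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map schedules handle_link(link) for every link and the
        # with-block waits for all of them to finish.
        for _ in executor.map(handle_link, links):
            pass

Because the work is I/O-bound, threads help despite the GIL; rather than starting each thread with a fixed delay, measuring throughput for a few pool sizes is the usual way to find a good worker count.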
Since there are now also separate functions for getting each piece of information, you can consider changing how you parse the links. I personally prefer using BeautifulSoup over xpath, as using CSS selectors on classes or IDs is IMO more readable than your lengthy paths.
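For example (a hypothetical sketch only; I don't know the real page markup, so the "a.product-image" selector below is made up and would need to be replaced after inspecting the page), the image lookup could become a CSS selector:

from bs4 import BeautifulSoup

def get_image_bs(html, link):
    # "a.product-image" is a placeholder selector; take the real class or id
    # from the page you are scraping.
    soup = BeautifulSoup(html, "lxml")
    tag = soup.select_one("a.product-image")
    if tag is None:
        print(f"Image not found for link: {link}")
        return None
    return tag["href"]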
Note that in the end you will always be limited by your available internet connection. If you need to download 30 million 1 MB images with a 100Mbit/s internet connection, it will take at least 27 days and if you have a 1 Gbit/s connection it will still take almost three days no matter how you improve your code. If you download two images in parallel, each will just download at half the speed.
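For reference, the back-of-the-envelope calculation behind those numbers (assuming exactly 30 million images of 1 MB, i.e. 10^6 bytes, each):

total_bits = 30_000_000 * 1_000_000 * 8  # ~2.4e14 bits to transfer
print(total_bits / 100e6 / 86400)        # ≈ 27.8 days at 100 Mbit/s
print(total_bits / 1e9 / 86400)          # ≈ 2.8 days at 1 Gbit/s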
- GAP2002 (Feb 9, 2021): Crikey, thanks so much, that's a lot of help! I actually have decided to use BeautifulSoup now; I didn't originally as, from my experience, it uses a lot more memory (stackoverflow.com/questions/11284643/…, for example), but in the end the ease of BeautifulSoup outweighed the benefit of saving a small amount of memory using the lxml lib.
- GAP2002 (Feb 9, 2021): Just one small thing you might have done by accident: you have an accent in there in ǵet_title(tree).