
I'm parsing specific values out of web pages with BeautifulSoup. However, because I'm also using regular expressions, my program takes forever to run. I'd love ideas on how to speed this up.

from bs4 import BeautifulSoup
import datetime
import json
from progressbar import progressbar
import pdb
import pickle
import re


class Listing():
    def __init__(self, custom_name, **entries):
        self.__dict__.update(entries)
        self.custom_name = custom_name
        self.date_accessed = datetime.datetime.today()

    def __hash__(self):
        return hash(self.custom_name)

    def __eq__(self, other):
        return self.custom_name == other.custom_name

    def __repr__(self):
        return self.custom_name

def list_to_dict(rlist):
    # QUEST: There are multiple colons in many of the entries. I couldn't
    # figure out how to use re.split so that it only splits at the first occurrence,
    # so instead I replace only the first occurrence and then split on that new str
    list_with_replace_str = [re.sub(":", ":REPLACE", e, 1) for e in rlist]
    temp_dict = dict(f.split(":REPLACE") for f in list_with_replace_str)
    clean_dict = {}
    for key in temp_dict.keys():
        clean_key = key.strip()
        clean_value = temp_dict[key].strip()
        clean_dict[clean_key] = clean_value
    return clean_dict
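
# Note: re.split and str.split both take a maxsplit argument, so the ":REPLACE"
# workaround above isn't strictly needed. For example, assuming each entry looks
# roughly like "Asking Price: $450,000" (entry being one element of rlist):
#     key, value = entry.split(":", 1)
#     key, value = re.split(":", entry, maxsplit=1)
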
def parse_listings(listing_objs):
    def parse_financials_div(financials_soup, listing_obj):
        try:
            financials_text = financials_soup.text
            financials_list = financials_text.split("\r\n")[:-1]
            financials_dict = list_to_dict(financials_list)
            not_included = []
            for key in financials_dict:
                if "*" in financials_dict[key]:
                    not_included.append(key)
            financials_dict["notIncluded"] = not_included
            for key in financials_dict:
                try:
                    financials_dict[key] = int(
                        re.sub("[^0-9]", "", financials_dict[key]))
                except Exception:
                    continue
            return financials_dict
        except Exception as e:
            print(f"error {e}")
            pdb.set_trace()

    def parse_details_div(details_soup, listing_obj):
        try:
            details_tag_list = details_soup.contents
            details_str = " ".join([str(element)
                                    for element in details_tag_list])
            details_list = details_str.split("<dt>")[1:]
            strs_to_tags = [BeautifulSoup(detail, "html.parser")
                            for detail in details_list]
            details_text = [tag.text for tag in strs_to_tags]
            details_dict = list_to_dict(details_text)
            return details_dict
        except Exception as e:
            print(f"error {e}")
            pdb.set_trace()

    def parse_category(product_json_soup, listing_obj):
        product_json_str = product_json_soup.contents[0].replace(
            "\r", "").replace("\n", "")
        product_json_str = product_json_str.replace(
            "\'", "").replace('\\"', '').replace("\t", "")
        product_dict = json.loads(product_json_str)
        category_str = product_dict["category"]
        category_list = category_str.split(">")
        category_list = [category.strip() for category in category_list]
        listing_obj.category = {}
        listing_obj.category["parent_category"] = category_list[0]
        try:
            listing_obj.category["sub_category"] = category_list[1]
        except Exception:
            listing_obj.category["sub_category"] = "Not Present"

    def parse_address(address_json_soup, listing_obj):
        address_json_str = address_json_soup.contents[0].replace(
            "\r", "").replace("\n", "")
        address_json_str = address_json_str.replace(
            "\'", "").replace('\\"', '').replace("\t", "")
        address_dict = json.loads(address_json_str)
        listing_obj.address = address_dict["address"]

    # Parse available listing fields into a dict
    print("Parse financials and details for listings")
    for listing_obj in progressbar(listing_objs):
        try:
            index = listing_objs.index(listing_obj)
            length = len(listing_objs)
            soup = BeautifulSoup(listing_obj.response_text, "html.parser")
            # Parse category
            category_json_pattern = re.compile(r"\"@type\" : \"Product\"")
            category_json_soup = soup.find(
                "script", {"type": "application/ld+json"}, text=category_json_pattern)
            if category_json_soup:
                parse_category(category_json_soup, listing_obj)
            # Parse address
            address_json_pattern = re.compile(r"LocalBusiness")
            address_json_soup = soup.find(
                "script", {"type": "application/ld+json"}, text=address_json_pattern)
            if address_json_soup:
                parse_address(address_json_soup, listing_obj)
            # Price details
            financials_span_pattern = re.compile(r"Asking Price:")
            financials_span_soup = soup.find(
                "span", text=financials_span_pattern)
            if financials_span_soup:
                financials_soup = financials_span_soup.parent.parent.parent.parent
                financials_dict = parse_financials_div(
                    financials_soup, listing_obj)
                listing_obj.financials = financials_dict
            else:
                print(
                    f"Financials not present #{index} of {length} {listing_obj.url}")
                print(soup)
            # Listing Details
            details_soup = soup.find("dl", {"class": "listingProfile_details"})
            if details_soup:
                details_dict = parse_details_div(details_soup, listing_obj)
                listing_obj.details = details_dict
        except Exception as e:
            print(f"error {e}")

def run_listing_calculations(listing_obj):
    # All in price
    extra_costs = 0
    price = listing_obj.financials["Asking Price"]
    for item in listing_obj.financials["notIncluded"]:
        if "Real Estate" not in item:
            extra_costs += listing_obj.financials[item]
    if isinstance(price, int):
        all_in_price = listing_obj.financials["Asking Price"] + extra_costs
    else:
        all_in_price = listing_obj.financials["Asking Price"]
    listing_obj.financials["allInPrice"] = all_in_price
    # Multiple
    all_in_price = listing_obj.financials["allInPrice"]
    cashflow = listing_obj.financials["Cash Flow"]
    try:
        listing_obj.financials["Multiple"] = all_in_price / cashflow
    except Exception:
        listing_obj.financials["Multiple"] = "N/A"

def parse_listings_from_pkl():
    with open("/Users/work/Dropbox/Projects/Working Data/bizbuysell/listings20191231.pkl", "rb") as infile:
        listing_objs = pickle.load(infile)
    print("Validate listing responses")
    listing_resp_validated = []
    for listing_obj in progressbar(listing_objs):
        try:
            if "Soup test failed" not in listing_obj.response_text:
                listing_resp_validated.append(listing_obj)
        except Exception:
            continue
    parse_listings(listing_resp_validated)
    print("Perform listing calculations")
    for listing_obj in progressbar(listing_resp_validated):
        financials_present = hasattr(listing_obj, "financials")
        if financials_present:
            run_listing_calculations(listing_obj)
    pdb.set_trace()


if __name__ == "__main__":
    parse_listings_from_pkl()

Here's a link to the .pkl file needed to run this.

Here's a gist with the example HTML response and product_json_soup.

asked Dec 31, 2019 at 16:34
  • product_json_soup is undefined in your code. Add more context and post testable category_json_soup content. (Commented Dec 31, 2019 at 17:58)
  • Cleaned up the naming and added a gist with the responses to make it testable. (Commented Jan 1, 2020 at 18:16)
  • "program is taking forever to run" - there must be other bottlenecks on your side; the above fragment takes about 1 second to run on my machine. Does your actual script involve some looping and more extended parsing? (Commented Jan 1, 2020 at 19:57)
  • You're right @RomanPerekhrest. It's fast for one listing, but when I iterate over 40,000 it is a very slow step. I've added the cProfile output for the larger program to the gist. Let me know if that helps. I could put in the entire function, but it's more or less just tweaks on this, and it seems the re operations take the longest. (Commented Jan 1, 2020 at 22:22)
  • Can you share those 40,000 URLs so I could test the loop and get actual estimates? (Commented Jan 2, 2020 at 10:25)
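
A minimal sketch of the kind of cProfile run mentioned in the last comments, assuming the functions defined above (not the poster's exact invocation):

import cProfile
import pstats

# Profile the whole pipeline and show the 15 most expensive calls by cumulative time
cProfile.run("parse_listings_from_pkl()", "parse_stats")
pstats.Stats("parse_stats").sort_stats("cumulative").print_stats(15)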

1 Answer


Most of the time is consumed by the BeautifulSoup conversion, namely

soup = BeautifulSoup(listing_obj.response_text, "html.parser")

As proof, first create a .pkl file of a reasonable size for debugging:

if __name__ == "__main__":
    with open("D:\\Downloads\\listings20191231.pkl", "rb") as infile:
        listing_objs = pickle.load(infile)
    data = listing_objs[222:666]
    with open("D:\\Python\\CR\\listings20191231.pkl", "wb") as oufile:
        pickle.dump(data, oufile, pickle.HIGHEST_PROTOCOL)

Then check and compare the consumed time using the following adapted code (I also removed all the progressbar stuff from the rest of the original code):

if __name__ == "__main__":
    import time
    import sys

    argcnt = len(sys.argv) - 1
    argtxt = 'parse_listings_from_pkl()' if argcnt == 0 else 'BeautifulSoup'
    startload = time.time()
    with open("D:\\Python\\CR\\listings20191231.pkl", "rb") as infile:
        listing_objs = pickle.load(infile)
    length = len(listing_objs)
    print('checking time: ', argtxt, length, 'records')
    start0 = time.time()
    if argcnt == 0:
        parse_listings_from_pkl()
    else:
        for listing_obj in listing_objs:  # progressbar(listing_objs):
            soup = BeautifulSoup(listing_obj.response_text, "html.parser")
    start1 = time.time()
    print("time consumed: ", argtxt, start1 - start0)

The output shows that about 86 % of the time (100 * 32.761232137680054 / 38.00445818901062) is consumed by converting the original HTML to the BeautifulSoup format:

D:\Python\CR>234876.py
checking time: parse_listings_from_pkl() 444 records
Validate listing responses
Parse financials and details for listings
Perform listing calculations
time consumed: parse_listings_from_pkl() 38.00445818901062
D:\Python\CR>234876.py 1
checking time: BeautifulSoup 444 records
time consumed: BeautifulSoup 32.761232137680054

Although there are some optimizable parts in the rest of the pure Python code (I tried them, with only minor performance improvements), I found that the BeautifulSoup conversion time corresponds to the original HTML size, and most of the analyzed HTML is gubbins of no use.
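
For instance, one such minor tweak is compiling the regular expressions once at module level instead of on every call. A sketch of that kind of change (not necessarily the exact changes referred to above), using the field handling from the question:

import re

# Compiled once at import time instead of on every call to re.sub
NON_DIGITS = re.compile(r"[^0-9]")

def to_int_if_possible(value):
    # Strip non-digit characters and convert; leave the value alone when
    # nothing numeric remains or when it isn't a string at all
    try:
        return int(NON_DIGITS.sub("", value))
    except (TypeError, ValueError):
        return value

In parse_financials_div the inner try/except loop then collapses to financials_dict = {k: to_int_if_possible(v) for k, v in financials_dict.items()}.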

Hence, I'd try cutting listing_obj.response_text down to just the useful parts and converting only those parts to <class 'bs4.BeautifulSoup'>. Maybe Speeding up beautifulsoup or Simple HTML and XHTML parser could help extract the useful info from the original HTML?
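
One built-in way to do that is bs4's SoupStrainer, which tells the parser to build only the tags you ask for. A sketch for the two JSON-LD lookups, assuming listing_obj.response_text as in the question (the financials code walks up several .parent levels, so that part would still need the full tree or a different cut):

import re
from bs4 import BeautifulSoup, SoupStrainer

# Parse only the <script type="application/ld+json"> tags used for category/address
only_ld_json = SoupStrainer("script", attrs={"type": "application/ld+json"})
ld_soup = BeautifulSoup(listing_obj.response_text, "html.parser",
                        parse_only=only_ld_json)

category_json_soup = ld_soup.find("script", text=re.compile(r"\"@type\" : \"Product\""))
address_json_soup = ld_soup.find("script", text=re.compile(r"LocalBusiness"))

Installing lxml and passing "lxml" instead of "html.parser" is another commonly reported speedup for large documents.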

answered Jan 9, 2020 at 14:52
