I am currently working on a project for an online course, my goal is to create a bookmark manager web app. So I created this python script to parse a chrome/firefox HTML bookmarks file (Netscape-Bookmark-file) into a JSON object, while preserving the hierarchy and location of the folders and urls.
The code works fine and parses the HTML file to JSON correctly.
I feel like the code is messy and that the approach I am using is not the best. I would appreciate any critique/criticism in any aspect of the code.
The code runs by passing the html file location to the main()
function:
output = main("html_file_location")
Here is the Code:
from bs4 import BeautifulSoup
# Counter for the id of each item (folders and urls)
ID = 1
def indexer(item, index):
"""
Add position index for urls and folders
"""
if item.get("type") in ["url", "folder"]:
item["index"] = index
index += 1
return index
def parse_url(child, parent_id):
"""
Function that parses a url tag <DT><A>
"""
global ID
result = {
"type": "url",
"id": ID,
"index": None,
"parent_id": parent_id,
"url": child.get("href"),
"title": child.text,
"date_added": child.get("add_date"),
"icon": child.get("icon"),
}
# getting icon_uri & tags are only applicable in Firefox
icon_uri = child.get("icon_uri")
if icon_uri:
result["icon_uri"] = icon_uri
tags = child.get("tags")
if tags:
result["tags"] = tags.split(",")
ID += 1
return result
def parse_folder(child, parent_id):
"""
Function that parses a folder tag <DT><H3>
"""
global ID
result = {
"type": "folder",
"id": ID,
"index": None,
"parent_id": parent_id,
"title": child.text,
"date_added": child.get("add_date"),
"date_modified": child.get("last_modified"),
"special": None,
"children": [],
}
# for Bookmarks Toolbar in Firefox and Bookmarks bar in Chrome
if child.get("personal_toolbar_folder"):
result["special"] = "toolbar"
# for Other Bookmarks in Firefox
if child.get("unfiled_bookmarks_folder"):
result["special"] = "other_bookmarks"
ID += 1
return result
def recursive_parse(node, parent_id):
"""
Function that recursively parses folders and lists <DL><p>
"""
index = 0
# case were node is a folder
if node.name == "dt":
folder = parse_folder(node.contents[0], parent_id)
items = recursive_parse(node.contents[2], folder["id"])
folder["children"] = items
return folder
# case were node is a list
elif node.name == "dl":
data = []
for child in node:
tag = child.contents[0].name
if tag == "h3":
folder = recursive_parse(child, parent_id)
index = indexer(folder, index)
data.append(folder)
elif tag == "a":
url = parse_url(child.contents[0], parent_id)
index = indexer(url, index)
data.append(url)
return data
def parse_root_firefox(root):
"""
Function to parse the root of the firefox bookmark tree
"""
# create bookmark menu folder and give it an ID
global ID
bookmarks = {
"type": "folder",
"id": ID,
"index": 0,
"parent_id": 0,
"title": "Bookmarks Menu",
"date_added": None,
"date_modified": None,
"special": "main",
"children": [],
}
ID += 1
index = 0 # index for bookmarks/bookmarks menu
main_index = 1 # index for root level
result = [0] # root contents
for node in root:
# skip node if not <DT>
if node.name != "dt":
continue
# get tag of first node child
tag = node.contents[0].name
if tag == "a":
url = parse_url(node.contents[0], 1)
index = indexer(node, index)
bookmarks["children"].append(url)
if tag == "h3":
folder = recursive_parse(node, 1)
# check for special folders (Other Bookmarks / Toolbar)
# add them to root level instead of inside bookmarks
if folder["special"]:
folder["parent_id"] = 0
main_index = indexer(folder, main_index)
result.append(folder)
else:
index = indexer(folder, index)
bookmarks["children"].append(folder)
result[0] = bookmarks
return result
def parse_root_chrome(root):
"""
Function to parse the root of the chrome bookmark tree
"""
global ID
# Create "other bookmarks" folder and give it an ID
other_bookmarks = {
"type": "folder",
"id": ID,
"index": 1,
"parent_id": 0,
"title": "Other Bookmarks",
"date_added": None,
"date_modified": None,
"special": "other_bookmarks",
"children": [],
}
ID += 1
result = [0]
index = 0
for node in root:
if node.name != "dt":
continue
# get the first child element (<H3> or <A>)
element = node.contents[0]
tag = element.name
# if an url tag is found at root level, add it to "Other Bookmarks" children
if tag == "a":
url = parse_url(node.contents[0], 1)
index = indexer(node, index)
other_bookmarks["children"].append(url)
elif tag == "h3":
# if a folder tag is found at root level, check if its the main "Bookmarks Bar", else append to "Other Bookmarks" children
if element.get("personal_toolbar_folder"):
folder = recursive_parse(node, 0)
folder["index"] = 0
folder["special"] = "main"
result[0] = folder
else:
parent_id = other_bookmarks["id"]
folder = recursive_parse(node, parent_id)
index = indexer(folder, index)
other_bookmarks["children"].append(folder)
# add "Other Bookmarks" folder to root if it has children
if len(other_bookmarks["children"]) > 0:
result.append(other_bookmarks)
return result
# Main function
def main(bookmarks_file):
"""
Main function, takes in a HTML bookmarks file from Chrome/Firefox and returns a JSON nested tree of the bookmarks.
"""
# Open HTML Bookmark file and pass contents into beautifulsoup
with open(bookmarks_file, encoding="Utf-8") as f:
soup = BeautifulSoup(markup=f, features="html5lib", from_encoding="Utf-8")
# Check if HTML Bookmark version is Chrome or Firefox
# Prepare the data to be parsed
# Parse the root of the bookmarks tree
heading = soup.find("h1")
root = soup.find("dl")
if heading.text == "Bookmarks":
bookmarks = parse_root_chrome(root)
elif heading.text == "Bookmarks Menu":
bookmarks = parse_root_firefox(root)
return bookmarks
1 Answer 1
Global state
This:
# Counter for the id of each item (folders and urls)
ID = 1
has issues. It will prevent your code from being re-entrant. Instead, this should either be passed around in your function parameters, or made a member of a class.
Type hints
def indexer(item, index):
could stand to get some type hints. Probably index: int
, return value is -> int
, and item
is a : dict
. However,
- You're better off using
Dict[str, ???]
- I don't know what the values are; and - You're even better off representing the item not as a dictionary, but as a more strongly-typed class instance - maybe a
@dataclass
, or at least a named tuple - to gain confidence that your data are valid and your code is correct.
Enums
Another aspect of strengthening your types is to reframe this:
item.get("type") in ["url", "folder"]:
as an Enum
. Also, you shouldn't in
-compare to a list; do it to a set
literal instead, i.e. {'url', 'folder'}
. This will work equally well for strings or enums.
Generators
Consider replacing this:
data = []
for child in node:
data.append(folder)
return data
with
for child in node:
yield folder
It's easier to write, and will use up less memory - though the last bit will only matter if you're processing millions of these.
Returns from main
def main(bookmarks_file):
return bookmarks
This means that your main
isn't really a main
; something else (that you unfortunately haven't shown) is calling it. This method needs to be renamed, and your actual main
needs to call it.
-
\$\begingroup\$ Thank you for your detailed reply, there were quite somethings I didn't know about that I have learned. I do have some questions tho. 1. from my experience it seems that passing
ID
through 3 or 4 functions will cause the code to be quite difficult to maintain. (I guess changing to a class will be better?) 2. Wouldn't using Enum just to increment a dictionary value be over-complicating the code? 3. You mentioned using the Generator for returning the values, but since I am creating a nested structure, I will have to add them to a list after each yield anyway. won't that make it usless? \$\endgroup\$Adam– Adam2020年07月26日 21:02:22 +00:00Commented Jul 26, 2020 at 21:02 -
\$\begingroup\$ passing ID through 3 or 4 functions will cause the code to be quite difficult to maintain - Your thinking on this may evolve as you mature as a programmer. Ease of maintenance overlaps with ease of testability, and having functions that rely on their parameters only and have no side effects really enhances testability. \$\endgroup\$Reinderien– Reinderien2020年07月27日 00:04:02 +00:00Commented Jul 27, 2020 at 0:04
-
\$\begingroup\$ Wouldn't using Enum just to increment a dictionary value be over-complicating the code - It may very well end up producing code that is longer, but it will also be code that is inherently resistant to certain classes of errors where the data are not checked for validity until it's too late. \$\endgroup\$Reinderien– Reinderien2020年07月27日 00:04:56 +00:00Commented Jul 27, 2020 at 0:04
-
\$\begingroup\$ since I am creating a nested structure, I will have to add them to a list after each yield anyway - Nested generators are entirely possible. \$\endgroup\$Reinderien– Reinderien2020年07月27日 00:06:06 +00:00Commented Jul 27, 2020 at 0:06
Explore related questions
See similar questions with these tags.