After much searching around the web, I couldn't find a single module which did a decent prettyXML or prettyHTML. Because:
<node>
<child>
Text
</child>
</node>
Is flat out ridiculous, and not a good use of space.
<node>
<child>Text</child>
</node>
Is much cleaner.
That's what spurred this project. I also created a similar, but works completely different prettyHTML. These are designed to work with the build in module, miniDOM. I'm looking for a full critique (i.e. anything that needs to be).
Helper for prettyHTML:
def addLineBreaks(txt, max_char, level, indent, indentSubLines = False):
# If the length is already shorter, or if there is no space to be found
if(len(txt) <= max_char or txt.find(" ") < 0):
# Just return it
return txt
# end if
# Up the level if indentSubLines
if(indentSubLines): ++level
## Remember the last breakpoint's position
# Start it where the last \n is found (in case what is passed has already been broken up some)
lastPos = txt.rfind("\n")
# If there was no "\n" then zero
if(lastPos < 0): lastPos = 0
# Get how many breaks we need to make. (-1) because 3 lines has 2 line breaks.
numBreaks = math.ceil((len(txt) - lastPos) / max_char) - 1
# Get the first space
pos = tempPos = txt.find(" ", lastPos + 1)
# Place that many minus one line breaks
for i in range(numBreaks):
# Keep searching until we find the furthest position we can, which will be
# at tempPos - lastPos, where lastPos is the last break point.
while((tempPos - lastPos) < max_char):
# Assign the last result
pos = tempPos
# Find the next result after that one
tempPos = txt.find(" ", tempPos + 1)
# end while
# If no break was found, then up to the next spot, which may
# break what is wanted, but also prevents an infinite loop.
if(pos == lastPos): pos = tempPos
# Add the line break
txt = txt[0:pos + 1] + "\n" + (indent * level) + txt[pos + 1:]
# Remember where the break was put (+1 so it's past the \n)
lastPos = pos + 1
# end for
# Return the new string
return txt
# end addLineBreaks
PrettyHTML:
def prettyHTML(xml, indent = " ", level = 0, max_char = -1):
# If given text, just return it
if(xml.nodeType in (xml.TEXT_NODE, xml.CDATA_SECTION_NODE)):
return xml.toxml()
# end if
# Elements who's children will be indented
indtCldrn = ['datalist', 'details', 'dl',
'map', 'math', 'menu',
'ol',
'select', 'svg',
'tbody', 'tfoot', 'thead', 'tr',
'ul'
]
# Elements which will be placed on their own line, with all content / children
ownLine = ['!DOCTYPE',
'button',
'dd', 'dt',
'figcaption',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'label', 'legend', 'li',
'menuitem', 'meter',
'optgroup', 'option', 'output',
'progress',
'rect',
'style', 'summary',
'td', 'textarea', 'th', 'title'
]
# Elements which are self closing / void / whatever you want to call them
selfClosing = ['area', 'base', 'br', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
# Elements who's start / end tags will sandwich the content
spaceAfter = ['address', 'article', 'aside', 'audio',
'blockquote', 'body',
'canvas', 'code',
'div',
'fieldset', 'figure', 'footer', 'form',
'head', 'hader', 'html',
'iframe',
'main',
'nav', 'noscript',
'object',
'p', 'pre',
'ruby',
'samp', 'script', 'section',
'table', 'template',
'video'
]
## This is just here so all HTML elements are accounted for
"""inline = ['a', 'abbr',
'b', 'bdi', 'bdo',
'cite',
'data',
'del', 'dfn',
'em',
'i', 'ins',
'kbd',
'mark',
'q',
'rp', 'rt',
's', 'small', 'span', 'strong', 'sub', 'sup',
'time',
'u',
'var',
]"""
# Holds the pretty text
pretty = ""
# HTML is case insensitive, so make all node's lowercase
nodeName = xml.nodeName.lower()
## Make the opening tag
# Indent the tag, and add the node name
openTag = (indent * level) + "<" + nodeName
# Add the attributes (With the attribute name as lowercase)
# This is tried as a precautionary
try:
for item in xml.attributes.items():
openTag += " " + item[0].lower() + '="' + item[1] + '"'
# end for
except AttributeError:
pass
# end try
# If we have a node which is self-closing
if(nodeName in selfClosing):
# Add the closing />
openTag += " />"
# Ensure it's not too long
if(max_char > 0):
openTag = addLineBreaks(openTag, max_char, level, indent, True)
# end if
# If not a <wbr />, then we also need to add a \n
if(nodeName != "wbr"): openTag += "\n"
# And return it, no need to process further
return openTag
# else, it's not a self closing tag
else:
# Add the closing >
openTag += ">"
# end if
# If we want to indent the children
if(nodeName in indtCldrn):
# Add the open tag with a line break (indentation is already included)
pretty += (openTag + "\n")
# Get the prettyHTML for all the children, placing them one level deeper
for child in xml.childNodes:
pretty += prettyHTML(child, indent, level + 1, max_char)
# end for
# Add the closing tag with indentation
pretty += ((indent * level) + "</" + nodeName + ">\n")
# else if we want to sandwich everything.
elif(nodeName in spaceAfter):
# We assume that there is already a line break before (indentation is already included)
pretty += (openTag + "\n")
# Holds the inner text
temp = ""
# Get the prettyHTML for all the children
for child in xml.childNodes:
temp += prettyHTML(child, indent, level, max_char)
# end for
# If we have a limit
if(max_char > 0):
# Break it up
temp = addLineBreaks(temp, max_char, level, indent)
# end if
# Append temp to pretty
pretty += temp
# Add the closing tag
# If the last character isn't a \n, make it one so there is a break before the closing tag
if(pretty[-1] != "\n"): pretty += "\n"
# Add the closing tag with indentation
pretty += ((indent * level) + "</" + nodeName + ">\n")
# Else, we either have an inline, or a tag which should sit on its own line
else:
# We assume that there is already a line break before (indentation included)
pretty += openTag
# Add all children
for child in xml.childNodes:
pretty += child.toxml()
# end for
# Add the closing tag
pretty += ("</" + nodeName + ">")
# Then be sure to cut it up, if we have a max_char
if(max_char > 0):
# Break it up (If necessary)
pretty = addLineBreaks(pretty, max_char, level, indent, True)
# end if
# If we have a tag which is supposed to be on its own line, add a line break
if(nodeName in ownLine):
pretty += "\n"
# end if
# end if
# Return the now prettified text
return pretty
# end prettyHTML
Pretty XML:
def prettyXML(xml, indent = " ", level = 0):
# Flag for whether or not to indent, or just wrap it
hasNonText = False
# For each node in the childNodes
for node in xml.childNodes:
# Check if there is something that isn't text
if(node.nodeType not in (node.TEXT_NODE, node.CDATA_SECTION_NODE)):
# If so, set the flag, and kill the loop
hasNonText = True
break
# end if
# end for
# Store the pretty XML
pretty = ""
# If we have nonText
if(hasNonText):
# Add the indentation and start tag
pretty += ((indent * level) + "<" + xml.nodeName)
# Add the attributes (This is tried as a precaution)
try:
for item in xml.attributes.items():
pretty += " " + item[0] + '="' + item[1] + '"'
# end for
except AttributeError:
pass
# end try
# add the closing > and a line break
pretty += ">\n"
# Loop through each child
for child in xml.childNodes:
# And add it to pretty, moving it one level deeper
pretty += prettyXML(child, indent, level + 1)
# end for
# Add the closing tag with indentation
pretty += ((indent * level) + "</" + xml.nodeName + ">\n")
# Else if it had no children
elif(not xml.childNodes):
# Just add the raw XML with indentation. Probably a self-closing tag
pretty += ((indent * level) + xml.toxml() + "\n")
# Else, it only had text children
else:
# Add the indentation and start tag
pretty += ((indent * level) + "<" + xml.nodeName)
# Add the attributes (like above)
try:
for item in xml.attributes.items():
pretty += " " + item[0] + '="' + item[1] + '"'
# end for
except AttributeError:
pass
# end try
# Add the closing >
pretty += ">"
# Add all children (which will just be text)
for node in xml.childNodes:
pretty += node.toxml()
# end for
# Add the closing tag
pretty += "</" + xml.nodeName + ">\n"
# end if
return pretty
# end prettyXML
1 Answer 1
You have in total three functions over 300 lines, so ~100 lines to a function. This is 5-10x too few functions for this amount of code.
A lot of your formatting looks like you've used a C-like language and can't handle writing Python. Stuff like
if(x):
...
# end if
which should just be
if x:
...
You even have
++level
which actually just means
+ ( + level )
Namely, this is a no-op.
My first criticism before I look at this more in-depth is to read PEP 8 and stick to it. As the code is written, it would alienate any current Python user.
I'm looking at prettyXML
since it's the simpler of the two. You start with has_non_text
, which is a great candidate for a function:
def has_non_text(xml):
ret = False
# For each node in the childNodes
for node in xml.childNodes:
# Check if there is something that isn't text
if node.nodeType not in (node.TEXT_NODE, node.CDATA_SECTION_NODE):
# If so, set the flag, and kill the loop
ret = True
break
return ret
Some of these comments are trivial, so remove them. Also, use an early return, since it's now a stand-alone function.
def has_non_text(xml):
for node in xml.childNodes:
# If not text
if node.nodeType not in (node.TEXT_NODE, node.CDATA_SECTION_NODE):
return True
return False
This would be even simpler with an is_text
function:
def is_text(node):
return node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE)
def has_non_text(xml):
return not all(is_text(node) for node in xml.childNodes)
Your comment
# Flag for whether or not to indent, or just wrap it
would be much better if it explained which action corresponded to which flag state. Something like
# Indent iff has non-text
prettyXML
now looks like:
def prettyXML(xml, indent=" ", level=0):
# Store the pretty XML
pretty = ""
# Indent iff has non-text
if has_non_text(xml):
...
# Else if it had no children
elif not xml.childNodes:
...
# Else, it only had text children
else:
...
return pretty
This would be better as
def prettyXML(xml, indent=" ", level=0):
# Indent iff has non-text
if has_non_text(xml):
pretty = ""
...
return pretty
elif not xml.childNodes:
pretty = ""
...
return pretty
# Only had text children
else:
pretty = ""
...
return pretty
since each subsection is a logically separate action. In fact, you could make each a separate function if you wanted.
Consdier the first branch.
pretty = ""
# Add the indentation and start tag
pretty += (indent * level) + "<" + xml.nodeName
# Add the attributes (This is tried as a precaution)
try:
for item in xml.attributes.items():
pretty += " " + item[0] + '="' + item[1] + '"'
except AttributeError:
pass
# add the closing > and a line break
pretty += ">\n"
# Loop through each child
for child in xml.childNodes:
# And add it to pretty, moving it one level deeper
pretty += prettyXML(child, indent, level + 1)
# Add the closing tag with indentation
pretty += ((indent * level) + "</" + xml.nodeName + ">\n")
return pretty
You should never do +=
on strings in a loop unless you know enough to know it doesn't apply. In this case it definitely doesn't.
Here's an alternative:
pretty += "".join(" " + item[0] + '="' + item[1] + '"' for item in xml.attributes.items())
You should also use formatting, unpacking and move the loop out of the try
:
try:
attributes = xml.attributes.items()
except AttributeError:
pass
else:
pretty += "".join(' {}="{}"'.format(k, v) for k, v in attributes)
The "{}"
can be better replaced with {!r}
, which uses the repr
of the string. This is a better match although it's not entirely safe. Further, the try
can be replaced with an or
:
attributes = (xml.attributes or {}).items()
pretty += "".join(' {}={!r}'.format(k, v) for k, v in attributes)
and change the outer parts.
You can then do the same for the child nodes:
pretty += "".join(prettyXML(child, indent, level + 1) for child in xml.childNodes)
and then put it together with a formatting string:
attributes = (xml.attributes or {}).items()
node_attrs = "".join(' {}={!r}'.format(k, v) for k, v in attributes)
children = "".join(prettyXML(child, indent, level + 1) for child in xml.childNodes)
make_node = (
"{tab}<{xml.nodeName}{node_attrs}>\n"
"{children}"
"{tab}</{xml.nodeName}>\n"
).format
return make_node(node_attrs=node_attrs, children=children, tab=indent * level, xml=xml)
The second option is
elif not xml.childNodes:
# Just add the raw XML with indentation. Probably a self-closing tag
pretty = ""
pretty += ((indent * level) + xml.toxml() + "\n")
return pretty
Obviously this can become just
elif not xml.childNodes:
# Just add the raw XML with indentation. Probably a self-closing tag
return (indent * level) + xml.toxml() + "\n"
Finally, the last option is really similar and gets simplified to
attributes = (xml.attributes or {}).items()
node_attrs = "".join(' {}={!r}'.format(k, v) for k, v in attributes)
children = "".join(node.toxml() for node in xml.childNodes)
make_node = "{tab}<{xml.nodeName}{node_attrs}>{children}</{xml.nodeName}>\n".format
return make_node(node_attrs=node_attrs, children=children, tab=indent * level, xml=xml)
Since the first and last options share quite a lot of logic, reshape the control flow to accomodate:
def prettyXML(xml, indent=" ", level=0):
if not xml.childNodes:
# Just add the raw XML with indentation. Probably a self-closing tag
return (indent * level) + xml.toxml() + "\n"
attributes = (xml.attributes or {}).items()
node_attrs = "".join(' {}={!r}'.format(k, v) for k, v in attributes)
# Indent iff has non-text
if has_non_text(xml):
children = "".join(prettyXML(child, indent, level + 1) for child in xml.childNodes)
make_node = (
"{tab}<{xml.nodeName}{node_attrs}>\n"
"{children}"
"{tab}</{xml.nodeName}>\n"
).format
# Only had text children
else:
children = "".join(node.toxml() for node in xml.childNodes)
make_node = "{tab}<{xml.nodeName}{node_attrs}>{children}</{xml.nodeName}>\n".format
return make_node(node_attrs=node_attrs, children=children, tab=indent * level, xml=xml)
The make_node
s could do with being a fully-fledged function. This gives:
def is_text(node):
return node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE)
def has_non_text(xml):
return not all(is_text(node) for node in xml.childNodes)
def make_node(xml, node_attrs, children, tab, *, indented):
if indented:
fmt = (
"{tab}<{xml.nodeName}{node_attrs}>\n"
"{children}"
"{tab}</{xml.nodeName}>\n"
)
else:
fmt = "{tab}<{xml.nodeName}{node_attrs}>{children}</{xml.nodeName}>\n"
return fmt.format(xml=xml, node_attrs=node_attrs, children=children, tab=tab)
def prettyXML(xml, indent=" ", level=0):
if not xml.childNodes:
# Just add the raw XML with indentation. Probably a self-closing tag
return (indent * level) + xml.toxml() + "\n"
attributes = (xml.attributes or {}).items()
node_attrs = "".join(' {}={!r}'.format(k, v) for k, v in attributes)
indented = has_non_text(xml)
if indented:
children = "".join(prettyXML(child, indent, level + 1) for child in xml.childNodes)
else:
children = "".join(node.toxml() for node in xml.childNodes)
return make_node(xml, node_attrs, children, tab=indent*level, indented=indented)
-
-
\$\begingroup\$ I finished reading the spec, aside from the #end comments, I don't see anything that breaks the standard. I use suggested indentation, () on if statements are shown as okay.
++
I forgot didn't work in Python: I'm used to C based languages where that does something, so thanks for pointing that out. Now to dig through the rest of your critique... :) \$\endgroup\$David– David2015年01月10日 14:44:58 +00:00Commented Jan 10, 2015 at 14:44 -
1\$\begingroup\$ @Caridorc, I just used a regex replace in Notepad++. \$\endgroup\$David– David2015年01月10日 14:51:04 +00:00Commented Jan 10, 2015 at 14:51
-
\$\begingroup\$ @David You missed naming conventions (most things as
snake_case
). I'm surprised it doesn't explicitly mention()
onif
statements as being bad, but it definitely doesn't say they're OK either. The only uses used brackets for line continuation, which is different to what you did. Plus, you should have a space in that case anyway (if (x or\ny)
). There's also "Compound statements (multiple statements on the same line) are generally discouraged." \$\endgroup\$Veedrac– Veedrac2015年01月10日 14:54:39 +00:00Commented Jan 10, 2015 at 14:54
asdf
s not wrapped. I think I get it; themax_char
is actually a misnamedmin_char
. \$\endgroup\$