lua-users home
lua-l archive

Re: What would be a good representation of XML tree as Lua tables?

[Date Prev][Date Next][Thread Prev][Thread Next] [Date Index] [Thread Index]


On 21-Aug-16 16:45, Marc Balmer wrote:
I am working on code to easily create XML trees from Lua tables. What would,
in your opinion, be a comfortable Lua table notation of XML trees, expressed
as Lua tables?
Some years ago I wrote a minimalist, gmatch-based XML parser, ~50 lines apart from testing code, to read .ods (Open Document Spreadsheet) files. I never published it because it had limitations and I did not have the time to finish the job properly beyond my immediate needs, but I attach it here in case its approach to table organization (line 61) could have something useful for your design... perhaps just to see what is best avoided ;-) The parser both builds an XML tree and calls an external handler (if any) when tags are encountered.
('+++' comments mean 'work in progress')
P.S. I put this code (from 2008) under the Lua license.
--
 Enrico
-- Demo driver: reads a hard-coded XML file, parses it with handlers that
-- print every start tag (with its attributes), end tag and data chunk,
-- waits for a line on standard input, then prints the resulting tree.
function Test()
  local fname = 'spazi.xml' -- +++ test +++
  -- fname = 'content.xml'
  -- assert turns a failed open into an error carrying io.open's own message
  -- instead of a later "attempt to index a nil value"
  local f = assert(io.open(fname, 'r'))
  local txt = f:read('*a')
  f:close()

  -- print the element name followed by its attribute/value pairs
  local startf = function(element)
    print('start: ' .. element.name)
    for attr, val in pairs(element.attrib) do
      print(' ' .. attr .. ' = ' .. val)
    end
  end

  local endf = function(element)
    print('end: ' .. element.name)
  end

  local dataf = function(element)
    print('data: ' .. element.data)
  end

  local root = Parse(txt, startf, endf, dataf)

  -- pause until a line is read from standard input before dumping the tree
  io.read()
  PrintTree(root) -- +++
end
---------------------------------------------------------------------------
-- todo: +++
-- handle CDATA (put away in table, get them later?)
-- note: space in character data is preserved (not really standard)
-- note: currently only latest data chunk is preserved in tree
-- numeric chars for escape currently not supported
-- handle comments?
-- numeric escape (e.g. &#160;) not supported because of UTF-8,
-- can be handled at content level
-- Print (use write)
-- test/demo files
-- +++ data --> text
-- +++ multiple text: use array for children & data, select on type
-- +++ children/data order is not preserved, use handlers +++
-- +++ is this a design problem?
---------------------------------------------------------------------------
-- parse xml text, build element tree,
-- call startTagHandler(element), if any, at every start or empty tag,
-- call endTagHandler(element), if any, at every end or empty tag,
-- call dataHandler(element), if any, at every character data chunk,
-- return root element of the created element tree:
--
-- element = {
-- parent = parent (nil for root)
-- name = name
-- data = character data (as string)
-- attrib = { attr=val, attr=val, ... }
-- [1] = child element
-- [2] = child element
-- [n] = child element
-- }
--
-- all three handlers are optional (pass nil to skip one);
-- raises an error (via assert) on an end tag that does not match the
-- currently open element, or on a tag that is both an end tag and an
-- empty tag
function Parse(xmlText, startTagHandler, endTagHandler, dataHandler)
  -- unescape function (numeric escape currently not supported)
  local escapeTable = {
    ['&amp;']='&', ['&lt;']='<', ['&gt;']='>', ['&apos;']="'", ['&quot;']='"'
  }
  local function unescape(s)
    -- the extra parentheses drop gsub's second result (substitution count);
    -- entities missing from escapeTable are left untouched by gsub
    return (string.gsub(s, '(%&%a+%;)', escapeTable))
  end

  -- start with root element
  local root = { parent=nil, name='root', data=nil, attrib={} }
  -- local, so that parsing does not leak or clobber a global variable
  local currentElement = root

  -- get (<startTag attributes> | <emptyTag/> | </endTag>) and following data (if any)
  local namePattern = '%a[%w%.%-%_%:%&%;]*' -- (primitive but enough for basic parsing)
  local tagPattern = '%<([%/]?)(' .. namePattern .. ')(.-)([%/]?)%>([^%<]*)' -- (5 captures)
  local attributePattern = '(' .. namePattern .. ')%s*%=%s*([\'\"])(.-)%2'

  for endTagChar, name, attributes, emptyTagChar, data in string.gmatch(xmlText, tagPattern) do
    local isEndTag = (endTagChar ~= '')
    local isEmptyTag = (emptyTagChar ~= '')
    assert(not (isEndTag and isEmptyTag), 'endTag-emptyTag conflict in: ' .. name)

    -- handle start tag or empty tag
    if not isEndTag then
      -- create a new child element, add it to current element
      local newElement = { parent=currentElement, name=name, data=nil, attrib={} }
      currentElement[#currentElement + 1] = newElement
      -- move to the new element
      currentElement = newElement
      -- add attributes and their values to current element
      if attributes ~= '' then
        local attribTable = currentElement.attrib
        -- second capture is the quote character, only needed inside the pattern
        for attr, _, val in string.gmatch(attributes, attributePattern) do
          attribTable[attr] = unescape(val)
        end
      end
      -- call start tag handler, if any
      if startTagHandler then
        startTagHandler(currentElement)
      end
    end

    -- handle close tag or empty tag
    if isEndTag or isEmptyTag then
      assert((name == currentElement.name), 'inconsistent startTag/endTag in :' .. name)
      -- call end tag handler, if any
      if endTagHandler then
        endTagHandler(currentElement)
      end
      -- return to parent element
      currentElement = currentElement.parent
    end

    -- if any character data, store it and call data handler, if any
    -- (note: only last data chunk is stored, use handlers to get full data;
    -- data following a start tag goes to the new element, data following
    -- an end or empty tag goes to the parent that has become current again)
    if data and (data ~= '') then
      currentElement.data = unescape(data)
      if dataHandler then
        dataHandler(currentElement)
      end
    end
  end

  return root
end
---------------------------------------------------------------------------
-- recursively print element tree from given element
-- with optional indent step (default 2) and initial indent (default 0)
--
-- element:    element table with name, attrib, optional data and children
--             stored at integer keys 1..n
-- indentStep: number of spaces added per nesting level
-- indent:     number of spaces in front of this element's own lines
--
-- output goes to the default output file through io.write
function PrintTree(element, indentStep, indent)
  indentStep = indentStep or 2
  indent = indent or 0
  local spc = string.rep(' ', indent)
  -- print as empty tag only if there is nothing to show between an open and
  -- a close tag: no attributes, no children and no character data
  -- (otherwise the data would be printed after '<name/>' with no close tag)
  local emptyTag = (next(element.attrib) == nil)
    and (#element == 0)
    and (element.data == nil)

  -- show open tag or empty tag
  if emptyTag then
    io.write(spc, '<', element.name, '/>\n')
  else
    io.write(spc, '<', element.name, '>\n')
  end
  -- show attributes-value pairs (pairs order is unspecified)
  for attr, val in pairs(element.attrib) do
    io.write(spc, '-', attr, '=', val, '\n')
  end
  -- show (latest) data
  if element.data then
    io.write(spc, element.data, '\n')
  end
  -- show children, one indent step deeper
  for _, child in ipairs(element) do
    PrintTree(child, indentStep, indent + indentStep)
  end
  -- show close tag unless the element was printed as an empty tag
  if not emptyTag then
    io.write(spc, '</', element.name, '>\n')
  end
end
---------------------------------------------------------------------------
-- run the demo defined above as soon as this file is executed
Test()

AltStyle によって変換されたページ (->オリジナル) /