Module:Sandbox/AbstractWikipedia/TemplateParser
This module is part of user:AGutman-WMF's prototype implementation of Abstract Wikipedia's template language in Scribunto.
This module specifically implements a template-language parser. Its parse function returns three return values, described below.
A structured representation of the template elements
This is a list of tables, each table representing a single element of the template (either a slot or a textual element). The order of the list corresponds to the order of the elements in the template. Each table has the following fields:
type:
- For textual elements, this can be `punctuation`, `spacing` or `text`.
- For slots, this can be one of `function`, `lexeme` (for L-ids), `item` (for Q-ids), `interpolation` (for identifiers which are assumed to interpolate an argument), `text` (quoted strings within slots), `number` (an integer number) or `undefined` (yielding an error later).
text: for all of the above types except `function` and `interpolation`, this field contains the content of the slot or the textual element (unquoted if it is text within a slot).
function: used only in the `function` type, this field represents the name of the function.
args: used only in the `function` type, this is a (possibly empty) list of the arguments to the function, which are themselves tables of the type discussed here. Given that an argument to a function may be a function itself, this yields a tree representation of the function arguments.
arg: used only in the `interpolation` type, this field represents the name of the interpolated argument.
role: for slots with a dependency label, this contains the grammatical role indicated by the label (i.e. the label without any additional indexes or source indication).
index: for debugging purposes, this gives the sequential number of the element within the template (only for top-level elements). These indexes are referred to by the relations table (described below).
Relations table
The relations table is a list of tables, each indicating a relation to be applied to the slots of the template. The order is immaterial. Each table in the list has the following three fields:
role: the name of the relation to be applied.
target: the index of the target slot (i.e. the slot where the label is given).
source: the index of the source slot (by default the root slot, unless another source label is given).
Root of the template
The last return value is the index of the slot marked as `root`. If none is given (which is only allowed if dependency labels are not used at all), this will be the index of the first slot (or, in the absence of slots, the index of the last element).
Editors can experiment in this module’s sandbox (create | mirror) and testcases (create) pages.
Please add categories to the /doc subpage. Subpages of this module.
local p = {}

-- Split a template string into a list of raw segments.
-- A segment is one of:
--   * a slot: everything from '{' up to and including the next '}';
--   * a run of punctuation characters (%p);
--   * a run of whitespace characters (%s);
--   * a run of any other characters (plain text).
-- Raises an error if the template ends while a slot is still open.
-- The template is scanned byte by byte; bytes that are neither %p nor %s
-- are simply accumulated into the current text segment.
local function segmentize(template)
	local segments = {}
	local segment = ""
	local insideSlot = false

	-- Flush the segment accumulated so far (if any) into the result list.
	local function pushSegmentIfNotEmpty()
		if segment ~= "" then
			table.insert(segments, segment)
			segment = ""
		end
	end

	-- Append char to the current segment when that segment consists solely
	-- of characters of the same class (or is empty); otherwise close the
	-- current segment and start a new one with char.
	local function extendOrStart(char, sameClassPattern)
		if segment:match(sameClassPattern) then
			segment = segment .. char
		else
			pushSegmentIfNotEmpty()
			segment = char
		end
	end

	for char in template:gmatch(".") do
		if insideSlot then
			segment = segment .. char
			if char == '}' then
				pushSegmentIfNotEmpty()
				insideSlot = false
			end
		elseif char == '{' then
			pushSegmentIfNotEmpty()
			segment = '{'
			insideSlot = true
		elseif char:match("%p") then
			extendOrStart(char, "^%p*$")
		elseif char:match("%s") then
			extendOrStart(char, "^%s*$")
		else
			-- Plain text is appended to whatever segment is currently open.
			segment = segment .. char
		end
	end

	if insideSlot then
		error("Template ends without closing a slot!")
	end
	pushSegmentIfNotEmpty()
	return segments
end

-- Return a truthy value if the segment is a non-empty slot ("{...}").
-- Note that "{}" does not qualify and is later classified as punctuation.
local function isSlot(segment)
	return segment:match("{.+}")
end

-- Remove index part of label, e.g. "subj" from "subj_1".
-- Returns nil if the label does not start with a letter.
local function getRole(label)
	return label:match("^%a+")
end

-- Break a slot ("{label<source:invocation}") into its components.
-- Returns a table with the fields:
--   invocation: the part to be parsed by parseInvocation;
--   label:      the dependency label (only if a label is present);
--   role:       the label without its index part (only if a label is present);
--   source:     the source label following '<' (only if given).
local function breakDownSlot(slot)
	local result = {}
	-- strip {} characters
	slot = slot:sub(2, -2)
	local colon = slot:find(':', 1, true)
	-- A colon which appears after a quote or an opening parenthesis is part
	-- of the invocation itself (e.g. {"a:b"} or {f("a:b")}) and must not be
	-- taken as the label separator.
	if colon and slot:sub(1, colon - 1):find('["(]') then
		colon = nil
	end
	if colon then
		result.invocation = slot:sub(colon + 1)
		local label = slot:sub(1, colon - 1)
		local source_indicator = label:find('<', 1, true)
		if source_indicator then
			result.source = label:sub(source_indicator + 1)
			label = label:sub(1, source_indicator - 1)
		end
		result.role = getRole(label)
		result.label = label
	else
		result.invocation = slot
	end
	return result
end

-- Helper function that tries to match the string with the pattern and
-- returns true/false. On success, result.match is set to the matched text
-- (without surrounding whitespace) and result.length to the number of
-- characters consumed (including surrounding whitespace). On failure,
-- result is left untouched.
local function matches(str, pattern, result)
	-- Anchor the pattern at the beginning of the string and allow spaces around
	local anchored = "^%s*(" .. pattern .. ")%s*"
	local _, end_pos, matched = str:find(anchored)
	if matched then
		result.match = matched
		result.length = end_pos
		return true
	end
	return false
end

-- Parse the beginning of an invocation string.
-- Returns two values:
--   1. a table describing the parsed element (see the module documentation
--      for the fields used by each type);
--   2. the number of characters consumed from the invocation string; this
--      is 0 when nothing could be recognized (type 'undefined').
-- Raises an error for malformed function argument lists.
local function parseInvocation(invocation)
	local result = {}
	local match = { match = '', length = 0 }
	if matches(invocation, "%a+%b()", match) then
		-- Function invocation: name followed by a balanced (...) group.
		result.type = 'function'
		local pos = match.match:find('(', 1, true)
		result['function'] = match.match:sub(1, pos - 1)
		-- Parse comma-separated list of arguments which can themselves be a
		-- function invocation, an interpolation etc.
		-- Invariant: pos is the position (within match.match) of the
		-- delimiter ('(' or ',') preceding the argument parsed next.
		result.args = {}
		local remaining_args = match.match:sub(pos + 1, -2)
		while remaining_args ~= '' do
			local arg, length = parseInvocation(remaining_args)
			if length == 0 then
				error("Unknown element: " .. remaining_args)
			end
			table.insert(result.args, arg)
			local next_pos = pos + length + 1
			local next_char = match.match:sub(next_pos, next_pos)
			if next_char == ')' then
				remaining_args = ''
			elseif next_char == ',' then
				pos = next_pos
				remaining_args = match.match:sub(pos + 1, -2)
			else
				error("Unexpected element in function invocation: " .. match.match:sub(pos, -2))
			end
		end
	elseif matches(invocation, "L%d+", match) then
		result.type = 'lexeme'
		result.text = match.match
	elseif matches(invocation, "Q%d+", match) then
		result.type = 'item'
		result.text = match.match
	elseif matches(invocation, "[%a_]+", match) then
		result.type = 'interpolation'
		result.arg = match.match
	elseif matches(invocation, '%"[^%"]*%"', match) then
		result.type = 'text'
		-- Strip the surrounding quotes.
		result.text = match.match:sub(2, -2)
	elseif matches(invocation, "[+-]?%d+", match) then
		result.type = 'number'
		result.text = match.match
	else
		result.type = 'undefined'
		result.text = invocation
	end
	return result, match.length
end

-- Build the relations table from the collected labels.
--   labelIndexMap:  label -> index of the slot carrying that label;
--   labelSourceMap: label -> source label given after '<' (if any);
--   rootIndex:      index of the root slot, used as the default source.
-- Returns a list of {role=..., target=..., source=...} tables, in no
-- particular order. Raises an error if a source label is unknown.
-- NOTE(review): the root slot is not stored in labelIndexMap, so an explicit
-- "<root" source is reported as not found -- confirm whether that is intended.
local function enumerateRelations(labelIndexMap, labelSourceMap, rootIndex)
	local relations = {}
	for label, index in pairs(labelIndexMap) do
		local relation = {}
		relation.role = getRole(label) -- remove index part, e.g. subj_1
		relation.target = index
		local sourceLabel = labelSourceMap[label]
		if sourceLabel then
			local source = labelIndexMap[sourceLabel]
			if not source then
				error("Source label not found: " .. sourceLabel)
			end
			relation.source = source
		else
			relation.source = rootIndex
		end
		table.insert(relations, relation)
	end
	return relations
end

-- Parse a template string.
-- Returns three values:
--   1. the list of template elements (slots and textual elements), in
--      template order;
--   2. the relations table (see enumerateRelations);
--   3. the index of the root slot. If no slot is labelled 'root' (allowed
--      only when no dependency labels are used), this is the index of the
--      first slot or, in the absence of slots, the index of the last element.
-- Raises an error on malformed templates (unclosed slot, trailing garbage in
-- a slot, duplicate root, relations without root, unknown source label).
function p.parse(template)
	local elements = {}
	local labelIndexMap = {}
	local labelSourceMap = {}
	local rootSlot = nil
	local firstSlot = nil

	-- ipairs guarantees that segments are visited in template order.
	for index, segment in ipairs(segmentize(template)) do
		if isSlot(segment) then
			if not firstSlot then
				firstSlot = index
			end
			local slotComponents = breakDownSlot(segment)
			local new_element, length = parseInvocation(slotComponents.invocation)
			if length ~= #slotComponents.invocation then
				local extra = slotComponents.invocation:sub(length + 1)
				-- We allow extra white space
				if not extra:match("^%s+$") then
					error("Unexpected element: " .. extra)
				end
			end
			new_element.index = index
			new_element.role = slotComponents.role
			table.insert(elements, new_element)
			local label = slotComponents.label
			if label then
				if label == 'root' then
					if rootSlot then
						error("Duplicate root label at position " .. index .. ".")
					end
					rootSlot = index
				else
					-- NOTE(review): a label used twice silently overrides the
					-- earlier slot; use indexed labels (e.g. subj_1) to
					-- keep both relations.
					labelIndexMap[label] = index
					if slotComponents.source then
						labelSourceMap[label] = slotComponents.source
					end
				end
			end
		elseif segment:match("^%p+$") then
			table.insert(elements, { type = 'punctuation', text = segment, index = index })
		elseif segment:match("^%s+$") then
			table.insert(elements, { type = 'spacing', text = segment, index = index })
		else
			table.insert(elements, { type = 'text', text = segment, index = index })
		end
	end

	if not rootSlot then
		if next(labelIndexMap) ~= nil then
			error("When using relations, you must specify root.")
		end
		-- We allow inferring the root if no relations have been used
		rootSlot = firstSlot or #elements
	end

	local relations = enumerateRelations(labelIndexMap, labelSourceMap, rootSlot)
	-- Debug output to the Scribunto console; skipped when the mw library is
	-- not available (e.g. when the module is run outside MediaWiki).
	if type(mw) == 'table' and mw.logObject then
		mw.logObject(elements)
	end
	return elements, relations, rootSlot
end

return p