Jump to content
Wikimedia Meta-Wiki

Module:Sandbox/AbstractWikipedia/TemplateParser

From Meta, a Wikimedia project coordination wiki
Module documentation


This module is part of user:AGutman-WMF's prototype implementation of Abstract Wikipedia's template language in Scribunto.

This module specifically implements a template-language parser. Its parse function returns three return values, described below.

A structured representation of the template elements

[edit ]

This is a list of tables, each table representing a single element of the template (either a slot or a textual element). The order of the list corresponds to the order of the elements in the template. Each table has the following fields:

  • type :
For textual elements this can be punctuation, spacing or text.
For slots, this can be one of function, lexeme (for L-ids), item (for Q-ids), interpolation (for identifiers which are assumed to interpolate an argument), text (quoted strings within slots), number (an integer number) or undefined (yielding later an error).
  • text: for all above types, except function and interpolation, this field contains the content of the slot or the textual element (unquoted if it is text within a slot).
  • function: used only in the function type, this field represents the name of the function.
  • args: used only in the function type, this is a (possibly-empty) list of the arguments to the function, which are themselves tables of the type discussed here. Given that an argument to a function may be a function itself, this yields a tree representation of the function arguments.
  • arg: used only in the interpolation type, this field represents the name of the interpolated argument.
  • role: For slots with a dependency label, this contains the grammatical role indicated by the label (i.e. the label without any additional indexes or source indication).
  • index: for debugging purposes, this gives the sequential number of the element within the template (only for top-level elements). These indexes are referred to by the relations table (described below).

Relations table

[edit ]

The relations table is a list of tables indicating a relation to be applied on the slots of the template. The order is immaterial. Each table in the list has the following three fields:

  • role: the name of the relation to be applied.
  • target: the index of the target slot (i.e. the slot where the label is given).
  • source: the index of the source slot (by default the root slot, unless another source label is given).

Root of the template

[edit ]

The last return value is the index of the slot marked as `root`. If none is given (which is only allowed if dependency labels are not used at all), this will be the index of the first slot (or in the absence of slots, this will be the index of the last element).


The above documentation is transcluded from Module:Sandbox/AbstractWikipedia/TemplateParser/doc. (edit | history)
Editors can experiment in this module’s sandbox (create | mirror) and testcases (create) pages.
Please add categories to the /doc subpage. Subpages of this module.

 -- Export table of this module; p.parse is attached to it further down.
 local p = {}

 -- Split a template string into an ordered list of segments.
 --
 -- Each returned segment is exactly one of:
 --   * a slot: everything from a '{' up to and including the next '}';
 --   * a run of punctuation characters (%p);
 --   * a run of whitespace characters (%s);
 --   * a run of any other characters (plain text).
 --
 -- template: the template string to split.
 -- Returns a list (table) of strings in template order.
 -- Raises an error if the template ends while a slot is still open.
 function segmentize(template)
     local segments = {}
     local segment = ""
     local insideSlot = false

     local function pushSegmentIfNotEmpty()
         if segment ~= "" then
             table.insert(segments, segment)
             segment = ""
         end
     end

     -- Append char to the current segment when that segment is empty or
     -- consists only of characters of the given class; otherwise close the
     -- current segment and start a new one with char.
     local function extendRun(char, class)
         if not segment:match("^" .. class .. "*$") then
             pushSegmentIfNotEmpty()
         end
         segment = segment .. char
     end

     for char in template:gmatch(".") do
         if insideSlot then
             -- Inside a slot everything is kept verbatim up to the closing '}'.
             segment = segment .. char
             if char == '}' then
                 pushSegmentIfNotEmpty()
                 insideSlot = false
             end
         elseif char == '{' then
             pushSegmentIfNotEmpty()
             segment = '{'
             insideSlot = true
         elseif char:match("%p") then
             extendRun(char, "%p")
         elseif char:match("%s") then
             extendRun(char, "%s")
         else
             -- Plain text: a pending punctuation or whitespace run must be
             -- closed first, otherwise text gets glued onto it (" world").
             extendRun(char, "[^%p%s]")
         end
     end

     if insideSlot then
         error("Template ends without closing a slot!")
     end
     pushSegmentIfNotEmpty()
     return segments
 end

 -- Tell whether a segment is a slot, i.e. a '{' ... '}' pair with at least
 -- one character in between.
 -- Returns the segment itself (truthy) for a slot and nil otherwise; note
 -- that the empty pair "{}" is not considered a slot.
 function isSlot(segment)
     -- Anchored at both ends so that a string which merely contains a
     -- brace pair somewhere in the middle is not mistaken for a slot.
     return segment:match("^{.+}$")
 end

 -- Extract the role from a dependency label by keeping only its leading
 -- run of letters, e.g. "subj" from "subj_1".
 -- Returns nil when the label does not start with a letter.
 local function getRole(label)
     return label:match("^%a+")
 end

 -- Split a slot segment "{label<source:invocation}" into its components.
 --
 -- slot: the slot text including the surrounding braces.
 -- Returns a table with the fields:
 --   invocation: the part after the label separator ':' (or the whole
 --               slot content when there is no label);
 --   label:      the label before ':' with any "<source" suffix removed
 --               (only when a label is present);
 --   role:       getRole(label) (only when a label is present);
 --   source:     the text after '<' in the label (only when given).
 function breakDownSlot(slot)
     local result = {}
     -- strip {} characters
     local content = slot:sub(2, -2)
     -- The first ':' separates the label from the invocation, but only if
     -- no double quote precedes it: a colon inside a quoted text such as
     -- {"a:b"} belongs to the text and is not a label separator.
     local label, invocation = content:match('^([^:"]*):(.*)$')
     if not label then
         result.invocation = content
         return result
     end

     result.invocation = invocation
     local source_indicator = label:find('<', 1, true)
     if source_indicator then
         result.source = label:sub(source_indicator + 1)
         label = label:sub(1, source_indicator - 1)
     end
     result.role = getRole(label)
     result.label = label
     return result
 end

 -- Helper function that tries to match pattern at the beginning of str,
 -- allowing whitespace before and after it.
 --
 -- str:     the string to examine.
 -- pattern: a Lua pattern (without anchors).
 -- result:  a table that is updated on success only: result.match receives
 --          the text matched by pattern (without the surrounding
 --          whitespace) and result.length the number of characters
 --          consumed, surrounding whitespace included.
 -- Returns true on a match, false otherwise.
 local function matches(str, pattern, result)
     -- Anchor the pattern at the beginning of the string and allow spaces
     -- around. The results are kept in locals so that nothing leaks into the
     -- global environment.
     local _, end_pos, match = str:find("^%s*(" .. pattern .. ")%s*")
     if match then
         result.match = match
         result.length = end_pos
         return true
     end
     return false
 end

 -- Parse the element at the beginning of an invocation string.
 --
 -- invocation: the text to parse (slot content or a function argument list).
 -- Returns two values:
 --   1. a table describing the element; its `type` field is one of
 --      'function' (with `function` and `args`), 'lexeme', 'item', 'text',
 --      'number' (each with `text`), 'interpolation' (with `arg`) or
 --      'undefined' (with `text` set to the whole invocation);
 --   2. the number of characters consumed, surrounding whitespace included
 --      (0 for 'undefined').
 -- Raises an error for a malformed function argument list.
 function parseInvocation(invocation)
     local result = {}
     local match = { match = '', length = 0 }
     if matches(invocation, "%a+%b()", match) then -- function invocation
         result.type = 'function'
         local call = match.match
         local open = call:find('(', 1, true)
         result['function'] = call:sub(1, open - 1)
         -- Parse the comma-separated list of arguments, each of which can
         -- itself be a function invocation, an interpolation etc.
         result.args = {}
         local pending = call:sub(open + 1, -2)
         -- Delimiter that precedes `pending`; used for error reporting only.
         local delimiter = '('
         while pending ~= '' do
             local arg, length = parseInvocation(pending)
             if length == 0 then
                 error("Unknown element: " .. pending)
             end
             table.insert(result.args, arg)
             local rest = pending:sub(length + 1)
             if rest == '' then
                 pending = ''
             elseif rest:sub(1, 1) == ',' then
                 delimiter = ','
                 pending = rest:sub(2)
                 if pending == '' then
                     -- A comma must be followed by another argument.
                     error("Unexpected element in function invocation: " .. rest)
                 end
             else
                 -- Anything else after an argument is not part of a valid
                 -- argument list.
                 error("Unexpected element in function invocation: " .. delimiter .. pending)
             end
         end
     elseif matches(invocation, "L%d+", match) then
         result.type = 'lexeme'
         result.text = match.match
     elseif matches(invocation, "Q%d+", match) then
         result.type = 'item'
         result.text = match.match
     elseif matches(invocation, "[%a_]+", match) then
         result.type = 'interpolation'
         result.arg = match.match
     elseif matches(invocation, '%"[^%"]*%"', match) then
         result.type = 'text'
         -- drop the surrounding quotes
         result.text = match.match:sub(2, -2)
     elseif matches(invocation, "[+-]?%d+", match) then
         result.type = 'number'
         result.text = match.match
     else
         result.type = 'undefined'
         result.text = invocation
     end
     return result, match.length
 end

 -- Build the list of relations between labelled slots.
 --
 -- labelIndexMap:  maps each label to the index of the slot carrying it.
 -- labelSourceMap: maps a label to the label of its source slot, for the
 --                 labels that name one.
 -- rootIndex:      index of the root slot; used as the source for every
 --                 label without an explicit source.
 -- Returns a list of tables {role, target, source}; the list is built by
 -- iterating labelIndexMap with pairs, so its order is unspecified.
 -- Raises an error when a source label does not belong to any slot.
 function enumerateRelations(labelIndexMap, labelSourceMap, rootIndex)
     local relations = {}
     for label, index in pairs(labelIndexMap) do
         local source = rootIndex
         local sourceLabel = labelSourceMap[label]
         -- The label 'root' always denotes the root slot, so naming it
         -- explicitly as a source is the same as naming no source at all.
         if sourceLabel and sourceLabel ~= 'root' then
             source = labelIndexMap[sourceLabel]
             if not source then
                 error("Source label not found: " .. sourceLabel)
             end
         end
         table.insert(relations, {
             role = getRole(label), -- remove index part, e.g. subj_1
             target = index,
             source = source,
         })
     end
     return relations
 end

 -- Parse a template string.
 --
 -- template: the template text.
 -- Returns three values:
 --   1. the list of elements, one table per segment of the template, each
 --      carrying `type` and `index` (its position in the segment list);
 --      slots additionally carry the fields produced by parseInvocation
 --      and `role`;
 --   2. the list of relations built by enumerateRelations;
 --   3. the index of the root slot: the slot labelled 'root', or, when no
 --      labels are used at all, the first slot (or the number of elements
 --      when the template has no slot).
 -- Raises an error on trailing content in a slot, on a duplicate root label,
 -- and when labels are used without a root.
 function p.parse(template)
     local elements = {}
     local labelIndexMap = {}
     local labelSourceMap = {}
     local rootSlot = nil
     local firstSlot = nil

     -- ipairs guarantees that the segments are visited in template order.
     for index, segment in ipairs(segmentize(template)) do
         if isSlot(segment) then
             if not firstSlot then
                 firstSlot = index
             end
             local slotComponents = breakDownSlot(segment)
             local invocation = slotComponents.invocation
             local new_element, length = parseInvocation(invocation)
             if length ~= #invocation then
                 local extra = invocation:sub(length + 1)
                 -- We allow extra white space
                 if not extra:match("^%s+$") then
                     error("Unexpected element: " .. extra)
                 end
             end
             new_element.index = index
             new_element.role = slotComponents.role
             table.insert(elements, new_element)

             local label = slotComponents.label
             if label == 'root' then
                 if rootSlot then
                     error("Duplicate root label at position " .. index .. ".")
                 end
                 rootSlot = index
             elseif label then
                 labelIndexMap[label] = index
                 if slotComponents.source then
                     labelSourceMap[label] = slotComponents.source
                 end
             end
         elseif segment:match("^%p+$") then
             table.insert(elements, { type = 'punctuation', text = segment, index = index })
         elseif segment:match("^%s+$") then
             table.insert(elements, { type = 'spacing', text = segment, index = index })
         else
             table.insert(elements, { type = 'text', text = segment, index = index })
         end
     end

     if not rootSlot then
         if next(labelIndexMap) ~= nil then
             error("When using relations, you must specify root.")
         end
         -- We allow inferring the root if no relations have been used
         rootSlot = firstSlot or #elements
     end

     local relations = enumerateRelations(labelIndexMap, labelSourceMap, rootSlot)
     -- Dump the parsed elements to the Scribunto debug log.
     mw.logObject(elements)
     return elements, relations, rootSlot
 end

 -- Hand the export table to the code that loads this module.
 return p

AltStyle によって変換されたページ (->オリジナル) /