Ex Pattern


xpattern is a Lua pattern matching library based on Lua patterns and in the style of LuaPeg.

Description

Lua patterns [1] are a limited form of regular expressions. Unlike regular expressions, patterns do not support grouping and quantification [2] operations on anything except single character sets. For full regular expression (or Perl-style regular expression) support, you may instead compile a regular expression module such as lrexlib [3]. For even greater flexibility, for parsing and lexical analysis applications or even for the simpler case, one can use parsing expression grammars (PEG) via the LuaPeg library [4] implemented in C.

Sometimes, however, Lua patterns are almost sufficient, as we may want to extend the functionality of patterns just a bit more without compiling in any additional C code. That's where this module comes in. This module builds extended pattern matching on top of Lua patterns. It takes a pattern expression written in a form somewhat similar to LPeg and translates it to a string of Lua code containing string.match calls, which is then compiled to a Lua function via loadstring. This is similar to the code you might write yourself, although this module automates it with code generation. It should have similar efficiency assuming you precompile any match objects that are used repeatedly.

Here is a simple example:

local M = require "xpattern"
local P = M.P
local m = ( (P'(b+)' + P'(c+)') * P'[A-Z][a-z]'^0 * P'(.)()' ):compile()
local a,b,c,d = m('mmcccZzYybbZzYyddd') -- match c not b first
assert(a == nil and b == 'ccc' and c == 'b' and d == 11)

which internally generates this Lua function:

local match = ... -- string.match
return function(s,pos)
 for pos1=(pos or 1),#s do
 local pos2
 local c1,c2,c3,c4
 local pos3 = pos1
 c1,pos3 = match(s, "^(b+)()", pos1)
 if not pos3 then
 c2,pos3 = match(s, "^(c+)()", pos1)
 end
 pos2 = pos3
 if pos2 then
 do
 local pos3=pos2
 pos2 = pos2
 while 1 do
 pos3 = match(s, "^[A-Z][a-z]()", pos3)
 if pos3 then
 pos2=pos3
 else break end
 end
 end
 end
 if pos2 then
 c3,c4,pos2 = match(s, "^(.)()()", pos2)
 end
 if pos2 then return c1,c2,c3,c4 end
 end
end

See the test suite below for additional examples.

Status

Warning: this code should be considered somewhat experimental. It is not fully developed or tested. Fix up the code if you want to use it in production.

Appendix: xpattern.lua

-- xpattern.lua
-- Preliminary regular expression-like support in Lua
-- Uses Lua patterns as the core building block.
--
-- Implemented in pure Lua with code generation technique.
-- It translates an expression into a snippet of Lua code
-- having a series of `string.match` calls, which is then
-- compiled (via `loadstring`).
--
-- Like lpeg, does not support backtracking.
--
-- WARNING: This is experimental code. The design and implementation
-- has not been thoroughly tested.
--
-- Version v20091021.
-- (c) 2008-2009 David Manura. Licensed under the same terms as Lua (MIT license).
-- Please post patches.
local M = {}
local string = string
local format = string.format
local match = string.match
local assert = assert
local error = error
local ipairs = ipairs
local loadstring = loadstring
local setmetatable = setmetatable
local type = type
local print = print
-- Adds whitespace to string `s`.
-- Whitespace string `ws` (default to '' if omitted) is prepended to each line
-- of `s`. Also ensures `s` is is terminated by a newline.
local function add_whitespace(s, ws)
 ws = ws or ''
 s = s:gsub('[^\r\n]+', ws .. '%1')
 if s:match('[^\r\n]$') then
 s = s .. '\n'
 end
 return s
end
-- Counts the number `count` of captures '()' in Lua pattern string `pat`.
local function count_captures(pat)
 local count = 0
 local pos = 1
 while pos <= #pat do
 local pos2 = pat:match('^[^%(%%%[]+()', pos)
 if pos2 then
 pos = pos2
 elseif pat:match('^%(', pos) then
 count = count + 1
 pos = pos + 1
 elseif pat:match('^%%b..', pos) then
 pos = pos + 3
 elseif pat:match('^%%.', pos) then
 pos = pos + 2
 else
 local pos2 = pat:match('^%[[^%]%%]*()', pos)
 if pos2 then
 pos = pos2
 while 1 do
 local pos2 = pat:match('^%%.[^%]%%]*()', pos)
 if pos2 then
 pos = pos2
 elseif pat:match('^%]', pos) then
 pos = pos + 1
 break
 else
 error('syntax', 2)
 end
 end
 else
 error('syntax', 2)
 end
 end
 end
 return count
end
M._count_captures = count_captures
-- Appends '()' to Lua pattern string `pat`.
local function pat_append_pos(pat)
 local prefix = pat:match'^(.*)%$$'
 pat = prefix and prefix .. '()$' or pat .. '()'
 return pat
end
-- Prepends '()' to Lua pattern string `pat`.
local function pat_prepend_pos(pat)
 local postfix = pat:match'^%^(.*)'
 pat = postfix and '^()' .. postfix or '()' .. pat
 return pat
end
-- Prepends '^' to Lua pattern string `pat`.
local function pat_prepend_carrot(pat)
 local postfix = pat:match'^%^(.*)'
 pat = postfix and pat or '^' .. pat
 return pat
end
-- Return a string listing pattern capture variables with indices `firstidx`
-- to `lastidx`.
-- Ex: code_vars(1,2) --> 'c1,c2'
local function code_vars(firstidx, lastidx)
 local code = ''
 for i=firstidx,lastidx do
 code = code .. (i == firstidx and '' or ',') .. 'c' .. i
 end
 return code
end
-- Metatable for expression objects
local epat_mt = {}
epat_mt.__index = epat_mt
-- Builds an extended pattern object `epat` from Lua string pattern `pat`.
local function pattern(pat)
 local epat = setmetatable({}, epat_mt)
 epat.call = function(srcidx0, destidx0, totncaptures0)
 local ncaptures = count_captures(pat)
 local lvars =
 code_vars(totncaptures0+1, totncaptures0+ncaptures)
 .. (ncaptures == 0 and '' or ',') .. 'pos' .. destidx0
 local pat = pat_append_pos(pat)
 pat = pat_prepend_carrot(pat)
 local str = format('%q', pat)
 local code = lvars .. ' = match(s, ' .. str .. ', pos' .. srcidx0 .. ')\n'
 return code, ncaptures
 end
 epat.anchored = pat:sub(1,1) == '^'
 return epat
end
-- Generates code from pattern `anypat` (either Lua pattern string or extended
-- pattern object).
-- `anypat` - either Lua pattern string or extended pattern object
-- `srcidx0` - index of variable holding position to start matching at
-- `destidx0` - index of variable holding position to store subsequent
-- match position at. stores nil if no match
-- `totncaptures0` - number of captures prior to this match
-- `code` - Lua code string (code) and number of
-- `ncaptures` - number of captures in pattern.
local function gen(anypat, srcidx0, destidx0, totncaptures0)
 if type(anypat) == 'string' then
 anypat = pat_prepend_carrot(anypat)
 anypat = pattern(anypat)
 end
 local code, ncaptures = anypat(srcidx0, destidx0, totncaptures0)
 return code, ncaptures
end
-- Creates a new extended pattern object `epat` that is the concatenation of
-- the given list (of size >= 0) of pattern objects.
-- Specify a single string argument to convert a Lua pattern to an extended
-- pattern object.
local function seq(...) -- epats...
 -- Ensure args are extended pattern objects.
 local epats = {...}
 for i=1,#epats do
 if type(epats[i]) == 'string' then
 epats[i] = pattern(epats[i])
 end
 end
 local epat = setmetatable({}, epat_mt)
 epat.call = function(srcidx0, destidx0, totncaptures0, ws)
 if #epats == 0 then
 return 'pos' .. destidx0 .. ' = pos' .. srcidx0 .. '\n', 0
 elseif #epats == 1 then
 return epats[1](srcidx0, destidx0, totncaptures0, ws)
 else
 local ncaptures = 0
 local pat_code, pat_ncaptures =
 gen(epats[1], srcidx0, destidx0, totncaptures0+ncaptures, true)
 ncaptures = ncaptures + pat_ncaptures
 local code = add_whitespace(pat_code, '')
 for i=2,#epats do
 local pat_code, pat_ncaptures =
 gen(epats[i], destidx0, destidx0, totncaptures0+ncaptures, true)
 ncaptures = ncaptures + pat_ncaptures
 code =
 code ..
 'if pos' .. destidx0 .. ' then\n' ..
 add_whitespace(pat_code, ' ') ..
 'end\n'
 end
 return code, ncaptures
 end
 end
 if epats[1] and epats[1].anchored then
 epat.anchored = true
 end
 return epat
end
M.P = seq
-- Creates new extended pattern object `epat` that is the alternation of the
-- given list of pattern objects `epats...`.
local function alt(...) -- epats...
 -- Ensure args are extended pattern objects.
 local epats = {...}
 for i=1,#epats do
 if type(epats[i]) == 'string' then
 epats[i] = pattern(epats[i])
 end
 end
 local epat = setmetatable({}, epat_mt)
 epat.call = function(srcidx0, destidx0, totncaptures0)
 if #epats == 0 then
 return 'pos' .. destidx0 .. ' = pos' .. srcidx0 .. '\n', 0
 elseif #epats == 1 then
 return epats[1](srcidx0, destidx0, totncaptures0)
 else
 local ncaptures = 0
 local pat_code, pat_ncaptures =
 gen(epats[1], srcidx0, destidx0+1, totncaptures0+ncaptures, true)
 ncaptures = ncaptures + pat_ncaptures
 local code = 'local pos' .. destidx0+1 .. ' = pos' .. srcidx0 .. '\n' ..
 add_whitespace(pat_code, '')
 for i=2,#epats do
 local pat_code, pat_ncaptures =
 gen(epats[i], srcidx0, destidx0+1, totncaptures0+ncaptures, true)
 ncaptures = ncaptures + pat_ncaptures
 code =
 code ..
 'if not pos' .. destidx0+1 .. ' then\n' ..
 add_whitespace(pat_code, ' ') ..
 'end\n'
 end
 code = code .. 'pos' .. destidx0 .. ' = pos' .. destidx0+1 .. '\n'
 return code, ncaptures
 end
 end
 return epat
end
M.alt = alt
-- Creates new extended pattern object `epat` that is zero or more repetitions
-- of the given pattern object `pat` (if one evaluates to false) or one or more
-- (if one evaluates to true).
local function star(pat, one)
 local epat = setmetatable({}, epat_mt)
 epat.call = function(srcidx0, destidx0, totncaptures0)
 local ncaptures = 0
 local destidx = destidx0 + 1
 local code = 'do\n' ..
 ' local pos' .. destidx .. '=pos' .. srcidx0 .. '\n'
 local pat_code, pat_ncaptures =
 gen(pat, destidx, destidx, totncaptures0+ncaptures, true)
 ncaptures = ncaptures + pat_ncaptures
 code = code ..
 (one and (' pos' .. destidx0 .. ' = nil\n')
 or (' pos' .. destidx0 .. ' = pos' .. srcidx0 .. '\n')) ..
 ' while 1 do\n' ..
 add_whitespace(pat_code, ' ') ..
 ' if pos' .. destidx .. ' then\n' ..
 ' pos' .. destidx0 .. '=pos' .. destidx .. '\n' ..
 ' else break end\n' ..
 ' end\n' ..
 'end\n'
 return code, ncaptures
 end
 return epat
end
M.star = star
-- Creates new extended pattern object `epat` that is zero or one of the
-- given pattern object `epat0`.
local function zero_or_one(epat0)
 local epat = setmetatable({}, epat_mt)
 epat.call = function(srcidx0, destidx0, totncaptures0)
 local ncaptures = 0
 local destidx = destidx0 + 1
 local code = 'do\n' ..
 ' local pos' .. destidx .. '=pos' .. srcidx0 .. '\n'
 local epat0_code, epat0_ncaptures =
 gen(epat0, destidx, destidx, totncaptures0+ncaptures, true)
 ncaptures = ncaptures + epat0_ncaptures
 code = code ..
 add_whitespace(epat0_code) ..
 ' if pos' .. destidx .. ' then\n' ..
 ' pos' .. destidx0 .. '=pos' .. destidx .. '\n' ..
 ' else\n' ..
 ' pos' .. destidx0 .. '=pos' .. srcidx0 .. '\n' ..
 ' end\n' ..
 'end\n'
 return code, ncaptures
 end
 return epat
end
M.zero_or_one = zero_or_one
-- Gets Lua core code string `code` corresponding to pattern object `epat`
local function basic_code_of(epat)
 local pat_code, ncaptures = epat(1, 2, 0, true)
 local lvars = code_vars(1, ncaptures)
 if epat.anchored then
 local code =
 'local pos1=pos or 1\n' ..
 'local pos2\n' ..
 (lvars ~= '' and ' local ' .. lvars .. '\n' or '') ..
 add_whitespace(pat_code) .. '\n' ..
 'if pos2 then return ' .. (lvars ~= '' and lvars or 's:sub(pos1,pos2-1)') .. ' end\n'
 return code
 else
 local code =
 'for pos1=(pos or 1),#s do\n' ..
 ' local pos2\n'
 if lvars == '' then
 code =
 code ..
 add_whitespace(pat_code, ' ') ..
 ' if pos2 then return s:sub(pos1,pos2-1) end\n'
 else
 code =
 code ..
 ' local ' .. lvars .. '\n' ..
 add_whitespace(pat_code, ' ') ..
 ' if pos2 then return ' .. lvars .. ' end\n'
 end
 code =
 code ..
 'end\n'
 return code
 end
end
M.basic_code_of = basic_code_of
-- Gets Lua complete code string `code` corresponding to pattern object `epat`.
local function code_of(epat)
 local code =
 'local match = ...\n' ..
 'return function(s,pos)\n' ..
 add_whitespace(basic_code_of(epat), ' ') ..
 'end\n'
 return code
end
M.code_of = code_of
-- Compiles pattern object `epat` to Lua function `f`.
local function compile(epat)
 local code = code_of(epat)
 if M.debug then print('DEBUG:\n' .. code) end
 local f = assert(loadstring(code))(match)
 return f
end
M.compile = compile
-- operator for pattern matching
function epat_mt.__call(epat, ...)
 return epat.call(...)
end
-- operator for pattern alternation
function epat_mt.__add(a_epat, b_epat)
 return alt(a_epat, b_epat)
end
-- operator for pattern concatenation
function epat_mt.__mul(a_epat, b_epat)
 return seq(a_epat, b_epat)
end
-- operator for pattern repetition
function epat_mt.__pow(epat, n)
 if n == 0 then
 return star(epat)
 elseif n == 1 then
 return star(epat, true)
 elseif n == -1 then
 return zero_or_one(epat)
 else
 error 'FIX - unimplemented'
 end
end
-- IMPROVE design?
epat_mt.compile = compile
epat_mt.basic_code_of = basic_code_of
epat_mt.code_of = code_of
return M

Appendix: xpattern_test.lua

-- xpattern_test.lua - test suite for xpattern.lua
-- utility function: convert list of values to string.
local function str(...)
 local n = select('#', ...)
 local t = {...}
 for i=1,n do t[i] = tostring(t[i]) end
 return table.concat(t, ',')
end
local M = require "xpattern"
local P = M.P
M.debug = true
-- internal: _count_captures
assert(M._count_captures'' == 0)
assert(M._count_captures'a' == 0)
assert(not pcall(function() M._count_captures'%' end))
assert(M._count_captures'()' == 1)
assert(M._count_captures'%(%)' == 0) -- %(
assert(M._count_captures'[()]' == 0) -- () inside []
assert(M._count_captures'[%(%)]' == 0) -- %( inside []
assert(M._count_captures'[%]()]' == 0) -- %] inside []
assert(M._count_captures'[]()]' == 1)
assert(M._count_captures'%b()' == 0) -- () on %b..
assert(M._count_captures'(()().())' == 4) -- nested
-- more complex example
assert(M._count_captures'.(.%))[(]%(()' == 2)
-- simple matching
assert(str(P'':compile()('')) == '')
assert(str(P'':compile()('a')) == '')
assert(str(P'a':compile()('')) == '')
assert(str(P'a':compile()('a')) == 'a')
assert(str(P'a':compile()('ba')) == 'a')
assert(str(P'a+':compile()('baa')) == 'aa')
-- simple anchors
assert(str(P'^a+':compile()('aa')) == 'aa')
assert(str(P'^a+':compile()('baab')) == '') -- $ fail
assert(str(P'a+$':compile()('baa')) == 'aa')
assert(str(P'a+$':compile()('baab')) == '') -- $ fail
-- simple captures
assert(str(P'(a+)(b+)':compile()('baab')) == 'aa,b')
assert(str(P'^(a+)(b+)':compile()('baab')) == '')
-- simple sequences
local m = P():compile()
assert(str( m('') ) == '')
assert(str( m('a') ) == '')
local m = P(''):compile()
assert(str( m('') ) == '')
assert(str( m('a') ) == '')
local m = P('a', 'b', 'c'):compile()
assert(str( m('.a.') ) == '')
assert(str( m('.abc.') ) == 'abc')
local m = (P'a' * P'b' * P'c'):compile() -- identical
assert(str( m('.a.') ) == '')
assert(str( m('.abc.') ) == 'abc')
local m = P(P'a', 'b', P'c'):compile() -- identical
assert(str( m('.a.') ) == '')
assert(str( m('.abc.') ) == 'abc')
local m = P(P'a+', 'b+', P'c+'):compile()
assert(str( m('.abaabcc.') ) == 'aabcc')
-- simple alternation
local m = (P'aa+' + P'bb+'):compile()
assert(str( m('abbaa') ) == 'bb')
local m = (P'aa+' + P'bb+' + P'cc+'):compile()
assert(str( m('abccaa') ) == 'cc')
-- simple combinations
local m = ((P'(a+)' + P'b(b*)') * P'(c+)()'):compile()
assert(str( m("aacccdd")) == 'aa,nil,ccc,6')
assert(str( m("bbcccdd")) == 'nil,b,ccc,6')
assert(str( m("bbdd")) == '')
print('?'..str( m("aabbcc")))
assert(str( m("aabbcc")) == 'nil,b,cc,7') -- alternative
-- simple replication (*)
local m = ( P'a'^0 ):compile()
assert(str(m'') == '')
assert(str(m'a') == 'a')
assert(str(m'aab') == 'aa')
-- replication (*)
local m = ( (P'a+' + P'b+')^0 ):compile()
assert(str(m'zabaabbc') == '')
assert(str(m'abaabb') == 'abaabb')
local m = ( (P'a+' * P'b+' + P'c+' * P'd+')^0 ):compile()
assert(str(m'aabbccddaa') == 'aabbccdd')
local m = ( P'aa'^0 * P'bb' * P'aa'^0 ):compile()
assert(str(m'aaccaaaabbaa') == 'aaaabbaa')
-- simple replication (+)
local m = ( P'a'^1 ):compile()
assert(str(m'') == '')
assert(str(m'a') == 'a')
assert(str(m'aab') == 'aa')
-- replacation (+)
local m = ( P'b' * P'a'^1 ):compile()
print(m'b')
assert(str(m'b') == '')
assert(str(m'ba') == 'ba')
assert(str(m'baab') == 'baa')
-- simple replication (?)
local m = ( P'a'^-1 ):compile()
assert(str(m'') == '')
assert(str(m'a') == 'a')
assert(str(m'aab') == 'a')
-- replication (?)
local m = ( P'c' * (P'a+' + P'b+')^-1 ):compile()
assert(str(m'caabb') == 'caa')
-- Some of these examples from Mastering Regular Expressions (MRE),
-- 2nd Ed. Jeffrey .Friedl.
-- MRE p.19
local m = ( P'^' * (P'From' + P'Subject' + P'Date') * P':%s*(.*)' ):compile()
assert(str(m('Subject: test')) == 'test')
-- MRE p.13
local m = ( (P'Geo' + P'Je') * P'ff' * (P're' + P'er') * P'y' ):compile()
assert(str(m'Jeffrey') == 'Jeffrey')
assert(str(m'Jeffery') == 'Jeffery')
assert(str(m'Geoffrey') == 'Geoffrey')
assert(str(m'Geoffery') == 'Geoffery')
assert(str(m'Jefery') == '')
assert(str(m'Geofferi') == '')
assert(str(m'GeoffrezGeoffery') == 'Geoffery') -- skips
assert(str(m'JefferzGeoffery') == 'Geoffery') -- skips
assert(str(m'GeoffJeffery') == 'Jeffery') -- skips
-- MRE p.24
local m = ( P'%$[0-9]+' * P'%.[0-9][0-9]'^-1 ):compile()
assert(str(m'20ドル.00') == '20ドル.00')
assert(str(m'20ドル') == '20ドル')
assert(str(m'20ドル.00.00') == '20ドル.00')
-- example
print 'example'
local M = require "xpattern"
local P = M.P
local m = ( (P'(b+)' + P'(c+)') * P'[A-Z][a-z]'^0 * P'(.)()' ):compile()
local a,b,c,d = m('mmcccZzYybbZzYyddd') -- match c not b first
assert(a == nil and b == 'ccc' and c == 'b' and d == 11)
-- example
local m = P('foo', P'bar'+P'baz', 'qux'):compile()
assert(str(m'afoobazfoobarquxbar', 'foobarqux'))
local m = P('^foo', P'bar'+P'baz', 'qux'):compile() -- anchored
assert(str(m'afoobazfoobarquxbar', ''))
assert(str(m'foobarquxbar', ''))
-- http://lua-users.org/lists/lua-l/2009-10/msg00752.html
local m = (
 P'^' * ( ( P'ceil'+P'abs' +P'floor'+P'mod' +P'exp'+P'log'+P'pow'+
 P'sqrt'+P'acos'+P'asin' +P'atan'+P'cos'+P'sin'+P'tan'+
 P'deg' +P'rad' +P'random'
 ) * P'%('
 + P'[0-9%(%)%-%+%*%/%.%,]' + P'pi'
 )^1 * P'$'
):compile()
assert(m'cos(1+pi)' == 'cos(1+pi)')
assert(m'cos(1+p)' == nil) -- 'p'
assert(m'cos(12.3/2)+mod(2,3)' == 'cos(12.3/2)+mod(2,3)')
assert(m'cos(12.3/2)+mod(2,3) ' == nil) -- ' '
assert(m' cos(12.3/2)+mod(2,3)' == nil) -- ' '
assert(m'cos(12.3/2)+mod+2' == nil) -- no '('
print 'DONE'

Changes:

Author

DavidManura

See Also


RecentChanges · preferences
edit · history
Last edited January 6, 2010 8:27 pm GMT (diff)

AltStyle によって変換されたページ (->オリジナル) /