# -*- mode: perl; perl-indent-level: 2 -*-
#
# Compile and evaluate regular expressions in Perl without
# using built-in regular expressions.
#
# Author: Mark-Jason Dominus (mjd-tpj-regex@plover.com)
# This program is in the PUBLIC DOMAIN.
#
#
# Note to readers:
# The code is in three parts.
# 1. A parser for regexes
# 2. A compiler that trnaslates regexes into machines
# 3. An executer that executes a machine on a specified input
#
# The parser is really turgid and hard to understand, because it isn't
# the point of the package. I recommend that you skip to part 2 or 3
# immediately, then work backwards. Parts 2 and 3 are much clearer.
package Regex;
# Regexps are handled in three phases. First, they're parsed from a string
# form into an internal parse tree form, thus:
#
# ABC => [ CONCAT => [ A, B, C ] ]
# A* => [ STAR => A ]
# A+ => [ PLUS => A ]
# A|B|C => [ ALTERN => [ A, B, C ] ]
# literal character x => [ LITERAL => x ]
#
# `parse' does this.
sub parse {
@t = split(//, $_[1]);
parse_altern(@t);
}
sub parse_altern {
my @alterns;
my @terms;
my $c;
while (defined($c = shift @_)) {
next if $c eq '';
push @seen, $c;
if ($c eq '(') {
my $next_term = &parse_altern;
push @terms, $next_term;
} elsif ($c eq ')') {
push @alterns, &joinup(CONCAT, @terms) if @terms;
return &joinup(ALTERN, @alterns);
} elsif ($c eq '|') {
push @alterns, &joinup(CONCAT, @terms) if @terms;
@terms = ();
} elsif ($c eq '*' || $c eq '+') {
if (@terms) {
$terms[-1] = [ ($c eq '*' ? STAR : PLUS) => $terms[-1] ];
} else {
$PARSE_ERROR = "Did not expect $c!\n\t@seen\n\t*\n\t@_\n";
return undef;
}
} elsif ($c eq '\\') {
push @terms, [ LITERAL => (shift) ];
} else {
push @terms, [ LITERAL => $c ];
}
} # While there are tokens...
push @alterns, &joinup(CONCAT, @terms) if @terms;
return joinup(ALTERN, @alterns) if @alterns;
return undef;
}
sub joinup {
my $tag = shift;
if (@_ == 1) {
$_[0];
} else {
[ $tag => [ @_ ] ];
}
}
package NFA;
################################################################
#
# Compile parsed regexp into representation of NFA
#
################################################################
$S = 'aa00';
$STARTSYMBOL = 0;
$ENDSYMBOL = 1;
sub new {
compile(@_);
}
sub compile {
my $pack = shift;
my $rx = shift;
my ($operator, $operands) = @$rx;
# A literal has no suboperands to compile.
# So invoke the special atom-compiler and return that result instead.
if ($operator eq LITERAL) {
return $pack->literal($operands);
}
my $startsym = "S" . &gensym();
my $endsym = "E" . &gensym();
my $result = { Symbols => [ $startsym, $endsym ] };
# Compile the sub-operands first.
my @submachines;
if ($operator eq STAR || $operator eq PLUS) {
@submachines = ($pack->compile($operands));
} else {
foreach $operand (@$operands) {
push @submachines, $pack->compile($operand);
}
}
if ($operator eq CONCAT) {
return $submachines[0] if @submachines == 1;
&putin($result, @submachines);
my $i;
for ($i = 0; $i < @submachines - 1; $i++) { my $tail = $submachines[$i] {Symbols}[$ENDSYMBOL]; my $head = $submachines[$i+1]{Symbols}[$STARTSYMBOL]; $result->{$tail} = { '' => $head };
}
$result->{$startsym} = { '' => $submachines[0] {Symbols}[$STARTSYMBOL] };
$result->{$submachines[-1]{Symbols}[$ENDSYMBOL]} = { '' => $endsym };
} elsif ($operator eq STAR) {
my $sm = $submachines[0];
&putin($result, $sm);
my ($s, $e) = @{$sm->{Symbols}};
$result->{$e} = { '' => [$s, $endsym] };
$result->{$startsym} = { '' => [$s, $endsym] };
} elsif ($operator eq PLUS) {
my $sm = $submachines[0];
&putin($result, $sm);
my ($s, $e) = @{$sm->{Symbols}};
$result->{$e} = { '' => [$s, $endsym] };
$result->{$startsym} = { '' => $s };
} elsif ($operator eq ALTERN) {
return $submachines[0] if @submachines == 1;
&putin($result, @submachines);
my @startsyms = map { $_->{Symbols}[$STARTSYMBOL] } @submachines;
my @endsyms = map { $_->{Symbols}[$ENDSYMBOL] } @submachines;
$result->{$startsym} = { '' => \@startsyms };
foreach $es (@endsyms) {
$result->{$es} = { '' => $endsym };
}
} else {
warn "Bizarre oprerator `$operator' encountered.\n";
}
bless $result => $pack;
}
sub start_state {
$_[0]{Symbols}[$STARTSYMBOL];
}
sub is_end_state {
my $self = shift;
my $state = shift;
$state eq $self->{Symbols}[$ENDSYMBOL];
}
sub transition_table {
my $self = shift;
my $state = shift;
$self->{$state} || {};
}
sub literal {
my $pack = shift;
my $what = shift;
my $startsym = "S" . &gensym();
my $endsym = "E" . &gensym();
bless
{ Symbols => [ $startsym, $endsym ],
$startsym => { $what => $endsym } },
=> $pack;
}
# Given a list of machines, M1 ... Mn, put M2... Mn into M1.
sub putin {
my $master = shift;
foreach $m (@_) {
foreach $state (keys %$m) {
next if $state eq 'Symbols';
if (exists $master->{$state}) {
print STDERR "Warning: State name conflict for `$state'.\n";
}
$master->{$state} = $m->{$state};
}
}
$master;
}
sub gensym {
$S++;
}
################################################################
#
# Execute NFA on a given string
#
################################################################
package NFA_Exec;
sub match {
my $pack = shift;
my $nfa = shift;
my $string = shift;
my $machine = $pack->init($nfa, $string);
$machine->run();
$machine->final_state();
}
sub new {
&init(@_);
}
#
# Create a new execution of the specified NFS, and feed it
# the specified string as its input.
#
sub init {
my $pack = shift;
my $nfa = shift;
my $string = shift;
my $self = {};
$self->{nfa} = $nfa;
$self->{input} = $string;
$self->{pos} = 0;
$self->{states} = [ $self->{nfa}->start_state ];
bless $self => $pack;
$self->epsilon_transit();
$self;
}
#
# Run an execution to the end of the input
#
sub run {
my $self = shift;
until ($self->end_of_input() || $self->states() == 0) {
$self->step;
}
}
#
# Is this execution object at the end of its input?
#
sub end_of_input {
my $self = shift;
$self->{pos}>= length($self->{input});
}
#
# Advance an execution by one step.
#
sub step {
my $self = shift;
my $next_symbol = substr($self->{input}, $self->{pos}, 1);
if ($next_symbol eq '') {
# error
} else {
$self->transit($next_symbol);
$self->epsilon_transit();
}
$self->{pos}++;
}
#
# Perform e-transitions in an execution
#
sub epsilon_transit {
my $self = shift;
my @newstates = $self->states;
my @result = @newstates;
my %seen = map {($_ => 1)} @newstates;
for (;;) {
my $s;
my @nextstates;
foreach $s (@newstates) {
my $nextstate = $self->{nfa}->transition_table($s)->{''};
next unless defined $nextstate;
push @nextstates, ref $nextstate ? @$nextstate : $nextstate;
}
@newstates = grep {! $seen{$_}++} @nextstates;
last unless @newstates;
push @result, @newstates;
}
$self->{states} = \@result;
}
#
# Perform a transition
#
sub transit {
my $self = shift;
my $symbol = shift;
$self->{states} = $self->transition_table->{$symbol};
}
#
# Current states
#
sub states {
my $self = shift;
@{$self->{states}};
}
#
# Should we accept?
#
sub final_state {
my $self = shift;
my $s;
foreach $s ($self->states) {
return 1 if $self->{nfa}->is_end_state($s);
}
0;
}
#
# Get current transition table
# This is interesting because we have to merge the transition
# tables for several states.
sub transition_table {
my $self = shift;
my $s;
my %ttab;
foreach $s ($self->states) {
my $sub_ttab = $self->{nfa}->transition_table($s);
my ($symbol, $next_state);
while (($symbol, $next_state) = each %$sub_ttab) {
push @{$ttab{$symbol}}, ref $next_state ? @$next_state : $next_state;
}
}
\%ttab;
}
1;