# -*- mode: perl; perl-indent-level: 2 -*- # # Compile and evaluate regular expressions in Perl without # using built-in regular expressions. # # Author: Mark-Jason Dominus (mjd-tpj-regex@plover.com) # This program is in the PUBLIC DOMAIN. # # # Note to readers: # The code is in three parts. # 1. A parser for regexes # 2. A compiler that trnaslates regexes into machines # 3. An executer that executes a machine on a specified input # # The parser is really turgid and hard to understand, because it isn't # the point of the package. I recommend that you skip to part 2 or 3 # immediately, then work backwards. Parts 2 and 3 are much clearer. package Regex; # Regexps are handled in three phases. First, they're parsed from a string # form into an internal parse tree form, thus: # # ABC => [ CONCAT => [ A, B, C ] ] # A* => [ STAR => A ] # A+ => [ PLUS => A ] # A|B|C => [ ALTERN => [ A, B, C ] ] # literal character x => [ LITERAL => x ] # # `parse' does this. sub parse { @t = split(//, $_[1]); parse_altern(@t); } sub parse_altern { my @alterns; my @terms; my $c; while (defined($c = shift @_)) { next if $c eq ''; push @seen, $c; if ($c eq '(') { my $next_term = &parse_altern; push @terms, $next_term; } elsif ($c eq ')') { push @alterns, &joinup(CONCAT, @terms) if @terms; return &joinup(ALTERN, @alterns); } elsif ($c eq '|') { push @alterns, &joinup(CONCAT, @terms) if @terms; @terms = (); } elsif ($c eq '*' || $c eq '+') { if (@terms) { $terms[-1] = [ ($c eq '*' ? STAR : PLUS) => $terms[-1] ]; } else { $PARSE_ERROR = "Did not expect $c!\n\t@seen\n\t*\n\t@_\n"; return undef; } } elsif ($c eq '\\') { push @terms, [ LITERAL => (shift) ]; } else { push @terms, [ LITERAL => $c ]; } } # While there are tokens... push @alterns, &joinup(CONCAT, @terms) if @terms; return joinup(ALTERN, @alterns) if @alterns; return undef; } sub joinup { my $tag = shift; if (@_ == 1) { $_[0]; } else { [ $tag => [ @_ ] ]; } } package NFA; ################################################################ # # Compile parsed regexp into representation of NFA # ################################################################ $S = 'aa00'; $STARTSYMBOL = 0; $ENDSYMBOL = 1; sub new { compile(@_); } sub compile { my $pack = shift; my $rx = shift; my ($operator, $operands) = @$rx; # A literal has no suboperands to compile. # So invoke the special atom-compiler and return that result instead. if ($operator eq LITERAL) { return $pack->literal($operands); } my $startsym = "S" . &gensym(); my $endsym = "E" . &gensym(); my $result = { Symbols => [ $startsym, $endsym ] }; # Compile the sub-operands first. my @submachines; if ($operator eq STAR || $operator eq PLUS) { @submachines = ($pack->compile($operands)); } else { foreach $operand (@$operands) { push @submachines, $pack->compile($operand); } } if ($operator eq CONCAT) { return $submachines[0] if @submachines == 1; &putin($result, @submachines); my $i; for ($i = 0; $i < @submachines - 1; $i++) { my $tail = $submachines[$i] {Symbols}[$ENDSYMBOL]; my $head = $submachines[$i+1]{Symbols}[$STARTSYMBOL]; $result->{$tail} = { '' => $head }; } $result->{$startsym} = { '' => $submachines[0] {Symbols}[$STARTSYMBOL] }; $result->{$submachines[-1]{Symbols}[$ENDSYMBOL]} = { '' => $endsym }; } elsif ($operator eq STAR) { my $sm = $submachines[0]; &putin($result, $sm); my ($s, $e) = @{$sm->{Symbols}}; $result->{$e} = { '' => [$s, $endsym] }; $result->{$startsym} = { '' => [$s, $endsym] }; } elsif ($operator eq PLUS) { my $sm = $submachines[0]; &putin($result, $sm); my ($s, $e) = @{$sm->{Symbols}}; $result->{$e} = { '' => [$s, $endsym] }; $result->{$startsym} = { '' => $s }; } elsif ($operator eq ALTERN) { return $submachines[0] if @submachines == 1; &putin($result, @submachines); my @startsyms = map { $_->{Symbols}[$STARTSYMBOL] } @submachines; my @endsyms = map { $_->{Symbols}[$ENDSYMBOL] } @submachines; $result->{$startsym} = { '' => \@startsyms }; foreach $es (@endsyms) { $result->{$es} = { '' => $endsym }; } } else { warn "Bizarre oprerator `$operator' encountered.\n"; } bless $result => $pack; } sub start_state { $_[0]{Symbols}[$STARTSYMBOL]; } sub is_end_state { my $self = shift; my $state = shift; $state eq $self->{Symbols}[$ENDSYMBOL]; } sub transition_table { my $self = shift; my $state = shift; $self->{$state} || {}; } sub literal { my $pack = shift; my $what = shift; my $startsym = "S" . &gensym(); my $endsym = "E" . &gensym(); bless { Symbols => [ $startsym, $endsym ], $startsym => { $what => $endsym } }, => $pack; } # Given a list of machines, M1 ... Mn, put M2... Mn into M1. sub putin { my $master = shift; foreach $m (@_) { foreach $state (keys %$m) { next if $state eq 'Symbols'; if (exists $master->{$state}) { print STDERR "Warning: State name conflict for `$state'.\n"; } $master->{$state} = $m->{$state}; } } $master; } sub gensym { $S++; } ################################################################ # # Execute NFA on a given string # ################################################################ package NFA_Exec; sub match { my $pack = shift; my $nfa = shift; my $string = shift; my $machine = $pack->init($nfa, $string); $machine->run(); $machine->final_state(); } sub new { &init(@_); } # # Create a new execution of the specified NFS, and feed it # the specified string as its input. # sub init { my $pack = shift; my $nfa = shift; my $string = shift; my $self = {}; $self->{nfa} = $nfa; $self->{input} = $string; $self->{pos} = 0; $self->{states} = [ $self->{nfa}->start_state ]; bless $self => $pack; $self->epsilon_transit(); $self; } # # Run an execution to the end of the input # sub run { my $self = shift; until ($self->end_of_input() || $self->states() == 0) { $self->step; } } # # Is this execution object at the end of its input? # sub end_of_input { my $self = shift; $self->{pos}>= length($self->{input}); } # # Advance an execution by one step. # sub step { my $self = shift; my $next_symbol = substr($self->{input}, $self->{pos}, 1); if ($next_symbol eq '') { # error } else { $self->transit($next_symbol); $self->epsilon_transit(); } $self->{pos}++; } # # Perform e-transitions in an execution # sub epsilon_transit { my $self = shift; my @newstates = $self->states; my @result = @newstates; my %seen = map {($_ => 1)} @newstates; for (;;) { my $s; my @nextstates; foreach $s (@newstates) { my $nextstate = $self->{nfa}->transition_table($s)->{''}; next unless defined $nextstate; push @nextstates, ref $nextstate ? @$nextstate : $nextstate; } @newstates = grep {! $seen{$_}++} @nextstates; last unless @newstates; push @result, @newstates; } $self->{states} = \@result; } # # Perform a transition # sub transit { my $self = shift; my $symbol = shift; $self->{states} = $self->transition_table->{$symbol}; } # # Current states # sub states { my $self = shift; @{$self->{states}}; } # # Should we accept? # sub final_state { my $self = shift; my $s; foreach $s ($self->states) { return 1 if $self->{nfa}->is_end_state($s); } 0; } # # Get current transition table # This is interesting because we have to merge the transition # tables for several states. sub transition_table { my $self = shift; my $s; my %ttab; foreach $s ($self->states) { my $sub_ttab = $self->{nfa}->transition_table($s); my ($symbol, $next_state); while (($symbol, $next_state) = each %$sub_ttab) { push @{$ttab{$symbol}}, ref $next_state ? @$next_state : $next_state; } } \%ttab; } 1;

AltStyle によって変換されたページ (->オリジナル) /