Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 522a8b0

Browse files
staabmondrejmirtes
authored andcommitted
Fix regex parsing by completing the grammar
- Skip regex parsing if the pattern is not valid, but report errors then if parsing fails - Fix support for internal options to include more options, and fix []] (i.e. class end delimiter as first character is not a literal) not parsing correctly - Add a couple missing quantifiers in getQuantificationRange - Handle more literal values and fix escaped ones to be unescaped
1 parent 831f4cd commit 522a8b0

File tree

5 files changed

+177
-48
lines changed

5 files changed

+177
-48
lines changed

‎resources/RegexGrammar.pp‎

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,23 @@
4646
%skip nl \n
4747

4848
// Character classes.
49-
%token negative_class_ \[\^
50-
%token class_ \[
51-
%token _class \]
52-
%token range \-
49+
%token negative_class_ \[\^ -> class
50+
%token class_ \[ -> class
51+
%token class:posix_class \[:\^?[a-z]+:\]
52+
%token class:class_ \[
53+
%token class:_class_literal (?<=[^\\]\[|[^\\]\[\^)\]
54+
%token class:_class \] -> default
55+
%token class:range \-
56+
%token class:escaped_end_class \\\]
57+
// taken over from literals but class:character has \b support on top (backspace in character classes)
58+
%token class:character \\([aefnrtb]|c[\x00-\x7f])
59+
%token class:dynamic_character \\([0-7]{3}|x[0-9a-zA-Z]{2}|x{[0-9a-zA-Z]+})
60+
%token class:character_type \\([CdDhHNRsSvVwWX]|[pP]{[^}]+})
61+
%token class:literal \\.|.
5362

5463
// Internal options.
55-
%token internal_option \(\?[\-+]?[imsx]\)
64+
// See https://www.regular-expressions.info/refmodifiers.html
65+
%token internal_option \(\?([imsxnJUX^]|xx)?-?([imsxnJUX^]|xx)\)
5666

5767
// Lookahead and lookbehind assertions.
5868
%token lookahead_ \(\?=
@@ -77,6 +87,7 @@
7787
%token nc:_named_capturing > -> default
7888
%token nc:capturing_name .+?(?=(?<!\\)>)
7989
%token non_capturing_ \(\?:
90+
%token non_capturing_internal_option \(\?([imsxnJUX^]|xx)?-?([imsxnJUX^]|xx):
8091
%token non_capturing_reset_ \(\?\|
8192
%token atomic_group_ \(\?>
8293
%token capturing_ \(
@@ -168,7 +179,7 @@
168179
::negative_class_:: #negativeclass
169180
| ::class_::
170181
)
171-
( <class_> | range() | literal() )+
182+
( <range> | <_class_literal> )? ( <posix_class> | <class_> | range() | literal() | <escaped_end_class> )* <range>?
172183
::_class::
173184

174185
#range:
@@ -183,15 +194,18 @@
183194
| (
184195
::named_capturing_:: <capturing_name> ::_named_capturing:: #namedcapturing
185196
| ::non_capturing_:: #noncapturing
197+
| non_capturing_internal_options() #noncapturing
186198
| ::non_capturing_reset_:: #noncapturingreset
187199
| ::atomic_group_:: #atomicgroup
188200
| ::capturing_::
189201
)
190202
alternation() ::_capturing::
191203

204+
non_capturing_internal_options:
205+
<non_capturing_internal_option>
206+
192207
literal:
193208
<character>
194-
| <range>
195209
| <dynamic_character>
196210
| <character_type>
197211
| <anchor>

‎src/Type/Php/RegexArrayShapeMatcher.php‎

Lines changed: 89 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
use Hoa\Compiler\Llk\TreeNode;
88
use Hoa\Exception\Exception;
99
use Hoa\File\Read;
10+
use Nette\Utils\RegexpException;
1011
use Nette\Utils\Strings;
1112
use PhpParser\Node\Expr;
1213
use PhpParser\Node\Name;
@@ -31,7 +32,11 @@
3132
use function in_array;
3233
use function is_int;
3334
use function is_string;
35+
use function rtrim;
3436
use function sscanf;
37+
use function str_replace;
38+
use function strlen;
39+
use function substr;
3540
use const PREG_OFFSET_CAPTURE;
3641
use const PREG_UNMATCHED_AS_NULL;
3742

@@ -375,6 +380,13 @@ private function parseGroups(string $regex): ?array
375380
self::$parser = Llk::load(new Read(__DIR__ . '/../../../resources/RegexGrammar.pp'));
376381
}
377382

383+
try {
384+
Strings::match('', $regex);
385+
} catch (RegexpException) {
386+
// pattern is invalid, so let the RegularExpressionPatternRule report it
387+
return null;
388+
}
389+
378390
try {
379391
$ast = self::$parser->parse($regex);
380392
} catch (Exception) {
@@ -516,25 +528,37 @@ private function getQuantificationRange(TreeNode $node): array
516528
$lastChild = $node->getChild($node->getChildrenNumber() - 1);
517529
$value = $lastChild->getValue();
518530

519-
if ($value['token'] === 'n_to_m') {
520-
if (sscanf($value['value'], '{%d,%d}', $n, $m) !== 2 || !is_int($n) || !is_int($m)) {
531+
// normalize away possessive and lazy quantifier-modifiers
532+
$token = str_replace(['_possessive', '_lazy'], '', $value['token']);
533+
$value = rtrim($value['value'], '+?');
534+
535+
if ($token === 'n_to_m') {
536+
if (sscanf($value, '{%d,%d}', $n, $m) !== 2 || !is_int($n) || !is_int($m)) {
521537
throw new ShouldNotHappenException();
522538
}
523539

524540
$min = $n;
525541
$max = $m;
526-
} elseif ($value['token'] === 'exactly_n') {
527-
if (sscanf($value['value'], '{%d}', $n) !== 1 || !is_int($n)) {
542+
} elseif ($token === 'n_or_more') {
543+
if (sscanf($value, '{%d,}', $n) !== 1 || !is_int($n)) {
544+
throw new ShouldNotHappenException();
545+
}
546+
547+
$min = $n;
548+
} elseif ($token === 'exactly_n') {
549+
if (sscanf($value, '{%d}', $n) !== 1 || !is_int($n)) {
528550
throw new ShouldNotHappenException();
529551
}
530552

531553
$min = $n;
532554
$max = $n;
533-
} elseif ($value['token'] === 'zero_or_one') {
555+
} elseif ($token === 'zero_or_one') {
534556
$min = 0;
535557
$max = 1;
536-
} elseif ($value['token'] === 'zero_or_more') {
558+
} elseif ($token === 'zero_or_more') {
537559
$min = 0;
560+
} elseif ($token === 'one_or_more') {
561+
$min = 1;
538562
}
539563

540564
return [$min, $max];
@@ -591,20 +615,8 @@ private function walkGroupAst(TreeNode $ast, TrinaryLogic &$isNonEmpty, TrinaryL
591615
if ($literalValue !== null) {
592616
if (Strings::match($literalValue, '/^\d+$/') === null) {
593617
$isNumeric = TrinaryLogic::createNo();
594-
}
595-
596-
if (!$inOptionalQuantification) {
597-
$isNonEmpty = TrinaryLogic::createYes();
598-
}
599-
}
600-
601-
if ($ast->getValueToken() === 'character_type') {
602-
if ($ast->getValueValue() === '\d') {
603-
if ($isNumeric->maybe()) {
604-
$isNumeric = TrinaryLogic::createYes();
605-
}
606-
} else {
607-
$isNumeric = TrinaryLogic::createNo();
618+
} elseif ($isNumeric->maybe()) {
619+
$isNumeric = TrinaryLogic::createYes();
608620
}
609621

610622
if (!$inOptionalQuantification) {
@@ -624,7 +636,6 @@ private function walkGroupAst(TreeNode $ast, TrinaryLogic &$isNonEmpty, TrinaryL
624636
}
625637

626638
if (Strings::match($literalValue, '/^\d+$/') === null) {
627-
$allNumeric = false;
628639
break;
629640
}
630641

@@ -653,13 +664,65 @@ private function walkGroupAst(TreeNode $ast, TrinaryLogic &$isNonEmpty, TrinaryL
653664

654665
private function getLiteralValue(TreeNode $node): ?string
655666
{
656-
if ($node->getId() === 'token' && $node->getValueToken() === 'literal') {
657-
return $node->getValueValue();
667+
if ($node->getId() !== 'token') {
668+
return null;
669+
}
670+
671+
// token is the token name from grammar without the namespace so literal and class:literal are both called literal here
672+
$token = $node->getValueToken();
673+
$value = $node->getValueValue();
674+
675+
if (in_array($token, ['literal', 'escaped_end_class'], true)) {
676+
if (strlen($node->getValueValue()) > 1 && $value[0] === '\\') {
677+
return substr($value, 1);
678+
}
679+
680+
return $value;
681+
}
682+
683+
// literal "-" in front/back of a character class like '[-a-z]' or '[abc-]', not forming a range
684+
if ($token === 'range') {
685+
return $value;
686+
}
687+
688+
// literal "[" or "]" inside character classes '[[]' or '[]]'
689+
if (in_array($token, ['class_', '_class_literal'], true)) {
690+
return $value;
691+
}
692+
693+
// character escape sequences, just return a fixed string
694+
if (in_array($token, ['character', 'dynamic_character', 'character_type'], true)) {
695+
if ($token === 'character_type' && $value === '\d') {
696+
return '0';
697+
}
698+
699+
return $value;
700+
}
701+
702+
// [:digit:] and the like, more support coming later
703+
if ($token === 'posix_class') {
704+
if ($value === '[:digit:]') {
705+
return '0';
706+
}
707+
if (in_array($value, ['[:alpha:]', '[:alnum:]', '[:upper:]', '[:lower:]', '[:word:]', '[:ascii:]', '[:print:]', '[:xdigit:]', '[:graph:]'], true)) {
708+
return 'a';
709+
}
710+
if ($value === '[:blank:]') {
711+
return "\t";
712+
}
713+
if ($value === '[:cntrl:]') {
714+
return "\x00\x1F";
715+
}
716+
if ($value === '[:space:]') {
717+
return "\t\r\n\v\f";
718+
}
719+
if ($value === '[:punct:]') {
720+
return '!"#$%&\'()*+,\-./:;<=>?@[\]^_`{|}~';
721+
}
658722
}
659723

660-
// literal "-" outside of a character class like '~^((\\d{1,6})-)$~'
661-
if ($node->getId() === 'token' && $node->getValueToken() === 'range') {
662-
return $node->getValueValue();
724+
if ($token === 'anchor' || $token === 'match_point_reset') {
725+
return '';
663726
}
664727

665728
return null;

‎tests/PHPStan/Analyser/LegacyNodeScopeResolverTest.php‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8003,7 +8003,7 @@ public function dataPassedByReference(): array
80038003
'$arr',
80048004
],
80058005
[
8006-
'array{0?: string}',
8006+
'array<string>',
80078007
'$matches',
80088008
],
80098009
[

‎tests/PHPStan/Analyser/nsrt/preg_match_shapes.php‎

Lines changed: 58 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -127,28 +127,18 @@ function doUnknownFlags(string $s, int $flags): void {
127127
assertType('array<array{string|null, int<-1, max>}|string|null>', $matches);
128128
}
129129

130-
function doNonAutoCapturingModifier(string $s): void {
131-
if (preg_match('/(?n)(\d+)/', $s, $matches)) {
132-
// could be assertType('array{string}', $matches);
133-
assertType('array<string>', $matches);
134-
}
135-
assertType('array<string>', $matches);
136-
}
137-
138130
function doMultipleAlternativeCaptureGroupsWithSameNameWithModifier(string $s): void {
139131
if (preg_match('/(?J)(?<Foo>[a-z]+)|(?<Foo>[0-9]+)/', $s, $matches)) {
140-
// could be assertType('array{0: string, Foo: string, 1: string}', $matches);
141-
assertType('array<string>', $matches);
132+
assertType('array{0: string, Foo: numeric-string|non-empty-string, 1: non-empty-string, 2?: numeric-string}', $matches);
142133
}
143-
assertType('array<string>', $matches);
134+
assertType('array{}|array{0: string, Foo: numeric-string|non-empty-string, 1: non-empty-string, 2?: numeric-string}', $matches);
144135
}
145136

146137
function doMultipleConsecutiveCaptureGroupsWithSameNameWithModifier(string $s): void {
147138
if (preg_match('/(?J)(?<Foo>[a-z]+)|(?<Foo>[0-9]+)/', $s, $matches)) {
148-
// could be assertType('array{0: string, Foo: string, 1: string}', $matches);
149-
assertType('array<string>', $matches);
139+
assertType('array{0: string, Foo: numeric-string|non-empty-string, 1: non-empty-string, 2?: numeric-string}', $matches);
150140
}
151-
assertType('array<string>', $matches);
141+
assertType('array{}|array{0: string, Foo: numeric-string|non-empty-string, 1: non-empty-string, 2?: numeric-string}', $matches);
152142
}
153143

154144
// https://github.com/hoaproject/Regex/issues/31
@@ -472,3 +462,57 @@ function (string $s): void {
472462
assertType("array{string, non-empty-string}", $matches);
473463
}
474464
};
465+
466+
function bug11323(string $s): void {
467+
if (preg_match('/([*|+?{}()]+)([^*|+[:digit:]?{}()]+)/', $s, $matches)) {
468+
assertType('array{string, non-empty-string, non-empty-string}', $matches);
469+
}
470+
if (preg_match('/\p{L}[[\]]+([-*|+?{}(?:)]+)([^*|+[:digit:]?{a-z}(\p{L})\a-]+)/', $s, $matches)) {
471+
assertType('array{string, non-empty-string, non-empty-string}', $matches);
472+
}
473+
if (preg_match('{([-\p{L}[\]*|\x03\a\b+?{}(?:)-]+[^[:digit:]?{}a-z0-9#-k]+)(a-z)}', $s, $matches)) {
474+
assertType('array{string, non-empty-string, non-empty-string}', $matches);
475+
}
476+
if (preg_match('{(\d+)(?i)insensitive((?x-i)case SENSITIVE here(?i:insensitive non-capturing group))}', $s, $matches)) {
477+
assertType('array{string, numeric-string, non-empty-string}', $matches);
478+
}
479+
if (preg_match('{([]] [^]])}', $s, $matches)) {
480+
assertType('array{string, non-empty-string}', $matches);
481+
}
482+
if (preg_match('{([[:digit:]])}', $s, $matches)) {
483+
assertType('array{string, numeric-string}', $matches);
484+
}
485+
if (preg_match('{([\d])(\d)}', $s, $matches)) {
486+
assertType('array{string, numeric-string, numeric-string}', $matches);
487+
}
488+
if (preg_match('{([0-9])}', $s, $matches)) {
489+
assertType('array{string, numeric-string}', $matches);
490+
}
491+
if (preg_match('{(\p{L})(\p{P})(\p{Po})}', $s, $matches)) {
492+
assertType('array{string, non-empty-string, non-empty-string, non-empty-string}', $matches);
493+
}
494+
if (preg_match('{(a)??(b)*+(c++)(d)+?}', $s, $matches)) {
495+
assertType('array{string, string, string, non-empty-string, non-empty-string}', $matches);
496+
}
497+
if (preg_match('{(.\d)}', $s, $matches)) {
498+
assertType('array{string, non-empty-string}', $matches);
499+
}
500+
if (preg_match('{(\d.)}', $s, $matches)) {
501+
assertType('array{string, non-empty-string}', $matches);
502+
}
503+
if (preg_match('{(\d\d)}', $s, $matches)) {
504+
assertType('array{string, numeric-string}', $matches);
505+
}
506+
if (preg_match('{(.(\d))}', $s, $matches)) {
507+
assertType('array{string, non-empty-string, numeric-string}', $matches);
508+
}
509+
if (preg_match('{((\d).)}', $s, $matches)) {
510+
assertType('array{string, non-empty-string, numeric-string}', $matches);
511+
}
512+
if (preg_match('{(\d([1-4])\d)}', $s, $matches)) {
513+
assertType('array{string, numeric-string, numeric-string}', $matches);
514+
}
515+
if (preg_match('{(x?([1-4])\d)}', $s, $matches)) {
516+
assertType('array{string, non-empty-string, numeric-string}', $matches);
517+
}
518+
}

‎tests/PHPStan/Analyser/nsrt/preg_match_shapes_php80.php‎

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,11 @@ function doOffsetCaptureWithUnmatchedNull(string $s): void {
1111
}
1212
assertType('array{}|array{array{string|null, int<-1, max>}, array{non-empty-string|null, int<-1, max>}, array{non-empty-string|null, int<-1, max>}, array{non-empty-string|null, int<-1, max>}}', $matches);
1313
}
14+
15+
function doNonAutoCapturingModifier(string $s): void {
16+
if (preg_match('/(?n)(\d+)/', $s, $matches)) {
17+
// should be assertType('array{string}', $matches);
18+
assertType('array{string, numeric-string}', $matches);
19+
}
20+
assertType('array{}|array{string, numeric-string}', $matches);
21+
}

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /