Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 8f9322b

Browse files
UTF-8 validate strings before interning
1 parent bdd782e commit 8f9322b

File tree

6 files changed

+62
-15
lines changed

6 files changed

+62
-15
lines changed

‎Zend/zend_string.c

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,10 @@ static zend_always_inline zend_string *zend_add_interned_string(zend_string *str
180180
GC_SET_REFCOUNT(str, 1);
181181
GC_ADD_FLAGS(str, IS_STR_INTERNED | flags);
182182

183+
if (!ZSTR_IS_VALID_UTF8(str) && zend_string_validate_utf8(str)) {
184+
GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
185+
}
186+
183187
ZVAL_INTERNED_STR(&val, str);
184188

185189
zend_hash_add_new(interned_strings, str, &val);
@@ -493,3 +497,45 @@ ZEND_API zend_string *zend_string_concat3(
493497

494498
return res;
495499
}
500+
501+
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
502+
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
503+
// https://stackoverflow.com/a/22135005/1320374
504+
505+
/*
 * Distinguished states of the UTF-8 validation automaton driven by
 * zend_string_validate_utf8(). The numeric values are significant: they are
 * the state numbers stored in, and used to index, the utf8d table.
 */
enum {
	UTF8_ACCEPT = 0, /* start state; also reached whenever a sequence is complete */
	UTF8_REJECT = 1  /* input cannot be valid; scanning stops once this is reached */
};
509+
510+
/*
 * Lookup table for the UTF-8 validation automaton.
 *
 * Entries 0..255 map an input byte to its character class. The entries from
 * index 256 onwards are the transition rows, 16 entries per state (s0..s8),
 * and are read as utf8d[256 + state * 16 + class].
 */
static const uint8_t utf8d[] = {
	/* byte -> character class */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 00..0f */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 10..1f */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 20..2f */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 30..3f */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 40..4f */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 50..5f */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 60..6f */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 70..7f */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 80..8f */
	9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /* 90..9f */
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* a0..af */
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* b0..bf */
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* c0..cf */
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* d0..df */
	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, /* e0..ef */
	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, /* f0..ff */

	/* (state, class) -> next state */
	0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, /* s0 */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* s1 */
	1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, /* s2 */
	1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1, /* s3 */
	1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, /* s4 */
	1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1, /* s5 */
	1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, /* s6 */
	1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1, /* s7 */
	1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* s8 */
};
526+
527+
/*
 * Check whether the contents of `string` are well-formed UTF-8.
 *
 * Feeds the ZSTR_LEN(string) bytes starting at ZSTR_VAL(string) through the
 * utf8d automaton, one byte at a time. Returns true only if the automaton is
 * in UTF8_ACCEPT after the last byte, so an empty string is valid and a string
 * that ends in the middle of a multi-byte sequence is not.
 *
 * The string is only read: no flag is set on it here, so a caller that wants
 * to cache the result has to add IS_STR_VALID_UTF8 itself.
 */
ZEND_API bool zend_string_validate_utf8(zend_string *string) {
	const unsigned char *cursor = (const unsigned char *) ZSTR_VAL(string);
	const unsigned char *const end = cursor + ZSTR_LEN(string);
	uint32_t dfa_state = UTF8_ACCEPT;

	while (cursor < end) {
		/* First 256 table entries classify the byte; the rest are transitions. */
		const uint32_t byte_class = utf8d[*cursor++];
		dfa_state = utf8d[256 + dfa_state * 16 + byte_class];

		if (dfa_state == UTF8_REJECT) {
			/* No later byte can lead back to UTF8_ACCEPT, so stop scanning. */
			return false;
		}
	}

	return dfa_state == UTF8_ACCEPT;
}

‎Zend/zend_string.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ ZEND_API extern zend_string *zend_empty_string;
7878
ZEND_API extern zend_string *zend_one_char_string[256];
7979
ZEND_API extern zend_string **zend_known_strings;
8080

81+
/* Returns true if the ZSTR_LEN(string) bytes of `string` form well-formed UTF-8
 * (an empty string counts as valid). Only reads the string; it does not set
 * IS_STR_VALID_UTF8 on it. */
ZEND_API bool zend_string_validate_utf8(zend_string *string);
82+
8183
END_EXTERN_C()
8284

8385
/* Shortcuts */

‎ext/mbstring/mbstring.c

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1804,7 +1804,7 @@ static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
18041804
unsigned int char_len = encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
18051805
if (char_len) {
18061806
return ZSTR_LEN(string) / char_len;
1807-
} else if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && GC_FLAGS(string)&IS_STR_VALID_UTF8) {
1807+
} else if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && ZSTR_IS_VALID_UTF8(string)) {
18081808
return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string));
18091809
}
18101810

@@ -2254,7 +2254,7 @@ PHP_FUNCTION(mb_substr_count)
22542254
if (php_mb_is_no_encoding_utf8(enc->no_encoding)) {
22552255
/* No need to do any conversion if haystack/needle are already known-valid UTF-8
22562256
* (If they are not valid, then not passing them through conversion filters could affect output) */
2257-
if (GC_FLAGS(haystack)&IS_STR_VALID_UTF8) {
2257+
if (ZSTR_IS_VALID_UTF8(haystack)) {
22582258
haystack_u8 = haystack;
22592259
} else {
22602260
unsigned int num_errors = 0;
@@ -2264,7 +2264,7 @@ PHP_FUNCTION(mb_substr_count)
22642264
}
22652265
}
22662266

2267-
if (GC_FLAGS(needle)&IS_STR_VALID_UTF8) {
2267+
if (ZSTR_IS_VALID_UTF8(needle)) {
22682268
needle_u8 = needle;
22692269
} else {
22702270
unsigned int num_errors = 0;
@@ -3152,7 +3152,7 @@ PHP_FUNCTION(mb_detect_encoding)
31523152
strict = MBSTRG(strict_detection);
31533153
}
31543154

3155-
if (size == 1 && *elist == &mbfl_encoding_utf8 && (GC_FLAGS(str) &IS_STR_VALID_UTF8)) {
3155+
if (size == 1 && *elist == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
31563156
ret = &mbfl_encoding_utf8;
31573157
} else {
31583158
ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict);
@@ -5172,11 +5172,13 @@ static bool mb_fast_check_utf8_avx2(zend_string *str)
51725172
static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding)
51735173
{
51745174
if (encoding == &mbfl_encoding_utf8) {
5175-
if (GC_FLAGS(str)&IS_STR_VALID_UTF8) {
5175+
if (ZSTR_IS_VALID_UTF8(str)) {
51765176
return true;
5177+
} else if (ZSTR_IS_INTERNED(str)) {
5178+
return false;
51775179
}
51785180
bool result = mb_fast_check_utf8(str);
5179-
if (result&& !ZSTR_IS_INTERNED(str)) {
5181+
if (result) {
51805182
GC_ADD_FLAGS(str, IS_STR_VALID_UTF8);
51815183
}
51825184
return result;
@@ -5439,7 +5441,7 @@ PHP_FUNCTION(mb_scrub)
54395441
RETURN_THROWS();
54405442
}
54415443

5442-
if (enc == &mbfl_encoding_utf8 && (GC_FLAGS(str) &IS_STR_VALID_UTF8)) {
5444+
if (enc == &mbfl_encoding_utf8 && ZSTR_IS_VALID_UTF8(str)) {
54435445
/* A valid UTF-8 string will not be changed by mb_scrub; so just increment the refcount and return it */
54445446
RETURN_STR_COPY(str);
54455447
}

‎ext/opcache/ZendAccelerator.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -550,6 +550,9 @@ zend_string* ZEND_FASTCALL accel_new_interned_string(zend_string *str)
550550
*hash_slot = STRTAB_STR_TO_POS(&ZCSG(interned_strings), s);
551551
GC_SET_REFCOUNT(s, 2);
552552
GC_TYPE_INFO(s) = GC_STRING | ((IS_STR_INTERNED | IS_STR_PERMANENT) << GC_FLAGS_SHIFT)| (ZSTR_IS_VALID_UTF8(str) ? IS_STR_VALID_UTF8 : 0);
553+
if (!ZSTR_IS_VALID_UTF8(s) && zend_string_validate_utf8(str)) {
554+
GC_ADD_FLAGS(s, IS_STR_VALID_UTF8);
555+
}
553556
ZSTR_H(s) = h;
554557
ZSTR_LEN(s) = ZSTR_LEN(str);
555558
memcpy(ZSTR_VAL(s), ZSTR_VAL(str), ZSTR_LEN(s) + 1);

‎ext/pcre/php_pcre.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1125,7 +1125,7 @@ static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ *
11251125

11261126
static zend_always_inline bool is_known_valid_utf8(
11271127
zend_string *subject_str, PCRE2_SIZE start_offset) {
1128-
if (!(GC_FLAGS(subject_str) &IS_STR_VALID_UTF8)) {
1128+
if (!ZSTR_IS_VALID_UTF8(subject_str)) {
11291129
/* We don't know whether the string is valid UTF-8 or not. */
11301130
return 0;
11311131
}

‎ext/zend_test/tests/strings_marked_as_utf8.phpt

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,6 @@ $s = "f" . "o";
4747
var_dump($s);
4848
var_dump(zend_test_is_string_marked_as_valid_utf8($s));
4949

50-
// The "foo" string matches with a "Foo" class which is registered by the zend_test extension.
51-
// That class name does not have the "valid UTF-8" flag because class names in general
52-
// don't have to be UTF-8. As the "foo" string here goes through the interning logic,
53-
// the string gets replaced by the "foo" string from the class, which does
54-
// not have the "valid UTF-8" flag. We therefore choose a different test case: "fxo".
55-
// The previous "foo" test case works because it is not interned.
5650
echo "Multiple concatenation known valid UTF-8 in assignment:\n";
5751
$s = "f" . "o" . "o";
5852
var_dump($s);
@@ -167,7 +161,7 @@ string(2) "fo"
167161
bool(true)
168162
Multiple concatenation known valid UTF-8 in assignment:
169163
string(3) "foo"
170-
bool(false)
164+
bool(true)
171165
string(3) "fxo"
172166
bool(true)
173167
Concatenation known valid UTF-8 string with empty string in variables:

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /