Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 935fef2

Browse files
Optimize DOM HTML serialization for UTF-8 (#16376)
* Use a direct call for decoding the UTF-8 buffer

* Add fast path for UTF-8 HTML serialization

This patch adds a fast path to the HTML serialization encoding that has to encode to UTF-8. Because the DOM internally represents all strings using UTF-8, we only need to validate here.

Tested on Wikipedia English home page on an i7-4790:

```
Benchmark 1: ./sapi/cli/php x.php
  Time (mean ± σ): 516.0 ms ± 6.4 ms [User: 511.2 ms, System: 3.5 ms]
  Range (min ... max): 506.0 ms ... 527.1 ms 10 runs

Benchmark 2: ./sapi/cli/php_old x.php
  Time (mean ± σ): 682.8 ms ± 6.5 ms [User: 676.8 ms, System: 3.8 ms]
  Range (min ... max): 675.8 ms ... 695.6 ms 10 runs

Summary
  ./sapi/cli/php x.php ran 1.32 ± 0.02 times faster than ./sapi/cli/php_old x.php
```

(And if you're interested: it takes over a second on my machine using the old DOMDocument class)

Future optimizations are certainly possible, but let's start here.
1 parent 6dd67bb commit 935fef2

File tree

1 file changed

+73
-5
lines changed

1 file changed

+73
-5
lines changed

‎ext/dom/html_document.c‎

Lines changed: 73 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -570,12 +570,11 @@ static bool dom_decode_encode_fast_path(
570570
const lxb_char_t *buf_ref_backup = buf_ref;
571571
lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
572572
if (UNEXPECTED(codepoint > LXB_ENCODING_MAX_CODEPOINT)) {
573-
size_t skip = buf_ref - buf_ref_backup; /* Skip invalid data, it's replaced by the UTF-8 replacement bytes */
574573
if (!dom_process_parse_chunk(
575574
ctx,
576575
document,
577576
parser,
578-
buf_ref - last_output-skip,
577+
buf_ref_backup - last_output,
579578
last_output,
580579
buf_ref - last_output,
581580
tokenizer_error_offset,
@@ -1208,6 +1207,68 @@ static zend_result dom_write_output_stream(void *application_data, const char *b
12081207
return SUCCESS;
12091208
}
12101209

1210+
/* Fast path when the output encoding is UTF-8.
 *
 * Instead of decoding to code points and re-encoding them, this walks the
 * `len` bytes at `buf` one code point at a time, only to validate them, and
 * forwards the bytes unchanged through output->write_output.
 *
 * - application_data: the dom_output_ctx for this serialization.
 * - buf / len: the bytes to write.
 *
 * Each sequence the decoder rejects is replaced by
 * LXB_ENCODING_REPLACEMENT_BYTES. If the decoder asks for more data at the end
 * of the buffer, output->decode->status is left as LXB_STATUS_CONTINUE so the
 * next call can complete the pending sequence.
 *
 * Returns SUCCESS, or FAILURE as soon as a write_output call does not return
 * SUCCESS. */
1211+
static zend_result dom_saveHTML_write_string_len_utf8_output(void *application_data, const char *buf, size_t len)
1212+
{
1213+
dom_output_ctx *output = (dom_output_ctx *) application_data;
1214+
1215+
output->decode->status = LXB_STATUS_OK;
1216+
1217+
const lxb_char_t *buf_ref = (const lxb_char_t *) buf;
1218+
const lxb_char_t *last_output = buf_ref;
1219+
const lxb_char_t *buf_end = buf_ref + len;
1220+
1221+
while (buf_ref != buf_end) {
1222+
const lxb_char_t *buf_ref_backup = buf_ref;
1223+
lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(output->decode, &buf_ref, buf_end);
1224+
if (UNEXPECTED(codepoint > LXB_ENCODING_MAX_CODEPOINT)) {
1225+
/* Flush the bytes between last_output and the start of the sequence that was just attempted. */
if (UNEXPECTED(output->write_output(
1226+
output->output_data,
1227+
(const char *) last_output,
1228+
buf_ref_backup - last_output
1229+
) != SUCCESS)) {
1230+
return FAILURE;
1231+
}
1232+
1233+
if (codepoint == LXB_ENCODING_DECODE_CONTINUE) {
1234+
ZEND_ASSERT(buf_ref == buf_end);
1235+
/* The decoder needs more data but the entire buffer is consumed.
1236+
* All valid data is outputted, and if the remaining data for the code point
1237+
* is invalid, the next call will output the replacement bytes. */
1238+
output->decode->status = LXB_STATUS_CONTINUE;
1239+
return SUCCESS;
1240+
}
1241+
1242+
if (UNEXPECTED(output->write_output(
1243+
output->output_data,
1244+
(const char *) LXB_ENCODING_REPLACEMENT_BYTES,
1245+
LXB_ENCODING_REPLACEMENT_SIZE
1246+
) != SUCCESS)) {
1247+
return FAILURE;
1248+
}
1249+
1250+
/* Resume from buf_ref as left by the decoder, so the rejected bytes are not written. */
last_output = buf_ref;
1251+
}
1252+
}
1253+
1254+
/* Flush whatever has not been written yet. */
if (buf_ref != last_output) {
1255+
if (UNEXPECTED(output->write_output(
1256+
output->output_data,
1257+
(const char *) last_output,
1258+
buf_ref - last_output
1259+
) != SUCCESS)) {
1260+
return FAILURE;
1261+
}
1262+
}
1263+
1264+
return SUCCESS;
1265+
}
1266+
1267+
/* NUL-terminated variant of the UTF-8 fast path: measures `buf` with strlen()
 * and forwards to dom_saveHTML_write_string_len_utf8_output(), returning its
 * result unchanged. */
static zend_result dom_saveHTML_write_string_utf8_output(void *application_data, const char *buf)
1268+
{
1269+
return dom_saveHTML_write_string_len_utf8_output(application_data, buf, strlen(buf));
1270+
}
1271+
12111272
static zend_result dom_saveHTML_write_string_len(void *application_data, const char *buf, size_t len)
12121273
{
12131274
dom_output_ctx *output = (dom_output_ctx *) application_data;
@@ -1216,7 +1277,7 @@ static zend_result dom_saveHTML_write_string_len(void *application_data, const c
12161277
const lxb_char_t *buf_end = buf_ref + len;
12171278

12181279
do {
1219-
decode_status = output->decoding_data->decode(output->decode, &buf_ref, buf_end);
1280+
decode_status = lxb_encoding_decode_utf_8(output->decode, &buf_ref, buf_end);
12201281

12211282
const lxb_codepoint_t *codepoints_ref = output->codepoints;
12221283
const lxb_codepoint_t *codepoints_end = codepoints_ref + lxb_encoding_decode_buf_used(output->decode);
@@ -1272,8 +1333,15 @@ static zend_result dom_common_save(dom_output_ctx *output_ctx, dom_object *inter
12721333
output_ctx->encoding_output = encoding_output;
12731334

12741335
dom_html5_serialize_context ctx;
1275-
ctx.write_string_len = dom_saveHTML_write_string_len;
1276-
ctx.write_string = dom_saveHTML_write_string;
1336+
if (encoding_data->encoding == LXB_ENCODING_UTF_8) {
1337+
/* Fast path */
1338+
ctx.write_string_len = dom_saveHTML_write_string_len_utf8_output;
1339+
ctx.write_string = dom_saveHTML_write_string_utf8_output;
1340+
} else {
1341+
/* Slow path */
1342+
ctx.write_string_len = dom_saveHTML_write_string_len;
1343+
ctx.write_string = dom_saveHTML_write_string;
1344+
}
12771345
ctx.application_data = output_ctx;
12781346
ctx.private_data = php_dom_get_private_data(intern);
12791347
if (UNEXPECTED(dom_html5_serialize_outer(&ctx, node) != SUCCESS)) {

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /