Commit 935fef2

authored

Optimize DOM HTML serialization for UTF-8 (#16376)

* Use a direct call for decoding the UTF-8 buffer

* Add fast path for UTF-8 HTML serialization

This patch adds a fast path to the HTML serialization encoding that has to encode to UTF-8. Because the DOM internally represents all strings using UTF-8, we only need to validate here.

Tested on Wikipedia English home page on an i7-4790:

```
Benchmark 1: ./sapi/cli/php x.php
  Time (mean ± σ): 516.0 ms ± 6.4 ms [User: 511.2 ms, System: 3.5 ms]
  Range (min ... max): 506.0 ms ... 527.1 ms 10 runs

Benchmark 2: ./sapi/cli/php_old x.php
  Time (mean ± σ): 682.8 ms ± 6.5 ms [User: 676.8 ms, System: 3.8 ms]
  Range (min ... max): 675.8 ms ... 695.6 ms 10 runs

Summary
  ./sapi/cli/php x.php ran
    1.32 ± 0.02 times faster than ./sapi/cli/php_old x.php
```

(And if you're interested: it takes over a second on my machine using the old DOMDocument class)

Future optimizations are certainly possible, but let's start here.

1 parent 6dd67bb, commit 935fef2 (Copy full SHA for 935fef2)

File tree

1 file changed

+73

-5

lines changed

ext/dom
- html_document.c

1 file changed

+73

-5

lines changed

`‎ext/dom/html_document.c‎`

Lines changed: 73 additions & 5 deletions

Original file line number	Diff line number	Diff line change
`@@ -570,12 +570,11 @@ static bool dom_decode_encode_fast_path(`
`570`	`570`	`const lxb_char_t *buf_ref_backup = buf_ref;`
`571`	`571`	`lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);`
`572`	`572`	`if (UNEXPECTED(codepoint > LXB_ENCODING_MAX_CODEPOINT)) {`
`573`		`- size_t skip = buf_ref - buf_ref_backup; /* Skip invalid data, it's replaced by the UTF-8 replacement bytes */`
`574`	`573`	`if (!dom_process_parse_chunk(`
`575`	`574`	`ctx,`
`576`	`575`	`document,`
`577`	`576`	`parser,`
`578`		`- buf_ref - last_output-skip,`
	`577`	`+ buf_ref_backup - last_output,`
`579`	`578`	`last_output,`
`580`	`579`	`buf_ref - last_output,`
`581`	`580`	`tokenizer_error_offset,`
`@@ -1208,6 +1207,68 @@ static zend_result dom_write_output_stream(void *application_data, const char *b`
`1208`	`1207`	`return SUCCESS;`
`1209`	`1208`	`}`
`1210`	`1209`
	`1210`	`+/* Fast path when the output encoding is UTF-8 */`
	`1211`	`+static zend_result dom_saveHTML_write_string_len_utf8_output(void *application_data, const char *buf, size_t len)`
	`1212`	`+{`
	`1213`	`+ dom_output_ctx *output = (dom_output_ctx *) application_data;`
	`1214`	`+`
	`1215`	`+ output->decode->status = LXB_STATUS_OK;`
	`1216`	`+`
	`1217`	`+ const lxb_char_t *buf_ref = (const lxb_char_t *) buf;`
	`1218`	`+ const lxb_char_t *last_output = buf_ref;`
	`1219`	`+ const lxb_char_t *buf_end = buf_ref + len;`
	`1220`	`+`
	`1221`	`+ while (buf_ref != buf_end) {`
	`1222`	`+ const lxb_char_t *buf_ref_backup = buf_ref;`
	`1223`	`+ lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(output->decode, &buf_ref, buf_end);`
	`1224`	`+ if (UNEXPECTED(codepoint > LXB_ENCODING_MAX_CODEPOINT)) {`
	`1225`	`+ if (UNEXPECTED(output->write_output(`
	`1226`	`+ output->output_data,`
	`1227`	`+ (const char *) last_output,`
	`1228`	`+ buf_ref_backup - last_output`
	`1229`	`+ ) != SUCCESS)) {`
	`1230`	`+ return FAILURE;`
	`1231`	`+ }`
	`1232`	`+`
	`1233`	`+ if (codepoint == LXB_ENCODING_DECODE_CONTINUE) {`
	`1234`	`+ ZEND_ASSERT(buf_ref == buf_end);`
	`1235`	`+ /* The decoder needs more data but the entire buffer is consumed.`
	`1236`	`+ * All valid data is outputted, and if the remaining data for the code point`
	`1237`	`+ * is invalid, the next call will output the replacement bytes. */`
	`1238`	`+ output->decode->status = LXB_STATUS_CONTINUE;`
	`1239`	`+ return SUCCESS;`
	`1240`	`+ }`
	`1241`	`+`
	`1242`	`+ if (UNEXPECTED(output->write_output(`
	`1243`	`+ output->output_data,`
	`1244`	`+ (const char *) LXB_ENCODING_REPLACEMENT_BYTES,`
	`1245`	`+ LXB_ENCODING_REPLACEMENT_SIZE`
	`1246`	`+ ) != SUCCESS)) {`
	`1247`	`+ return FAILURE;`
	`1248`	`+ }`
	`1249`	`+`
	`1250`	`+ last_output = buf_ref;`
	`1251`	`+ }`
	`1252`	`+ }`
	`1253`	`+`
	`1254`	`+ if (buf_ref != last_output) {`
	`1255`	`+ if (UNEXPECTED(output->write_output(`
	`1256`	`+ output->output_data,`
	`1257`	`+ (const char *) last_output,`
	`1258`	`+ buf_ref - last_output`
	`1259`	`+ ) != SUCCESS)) {`
	`1260`	`+ return FAILURE;`
	`1261`	`+ }`
	`1262`	`+ }`
	`1263`	`+`
	`1264`	`+ return SUCCESS;`
	`1265`	`+}`
	`1266`	`+`
	`1267`	`+static zend_result dom_saveHTML_write_string_utf8_output(void *application_data, const char *buf)`
	`1268`	`+{`
	`1269`	`+ return dom_saveHTML_write_string_len_utf8_output(application_data, buf, strlen(buf));`
	`1270`	`+}`
	`1271`	`+`
`1211`	`1272`	`static zend_result dom_saveHTML_write_string_len(void *application_data, const char *buf, size_t len)`
`1212`	`1273`	`{`
`1213`	`1274`	`dom_output_ctx *output = (dom_output_ctx *) application_data;`
`@@ -1216,7 +1277,7 @@ static zend_result dom_saveHTML_write_string_len(void *application_data, const c`
`1216`	`1277`	`const lxb_char_t *buf_end = buf_ref + len;`
`1217`	`1278`
`1218`	`1279`	`do {`
`1219`		`- decode_status = output->decoding_data->decode(output->decode, &buf_ref, buf_end);`
	`1280`	`+ decode_status = lxb_encoding_decode_utf_8(output->decode, &buf_ref, buf_end);`
`1220`	`1281`
`1221`	`1282`	`const lxb_codepoint_t *codepoints_ref = output->codepoints;`
`1222`	`1283`	`const lxb_codepoint_t *codepoints_end = codepoints_ref + lxb_encoding_decode_buf_used(output->decode);`
`@@ -1272,8 +1333,15 @@ static zend_result dom_common_save(dom_output_ctx *output_ctx, dom_object *inter`
`1272`	`1333`	`output_ctx->encoding_output = encoding_output;`
`1273`	`1334`
`1274`	`1335`	`dom_html5_serialize_context ctx;`
`1275`		`- ctx.write_string_len = dom_saveHTML_write_string_len;`
`1276`		`- ctx.write_string = dom_saveHTML_write_string;`
	`1336`	`+ if (encoding_data->encoding == LXB_ENCODING_UTF_8) {`
	`1337`	`+ /* Fast path */`
	`1338`	`+ ctx.write_string_len = dom_saveHTML_write_string_len_utf8_output;`
	`1339`	`+ ctx.write_string = dom_saveHTML_write_string_utf8_output;`
	`1340`	`+ } else {`
	`1341`	`+ /* Slow path */`
	`1342`	`+ ctx.write_string_len = dom_saveHTML_write_string_len;`
	`1343`	`+ ctx.write_string = dom_saveHTML_write_string;`
	`1344`	`+ }`
`1277`	`1345`	`ctx.application_data = output_ctx;`
`1278`	`1346`	`ctx.private_data = php_dom_get_private_data(intern);`
`1279`	`1347`	`if (UNEXPECTED(dom_html5_serialize_outer(&ctx, node) != SUCCESS)) {`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 935fef2

File tree

1 file changed

1 file changed

`‎ext/dom/html_document.c‎`

0 commit comments