From 5d85f73ad234e4d677b384d02e477a0777fff37b Mon Sep 17 00:00:00 2001 From: "FUJI Goro (gfx)" Date: 2019εΉ΄5月29ζ—₯ 10:19:31 +0900 Subject: [PATCH 1/3] improve: more smart selection of string decoding functions --- benchmark/decode-string.ts | 77 ++++++++++++++++++-------------------- src/Decoder.ts | 14 ++++--- src/utils/utf8.ts | 62 ++++++++++++++---------------- src/wasmFunctions.ts | 21 +++++++++-- 4 files changed, 92 insertions(+), 82 deletions(-) diff --git a/benchmark/decode-string.ts b/benchmark/decode-string.ts index 331a9e05..ed4813ee 100644 --- a/benchmark/decode-string.ts +++ b/benchmark/decode-string.ts @@ -1,50 +1,45 @@ /* eslint-disable no-console */ -import { utf8Encode, utf8Count, utf8Decode } from "../src/utils/utf8"; +import { utf8Encode, utf8Count, utf8DecodeJs, utf8DecodeTD } from "../src/utils/utf8"; import { utf8DecodeWasm } from "../src/wasmFunctions"; // @ts-ignore import Benchmark from "benchmark"; -const textDecoder = new TextDecoder(); - -const dataSet = [10, 100, 200, 1_000, 10_000, 100_000].map((n) => { - return "a".repeat(n); -}); - -for (const str of dataSet) { - const byteLength = utf8Count(str); - const bytes = new Uint8Array(new ArrayBuffer(byteLength)); - utf8Encode(str, bytes, 0); - - console.log(`\n## string length=${str.length} byteLength=${byteLength}\n`); - - const suite = new Benchmark.Suite(); - - const N = Math.round(100_0000 / str.length); - - // use the result to avoid void-context optimizations - let count = 0; - - suite.add("utf8Decode", () => { - if (utf8Decode(bytes, 0, byteLength) !== str) { - throw new Error("wrong result!"); - } - }); - - suite.add("utf8DecodeWasm", () => { - if (utf8DecodeWasm(bytes, 0, byteLength) !== str) { - throw new Error("wrong result!"); - } - }); - - suite.add("TextDecoder", () => { - if (textDecoder.decode(bytes.subarray(0, byteLength)) !== str) { - throw new Error("wrong result!"); - } - }); - suite.on("cycle", (event: any) => { - console.log(String(event.target)); +for (const baseStr 
of ["A", "あ", "🌏"]) { + const dataSet = [10, 100, 200, 1_000, 10_000, 100_000].map((n) => { + return baseStr.repeat(n); }); - suite.run(); + for (const str of dataSet) { + const byteLength = utf8Count(str); + const bytes = new Uint8Array(new ArrayBuffer(byteLength)); + utf8Encode(str, bytes, 0); + + console.log(`\n## string "${baseStr}" x ${str.length} (byteLength=${byteLength})\n`); + + const suite = new Benchmark.Suite(); + + suite.add("utf8DecodeJs", () => { + if (utf8DecodeJs(bytes, 0, byteLength) !== str) { + throw new Error("wrong result!"); + } + }); + + suite.add("utf8DecodeWasm", () => { + if (utf8DecodeWasm(bytes, 0, byteLength) !== str) { + throw new Error("wrong result!"); + } + }); + + suite.add("TextDecoder", () => { + if (utf8DecodeTD(bytes, 0, byteLength) !== str) { + throw new Error("wrong result!"); + } + }); + suite.on("cycle", (event: any) => { + console.log(String(event.target)); + }); + + suite.run(); + } } diff --git a/src/Decoder.ts b/src/Decoder.ts index c6c29b63..b8ab4f32 100644 --- a/src/Decoder.ts +++ b/src/Decoder.ts @@ -1,7 +1,7 @@ import { prettyByte } from "./utils/prettyByte"; import { ExtensionCodec } from "./ExtensionCodec"; import { getInt64, getUint64 } from "./utils/int"; -import { utf8Decode } from "./utils/utf8"; +import { utf8DecodeJs, TEXT_DECODER_AVAILABLE, TEXT_DECODER_THRESHOLD, utf8DecodeTD } from "./utils/utf8"; import { createDataView, ensureUint8Array } from "./utils/typedArrays"; import { WASM_AVAILABLE, WASM_STR_THRESHOLD, utf8DecodeWasm } from "./wasmFunctions"; @@ -400,10 +400,14 @@ export class Decoder { } const offset = this.pos + headerOffset; - const object = - WASM_AVAILABLE && byteLength> WASM_STR_THRESHOLD - ? 
utf8DecodeWasm(this.bytes, offset, byteLength) - : utf8Decode(this.bytes, offset, byteLength); + let object: string; + if (TEXT_DECODER_AVAILABLE && byteLength> TEXT_DECODER_THRESHOLD) { + object = utf8DecodeTD(this.bytes, offset, byteLength); + } else if (WASM_AVAILABLE && byteLength> WASM_STR_THRESHOLD) { + object = utf8DecodeWasm(this.bytes, offset, byteLength); + } else { + object = utf8DecodeJs(this.bytes, offset, byteLength); + } this.pos += headerOffset + byteLength; return object; } diff --git a/src/utils/utf8.ts b/src/utils/utf8.ts index cf5a6112..a64f5113 100644 --- a/src/utils/utf8.ts +++ b/src/utils/utf8.ts @@ -83,49 +83,26 @@ export function utf8Encode(str: string, output: Uint8Array, outputOffset: number const CHUNK_SIZE = 0x10_000; -export function safeStringFromCharCode(units: Array | Uint16Array) { - if (units.length <= CHUNK_SIZE) { - // `String.fromCharCode.apply()` is faster than `String.fromCharCode(...units)` - // in case `units` is a typed array - return String.fromCharCode.apply(String, units as any); - } - - let result = ""; - for (let i = 0; i < units.length; i++) { - const chunk = units.slice(i * CHUNK_SIZE, (i + 1) * CHUNK_SIZE); - result += String.fromCharCode.apply(String, chunk as any); - } - return result; -} - -const MIN_TEXT_DECODER_STRING_LENGTH = 200; -const defaultEncoding = "utf-8"; -const sharedTextDecoder = typeof TextDecoder !== "undefined" ? 
new TextDecoder(defaultEncoding) : null; - -export function utf8Decode(bytes: Uint8Array, inputOffset: number, byteLength: number): string { +export function utf8DecodeJs(bytes: Uint8Array, inputOffset: number, byteLength: number): string { let offset = inputOffset; const end = offset + byteLength; - if (sharedTextDecoder !== null && byteLength> MIN_TEXT_DECODER_STRING_LENGTH) { - const stringBytes = bytes.subarray(offset, end); - return sharedTextDecoder.decode(stringBytes); - } - - const out: Array = []; + const units: Array = []; + let result = ""; while (offset < end) { const byte1 = bytes[offset++]; if ((byte1 & 0x80) === 0) { // 1 byte - out.push(byte1); + units.push(byte1); } else if ((byte1 & 0xe0) === 0xc0) { // 2 bytes const byte2 = bytes[offset++] & 0x3f; - out.push(((byte1 & 0x1f) << 6) | byte2); + units.push(((byte1 & 0x1f) << 6) | byte2); } else if ((byte1 & 0xf0) === 0xe0) { // 3 bytes const byte2 = bytes[offset++] & 0x3f; const byte3 = bytes[offset++] & 0x3f; - out.push(((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3); + units.push(((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3); } else if ((byte1 & 0xf8) === 0xf0) { // 4 bytes const byte2 = bytes[offset++] & 0x3f; @@ -134,14 +111,33 @@ export function utf8Decode(bytes: Uint8Array, inputOffset: number, byteLength: n let unit = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0c) | (byte3 << 0x06) | byte4; if (unit> 0xffff) { unit -= 0x10000; - out.push(((unit>>> 10) & 0x3ff) | 0xd800); + units.push(((unit>>> 10) & 0x3ff) | 0xd800); unit = 0xdc00 | (unit & 0x3ff); } - out.push(unit); + units.push(unit); } else { - out.push(byte1); + units.push(byte1); + } + + if (units.length - 4>= CHUNK_SIZE) { + result += String.fromCharCode(...units); + units.length = 0; } } - return safeStringFromCharCode(out); + if (units.length> 0) { + result += String.fromCharCode(...units); + } + + return result; +} + +const sharedTextDecoder = typeof TextDecoder !== "undefined" ? 
new TextDecoder() : null; +export const TEXT_DECODER_AVAILABLE = process.env.TEXT_DECODER !== "never" && !!sharedTextDecoder; +export const TEXT_DECODER_THRESHOLD = 200; + +export function utf8DecodeTD(bytes: Uint8Array, inputOffset: number, byteLength: number): string { + const stringBytes = bytes.subarray(inputOffset, inputOffset + byteLength); + // eslint-disable-next-line @typescript-eslint/no-non-null-assertion + return sharedTextDecoder!.decode(stringBytes); } diff --git a/src/wasmFunctions.ts b/src/wasmFunctions.ts index 0c6e14f8..5e92b4f0 100644 --- a/src/wasmFunctions.ts +++ b/src/wasmFunctions.ts @@ -1,5 +1,3 @@ -import { safeStringFromCharCode } from "./utils/utf8"; - // WASM=never - disable WASM functions // WASM=force - force to use WASM functions const WASM: string = process.env.MSGPACK_WASM || process.env.WASM || ""; @@ -63,6 +61,23 @@ export function utf8EncodeWasm(str: string, output: Uint8Array, outputOffset: nu } } +const CHUNK_SIZE = 0x10_000; + +function safeStringFromCharCodeU16(units: Uint16Array) { + if (units.length <= CHUNK_SIZE) { + // `String.fromCharCode.apply()` is faster than `String.fromCharCode(...units)` + // in case `units` is a typed array + return String.fromCharCode.apply(String, units as any); + } + + let result = ""; + for (let i = 0; i < units.length; i++) { + const chunk = units.subarray(i * CHUNK_SIZE, (i + 1) * CHUNK_SIZE); + result += String.fromCharCode.apply(String, chunk as any); + } + return result; +} + // A wrapper function for utf8DecodeToUint16Array() export function utf8DecodeWasm(bytes: Uint8Array, inputOffset: number, byteLength: number): string { const inputPtr: pointer = wm.malloc(byteLength); @@ -73,7 +88,7 @@ export function utf8DecodeWasm(bytes: Uint8Array, inputOffset: number, byteLengt const outputArraySize = wm.utf8DecodeToUint16Array(outputPtr, inputPtr, byteLength); const units = new Uint16Array(wm.memory.buffer, outputPtr, outputArraySize); - return safeStringFromCharCode(units); + return 
safeStringFromCharCodeU16(units); } finally { wm.free(inputPtr); wm.free(outputPtr); From a788f19b1740bc546ae4877fd78249343b7919b9 Mon Sep 17 00:00:00 2001 From: "FUJI Goro (gfx)" Date: 2019年5月29日 10:33:27 +0900 Subject: [PATCH 2/3] more tests for TextDecoder --- src/utils/utf8.ts | 2 +- test/msgpack-test-suite.test.ts | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/utils/utf8.ts b/src/utils/utf8.ts index a64f5113..78241647 100644 --- a/src/utils/utf8.ts +++ b/src/utils/utf8.ts @@ -134,7 +134,7 @@ export function utf8DecodeJs(bytes: Uint8Array, inputOffset: number, byteLength: const sharedTextDecoder = typeof TextDecoder !== "undefined" ? new TextDecoder() : null; export const TEXT_DECODER_AVAILABLE = process.env.TEXT_DECODER !== "never" && !!sharedTextDecoder; -export const TEXT_DECODER_THRESHOLD = 200; +export const TEXT_DECODER_THRESHOLD = process.env.TEXT_DECODER !== "force" ? 200 : 0; export function utf8DecodeTD(bytes: Uint8Array, inputOffset: number, byteLength: number): string { const stringBytes = bytes.subarray(inputOffset, inputOffset + byteLength); diff --git a/test/msgpack-test-suite.test.ts b/test/msgpack-test-suite.test.ts index 079cb370..f516cb22 100644 --- a/test/msgpack-test-suite.test.ts +++ b/test/msgpack-test-suite.test.ts @@ -89,7 +89,9 @@ describe("msgpack-test-suite", () => { FLOAT64_NEGATIVE_INF: Number.NEGATIVE_INFINITY, FLOAT64_NAN: Number.NaN, STR16: "a".repeat(0x100), + STR16_MBS: "🌏".repeat(0x100), STR32: "b".repeat(0x10_000), + STR32_MBS: "🍣".repeat(0x10_000), STR32LARGE: "c".repeat(0x100_000), // may cause "RangeError: Maximum call stack size exceeded" in simple implelementions STR_INCLUDING_NUL: "foo\0bar", STR_BROKEN_FF: "\xff", From 46f1ec9c1da9f1488cbcd71953bcdfeea3641de5 Mon Sep 17 00:00:00 2001 From: "FUJI Goro (gfx)" Date: 2019年5月29日 10:58:36 +0900 Subject: [PATCH 3/3] tewaks for test coverage --- package.json | 8 +++++--- test/msgpack-test-suite.test.ts | 4 +--- 2 files changed, 6
insertions(+), 6 deletions(-) diff --git a/package.json b/package.json index 884982ca..ba866f98 100644 --- a/package.json +++ b/package.json @@ -12,13 +12,15 @@ "prepublishOnly": "run-p 'test:dist:*' && npm run test:browser", "clean": "rimraf build dist dist.*", "test": "mocha 'test/**/*.test.ts'", - "test:purejs": "MSGPACK_WASM=never mocha 'test/**/*.test.ts'", - "test:wasm": "npm run asbuild:production && MSGPACK_WASM=force mocha 'test/**/*.test.ts'", + "test:purejs": "TEXT_DECODER=never MSGPACK_WASM=never mocha 'test/**/*.test.ts'", + "test:wasm": "npm run asbuild:production && TEXT_DECODER=never MSGPACK_WASM=force mocha 'test/**/*.test.ts'", + "test:td": "TEXT_DECODER=force mocha 'test/**/*.test.ts'", "test:dist:purejs": "TS_NODE_PROJECT=tsconfig.test-dist-es5-purejs.json npm run test:purejs -- --reporter=dot", "test:dist:wasm": "TS_NODE_PROJECT=tsconfig.test-dist-es5-wasm.json npm run test:wasm -- --reporter=dot", - "test:cover": "npm run cover:clean && npm run test:cover:purejs && npm run test:cover:wasm && npm run cover:report", + "test:cover": "npm run cover:clean && npm-run-all 'test:cover:*' && npm run cover:report", "test:cover:purejs": "npx nyc --no-clean npm run test:purejs", "test:cover:wasm": "npx nyc --no-clean npm run test:wasm", + "test:cover:td": "npx nyc --no-clean npm run test:td", "cover:clean": "rimraf .nyc_output coverage/", "cover:report": "nyc report --reporter=lcov --reporter=text-summary --reporter=html", "test:browser": "karma start --single-run", diff --git a/test/msgpack-test-suite.test.ts b/test/msgpack-test-suite.test.ts index f516cb22..9b30516b 100644 --- a/test/msgpack-test-suite.test.ts +++ b/test/msgpack-test-suite.test.ts @@ -93,10 +93,8 @@ describe("msgpack-test-suite", () => { STR32: "b".repeat(0x10_000), STR32_MBS: "🍣".repeat(0x10_000), STR32LARGE: "c".repeat(0x100_000), // may cause "RangeError: Maximum call stack size exceeded" in simple implelementions - STR_INCLUDING_NUL: "foo\0bar", + STR_INCLUDING_NUL: "foo\0bar\0",
STR_BROKEN_FF: "\xff", - STR_LONE_SURROGATE_1: "\ud800", - STR_LONE_SURROGATE_2: "\udbff", BIN16: new Uint8Array(0x100).fill(0xff), BIN32: new Uint8Array(0x10000).fill(0xff), ARRAY16: new Array(0x100).fill(true),

AltStyle によって変換されたページ (->オリジナル) /