diff --git a/benchmark/decode-string.ts b/benchmark/decode-string.ts index 331a9e05..ed4813ee 100644 --- a/benchmark/decode-string.ts +++ b/benchmark/decode-string.ts @@ -1,50 +1,45 @@ /* eslint-disable no-console */ -import { utf8Encode, utf8Count, utf8Decode } from "../src/utils/utf8"; +import { utf8Encode, utf8Count, utf8DecodeJs, utf8DecodeTD } from "../src/utils/utf8"; import { utf8DecodeWasm } from "../src/wasmFunctions"; // @ts-ignore import Benchmark from "benchmark"; -const textDecoder = new TextDecoder(); - -const dataSet = [10, 100, 200, 1_000, 10_000, 100_000].map((n) => { - return "a".repeat(n); -}); - -for (const str of dataSet) { - const byteLength = utf8Count(str); - const bytes = new Uint8Array(new ArrayBuffer(byteLength)); - utf8Encode(str, bytes, 0); - - console.log(`\n## string length=${str.length} byteLength=${byteLength}\n`); - - const suite = new Benchmark.Suite(); - - const N = Math.round(100_0000 / str.length); - - // use the result to avoid void-context optimizations - let count = 0; - - suite.add("utf8Decode", () => { - if (utf8Decode(bytes, 0, byteLength) !== str) { - throw new Error("wrong result!"); - } - }); - - suite.add("utf8DecodeWasm", () => { - if (utf8DecodeWasm(bytes, 0, byteLength) !== str) { - throw new Error("wrong result!"); - } - }); - - suite.add("TextDecoder", () => { - if (textDecoder.decode(bytes.subarray(0, byteLength)) !== str) { - throw new Error("wrong result!"); - } - }); - suite.on("cycle", (event: any) => { - console.log(String(event.target)); +for (const baseStr of ["A", "あ", "🌏"]) { + const dataSet = [10, 100, 200, 1_000, 10_000, 100_000].map((n) => { + return baseStr.repeat(n); }); - suite.run(); + for (const str of dataSet) { + const byteLength = utf8Count(str); + const bytes = new Uint8Array(new ArrayBuffer(byteLength)); + utf8Encode(str, bytes, 0); + + console.log(`\n## string "${baseStr}" x ${str.length} (byteLength=${byteLength})\n`); + + const suite = new Benchmark.Suite(); + + suite.add("utf8DecodeJs", 
() => { + if (utf8DecodeJs(bytes, 0, byteLength) !== str) { + throw new Error("wrong result!"); + } + }); + + suite.add("utf8DecodeWasm", () => { + if (utf8DecodeWasm(bytes, 0, byteLength) !== str) { + throw new Error("wrong result!"); + } + }); + + suite.add("TextDecoder", () => { + if (utf8DecodeTD(bytes, 0, byteLength) !== str) { + throw new Error("wrong result!"); + } + }); + suite.on("cycle", (event: any) => { + console.log(String(event.target)); + }); + + suite.run(); + } } diff --git a/package.json b/package.json index 884982ca..ba866f98 100644 --- a/package.json +++ b/package.json @@ -12,13 +12,15 @@ "prepublishOnly": "run-p 'test:dist:*' && npm run test:browser", "clean": "rimraf build dist dist.*", "test": "mocha 'test/**/*.test.ts'", - "test:purejs": "MSGPACK_WASM=never mocha 'test/**/*.test.ts'", - "test:wasm": "npm run asbuild:production && MSGPACK_WASM=force mocha 'test/**/*.test.ts'", + "test:purejs": "TEXT_DECODER=never MSGPACK_WASM=never mocha 'test/**/*.test.ts'", + "test:wasm": "npm run asbuild:production && TEXT_DECODER=never MSGPACK_WASM=force mocha 'test/**/*.test.ts'", + "test:td": "TEXT_DECODER=force mocha 'test/**/*.test.ts'", "test:dist:purejs": "TS_NODE_PROJECT=tsconfig.test-dist-es5-purejs.json npm run test:purejs -- --reporter=dot", "test:dist:wasm": "TS_NODE_PROJECT=tsconfig.test-dist-es5-wasm.json npm run test:wasm -- --reporter=dot", - "test:cover": "npm run cover:clean && npm run test:cover:purejs && npm run test:cover:wasm && npm run cover:report", + "test:cover": "npm run cover:clean && npm-run-all 'test:cover:*' && npm run cover:report", "test:cover:purejs": "npx nyc --no-clean npm run test:purejs", "test:cover:wasm": "npx nyc --no-clean npm run test:wasm", + "test:cover:td": "npx nyc --no-clean npm run test:td", "cover:clean": "rimraf .nyc_output coverage/", "cover:report": "nyc report --reporter=lcov --reporter=text-summary --reporter=html", "test:browser": "karma start --single-run", diff --git a/src/Decoder.ts 
b/src/Decoder.ts index c6c29b63..b8ab4f32 100644 --- a/src/Decoder.ts +++ b/src/Decoder.ts @@ -1,7 +1,7 @@ import { prettyByte } from "./utils/prettyByte"; import { ExtensionCodec } from "./ExtensionCodec"; import { getInt64, getUint64 } from "./utils/int"; -import { utf8Decode } from "./utils/utf8"; +import { utf8DecodeJs, TEXT_DECODER_AVAILABLE, TEXT_DECODER_THRESHOLD, utf8DecodeTD } from "./utils/utf8"; import { createDataView, ensureUint8Array } from "./utils/typedArrays"; import { WASM_AVAILABLE, WASM_STR_THRESHOLD, utf8DecodeWasm } from "./wasmFunctions"; @@ -400,10 +400,14 @@ export class Decoder { } const offset = this.pos + headerOffset; - const object = - WASM_AVAILABLE && byteLength > WASM_STR_THRESHOLD - ? utf8DecodeWasm(this.bytes, offset, byteLength) - : utf8Decode(this.bytes, offset, byteLength); + let object: string; + if (TEXT_DECODER_AVAILABLE && byteLength > TEXT_DECODER_THRESHOLD) { + object = utf8DecodeTD(this.bytes, offset, byteLength); + } else if (WASM_AVAILABLE && byteLength > WASM_STR_THRESHOLD) { + object = utf8DecodeWasm(this.bytes, offset, byteLength); + } else { + object = utf8DecodeJs(this.bytes, offset, byteLength); + } this.pos += headerOffset + byteLength; return object; } diff --git a/src/utils/utf8.ts b/src/utils/utf8.ts index cf5a6112..78241647 100644 --- a/src/utils/utf8.ts +++ b/src/utils/utf8.ts @@ -83,49 +83,26 @@ export function utf8Encode(str: string, output: Uint8Array, outputOffset: number const CHUNK_SIZE = 0x10_000; -export function safeStringFromCharCode(units: Array<number> | Uint16Array) { - if (units.length <= CHUNK_SIZE) { - // `String.fromCharCode.apply()` is faster than `String.fromCharCode(...units)` - // in case `units` is a typed array - return String.fromCharCode.apply(String, units as any); - } - - let result = ""; - for (let i = 0; i < units.length; i++) { - const chunk = units.slice(i * CHUNK_SIZE, (i + 1) * CHUNK_SIZE); - result += String.fromCharCode.apply(String, chunk as any); - } - return result; -} - -const 
MIN_TEXT_DECODER_STRING_LENGTH = 200; -const defaultEncoding = "utf-8"; -const sharedTextDecoder = typeof TextDecoder !== "undefined" ? new TextDecoder(defaultEncoding) : null; - -export function utf8Decode(bytes: Uint8Array, inputOffset: number, byteLength: number): string { +export function utf8DecodeJs(bytes: Uint8Array, inputOffset: number, byteLength: number): string { let offset = inputOffset; const end = offset + byteLength; - if (sharedTextDecoder !== null && byteLength > MIN_TEXT_DECODER_STRING_LENGTH) { - const stringBytes = bytes.subarray(offset, end); - return sharedTextDecoder.decode(stringBytes); - } - - const out: Array<number> = []; + const units: Array<number> = []; + let result = ""; while (offset < end) { const byte1 = bytes[offset++]; if ((byte1 & 0x80) === 0) { // 1 byte - out.push(byte1); + units.push(byte1); } else if ((byte1 & 0xe0) === 0xc0) { // 2 bytes const byte2 = bytes[offset++] & 0x3f; - out.push(((byte1 & 0x1f) << 6) | byte2); + units.push(((byte1 & 0x1f) << 6) | byte2); } else if ((byte1 & 0xf0) === 0xe0) { // 3 bytes const byte2 = bytes[offset++] & 0x3f; const byte3 = bytes[offset++] & 0x3f; - out.push(((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3); + units.push(((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3); } else if ((byte1 & 0xf8) === 0xf0) { // 4 bytes const byte2 = bytes[offset++] & 0x3f; @@ -134,14 +111,33 @@ export function utf8Decode(bytes: Uint8Array, inputOffset: number, byteLength: n let unit = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0c) | (byte3 << 0x06) | byte4; if (unit > 0xffff) { unit -= 0x10000; - out.push(((unit >>> 10) & 0x3ff) | 0xd800); + units.push(((unit >>> 10) & 0x3ff) | 0xd800); unit = 0xdc00 | (unit & 0x3ff); } - out.push(unit); + units.push(unit); } else { - out.push(byte1); + units.push(byte1); + } + + if (units.length - 4 >= CHUNK_SIZE) { + result += String.fromCharCode(...units); + units.length = 0; } } - return safeStringFromCharCode(out); + if (units.length > 0) { + result += String.fromCharCode(...units); + } + + return 
result; +} + +const sharedTextDecoder = typeof TextDecoder !== "undefined" ? new TextDecoder() : null; +export const TEXT_DECODER_AVAILABLE = process.env.TEXT_DECODER !== "never" && !!sharedTextDecoder; +export const TEXT_DECODER_THRESHOLD = process.env.TEXT_DECODER !== "force" ? 200 : 0; + +export function utf8DecodeTD(bytes: Uint8Array, inputOffset: number, byteLength: number): string { + const stringBytes = bytes.subarray(inputOffset, inputOffset + byteLength); + // eslint-disable-next-line @typescript-eslint/no-non-null-assertion + return sharedTextDecoder!.decode(stringBytes); } diff --git a/src/wasmFunctions.ts b/src/wasmFunctions.ts index 0c6e14f8..5e92b4f0 100644 --- a/src/wasmFunctions.ts +++ b/src/wasmFunctions.ts @@ -1,5 +1,3 @@ -import { safeStringFromCharCode } from "./utils/utf8"; - // WASM=never - disable WASM functions // WASM=force - force to use WASM functions const WASM: string = process.env.MSGPACK_WASM || process.env.WASM || ""; @@ -63,6 +61,23 @@ export function utf8EncodeWasm(str: string, output: Uint8Array, outputOffset: nu } } +const CHUNK_SIZE = 0x10_000; + +function safeStringFromCharCodeU16(units: Uint16Array) { + if (units.length <= CHUNK_SIZE) { + // `String.fromCharCode.apply()` is faster than `String.fromCharCode(...units)` + // in case `units` is a typed array + return String.fromCharCode.apply(String, units as any); + } + + let result = ""; + for (let i = 0; i < units.length; i++) { + const chunk = units.subarray(i * CHUNK_SIZE, (i + 1) * CHUNK_SIZE); + result += String.fromCharCode.apply(String, chunk as any); + } + return result; +} + // A wrapper function for utf8DecodeToUint16Array() export function utf8DecodeWasm(bytes: Uint8Array, inputOffset: number, byteLength: number): string { const inputPtr: pointer = wm.malloc(byteLength); @@ -73,7 +88,7 @@ export function utf8DecodeWasm(bytes: Uint8Array, inputOffset: number, byteLengt const outputArraySize = wm.utf8DecodeToUint16Array(outputPtr, inputPtr, byteLength); const units = 
new Uint16Array(wm.memory.buffer, outputPtr, outputArraySize); - return safeStringFromCharCode(units); + return safeStringFromCharCodeU16(units); } finally { wm.free(inputPtr); wm.free(outputPtr); diff --git a/test/msgpack-test-suite.test.ts b/test/msgpack-test-suite.test.ts index 079cb370..9b30516b 100644 --- a/test/msgpack-test-suite.test.ts +++ b/test/msgpack-test-suite.test.ts @@ -89,12 +89,12 @@ describe("msgpack-test-suite", () => { FLOAT64_NEGATIVE_INF: Number.NEGATIVE_INFINITY, FLOAT64_NAN: Number.NaN, STR16: "a".repeat(0x100), + STR16_MBS: "🌏".repeat(0x100), STR32: "b".repeat(0x10_000), + STR32_MBS: "🍣".repeat(0x10_000), STR32LARGE: "c".repeat(0x100_000), // may cause "RangeError: Maximum call stack size exceeded" in simple implelementions - STR_INCLUDING_NUL: "foo\0bar", + STR_INCLUDING_NUL: "foo\0bar\0", STR_BROKEN_FF: "\xff", - STR_LONE_SURROGATE_1: "\ud800", - STR_LONE_SURROGATE_2: "\udbff", BIN16: new Uint8Array(0x100).fill(0xff), BIN32: new Uint8Array(0x10000).fill(0xff), ARRAY16: new Array(0x100).fill(true),

AltStyle によって変換されたページ (->オリジナル) /