Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit c2f8f6b

Browse files
authored
Merge pull request #39 from msgpack/select_best_function_in_str_decoding
improve: more smart conditions of string decoding functions
2 parents 7f338b2 + 46f1ec9 commit c2f8f6b

File tree

6 files changed

+100
-88
lines changed

6 files changed

+100
-88
lines changed

‎benchmark/decode-string.ts

Lines changed: 36 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,45 @@
11
/* eslint-disable no-console */
2-
import { utf8Encode, utf8Count, utf8Decode } from "../src/utils/utf8";
2+
import { utf8Encode, utf8Count, utf8DecodeJs,utf8DecodeTD } from "../src/utils/utf8";
33
import { utf8DecodeWasm } from "../src/wasmFunctions";
44

55
// @ts-ignore
66
import Benchmark from "benchmark";
77

8-
const textDecoder = new TextDecoder();
9-
10-
const dataSet = [10, 100, 200, 1_000, 10_000, 100_000].map((n) => {
11-
return "a".repeat(n);
12-
});
13-
14-
for (const str of dataSet) {
15-
const byteLength = utf8Count(str);
16-
const bytes = new Uint8Array(new ArrayBuffer(byteLength));
17-
utf8Encode(str, bytes, 0);
18-
19-
console.log(`\n## string length=${str.length} byteLength=${byteLength}\n`);
20-
21-
const suite = new Benchmark.Suite();
22-
23-
const N = Math.round(100_0000 / str.length);
24-
25-
// use the result to avoid void-context optimizations
26-
let count = 0;
27-
28-
suite.add("utf8Decode", () => {
29-
if (utf8Decode(bytes, 0, byteLength) !== str) {
30-
throw new Error("wrong result!");
31-
}
32-
});
33-
34-
suite.add("utf8DecodeWasm", () => {
35-
if (utf8DecodeWasm(bytes, 0, byteLength) !== str) {
36-
throw new Error("wrong result!");
37-
}
38-
});
39-
40-
suite.add("TextDecoder", () => {
41-
if (textDecoder.decode(bytes.subarray(0, byteLength)) !== str) {
42-
throw new Error("wrong result!");
43-
}
44-
});
45-
suite.on("cycle", (event: any) => {
46-
console.log(String(event.target));
8+
for (const baseStr of ["A", "あ", "🌏"]) {
9+
const dataSet = [10, 100, 200, 1_000, 10_000, 100_000].map((n) => {
10+
return baseStr.repeat(n);
4711
});
4812

49-
suite.run();
13+
for (const str of dataSet) {
14+
const byteLength = utf8Count(str);
15+
const bytes = new Uint8Array(new ArrayBuffer(byteLength));
16+
utf8Encode(str, bytes, 0);
17+
18+
console.log(`\n## string "${baseStr}" x ${str.length} (byteLength=${byteLength})\n`);
19+
20+
const suite = new Benchmark.Suite();
21+
22+
suite.add("utf8DecodeJs", () => {
23+
if (utf8DecodeJs(bytes, 0, byteLength) !== str) {
24+
throw new Error("wrong result!");
25+
}
26+
});
27+
28+
suite.add("utf8DecodeWasm", () => {
29+
if (utf8DecodeWasm(bytes, 0, byteLength) !== str) {
30+
throw new Error("wrong result!");
31+
}
32+
});
33+
34+
suite.add("TextDecoder", () => {
35+
if (utf8DecodeTD(bytes, 0, byteLength) !== str) {
36+
throw new Error("wrong result!");
37+
}
38+
});
39+
suite.on("cycle", (event: any) => {
40+
console.log(String(event.target));
41+
});
42+
43+
suite.run();
44+
}
5045
}

‎package.json

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,15 @@
1212
"prepublishOnly": "run-p 'test:dist:*' && npm run test:browser",
1313
"clean": "rimraf build dist dist.*",
1414
"test": "mocha 'test/**/*.test.ts'",
15-
"test:purejs": "MSGPACK_WASM=never mocha 'test/**/*.test.ts'",
16-
"test:wasm": "npm run asbuild:production && MSGPACK_WASM=force mocha 'test/**/*.test.ts'",
15+
"test:purejs": "TEXT_DECODER=never MSGPACK_WASM=never mocha 'test/**/*.test.ts'",
16+
"test:wasm": "npm run asbuild:production && TEXT_DECODER=never MSGPACK_WASM=force mocha 'test/**/*.test.ts'",
17+
"test:td": "TEXT_DECODER=force mocha 'test/**/*.test.ts'",
1718
"test:dist:purejs": "TS_NODE_PROJECT=tsconfig.test-dist-es5-purejs.json npm run test:purejs -- --reporter=dot",
1819
"test:dist:wasm": "TS_NODE_PROJECT=tsconfig.test-dist-es5-wasm.json npm run test:wasm -- --reporter=dot",
19-
"test:cover": "npm run cover:clean && npm run test:cover:purejs && npm run test:cover:wasm && npm run cover:report",
20+
"test:cover": "npm run cover:clean && npm-run-all 'test:cover:*' && npm run cover:report",
2021
"test:cover:purejs": "npx nyc --no-clean npm run test:purejs",
2122
"test:cover:wasm": "npx nyc --no-clean npm run test:wasm",
23+
"test:cover:td": "npx nyc --no-clean npm run test:td",
2224
"cover:clean": "rimraf .nyc_output coverage/",
2325
"cover:report": "nyc report --reporter=lcov --reporter=text-summary --reporter=html",
2426
"test:browser": "karma start --single-run",

‎src/Decoder.ts

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import { prettyByte } from "./utils/prettyByte";
22
import { ExtensionCodec } from "./ExtensionCodec";
33
import { getInt64, getUint64 } from "./utils/int";
4-
import { utf8Decode } from "./utils/utf8";
4+
import { utf8DecodeJs,TEXT_DECODER_AVAILABLE,TEXT_DECODER_THRESHOLD,utf8DecodeTD } from "./utils/utf8";
55
import { createDataView, ensureUint8Array } from "./utils/typedArrays";
66
import { WASM_AVAILABLE, WASM_STR_THRESHOLD, utf8DecodeWasm } from "./wasmFunctions";
77

@@ -400,10 +400,14 @@ export class Decoder {
400400
}
401401

402402
const offset = this.pos + headerOffset;
403-
const object =
404-
WASM_AVAILABLE && byteLength > WASM_STR_THRESHOLD
405-
? utf8DecodeWasm(this.bytes, offset, byteLength)
406-
: utf8Decode(this.bytes, offset, byteLength);
403+
let object: string;
404+
if (TEXT_DECODER_AVAILABLE && byteLength > TEXT_DECODER_THRESHOLD) {
405+
object = utf8DecodeTD(this.bytes, offset, byteLength);
406+
} else if (WASM_AVAILABLE && byteLength > WASM_STR_THRESHOLD) {
407+
object = utf8DecodeWasm(this.bytes, offset, byteLength);
408+
} else {
409+
object = utf8DecodeJs(this.bytes, offset, byteLength);
410+
}
407411
this.pos += headerOffset + byteLength;
408412
return object;
409413
}

‎src/utils/utf8.ts

Lines changed: 29 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -83,49 +83,26 @@ export function utf8Encode(str: string, output: Uint8Array, outputOffset: number
8383

8484
const CHUNK_SIZE = 0x10_000;
8585

86-
export function safeStringFromCharCode(units: Array<number> | Uint16Array) {
87-
if (units.length <= CHUNK_SIZE) {
88-
// `String.fromCharCode.apply()` is faster than `String.fromCharCode(...units)`
89-
// in case `units` is a typed array
90-
return String.fromCharCode.apply(String, units as any);
91-
}
92-
93-
let result = "";
94-
for (let i = 0; i < units.length; i++) {
95-
const chunk = units.slice(i * CHUNK_SIZE, (i + 1) * CHUNK_SIZE);
96-
result += String.fromCharCode.apply(String, chunk as any);
97-
}
98-
return result;
99-
}
100-
101-
const MIN_TEXT_DECODER_STRING_LENGTH = 200;
102-
const defaultEncoding = "utf-8";
103-
const sharedTextDecoder = typeof TextDecoder !== "undefined" ? new TextDecoder(defaultEncoding) : null;
104-
105-
export function utf8Decode(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
86+
export function utf8DecodeJs(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
10687
let offset = inputOffset;
10788
const end = offset + byteLength;
10889

109-
if (sharedTextDecoder !== null && byteLength > MIN_TEXT_DECODER_STRING_LENGTH) {
110-
const stringBytes = bytes.subarray(offset, end);
111-
return sharedTextDecoder.decode(stringBytes);
112-
}
113-
114-
const out: Array<number> = [];
90+
const units: Array<number> = [];
91+
let result = "";
11592
while (offset < end) {
11693
const byte1 = bytes[offset++];
11794
if ((byte1 & 0x80) === 0) {
11895
// 1 byte
119-
out.push(byte1);
96+
units.push(byte1);
12097
} else if ((byte1 & 0xe0) === 0xc0) {
12198
// 2 bytes
12299
const byte2 = bytes[offset++] & 0x3f;
123-
out.push(((byte1 & 0x1f) << 6) | byte2);
100+
units.push(((byte1 & 0x1f) << 6) | byte2);
124101
} else if ((byte1 & 0xf0) === 0xe0) {
125102
// 3 bytes
126103
const byte2 = bytes[offset++] & 0x3f;
127104
const byte3 = bytes[offset++] & 0x3f;
128-
out.push(((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3);
105+
units.push(((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3);
129106
} else if ((byte1 & 0xf8) === 0xf0) {
130107
// 4 bytes
131108
const byte2 = bytes[offset++] & 0x3f;
@@ -134,14 +111,33 @@ export function utf8Decode(bytes: Uint8Array, inputOffset: number, byteLength: n
134111
let unit = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0c) | (byte3 << 0x06) | byte4;
135112
if (unit > 0xffff) {
136113
unit -= 0x10000;
137-
out.push(((unit >>> 10) & 0x3ff) | 0xd800);
114+
units.push(((unit >>> 10) & 0x3ff) | 0xd800);
138115
unit = 0xdc00 | (unit & 0x3ff);
139116
}
140-
out.push(unit);
117+
units.push(unit);
141118
} else {
142-
out.push(byte1);
119+
units.push(byte1);
120+
}
121+
122+
if (units.length - 4 >= CHUNK_SIZE) {
123+
result += String.fromCharCode(...units);
124+
units.length = 0;
143125
}
144126
}
145127

146-
return safeStringFromCharCode(out);
128+
if (units.length > 0) {
129+
result += String.fromCharCode(...units);
130+
}
131+
132+
return result;
133+
}
134+
135+
const sharedTextDecoder = typeof TextDecoder !== "undefined" ? new TextDecoder() : null;
136+
export const TEXT_DECODER_AVAILABLE = process.env.TEXT_DECODER !== "never" && !!sharedTextDecoder;
137+
export const TEXT_DECODER_THRESHOLD = process.env.TEXT_DECODER !== "force" ? 200 : 0;
138+
139+
export function utf8DecodeTD(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
140+
const stringBytes = bytes.subarray(inputOffset, inputOffset + byteLength);
141+
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
142+
return sharedTextDecoder!.decode(stringBytes);
147143
}

‎src/wasmFunctions.ts

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
import { safeStringFromCharCode } from "./utils/utf8";
2-
31
// WASM=never - disable WASM functions
42
// WASM=force - force to use WASM functions
53
const WASM: string = process.env.MSGPACK_WASM || process.env.WASM || "";
@@ -63,6 +61,23 @@ export function utf8EncodeWasm(str: string, output: Uint8Array, outputOffset: nu
6361
}
6462
}
6563

64+
const CHUNK_SIZE = 0x10_000;
65+
66+
function safeStringFromCharCodeU16(units: Uint16Array) {
67+
if (units.length <= CHUNK_SIZE) {
68+
// `String.fromCharCode.apply()` is faster than `String.fromCharCode(...units)`
69+
// in case `units` is a typed array
70+
return String.fromCharCode.apply(String, units as any);
71+
}
72+
73+
let result = "";
74+
for (let i = 0; i < units.length; i++) {
75+
const chunk = units.subarray(i * CHUNK_SIZE, (i + 1) * CHUNK_SIZE);
76+
result += String.fromCharCode.apply(String, chunk as any);
77+
}
78+
return result;
79+
}
80+
6681
// A wrapper function for utf8DecodeToUint16Array()
6782
export function utf8DecodeWasm(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
6883
const inputPtr: pointer = wm.malloc(byteLength);
@@ -73,7 +88,7 @@ export function utf8DecodeWasm(bytes: Uint8Array, inputOffset: number, byteLengt
7388

7489
const outputArraySize = wm.utf8DecodeToUint16Array(outputPtr, inputPtr, byteLength);
7590
const units = new Uint16Array(wm.memory.buffer, outputPtr, outputArraySize);
76-
return safeStringFromCharCode(units);
91+
return safeStringFromCharCodeU16(units);
7792
} finally {
7893
wm.free(inputPtr);
7994
wm.free(outputPtr);

‎test/msgpack-test-suite.test.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,12 +89,12 @@ describe("msgpack-test-suite", () => {
8989
FLOAT64_NEGATIVE_INF: Number.NEGATIVE_INFINITY,
9090
FLOAT64_NAN: Number.NaN,
9191
STR16: "a".repeat(0x100),
92+
STR16_MBS: "🌏".repeat(0x100),
9293
STR32: "b".repeat(0x10_000),
94+
STR32_MBS: "🍣".repeat(0x10_000),
9395
STR32LARGE: "c".repeat(0x100_000), // may cause "RangeError: Maximum call stack size exceeded" in simple implementations
94-
STR_INCLUDING_NUL: "foo\0bar",
96+
STR_INCLUDING_NUL: "foo\0bar\0",
9597
STR_BROKEN_FF: "\xff",
96-
STR_LONE_SURROGATE_1: "\ud800",
97-
STR_LONE_SURROGATE_2: "\udbff",
9898
BIN16: new Uint8Array(0x100).fill(0xff),
9999
BIN32: new Uint8Array(0x10000).fill(0xff),
100100
ARRAY16: new Array<boolean>(0x100).fill(true),

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /