Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

improve: more smart conditions of string decoding functions #39

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
gfx merged 3 commits into master from select_best_function_in_str_decoding
May 29, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 36 additions & 41 deletions benchmark/decode-string.ts
View file Open in desktop
Original file line number Diff line number Diff line change
@@ -1,50 +1,45 @@
/* eslint-disable no-console */
import { utf8Encode, utf8Count, utf8Decode } from "../src/utils/utf8";
import { utf8Encode, utf8Count, utf8DecodeJs, utf8DecodeTD } from "../src/utils/utf8";
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have split the utf8Decode function into separate functions (utf8DecodeJs and utf8DecodeTD) so that each can be benchmarked separately.

import { utf8DecodeWasm } from "../src/wasmFunctions";

// @ts-ignore
import Benchmark from "benchmark";

const textDecoder = new TextDecoder();

const dataSet = [10, 100, 200, 1_000, 10_000, 100_000].map((n) => {
return "a".repeat(n);
});

for (const str of dataSet) {
const byteLength = utf8Count(str);
const bytes = new Uint8Array(new ArrayBuffer(byteLength));
utf8Encode(str, bytes, 0);

console.log(`\n## string length=${str.length} byteLength=${byteLength}\n`);

const suite = new Benchmark.Suite();

const N = Math.round(100_0000 / str.length);

// use the result to avoid void-context optimizations
let count = 0;

suite.add("utf8Decode", () => {
if (utf8Decode(bytes, 0, byteLength) !== str) {
throw new Error("wrong result!");
}
});

suite.add("utf8DecodeWasm", () => {
if (utf8DecodeWasm(bytes, 0, byteLength) !== str) {
throw new Error("wrong result!");
}
});

suite.add("TextDecoder", () => {
if (textDecoder.decode(bytes.subarray(0, byteLength)) !== str) {
throw new Error("wrong result!");
}
});
suite.on("cycle", (event: any) => {
console.log(String(event.target));
for (const baseStr of ["A", "あ", "🌏"]) {
const dataSet = [10, 100, 200, 1_000, 10_000, 100_000].map((n) => {
return baseStr.repeat(n);
});

suite.run();
for (const str of dataSet) {
const byteLength = utf8Count(str);
const bytes = new Uint8Array(new ArrayBuffer(byteLength));
utf8Encode(str, bytes, 0);

console.log(`\n## string "${baseStr}" x ${str.length} (byteLength=${byteLength})\n`);

const suite = new Benchmark.Suite();

suite.add("utf8DecodeJs", () => {
if (utf8DecodeJs(bytes, 0, byteLength) !== str) {
throw new Error("wrong result!");
}
});

suite.add("utf8DecodeWasm", () => {
if (utf8DecodeWasm(bytes, 0, byteLength) !== str) {
throw new Error("wrong result!");
}
});

suite.add("TextDecoder", () => {
if (utf8DecodeTD(bytes, 0, byteLength) !== str) {
throw new Error("wrong result!");
}
});
suite.on("cycle", (event: any) => {
console.log(String(event.target));
});

suite.run();
}
}
8 changes: 5 additions & 3 deletions package.json
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,15 @@
"prepublishOnly": "run-p 'test:dist:*' && npm run test:browser",
"clean": "rimraf build dist dist.*",
"test": "mocha 'test/**/*.test.ts'",
"test:purejs": "MSGPACK_WASM=never mocha 'test/**/*.test.ts'",
"test:wasm": "npm run asbuild:production && MSGPACK_WASM=force mocha 'test/**/*.test.ts'",
"test:purejs": "TEXT_DECODER=never MSGPACK_WASM=never mocha 'test/**/*.test.ts'",
"test:wasm": "npm run asbuild:production && TEXT_DECODER=never MSGPACK_WASM=force mocha 'test/**/*.test.ts'",
"test:td": "TEXT_DECODER=force mocha 'test/**/*.test.ts'",
"test:dist:purejs": "TS_NODE_PROJECT=tsconfig.test-dist-es5-purejs.json npm run test:purejs -- --reporter=dot",
"test:dist:wasm": "TS_NODE_PROJECT=tsconfig.test-dist-es5-wasm.json npm run test:wasm -- --reporter=dot",
"test:cover": "npm run cover:clean && npm run test:cover:purejs && npm run test:cover:wasm && npm run cover:report",
"test:cover": "npm run cover:clean && npm-run-all 'test:cover:*' && npm run cover:report",
"test:cover:purejs": "npx nyc --no-clean npm run test:purejs",
"test:cover:wasm": "npx nyc --no-clean npm run test:wasm",
"test:cover:td": "npx nyc --no-clean npm run test:td",
"cover:clean": "rimraf .nyc_output coverage/",
"cover:report": "nyc report --reporter=lcov --reporter=text-summary --reporter=html",
"test:browser": "karma start --single-run",
Expand Down
14 changes: 9 additions & 5 deletions src/Decoder.ts
View file Open in desktop
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { prettyByte } from "./utils/prettyByte";
import { ExtensionCodec } from "./ExtensionCodec";
import { getInt64, getUint64 } from "./utils/int";
import { utf8Decode } from "./utils/utf8";
import { utf8DecodeJs, TEXT_DECODER_AVAILABLE, TEXT_DECODER_THRESHOLD, utf8DecodeTD } from "./utils/utf8";
import { createDataView, ensureUint8Array } from "./utils/typedArrays";
import { WASM_AVAILABLE, WASM_STR_THRESHOLD, utf8DecodeWasm } from "./wasmFunctions";

Expand Down Expand Up @@ -400,10 +400,14 @@ export class Decoder {
}

const offset = this.pos + headerOffset;
const object =
WASM_AVAILABLE && byteLength > WASM_STR_THRESHOLD
? utf8DecodeWasm(this.bytes, offset, byteLength)
: utf8Decode(this.bytes, offset, byteLength);
let object: string;
if (TEXT_DECODER_AVAILABLE && byteLength > TEXT_DECODER_THRESHOLD) {
object = utf8DecodeTD(this.bytes, offset, byteLength);
} else if (WASM_AVAILABLE && byteLength > WASM_STR_THRESHOLD) {
object = utf8DecodeWasm(this.bytes, offset, byteLength);
} else {
object = utf8DecodeJs(this.bytes, offset, byteLength);
}
this.pos += headerOffset + byteLength;
return object;
}
Expand Down
62 changes: 29 additions & 33 deletions src/utils/utf8.ts
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -83,49 +83,26 @@ export function utf8Encode(str: string, output: Uint8Array, outputOffset: number

/** Maximum number of UTF-16 code units passed to a single `String.fromCharCode` call. */
const CHUNK_SIZE = 0x10_000;

/**
 * Builds a string from a sequence of UTF-16 code units.
 *
 * Inputs longer than `CHUNK_SIZE` are converted chunk by chunk so that the
 * argument list given to `String.fromCharCode` stays bounded; spreading a very
 * large list in one call can raise "RangeError: Maximum call stack size exceeded".
 *
 * @param units - UTF-16 code units, as a plain array or a `Uint16Array`.
 * @returns The string made of those code units, in order (`""` for empty input).
 */
export function safeStringFromCharCode(units: Array<number> | Uint16Array): string {
  if (units.length <= CHUNK_SIZE) {
    // `String.fromCharCode.apply()` is faster than `String.fromCharCode(...units)`
    // in case `units` is a typed array.
    // The cast is needed because `apply` is typed for real arrays only.
    return String.fromCharCode.apply(String, units as any);
  }

  let result = "";
  // Advance by whole chunks: one iteration per chunk, not one per code unit.
  for (let start = 0; start < units.length; start += CHUNK_SIZE) {
    const chunk = units.slice(start, start + CHUNK_SIZE);
    result += String.fromCharCode.apply(String, chunk as any);
  }
  return result;
}

const MIN_TEXT_DECODER_STRING_LENGTH = 200;
const defaultEncoding = "utf-8";
const sharedTextDecoder = typeof TextDecoder !== "undefined" ? new TextDecoder(defaultEncoding) : null;

export function utf8Decode(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
export function utf8DecodeJs(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
let offset = inputOffset;
const end = offset + byteLength;

if (sharedTextDecoder !== null && byteLength > MIN_TEXT_DECODER_STRING_LENGTH) {
const stringBytes = bytes.subarray(offset, end);
return sharedTextDecoder.decode(stringBytes);
}

const out: Array<number> = [];
const units: Array<number> = [];
let result = "";
while (offset < end) {
const byte1 = bytes[offset++];
if ((byte1 & 0x80) === 0) {
// 1 byte
out.push(byte1);
units.push(byte1);
} else if ((byte1 & 0xe0) === 0xc0) {
// 2 bytes
const byte2 = bytes[offset++] & 0x3f;
out.push(((byte1 & 0x1f) << 6) | byte2);
units.push(((byte1 & 0x1f) << 6) | byte2);
} else if ((byte1 & 0xf0) === 0xe0) {
// 3 bytes
const byte2 = bytes[offset++] & 0x3f;
const byte3 = bytes[offset++] & 0x3f;
out.push(((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3);
units.push(((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3);
} else if ((byte1 & 0xf8) === 0xf0) {
// 4 bytes
const byte2 = bytes[offset++] & 0x3f;
Expand All @@ -134,14 +111,33 @@ export function utf8Decode(bytes: Uint8Array, inputOffset: number, byteLength: n
let unit = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0c) | (byte3 << 0x06) | byte4;
if (unit > 0xffff) {
unit -= 0x10000;
out.push(((unit >>> 10) & 0x3ff) | 0xd800);
units.push(((unit >>> 10) & 0x3ff) | 0xd800);
unit = 0xdc00 | (unit & 0x3ff);
}
out.push(unit);
units.push(unit);
} else {
out.push(byte1);
units.push(byte1);
}

if (units.length - 4 >= CHUNK_SIZE) {
result += String.fromCharCode(...units);
units.length = 0;
}
}

return safeStringFromCharCode(out);
if (units.length > 0) {
result += String.fromCharCode(...units);
}

return result;
}

// One decoder instance is created up front and reused for every call; it is
// `null` when the runtime does not provide `TextDecoder`.
const sharedTextDecoder = typeof TextDecoder !== "undefined" ? new TextDecoder() : null;

// TEXT_DECODER=never - disable the TextDecoder code path
// TEXT_DECODER=force - use TextDecoder regardless of the string length
// `process` is not defined in every runtime (e.g. an unbundled browser), so it
// is probed with `typeof` before reading the environment to avoid a
// ReferenceError at module load time.
const textDecoderMode = typeof process !== "undefined" && process.env ? process.env.TEXT_DECODER : undefined;

/** `true` when a `TextDecoder` exists and has not been disabled with `TEXT_DECODER=never`. */
export const TEXT_DECODER_AVAILABLE = textDecoderMode !== "never" && sharedTextDecoder !== null;

/** Byte-length threshold for choosing `utf8DecodeTD`: 200 by default, 0 with `TEXT_DECODER=force`. */
export const TEXT_DECODER_THRESHOLD = textDecoderMode !== "force" ? 200 : 0;

/**
 * Decodes a UTF-8 byte range into a string with the shared `TextDecoder`.
 *
 * @param bytes - Buffer that contains the encoded string.
 * @param inputOffset - Index of the first byte to decode.
 * @param byteLength - Number of bytes to decode.
 * @returns The decoded string.
 * @throws Error when `TextDecoder` is not available; check `TEXT_DECODER_AVAILABLE` first.
 */
export function utf8DecodeTD(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
  if (sharedTextDecoder === null) {
    throw new Error("TextDecoder is not available in this environment");
  }
  // `subarray` creates a view over the same buffer, so no bytes are copied.
  const stringBytes = bytes.subarray(inputOffset, inputOffset + byteLength);
  return sharedTextDecoder.decode(stringBytes);
}
21 changes: 18 additions & 3 deletions src/wasmFunctions.ts
View file Open in desktop
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import { safeStringFromCharCode } from "./utils/utf8";

// WASM=never - disable WASM functions
// WASM=force - force to use WASM functions
const WASM: string = process.env.MSGPACK_WASM || process.env.WASM || "";
Expand Down Expand Up @@ -63,6 +61,23 @@ export function utf8EncodeWasm(str: string, output: Uint8Array, outputOffset: nu
}
}

/** Maximum number of UTF-16 code units passed to a single `String.fromCharCode` call. */
const CHUNK_SIZE = 0x10_000;

/**
 * Builds a string from a `Uint16Array` of UTF-16 code units.
 *
 * Inputs longer than `CHUNK_SIZE` are converted chunk by chunk so that the
 * argument list given to `String.fromCharCode` stays bounded; spreading a very
 * large list in one call can raise "RangeError: Maximum call stack size exceeded".
 *
 * @param units - UTF-16 code units to convert.
 * @returns The string made of those code units, in order (`""` for empty input).
 */
function safeStringFromCharCodeU16(units: Uint16Array): string {
  if (units.length <= CHUNK_SIZE) {
    // `String.fromCharCode.apply()` is faster than `String.fromCharCode(...units)`
    // in case `units` is a typed array.
    // The cast is needed because `apply` is typed for real arrays only.
    return String.fromCharCode.apply(String, units as any);
  }

  let result = "";
  // Advance by whole chunks: one iteration per chunk, not one per code unit.
  for (let start = 0; start < units.length; start += CHUNK_SIZE) {
    // `subarray` is a view over the same memory, so chunking does not copy.
    const chunk = units.subarray(start, start + CHUNK_SIZE);
    result += String.fromCharCode.apply(String, chunk as any);
  }
  return result;
}

// A wrapper function for utf8DecodeToUint16Array()
export function utf8DecodeWasm(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
const inputPtr: pointer = wm.malloc(byteLength);
Expand All @@ -73,7 +88,7 @@ export function utf8DecodeWasm(bytes: Uint8Array, inputOffset: number, byteLengt

const outputArraySize = wm.utf8DecodeToUint16Array(outputPtr, inputPtr, byteLength);
const units = new Uint16Array(wm.memory.buffer, outputPtr, outputArraySize);
return safeStringFromCharCode(units);
return safeStringFromCharCodeU16(units);
} finally {
wm.free(inputPtr);
wm.free(outputPtr);
Expand Down
6 changes: 3 additions & 3 deletions test/msgpack-test-suite.test.ts
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,12 @@ describe("msgpack-test-suite", () => {
FLOAT64_NEGATIVE_INF: Number.NEGATIVE_INFINITY,
FLOAT64_NAN: Number.NaN,
STR16: "a".repeat(0x100),
STR16_MBS: "🌏".repeat(0x100),
STR32: "b".repeat(0x10_000),
STR32_MBS: "🍣".repeat(0x10_000),
STR32LARGE: "c".repeat(0x100_000), // may cause "RangeError: Maximum call stack size exceeded" in simple implementations
STR_INCLUDING_NUL: "foo\0bar",
STR_INCLUDING_NUL: "foo\0bar\0",
STR_BROKEN_FF: "\xff",
STR_LONE_SURROGATE_1: "\ud800",
STR_LONE_SURROGATE_2: "\udbff",
BIN16: new Uint8Array(0x100).fill(0xff),
BIN32: new Uint8Array(0x10000).fill(0xff),
ARRAY16: new Array<boolean>(0x100).fill(true),
Expand Down

AltStyle によって変換されたページ (->オリジナル) /