improve: more smart conditions of string decoding functions #39

Original file line number	Diff line number	Diff line change
		@@ -1,50 +1,45 @@
		/* eslint-disable no-console */
	import { utf8Encode, utf8Count, utf8Decode } from "../src/utils/utf8";
	import { utf8Encode, utf8Count, utf8DecodeJs, utf8DecodeTD } from "../src/utils/utf8";
Review comment by @gfx (Member, Author), May 29, 2019: To benchmark them separately, I have split the `utf8Decode` function into `utf8DecodeJs` and `utf8DecodeTD`.
		import { utf8DecodeWasm } from "../src/wasmFunctions";

		// @ts-ignore
		import Benchmark from "benchmark";

	const textDecoder = new TextDecoder();

	const dataSet = [10, 100, 200, 1_000, 10_000, 100_000].map((n) => {
	return "a".repeat(n);
	});

	for (const str of dataSet) {
	const byteLength = utf8Count(str);
	const bytes = new Uint8Array(new ArrayBuffer(byteLength));
	utf8Encode(str, bytes, 0);

	console.log(`\n## string length=${str.length} byteLength=${byteLength}\n`);

	const suite = new Benchmark.Suite();

	const N = Math.round(100_0000 / str.length);

	// use the result to avoid void-context optimizations
	let count = 0;

	suite.add("utf8Decode", () => {
	if (utf8Decode(bytes, 0, byteLength) !== str) {
	throw new Error("wrong result!");
	}
	});

	suite.add("utf8DecodeWasm", () => {
	if (utf8DecodeWasm(bytes, 0, byteLength) !== str) {
	throw new Error("wrong result!");
	}
	});

	suite.add("TextDecoder", () => {
	if (textDecoder.decode(bytes.subarray(0, byteLength)) !== str) {
	throw new Error("wrong result!");
	}
	});
	suite.on("cycle", (event: any) => {
	console.log(String(event.target));
	for (const baseStr of ["A", "あ", "🌏"]) {
	const dataSet = [10, 100, 200, 1_000, 10_000, 100_000].map((n) => {
	return baseStr.repeat(n);
		});

	suite.run();
	for (const str of dataSet) {
	const byteLength = utf8Count(str);
	const bytes = new Uint8Array(new ArrayBuffer(byteLength));
	utf8Encode(str, bytes, 0);

	console.log(`\n## string "${baseStr}" x ${str.length} (byteLength=${byteLength})\n`);

	const suite = new Benchmark.Suite();

	suite.add("utf8DecodeJs", () => {
	if (utf8DecodeJs(bytes, 0, byteLength) !== str) {
	throw new Error("wrong result!");
	}
	});

	suite.add("utf8DecodeWasm", () => {
	if (utf8DecodeWasm(bytes, 0, byteLength) !== str) {
	throw new Error("wrong result!");
	}
	});

	suite.add("TextDecoder", () => {
	if (utf8DecodeTD(bytes, 0, byteLength) !== str) {
	throw new Error("wrong result!");
	}
	});
	suite.on("cycle", (event: any) => {
	console.log(String(event.target));
	});

	suite.run();
	}
		}

8 changes: 5 additions & 3 deletions package.json

Show comments View file Open in desktop

Original file line number	Diff line number	Diff line change
Expand Up		@@ -12,13 +12,15 @@
		"prepublishOnly": "run-p 'test:dist:*' && npm run test:browser",
		"clean": "rimraf build dist dist.*",
		"test": "mocha 'test/**/*.test.ts'",
	"test:purejs": "MSGPACK_WASM=never mocha 'test/**/*.test.ts'",
	"test:wasm": "npm run asbuild:production && MSGPACK_WASM=force mocha 'test/**/*.test.ts'",
	"test:purejs": "TEXT_DECODER=never MSGPACK_WASM=never mocha 'test/**/*.test.ts'",
	"test:wasm": "npm run asbuild:production && TEXT_DECODER=never MSGPACK_WASM=force mocha 'test/**/*.test.ts'",
	"test:td": "TEXT_DECODER=force mocha 'test/**/*.test.ts'",
		"test:dist:purejs": "TS_NODE_PROJECT=tsconfig.test-dist-es5-purejs.json npm run test:purejs -- --reporter=dot",
		"test:dist:wasm": "TS_NODE_PROJECT=tsconfig.test-dist-es5-wasm.json npm run test:wasm -- --reporter=dot",
	"test:cover": "npm run cover:clean && npm run test:cover:purejs && npm run test:cover:wasm && npm run cover:report",
	"test:cover": "npm run cover:clean && npm-run-all 'test:cover:*' && npm run cover:report",
		"test:cover:purejs": "npx nyc --no-clean npm run test:purejs",
		"test:cover:wasm": "npx nyc --no-clean npm run test:wasm",
	"test:cover:td": "npx nyc --no-clean npm run test:td",
		"cover:clean": "rimraf .nyc_output coverage/",
		"cover:report": "nyc report --reporter=lcov --reporter=text-summary --reporter=html",
		"test:browser": "karma start --single-run",
Expand Down

14 changes: 9 additions & 5 deletions src/Decoder.ts

Show comments View file Open in desktop

Original file line number	Diff line number	Diff line change
		@@ -1,7 +1,7 @@
		import { prettyByte } from "./utils/prettyByte";
		import { ExtensionCodec } from "./ExtensionCodec";
		import { getInt64, getUint64 } from "./utils/int";
	import { utf8Decode } from "./utils/utf8";
	import { utf8DecodeJs, TEXT_DECODER_AVAILABLE, TEXT_DECODER_THRESHOLD, utf8DecodeTD } from "./utils/utf8";
		import { createDataView, ensureUint8Array } from "./utils/typedArrays";
		import { WASM_AVAILABLE, WASM_STR_THRESHOLD, utf8DecodeWasm } from "./wasmFunctions";

Expand Down Expand Up		@@ -400,10 +400,14 @@ export class Decoder {
		}

		const offset = this.pos + headerOffset;
	const object =
	WASM_AVAILABLE && byteLength > WASM_STR_THRESHOLD
	? utf8DecodeWasm(this.bytes, offset, byteLength)
	: utf8Decode(this.bytes, offset, byteLength);
	let object: string;
	if (TEXT_DECODER_AVAILABLE && byteLength > TEXT_DECODER_THRESHOLD) {
	object = utf8DecodeTD(this.bytes, offset, byteLength);
	} else if (WASM_AVAILABLE && byteLength > WASM_STR_THRESHOLD) {
	object = utf8DecodeWasm(this.bytes, offset, byteLength);
	} else {
	object = utf8DecodeJs(this.bytes, offset, byteLength);
	}
		this.pos += headerOffset + byteLength;
		return object;
		}
Expand Down

62 changes: 29 additions & 33 deletions src/utils/utf8.ts

Show comments View file Open in desktop

Original file line number	Diff line number	Diff line change
Expand Up		@@ -83,49 +83,26 @@ export function utf8Encode(str: string, output: Uint8Array, outputOffset: number

		const CHUNK_SIZE = 0x10_000;

	/**
	 * Converts a sequence of UTF-16 code units into a string.
	 *
	 * `String.fromCharCode` receives the units as call arguments, so a very
	 * long input can fail with "RangeError: Maximum call stack size exceeded".
	 * Inputs longer than `CHUNK_SIZE` are therefore converted in slices of at
	 * most `CHUNK_SIZE` units and concatenated.
	 *
	 * @param units - UTF-16 code units (plain array or `Uint16Array`).
	 * @returns The string made of exactly those code units, in order.
	 */
	export function safeStringFromCharCode(units: Array<number> | Uint16Array): string {
		if (units.length <= CHUNK_SIZE) {
			// `String.fromCharCode.apply()` is faster than `String.fromCharCode(...units)`
			// in case `units` is a typed array
			return String.fromCharCode.apply(String, units as any);
		}

		let result = "";
		// Advance one chunk per iteration. Counting up to `units.length` (one
		// iteration per unit) would produce the same string but waste almost
		// every iteration on an empty slice.
		for (let start = 0; start < units.length; start += CHUNK_SIZE) {
			const chunk = units.slice(start, start + CHUNK_SIZE);
			result += String.fromCharCode.apply(String, chunk as any);
		}
		return result;
	}

	const MIN_TEXT_DECODER_STRING_LENGTH = 200;
	const defaultEncoding = "utf-8";
	const sharedTextDecoder = typeof TextDecoder !== "undefined" ? new TextDecoder(defaultEncoding) : null;

	export function utf8Decode(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
	export function utf8DecodeJs(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
		let offset = inputOffset;
		const end = offset + byteLength;

	if (sharedTextDecoder !== null && byteLength > MIN_TEXT_DECODER_STRING_LENGTH) {
	const stringBytes = bytes.subarray(offset, end);
	return sharedTextDecoder.decode(stringBytes);
	}

	const out: Array<number> = [];
	const units: Array<number> = [];
	let result = "";
		while (offset < end) {
		const byte1 = bytes[offset++];
		if ((byte1 & 0x80) === 0) {
		// 1 byte
	out.push(byte1);
	units.push(byte1);
		} else if ((byte1 & 0xe0) === 0xc0) {
		// 2 bytes
		const byte2 = bytes[offset++] & 0x3f;
	out.push(((byte1 & 0x1f) << 6) | byte2);
	units.push(((byte1 & 0x1f) << 6) | byte2);
		} else if ((byte1 & 0xf0) === 0xe0) {
		// 3 bytes
		const byte2 = bytes[offset++] & 0x3f;
		const byte3 = bytes[offset++] & 0x3f;
	out.push(((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3);
	units.push(((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3);
		} else if ((byte1 & 0xf8) === 0xf0) {
		// 4 bytes
		const byte2 = bytes[offset++] & 0x3f;
Expand All		@@ -134,14 +111,33 @@ export function utf8Decode(bytes: Uint8Array, inputOffset: number, byteLength: n
		let unit = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0c) | (byte3 << 0x06) | byte4;
		if (unit > 0xffff) {
		unit -= 0x10000;
	out.push(((unit >>> 10) & 0x3ff) | 0xd800);
	units.push(((unit >>> 10) & 0x3ff) | 0xd800);
		unit = 0xdc00 | (unit & 0x3ff);
		}
	out.push(unit);
	units.push(unit);
		} else {
	out.push(byte1);
	units.push(byte1);
	}

	if (units.length - 4 >= CHUNK_SIZE) {
	result += String.fromCharCode(...units);
	units.length = 0;
		}
		}

	return safeStringFromCharCode(out);
	if (units.length > 0) {
	result += String.fromCharCode(...units);
	}

	return result;
	}

	// One decoder instance reused for every call; null when the runtime does
	// not provide a global TextDecoder.
	const sharedTextDecoder = typeof TextDecoder === "undefined" ? null : new TextDecoder();

	// TEXT_DECODER=never turns the TextDecoder path off even when a decoder exists.
	export const TEXT_DECODER_AVAILABLE = process.env.TEXT_DECODER !== "never" && sharedTextDecoder !== null;

	// Callers compare `byteLength > TEXT_DECODER_THRESHOLD`; TEXT_DECODER=force
	// lowers the threshold to 0, otherwise it is 200 bytes.
	export const TEXT_DECODER_THRESHOLD = process.env.TEXT_DECODER === "force" ? 0 : 200;

	/**
	 * Decodes a UTF-8 byte range into a string with the shared TextDecoder.
	 *
	 * Callers are expected to check `TEXT_DECODER_AVAILABLE` first: the shared
	 * decoder is dereferenced without a null check.
	 *
	 * @param bytes - Buffer holding the encoded string.
	 * @param inputOffset - Index of the first byte to decode.
	 * @param byteLength - Number of bytes to decode.
	 * @returns The decoded string.
	 */
	export function utf8DecodeTD(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
		const end = inputOffset + byteLength;
		// subarray() gives a view over the same buffer, so no bytes are copied.
		const view = bytes.subarray(inputOffset, end);
		// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
		return sharedTextDecoder!.decode(view);
	}

21 changes: 18 additions & 3 deletions src/wasmFunctions.ts

Show comments View file Open in desktop

Original file line number	Diff line number	Diff line change
		@@ -1,5 +1,3 @@
	import { safeStringFromCharCode } from "./utils/utf8";

		// WASM=never - disable WASM functions
		// WASM=force - force to use WASM functions
		const WASM: string = process.env.MSGPACK_WASM || process.env.WASM || "";
Expand Down Expand Up		@@ -63,6 +61,23 @@ export function utf8EncodeWasm(str: string, output: Uint8Array, outputOffset: nu
		}
		}

	const CHUNK_SIZE = 0x10_000;

	/**
	 * Converts a `Uint16Array` of UTF-16 code units into a string.
	 *
	 * `String.fromCharCode` receives the units as call arguments, so inputs
	 * longer than `CHUNK_SIZE` are converted in views of at most `CHUNK_SIZE`
	 * units and concatenated, keeping each call's argument count bounded.
	 *
	 * @param units - UTF-16 code units to convert.
	 * @returns The string made of exactly those code units, in order.
	 */
	function safeStringFromCharCodeU16(units: Uint16Array): string {
		if (units.length <= CHUNK_SIZE) {
			// `String.fromCharCode.apply()` is faster than `String.fromCharCode(...units)`
			// in case `units` is a typed array
			return String.fromCharCode.apply(String, units as any);
		}

		let result = "";
		// Advance one chunk per iteration. Counting up to `units.length` (one
		// iteration per unit) would produce the same string but waste almost
		// every iteration on an empty view.
		for (let start = 0; start < units.length; start += CHUNK_SIZE) {
			const chunk = units.subarray(start, start + CHUNK_SIZE);
			result += String.fromCharCode.apply(String, chunk as any);
		}
		return result;
	}

		// A wrapper function for utf8DecodeToUint16Array()
		export function utf8DecodeWasm(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
		const inputPtr: pointer = wm.malloc(byteLength);
Expand All		@@ -73,7 +88,7 @@ export function utf8DecodeWasm(bytes: Uint8Array, inputOffset: number, byteLengt

		const outputArraySize = wm.utf8DecodeToUint16Array(outputPtr, inputPtr, byteLength);
		const units = new Uint16Array(wm.memory.buffer, outputPtr, outputArraySize);
	return safeStringFromCharCode(units);
	return safeStringFromCharCodeU16(units);
		} finally {
		wm.free(inputPtr);
		wm.free(outputPtr);
Expand Down

6 changes: 3 additions & 3 deletions test/msgpack-test-suite.test.ts

Show comments View file Open in desktop

Original file line number	Diff line number	Diff line change
Expand Up		@@ -89,12 +89,12 @@ describe("msgpack-test-suite", () => {
		FLOAT64_NEGATIVE_INF: Number.NEGATIVE_INFINITY,
		FLOAT64_NAN: Number.NaN,
		STR16: "a".repeat(0x100),
	STR16_MBS: "🌏".repeat(0x100),
		STR32: "b".repeat(0x10_000),
	STR32_MBS: "🍣".repeat(0x10_000),
		STR32LARGE: "c".repeat(0x100_000), // may cause "RangeError: Maximum call stack size exceeded" in simple implementations
	STR_INCLUDING_NUL: "foo\0bar",
	STR_INCLUDING_NUL: "foo\0bar\0",
		STR_BROKEN_FF: "\xff",
	STR_LONE_SURROGATE_1: "\ud800",
	STR_LONE_SURROGATE_2: "\udbff",
		BIN16: new Uint8Array(0x100).fill(0xff),
		BIN32: new Uint8Array(0x10000).fill(0xff),
		ARRAY16: new Array<boolean>(0x100).fill(true),
Expand Down

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

improve: more smart conditions of string decoding functions #39

Uh oh!

improve: more smart conditions of string decoding functions #39

Filter by extension

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

@gfx gfx May 29, 2019

Choose a reason for hiding this comment

Uh oh!