Skip to content

Commit 5d85f73

Browse files
committed
improve: smarter selection of string decoding functions
1 parent 7f338b2 commit 5d85f73

File tree

4 files changed

+92
-82
lines changed

4 files changed

+92
-82
lines changed

benchmark/decode-string.ts

Lines changed: 36 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,45 @@
11
/* eslint-disable no-console */
2-
import { utf8Encode, utf8Count, utf8Decode } from "../src/utils/utf8";
2+
import { utf8Encode, utf8Count, utf8DecodeJs, utf8DecodeTD } from "../src/utils/utf8";
33
import { utf8DecodeWasm } from "../src/wasmFunctions";
44

55
// @ts-ignore
66
import Benchmark from "benchmark";
77

8-
const textDecoder = new TextDecoder();
9-
10-
const dataSet = [10, 100, 200, 1_000, 10_000, 100_000].map((n) => {
11-
return "a".repeat(n);
12-
});
13-
14-
for (const str of dataSet) {
15-
const byteLength = utf8Count(str);
16-
const bytes = new Uint8Array(new ArrayBuffer(byteLength));
17-
utf8Encode(str, bytes, 0);
18-
19-
console.log(`\n## string length=${str.length} byteLength=${byteLength}\n`);
20-
21-
const suite = new Benchmark.Suite();
22-
23-
const N = Math.round(100_0000 / str.length);
24-
25-
// use the result to avoid void-context optimizations
26-
let count = 0;
27-
28-
suite.add("utf8Decode", () => {
29-
if (utf8Decode(bytes, 0, byteLength) !== str) {
30-
throw new Error("wrong result!");
31-
}
32-
});
33-
34-
suite.add("utf8DecodeWasm", () => {
35-
if (utf8DecodeWasm(bytes, 0, byteLength) !== str) {
36-
throw new Error("wrong result!");
37-
}
38-
});
39-
40-
suite.add("TextDecoder", () => {
41-
if (textDecoder.decode(bytes.subarray(0, byteLength)) !== str) {
42-
throw new Error("wrong result!");
43-
}
44-
});
45-
suite.on("cycle", (event: any) => {
46-
console.log(String(event.target));
8+
for (const baseStr of ["A", "あ", "🌏"]) {
9+
const dataSet = [10, 100, 200, 1_000, 10_000, 100_000].map((n) => {
10+
return baseStr.repeat(n);
4711
});
4812

49-
suite.run();
13+
for (const str of dataSet) {
14+
const byteLength = utf8Count(str);
15+
const bytes = new Uint8Array(new ArrayBuffer(byteLength));
16+
utf8Encode(str, bytes, 0);
17+
18+
console.log(`\n## string "${baseStr}" x ${str.length} (byteLength=${byteLength})\n`);
19+
20+
const suite = new Benchmark.Suite();
21+
22+
suite.add("utf8DecodeJs", () => {
23+
if (utf8DecodeJs(bytes, 0, byteLength) !== str) {
24+
throw new Error("wrong result!");
25+
}
26+
});
27+
28+
suite.add("utf8DecodeWasm", () => {
29+
if (utf8DecodeWasm(bytes, 0, byteLength) !== str) {
30+
throw new Error("wrong result!");
31+
}
32+
});
33+
34+
suite.add("TextDecoder", () => {
35+
if (utf8DecodeTD(bytes, 0, byteLength) !== str) {
36+
throw new Error("wrong result!");
37+
}
38+
});
39+
suite.on("cycle", (event: any) => {
40+
console.log(String(event.target));
41+
});
42+
43+
suite.run();
44+
}
5045
}

src/Decoder.ts

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import { prettyByte } from "./utils/prettyByte";
22
import { ExtensionCodec } from "./ExtensionCodec";
33
import { getInt64, getUint64 } from "./utils/int";
4-
import { utf8Decode } from "./utils/utf8";
4+
import { utf8DecodeJs, TEXT_DECODER_AVAILABLE, TEXT_DECODER_THRESHOLD, utf8DecodeTD } from "./utils/utf8";
55
import { createDataView, ensureUint8Array } from "./utils/typedArrays";
66
import { WASM_AVAILABLE, WASM_STR_THRESHOLD, utf8DecodeWasm } from "./wasmFunctions";
77

@@ -400,10 +400,14 @@ export class Decoder {
400400
}
401401

402402
const offset = this.pos + headerOffset;
403-
const object =
404-
WASM_AVAILABLE && byteLength > WASM_STR_THRESHOLD
405-
? utf8DecodeWasm(this.bytes, offset, byteLength)
406-
: utf8Decode(this.bytes, offset, byteLength);
403+
let object: string;
404+
if (TEXT_DECODER_AVAILABLE && byteLength > TEXT_DECODER_THRESHOLD) {
405+
object = utf8DecodeTD(this.bytes, offset, byteLength);
406+
} else if (WASM_AVAILABLE && byteLength > WASM_STR_THRESHOLD) {
407+
object = utf8DecodeWasm(this.bytes, offset, byteLength);
408+
} else {
409+
object = utf8DecodeJs(this.bytes, offset, byteLength);
410+
}
407411
this.pos += headerOffset + byteLength;
408412
return object;
409413
}

src/utils/utf8.ts

Lines changed: 29 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -83,49 +83,26 @@ export function utf8Encode(str: string, output: Uint8Array, outputOffset: number
8383

8484
const CHUNK_SIZE = 0x10_000;
8585

86-
export function safeStringFromCharCode(units: Array<number> | Uint16Array) {
87-
if (units.length <= CHUNK_SIZE) {
88-
// `String.fromCharCode.apply()` is faster than `String.fromCharCode(...units)`
89-
// in case `units` is a typed array
90-
return String.fromCharCode.apply(String, units as any);
91-
}
92-
93-
let result = "";
94-
for (let i = 0; i < units.length; i++) {
95-
const chunk = units.slice(i * CHUNK_SIZE, (i + 1) * CHUNK_SIZE);
96-
result += String.fromCharCode.apply(String, chunk as any);
97-
}
98-
return result;
99-
}
100-
101-
const MIN_TEXT_DECODER_STRING_LENGTH = 200;
102-
const defaultEncoding = "utf-8";
103-
const sharedTextDecoder = typeof TextDecoder !== "undefined" ? new TextDecoder(defaultEncoding) : null;
104-
105-
export function utf8Decode(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
86+
export function utf8DecodeJs(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
10687
let offset = inputOffset;
10788
const end = offset + byteLength;
10889

109-
if (sharedTextDecoder !== null && byteLength > MIN_TEXT_DECODER_STRING_LENGTH) {
110-
const stringBytes = bytes.subarray(offset, end);
111-
return sharedTextDecoder.decode(stringBytes);
112-
}
113-
114-
const out: Array<number> = [];
90+
const units: Array<number> = [];
91+
let result = "";
11592
while (offset < end) {
11693
const byte1 = bytes[offset++];
11794
if ((byte1 & 0x80) === 0) {
11895
// 1 byte
119-
out.push(byte1);
96+
units.push(byte1);
12097
} else if ((byte1 & 0xe0) === 0xc0) {
12198
// 2 bytes
12299
const byte2 = bytes[offset++] & 0x3f;
123-
out.push(((byte1 & 0x1f) << 6) | byte2);
100+
units.push(((byte1 & 0x1f) << 6) | byte2);
124101
} else if ((byte1 & 0xf0) === 0xe0) {
125102
// 3 bytes
126103
const byte2 = bytes[offset++] & 0x3f;
127104
const byte3 = bytes[offset++] & 0x3f;
128-
out.push(((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3);
105+
units.push(((byte1 & 0x1f) << 12) | (byte2 << 6) | byte3);
129106
} else if ((byte1 & 0xf8) === 0xf0) {
130107
// 4 bytes
131108
const byte2 = bytes[offset++] & 0x3f;
@@ -134,14 +111,33 @@ export function utf8Decode(bytes: Uint8Array, inputOffset: number, byteLength: n
134111
let unit = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0c) | (byte3 << 0x06) | byte4;
135112
if (unit > 0xffff) {
136113
unit -= 0x10000;
137-
out.push(((unit >>> 10) & 0x3ff) | 0xd800);
114+
units.push(((unit >>> 10) & 0x3ff) | 0xd800);
138115
unit = 0xdc00 | (unit & 0x3ff);
139116
}
140-
out.push(unit);
117+
units.push(unit);
141118
} else {
142-
out.push(byte1);
119+
units.push(byte1);
120+
}
121+
122+
if (units.length - 4 >= CHUNK_SIZE) {
123+
result += String.fromCharCode(...units);
124+
units.length = 0;
143125
}
144126
}
145127

146-
return safeStringFromCharCode(out);
128+
if (units.length > 0) {
129+
result += String.fromCharCode(...units);
130+
}
131+
132+
return result;
133+
}
134+
135+
const sharedTextDecoder = typeof TextDecoder !== "undefined" ? new TextDecoder() : null;
136+
export const TEXT_DECODER_AVAILABLE = process.env.TEXT_DECODER !== "never" && !!sharedTextDecoder;
137+
export const TEXT_DECODER_THRESHOLD = 200;
138+
139+
/**
 * Decodes a byte range into a string with the module-level shared `TextDecoder`.
 *
 * `sharedTextDecoder` is `null` when `TextDecoder` is not defined in the
 * runtime. This function does not check for that, so callers are expected to
 * test `TEXT_DECODER_AVAILABLE` before calling it.
 *
 * @param bytes - Buffer holding the encoded text.
 * @param inputOffset - Index in `bytes` of the first byte to decode.
 * @param byteLength - Number of bytes to decode, starting at `inputOffset`.
 * @returns The decoded string.
 */
export function utf8DecodeTD(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
  const endOffset = inputOffset + byteLength;
  // `subarray` creates a view over the same buffer; no bytes are copied.
  const view = bytes.subarray(inputOffset, endOffset);
  // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
  return sharedTextDecoder!.decode(view);
}

src/wasmFunctions.ts

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
import { safeStringFromCharCode } from "./utils/utf8";
2-
31
// WASM=never - disable WASM functions
42
// WASM=force - force to use WASM functions
53
const WASM: string = process.env.MSGPACK_WASM || process.env.WASM || "";
@@ -63,6 +61,23 @@ export function utf8EncodeWasm(str: string, output: Uint8Array, outputOffset: nu
6361
}
6462
}
6563

64+
const CHUNK_SIZE = 0x10_000;

/**
 * Builds a string from UTF-16 code units, never passing more than
 * `CHUNK_SIZE` arguments to a single `String.fromCharCode` call.
 *
 * Inputs longer than `CHUNK_SIZE` are converted piecewise and concatenated,
 * which keeps each call's argument list bounded.
 *
 * @param units - UTF-16 code units to convert.
 * @returns The string made of `units`, in order; `""` for an empty input.
 */
function safeStringFromCharCodeU16(units: Uint16Array): string {
  if (units.length <= CHUNK_SIZE) {
    // `String.fromCharCode.apply()` is faster than `String.fromCharCode(...units)`
    // in case `units` is a typed array
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    return String.fromCharCode.apply(String, units as any);
  }

  let result = "";
  // Advance by whole chunks: one `fromCharCode` call per CHUNK_SIZE code units.
  for (let start = 0; start < units.length; start += CHUNK_SIZE) {
    // `subarray` clamps the end index, so the last chunk may be shorter.
    const chunk = units.subarray(start, start + CHUNK_SIZE);
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    result += String.fromCharCode.apply(String, chunk as any);
  }
  return result;
}
80+
6681
// A wrapper function for utf8DecodeToUint16Array()
6782
export function utf8DecodeWasm(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
6883
const inputPtr: pointer = wm.malloc(byteLength);
@@ -73,7 +88,7 @@ export function utf8DecodeWasm(bytes: Uint8Array, inputOffset: number, byteLengt
7388

7489
const outputArraySize = wm.utf8DecodeToUint16Array(outputPtr, inputPtr, byteLength);
7590
const units = new Uint16Array(wm.memory.buffer, outputPtr, outputArraySize);
76-
return safeStringFromCharCode(units);
91+
return safeStringFromCharCodeU16(units);
7792
} finally {
7893
wm.free(inputPtr);
7994
wm.free(outputPtr);

0 commit comments

Comments
 (0)