Skip to content

Commit 495c148

Browse files
anonrigerikcorrylemire
committed
util: improve textencoder encodeInto performance
Co-authored-by: Erik Corry <ecorry@cloudflare.com> Co-authored-by: Daniel Lemire <daniel@lemire.me>
1 parent 340e619 commit 495c148

File tree

1 file changed

+146
-8
lines changed

1 file changed

+146
-8
lines changed

src/encoding_binding.cc

Lines changed: 146 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include "node_external_reference.h"
77
#include "simdutf.h"
88
#include "string_bytes.h"
9+
#include "util.h"
910
#include "v8.h"
1011

1112
#include <cstdint>
@@ -71,6 +72,90 @@ InternalFieldInfoBase* BindingData::Serialize(int index) {
7172
return info;
7273
}
7374

75+
// The following code is adapted from Cloudflare workers.
76+
// Particularly from: https://github.com/cloudflare/workerd/pull/5448
77+
//
78+
// Copyright (c) 2017-2025 Cloudflare, Inc.
79+
// Licensed under the Apache 2.0 license found in the LICENSE file or at:
80+
// https://opensource.org/licenses/Apache-2.0
81+
namespace {
82+
// Size template argument of the MaybeStackBuffer<char16_t, ...> scratch buffer
// that EncodeInto fills with a well-formed copy of the input when the source
// string fails UTF-16 validation.
constexpr int MAX_SIZE_FOR_STACK_ALLOC = 4096;
83+
84+
// Reports whether `lead` followed by `trail` forms a UTF-16 surrogate pair:
// `lead` must be a high surrogate (0xD800..0xDBFF) and `trail` a low
// surrogate (0xDC00..0xDFFF).
constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) {
  if (lead < 0xd800 || lead > 0xdbff) {
    return false;
  }
  return trail >= 0xdc00 && trail <= 0xdfff;
}
87+
88+
// Returns the number of UTF-8 bytes budgeted for a single UTF-16 code unit
// (or a Latin-1 byte widened to 16 bits) considered on its own:
//   U+0000..U+007F -> 1 byte
//   U+0080..U+07FF -> 2 bytes
//   everything else -> 3 bytes
// Surrogate code units also report 3. A complete surrogate pair encodes to
// 4 bytes, so counting each half as 3 is a deliberate over-estimate that the
// caller corrects for when a pair straddles the cut point.
constexpr size_t simpleUtfEncodingLength(uint16_t c) {
  if (c < 0x80) return 1;
  // Two UTF-8 bytes carry 11 payload bits, i.e. code points up to U+07FF.
  if (c < 0x800) return 2;
  return 3;
}
93+
94+
template <typename Char>
95+
size_t findBestFit(const Char* data, size_t length, size_t bufferSize) {
96+
size_t pos = 0;
97+
size_t utf8Accumulated = 0;
98+
constexpr size_t CHUNK = 257;
99+
constexpr bool UTF16 = sizeof(Char) == 2;
100+
constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2;
101+
102+
double expansion = 1.15;
103+
104+
while (pos < length && utf8Accumulated < bufferSize) {
105+
size_t remainingInput = length - pos;
106+
size_t spaceRemaining = bufferSize - utf8Accumulated;
107+
DCHECK_GE(expansion, 1.15);
108+
109+
size_t guaranteedToFit = spaceRemaining / MAX_FACTOR;
110+
if (guaranteedToFit >= remainingInput) {
111+
return length;
112+
}
113+
size_t likelyToFit =
114+
std::min(static_cast<size_t>(spaceRemaining / expansion), CHUNK);
115+
size_t fitEstimate =
116+
std::max(size_t{1}, std::max(guaranteedToFit, likelyToFit));
117+
size_t chunkSize = std::min(remainingInput, fitEstimate);
118+
if (chunkSize == 1) break;
119+
DCHECK_GE(chunkSize, 1);
120+
121+
size_t chunkUtf8Len;
122+
if constexpr (UTF16) {
123+
// TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
124+
// available For now, validate and use utf8_length_from_utf16
125+
chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize);
126+
} else {
127+
chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize);
128+
}
129+
130+
if (utf8Accumulated + chunkUtf8Len > bufferSize) {
131+
DCHECK_GT(chunkSize, guaranteedToFit);
132+
expansion = std::max(expansion * 1.1, (chunkUtf8Len * 1.1) / chunkSize);
133+
} else {
134+
expansion = std::max(1.15, (chunkUtf8Len * 1.1) / chunkSize);
135+
pos += chunkSize;
136+
utf8Accumulated += chunkUtf8Len;
137+
}
138+
}
139+
140+
while (pos < length && utf8Accumulated < bufferSize) {
141+
size_t extra = simpleUtfEncodingLength(data[pos]);
142+
if (utf8Accumulated + extra > bufferSize) break;
143+
pos++;
144+
utf8Accumulated += extra;
145+
}
146+
147+
if (UTF16 && pos != 0 && pos != length &&
148+
isSurrogatePair(data[pos - 1], data[pos])) {
149+
if (utf8Accumulated < bufferSize) {
150+
pos++;
151+
} else {
152+
pos--;
153+
}
154+
}
155+
return pos;
156+
}
157+
} // namespace
158+
74159
void BindingData::Deserialize(Local<Context> context,
75160
Local<Object> holder,
76161
int index,
@@ -101,15 +186,68 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) {
101186
char* write_result = static_cast<char*>(buf->Data()) + dest->ByteOffset();
102187
size_t dest_length = dest->ByteLength();
103188

104-
size_t nchars;
105-
size_t written = source->WriteUtf8V2(isolate,
106-
write_result,
107-
dest_length,
108-
String::WriteFlags::kReplaceInvalidUtf8,
109-
&nchars);
189+
size_t read = 0;
190+
size_t written = 0;
191+
v8::String::ValueView view(isolate, source);
192+
uint32_t length = view.length();
193+
194+
if (view.is_one_byte()) {
195+
auto data = reinterpret_cast<const char*>(view.data8());
196+
simdutf::result result = simdutf::validate_ascii_with_errors(
197+
data, std::min(static_cast<size_t>(length), dest_length));
198+
written = read = result.count;
199+
auto out_addr = write_result;
200+
memcpy(out_addr, data, read);
201+
out_addr += read;
202+
data += read;
203+
length -= read;
204+
dest_length -= read;
205+
if (length != 0 && dest_length != 0) {
206+
size_t rest = findBestFit(data, length, dest_length);
207+
if (rest != 0) {
208+
DCHECK_LE(simdutf::utf8_length_from_latin1(data, rest), dest_length);
209+
written += simdutf::convert_latin1_to_utf8(data, rest, out_addr);
210+
read += rest;
211+
}
212+
}
213+
} else {
214+
auto data = reinterpret_cast<const char16_t*>(view.data16());
215+
216+
// Check if input has unpaired surrogates - if so, convert to well-formed
217+
// first
218+
simdutf::result validation_result =
219+
simdutf::validate_utf16_with_errors(data, length);
220+
221+
if (validation_result.error == simdutf::SUCCESS) {
222+
// Valid UTF-16 - use the fast path
223+
read = findBestFit(data, length, dest_length);
224+
if (read != 0) {
225+
DCHECK_LE(simdutf::utf8_length_from_utf16(data, read), dest_length);
226+
written = simdutf::convert_utf16_to_utf8(data, read, write_result);
227+
}
228+
} else {
229+
// Invalid UTF-16 with unpaired surrogates - convert to well-formed first
230+
// TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
231+
// available
232+
MaybeStackBuffer<char16_t, MAX_SIZE_FOR_STACK_ALLOC> conversion_buffer(
233+
length);
234+
simdutf::to_well_formed_utf16(data, length, conversion_buffer.out());
235+
236+
// Now use findBestFit with the well-formed data
237+
read = findBestFit(conversion_buffer.out(), length, dest_length);
238+
if (read != 0) {
239+
DCHECK_LE(
240+
simdutf::utf8_length_from_utf16(conversion_buffer.out(), read),
241+
dest_length);
242+
written = simdutf::convert_utf16_to_utf8(
243+
conversion_buffer.out(), read, write_result);
244+
}
245+
}
246+
}
247+
DCHECK_LE(written, dest_length);
110248

111-
binding_data->encode_into_results_buffer_[0] = nchars;
112-
binding_data->encode_into_results_buffer_[1] = written;
249+
binding_data->encode_into_results_buffer_[0] = static_cast<double>(read);
250+
binding_data->encode_into_results_buffer_[1] = static_cast<double>(written);
113251
}
114252

115253
// Encode a single string to a UTF-8 Uint8Array (not Buffer).

0 commit comments

Comments
 (0)