diff --git a/src/encoding_binding.cc b/src/encoding_binding.cc
index 877ae8a18f6b8f..88d49d7fcd134d 100644
--- a/src/encoding_binding.cc
+++ b/src/encoding_binding.cc
@@ -6,8 +6,10 @@
 #include "node_external_reference.h"
 #include "simdutf.h"
 #include "string_bytes.h"
+#include "util.h"
 #include "v8.h"
 
+#include <algorithm>
 #include <cstdint>
 
 namespace node {
@@ -71,6 +73,124 @@ InternalFieldInfoBase* BindingData::Serialize(int index) {
   return info;
 }
 
+// The following code is adapted from Cloudflare workers.
+// Particularly from: https://github.com/cloudflare/workerd/pull/5448
+//
+// Copyright (c) 2017-2025 Cloudflare, Inc.
+// Licensed under the Apache 2.0 license found in the LICENSE file or at:
+//     https://opensource.org/licenses/Apache-2.0
+namespace {
+// Capacity argument for the MaybeStackBuffer that EncodeInto() uses when it
+// has to rewrite ill-formed UTF-16 before transcoding.
+constexpr int MAX_SIZE_FOR_STACK_ALLOC = 4096;
+
+// Returns true when `lead` is a UTF-16 high surrogate (0xD800-0xDBFF) and
+// `trail` is a low surrogate (0xDC00-0xDFFF), i.e. they form a valid pair.
+constexpr bool isSurrogatePair(uint16_t lead, uint16_t trail) {
+  return (lead & 0xfc00) == 0xd800 && (trail & 0xfc00) == 0xdc00;
+}
+
+// Returns the number of UTF-8 bytes a single UTF-16 code unit `c` is counted
+// as: 1 below 0x80, 2 below 0x800, otherwise 3. Each half of a surrogate pair
+// counts as 3; findBestFit() handles a pair that straddles its cut-off.
+constexpr size_t simpleUtfEncodingLength(uint16_t c) {
+  if (c < 0x80) return 1;
+  if (c < 0x800) return 2;
+  return 3;
+}
+
+// Finds the maximum number of input characters (UTF-16 or Latin1) that can be
+// encoded into a UTF-8 buffer of the given size.
+//
+// The challenge is that UTF-8 encoding expands characters by variable amounts:
+// - ASCII (< 0x80): 1 byte
+// - Code points < 0x800: 2 bytes
+// - Other BMP characters: 3 bytes
+// - Surrogate pairs (supplementary planes): 4 bytes total
+//
+// This function uses an adaptive chunking algorithm:
+// 1. Process the input in chunks, estimating how many characters will fit
+// 2. Calculate the actual UTF-8 length for each chunk using simdutf
+// 3. Adjust the expansion factor based on observed encoding ratios
+// 4. Fall back to character-by-character processing near the buffer boundary
+// 5. Handle UTF-16 surrogate pairs to avoid splitting them across boundaries
+//
+// The algorithm starts with a conservative expansion estimate (1.15x) and
+// dynamically adjusts based on actual character distribution, making it
+// efficient for common ASCII-heavy text while remaining correct for
+// multi-byte heavy content.
+//
+// `data` and `length` describe the input code units (Latin1 when Char is one
+// byte wide, UTF-16 when it is two) and `bufferSize` is the UTF-8 capacity in
+// bytes. Returns how many leading code units of `data` should be converted.
+template <typename Char>
+size_t findBestFit(const Char* data, size_t length, size_t bufferSize) {
+  size_t pos = 0;
+  size_t utf8Accumulated = 0;
+  constexpr size_t CHUNK = 257;
+  constexpr bool UTF16 = sizeof(Char) == 2;
+  constexpr size_t MAX_FACTOR = UTF16 ? 3 : 2;
+
+  double expansion = 1.15;
+
+  while (pos < length && utf8Accumulated < bufferSize) {
+    size_t remainingInput = length - pos;
+    size_t spaceRemaining = bufferSize - utf8Accumulated;
+    DCHECK_GE(expansion, 1.15);
+
+    size_t guaranteedToFit = spaceRemaining / MAX_FACTOR;
+    if (guaranteedToFit >= remainingInput) {
+      return length;
+    }
+    size_t likelyToFit =
+        std::min(static_cast<size_t>(spaceRemaining / expansion), CHUNK);
+    size_t fitEstimate =
+        std::max(size_t{1}, std::max(guaranteedToFit, likelyToFit));
+    size_t chunkSize = std::min(remainingInput, fitEstimate);
+    if (chunkSize == 1) break;
+    CHECK_GT(chunkSize, 1);
+
+    size_t chunkUtf8Len;
+    if constexpr (UTF16) {
+      // TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
+      // available. For now, validate and use utf8_length_from_utf16.
+      size_t newPos = pos + chunkSize;
+      if (newPos < length && isSurrogatePair(data[newPos - 1], data[newPos]))
+        chunkSize--;
+      chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize);
+    } else {
+      chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize);
+    }
+
+    if (utf8Accumulated + chunkUtf8Len > bufferSize) {
+      DCHECK_GT(chunkSize, guaranteedToFit);
+      expansion = std::max(expansion * 1.1, (chunkUtf8Len * 1.1) / chunkSize);
+    } else {
+      expansion = std::max(1.15, (chunkUtf8Len * 1.1) / chunkSize);
+      pos += chunkSize;
+      utf8Accumulated += chunkUtf8Len;
+    }
+  }
+
+  while (pos < length && utf8Accumulated < bufferSize) {
+    size_t extra = simpleUtfEncodingLength(data[pos]);
+    if (utf8Accumulated + extra > bufferSize) break;
+    pos++;
+    utf8Accumulated += extra;
+  }
+
+  if (UTF16 && pos != 0 && pos != length &&
+      isSurrogatePair(data[pos - 1], data[pos])) {
+    if (utf8Accumulated < bufferSize) {
+      pos++;
+    } else {
+      pos--;
+    }
+  }
+  return pos;
+}
+}  // namespace
+
 void BindingData::Deserialize(Local<Context> context,
                               Local<Object> holder,
                               int index,
@@ -98,18 +218,104 @@ void BindingData::EncodeInto(const FunctionCallbackInfo<Value>& args) {
 
   Local<Uint8Array> dest = args[1].As<Uint8Array>();
   Local<ArrayBuffer> buf = dest->Buffer();
+
+  // Handle detached buffers - return {read: 0, written: 0}
+  if (buf->Data() == nullptr) {
+    binding_data->encode_into_results_buffer_[0] = 0;
+    binding_data->encode_into_results_buffer_[1] = 0;
+    return;
+  }
+
   char* write_result = static_cast<char*>(buf->Data()) + dest->ByteOffset();
   size_t dest_length = dest->ByteLength();
+  size_t read = 0;
+  size_t written = 0;
+
+  // For small strings (length <= 32), use the old V8 path for better
+  // performance
+  static constexpr int kSmallStringThreshold = 32;
+  if (source->Length() <= kSmallStringThreshold) {
+    written = source->WriteUtf8V2(isolate,
+                                  write_result,
+                                  dest_length,
+                                  String::WriteFlags::kReplaceInvalidUtf8,
+                                  &read);
+    binding_data->encode_into_results_buffer_[0] = static_cast<double>(read);
+    binding_data->encode_into_results_buffer_[1] = static_cast<double>(written);
+    return;
+  }
 
-  size_t nchars;
-  size_t written = source->WriteUtf8V2(isolate,
-                                       write_result,
-                                       dest_length,
-                                       String::WriteFlags::kReplaceInvalidUtf8,
-                                       &nchars);
+  v8::String::ValueView view(isolate, source);
+  size_t length_that_fits =
+      std::min(static_cast<size_t>(view.length()), dest_length);
+
+  if (view.is_one_byte()) {
+    // One-byte (Latin1) input: the leading ASCII run is already valid UTF-8
+    // and is copied verbatim; only the remainder has to be transcoded.
+    auto data = reinterpret_cast<const char*>(view.data8());
+    simdutf::result result =
+        simdutf::validate_ascii_with_errors(data, length_that_fits);
+    written = read = result.count;
+    memcpy(write_result, data, read);
+    write_result += read;
+    data += read;
+    length_that_fits -= read;
+    dest_length -= read;
+    if (length_that_fits != 0 && dest_length != 0) {
+      if (size_t rest = findBestFit(data, length_that_fits, dest_length)) {
+        DCHECK_LE(simdutf::utf8_length_from_latin1(data, rest), dest_length);
+        written += simdutf::convert_latin1_to_utf8(data, rest, write_result);
+        read += rest;
+      }
+    }
+  } else {
+    auto data = reinterpret_cast<const char16_t*>(view.data16());
+
+    // Limit conversion to what could fit in destination, avoiding splitting
+    // a valid surrogate pair at the boundary, which could cause a spurious call
+    // of simdutf::to_well_formed_utf16()
+    if (length_that_fits > 0 && length_that_fits < view.length() &&
+        isSurrogatePair(data[length_that_fits - 1], data[length_that_fits])) {
+      length_that_fits--;
+    }
+
+    // Check if input has unpaired surrogates - if so, convert to well-formed
+    // first
+    simdutf::result validation_result =
+        simdutf::validate_utf16_with_errors(data, length_that_fits);
+
+    if (validation_result.error == simdutf::SUCCESS) {
+      // Valid UTF-16 - use the fast path
+      read = findBestFit(data, length_that_fits, dest_length);
+      if (read != 0) {
+        DCHECK_LE(simdutf::utf8_length_from_utf16(data, read), dest_length);
+        written = simdutf::convert_utf16_to_utf8(data, read, write_result);
+      }
+    } else {
+      // Invalid UTF-16 with unpaired surrogates - convert to well-formed first
+      // TODO(anonrig): Use utf8_length_from_utf16_with_replacement when
+      // available
+      MaybeStackBuffer<char16_t, MAX_SIZE_FOR_STACK_ALLOC> conversion_buffer(
+          length_that_fits);
+      simdutf::to_well_formed_utf16(
+          data, length_that_fits, conversion_buffer.out());
+
+      // Now use findBestFit with the well-formed data
+      read =
+          findBestFit(conversion_buffer.out(), length_that_fits, dest_length);
+      if (read != 0) {
+        DCHECK_LE(
+            simdutf::utf8_length_from_utf16(conversion_buffer.out(), read),
+            dest_length);
+        written = simdutf::convert_utf16_to_utf8(
+            conversion_buffer.out(), read, write_result);
+      }
+    }
+  }
+  DCHECK_LE(written, dest->ByteLength());
 
-  binding_data->encode_into_results_buffer_[0] = nchars;
-  binding_data->encode_into_results_buffer_[1] = written;
+  binding_data->encode_into_results_buffer_[0] = static_cast<double>(read);
+  binding_data->encode_into_results_buffer_[1] = static_cast<double>(written);
 }
 
 // Encode a single string to a UTF-8 Uint8Array (not Buffer).