From 6f504b71ac4693f5b7ad33cd6a1490d82b9a9955 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Thu, 4 Apr 2024 18:03:17 -0400 Subject: [PATCH] buffer: use simdutf for `atob` implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Daniel Lemire PR-URL: https://github.com/nodejs/node/pull/52381 Refs: https://github.com/nodejs/node/pull/51670 Reviewed-By: Daniel Lemire Reviewed-By: Vinícius Lourenço Claro Cardoso Reviewed-By: Matteo Collina Reviewed-By: Robert Nagy Reviewed-By: Benjamin Gruenbaum Reviewed-By: Filip Skokan --- benchmark/buffers/buffer-atob.js | 20 ++++++++ lib/buffer.js | 86 +++++--------------------------- src/node_buffer.cc | 60 ++++++++++++++++++++++ 3 files changed, 93 insertions(+), 73 deletions(-) create mode 100644 benchmark/buffers/buffer-atob.js diff --git a/benchmark/buffers/buffer-atob.js b/benchmark/buffers/buffer-atob.js new file mode 100644 index 00000000000000..2cc20759e3f0f6 --- /dev/null +++ b/benchmark/buffers/buffer-atob.js @@ -0,0 +1,20 @@ +'use strict'; +const common = require('../common.js'); +const assert = require('node:assert'); + +const bench = common.createBenchmark(main, { + size: [16, 32, 64, 128], + n: [1e6], +}); + +function main({ n, size }) { + const input = btoa('A'.repeat(size)); + let out = 0; + + bench.start(); + for (let i = 0; i < n; i++) { + out += atob(input).length; + } + bench.end(n); + assert(out > 0); +} diff --git a/lib/buffer.js b/lib/buffer.js index a8d07342e15eaa..ea94ebf24192f9 100644 --- a/lib/buffer.js +++ b/lib/buffer.js @@ -23,10 +23,8 @@ const { Array, - ArrayFrom, ArrayIsArray, ArrayPrototypeForEach, - ArrayPrototypeIndexOf, MathFloor, MathMin, MathTrunc, @@ -70,6 +68,7 @@ const { swap64: _swap64, kMaxLength, kStringMaxLength, + atob: _atob, } = internalBinding('buffer'); const { constants: { @@ -1259,85 +1258,26 @@ function btoa(input) { return buf.toString('base64'); } -// Refs: https://infra.spec.whatwg.org/#forgiving-base64-decode -const kForgivingBase64AllowedChars = [ - // ASCII whitespace - // Refs: https://infra.spec.whatwg.org/#ascii-whitespace - 0x09, 0x0A, 0x0C, 0x0D, 0x20, - - // Uppercase letters - ...ArrayFrom({ length: 26 }, (_, i) => StringPrototypeCharCodeAt('A') + i), - - // Lowercase letters - ...ArrayFrom({ length: 26 }, (_, i) => StringPrototypeCharCodeAt('a') + i), - - // Decimal digits - ...ArrayFrom({ length: 10 }, (_, i) => StringPrototypeCharCodeAt('0') + i), - - 0x2B, // + - 0x2F, // / - 0x3D, // = -]; -const kEqualSignIndex = ArrayPrototypeIndexOf(kForgivingBase64AllowedChars, - 0x3D); - function atob(input) { - // The implementation here has not been performance optimized in any way and - // should not be. - // Refs: https://github.com/nodejs/node/pull/38433#issuecomment-828426932 if (arguments.length === 0) { throw new ERR_MISSING_ARGS('input'); } - input = `${input}`; - let nonAsciiWhitespaceCharCount = 0; - let equalCharCount = 0; + const result = _atob(`${input}`); - for (let n = 0; n < input.length; n++) { - const index = ArrayPrototypeIndexOf( - kForgivingBase64AllowedChars, - StringPrototypeCharCodeAt(input, n)); - - if (index > 4) { - // The first 5 elements of `kForgivingBase64AllowedChars` are - // ASCII whitespace char codes. - nonAsciiWhitespaceCharCount++; - - if (index === kEqualSignIndex) { - equalCharCount++; - } else if (equalCharCount) { - // The `=` char is only allowed at the end. - throw lazyDOMException('Invalid character', 'InvalidCharacterError'); - } - - if (equalCharCount > 2) { - // Only one more `=` is permitted after the first equal sign. - throw lazyDOMException('Invalid character', 'InvalidCharacterError'); - } - } else if (index === -1) { + switch (result) { + case -2: // Invalid character throw lazyDOMException('Invalid character', 'InvalidCharacterError'); - } - } - - let reminder = nonAsciiWhitespaceCharCount % 4; - - // See #2, #3, #4 - https://infra.spec.whatwg.org/#forgiving-base64 - if (!reminder) { - // Remove all trailing `=` characters and get the new reminder. - reminder = (nonAsciiWhitespaceCharCount - equalCharCount) % 4; - } else if (equalCharCount) { - // `=` should not in the input if there's a reminder. - throw lazyDOMException('Invalid character', 'InvalidCharacterError'); - } - - // See #3 - https://infra.spec.whatwg.org/#forgiving-base64 - if (reminder === 1) { - throw lazyDOMException( - 'The string to be decoded is not correctly encoded.', - 'InvalidCharacterError'); + case -1: // Single character remained + throw lazyDOMException( + 'The string to be decoded is not correctly encoded.', + 'InvalidCharacterError'); + case -3: // Possible overflow + // TODO(@anonrig): Throw correct error in here. + throw lazyDOMException('The input causes overflow.', 'InvalidCharacterError'); + default: + return result; } - - return Buffer.from(input, 'base64').toString('latin1'); } function isUtf8(input) { diff --git a/src/node_buffer.cc b/src/node_buffer.cc index 82e98193ba0fdf..b31beada451bc8 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -67,6 +67,7 @@ using v8::Just; using v8::Local; using v8::Maybe; using v8::MaybeLocal; +using v8::NewStringType; using v8::Nothing; using v8::Number; using v8::Object; @@ -1210,6 +1211,61 @@ void DetachArrayBuffer(const FunctionCallbackInfo& args) { } } +// In case of success, the decoded string is returned. +// In case of error, a negative value is returned: +// * -1 indicates a single character remained, +// * -2 indicates an invalid character, +// * -3 indicates a possible overflow (i.e., more than 2 GB output). +static void Atob(const FunctionCallbackInfo& args) { + CHECK_EQ(args.Length(), 1); + Environment* env = Environment::GetCurrent(args); + THROW_AND_RETURN_IF_NOT_STRING(env, args[0], "argument"); + + Local input = args[0].As(); + MaybeStackBuffer buffer; + simdutf::result result; + + if (input->IsExternalOneByte()) { // 8-bit case + auto ext = input->GetExternalOneByteStringResource(); + size_t expected_length = + simdutf::maximal_binary_length_from_base64(ext->data(), ext->length()); + buffer.AllocateSufficientStorage(expected_length + 1); + buffer.SetLengthAndZeroTerminate(expected_length); + result = simdutf::base64_to_binary( + ext->data(), ext->length(), buffer.out(), simdutf::base64_default); + } else { // 16-bit case + String::Value value(env->isolate(), input); + auto data = reinterpret_cast(*value); + size_t expected_length = + simdutf::maximal_binary_length_from_base64(data, value.length()); + buffer.AllocateSufficientStorage(expected_length + 1); + buffer.SetLengthAndZeroTerminate(expected_length); + result = simdutf::base64_to_binary( + data, value.length(), buffer.out(), simdutf::base64_default); + } + + if (result.error == simdutf::error_code::SUCCESS) { + auto value = + String::NewFromOneByte(env->isolate(), + reinterpret_cast(buffer.out()), + NewStringType::kNormal, + result.count) + .ToLocalChecked(); + return args.GetReturnValue().Set(value); + } + + // Default value is: "possible overflow" + int32_t error_code = -3; + + if (result.error == simdutf::error_code::INVALID_BASE64_CHARACTER) { + error_code = -2; + } else if (result.error == simdutf::error_code::BASE64_INPUT_REMAINDER) { + error_code = -1; + } + + args.GetReturnValue().Set(error_code); +} + namespace { std::pair DecomposeBufferToParts(Local buffer) { @@ -1272,6 +1328,8 @@ void Initialize(Local target, Environment* env = Environment::GetCurrent(context); Isolate* isolate = env->isolate(); + SetMethodNoSideEffect(context, target, "atob", Atob); + SetMethod(context, target, "setBufferPrototype", SetBufferPrototype); SetMethodNoSideEffect(context, target, "createFromString", CreateFromString); @@ -1373,6 +1431,8 @@ void RegisterExternalReferences(ExternalReferenceRegistry* registry) { registry->Register(DetachArrayBuffer); registry->Register(CopyArrayBuffer); + + registry->Register(Atob); } } // namespace Buffer