From 65e06d9cf3b88e9b1d75c357baf5326c28d8876c Mon Sep 17 00:00:00 2001 From: theweipeng Date: Sat, 21 Dec 2024 21:36:22 +0800 Subject: [PATCH] src: detect whether the string is one byte representation or not References: nodejs#56090 --- doc/api/v8.md | 109 ++++++++++++++++++ lib/v8.js | 13 +++ src/node_external_reference.h | 4 + src/node_v8.cc | 28 +++++ ...st-v8-string-is-one-byte-representation.js | 37 ++++++ 5 files changed, 191 insertions(+) create mode 100644 test/parallel/test-v8-string-is-one-byte-representation.js diff --git a/doc/api/v8.md b/doc/api/v8.md index 670283e17f5d80..26b77b3adce39d 100644 --- a/doc/api/v8.md +++ b/doc/api/v8.md @@ -1304,6 +1304,115 @@ setTimeout(() => { }, 1000); ``` +## `v8.isStringOneByteRepresentation(content)` + +<!-- YAML +added: REPLACEME +--> + +* `content` {string} +* Returns: {boolean} + +V8 only supports `Latin-1/ISO-8859-1` and `UTF16` as the underlying representation of a string. +If the `content` uses `Latin-1/ISO-8859-1` as the underlying representation, this function returns `true`; +otherwise, it returns `false`. + +If this method returns `false`, that does not mean that the string contains characters outside `Latin-1/ISO-8859-1`. +A string whose characters all fit in `Latin-1` may still be represented as `UTF16`. + +```js +const { isStringOneByteRepresentation } = require('node:v8'); +const assert = require('node:assert'); + +const Encoding = { + latin1: 1, + utf16le: 2, +}; + +/** + * Read strings from the buffer. + * + * Note that this example ignores flag checks and boundary checks.
+ */ +class Deserializer { + buffer; + cursor; + + constructor(buffer) { + this.buffer = buffer; + this.cursor = 0; + } + + readString() { + const encoding = this.buffer.readUint8(this.cursor++); + const length = this.buffer.readUint32LE(this.cursor); + this.cursor += 4; + if (encoding === Encoding.latin1) { + const result = this.buffer.toString('latin1', this.cursor, this.cursor + length); + this.cursor += length; + return result; + } + const result = this.buffer.toString('utf16le', this.cursor, this.cursor + length); + this.cursor += length; + return result; + } +} + +/** + * Writes strings into a fixed-size buffer. `isStringOneByteRepresentation` + * selects the encoding that matches how V8 already stores the string, so the + * write is meant to cost little more than a memory copy. + * + * Note that this example ignores boundary checks. + */ +class Serializer { + buffer; + cursor; + + constructor() { + this.buffer = Buffer.alloc(100); + this.cursor = 0; + } + + /** + * Step 1: write the encoding flag (one byte) at the cursor. + * Step 2: write the string's byte length as a little-endian uint32. + * Step 3: write the string bytes, then advance the cursor past them. + * @param {string} input The string to append. + */ + writeString(input) { + if (isStringOneByteRepresentation(input)) { + this.buffer.writeUint8(Encoding.latin1, this.cursor++); + this.buffer.writeUint32LE(input.length, this.cursor); + this.cursor += 4; + this.buffer.write(input, this.cursor, 'latin1'); + this.cursor += input.length; + } else { + this.buffer.writeUint8(Encoding.utf16le, this.cursor++); + this.buffer.writeUint32LE(input.length * 2, this.cursor); + this.cursor += 4; + this.buffer.write(input, this.cursor, 'utf16le'); + this.cursor += input.length * 2; + } + } + + finish() { + return this.buffer.subarray(0, this.cursor); + } +} + +// Write strings to the buffer. +const serializer = new Serializer(); +serializer.writeString('hello'); +serializer.writeString('你好'); +const data = serializer.finish(); + +// Read strings from the buffer.
+const deserializer = new Deserializer(data); +assert(deserializer.readString() === 'hello'); +assert(deserializer.readString() === '你好'); +``` + [HTML structured clone algorithm]: https://developer.mozilla.org/en-US/docs/Web/API/Web_Workers_API/Structured_clone_algorithm [Hook Callbacks]: #hook-callbacks [V8]: https://developers.google.com/v8/ diff --git a/lib/v8.js b/lib/v8.js index 7a8979887bab49..381aabfcbfafab 100644 --- a/lib/v8.js +++ b/lib/v8.js @@ -104,6 +104,7 @@ const binding = internalBinding('v8'); const { cachedDataVersionTag, setFlagsFromString: _setFlagsFromString, + isStringOneByteRepresentation: _isStringOneByteRepresentation, updateHeapStatisticsBuffer, updateHeapSpaceStatisticsBuffer, updateHeapCodeStatisticsBuffer, @@ -155,6 +156,17 @@ function setFlagsFromString(flags) { _setFlagsFromString(flags); } +/** + * Returns whether `content` uses V8's one-byte string representation. + * @param {string} content The string to inspect; must be a string. + * @returns {boolean} `true` for one-byte storage, otherwise `false`. + */ +function isStringOneByteRepresentation(content) { + validateString(content, 'content'); + return _isStringOneByteRepresentation(content); +} + + /** * Gets the current V8 heap statistics.
* @returns {{ @@ -439,4 +451,5 @@ module.exports = { startupSnapshot, setHeapSnapshotNearHeapLimit, GCProfiler, + isStringOneByteRepresentation, }; diff --git a/src/node_external_reference.h b/src/node_external_reference.h index 8d49a119c21832..bb007dbdcce486 100644 --- a/src/node_external_reference.h +++ b/src/node_external_reference.h @@ -12,6 +12,9 @@ namespace node { using CFunctionCallbackWithOneByteString = uint32_t (*)(v8::Local, const v8::FastOneByteString&); + +using CFunctionCallbackReturnBool = bool (*)(v8::Local unused, + v8::Local receiver); using CFunctionCallback = void (*)(v8::Local unused, v8::Local receiver); using CFunctionCallbackReturnDouble = @@ -90,6 +93,7 @@ class ExternalReferenceRegistry { #define ALLOWED_EXTERNAL_REFERENCE_TYPES(V) \ V(CFunctionCallback) \ V(CFunctionCallbackWithOneByteString) \ + V(CFunctionCallbackReturnBool) \ V(CFunctionCallbackReturnDouble) \ V(CFunctionCallbackReturnInt32) \ V(CFunctionCallbackValueReturnDouble) \ diff --git a/src/node_v8.cc b/src/node_v8.cc index a7f0ba7973498e..eecf09f048891d 100644 --- a/src/node_v8.cc +++ b/src/node_v8.cc @@ -32,6 +32,7 @@ namespace node { namespace v8_utils { using v8::Array; +using v8::CFunction; using v8::Context; using v8::FunctionCallbackInfo; using v8::FunctionTemplate; @@ -238,6 +239,23 @@ void SetFlagsFromString(const FunctionCallbackInfo& args) { V8::SetFlagsFromString(*flags, static_cast(flags.length())); } +static void IsStringOneByteRepresentation( + const FunctionCallbackInfo& args) { + CHECK_EQ(args.Length(), 1); + CHECK(args[0]->IsString()); // Type is validated in lib/v8.js. + bool is_one_byte = args[0].As()->IsOneByte(); + args.GetReturnValue().Set(is_one_byte); +} + +static bool FastIsStringOneByteRepresentation(Local receiver, + const Local target) { + CHECK(target->IsString()); + return target.As()->IsOneByte(); +} + +static CFunction fast_is_string_one_byte_representation_( + CFunction::Make(FastIsStringOneByteRepresentation)); + static const char* GetGCTypeName(v8::GCType gc_type) { switch
(gc_type) { case v8::GCType::kGCTypeScavenge: @@ -479,6 +497,13 @@ void Initialize(Local target, // Export symbols used by v8.setFlagsFromString() SetMethod(context, target, "setFlagsFromString", SetFlagsFromString); + // Export symbols used by v8.isStringOneByteRepresentation() + SetFastMethodNoSideEffect(context, + target, + "isStringOneByteRepresentation", + IsStringOneByteRepresentation, + &fast_is_string_one_byte_representation_); + // GCProfiler Local t = NewFunctionTemplate(env->isolate(), GCProfiler::New); @@ -498,6 +523,9 @@ void RegisterExternalReferences(ExternalReferenceRegistry* registry) { registry->Register(GCProfiler::New); registry->Register(GCProfiler::Start); registry->Register(GCProfiler::Stop); + registry->Register(IsStringOneByteRepresentation); + registry->Register(FastIsStringOneByteRepresentation); + registry->Register(fast_is_string_one_byte_representation_.GetTypeInfo()); } } // namespace v8_utils diff --git a/test/parallel/test-v8-string-is-one-byte-representation.js b/test/parallel/test-v8-string-is-one-byte-representation.js new file mode 100644 index 00000000000000..0403299c01015f --- /dev/null +++ b/test/parallel/test-v8-string-is-one-byte-representation.js @@ -0,0 +1,37 @@ +// Flags: --expose-internals +'use strict'; +require('../common'); +const assert = require('assert'); +const { isStringOneByteRepresentation } = require('v8'); + +[ + undefined, + null, + false, + 5n, + 5, + Symbol(), + () => {}, + {}, +].forEach((value) => { + assert.throws( + () => { isStringOneByteRepresentation(value); }, + /The "content" argument must be of type string/ + ); +}); + +{ + const latin1String = 'hello world!'; + // Run this inside a for loop to trigger the fast API + for (let i = 0; i < 10_000; i++) { + assert.strictEqual(isStringOneByteRepresentation(latin1String), true); + } +} + +{ + const utf16String = '你好😀😃'; + // Run this inside a for loop to trigger the fast API + for (let i = 0; i < 10_000; i++) { + 
assert.strictEqual(isStringOneByteRepresentation(utf16String), false); + } +}