diff options
author | Yagiz Nizipli <yagiz@nizipli.com> | 2022-12-24 21:32:05 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-12-25 02:32:05 +0000 |
commit | d5a08c7e11d66067c80a5a448e6e09cf599ef85c (patch) | |
tree | 98b17c572139adf26da98b82309b6b0030814318 | |
parent | 07fdbbd015583f25149f649ac007e8fac104ba08 (diff) | |
download | node-new-d5a08c7e11d66067c80a5a448e6e09cf599ef85c.tar.gz |
buffer: add buffer.isUtf8 for utf8 validation
PR-URL: https://github.com/nodejs/node/pull/45947
Reviewed-By: Robert Nagy <ronagy@icloud.com>
Reviewed-By: Matteo Collina <matteo.collina@gmail.com>
Reviewed-By: Luigi Pinca <luigipinca@gmail.com>
Reviewed-By: Rafael Gonzaga <rafael.nunu@hotmail.com>
Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl>
Reviewed-By: Anna Henningsen <anna@addaleax.net>
-rw-r--r-- | doc/api/buffer.md | 11 | ||||
-rw-r--r-- | lib/buffer.js | 13 | ||||
-rw-r--r-- | src/node_buffer.cc | 18 | ||||
-rw-r--r-- | src/node_errors.h | 1 | ||||
-rw-r--r-- | src/util-inl.h | 1 | ||||
-rw-r--r-- | src/util.h | 2 | ||||
-rw-r--r-- | test/parallel/test-buffer-isutf8.js | 86 |
7 files changed, 131 insertions, 1 deletions
diff --git a/doc/api/buffer.md b/doc/api/buffer.md index 978c26f734..3ed750fce8 100644 --- a/doc/api/buffer.md +++ b/doc/api/buffer.md @@ -5130,6 +5130,17 @@ For code running using Node.js APIs, converting between base64-encoded strings and binary data should be performed using `Buffer.from(str, 'base64')` and `buf.toString('base64')`.** +### `buffer.isUtf8(input)` + +<!-- YAML +added: REPLACEME +--> + +* input {Buffer | ArrayBuffer | TypedArray} The input to validate. +* Returns: {boolean} Returns `true` if and only if the input is valid UTF-8. + +This function is used to check if input contains UTF-8 code points (characters). + ### `buffer.INSPECT_MAX_BYTES` <!-- YAML diff --git a/lib/buffer.js b/lib/buffer.js index 1ffde766b5..65709ca5ed 100644 --- a/lib/buffer.js +++ b/lib/buffer.js @@ -57,6 +57,7 @@ const { compareOffset, createFromString, fill: bindingFill, + isUtf8: bindingIsUtf8, indexOfBuffer, indexOfNumber, indexOfString, @@ -84,7 +85,8 @@ const { const { isAnyArrayBuffer, isArrayBufferView, - isUint8Array + isUint8Array, + isTypedArray, } = require('internal/util/types'); const { inspect: utilInspect @@ -1314,10 +1316,19 @@ function atob(input) { return Buffer.from(input, 'base64').toString('latin1'); } +function isUtf8(input) { + if (isTypedArray(input) || isAnyArrayBuffer(input)) { + return bindingIsUtf8(input); + } + + throw new ERR_INVALID_ARG_TYPE('input', ['TypedArray', 'Buffer'], input); +} + module.exports = { Buffer, SlowBuffer, transcode, + isUtf8, // Legacy kMaxLength, diff --git a/src/node_buffer.cc b/src/node_buffer.cc index 4a0ffbbca3..f7b008af36 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -1223,6 +1223,20 @@ static void EncodeInto(const FunctionCallbackInfo<Value>& args) { results[1] = written; } +static void IsUtf8(const FunctionCallbackInfo<Value>& args) { + Environment* env = Environment::GetCurrent(args); + CHECK_EQ(args.Length(), 1); + CHECK(args[0]->IsTypedArray() || args[0]->IsArrayBuffer() || + args[0]->IsSharedArrayBuffer()); + ArrayBufferViewContents<char> abv(args[0]); + + if (abv.WasDetached()) { + return node::THROW_ERR_INVALID_STATE( + env, "Cannot validate on a detached buffer"); + } + + args.GetReturnValue().Set(simdutf::validate_utf8(abv.data(), abv.length())); +} void SetBufferPrototype(const FunctionCallbackInfo<Value>& args) { Environment* env = Environment::GetCurrent(args); @@ -1358,6 +1372,8 @@ void Initialize(Local<Object> target, SetMethod(context, target, "encodeInto", EncodeInto); SetMethodNoSideEffect(context, target, "encodeUtf8String", EncodeUtf8String); + SetMethodNoSideEffect(context, target, "isUtf8", IsUtf8); + target ->Set(context, FIXED_ONE_BYTE_STRING(isolate, "kMaxLength"), @@ -1413,6 +1429,8 @@ void RegisterExternalReferences(ExternalReferenceRegistry* registry) { registry->Register(EncodeInto); registry->Register(EncodeUtf8String); + registry->Register(IsUtf8); + registry->Register(StringSlice<ASCII>); registry->Register(StringSlice<BASE64>); registry->Register(StringSlice<BASE64URL>); diff --git a/src/node_errors.h b/src/node_errors.h index 706464acc8..3f17a80a62 100644 --- a/src/node_errors.h +++ b/src/node_errors.h @@ -68,6 +68,7 @@ void OOMErrorHandler(const char* location, const v8::OOMDetails& details); V(ERR_INVALID_ARG_TYPE, TypeError) \ V(ERR_INVALID_OBJECT_DEFINE_PROPERTY, TypeError) \ V(ERR_INVALID_MODULE, Error) \ + V(ERR_INVALID_STATE, Error) \ V(ERR_INVALID_THIS, TypeError) \ V(ERR_INVALID_TRANSFER_OBJECT, TypeError) \ V(ERR_MEMORY_ALLOCATION_FAILED, Error) \ diff --git a/src/util-inl.h b/src/util-inl.h index f98bb16aa7..833082291a 100644 --- a/src/util-inl.h +++ b/src/util-inl.h @@ -555,6 +555,7 @@ void ArrayBufferViewContents<T, S>::ReadValue(v8::Local<v8::Value> buf) { auto ab = buf.As<v8::ArrayBuffer>(); length_ = ab->ByteLength(); data_ = static_cast<T*>(ab->Data()); + was_detached_ = ab->WasDetached(); } else { CHECK(buf->IsSharedArrayBuffer()); auto sab = buf.As<v8::SharedArrayBuffer>(); diff --git a/src/util.h b/src/util.h index 399018655e..7a3885ec8f 100644 --- a/src/util.h +++ b/src/util.h @@ -511,6 +511,7 @@ class ArrayBufferViewContents { inline void Read(v8::Local<v8::ArrayBufferView> abv); inline void ReadValue(v8::Local<v8::Value> buf); + inline bool WasDetached() const { return was_detached_; } inline const T* data() const { return data_; } inline size_t length() const { return length_; } @@ -525,6 +526,7 @@ class ArrayBufferViewContents { T stack_storage_[kStackStorageSize]; T* data_ = nullptr; size_t length_ = 0; + bool was_detached_ = false; }; class Utf8Value : public MaybeStackBuffer<char> { diff --git a/test/parallel/test-buffer-isutf8.js b/test/parallel/test-buffer-isutf8.js new file mode 100644 index 0000000000..204db3e6a5 --- /dev/null +++ b/test/parallel/test-buffer-isutf8.js @@ -0,0 +1,86 @@ +'use strict'; + +require('../common'); +const assert = require('assert'); +const { isUtf8, Buffer } = require('buffer'); +const { TextEncoder } = require('util'); + +const encoder = new TextEncoder(); + +assert.strictEqual(isUtf8(encoder.encode('hello')), true); +assert.strictEqual(isUtf8(encoder.encode('ğ')), true); +assert.strictEqual(isUtf8(Buffer.from([])), true); + +// Taken from test/fixtures/wpt/encoding/textdecoder-fatal.any.js +[ + [0xFF], // 'invalid code' + [0xC0], // 'ends early' + [0xE0], // 'ends early 2' + [0xC0, 0x00], // 'invalid trail' + [0xC0, 0xC0], // 'invalid trail 2' + [0xE0, 0x00], // 'invalid trail 3' + [0xE0, 0xC0], // 'invalid trail 4' + [0xE0, 0x80, 0x00], // 'invalid trail 5' + [0xE0, 0x80, 0xC0], // 'invalid trail 6' + [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // '> 0x10FFFF' + [0xFE, 0x80, 0x80, 0x80, 0x80, 0x80], // 'obsolete lead byte' + + // Overlong encodings + [0xC0, 0x80], // 'overlong U+0000 - 2 bytes' + [0xE0, 0x80, 0x80], // 'overlong U+0000 - 3 bytes' + [0xF0, 0x80, 0x80, 0x80], // 'overlong U+0000 - 4 bytes' + [0xF8, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 5 bytes' + [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 6 bytes' + + [0xC1, 0xBF], // 'overlong U+007F - 2 bytes' + [0xE0, 0x81, 0xBF], // 'overlong U+007F - 3 bytes' + [0xF0, 0x80, 0x81, 0xBF], // 'overlong U+007F - 4 bytes' + [0xF8, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 5 bytes' + [0xFC, 0x80, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 6 bytes' + + [0xE0, 0x9F, 0xBF], // 'overlong U+07FF - 3 bytes' + [0xF0, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 4 bytes' + [0xF8, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 5 bytes' + [0xFC, 0x80, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 6 bytes' + + [0xF0, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 4 bytes' + [0xF8, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 5 bytes' + [0xFC, 0x80, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 6 bytes' + + [0xF8, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 5 bytes' + [0xFC, 0x80, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 6 bytes' + + // UTF-16 surrogates encoded as code points in UTF-8 + [0xED, 0xA0, 0x80], // 'lead surrogate' + [0xED, 0xB0, 0x80], // 'trail surrogate' + [0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80], // 'surrogate pair' +].forEach((input) => { + assert.strictEqual(isUtf8(Buffer.from(input)), false); +}); + +[ + null, + undefined, + 'hello', + true, + false, +].forEach((input) => { + assert.throws( + () => { isUtf8(input); }, + { + code: 'ERR_INVALID_ARG_TYPE', + }, + ); +}); + +{ + // Test with detached array buffers + const arrayBuffer = new ArrayBuffer(1024); + structuredClone(arrayBuffer, { transfer: [arrayBuffer] }); + assert.throws( + () => { isUtf8(arrayBuffer); }, + { + code: 'ERR_INVALID_STATE' + } + ); +} |