summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYagiz Nizipli <yagiz@nizipli.com>2022-12-24 21:32:05 -0500
committerGitHub <noreply@github.com>2022-12-25 02:32:05 +0000
commitd5a08c7e11d66067c80a5a448e6e09cf599ef85c (patch)
tree98b17c572139adf26da98b82309b6b0030814318
parent07fdbbd015583f25149f649ac007e8fac104ba08 (diff)
downloadnode-new-d5a08c7e11d66067c80a5a448e6e09cf599ef85c.tar.gz
buffer: add buffer.isUtf8 for utf8 validation
PR-URL: https://github.com/nodejs/node/pull/45947 Reviewed-By: Robert Nagy <ronagy@icloud.com> Reviewed-By: Matteo Collina <matteo.collina@gmail.com> Reviewed-By: Luigi Pinca <luigipinca@gmail.com> Reviewed-By: Rafael Gonzaga <rafael.nunu@hotmail.com> Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl> Reviewed-By: Anna Henningsen <anna@addaleax.net>
-rw-r--r--doc/api/buffer.md11
-rw-r--r--lib/buffer.js13
-rw-r--r--src/node_buffer.cc18
-rw-r--r--src/node_errors.h1
-rw-r--r--src/util-inl.h1
-rw-r--r--src/util.h2
-rw-r--r--test/parallel/test-buffer-isutf8.js86
7 files changed, 131 insertions, 1 deletions
diff --git a/doc/api/buffer.md b/doc/api/buffer.md
index 978c26f734..3ed750fce8 100644
--- a/doc/api/buffer.md
+++ b/doc/api/buffer.md
@@ -5130,6 +5130,17 @@ For code running using Node.js APIs, converting between base64-encoded strings
and binary data should be performed using `Buffer.from(str, 'base64')` and
`buf.toString('base64')`.**
+### `buffer.isUtf8(input)`
+
+<!-- YAML
+added: REPLACEME
+-->
+
+* input {Buffer | ArrayBuffer | TypedArray} The input to validate.
+* Returns: {boolean} Returns `true` if and only if the input is valid UTF-8.
+
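+This function returns `true` if `input` contains only valid UTF-8-encoded data,
+including the case in which `input` is empty.
+
+Throws if `input` is not one of the accepted types or is a detached
+`ArrayBuffer`.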
+
### `buffer.INSPECT_MAX_BYTES`
<!-- YAML
diff --git a/lib/buffer.js b/lib/buffer.js
index 1ffde766b5..65709ca5ed 100644
--- a/lib/buffer.js
+++ b/lib/buffer.js
@@ -57,6 +57,7 @@ const {
compareOffset,
createFromString,
fill: bindingFill,
+ isUtf8: bindingIsUtf8,
indexOfBuffer,
indexOfNumber,
indexOfString,
@@ -84,7 +85,8 @@ const {
const {
isAnyArrayBuffer,
isArrayBufferView,
- isUint8Array
+ isUint8Array,
+ isTypedArray,
} = require('internal/util/types');
const {
inspect: utilInspect
@@ -1314,10 +1316,19 @@ function atob(input) {
return Buffer.from(input, 'base64').toString('latin1');
}
+function isUtf8(input) {
+ if (isTypedArray(input) || isAnyArrayBuffer(input)) {
+ return bindingIsUtf8(input);
+ }
+
+ throw new ERR_INVALID_ARG_TYPE('input', ['TypedArray', 'Buffer'], input);
+}
+
module.exports = {
Buffer,
SlowBuffer,
transcode,
+ isUtf8,
// Legacy
kMaxLength,
diff --git a/src/node_buffer.cc b/src/node_buffer.cc
index 4a0ffbbca3..f7b008af36 100644
--- a/src/node_buffer.cc
+++ b/src/node_buffer.cc
@@ -1223,6 +1223,20 @@ static void EncodeInto(const FunctionCallbackInfo<Value>& args) {
results[1] = written;
}
+// Binding behind `isUtf8`: sets the return value to the result of
+// simdutf::validate_utf8() over the bytes of args[0].
+// Throws ERR_INVALID_STATE when the contents report a detached buffer.
+static void IsUtf8(const FunctionCallbackInfo<Value>& args) {
+  Environment* env = Environment::GetCurrent(args);
+  CHECK_EQ(args.Length(), 1);
+
+  Local<Value> input = args[0];
+  CHECK(input->IsTypedArray() || input->IsArrayBuffer() ||
+        input->IsSharedArrayBuffer());
+
+  ArrayBufferViewContents<char> contents(input);
+  if (contents.WasDetached()) {
+    return node::THROW_ERR_INVALID_STATE(
+        env, "Cannot validate on a detached buffer");
+  }
+
+  const bool is_valid =
+      simdutf::validate_utf8(contents.data(), contents.length());
+  args.GetReturnValue().Set(is_valid);
+}
void SetBufferPrototype(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
@@ -1358,6 +1372,8 @@ void Initialize(Local<Object> target,
SetMethod(context, target, "encodeInto", EncodeInto);
SetMethodNoSideEffect(context, target, "encodeUtf8String", EncodeUtf8String);
+ SetMethodNoSideEffect(context, target, "isUtf8", IsUtf8);
+
target
->Set(context,
FIXED_ONE_BYTE_STRING(isolate, "kMaxLength"),
@@ -1413,6 +1429,8 @@ void RegisterExternalReferences(ExternalReferenceRegistry* registry) {
registry->Register(EncodeInto);
registry->Register(EncodeUtf8String);
+ registry->Register(IsUtf8);
+
registry->Register(StringSlice<ASCII>);
registry->Register(StringSlice<BASE64>);
registry->Register(StringSlice<BASE64URL>);
diff --git a/src/node_errors.h b/src/node_errors.h
index 706464acc8..3f17a80a62 100644
--- a/src/node_errors.h
+++ b/src/node_errors.h
@@ -68,6 +68,7 @@ void OOMErrorHandler(const char* location, const v8::OOMDetails& details);
V(ERR_INVALID_ARG_TYPE, TypeError) \
V(ERR_INVALID_OBJECT_DEFINE_PROPERTY, TypeError) \
V(ERR_INVALID_MODULE, Error) \
+ V(ERR_INVALID_STATE, Error) \
V(ERR_INVALID_THIS, TypeError) \
V(ERR_INVALID_TRANSFER_OBJECT, TypeError) \
V(ERR_MEMORY_ALLOCATION_FAILED, Error) \
diff --git a/src/util-inl.h b/src/util-inl.h
index f98bb16aa7..833082291a 100644
--- a/src/util-inl.h
+++ b/src/util-inl.h
@@ -555,6 +555,7 @@ void ArrayBufferViewContents<T, S>::ReadValue(v8::Local<v8::Value> buf) {
auto ab = buf.As<v8::ArrayBuffer>();
length_ = ab->ByteLength();
data_ = static_cast<T*>(ab->Data());
+ was_detached_ = ab->WasDetached();
} else {
CHECK(buf->IsSharedArrayBuffer());
auto sab = buf.As<v8::SharedArrayBuffer>();
diff --git a/src/util.h b/src/util.h
index 399018655e..7a3885ec8f 100644
--- a/src/util.h
+++ b/src/util.h
@@ -511,6 +511,7 @@ class ArrayBufferViewContents {
inline void Read(v8::Local<v8::ArrayBufferView> abv);
inline void ReadValue(v8::Local<v8::Value> buf);
+  // Accessor for the was_detached_ flag.
+  inline bool WasDetached() const {
+    return was_detached_;
+  }
inline const T* data() const { return data_; }
inline size_t length() const { return length_; }
@@ -525,6 +526,7 @@ class ArrayBufferViewContents {
T stack_storage_[kStackStorageSize];
T* data_ = nullptr;
size_t length_ = 0;
+ bool was_detached_ = false;
};
class Utf8Value : public MaybeStackBuffer<char> {
diff --git a/test/parallel/test-buffer-isutf8.js b/test/parallel/test-buffer-isutf8.js
new file mode 100644
index 0000000000..204db3e6a5
--- /dev/null
+++ b/test/parallel/test-buffer-isutf8.js
@@ -0,0 +1,86 @@
+'use strict';
+
+require('../common');
+const assert = require('assert');
+const { isUtf8, Buffer } = require('buffer');
+const { TextEncoder } = require('util');
+
+const encoder = new TextEncoder();
+
+assert.strictEqual(isUtf8(encoder.encode('hello')), true);
+assert.strictEqual(isUtf8(encoder.encode('ğ')), true);
+assert.strictEqual(isUtf8(Buffer.from([])), true);
+
+// Taken from test/fixtures/wpt/encoding/textdecoder-fatal.any.js
+[
+ [0xFF], // 'invalid code'
+ [0xC0], // 'ends early'
+ [0xE0], // 'ends early 2'
+ [0xC0, 0x00], // 'invalid trail'
+ [0xC0, 0xC0], // 'invalid trail 2'
+ [0xE0, 0x00], // 'invalid trail 3'
+ [0xE0, 0xC0], // 'invalid trail 4'
+ [0xE0, 0x80, 0x00], // 'invalid trail 5'
+ [0xE0, 0x80, 0xC0], // 'invalid trail 6'
+ [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // '> 0x10FFFF'
+ [0xFE, 0x80, 0x80, 0x80, 0x80, 0x80], // 'obsolete lead byte'
+
+ // Overlong encodings
+ [0xC0, 0x80], // 'overlong U+0000 - 2 bytes'
+ [0xE0, 0x80, 0x80], // 'overlong U+0000 - 3 bytes'
+ [0xF0, 0x80, 0x80, 0x80], // 'overlong U+0000 - 4 bytes'
+ [0xF8, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 5 bytes'
+ [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 6 bytes'
+
+ [0xC1, 0xBF], // 'overlong U+007F - 2 bytes'
+ [0xE0, 0x81, 0xBF], // 'overlong U+007F - 3 bytes'
+ [0xF0, 0x80, 0x81, 0xBF], // 'overlong U+007F - 4 bytes'
+ [0xF8, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 5 bytes'
+ [0xFC, 0x80, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 6 bytes'
+
+ [0xE0, 0x9F, 0xBF], // 'overlong U+07FF - 3 bytes'
+ [0xF0, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 4 bytes'
+ [0xF8, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 5 bytes'
+ [0xFC, 0x80, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 6 bytes'
+
+ [0xF0, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 4 bytes'
+ [0xF8, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 5 bytes'
+ [0xFC, 0x80, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 6 bytes'
+
+ [0xF8, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 5 bytes'
+ [0xFC, 0x80, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 6 bytes'
+
+ // UTF-16 surrogates encoded as code points in UTF-8
+ [0xED, 0xA0, 0x80], // 'lead surrogate'
+ [0xED, 0xB0, 0x80], // 'trail surrogate'
+ [0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80], // 'surrogate pair'
+].forEach((input) => {
+ assert.strictEqual(isUtf8(Buffer.from(input)), false);
+});
+
+[
+ null,
+ undefined,
+ 'hello',
+ true,
+ false,
+].forEach((input) => {
+ assert.throws(
+ () => { isUtf8(input); },
+ {
+ code: 'ERR_INVALID_ARG_TYPE',
+ },
+ );
+});
+
+{
+ // Test with detached array buffers
+ const arrayBuffer = new ArrayBuffer(1024);
+ structuredClone(arrayBuffer, { transfer: [arrayBuffer] });
+ assert.throws(
+ () => { isUtf8(arrayBuffer); },
+ {
+ code: 'ERR_INVALID_STATE'
+ }
+ );
+}