diff options
author | Felix Geisendörfer <felix@debuggable.com> | 2014-01-20 09:47:19 +0100 |
---|---|---|
committer | Timothy J Fontaine <tjfontaine@gmail.com> | 2014-06-06 15:07:29 -0700 |
commit | 0da4c671659cfbae12def127b2e94690b9d9b5e1 (patch) | |
tree | 14b81692db27486685b07f69303bff39daa790e6 | |
parent | 881ac26f27f4ac9585d66c8d8a67d5b246a23d1b (diff) | |
download | node-new-0da4c671659cfbae12def127b2e94690b9d9b5e1.tar.gz |
string_bytes: Guarantee valid utf-8 output
Previously, V8's WriteUtf8 function would produce invalid UTF-8 output
when encountering unmatched surrogate code units [1]. The new
REPLACE_INVALID_UTF8 option fixes that by replacing unmatched surrogates
with the Unicode replacement character (U+FFFD).
[1]: JS strings are defined as arrays of 16-bit unsigned integers. There
is no Unicode enforcement, so one can easily end up with invalid Unicode
code unit sequences inside a string.
-rw-r--r-- | src/node.cc | 7 | ||||
-rw-r--r-- | src/string_bytes.cc | 2 | ||||
-rw-r--r-- | src/string_bytes.h | 2 | ||||
-rw-r--r-- | test/simple/test-buffer.js | 12 |
4 files changed, 22 insertions, 1 deletions
diff --git a/src/node.cc b/src/node.cc index 8257604d52..5cb202fa2e 100644 --- a/src/node.cc +++ b/src/node.cc @@ -176,6 +176,8 @@ static uv_async_t dispatch_debug_messages_async; // Declared in node_internals.h Isolate* node_isolate = NULL; +int WRITE_UTF8_FLAGS = v8::String::HINT_MANY_WRITES_EXPECTED | + v8::String::NO_NULL_TERMINATION; static void Spin(uv_idle_t* handle, int status) { assert((uv_idle_t*) handle == &tick_spinner); @@ -3042,6 +3044,11 @@ static char **copy_argv(int argc, char **argv) { } int Start(int argc, char *argv[]) { + const char* replaceInvalid = getenv("NODE_INVALID_UTF8"); + + if (replaceInvalid == NULL) + WRITE_UTF8_FLAGS |= String::REPLACE_INVALID_UTF8; + // Hack aroung with the argv pointer. Used for process.title = "blah". argv = uv_setup_args(argc, argv); diff --git a/src/string_bytes.cc b/src/string_bytes.cc index e4a34fee0e..a7bab3895f 100644 --- a/src/string_bytes.cc +++ b/src/string_bytes.cc @@ -199,7 +199,7 @@ size_t StringBytes::Write(char* buf, break; case UTF8: - len = str->WriteUtf8(buf, buflen, chars_written, flags); + len = str->WriteUtf8(buf, buflen, chars_written, WRITE_UTF8_FLAGS); break; case UCS2: diff --git a/src/string_bytes.h b/src/string_bytes.h index 8071a494ae..31f04bbe4b 100644 --- a/src/string_bytes.h +++ b/src/string_bytes.h @@ -29,6 +29,8 @@ namespace node { +extern int WRITE_UTF8_FLAGS; + using v8::Handle; using v8::Local; using v8::String; diff --git a/test/simple/test-buffer.js b/test/simple/test-buffer.js index 3026824f0a..f8b2798676 100644 --- a/test/simple/test-buffer.js +++ b/test/simple/test-buffer.js @@ -791,6 +791,18 @@ assert.equal(buf[3], 0xFF); assert.equal(buf[3], 0xFF); }); +// test unmatched surrogates not producing invalid utf8 output +// ef bf bd = utf-8 representation of unicode replacement character +// see https://codereview.chromium.org/121173009/ +buf = new Buffer('ab\ud800cd', 'utf8'); +assert.equal(buf[0], 0x61); +assert.equal(buf[1], 0x62); +assert.equal(buf[2], 0xef); 
+assert.equal(buf[3], 0xbf); +assert.equal(buf[4], 0xbd); +assert.equal(buf[5], 0x63); +assert.equal(buf[6], 0x64); + // test for buffer overrun buf = new Buffer([0, 0, 0, 0, 0]); // length: 5 var sub = buf.slice(0, 4); // length: 4 |