diff options
author | Felix Geisendörfer <felix@debuggable.com> | 2014-01-20 09:47:19 +0100 |
---|---|---|
committer | Timothy J Fontaine <tjfontaine@gmail.com> | 2014-06-06 15:04:39 -0700 |
commit | 066e97867abeeddaee39b0a801fb8a13c0db5c61 (patch) | |
tree | 6d7c9b887a8b654ba62f6422ac7c5d7497dd3736 | |
parent | 11d21f5b17f27576cf4245e6ad05cfc66bccf2f8 (diff) | |
download | node-066e97867abeeddaee39b0a801fb8a13c0db5c61.tar.gz |
string_bytes: Guarantee valid utf-8 output
Previously, V8's WriteUtf8 function would produce invalid UTF-8 output
when encountering unmatched surrogate code units [1]. The new
REPLACE_INVALID_UTF8 option fixes that by replacing invalid code points
with the Unicode replacement character (U+FFFD).
[1]: JS strings are defined as arrays of 16-bit unsigned integers. There
is no Unicode enforcement, so one can easily end up with invalid Unicode
code unit sequences inside a string.
-rw-r--r-- | src/node.cc | 8 | ||||
-rw-r--r-- | src/node_buffer.cc | 4 | ||||
-rw-r--r-- | src/node_buffer.h | 2 | ||||
-rw-r--r-- | src/stream_wrap.cc | 3 | ||||
-rw-r--r-- | test/simple/test-buffer.js | 12 |
5 files changed, 25 insertions, 4 deletions
diff --git a/src/node.cc b/src/node.cc index e9a0d1220..c65ec8bbc 100644 --- a/src/node.cc +++ b/src/node.cc @@ -178,6 +178,9 @@ static double prog_start_time; static int64_t tick_times[RPM_SAMPLES]; static int tick_time_head; +int WRITE_UTF8_FLAGS = v8::String::HINT_MANY_WRITES_EXPECTED | + v8::String::NO_NULL_TERMINATION; + static void CheckStatus(uv_timer_t* watcher, int status); static void StartGCTimer () { @@ -2931,6 +2934,11 @@ static char **copy_argv(int argc, char **argv) { } int Start(int argc, char *argv[]) { + const char* replaceInvalid = getenv("NODE_INVALID_UTF8"); + + if (replaceInvalid == NULL) + node::WRITE_UTF8_FLAGS |= String::REPLACE_INVALID_UTF8; + // Hack aroung with the argv pointer. Used for process.title = "blah". argv = uv_setup_args(argc, argv); diff --git a/src/node_buffer.cc b/src/node_buffer.cc index 3f8ebefe0..2eb0654ba 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -493,8 +493,8 @@ Handle<Value> Buffer::Utf8Write(const Arguments &args) { int written = s->WriteUtf8(p, max_length, &char_written, - (String::HINT_MANY_WRITES_EXPECTED | - String::NO_NULL_TERMINATION)); + node::WRITE_UTF8_FLAGS); + constructor_template->GetFunction()->Set(chars_written_sym, Integer::New(char_written)); diff --git a/src/node_buffer.h b/src/node_buffer.h index 38c1e2d29..04c119f2a 100644 --- a/src/node_buffer.h +++ b/src/node_buffer.h @@ -62,6 +62,8 @@ namespace node { */ +extern int WRITE_UTF8_FLAGS; + class NODE_EXTERN Buffer: public ObjectWrap { public: diff --git a/src/stream_wrap.cc b/src/stream_wrap.cc index e79d212c4..9fae33d44 100644 --- a/src/stream_wrap.cc +++ b/src/stream_wrap.cc @@ -356,8 +356,7 @@ Handle<Value> StreamWrap::WriteStringImpl(const Arguments& args) { break; case kUtf8: - data_size = string->WriteUtf8(data, -1, NULL, - String::NO_NULL_TERMINATION | String::HINT_MANY_WRITES_EXPECTED); + data_size = string->WriteUtf8(data, -1, NULL, node::WRITE_UTF8_FLAGS); break; case kUcs2: { diff --git a/test/simple/test-buffer.js 
b/test/simple/test-buffer.js index 11165f88c..57302abc9 100644 --- a/test/simple/test-buffer.js +++ b/test/simple/test-buffer.js @@ -696,6 +696,18 @@ assert.equal(buf[3], 0xFF); assert.equal(buf[3], 0xFF); }); +// test unmatched surrogates not producing invalid utf8 output +// ef bf bd = utf-8 representation of unicode replacement character +// see https://codereview.chromium.org/121173009/ +buf = new Buffer('ab\ud800cd', 'utf8'); +assert.equal(buf[0], 0x61); +assert.equal(buf[1], 0x62); +assert.equal(buf[2], 0xef); +assert.equal(buf[3], 0xbf); +assert.equal(buf[4], 0xbd); +assert.equal(buf[5], 0x63); +assert.equal(buf[6], 0x64); + // test for buffer overrun buf = new Buffer([0, 0, 0, 0, 0]); // length: 5 var sub = buf.slice(0, 4); // length: 4 |