summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFelix Geisendörfer <felix@debuggable.com>2014-01-20 09:47:19 +0100
committerTimothy J Fontaine <tjfontaine@gmail.com>2014-06-06 15:04:39 -0700
commit066e97867abeeddaee39b0a801fb8a13c0db5c61 (patch)
tree6d7c9b887a8b654ba62f6422ac7c5d7497dd3736
parent11d21f5b17f27576cf4245e6ad05cfc66bccf2f8 (diff)
downloadnode-066e97867abeeddaee39b0a801fb8a13c0db5c61.tar.gz
string_bytes: Guarantee valid utf-8 output
Previously, V8's WriteUtf8 function would produce invalid UTF-8 output when encountering unmatched surrogate code units [1]. The new REPLACE_INVALID_UTF8 option fixes that by replacing invalid code points with the Unicode replacement character (U+FFFD). [1]: JS strings are defined as arrays of 16-bit unsigned integers. There is no Unicode enforcement, so one can easily end up with invalid Unicode code unit sequences inside a string.
-rw-r--r--src/node.cc8
-rw-r--r--src/node_buffer.cc4
-rw-r--r--src/node_buffer.h2
-rw-r--r--src/stream_wrap.cc3
-rw-r--r--test/simple/test-buffer.js12
5 files changed, 25 insertions, 4 deletions
diff --git a/src/node.cc b/src/node.cc
index e9a0d1220..c65ec8bbc 100644
--- a/src/node.cc
+++ b/src/node.cc
@@ -178,6 +178,9 @@ static double prog_start_time;
static int64_t tick_times[RPM_SAMPLES];
static int tick_time_head;
+int WRITE_UTF8_FLAGS = v8::String::HINT_MANY_WRITES_EXPECTED |
+ v8::String::NO_NULL_TERMINATION;
+
static void CheckStatus(uv_timer_t* watcher, int status);
static void StartGCTimer () {
@@ -2931,6 +2934,11 @@ static char **copy_argv(int argc, char **argv) {
}
int Start(int argc, char *argv[]) {
+ const char* replaceInvalid = getenv("NODE_INVALID_UTF8");
+
+ if (replaceInvalid == NULL)
+ node::WRITE_UTF8_FLAGS |= String::REPLACE_INVALID_UTF8;
+
// Hack aroung with the argv pointer. Used for process.title = "blah".
argv = uv_setup_args(argc, argv);
diff --git a/src/node_buffer.cc b/src/node_buffer.cc
index 3f8ebefe0..2eb0654ba 100644
--- a/src/node_buffer.cc
+++ b/src/node_buffer.cc
@@ -493,8 +493,8 @@ Handle<Value> Buffer::Utf8Write(const Arguments &args) {
int written = s->WriteUtf8(p,
max_length,
&char_written,
- (String::HINT_MANY_WRITES_EXPECTED |
- String::NO_NULL_TERMINATION));
+ node::WRITE_UTF8_FLAGS);
+
constructor_template->GetFunction()->Set(chars_written_sym,
Integer::New(char_written));
diff --git a/src/node_buffer.h b/src/node_buffer.h
index 38c1e2d29..04c119f2a 100644
--- a/src/node_buffer.h
+++ b/src/node_buffer.h
@@ -62,6 +62,8 @@ namespace node {
*/
+extern int WRITE_UTF8_FLAGS;
+
class NODE_EXTERN Buffer: public ObjectWrap {
public:
diff --git a/src/stream_wrap.cc b/src/stream_wrap.cc
index e79d212c4..9fae33d44 100644
--- a/src/stream_wrap.cc
+++ b/src/stream_wrap.cc
@@ -356,8 +356,7 @@ Handle<Value> StreamWrap::WriteStringImpl(const Arguments& args) {
break;
case kUtf8:
- data_size = string->WriteUtf8(data, -1, NULL,
- String::NO_NULL_TERMINATION | String::HINT_MANY_WRITES_EXPECTED);
+ data_size = string->WriteUtf8(data, -1, NULL, node::WRITE_UTF8_FLAGS);
break;
case kUcs2: {
diff --git a/test/simple/test-buffer.js b/test/simple/test-buffer.js
index 11165f88c..57302abc9 100644
--- a/test/simple/test-buffer.js
+++ b/test/simple/test-buffer.js
@@ -696,6 +696,18 @@ assert.equal(buf[3], 0xFF);
assert.equal(buf[3], 0xFF);
});
+// test unmatched surrogates not producing invalid utf8 output
+// ef bf bd = utf-8 representation of unicode replacement character
+// see https://codereview.chromium.org/121173009/
+buf = new Buffer('ab\ud800cd', 'utf8');
+assert.equal(buf[0], 0x61);
+assert.equal(buf[1], 0x62);
+assert.equal(buf[2], 0xef);
+assert.equal(buf[3], 0xbf);
+assert.equal(buf[4], 0xbd);
+assert.equal(buf[5], 0x63);
+assert.equal(buf[6], 0x64);
+
// test for buffer overrun
buf = new Buffer([0, 0, 0, 0, 0]); // length: 5
var sub = buf.slice(0, 4); // length: 4