summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Felix Geisendörfer <felix@debuggable.com> 2014-01-20 09:47:19 +0100
committer Timothy J Fontaine <tjfontaine@gmail.com> 2014-06-06 15:07:29 -0700
commit 0da4c671659cfbae12def127b2e94690b9d9b5e1 (patch)
tree 14b81692db27486685b07f69303bff39daa790e6
parent 881ac26f27f4ac9585d66c8d8a67d5b246a23d1b (diff)
download node-new-0da4c671659cfbae12def127b2e94690b9d9b5e1.tar.gz
string_bytes: Guarantee valid utf-8 output
Previously, V8's WriteUtf8 function would produce invalid UTF-8 output when encountering unmatched surrogate code units [1]. The new REPLACE_INVALID_UTF8 option fixes that by replacing invalid code points with the Unicode replacement character.

[1]: JS strings are defined as arrays of 16-bit unsigned integers. There is no Unicode enforcement, so one can easily end up with invalid Unicode code unit sequences inside a string.
-rw-r--r-- src/node.cc 7
-rw-r--r-- src/string_bytes.cc 2
-rw-r--r-- src/string_bytes.h 2
-rw-r--r-- test/simple/test-buffer.js 12
4 files changed, 22 insertions, 1 deletions
diff --git a/src/node.cc b/src/node.cc
index 8257604d52..5cb202fa2e 100644
--- a/src/node.cc
+++ b/src/node.cc
@@ -176,6 +176,8 @@ static uv_async_t dispatch_debug_messages_async;
// Declared in node_internals.h
Isolate* node_isolate = NULL;
+int WRITE_UTF8_FLAGS = v8::String::HINT_MANY_WRITES_EXPECTED |
+ v8::String::NO_NULL_TERMINATION;
static void Spin(uv_idle_t* handle, int status) {
assert((uv_idle_t*) handle == &tick_spinner);
@@ -3042,6 +3044,11 @@ static char **copy_argv(int argc, char **argv) {
}
int Start(int argc, char *argv[]) {
+ const char* replaceInvalid = getenv("NODE_INVALID_UTF8");
+
+ if (replaceInvalid == NULL)
+ WRITE_UTF8_FLAGS |= String::REPLACE_INVALID_UTF8;
+
// Hack aroung with the argv pointer. Used for process.title = "blah".
argv = uv_setup_args(argc, argv);
diff --git a/src/string_bytes.cc b/src/string_bytes.cc
index e4a34fee0e..a7bab3895f 100644
--- a/src/string_bytes.cc
+++ b/src/string_bytes.cc
@@ -199,7 +199,7 @@ size_t StringBytes::Write(char* buf,
break;
case UTF8:
- len = str->WriteUtf8(buf, buflen, chars_written, flags);
+ len = str->WriteUtf8(buf, buflen, chars_written, WRITE_UTF8_FLAGS);
break;
case UCS2:
diff --git a/src/string_bytes.h b/src/string_bytes.h
index 8071a494ae..31f04bbe4b 100644
--- a/src/string_bytes.h
+++ b/src/string_bytes.h
@@ -29,6 +29,8 @@
namespace node {
+extern int WRITE_UTF8_FLAGS;
+
using v8::Handle;
using v8::Local;
using v8::String;
diff --git a/test/simple/test-buffer.js b/test/simple/test-buffer.js
index 3026824f0a..f8b2798676 100644
--- a/test/simple/test-buffer.js
+++ b/test/simple/test-buffer.js
@@ -791,6 +791,18 @@ assert.equal(buf[3], 0xFF);
assert.equal(buf[3], 0xFF);
});
+// test unmatched surrogates not producing invalid utf8 output
+// ef bf bd = utf-8 representation of unicode replacement character
+// see https://codereview.chromium.org/121173009/
+buf = new Buffer('ab\ud800cd', 'utf8');
+assert.equal(buf[0], 0x61);
+assert.equal(buf[1], 0x62);
+assert.equal(buf[2], 0xef);
+assert.equal(buf[3], 0xbf);
+assert.equal(buf[4], 0xbd);
+assert.equal(buf[5], 0x63);
+assert.equal(buf[6], 0x64);
+
// test for buffer overrun
buf = new Buffer([0, 0, 0, 0, 0]); // length: 5
var sub = buf.slice(0, 4); // length: 4