string_bytes: Guarantee valid UTF-8 output

Previously, V8's WriteUtf8 function would produce invalid UTF-8 output when encountering unmatched surrogate code units [1]. The new REPLACE_INVALID_UTF8 option fixes that by replacing invalid code points with the Unicode replacement character (U+FFFD, encoded in UTF-8 as ef bf bd). [1]: JS strings are defined as arrays of 16-bit unsigned integers. There is no Unicode enforcement, so one can easily end up with invalid Unicode code unit sequences inside a string.
author: Felix Geisendörfer <felix@debuggable.com> 2014-01-20 09:47:19 +0100
committer: Timothy J Fontaine <tjfontaine@gmail.com> 2014-06-06 15:07:29 -0700
commit: 0da4c671659cfbae12def127b2e94690b9d9b5e1 (patch)
tree: 14b81692db27486685b07f69303bff39daa790e6
parent: 881ac26f27f4ac9585d66c8d8a67d5b246a23d1b (diff)
download: node-new-0da4c671659cfbae12def127b2e94690b9d9b5e1.tar.gz
4 files changed, 22 insertions, 1 deletions
diff --git a/src/node.cc b/src/node.cc
index 8257604d52..5cb202fa2e 100644
--- a/src/node.cc
+++ b/src/node.cc
@@ -176,6 +176,8 @@ static uv_async_t dispatch_debug_messages_async;
 // Declared in node_internals.h
 Isolate* node_isolate = NULL;
 
+int WRITE_UTF8_FLAGS = v8::String::HINT_MANY_WRITES_EXPECTED |
+                       v8::String::NO_NULL_TERMINATION;
 
 static void Spin(uv_idle_t* handle, int status) {
   assert((uv_idle_t*) handle == &tick_spinner);
@@ -3042,6 +3044,11 @@ static char **copy_argv(int argc, char **argv) {
 }
 
 int Start(int argc, char *argv[]) {
+  const char* replaceInvalid = getenv("NODE_INVALID_UTF8");
+
+  if (replaceInvalid == NULL)
+    WRITE_UTF8_FLAGS |= String::REPLACE_INVALID_UTF8;
+
   // Hack aroung with the argv pointer. Used for process.title = "blah".
   argv = uv_setup_args(argc, argv);
 
diff --git a/src/string_bytes.cc b/src/string_bytes.cc
index e4a34fee0e..a7bab3895f 100644
--- a/src/string_bytes.cc
+++ b/src/string_bytes.cc
@@ -199,7 +199,7 @@ size_t StringBytes::Write(char* buf,
       break;
 
     case UTF8:
-      len = str->WriteUtf8(buf, buflen, chars_written, flags);
+      len = str->WriteUtf8(buf, buflen, chars_written, WRITE_UTF8_FLAGS);
       break;
 
     case UCS2:
diff --git a/src/string_bytes.h b/src/string_bytes.h
index 8071a494ae..31f04bbe4b 100644
--- a/src/string_bytes.h
+++ b/src/string_bytes.h
@@ -29,6 +29,8 @@
 
 namespace node {
 
+extern int WRITE_UTF8_FLAGS;
+
 using v8::Handle;
 using v8::Local;
 using v8::String;
diff --git a/test/simple/test-buffer.js b/test/simple/test-buffer.js
index 3026824f0a..f8b2798676 100644
--- a/test/simple/test-buffer.js
+++ b/test/simple/test-buffer.js
@@ -791,6 +791,18 @@ assert.equal(buf[3], 0xFF);
   assert.equal(buf[3], 0xFF);
 });
 
+// test unmatched surrogates not producing invalid utf8 output
+// ef bf bd = utf-8 representation of unicode replacement character
+// see https://codereview.chromium.org/121173009/
+buf = new Buffer('ab\ud800cd', 'utf8');
+assert.equal(buf[0], 0x61);
+assert.equal(buf[1], 0x62);
+assert.equal(buf[2], 0xef);
+assert.equal(buf[3], 0xbf);
+assert.equal(buf[4], 0xbd);
+assert.equal(buf[5], 0x63);
+assert.equal(buf[6], 0x64);
+
 // test for buffer overrun
 buf = new Buffer([0, 0, 0, 0, 0]); // length: 5
 var sub = buf.slice(0, 4);         // length: 4
author	Felix Geisendörfer <felix@debuggable.com>	2014-01-20 09:47:19 +0100
committer	Timothy J Fontaine <tjfontaine@gmail.com>	2014-06-06 15:07:29 -0700
commit	0da4c671659cfbae12def127b2e94690b9d9b5e1 (patch)
tree	14b81692db27486685b07f69303bff39daa790e6
parent	881ac26f27f4ac9585d66c8d8a67d5b246a23d1b (diff)
download	node-new-0da4c671659cfbae12def127b2e94690b9d9b5e1.tar.gz