3 files changed, 80 insertions, 68 deletions
diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c
index c140fdb..d09b393 100644
--- a/ext/json/ext/parser/parser.c
+++ b/ext/json/ext/parser/parser.c
@@ -70,7 +70,7 @@ static int convert_UTF32_to_UTF8(char *buf, UTF32 ch)
 #ifdef HAVE_RUBY_ENCODING_H
 static VALUE CEncoding_ASCII_8BIT, CEncoding_UTF_8, CEncoding_UTF_16BE,
     CEncoding_UTF_16LE, CEncoding_UTF_32BE, CEncoding_UTF_32LE;
-static ID i_encoding, i_encode;
+static ID i_encoding, i_encode, i_valid_encoding_p;
 #else
 static ID i_iconv;
 #endif
@@ -1553,8 +1553,7 @@ case 7:
 }
 
 /*
- * Document-class: JSON::Ext::Parser
- *
+ * Document-class: JSON::Ext::Parser *
  * This is the JSON parser implemented as a C extension. It can be configured
  * to be used by setting
  *
@@ -1564,7 +1563,7 @@ case 7:
  *
  */
 
-static VALUE convert_encoding(VALUE source)
+static VALUE convert_encoding(VALUE source, VALUE encoding)
 {
     char *ptr = RSTRING_PTR(source);
     long len = RSTRING_LEN(source);
@@ -1572,34 +1571,31 @@ static VALUE convert_encoding(VALUE source)
         rb_raise(eParserError, "A JSON text must at least contain two octets!");
     }
 #ifdef HAVE_RUBY_ENCODING_H
-    {
-        VALUE encoding = rb_funcall(source, i_encoding, 0);
-        if (encoding == CEncoding_ASCII_8BIT) {
-            if (len >= 4 &&  ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0) {
-                source = rb_funcall(source, i_encode, 2, CEncoding_UTF_8, CEncoding_UTF_32BE);
-            } else if (len >= 4 && ptr[0] == 0 && ptr[2] == 0) {
-                source = rb_funcall(source, i_encode, 2, CEncoding_UTF_8, CEncoding_UTF_16BE);
-            } else if (len >= 4 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0) {
-                source = rb_funcall(source, i_encode, 2, CEncoding_UTF_8, CEncoding_UTF_32LE);
-            } else if (len >= 4 && ptr[1] == 0 && ptr[3] == 0) {
-                source = rb_funcall(source, i_encode, 2, CEncoding_UTF_8, CEncoding_UTF_16LE);
-            } else {
-                source = rb_str_dup(source);
-                FORCE_UTF8(source);
-            }
+    if (encoding == CEncoding_ASCII_8BIT) {
+        if (len >= 4 &&  ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0) {
+            source = rb_funcall(source, i_encode, 2, CEncoding_UTF_8, CEncoding_UTF_32BE);
+        } else if (len >= 4 && ptr[0] == 0 && ptr[2] == 0) {
+            source = rb_funcall(source, i_encode, 2, CEncoding_UTF_8, CEncoding_UTF_16BE);
+        } else if (len >= 4 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0) {
+            source = rb_funcall(source, i_encode, 2, CEncoding_UTF_8, CEncoding_UTF_32LE);
+        } else if (len >= 4 && ptr[1] == 0 && ptr[3] == 0) {
+            source = rb_funcall(source, i_encode, 2, CEncoding_UTF_8, CEncoding_UTF_16LE);
         } else {
-            source = rb_funcall(source, i_encode, 1, CEncoding_UTF_8);
+            source = rb_str_dup(source);
+            FORCE_UTF8(source);
         }
+    } else {
+        source = rb_funcall(source, i_encode, 1, CEncoding_UTF_8);
     }
 #else
     if (len >= 4 &&  ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0) {
-      source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2("utf-32be"), source);
+        source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2("utf-32be"), source);
     } else if (len >= 4 && ptr[0] == 0 && ptr[2] == 0) {
-      source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2("utf-16be"), source);
+        source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2("utf-16be"), source);
     } else if (len >= 4 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0) {
-      source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2("utf-32le"), source);
+        source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2("utf-32le"), source);
     } else if (len >= 4 && ptr[1] == 0 && ptr[3] == 0) {
-      source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2("utf-16le"), source);
+        source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2("utf-16le"), source);
     }
 #endif
     return source;
@@ -1634,6 +1630,7 @@ static VALUE convert_encoding(VALUE source)
 static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self)
 {
     VALUE source, opts;
+    VALUE encoding = Qnil;
     GET_PARSER_INIT;
 
     if (json->Vsource) {
@@ -1717,8 +1714,16 @@ static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self)
         json->array_class = Qnil;
     }
     source = rb_convert_type(source, T_STRING, "String", "to_str");
+    source = StringValue(source);
+#ifdef HAVE_RUBY_ENCODING_H
+    encoding = rb_funcall(source, i_encoding, 0);
+    if (encoding == CEncoding_UTF_8 &&
+        !RTEST(rb_funcall(source, i_valid_encoding_p, 0))) {
+      rb_raise(rb_eArgError, "invalid byte sequence in UTF-8");
+    }
+#endif
     if (!json->quirks_mode) {
-      source = convert_encoding(StringValue(source));
+      source = convert_encoding(source, encoding);
     }
     json->current_nesting = 0;
     StringValue(source);
@@ -1729,7 +1734,7 @@ static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self)
 }
 
 
-#line 1733 "parser.c"
+#line 1738 "parser.c"
 static const int JSON_start = 1;
 static const int JSON_first_final = 10;
 static const int JSON_error = 0;
@@ -1737,7 +1742,7 @@ static const int JSON_error = 0;
 static const int JSON_en_main = 1;
 
 
-#line 740 "parser.rl"
+#line 745 "parser.rl"
 
 
 static VALUE cParser_parse_strict(VALUE self)
@@ -1748,16 +1753,16 @@ static VALUE cParser_parse_strict(VALUE self)
     GET_PARSER;
 
 
-#line 1752 "parser.c"
+#line 1757 "parser.c"
 	{
 	cs = JSON_start;
 	}
 
-#line 750 "parser.rl"
+#line 755 "parser.rl"
     p = json->source;
     pe = p + json->len;
 
-#line 1761 "parser.c"
+#line 1766 "parser.c"
 	{
 	if ( p == pe )
 		goto _test_eof;
@@ -1813,7 +1818,7 @@ case 5:
 		goto st1;
 	goto st5;
 tr3:
-#line 729 "parser.rl"
+#line 734 "parser.rl"
 	{
         char *np;
         json->current_nesting = 1;
@@ -1822,7 +1827,7 @@ tr3:
     }
 	goto st10;
 tr4:
-#line 722 "parser.rl"
+#line 727 "parser.rl"
 	{
         char *np;
         json->current_nesting = 1;
@@ -1834,7 +1839,7 @@ st10:
 	if ( ++p == pe )
 		goto _test_eof10;
 case 10:
-#line 1838 "parser.c"
+#line 1843 "parser.c"
 	switch( (*p) ) {
 		case 13: goto st10;
 		case 32: goto st10;
@@ -1891,7 +1896,7 @@ case 9:
 	_out: {}
 	}
 
-#line 753 "parser.rl"
+#line 758 "parser.rl"
 
     if (cs >= JSON_first_final && p == pe) {
         return result;
@@ -1903,7 +1908,7 @@ case 9:
 
 
 
-#line 1907 "parser.c"
+#line 1912 "parser.c"
 static const int JSON_quirks_mode_start = 1;
 static const int JSON_quirks_mode_first_final = 10;
 static const int JSON_quirks_mode_error = 0;
@@ -1911,7 +1916,7 @@ static const int JSON_quirks_mode_error = 0;
 static const int JSON_quirks_mode_en_main = 1;
 
 
-#line 778 "parser.rl"
+#line 783 "parser.rl"
 
 
 static VALUE cParser_parse_quirks_mode(VALUE self)
@@ -1922,16 +1927,16 @@ static VALUE cParser_parse_quirks_mode(VALUE self)
     GET_PARSER;
 
 
-#line 1926 "parser.c"
+#line 1931 "parser.c"
 	{
 	cs = JSON_quirks_mode_start;
 	}
 
-#line 788 "parser.rl"
+#line 793 "parser.rl"
     p = json->source;
     pe = p + json->len;
 
-#line 1935 "parser.c"
+#line 1940 "parser.c"
 	{
 	if ( p == pe )
 		goto _test_eof;
@@ -1965,7 +1970,7 @@ st0:
 cs = 0;
 	goto _out;
 tr2:
-#line 770 "parser.rl"
+#line 775 "parser.rl"
 	{
         char *np = JSON_parse_value(json, p, pe, &result);
         if (np == NULL) { p--; {p++; cs = 10; goto _out;} } else {p = (( np))-1;}
@@ -1975,7 +1980,7 @@ st10:
 	if ( ++p == pe )
 		goto _test_eof10;
 case 10:
-#line 1979 "parser.c"
+#line 1984 "parser.c"
 	switch( (*p) ) {
 		case 13: goto st10;
 		case 32: goto st10;
@@ -2064,7 +2069,7 @@ case 9:
 	_out: {}
 	}
 
-#line 791 "parser.rl"
+#line 796 "parser.rl"
 
     if (cs >= JSON_quirks_mode_first_final && p == pe) {
         return result;
@@ -2190,6 +2195,7 @@ void Init_parser()
     CEncoding_ASCII_8BIT = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("ascii-8bit"));
     i_encoding = rb_intern("encoding");
     i_encode = rb_intern("encode");
+    i_valid_encoding_p = rb_intern("valid_encoding?");
 #else
     i_iconv = rb_intern("iconv");
 #endif
diff --git a/ext/json/ext/parser/parser.h b/ext/json/ext/parser/parser.h
index b192064..fbe0ef2 100644
--- a/ext/json/ext/parser/parser.h
+++ b/ext/json/ext/parser/parser.h
@@ -65,7 +65,7 @@ static char *JSON_parse_float(JSON_Parser *json, char *p, char *pe, VALUE *resul
 static char *JSON_parse_array(JSON_Parser *json, char *p, char *pe, VALUE *result);
 static VALUE json_string_unescape(VALUE result, char *string, char *stringEnd);
 static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *result);
-static VALUE convert_encoding(VALUE source);
+static VALUE convert_encoding(VALUE source, VALUE encoding);
 static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self);
 static VALUE cParser_parse(VALUE self);
 static JSON_Parser *JSON_allocate();
diff --git a/ext/json/ext/parser/parser.rl b/ext/json/ext/parser/parser.rl
index 20ecc48..393aad6 100644
--- a/ext/json/ext/parser/parser.rl
+++ b/ext/json/ext/parser/parser.rl
@@ -68,7 +68,7 @@ static int convert_UTF32_to_UTF8(char *buf, UTF32 ch)
 #ifdef HAVE_RUBY_ENCODING_H
 static VALUE CEncoding_ASCII_8BIT, CEncoding_UTF_8, CEncoding_UTF_16BE,
     CEncoding_UTF_16LE, CEncoding_UTF_32BE, CEncoding_UTF_32LE;
-static ID i_encoding, i_encode;
+static ID i_encoding, i_encode, i_valid_encoding_p;
 #else
 static ID i_iconv;
 #endif
@@ -537,8 +537,7 @@ static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *resu
 }
 
 /*
- * Document-class: JSON::Ext::Parser
- *
+ * Document-class: JSON::Ext::Parser *
  * This is the JSON parser implemented as a C extension. It can be configured
  * to be used by setting
  *
@@ -548,7 +547,7 @@ static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *resu
  *
  */
 
-static VALUE convert_encoding(VALUE source)
+static VALUE convert_encoding(VALUE source, VALUE encoding)
 {
     char *ptr = RSTRING_PTR(source);
     long len = RSTRING_LEN(source);
@@ -556,34 +555,31 @@ static VALUE convert_encoding(VALUE source)
         rb_raise(eParserError, "A JSON text must at least contain two octets!");
     }
 #ifdef HAVE_RUBY_ENCODING_H
-    {
-        VALUE encoding = rb_funcall(source, i_encoding, 0);
-        if (encoding == CEncoding_ASCII_8BIT) {
-            if (len >= 4 &&  ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0) {
-                source = rb_funcall(source, i_encode, 2, CEncoding_UTF_8, CEncoding_UTF_32BE);
-            } else if (len >= 4 && ptr[0] == 0 && ptr[2] == 0) {
-                source = rb_funcall(source, i_encode, 2, CEncoding_UTF_8, CEncoding_UTF_16BE);
-            } else if (len >= 4 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0) {
-                source = rb_funcall(source, i_encode, 2, CEncoding_UTF_8, CEncoding_UTF_32LE);
-            } else if (len >= 4 && ptr[1] == 0 && ptr[3] == 0) {
-                source = rb_funcall(source, i_encode, 2, CEncoding_UTF_8, CEncoding_UTF_16LE);
-            } else {
-                source = rb_str_dup(source);
-                FORCE_UTF8(source);
-            }
+    if (encoding == CEncoding_ASCII_8BIT) {
+        if (len >= 4 &&  ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0) {
+            source = rb_funcall(source, i_encode, 2, CEncoding_UTF_8, CEncoding_UTF_32BE);
+        } else if (len >= 4 && ptr[0] == 0 && ptr[2] == 0) {
+            source = rb_funcall(source, i_encode, 2, CEncoding_UTF_8, CEncoding_UTF_16BE);
+        } else if (len >= 4 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0) {
+            source = rb_funcall(source, i_encode, 2, CEncoding_UTF_8, CEncoding_UTF_32LE);
+        } else if (len >= 4 && ptr[1] == 0 && ptr[3] == 0) {
+            source = rb_funcall(source, i_encode, 2, CEncoding_UTF_8, CEncoding_UTF_16LE);
         } else {
-            source = rb_funcall(source, i_encode, 1, CEncoding_UTF_8);
+            source = rb_str_dup(source);
+            FORCE_UTF8(source);
         }
+    } else {
+        source = rb_funcall(source, i_encode, 1, CEncoding_UTF_8);
     }
 #else
     if (len >= 4 &&  ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0) {
-      source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2("utf-32be"), source);
+        source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2("utf-32be"), source);
     } else if (len >= 4 && ptr[0] == 0 && ptr[2] == 0) {
-      source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2("utf-16be"), source);
+        source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2("utf-16be"), source);
     } else if (len >= 4 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0) {
-      source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2("utf-32le"), source);
+        source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2("utf-32le"), source);
     } else if (len >= 4 && ptr[1] == 0 && ptr[3] == 0) {
-      source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2("utf-16le"), source);
+        source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2("utf-16le"), source);
     }
 #endif
     return source;
@@ -618,6 +614,7 @@ static VALUE convert_encoding(VALUE source)
 static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self)
 {
     VALUE source, opts;
+    VALUE encoding = Qnil;
     GET_PARSER_INIT;
 
     if (json->Vsource) {
@@ -701,8 +698,16 @@ static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self)
         json->array_class = Qnil;
     }
     source = rb_convert_type(source, T_STRING, "String", "to_str");
+    source = StringValue(source);
+#ifdef HAVE_RUBY_ENCODING_H
+    encoding = rb_funcall(source, i_encoding, 0);
+    if (encoding == CEncoding_UTF_8 &&
+        !RTEST(rb_funcall(source, i_valid_encoding_p, 0))) {
+      rb_raise(rb_eArgError, "invalid byte sequence in UTF-8");
+    }
+#endif
     if (!json->quirks_mode) {
-      source = convert_encoding(StringValue(source));
+      source = convert_encoding(source, encoding);
     }
     json->current_nesting = 0;
     StringValue(source);
@@ -913,6 +918,7 @@ void Init_parser()
     CEncoding_ASCII_8BIT = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("ascii-8bit"));
     i_encoding = rb_intern("encoding");
     i_encode = rb_intern("encode");
+    i_valid_encoding_p = rb_intern("valid_encoding?");
 #else
     i_iconv = rb_intern("iconv");
 #endif