diff options
author | Florian Frank <flori@ping.de> | 2009-10-20 13:04:24 +0200 |
---|---|---|
committer | Florian Frank <flori@ping.de> | 2009-10-26 22:56:27 +0100 |
commit | 41ae3d70d6dd141759eb6f3fddf460b327a90796 (patch) | |
tree | 54431a12877a97a50f8fb077a03b6e1cdecb9f12 | |
parent | 3a13313e9d231e7e3f99101812825dbe3d01d13a (diff) | |
download | json-41ae3d70d6dd141759eb6f3fddf460b327a90796.tar.gz |
some optimisations and cleanup
-rw-r--r-- | ext/json/ext/generator/extconf.rb | 7 | ||||
-rw-r--r-- | ext/json/ext/generator/generator.c | 24 | ||||
-rw-r--r-- | ext/json/ext/generator/unicode.c | 104 | ||||
-rw-r--r-- | ext/json/ext/generator/unicode.h | 14 | ||||
-rw-r--r-- | ext/json/ext/parser/extconf.rb | 7 | ||||
-rw-r--r-- | ext/json/ext/parser/parser.c | 41 | ||||
-rw-r--r-- | ext/json/ext/parser/parser.rl | 41 | ||||
-rw-r--r-- | ext/json/ext/parser/unicode.c | 88 | ||||
-rw-r--r-- | ext/json/ext/parser/unicode.h | 8 |
9 files changed, 161 insertions, 173 deletions
diff --git a/ext/json/ext/generator/extconf.rb b/ext/json/ext/generator/extconf.rb index 797b566..a0760e2 100644 --- a/ext/json/ext/generator/extconf.rb +++ b/ext/json/ext/generator/extconf.rb @@ -1,9 +1,12 @@ require 'mkmf' require 'rbconfig' +unless $CFLAGS.gsub!(/ -O[\dsz]?/, ' -O3') + $CFLAGS << ' -O3' +end if CONFIG['CC'] =~ /gcc/ - $CFLAGS += ' -Wall' - #$CFLAGS += ' -O0 -ggdb' + $CFLAGS << ' -Wall' + #$CFLAGS.gsub!(/ -O[\dsz]?/, ' -O0 -ggdb') end have_header("ruby/st.h") || have_header("st.h") diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 558f28d..a40cbb4 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -24,7 +24,7 @@ #ifdef HAVE_RUBY_ENCODING_H #include "ruby/encoding.h" #define FORCE_UTF8(obj) rb_enc_associate((obj), rb_utf8_encoding()) -static VALUE mEncoding_UTF_8; +static VALUE CEncoding_UTF_8; static ID i_encoding, i_encode; #else #define FORCE_UTF8(obj) @@ -154,7 +154,7 @@ static int hash_to_json_i(VALUE key, VALUE value, VALUE buf) /* * call-seq: to_json(state = nil, depth = 0) * - * Returns a JSON string containing a JSON object, that is unparsed from + * Returns a JSON string containing a JSON object, that is generated from * this Hash instance. * _state_ is a JSON::State object, that can also be used to configure the * produced JSON string output further. @@ -260,7 +260,7 @@ inline static VALUE mArray_json_transfrom(VALUE self, VALUE Vstate, VALUE Vdepth /* * call-seq: to_json(state = nil, depth = 0) * - * Returns a JSON string containing a JSON array, that is unparsed from + * Returns a JSON string containing a JSON array, that is generated from * this Array instance. * _state_ is a JSON::State object, that can also be used to configure the * produced JSON string output further. @@ -360,14 +360,14 @@ static VALUE mString_to_json(int argc, VALUE *argv, VALUE self) VALUE result = rb_str_buf_new(RSTRING_LEN(self)); rb_str_buf_cat2(result, "\""); #ifdef HAVE_RUBY_ENCODING_H - if (rb_funcall(self, i_encoding, 0) == mEncoding_UTF_8) { - JSON_convert_UTF8_to_JSON(result, self, strictConversion); + if (rb_funcall(self, i_encoding, 0) == CEncoding_UTF_8) { + JSON_convert_UTF8_to_JSON(result, self); } else { - VALUE string = rb_funcall(self, i_encode, 1, mEncoding_UTF_8); - JSON_convert_UTF8_to_JSON(result, string, strictConversion); + VALUE string = rb_funcall(self, i_encode, 1, CEncoding_UTF_8); + JSON_convert_UTF8_to_JSON(result, string); } #else - JSON_convert_UTF8_to_JSON(result, self, strictConversion); + JSON_convert_UTF8_to_JSON(result, self); #endif rb_str_buf_cat2(result, "\""); FORCE_UTF8(result); @@ -378,7 +378,7 @@ static VALUE mString_to_json(int argc, VALUE *argv, VALUE self) * call-seq: to_json_raw_object() * * This method creates a raw object hash, that can be nested into - * other data structures and will be unparsed as a raw string. This + * other data structures and will be generated as a raw string. This * method should be used, if you want to convert raw strings to JSON * instead of UTF-8 strings, e. g. binary data. */ @@ -856,17 +856,19 @@ static VALUE cState_forget(VALUE self, VALUE object) void Init_generator() { rb_require("json/common"); + mJSON = rb_define_module("JSON"); mExt = rb_define_module_under(mJSON, "Ext"); mGenerator = rb_define_module_under(mExt, "Generator"); + eGeneratorError = rb_path2class("JSON::GeneratorError"); eCircularDatastructure = rb_path2class("JSON::CircularDatastructure"); eNestingError = rb_path2class("JSON::NestingError"); + cState = rb_define_class_under(mGenerator, "State", rb_cObject); rb_define_alloc_func(cState, cState_s_allocate); rb_define_singleton_method(cState, "from_state", cState_from_state_s, 1); rb_define_method(cState, "initialize", cState_initialize, -1); - rb_define_method(cState, "indent", cState_indent, 0); rb_define_method(cState, "indent=", cState_indent_set, 1); rb_define_method(cState, "space", cState_space, 0); @@ -928,7 +930,7 @@ void Init_generator() i_create_id = rb_intern("create_id"); i_extend = rb_intern("extend"); #ifdef HAVE_RUBY_ENCODING_H - mEncoding_UTF_8 = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-8")); + CEncoding_UTF_8 = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-8")); i_encoding = rb_intern("encoding"); i_encode = rb_intern("encode"); #endif diff --git a/ext/json/ext/generator/unicode.c b/ext/json/ext/generator/unicode.c index 3ddfbe0..1d0b675 100644 --- a/ext/json/ext/generator/unicode.c +++ b/ext/json/ext/generator/unicode.c @@ -1,9 +1,5 @@ #include "unicode.h" -#define unicode_escape(buffer, character) \ - snprintf(buf, 7, "\\u%04x", (unsigned int) (character)); \ - rb_str_buf_cat(buffer, buf, 6); - /* * Copyright 2001-2004 Unicode, Inc. * @@ -53,15 +49,6 @@ static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; /* - * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed - * into the first byte, depending on how many bytes follow. There are - * as many entries in this table as there are UTF-8 sequence types. - * (I.e., one byte sequence, two byte... etc.). Remember that sequencs - * for *legal* UTF-8 will be 4 or fewer bytes total. - */ -static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; - -/* * Utility routine to tell whether a sequence of bytes is legal UTF-8. * This must be called with the length pre-determined by the first byte. * If not calling this from ConvertUTF8to*, then the length can be set by: @@ -98,9 +85,21 @@ inline static unsigned char isLegalUTF8(const UTF8 *source, int length) return 1; } -void JSON_convert_UTF8_to_JSON(VALUE buffer, VALUE string, ConversionFlags flags) +inline static void unicode_escape(VALUE buffer, UTF16 character) +{ + const char *digits = "0123456789abcdef"; + char buf[7] = { '\\', 'u' }; + + buf[6] = 0; + buf[2] = digits[character >> 12]; + buf[3] = digits[(character >> 8) & 0xf]; + buf[4] = digits[(character >> 4) & 0xf]; + buf[5] = digits[character & 0xf]; + rb_str_buf_cat(buffer, buf, 6); +} + +inline void JSON_convert_UTF8_to_JSON(VALUE buffer, VALUE string) { - char buf[7]; const UTF8* source = (UTF8 *) RSTRING_PTR(string); const UTF8* sourceEnd = source + RSTRING_LEN(string); @@ -131,45 +130,54 @@ void JSON_convert_UTF8_to_JSON(VALUE buffer, VALUE string, ConversionFlags flags if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ /* UTF-16 surrogate values are illegal in UTF-32 */ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - source -= (extraBytesToRead+1); /* return to the illegal value itself */ - rb_raise(rb_path2class("JSON::GeneratorError"), +#if UNI_STRICT_CONVERSION + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8"); - } else { - unicode_escape(buffer, UNI_REPLACEMENT_CHAR); - } +#else + unicode_escape(buffer, UNI_REPLACEMENT_CHAR); +#endif } else { /* normal case */ - if (ch == '"') { - rb_str_buf_cat2(buffer, "\\\""); - } else if (ch == '\\') { - rb_str_buf_cat2(buffer, "\\\\"); - } else if (ch >= 0x20 && ch <= 0x7f) { - rb_str_buf_cat(buffer, (char *) source - 1, 1); - } else if (ch == '\n') { - rb_str_buf_cat2(buffer, "\\n"); - } else if (ch == '\r') { - rb_str_buf_cat2(buffer, "\\r"); - } else if (ch == '\t') { - rb_str_buf_cat2(buffer, "\\t"); - } else if (ch == '\f') { - rb_str_buf_cat2(buffer, "\\f"); - } else if (ch == '\b') { - rb_str_buf_cat2(buffer, "\\b"); - } else if (ch < 0x20) { - unicode_escape(buffer, (UTF16) ch); - } else { - unicode_escape(buffer, (UTF16) ch); + switch(ch) { + case '\n': + rb_str_buf_cat2(buffer, "\\n"); + break; + case '\r': + rb_str_buf_cat2(buffer, "\\r"); + break; + case '\\': + rb_str_buf_cat2(buffer, "\\\\"); + break; + case '"': + rb_str_buf_cat2(buffer, "\\\""); + break; + case '\t': + rb_str_buf_cat2(buffer, "\\t"); + break; + case '\f': + rb_str_buf_cat2(buffer, "\\f"); + break; + case '\b': + rb_str_buf_cat2(buffer, "\\b"); + break; + default: + if (ch >= 0x20 && ch <= 0x7f) { + rb_str_buf_cat(buffer, (char *) source - 1, 1); + } else { + unicode_escape(buffer, (UTF16) ch); + } + break; } } } else if (ch > UNI_MAX_UTF16) { - if (flags == strictConversion) { - source -= (extraBytesToRead+1); /* return to the start */ - rb_raise(rb_path2class("JSON::GeneratorError"), - "source sequence is illegal/malformed utf8"); - } else { - unicode_escape(buffer, UNI_REPLACEMENT_CHAR); - } +#if UNI_STRICT_CONVERSION + source -= (extraBytesToRead+1); /* return to the start */ + rb_raise(rb_path2class("JSON::GeneratorError"), + "source sequence is illegal/malformed utf8"); +#else + unicode_escape(buffer, UNI_REPLACEMENT_CHAR); +#endif } else { /* target is a character in range 0xFFFF - 0x10FFFF. */ ch -= halfBase; diff --git a/ext/json/ext/generator/unicode.h b/ext/json/ext/generator/unicode.h index 841474b..77b29a4 100644 --- a/ext/json/ext/generator/unicode.h +++ b/ext/json/ext/generator/unicode.h @@ -3,17 +3,7 @@ #ifndef _GENERATOR_UNICODE_H_ #define _GENERATOR_UNICODE_H_ -typedef enum { - conversionOK = 0, /* conversion successful */ - sourceExhausted, /* partial character in source, but hit end */ - targetExhausted, /* insuff. room in target for conversion */ - sourceIllegal /* source sequence is illegal/malformed */ -} ConversionResult; - -typedef enum { - strictConversion = 0, - lenientConversion -} ConversionFlags; +#define UNI_STRICT_CONVERSION 1 typedef unsigned long UTF32; /* at least 32 bits */ typedef unsigned short UTF16; /* at least 16 bits */ @@ -35,7 +25,7 @@ static const int halfShift = 10; /* used for shifting by 10 bits */ static const UTF32 halfBase = 0x0010000UL; static const UTF32 halfMask = 0x3FFUL; -void JSON_convert_UTF8_to_JSON(VALUE buffer, VALUE string, ConversionFlags flags); +inline void JSON_convert_UTF8_to_JSON(VALUE buffer, VALUE string); #ifndef RARRAY_PTR #define RARRAY_PTR(ARRAY) RARRAY(ARRAY)->ptr diff --git a/ext/json/ext/parser/extconf.rb b/ext/json/ext/parser/extconf.rb index 6226394..9662e9a 100644 --- a/ext/json/ext/parser/extconf.rb +++ b/ext/json/ext/parser/extconf.rb @@ -1,9 +1,12 @@ require 'mkmf' require 'rbconfig' +unless $CFLAGS.gsub!(/ -O[\dsz]?/, ' -O3') + $CFLAGS << ' -O3' +end if CONFIG['CC'] =~ /gcc/ - $CFLAGS += ' -Wall' - #$CFLAGS += ' -O0 -ggdb' + $CFLAGS << ' -Wall' + #$CFLAGS.gsub!(/ -O[\dsz]?/, ' -O0 -ggdb') end have_header("ruby/st.h") || have_header("st.h") diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index 1781381..d2dfe52 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -1262,7 +1262,6 @@ static VALUE json_string_unescape(char *p, char *pe) while (p < pe) { if (*p == '\\') { p++; - if (p >= pe) return Qnil; /* raise an exception later, \ at end */ switch (*p) { case '"': case '\\': @@ -1293,7 +1292,7 @@ static VALUE json_string_unescape(char *p, char *pe) if (p > pe - 4) { return Qnil; } else { - p = JSON_convert_UTF16_to_UTF8(result, p, pe, strictConversion); + p = JSON_convert_UTF16_to_UTF8(result, p, pe); } break; default: @@ -1312,7 +1311,7 @@ static VALUE json_string_unescape(char *p, char *pe) } -#line 1316 "parser.c" +#line 1315 "parser.c" static const int JSON_string_start = 1; static const int JSON_string_first_final = 8; static const int JSON_string_error = 0; @@ -1320,7 +1319,7 @@ static const int JSON_string_error = 0; static const int JSON_string_en_main = 1; -#line 433 "parser.rl" +#line 432 "parser.rl" static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *result) @@ -1329,15 +1328,15 @@ static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *resu *result = rb_str_new("", 0); -#line 1333 "parser.c" +#line 1332 "parser.c" { cs = JSON_string_start; } -#line 441 "parser.rl" +#line 440 "parser.rl" json->memo = p; -#line 1341 "parser.c" +#line 1340 "parser.c" { if ( p == pe ) goto _test_eof; @@ -1362,7 +1361,7 @@ case 2: goto st0; goto st2; tr2: -#line 419 "parser.rl" +#line 418 "parser.rl" { *result = json_string_unescape(json->memo + 1, p); if (NIL_P(*result)) { @@ -1373,14 +1372,14 @@ tr2: {p = (( p + 1))-1;} } } -#line 430 "parser.rl" +#line 429 "parser.rl" { p--; {p++; cs = 8; goto _out;} } goto st8; st8: if ( ++p == pe ) goto _test_eof8; case 8: -#line 1384 "parser.c" +#line 1383 "parser.c" goto st0; st3: if ( ++p == pe ) @@ -1456,7 +1455,7 @@ case 7: _out: {} } -#line 443 "parser.rl" +#line 442 "parser.rl" if (cs >= JSON_string_first_final) { return p + 1; @@ -1467,7 +1466,7 @@ case 7: -#line 1471 "parser.c" +#line 1470 "parser.c" static const int JSON_start = 1; static const int JSON_first_final = 10; static const int JSON_error = 0; @@ -1475,7 +1474,7 @@ static const int JSON_error = 0; static const int JSON_en_main = 1; -#line 477 "parser.rl" +#line 476 "parser.rl" /* @@ -1518,7 +1517,7 @@ inline static VALUE convert_encoding(VALUE source) rb_funcall(source, i_force_encoding, 1, mEncoding_UTF_16LE); source = rb_funcall(source, i_encode_bang, 1, mEncoding_UTF_8); } else { - source = rb_funcall(source, i_force_encoding, 1, mEncoding_UTF_8); + FORCE_UTF8(source); } } else { source = rb_funcall(source, i_encode, 1, mEncoding_UTF_8); @@ -1647,16 +1646,16 @@ static VALUE cParser_parse(VALUE self) GET_STRUCT; -#line 1651 "parser.c" +#line 1650 "parser.c" { cs = JSON_start; } -#line 648 "parser.rl" +#line 647 "parser.rl" p = json->source; pe = p + json->len; -#line 1660 "parser.c" +#line 1659 "parser.c" { if ( p == pe ) goto _test_eof; @@ -1712,7 +1711,7 @@ case 5: goto st1; goto st5; tr3: -#line 466 "parser.rl" +#line 465 "parser.rl" { char *np; json->current_nesting = 1; @@ -1721,7 +1720,7 @@ tr3: } goto st10; tr4: -#line 459 "parser.rl" +#line 458 "parser.rl" { char *np; json->current_nesting = 1; @@ -1733,7 +1732,7 @@ st10: if ( ++p == pe ) goto _test_eof10; case 10: -#line 1737 "parser.c" +#line 1736 "parser.c" switch( (*p) ) { case 13: goto st10; case 32: goto st10; @@ -1790,7 +1789,7 @@ case 9: _out: {} } -#line 651 "parser.rl" +#line 650 "parser.rl" if (cs >= JSON_first_final && p == pe) { return result; diff --git a/ext/json/ext/parser/parser.rl b/ext/json/ext/parser/parser.rl index 7de7bb1..8eca179 100644 --- a/ext/json/ext/parser/parser.rl +++ b/ext/json/ext/parser/parser.rl @@ -19,8 +19,8 @@ #ifdef HAVE_RUBY_ENCODING_H #include "ruby/encoding.h" #define FORCE_UTF8(obj) rb_enc_associate((obj), rb_utf8_encoding()) -static VALUE mEncoding_ASCII_8BIT, mEncoding_UTF_8, mEncoding_UTF_16BE, - mEncoding_UTF_16LE, mEncoding_UTF_32BE, mEncoding_UTF_32LE; +static VALUE CEncoding_ASCII_8BIT, CEncoding_UTF_8, CEncoding_UTF_16BE, + CEncoding_UTF_16LE, CEncoding_UTF_32BE, CEncoding_UTF_32LE; static ID i_encoding, i_encode, i_encode_bang, i_force_encoding; #else #define FORCE_UTF8(obj) @@ -361,7 +361,6 @@ static VALUE json_string_unescape(char *p, char *pe) while (p < pe) { if (*p == '\\') { p++; - if (p >= pe) return Qnil; /* raise an exception later, \ at end */ switch (*p) { case '"': case '\\': @@ -392,7 +391,7 @@ static VALUE json_string_unescape(char *p, char *pe) if (p > pe - 4) { return Qnil; } else { - p = JSON_convert_UTF16_to_UTF8(result, p, pe, strictConversion); + p = JSON_convert_UTF16_to_UTF8(result, p, pe); } break; default: @@ -498,28 +497,28 @@ inline static VALUE convert_encoding(VALUE source) #ifdef HAVE_RUBY_ENCODING_H { VALUE encoding = rb_funcall(source, i_encoding, 0); - if (encoding == mEncoding_ASCII_8BIT) { + if (encoding == CEncoding_ASCII_8BIT) { if (len >= 4 && ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0) { source = rb_str_dup(source); - rb_funcall(source, i_force_encoding, 1, mEncoding_UTF_32BE); - source = rb_funcall(source, i_encode_bang, 1, mEncoding_UTF_8); + rb_funcall(source, i_force_encoding, 1, CEncoding_UTF_32BE); + source = rb_funcall(source, i_encode_bang, 1, CEncoding_UTF_8); } else if (len >= 4 && ptr[0] == 0 && ptr[2] == 0) { source = rb_str_dup(source); - rb_funcall(source, i_force_encoding, 1, mEncoding_UTF_16BE); - source = rb_funcall(source, i_encode_bang, 1, mEncoding_UTF_8); + rb_funcall(source, i_force_encoding, 1, CEncoding_UTF_16BE); + source = rb_funcall(source, i_encode_bang, 1, CEncoding_UTF_8); } else if (len >= 4 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0) { source = rb_str_dup(source); - rb_funcall(source, i_force_encoding, 1, mEncoding_UTF_32LE); - source = rb_funcall(source, i_encode_bang, 1, mEncoding_UTF_8); + rb_funcall(source, i_force_encoding, 1, CEncoding_UTF_32LE); + source = rb_funcall(source, i_encode_bang, 1, CEncoding_UTF_8); } else if (len >= 4 && ptr[1] == 0 && ptr[3] == 0) { source = rb_str_dup(source); - rb_funcall(source, i_force_encoding, 1, mEncoding_UTF_16LE); - source = rb_funcall(source, i_encode_bang, 1, mEncoding_UTF_8); + rb_funcall(source, i_force_encoding, 1, CEncoding_UTF_16LE); + source = rb_funcall(source, i_encode_bang, 1, CEncoding_UTF_8); } else { - source = rb_funcall(source, i_force_encoding, 1, mEncoding_UTF_8); + FORCE_UTF8(source); } } else { - source = rb_funcall(source, i_encode, 1, mEncoding_UTF_8); + source = rb_funcall(source, i_encode, 1, CEncoding_UTF_8); } } #else @@ -721,12 +720,12 @@ void Init_parser() i_object_class = rb_intern("object_class"); i_array_class = rb_intern("array_class"); #ifdef HAVE_RUBY_ENCODING_H - mEncoding_UTF_8 = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-8")); - mEncoding_UTF_16BE = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-16be")); - mEncoding_UTF_16LE = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-16le")); - mEncoding_UTF_32BE = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-32be")); - mEncoding_UTF_32LE = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-32le")); - mEncoding_ASCII_8BIT = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("ascii-8bit")); + CEncoding_UTF_8 = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-8")); + CEncoding_UTF_16BE = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-16be")); + CEncoding_UTF_16LE = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-16le")); + CEncoding_UTF_32BE = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-32be")); + CEncoding_UTF_32LE = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-32le")); + CEncoding_ASCII_8BIT = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("ascii-8bit")); i_encoding = rb_intern("encoding"); i_encode = rb_intern("encode"); i_encode_bang = rb_intern("encode!"); diff --git a/ext/json/ext/parser/unicode.c b/ext/json/ext/parser/unicode.c index 6bd29e2..711aac5 100644 --- a/ext/json/ext/parser/unicode.c +++ b/ext/json/ext/parser/unicode.c @@ -23,32 +23,6 @@ */ /* - * Index into the table below with the first byte of a UTF-8 sequence to - * get the number of trailing bytes that are supposed to follow it. - * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is - * left as-is for anyone who may want to do such conversion, which was - * allowed in earlier algorithms. - */ -static const char trailingBytesForUTF8[256] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 -}; - -/* - * Magic values subtracted from a buffer value during UTF8 conversion. - * This table contains as many values as there might be trailing bytes - * in a UTF-8 sequence. - */ -static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, - 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; - -/* * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed * into the first byte, depending on how many bytes follow. There are * as many entries in this table as there are UTF-8 sequence types. @@ -57,34 +31,61 @@ static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080 */ static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; +static const char digit_values[256] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, + -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1 +}; + char *JSON_convert_UTF16_to_UTF8 ( VALUE buffer, char *source, - char *sourceEnd, - ConversionFlags flags) + char *sourceEnd) { UTF16 *tmp, *tmpPtr, *tmpEnd; char buf[5]; - long n = 0, i; - char *p = source - 1; + long n = 0; + char failed = 1, c, *p = source - 1; while (p < sourceEnd && p[0] == '\\' && p[1] == 'u') { p += 6; n++; } p = source + 1; - buf[4] = 0; tmpPtr = tmp = ALLOC_N(UTF16, n); tmpEnd = tmp + n; - for (i = 0; i < n; i++) { - buf[0] = *p++; - buf[1] = *p++; - buf[2] = *p++; - buf[3] = *p++; - tmpPtr[i] = (UTF16)strtol(buf, NULL, 16); + while (tmpPtr < tmpEnd) { + c = digit_values[(unsigned char) *p++]; + failed *= c; + *tmpPtr = c << 12; + c = digit_values[(unsigned char) *p++]; + failed *= c; + *tmpPtr |= c << 8; + c = digit_values[(unsigned char) *p++]; + failed *= c; + *tmpPtr |= c << 4; + c = digit_values[(unsigned char) *p++]; + failed *= c; + *tmpPtr++ |= c; p += 2; } + if (failed < 0) { + rb_raise(rb_path2class("JSON::ParserError"), + "illegal \\uXXXX unicode value near %s", source); + } + tmpPtr = tmp; while (tmpPtr < tmpEnd) { UTF32 ch; unsigned short bytesToWrite = 0; @@ -102,10 +103,6 @@ char *JSON_convert_UTF16_to_UTF8 ( ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) + halfBase; ++tmpPtr; - } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ - ruby_xfree(tmp); - rb_raise(rb_path2class("JSON::ParserError"), - "\\uXXXX is illegal/malformed utf-16 near %s", source); } } else { /* We don't have the 16 bits following the high surrogate. */ ruby_xfree(tmp); @@ -113,13 +110,6 @@ char *JSON_convert_UTF16_to_UTF8 ( "partial character in source, but hit end near %s", source); break; } - } else if (flags == strictConversion) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { - ruby_xfree(tmp); - rb_raise(rb_path2class("JSON::ParserError"), - "\\uXXXX is illegal/malformed utf-16 near %s", source); - } } /* Figure out how many bytes the result will require */ if (ch < (UTF32) 0x80) { @@ -149,6 +139,6 @@ char *JSON_convert_UTF16_to_UTF8 ( rb_str_buf_cat(buffer, p, bytesToWrite); } ruby_xfree(tmp); - source += 5 + (n - 1) * 6; + source += 6 * n - 1; return source; } diff --git a/ext/json/ext/parser/unicode.h b/ext/json/ext/parser/unicode.h index 155da0c..40de426 100644 --- a/ext/json/ext/parser/unicode.h +++ b/ext/json/ext/parser/unicode.h @@ -31,16 +31,10 @@ typedef enum { sourceIllegal /* source sequence is illegal/malformed */ } ConversionResult; -typedef enum { - strictConversion = 0, - lenientConversion -} ConversionFlags; - char *JSON_convert_UTF16_to_UTF8 ( VALUE buffer, char *source, - char *sourceEnd, - ConversionFlags flags); + char *sourceEnd); #ifndef RARRAY_PTR #define RARRAY_PTR(ARRAY) RARRAY(ARRAY)->ptr |