diff options
author | Florian Frank <flori@ping.de> | 2009-10-26 15:58:00 +0100 |
---|---|---|
committer | Florian Frank <flori@ping.de> | 2009-10-26 22:58:08 +0100 |
commit | f1504ee153790f12ee43bb4ef2551fa76970f519 (patch) | |
tree | a1b7d9cff62d9b4c7d3e9947066488669ae96db9 | |
parent | dd06e48aa414674f52e81f9cdc7836b6456c04f8 (diff) | |
download | json-f1504ee153790f12ee43bb4ef2551fa76970f519.tar.gz |
improved parser a bit
-rw-r--r-- | ext/json/ext/generator/unicode.c | 3 | ||||
-rw-r--r-- | ext/json/ext/parser/parser.c | 123 | ||||
-rw-r--r-- | ext/json/ext/parser/parser.rl | 87 | ||||
-rw-r--r-- | ext/json/ext/parser/unicode.c | 161 | ||||
-rw-r--r-- | ext/json/ext/parser/unicode.h | 36 |
5 files changed, 163 insertions, 247 deletions
diff --git a/ext/json/ext/generator/unicode.c b/ext/json/ext/generator/unicode.c index 53a2ec1..e470eea 100644 --- a/ext/json/ext/generator/unicode.c +++ b/ext/json/ext/generator/unicode.c @@ -200,8 +200,7 @@ inline void JSON_convert_UTF8_to_JSON(FBuffer *buffer, VALUE string) int len = RSTRING_LEN(string), start = 0, end = 0; const char *escape = NULL; int escape_len; - char buf[7] = { '\\', 'u' }; - buf[6] = 0; + char buf[6] = { '\\', 'u' }; for (start = 0, end = 0; end < len;) { p = ptr + end; diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index 78b4ff9..f67ea0c 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -1255,63 +1255,78 @@ case 16: } } -static VALUE json_string_unescape(char *p, char *pe) +inline static VALUE json_string_unescape(VALUE result, char *string, char *stringEnd) { - VALUE result = rb_str_buf_new(pe - p + 1); - - while (p < pe) { - if (*p == '\\') { - p++; - switch (*p) { + char *p = string, *pe = string, *unescape; + int unescape_len; + + while (pe < stringEnd) { + if (*pe == '\\') { + unescape = "?"; + unescape_len = 1; + if (pe > p) rb_str_buf_cat(result, p, pe - p); + switch (*++pe) { + case 'n': + unescape = "\n"; + break; + case 'r': + unescape = "\r"; + break; + case 't': + unescape = "\t"; + break; case '"': + unescape = "\""; + break; case '\\': - rb_str_buf_cat(result, p, 1); - p++; + unescape = "\\"; break; case 'b': - rb_str_buf_cat2(result, "\b"); - p++; + unescape = "\b"; break; case 'f': - rb_str_buf_cat2(result, "\f"); - p++; - break; - case 'n': - rb_str_buf_cat2(result, "\n"); - p++; - break; - case 'r': - rb_str_buf_cat2(result, "\r"); - p++; - break; - case 't': - rb_str_buf_cat2(result, "\t"); - p++; + unescape = "\f"; break; case 'u': - if (p > pe - 4) { + if (pe > stringEnd - 4) { return Qnil; } else { - p = JSON_convert_UTF16_to_UTF8(result, p, pe); + char buf[4]; + UTF32 ch = unescape_unicode((unsigned char *) ++pe); + pe += 3; + if (UNI_SUR_HIGH_START == (ch & 0xFC00)) { + pe++; + if (pe > stringEnd - 6) return Qnil; + if (pe[0] == '\\' && pe[1] == 'u') { + UTF32 sur = unescape_unicode((unsigned char *) pe + 2); + ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) + | (sur & 0x3FF)); + pe += 5; + } else { + unescape = "?"; + break; + } + } + unescape_len = convert_UTF32_to_UTF8(buf, ch); + unescape = buf; } break; default: - rb_str_buf_cat(result, p, 1); - p++; - break; + p = pe; + continue; } + rb_str_buf_cat(result, unescape, unescape_len); + p = ++pe; } else { - char *q = p; - while (*q != '\\' && q < pe) q++; - rb_str_buf_cat(result, p, q - p); - p = q; + pe++; } } + rb_str_buf_cat(result, p, pe - p); return result; } -#line 1315 "parser.c" +#line 1330 "parser.c" static const int JSON_string_start = 1; static const int JSON_string_first_final = 8; static const int JSON_string_error = 0; @@ -1319,24 +1334,24 @@ static const int JSON_string_error = 0; static const int JSON_string_en_main = 1; -#line 432 "parser.rl" +#line 447 "parser.rl" static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *result) { int cs = EVIL; - *result = rb_str_new("", 0); + *result = rb_str_buf_new(0); -#line 1332 "parser.c" +#line 1347 "parser.c" { cs = JSON_string_start; } -#line 440 "parser.rl" +#line 455 "parser.rl" json->memo = p; -#line 1340 "parser.c" +#line 1355 "parser.c" { if ( p == pe ) goto _test_eof; @@ -1361,9 +1376,9 @@ case 2: goto st0; goto st2; tr2: -#line 418 "parser.rl" +#line 433 "parser.rl" { - *result = json_string_unescape(json->memo + 1, p); + *result = json_string_unescape(*result, json->memo + 1, p); if (NIL_P(*result)) { p--; {p++; cs = 8; goto _out;} @@ -1372,14 +1387,14 @@ tr2: {p = (( p + 1))-1;} } } -#line 429 "parser.rl" +#line 444 "parser.rl" { p--; {p++; cs = 8; goto _out;} } goto st8; st8: if ( ++p == pe ) goto _test_eof8; case 8: -#line 1383 "parser.c" +#line 1398 "parser.c" goto st0; st3: if ( ++p == pe ) @@ -1455,7 +1470,7 @@ case 7: _out: {} } -#line 442 "parser.rl" +#line 457 "parser.rl" if (cs >= JSON_string_first_final) { return p + 1; @@ -1466,7 +1481,7 @@ case 7: -#line 1470 "parser.c" +#line 1485 "parser.c" static const int JSON_start = 1; static const int JSON_first_final = 10; static const int JSON_error = 0; @@ -1474,7 +1489,7 @@ static const int JSON_error = 0; static const int JSON_en_main = 1; -#line 476 "parser.rl" +#line 491 "parser.rl" /* @@ -1646,16 +1661,16 @@ static VALUE cParser_parse(VALUE self) GET_STRUCT; -#line 1650 "parser.c" +#line 1665 "parser.c" { cs = JSON_start; } -#line 647 "parser.rl" +#line 662 "parser.rl" p = json->source; pe = p + json->len; -#line 1659 "parser.c" +#line 1674 "parser.c" { if ( p == pe ) goto _test_eof; @@ -1711,7 +1726,7 @@ case 5: goto st1; goto st5; tr3: -#line 465 "parser.rl" +#line 480 "parser.rl" { char *np; json->current_nesting = 1; @@ -1720,7 +1735,7 @@ tr3: } goto st10; tr4: -#line 458 "parser.rl" +#line 473 "parser.rl" { char *np; json->current_nesting = 1; @@ -1732,7 +1747,7 @@ st10: if ( ++p == pe ) goto _test_eof10; case 10: -#line 1736 "parser.c" +#line 1751 "parser.c" switch( (*p) ) { case 13: goto st10; case 32: goto st10; @@ -1789,7 +1804,7 @@ case 9: _out: {} } -#line 650 "parser.rl" +#line 665 "parser.rl" if (cs >= JSON_first_final && p == pe) { return result; diff --git a/ext/json/ext/parser/parser.rl b/ext/json/ext/parser/parser.rl index 8eca179..02b2b6a 100644 --- a/ext/json/ext/parser/parser.rl +++ b/ext/json/ext/parser/parser.rl @@ -354,58 +354,73 @@ static char *JSON_parse_array(JSON_Parser *json, char *p, char *pe, VALUE *resul } } -static VALUE json_string_unescape(char *p, char *pe) +inline static VALUE json_string_unescape(VALUE result, char *string, char *stringEnd) { - VALUE result = rb_str_buf_new(pe - p + 1); - - while (p < pe) { - if (*p == '\\') { - p++; - switch (*p) { + char *p = string, *pe = string, *unescape; + int unescape_len; + + while (pe < stringEnd) { + if (*pe == '\\') { + unescape = "?"; + unescape_len = 1; + if (pe > p) rb_str_buf_cat(result, p, pe - p); + switch (*++pe) { + case 'n': + unescape = "\n"; + break; + case 'r': + unescape = "\r"; + break; + case 't': + unescape = "\t"; + break; case '"': + unescape = "\""; + break; case '\\': - rb_str_buf_cat(result, p, 1); - p++; + unescape = "\\"; break; case 'b': - rb_str_buf_cat2(result, "\b"); - p++; + unescape = "\b"; break; case 'f': - rb_str_buf_cat2(result, "\f"); - p++; - break; - case 'n': - rb_str_buf_cat2(result, "\n"); - p++; - break; - case 'r': - rb_str_buf_cat2(result, "\r"); - p++; - break; - case 't': - rb_str_buf_cat2(result, "\t"); - p++; + unescape = "\f"; break; case 'u': - if (p > pe - 4) { + if (pe > stringEnd - 4) { return Qnil; } else { - p = JSON_convert_UTF16_to_UTF8(result, p, pe); + char buf[4]; + UTF32 ch = unescape_unicode((unsigned char *) ++pe); + pe += 3; + if (UNI_SUR_HIGH_START == (ch & 0xFC00)) { + pe++; + if (pe > stringEnd - 6) return Qnil; + if (pe[0] == '\\' && pe[1] == 'u') { + UTF32 sur = unescape_unicode((unsigned char *) pe + 2); + ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) + | (sur & 0x3FF)); + pe += 5; + } else { + unescape = "?"; + break; + } + } + unescape_len = convert_UTF32_to_UTF8(buf, ch); + unescape = buf; } break; default: - rb_str_buf_cat(result, p, 1); - p++; - break; + p = pe; + continue; } + rb_str_buf_cat(result, unescape, unescape_len); + p = ++pe; } else { - char *q = p; - while (*q != '\\' && q < pe) q++; - rb_str_buf_cat(result, p, q - p); - p = q; + pe++; } } + rb_str_buf_cat(result, p, pe - p); return result; } @@ -416,7 +431,7 @@ static VALUE json_string_unescape(char *p, char *pe) write data; action parse_string { - *result = json_string_unescape(json->memo + 1, p); + *result = json_string_unescape(*result, json->memo + 1, p); if (NIL_P(*result)) { fhold; fbreak; @@ -435,7 +450,7 @@ static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *resu { int cs = EVIL; - *result = rb_str_new("", 0); + *result = rb_str_buf_new(0); %% write init; json->memo = p; %% write exec; diff --git a/ext/json/ext/parser/unicode.c b/ext/json/ext/parser/unicode.c index 711aac5..45462c9 100644 --- a/ext/json/ext/parser/unicode.c +++ b/ext/json/ext/parser/unicode.c @@ -1,36 +1,5 @@ #include "unicode.h" -/* - * Copyright 2001-2004 Unicode, Inc. - * - * Disclaimer - * - * This source code is provided as is by Unicode, Inc. No claims are - * made as to fitness for any particular purpose. No warranties of any - * kind are expressed or implied. The recipient agrees to determine - * applicability of information provided. If this file has been - * purchased on magnetic or optical media from Unicode, Inc., the - * sole remedy for any claim will be exchange of defective media - * within 90 days of receipt. - * - * Limitations on Rights to Redistribute This Code - * - * Unicode, Inc. hereby grants the right to freely use the information - * supplied in this file in the creation of products supporting the - * Unicode Standard, and to make copies of this file in any form - * for internal or external distribution as long as this notice - * remains attached. - */ - -/* - * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed - * into the first byte, depending on how many bytes follow. There are - * as many entries in this table as there are UTF-8 sequence types. - * (I.e., one byte sequence, two byte... etc.). Remember that sequencs - * for *legal* UTF-8 will be 4 or fewer bytes total. - */ -static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; - static const char digit_values[256] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, @@ -48,97 +17,47 @@ static const char digit_values[256] = { -1, -1, -1, -1, -1, -1, -1 }; -char *JSON_convert_UTF16_to_UTF8 ( - VALUE buffer, - char *source, - char *sourceEnd) +inline UTF32 unescape_unicode(const unsigned char *p) { - UTF16 *tmp, *tmpPtr, *tmpEnd; - char buf[5]; - long n = 0; - char failed = 1, c, *p = source - 1; - - while (p < sourceEnd && p[0] == '\\' && p[1] == 'u') { - p += 6; - n++; - } - p = source + 1; - tmpPtr = tmp = ALLOC_N(UTF16, n); - tmpEnd = tmp + n; - while (tmpPtr < tmpEnd) { - c = digit_values[(unsigned char) *p++]; - failed *= c; - *tmpPtr = c << 12; - c = digit_values[(unsigned char) *p++]; - failed *= c; - *tmpPtr |= c << 8; - c = digit_values[(unsigned char) *p++]; - failed *= c; - *tmpPtr |= c << 4; - c = digit_values[(unsigned char) *p++]; - failed *= c; - *tmpPtr++ |= c; - p += 2; - } - if (failed < 0) { - rb_raise(rb_path2class("JSON::ParserError"), - "illegal \\uXXXX unicode value near %s", source); - } - - tmpPtr = tmp; - while (tmpPtr < tmpEnd) { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - ch = *tmpPtr++; - /* If we have a surrogate pair, convert to UTF32 first. */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { - /* If the 16 bits following the high surrogate are in the source - * buffer... */ - if (tmpPtr < tmpEnd) { - UTF32 ch2 = *tmpPtr; - /* If it's a low surrogate, convert to UTF32. */ - if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { - ch = ((ch - UNI_SUR_HIGH_START) << halfShift) - + (ch2 - UNI_SUR_LOW_START) + halfBase; - ++tmpPtr; - } - } else { /* We don't have the 16 bits following the high surrogate. */ - ruby_xfree(tmp); - rb_raise(rb_path2class("JSON::ParserError"), - "partial character in source, but hit end near %s", source); - break; - } - } - /* Figure out how many bytes the result will require */ - if (ch < (UTF32) 0x80) { - bytesToWrite = 1; - } else if (ch < (UTF32) 0x800) { - bytesToWrite = 2; - } else if (ch < (UTF32) 0x10000) { - bytesToWrite = 3; - } else if (ch < (UTF32) 0x110000) { - bytesToWrite = 4; - } else { - bytesToWrite = 3; - ch = UNI_REPLACEMENT_CHAR; - } + char b; + UTF32 result = 0; + b = digit_values[p[0]]; + if (b < 0) return UNI_REPLACEMENT_CHAR; + result = (result << 4) | b; + b = digit_values[p[1]]; + result = (result << 4) | b; + if (b < 0) return UNI_REPLACEMENT_CHAR; + b = digit_values[p[2]]; + result = (result << 4) | b; + if (b < 0) return UNI_REPLACEMENT_CHAR; + b = digit_values[p[3]]; + result = (result << 4) | b; + if (b < 0) return UNI_REPLACEMENT_CHAR; + return result; +} - buf[0] = 0; - buf[1] = 0; - buf[2] = 0; - buf[3] = 0; - p = buf + bytesToWrite; - switch (bytesToWrite) { /* note: everything falls through. */ - case 4: *--p = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6; - case 3: *--p = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6; - case 2: *--p = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6; - case 1: *--p = (UTF8) (ch | firstByteMark[bytesToWrite]); - } - rb_str_buf_cat(buffer, p, bytesToWrite); +inline int convert_UTF32_to_UTF8(char *buf, UTF32 ch) +{ + int len = 1; + if (ch <= 0x7F) { + buf[0] = (char) ch; + } else if (ch <= 0x07FF) { + buf[0] = (char) ((ch >> 6) | 0xC0); + buf[1] = (char) ((ch & 0x3F) | 0x80); + len++; + } else if (ch <= 0xFFFF) { + buf[0] = (char) ((ch >> 12) | 0xE0); + buf[1] = (char) (((ch >> 6) & 0x3F) | 0x80); + buf[2] = (char) ((ch & 0x3F) | 0x80); + len += 2; + } else if (ch <= 0x1fffff) { + buf[0] =(char) ((ch >> 18) | 0xF0); + buf[1] =(char) (((ch >> 12) & 0x3F) | 0x80); + buf[2] =(char) (((ch >> 6) & 0x3F) | 0x80); + buf[3] =(char) ((ch & 0x3F) | 0x80); + len += 3; + } else { + buf[0] = '?'; } - ruby_xfree(tmp); - source += 6 * n - 1; - return source; + return len; } diff --git a/ext/json/ext/parser/unicode.h b/ext/json/ext/parser/unicode.h index 40de426..1e327ae 100644 --- a/ext/json/ext/parser/unicode.h +++ b/ext/json/ext/parser/unicode.h @@ -9,44 +9,12 @@ typedef unsigned short UTF16; /* at least 16 bits */ typedef unsigned char UTF8; /* typically 8 bits */ #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD -#define UNI_MAX_BMP (UTF32)0x0000FFFF -#define UNI_MAX_UTF16 (UTF32)0x0010FFFF -#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF -#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF - #define UNI_SUR_HIGH_START (UTF32)0xD800 #define UNI_SUR_HIGH_END (UTF32)0xDBFF #define UNI_SUR_LOW_START (UTF32)0xDC00 #define UNI_SUR_LOW_END (UTF32)0xDFFF -static const int halfShift = 10; /* used for shifting by 10 bits */ - -static const UTF32 halfBase = 0x0010000UL; -static const UTF32 halfMask = 0x3FFUL; - -typedef enum { - conversionOK = 0, /* conversion successful */ - sourceExhausted, /* partial character in source, but hit end */ - targetExhausted, /* insuff. room in target for conversion */ - sourceIllegal /* source sequence is illegal/malformed */ -} ConversionResult; - -char *JSON_convert_UTF16_to_UTF8 ( - VALUE buffer, - char *source, - char *sourceEnd); - -#ifndef RARRAY_PTR -#define RARRAY_PTR(ARRAY) RARRAY(ARRAY)->ptr -#endif -#ifndef RARRAY_LEN -#define RARRAY_LEN(ARRAY) RARRAY(ARRAY)->len -#endif -#ifndef RSTRING_PTR -#define RSTRING_PTR(string) RSTRING(string)->ptr -#endif -#ifndef RSTRING_LEN -#define RSTRING_LEN(string) RSTRING(string)->len -#endif +inline UTF32 unescape_unicode(const unsigned char *p); +inline int convert_UTF32_to_UTF8(char *buf, UTF32 ch); #endif |