summary refs log tree commit diff
diff options
context:
space:
mode:
author Florian Frank <flori@ping.de> 2009-10-26 15:58:00 +0100
committer Florian Frank <flori@ping.de> 2009-10-26 22:58:08 +0100
commit f1504ee153790f12ee43bb4ef2551fa76970f519 (patch)
tree a1b7d9cff62d9b4c7d3e9947066488669ae96db9
parent dd06e48aa414674f52e81f9cdc7836b6456c04f8 (diff)
download json-f1504ee153790f12ee43bb4ef2551fa76970f519.tar.gz
improved parser a bit
-rw-r--r-- ext/json/ext/generator/unicode.c 3
-rw-r--r-- ext/json/ext/parser/parser.c 123
-rw-r--r-- ext/json/ext/parser/parser.rl 87
-rw-r--r-- ext/json/ext/parser/unicode.c 161
-rw-r--r-- ext/json/ext/parser/unicode.h 36
5 files changed, 163 insertions, 247 deletions
diff --git a/ext/json/ext/generator/unicode.c b/ext/json/ext/generator/unicode.c
index 53a2ec1..e470eea 100644
--- a/ext/json/ext/generator/unicode.c
+++ b/ext/json/ext/generator/unicode.c
@@ -200,8 +200,7 @@ inline void JSON_convert_UTF8_to_JSON(FBuffer *buffer, VALUE string)
int len = RSTRING_LEN(string), start = 0, end = 0;
const char *escape = NULL;
int escape_len;
- char buf[7] = { '\\', 'u' };
- buf[6] = 0;
+ char buf[6] = { '\\', 'u' };
for (start = 0, end = 0; end < len;) {
p = ptr + end;
diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c
index 78b4ff9..f67ea0c 100644
--- a/ext/json/ext/parser/parser.c
+++ b/ext/json/ext/parser/parser.c
@@ -1255,63 +1255,78 @@ case 16:
}
}
-static VALUE json_string_unescape(char *p, char *pe)
+inline static VALUE json_string_unescape(VALUE result, char *string, char *stringEnd)
{
- VALUE result = rb_str_buf_new(pe - p + 1);
-
- while (p < pe) {
- if (*p == '\\') {
- p++;
- switch (*p) {
+ char *p = string, *pe = string, *unescape;
+ int unescape_len;
+
+ while (pe < stringEnd) {
+ if (*pe == '\\') {
+ unescape = "?";
+ unescape_len = 1;
+ if (pe > p) rb_str_buf_cat(result, p, pe - p);
+ switch (*++pe) {
+ case 'n':
+ unescape = "\n";
+ break;
+ case 'r':
+ unescape = "\r";
+ break;
+ case 't':
+ unescape = "\t";
+ break;
case '"':
+ unescape = "\"";
+ break;
case '\\':
- rb_str_buf_cat(result, p, 1);
- p++;
+ unescape = "\\";
break;
case 'b':
- rb_str_buf_cat2(result, "\b");
- p++;
+ unescape = "\b";
break;
case 'f':
- rb_str_buf_cat2(result, "\f");
- p++;
- break;
- case 'n':
- rb_str_buf_cat2(result, "\n");
- p++;
- break;
- case 'r':
- rb_str_buf_cat2(result, "\r");
- p++;
- break;
- case 't':
- rb_str_buf_cat2(result, "\t");
- p++;
+ unescape = "\f";
break;
case 'u':
- if (p > pe - 4) {
+ if (pe > stringEnd - 4) {
return Qnil;
} else {
- p = JSON_convert_UTF16_to_UTF8(result, p, pe);
+ char buf[4];
+ UTF32 ch = unescape_unicode((unsigned char *) ++pe);
+ pe += 3;
+ if (UNI_SUR_HIGH_START == (ch & 0xFC00)) {
+ pe++;
+ if (pe > stringEnd - 6) return Qnil;
+ if (pe[0] == '\\' && pe[1] == 'u') {
+ UTF32 sur = unescape_unicode((unsigned char *) pe + 2);
+ ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
+ | (sur & 0x3FF));
+ pe += 5;
+ } else {
+ unescape = "?";
+ break;
+ }
+ }
+ unescape_len = convert_UTF32_to_UTF8(buf, ch);
+ unescape = buf;
}
break;
default:
- rb_str_buf_cat(result, p, 1);
- p++;
- break;
+ p = pe;
+ continue;
}
+ rb_str_buf_cat(result, unescape, unescape_len);
+ p = ++pe;
} else {
- char *q = p;
- while (*q != '\\' && q < pe) q++;
- rb_str_buf_cat(result, p, q - p);
- p = q;
+ pe++;
}
}
+ rb_str_buf_cat(result, p, pe - p);
return result;
}
-#line 1315 "parser.c"
+#line 1330 "parser.c"
static const int JSON_string_start = 1;
static const int JSON_string_first_final = 8;
static const int JSON_string_error = 0;
@@ -1319,24 +1334,24 @@ static const int JSON_string_error = 0;
static const int JSON_string_en_main = 1;
-#line 432 "parser.rl"
+#line 447 "parser.rl"
static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *result)
{
int cs = EVIL;
- *result = rb_str_new("", 0);
+ *result = rb_str_buf_new(0);
-#line 1332 "parser.c"
+#line 1347 "parser.c"
{
cs = JSON_string_start;
}
-#line 440 "parser.rl"
+#line 455 "parser.rl"
json->memo = p;
-#line 1340 "parser.c"
+#line 1355 "parser.c"
{
if ( p == pe )
goto _test_eof;
@@ -1361,9 +1376,9 @@ case 2:
goto st0;
goto st2;
tr2:
-#line 418 "parser.rl"
+#line 433 "parser.rl"
{
- *result = json_string_unescape(json->memo + 1, p);
+ *result = json_string_unescape(*result, json->memo + 1, p);
if (NIL_P(*result)) {
p--;
{p++; cs = 8; goto _out;}
@@ -1372,14 +1387,14 @@ tr2:
{p = (( p + 1))-1;}
}
}
-#line 429 "parser.rl"
+#line 444 "parser.rl"
{ p--; {p++; cs = 8; goto _out;} }
goto st8;
st8:
if ( ++p == pe )
goto _test_eof8;
case 8:
-#line 1383 "parser.c"
+#line 1398 "parser.c"
goto st0;
st3:
if ( ++p == pe )
@@ -1455,7 +1470,7 @@ case 7:
_out: {}
}
-#line 442 "parser.rl"
+#line 457 "parser.rl"
if (cs >= JSON_string_first_final) {
return p + 1;
@@ -1466,7 +1481,7 @@ case 7:
-#line 1470 "parser.c"
+#line 1485 "parser.c"
static const int JSON_start = 1;
static const int JSON_first_final = 10;
static const int JSON_error = 0;
@@ -1474,7 +1489,7 @@ static const int JSON_error = 0;
static const int JSON_en_main = 1;
-#line 476 "parser.rl"
+#line 491 "parser.rl"
/*
@@ -1646,16 +1661,16 @@ static VALUE cParser_parse(VALUE self)
GET_STRUCT;
-#line 1650 "parser.c"
+#line 1665 "parser.c"
{
cs = JSON_start;
}
-#line 647 "parser.rl"
+#line 662 "parser.rl"
p = json->source;
pe = p + json->len;
-#line 1659 "parser.c"
+#line 1674 "parser.c"
{
if ( p == pe )
goto _test_eof;
@@ -1711,7 +1726,7 @@ case 5:
goto st1;
goto st5;
tr3:
-#line 465 "parser.rl"
+#line 480 "parser.rl"
{
char *np;
json->current_nesting = 1;
@@ -1720,7 +1735,7 @@ tr3:
}
goto st10;
tr4:
-#line 458 "parser.rl"
+#line 473 "parser.rl"
{
char *np;
json->current_nesting = 1;
@@ -1732,7 +1747,7 @@ st10:
if ( ++p == pe )
goto _test_eof10;
case 10:
-#line 1736 "parser.c"
+#line 1751 "parser.c"
switch( (*p) ) {
case 13: goto st10;
case 32: goto st10;
@@ -1789,7 +1804,7 @@ case 9:
_out: {}
}
-#line 650 "parser.rl"
+#line 665 "parser.rl"
if (cs >= JSON_first_final && p == pe) {
return result;
diff --git a/ext/json/ext/parser/parser.rl b/ext/json/ext/parser/parser.rl
index 8eca179..02b2b6a 100644
--- a/ext/json/ext/parser/parser.rl
+++ b/ext/json/ext/parser/parser.rl
@@ -354,58 +354,73 @@ static char *JSON_parse_array(JSON_Parser *json, char *p, char *pe, VALUE *resul
}
}
-static VALUE json_string_unescape(char *p, char *pe)
+inline static VALUE json_string_unescape(VALUE result, char *string, char *stringEnd)
{
- VALUE result = rb_str_buf_new(pe - p + 1);
-
- while (p < pe) {
- if (*p == '\\') {
- p++;
- switch (*p) {
+ char *p = string, *pe = string, *unescape;
+ int unescape_len;
+
+ while (pe < stringEnd) {
+ if (*pe == '\\') {
+ unescape = "?";
+ unescape_len = 1;
+ if (pe > p) rb_str_buf_cat(result, p, pe - p);
+ switch (*++pe) {
+ case 'n':
+ unescape = "\n";
+ break;
+ case 'r':
+ unescape = "\r";
+ break;
+ case 't':
+ unescape = "\t";
+ break;
case '"':
+ unescape = "\"";
+ break;
case '\\':
- rb_str_buf_cat(result, p, 1);
- p++;
+ unescape = "\\";
break;
case 'b':
- rb_str_buf_cat2(result, "\b");
- p++;
+ unescape = "\b";
break;
case 'f':
- rb_str_buf_cat2(result, "\f");
- p++;
- break;
- case 'n':
- rb_str_buf_cat2(result, "\n");
- p++;
- break;
- case 'r':
- rb_str_buf_cat2(result, "\r");
- p++;
- break;
- case 't':
- rb_str_buf_cat2(result, "\t");
- p++;
+ unescape = "\f";
break;
case 'u':
- if (p > pe - 4) {
+ if (pe > stringEnd - 4) {
return Qnil;
} else {
- p = JSON_convert_UTF16_to_UTF8(result, p, pe);
+ char buf[4];
+ UTF32 ch = unescape_unicode((unsigned char *) ++pe);
+ pe += 3;
+ if (UNI_SUR_HIGH_START == (ch & 0xFC00)) {
+ pe++;
+ if (pe > stringEnd - 6) return Qnil;
+ if (pe[0] == '\\' && pe[1] == 'u') {
+ UTF32 sur = unescape_unicode((unsigned char *) pe + 2);
+ ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
+ | (sur & 0x3FF));
+ pe += 5;
+ } else {
+ unescape = "?";
+ break;
+ }
+ }
+ unescape_len = convert_UTF32_to_UTF8(buf, ch);
+ unescape = buf;
}
break;
default:
- rb_str_buf_cat(result, p, 1);
- p++;
- break;
+ p = pe;
+ continue;
}
+ rb_str_buf_cat(result, unescape, unescape_len);
+ p = ++pe;
} else {
- char *q = p;
- while (*q != '\\' && q < pe) q++;
- rb_str_buf_cat(result, p, q - p);
- p = q;
+ pe++;
}
}
+ rb_str_buf_cat(result, p, pe - p);
return result;
}
@@ -416,7 +431,7 @@ static VALUE json_string_unescape(char *p, char *pe)
write data;
action parse_string {
- *result = json_string_unescape(json->memo + 1, p);
+ *result = json_string_unescape(*result, json->memo + 1, p);
if (NIL_P(*result)) {
fhold;
fbreak;
@@ -435,7 +450,7 @@ static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *resu
{
int cs = EVIL;
- *result = rb_str_new("", 0);
+ *result = rb_str_buf_new(0);
%% write init;
json->memo = p;
%% write exec;
diff --git a/ext/json/ext/parser/unicode.c b/ext/json/ext/parser/unicode.c
index 711aac5..45462c9 100644
--- a/ext/json/ext/parser/unicode.c
+++ b/ext/json/ext/parser/unicode.c
@@ -1,36 +1,5 @@
#include "unicode.h"
-/*
- * Copyright 2001-2004 Unicode, Inc.
- *
- * Disclaimer
- *
- * This source code is provided as is by Unicode, Inc. No claims are
- * made as to fitness for any particular purpose. No warranties of any
- * kind are expressed or implied. The recipient agrees to determine
- * applicability of information provided. If this file has been
- * purchased on magnetic or optical media from Unicode, Inc., the
- * sole remedy for any claim will be exchange of defective media
- * within 90 days of receipt.
- *
- * Limitations on Rights to Redistribute This Code
- *
- * Unicode, Inc. hereby grants the right to freely use the information
- * supplied in this file in the creation of products supporting the
- * Unicode Standard, and to make copies of this file in any form
- * for internal or external distribution as long as this notice
- * remains attached.
- */
-
-/*
- * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
- * into the first byte, depending on how many bytes follow. There are
- * as many entries in this table as there are UTF-8 sequence types.
- * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
- * for *legal* UTF-8 will be 4 or fewer bytes total.
- */
-static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
-
static const char digit_values[256] = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
@@ -48,97 +17,47 @@ static const char digit_values[256] = {
-1, -1, -1, -1, -1, -1, -1
};
-char *JSON_convert_UTF16_to_UTF8 (
- VALUE buffer,
- char *source,
- char *sourceEnd)
+inline UTF32 unescape_unicode(const unsigned char *p)
{
- UTF16 *tmp, *tmpPtr, *tmpEnd;
- char buf[5];
- long n = 0;
- char failed = 1, c, *p = source - 1;
-
- while (p < sourceEnd && p[0] == '\\' && p[1] == 'u') {
- p += 6;
- n++;
- }
- p = source + 1;
- tmpPtr = tmp = ALLOC_N(UTF16, n);
- tmpEnd = tmp + n;
- while (tmpPtr < tmpEnd) {
- c = digit_values[(unsigned char) *p++];
- failed *= c;
- *tmpPtr = c << 12;
- c = digit_values[(unsigned char) *p++];
- failed *= c;
- *tmpPtr |= c << 8;
- c = digit_values[(unsigned char) *p++];
- failed *= c;
- *tmpPtr |= c << 4;
- c = digit_values[(unsigned char) *p++];
- failed *= c;
- *tmpPtr++ |= c;
- p += 2;
- }
- if (failed < 0) {
- rb_raise(rb_path2class("JSON::ParserError"),
- "illegal \\uXXXX unicode value near %s", source);
- }
-
- tmpPtr = tmp;
- while (tmpPtr < tmpEnd) {
- UTF32 ch;
- unsigned short bytesToWrite = 0;
- const UTF32 byteMask = 0xBF;
- const UTF32 byteMark = 0x80;
- ch = *tmpPtr++;
- /* If we have a surrogate pair, convert to UTF32 first. */
- if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
- /* If the 16 bits following the high surrogate are in the source
- * buffer... */
- if (tmpPtr < tmpEnd) {
- UTF32 ch2 = *tmpPtr;
- /* If it's a low surrogate, convert to UTF32. */
- if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
- ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
- + (ch2 - UNI_SUR_LOW_START) + halfBase;
- ++tmpPtr;
- }
- } else { /* We don't have the 16 bits following the high surrogate. */
- ruby_xfree(tmp);
- rb_raise(rb_path2class("JSON::ParserError"),
- "partial character in source, but hit end near %s", source);
- break;
- }
- }
- /* Figure out how many bytes the result will require */
- if (ch < (UTF32) 0x80) {
- bytesToWrite = 1;
- } else if (ch < (UTF32) 0x800) {
- bytesToWrite = 2;
- } else if (ch < (UTF32) 0x10000) {
- bytesToWrite = 3;
- } else if (ch < (UTF32) 0x110000) {
- bytesToWrite = 4;
- } else {
- bytesToWrite = 3;
- ch = UNI_REPLACEMENT_CHAR;
- }
+ char b;
+ UTF32 result = 0;
+ b = digit_values[p[0]];
+ if (b < 0) return UNI_REPLACEMENT_CHAR;
+ result = (result << 4) | b;
+ b = digit_values[p[1]];
+ result = (result << 4) | b;
+ if (b < 0) return UNI_REPLACEMENT_CHAR;
+ b = digit_values[p[2]];
+ result = (result << 4) | b;
+ if (b < 0) return UNI_REPLACEMENT_CHAR;
+ b = digit_values[p[3]];
+ result = (result << 4) | b;
+ if (b < 0) return UNI_REPLACEMENT_CHAR;
+ return result;
+}
- buf[0] = 0;
- buf[1] = 0;
- buf[2] = 0;
- buf[3] = 0;
- p = buf + bytesToWrite;
- switch (bytesToWrite) { /* note: everything falls through. */
- case 4: *--p = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
- case 3: *--p = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
- case 2: *--p = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
- case 1: *--p = (UTF8) (ch | firstByteMark[bytesToWrite]);
- }
- rb_str_buf_cat(buffer, p, bytesToWrite);
+inline int convert_UTF32_to_UTF8(char *buf, UTF32 ch)
+{
+ int len = 1;
+ if (ch <= 0x7F) {
+ buf[0] = (char) ch;
+ } else if (ch <= 0x07FF) {
+ buf[0] = (char) ((ch >> 6) | 0xC0);
+ buf[1] = (char) ((ch & 0x3F) | 0x80);
+ len++;
+ } else if (ch <= 0xFFFF) {
+ buf[0] = (char) ((ch >> 12) | 0xE0);
+ buf[1] = (char) (((ch >> 6) & 0x3F) | 0x80);
+ buf[2] = (char) ((ch & 0x3F) | 0x80);
+ len += 2;
+ } else if (ch <= 0x1fffff) {
+ buf[0] =(char) ((ch >> 18) | 0xF0);
+ buf[1] =(char) (((ch >> 12) & 0x3F) | 0x80);
+ buf[2] =(char) (((ch >> 6) & 0x3F) | 0x80);
+ buf[3] =(char) ((ch & 0x3F) | 0x80);
+ len += 3;
+ } else {
+ buf[0] = '?';
}
- ruby_xfree(tmp);
- source += 6 * n - 1;
- return source;
+ return len;
}
diff --git a/ext/json/ext/parser/unicode.h b/ext/json/ext/parser/unicode.h
index 40de426..1e327ae 100644
--- a/ext/json/ext/parser/unicode.h
+++ b/ext/json/ext/parser/unicode.h
@@ -9,44 +9,12 @@ typedef unsigned short UTF16; /* at least 16 bits */
typedef unsigned char UTF8; /* typically 8 bits */
#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
-#define UNI_MAX_BMP (UTF32)0x0000FFFF
-#define UNI_MAX_UTF16 (UTF32)0x0010FFFF
-#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
-#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
-
#define UNI_SUR_HIGH_START (UTF32)0xD800
#define UNI_SUR_HIGH_END (UTF32)0xDBFF
#define UNI_SUR_LOW_START (UTF32)0xDC00
#define UNI_SUR_LOW_END (UTF32)0xDFFF
-static const int halfShift = 10; /* used for shifting by 10 bits */
-
-static const UTF32 halfBase = 0x0010000UL;
-static const UTF32 halfMask = 0x3FFUL;
-
-typedef enum {
- conversionOK = 0, /* conversion successful */
- sourceExhausted, /* partial character in source, but hit end */
- targetExhausted, /* insuff. room in target for conversion */
- sourceIllegal /* source sequence is illegal/malformed */
-} ConversionResult;
-
-char *JSON_convert_UTF16_to_UTF8 (
- VALUE buffer,
- char *source,
- char *sourceEnd);
-
-#ifndef RARRAY_PTR
-#define RARRAY_PTR(ARRAY) RARRAY(ARRAY)->ptr
-#endif
-#ifndef RARRAY_LEN
-#define RARRAY_LEN(ARRAY) RARRAY(ARRAY)->len
-#endif
-#ifndef RSTRING_PTR
-#define RSTRING_PTR(string) RSTRING(string)->ptr
-#endif
-#ifndef RSTRING_LEN
-#define RSTRING_LEN(string) RSTRING(string)->len
-#endif
+inline UTF32 unescape_unicode(const unsigned char *p);
+inline int convert_UTF32_to_UTF8(char *buf, UTF32 ch);
#endif