improved parser a bit

author: Florian Frank <flori@ping.de> 2009-10-26 15:58:00 +0100
committer: Florian Frank <flori@ping.de> 2009-10-26 22:58:08 +0100
commit: f1504ee153790f12ee43bb4ef2551fa76970f519 (patch)
tree: a1b7d9cff62d9b4c7d3e9947066488669ae96db9
parent: dd06e48aa414674f52e81f9cdc7836b6456c04f8 (diff)
download: json-f1504ee153790f12ee43bb4ef2551fa76970f519.tar.gz
5 files changed, 163 insertions, 247 deletions
diff --git a/ext/json/ext/generator/unicode.c b/ext/json/ext/generator/unicode.c
index 53a2ec1..e470eea 100644
--- a/ext/json/ext/generator/unicode.c
+++ b/ext/json/ext/generator/unicode.c
@@ -200,8 +200,7 @@ inline void JSON_convert_UTF8_to_JSON(FBuffer *buffer, VALUE string)
     int len = RSTRING_LEN(string), start = 0, end = 0;
     const char *escape = NULL;
     int escape_len;
-    char buf[7] = { '\\', 'u' };
-    buf[6] = 0; 
+    char buf[6] = { '\\', 'u' };
 
     for (start = 0, end = 0; end < len;) {
         p = ptr + end;
diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c
index 78b4ff9..f67ea0c 100644
--- a/ext/json/ext/parser/parser.c
+++ b/ext/json/ext/parser/parser.c
@@ -1255,63 +1255,78 @@ case 16:
     }
 }
 
-static VALUE json_string_unescape(char *p, char *pe)
+inline static VALUE json_string_unescape(VALUE result, char *string, char *stringEnd)
 {
-    VALUE result = rb_str_buf_new(pe - p + 1);
-
-    while (p < pe) {
-        if (*p == '\\') {
-            p++;
-            switch (*p) {
+    char *p = string, *pe = string, *unescape;
+    int unescape_len;
+
+    while (pe < stringEnd) {
+        if (*pe == '\\') {
+            unescape = "?";
+            unescape_len = 1;
+            if (pe > p) rb_str_buf_cat(result, p, pe - p);
+            switch (*++pe) {
+                case 'n':
+                    unescape = "\n";
+                    break;
+                case 'r':
+                    unescape = "\r";
+                    break;
+                case 't':
+                    unescape = "\t";
+                    break;
                 case '"':
+                    unescape = "\"";
+                    break;
                 case '\\':
-                    rb_str_buf_cat(result, p, 1);
-                    p++;
+                    unescape = "\\";
                     break;
                 case 'b':
-                    rb_str_buf_cat2(result, "\b");
-                    p++;
+                    unescape = "\b";
                     break;
                 case 'f':
-                    rb_str_buf_cat2(result, "\f");
-                    p++;
-                    break;
-                case 'n':
-                    rb_str_buf_cat2(result, "\n");
-                    p++;
-                    break;
-                case 'r':
-                    rb_str_buf_cat2(result, "\r");
-                    p++;
-                    break;
-                case 't':
-                    rb_str_buf_cat2(result, "\t");
-                    p++;
+                    unescape = "\f";
                     break;
                 case 'u':
-                    if (p > pe - 4) { 
+                    if (pe > stringEnd - 4) { 
                         return Qnil;
                     } else {
-                        p = JSON_convert_UTF16_to_UTF8(result, p, pe);
+                        char buf[4];
+                        UTF32 ch = unescape_unicode((unsigned char *) ++pe);
+                        pe += 3;
+                        if (UNI_SUR_HIGH_START == (ch & 0xFC00)) {
+                            pe++;
+                            if (pe > stringEnd - 6) return Qnil;
+                            if (pe[0] == '\\' && pe[1] == 'u') {
+                                UTF32 sur = unescape_unicode((unsigned char *) pe + 2);
+                                ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
+                                        | (sur & 0x3FF));
+                                pe += 5;
+                            } else {
+                                unescape = "?";
+                                break;
+                            }
+                        }
+                        unescape_len = convert_UTF32_to_UTF8(buf, ch);
+                        unescape = buf;
                     }
                     break;
                 default:
-                    rb_str_buf_cat(result, p, 1);
-                    p++;
-                    break;
+                    p = pe;
+                    continue;
             }
+            rb_str_buf_cat(result, unescape, unescape_len);
+            p = ++pe;
         } else {
-            char *q = p;
-            while (*q != '\\' && q < pe) q++;
-            rb_str_buf_cat(result, p, q - p);
-            p = q;
+            pe++;
         }
     }
+    rb_str_buf_cat(result, p, pe - p);
     return result;
 }
 
 
-#line 1315 "parser.c"
+#line 1330 "parser.c"
 static const int JSON_string_start = 1;
 static const int JSON_string_first_final = 8;
 static const int JSON_string_error = 0;
@@ -1319,24 +1334,24 @@ static const int JSON_string_error = 0;
 static const int JSON_string_en_main = 1;
 
 
-#line 432 "parser.rl"
+#line 447 "parser.rl"
 
 
 static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *result)
 {
     int cs = EVIL;
 
-    *result = rb_str_new("", 0);
+    *result = rb_str_buf_new(0);
     
-#line 1332 "parser.c"
+#line 1347 "parser.c"
 	{
 	cs = JSON_string_start;
 	}
 
-#line 440 "parser.rl"
+#line 455 "parser.rl"
     json->memo = p;
     
-#line 1340 "parser.c"
+#line 1355 "parser.c"
 	{
 	if ( p == pe )
 		goto _test_eof;
@@ -1361,9 +1376,9 @@ case 2:
 		goto st0;
 	goto st2;
 tr2:
-#line 418 "parser.rl"
+#line 433 "parser.rl"
 	{
-        *result = json_string_unescape(json->memo + 1, p);
+        *result = json_string_unescape(*result, json->memo + 1, p);
         if (NIL_P(*result)) {
 			p--;
 			{p++; cs = 8; goto _out;}
@@ -1372,14 +1387,14 @@ tr2:
 			{p = (( p + 1))-1;}
 		}
 	}
-#line 429 "parser.rl"
+#line 444 "parser.rl"
 	{ p--; {p++; cs = 8; goto _out;} }
 	goto st8;
 st8:
 	if ( ++p == pe )
 		goto _test_eof8;
 case 8:
-#line 1383 "parser.c"
+#line 1398 "parser.c"
 	goto st0;
 st3:
 	if ( ++p == pe )
@@ -1455,7 +1470,7 @@ case 7:
 	_out: {}
 	}
 
-#line 442 "parser.rl"
+#line 457 "parser.rl"
 
     if (cs >= JSON_string_first_final) {
         return p + 1;
@@ -1466,7 +1481,7 @@ case 7:
 
 
 
-#line 1470 "parser.c"
+#line 1485 "parser.c"
 static const int JSON_start = 1;
 static const int JSON_first_final = 10;
 static const int JSON_error = 0;
@@ -1474,7 +1489,7 @@ static const int JSON_error = 0;
 static const int JSON_en_main = 1;
 
 
-#line 476 "parser.rl"
+#line 491 "parser.rl"
 
 
 /* 
@@ -1646,16 +1661,16 @@ static VALUE cParser_parse(VALUE self)
     GET_STRUCT;
 
     
-#line 1650 "parser.c"
+#line 1665 "parser.c"
 	{
 	cs = JSON_start;
 	}
 
-#line 647 "parser.rl"
+#line 662 "parser.rl"
     p = json->source;
     pe = p + json->len;
     
-#line 1659 "parser.c"
+#line 1674 "parser.c"
 	{
 	if ( p == pe )
 		goto _test_eof;
@@ -1711,7 +1726,7 @@ case 5:
 		goto st1;
 	goto st5;
 tr3:
-#line 465 "parser.rl"
+#line 480 "parser.rl"
 	{
         char *np;
         json->current_nesting = 1;
@@ -1720,7 +1735,7 @@ tr3:
     }
 	goto st10;
 tr4:
-#line 458 "parser.rl"
+#line 473 "parser.rl"
 	{
         char *np;
         json->current_nesting = 1;
@@ -1732,7 +1747,7 @@ st10:
 	if ( ++p == pe )
 		goto _test_eof10;
 case 10:
-#line 1736 "parser.c"
+#line 1751 "parser.c"
 	switch( (*p) ) {
 		case 13: goto st10;
 		case 32: goto st10;
@@ -1789,7 +1804,7 @@ case 9:
 	_out: {}
 	}
 
-#line 650 "parser.rl"
+#line 665 "parser.rl"
 
     if (cs >= JSON_first_final && p == pe) {
         return result;
diff --git a/ext/json/ext/parser/parser.rl b/ext/json/ext/parser/parser.rl
index 8eca179..02b2b6a 100644
--- a/ext/json/ext/parser/parser.rl
+++ b/ext/json/ext/parser/parser.rl
@@ -354,58 +354,73 @@ static char *JSON_parse_array(JSON_Parser *json, char *p, char *pe, VALUE *resul
     }
 }
 
-static VALUE json_string_unescape(char *p, char *pe)
+inline static VALUE json_string_unescape(VALUE result, char *string, char *stringEnd)
 {
-    VALUE result = rb_str_buf_new(pe - p + 1);
-
-    while (p < pe) {
-        if (*p == '\\') {
-            p++;
-            switch (*p) {
+    char *p = string, *pe = string, *unescape;
+    int unescape_len;
+
+    while (pe < stringEnd) {
+        if (*pe == '\\') {
+            unescape = "?";
+            unescape_len = 1;
+            if (pe > p) rb_str_buf_cat(result, p, pe - p);
+            switch (*++pe) {
+                case 'n':
+                    unescape = "\n";
+                    break;
+                case 'r':
+                    unescape = "\r";
+                    break;
+                case 't':
+                    unescape = "\t";
+                    break;
                 case '"':
+                    unescape = "\"";
+                    break;
                 case '\\':
-                    rb_str_buf_cat(result, p, 1);
-                    p++;
+                    unescape = "\\";
                     break;
                 case 'b':
-                    rb_str_buf_cat2(result, "\b");
-                    p++;
+                    unescape = "\b";
                     break;
                 case 'f':
-                    rb_str_buf_cat2(result, "\f");
-                    p++;
-                    break;
-                case 'n':
-                    rb_str_buf_cat2(result, "\n");
-                    p++;
-                    break;
-                case 'r':
-                    rb_str_buf_cat2(result, "\r");
-                    p++;
-                    break;
-                case 't':
-                    rb_str_buf_cat2(result, "\t");
-                    p++;
+                    unescape = "\f";
                     break;
                 case 'u':
-                    if (p > pe - 4) { 
+                    if (pe > stringEnd - 4) { 
                         return Qnil;
                     } else {
-                        p = JSON_convert_UTF16_to_UTF8(result, p, pe);
+                        char buf[4];
+                        UTF32 ch = unescape_unicode((unsigned char *) ++pe);
+                        pe += 3;
+                        if (UNI_SUR_HIGH_START == (ch & 0xFC00)) {
+                            pe++;
+                            if (pe > stringEnd - 6) return Qnil;
+                            if (pe[0] == '\\' && pe[1] == 'u') {
+                                UTF32 sur = unescape_unicode((unsigned char *) pe + 2);
+                                ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
+                                        | (sur & 0x3FF));
+                                pe += 5;
+                            } else {
+                                unescape = "?";
+                                break;
+                            }
+                        }
+                        unescape_len = convert_UTF32_to_UTF8(buf, ch);
+                        unescape = buf;
                     }
                     break;
                 default:
-                    rb_str_buf_cat(result, p, 1);
-                    p++;
-                    break;
+                    p = pe;
+                    continue;
             }
+            rb_str_buf_cat(result, unescape, unescape_len);
+            p = ++pe;
         } else {
-            char *q = p;
-            while (*q != '\\' && q < pe) q++;
-            rb_str_buf_cat(result, p, q - p);
-            p = q;
+            pe++;
         }
     }
+    rb_str_buf_cat(result, p, pe - p);
     return result;
 }
 
@@ -416,7 +431,7 @@ static VALUE json_string_unescape(char *p, char *pe)
     write data;
 
     action parse_string {
-        *result = json_string_unescape(json->memo + 1, p);
+        *result = json_string_unescape(*result, json->memo + 1, p);
         if (NIL_P(*result)) {
 			fhold;
 			fbreak;
@@ -435,7 +450,7 @@ static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *resu
 {
     int cs = EVIL;
 
-    *result = rb_str_new("", 0);
+    *result = rb_str_buf_new(0);
     %% write init;
     json->memo = p;
     %% write exec;
diff --git a/ext/json/ext/parser/unicode.c b/ext/json/ext/parser/unicode.c
index 711aac5..45462c9 100644
--- a/ext/json/ext/parser/unicode.c
+++ b/ext/json/ext/parser/unicode.c
@@ -1,36 +1,5 @@
 #include "unicode.h"
 
-/*
- * Copyright 2001-2004 Unicode, Inc.
- * 
- * Disclaimer
- * 
- * This source code is provided as is by Unicode, Inc. No claims are
- * made as to fitness for any particular purpose. No warranties of any
- * kind are expressed or implied. The recipient agrees to determine
- * applicability of information provided. If this file has been
- * purchased on magnetic or optical media from Unicode, Inc., the
- * sole remedy for any claim will be exchange of defective media
- * within 90 days of receipt.
- * 
- * Limitations on Rights to Redistribute This Code
- * 
- * Unicode, Inc. hereby grants the right to freely use the information
- * supplied in this file in the creation of products supporting the
- * Unicode Standard, and to make copies of this file in any form
- * for internal or external distribution as long as this notice
- * remains attached.
- */
-
-/*
- * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
- * into the first byte, depending on how many bytes follow.  There are
- * as many entries in this table as there are UTF-8 sequence types.
- * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
- * for *legal* UTF-8 will be 4 or fewer bytes total.
- */
-static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
-
 static const char digit_values[256] = { 
     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
@@ -48,97 +17,47 @@ static const char digit_values[256] = {
     -1, -1, -1, -1, -1, -1, -1
 };
 
-char *JSON_convert_UTF16_to_UTF8 (
-        VALUE buffer,
-        char *source,
-        char *sourceEnd)
+inline UTF32 unescape_unicode(const unsigned char *p)
 {
-    UTF16 *tmp, *tmpPtr, *tmpEnd;
-    char buf[5];
-    long n = 0;
-    char failed = 1, c, *p = source - 1;
-
-    while (p < sourceEnd && p[0] == '\\' && p[1] == 'u') {
-        p += 6;
-        n++;
-    }
-    p = source + 1;
-    tmpPtr = tmp = ALLOC_N(UTF16, n);
-    tmpEnd = tmp + n;
-    while (tmpPtr < tmpEnd) {
-        c = digit_values[(unsigned char) *p++];
-        failed *= c;
-        *tmpPtr = c << 12;
-        c = digit_values[(unsigned char) *p++];
-        failed *= c;
-        *tmpPtr |= c << 8;
-        c = digit_values[(unsigned char) *p++];
-        failed *= c;
-        *tmpPtr |= c << 4;
-        c = digit_values[(unsigned char) *p++];
-        failed *= c;
-        *tmpPtr++ |= c;
-        p += 2;
-    }
-    if (failed < 0) {
-        rb_raise(rb_path2class("JSON::ParserError"),
-                "illegal \\uXXXX unicode value near %s", source);
-    }
-
-    tmpPtr = tmp;
-    while (tmpPtr < tmpEnd) {
-        UTF32 ch;
-        unsigned short bytesToWrite = 0;
-        const UTF32 byteMask = 0xBF;
-        const UTF32 byteMark = 0x80; 
-        ch = *tmpPtr++;
-        /* If we have a surrogate pair, convert to UTF32 first. */
-        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
-            /* If the 16 bits following the high surrogate are in the source
-             * buffer... */
-            if (tmpPtr < tmpEnd) {
-                UTF32 ch2 = *tmpPtr;
-                /* If it's a low surrogate, convert to UTF32. */
-                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
-                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
-                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
-                    ++tmpPtr;
-                }
-            } else { /* We don't have the 16 bits following the high surrogate. */
-                ruby_xfree(tmp);
-                rb_raise(rb_path2class("JSON::ParserError"),
-                    "partial character in source, but hit end near %s", source);
-                break;
-            }
-        }
-        /* Figure out how many bytes the result will require */
-        if (ch < (UTF32) 0x80) {
-            bytesToWrite = 1;
-        } else if (ch < (UTF32) 0x800) {
-            bytesToWrite = 2;
-        } else if (ch < (UTF32) 0x10000) {
-            bytesToWrite = 3;
-        } else if (ch < (UTF32) 0x110000) {
-            bytesToWrite = 4;
-        } else {
-            bytesToWrite = 3;
-            ch = UNI_REPLACEMENT_CHAR;
-        }
+    char b;
+    UTF32 result = 0;
+    b = digit_values[p[0]];
+    if (b < 0) return UNI_REPLACEMENT_CHAR;
+    result = (result << 4) | b;
+    b = digit_values[p[1]];
+    result = (result << 4) | b;
+    if (b < 0) return UNI_REPLACEMENT_CHAR;
+    b = digit_values[p[2]];
+    result = (result << 4) | b;
+    if (b < 0) return UNI_REPLACEMENT_CHAR;
+    b = digit_values[p[3]];
+    result = (result << 4) | b;
+    if (b < 0) return UNI_REPLACEMENT_CHAR;
+    return result;
+}
 
-        buf[0] = 0;
-        buf[1] = 0;
-        buf[2] = 0;
-        buf[3] = 0;
-        p = buf + bytesToWrite;
-        switch (bytesToWrite) { /* note: everything falls through. */
-            case 4: *--p = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
-            case 3: *--p = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
-            case 2: *--p = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6;
-            case 1: *--p = (UTF8) (ch | firstByteMark[bytesToWrite]);
-        }
-        rb_str_buf_cat(buffer, p, bytesToWrite);
+inline int convert_UTF32_to_UTF8(char *buf, UTF32 ch) 
+{
+    int len = 1;
+    if (ch <= 0x7F) {
+        buf[0] = (char) ch;
+    } else if (ch <= 0x07FF) {
+        buf[0] = (char) ((ch >> 6) | 0xC0);
+        buf[1] = (char) ((ch & 0x3F) | 0x80);
+        len++;
+    } else if (ch <= 0xFFFF) {
+        buf[0] = (char) ((ch >> 12) | 0xE0);
+        buf[1] = (char) (((ch >> 6) & 0x3F) | 0x80);
+        buf[2] = (char) ((ch & 0x3F) | 0x80);
+        len += 2;
+    } else if (ch <= 0x1fffff) {
+        buf[0] =(char) ((ch >> 18) | 0xF0);
+        buf[1] =(char) (((ch >> 12) & 0x3F) | 0x80);
+        buf[2] =(char) (((ch >> 6) & 0x3F) | 0x80);
+        buf[3] =(char) ((ch & 0x3F) | 0x80);
+        len += 3;
+    } else {
+        buf[0] = '?';
     }
-    ruby_xfree(tmp);
-    source += 6 * n - 1;
-    return source;
+    return len;
 }
diff --git a/ext/json/ext/parser/unicode.h b/ext/json/ext/parser/unicode.h
index 40de426..1e327ae 100644
--- a/ext/json/ext/parser/unicode.h
+++ b/ext/json/ext/parser/unicode.h
@@ -9,44 +9,12 @@ typedef unsigned short	UTF16;	/* at least 16 bits */
 typedef unsigned char	UTF8;	/* typically 8 bits */
 
 #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
-#define UNI_MAX_BMP (UTF32)0x0000FFFF
-#define UNI_MAX_UTF16 (UTF32)0x0010FFFF
-#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
-#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
-
 #define UNI_SUR_HIGH_START  (UTF32)0xD800
 #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
 #define UNI_SUR_LOW_START   (UTF32)0xDC00
 #define UNI_SUR_LOW_END     (UTF32)0xDFFF
 
-static const int halfShift  = 10; /* used for shifting by 10 bits */
-
-static const UTF32 halfBase = 0x0010000UL;
-static const UTF32 halfMask = 0x3FFUL;
-
-typedef enum {
-	conversionOK = 0, 	/* conversion successful */
-	sourceExhausted,	/* partial character in source, but hit end */
-	targetExhausted,	/* insuff. room in target for conversion */
-	sourceIllegal		/* source sequence is illegal/malformed */
-} ConversionResult;
-
-char *JSON_convert_UTF16_to_UTF8 (
-    VALUE buffer,
-    char *source,
-    char *sourceEnd);
-
-#ifndef RARRAY_PTR
-#define RARRAY_PTR(ARRAY) RARRAY(ARRAY)->ptr
-#endif
-#ifndef RARRAY_LEN
-#define RARRAY_LEN(ARRAY) RARRAY(ARRAY)->len
-#endif
-#ifndef RSTRING_PTR
-#define RSTRING_PTR(string) RSTRING(string)->ptr
-#endif
-#ifndef RSTRING_LEN
-#define RSTRING_LEN(string) RSTRING(string)->len
-#endif
+inline UTF32 unescape_unicode(const unsigned char *p);
+inline int convert_UTF32_to_UTF8(char *buf, UTF32 ch);
 
 #endif
author	Florian Frank <flori@ping.de>	2009-10-26 15:58:00 +0100
committer	Florian Frank <flori@ping.de>	2009-10-26 22:58:08 +0100
commit	f1504ee153790f12ee43bb4ef2551fa76970f519 (patch)
tree	a1b7d9cff62d9b4c7d3e9947066488669ae96db9
parent	dd06e48aa414674f52e81f9cdc7836b6456c04f8 (diff)
download	json-f1504ee153790f12ee43bb4ef2551fa76970f519.tar.gz