summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorLloyd Hilaiel <lloyd@hilaiel.com>2011-04-25 08:06:05 -0600
committerLloyd Hilaiel <lloyd@hilaiel.com>2011-04-25 08:06:05 -0600
commitec8204ddf795a233d8c7053f026ba7e2208b529a (patch)
tree4678b68904bb8a3113181a1927c743394d7d63ab /src
parentde8e5ac7b557fffcaabc03492fb8b11b4fd2f445 (diff)
downloadyajl-ec8204ddf795a233d8c7053f026ba7e2208b529a.tar.gz
OMFG. string lexing improvements that yield about a 20% improvement in parsing performance. inspiration provided by mike hanson.
Diffstat (limited to 'src')
-rw-r--r--src/yajl_lex.c98
1 files changed, 70 insertions, 28 deletions
diff --git a/src/yajl_lex.c b/src/yajl_lex.c
index af0fadc..b098e6a 100644
--- a/src/yajl_lex.c
+++ b/src/yajl_lex.c
@@ -122,15 +122,19 @@ yajl_lex_free(yajl_lexer lxr)
}
/* a lookup table which lets us quickly determine three things:
- * VEC - valid escaped conrol char
+ * VEC - valid escaped control char
+ * note. the solidus '/' may be escaped or not.
* IJC - invalid json char
* VHC - valid hex char
- * note. the solidus '/' may be escaped or not.
- * note. the
+ * NFP - needs further processing (from a string scanning perspective)
+ * NUC - needs utf8 checking when enabled (from a string scanning perspective)
*/
-#define VEC 1
-#define IJC 2
-#define VHC 4
+#define VEC 0x01
+#define IJC 0x02
+#define VHC 0x04
+#define NFP 0x08
+#define NUC 0x10
+
static const char charLookupTable[256] =
{
/*00*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
@@ -138,7 +142,7 @@ static const char charLookupTable[256] =
/*10*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
/*18*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC ,
-/*20*/ 0 , 0 , VEC|IJC, 0 , 0 , 0 , 0 , 0 ,
+/*20*/ 0 , 0 , NFP|VEC|IJC, 0 , 0 , 0 , 0 , 0 ,
/*28*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , VEC ,
/*30*/ VHC , VHC , VHC , VHC , VHC , VHC , VHC , VHC ,
/*38*/ VHC , VHC , 0 , 0 , 0 , 0 , 0 , 0 ,
@@ -146,33 +150,32 @@ static const char charLookupTable[256] =
/*40*/ 0 , VHC , VHC , VHC , VHC , VHC , VHC , 0 ,
/*48*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
/*50*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
-/*58*/ 0 , 0 , 0 , 0 , VEC|IJC, 0 , 0 , 0 ,
+/*58*/ 0 , 0 , 0 , 0 , NFP|VEC|IJC, 0 , 0 , 0 ,
/*60*/ 0 , VHC , VEC|VHC, VHC , VHC , VHC , VEC|VHC, 0 ,
/*68*/ 0 , 0 , 0 , 0 , 0 , 0 , VEC , 0 ,
/*70*/ 0 , 0 , VEC , 0 , VEC , 0 , 0 , 0 ,
/*78*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
-/* include these so we don't have to always check the range of the char */
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
-
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
-
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
-
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC ,
+ NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC
};
/** process a variable length utf8 encoded codepoint.
@@ -244,6 +247,23 @@ if (*offset >= jsonTextLen) { \
goto finish_string_lex; \
}
+/** scan a string for interesting characters that might need further
+ * review. return the number of chars that are uninteresting and can
+ * be skipped.
+ * (lth) hi world, any thoughts on how to make this routine faster? */
+static size_t
+yajl_string_scan(const unsigned char * buf, size_t len, int utf8check)
+{
+ unsigned char mask = IJC|NFP|(utf8check ? NUC : 0);
+ size_t skip = 0;
+ while (skip < len && !(charLookupTable[*buf] & mask))
+ {
+ skip++;
+ buf++;
+ }
+ return skip;
+}
+
static yajl_tok
yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
size_t jsonTextLen, size_t * offset)
@@ -254,6 +274,28 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText,
for (;;) {
unsigned char curChar;
+ /* now jump into a faster scanning routine to skip as much
+ * of the buffers as possible */
+ {
+ const unsigned char * p;
+ size_t len;
+
+ if ((lexer->bufInUse && yajl_buf_len(lexer->buf) &&
+ lexer->bufOff < yajl_buf_len(lexer->buf)))
+ {
+ p = ((const unsigned char *) yajl_buf_data(lexer->buf) +
+ (lexer->bufOff));
+ len = yajl_buf_len(lexer->buf) - lexer->bufOff;
+ lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8);
+ }
+ else if (*offset < jsonTextLen)
+ {
+ p = jsonText + *offset;
+ len = jsonTextLen - *offset;
+ *offset += yajl_string_scan(p, len, lexer->validateUTF8);
+ }
+ }
+
STR_CHECK_EOF;
curChar = readChar(lexer, jsonText, offset);