diff options
Diffstat (limited to 'src/yajl_lex.c')
-rw-r--r-- | src/yajl_lex.c | 98 |
1 files changed, 70 insertions, 28 deletions
diff --git a/src/yajl_lex.c b/src/yajl_lex.c index af0fadc..b098e6a 100644 --- a/src/yajl_lex.c +++ b/src/yajl_lex.c @@ -122,15 +122,19 @@ yajl_lex_free(yajl_lexer lxr) } /* a lookup table which lets us quickly determine three things: - * VEC - valid escaped conrol char + * VEC - valid escaped control char + * note. the solidus '/' may be escaped or not. * IJC - invalid json char * VHC - valid hex char - * note. the solidus '/' may be escaped or not. - * note. the + * NFP - needs further processing (from a string scanning perspective) + * NUC - needs utf8 checking when enabled (from a string scanning perspective) */ -#define VEC 1 -#define IJC 2 -#define VHC 4 +#define VEC 0x01 +#define IJC 0x02 +#define VHC 0x04 +#define NFP 0x08 +#define NUC 0x10 + static const char charLookupTable[256] = { /*00*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC , @@ -138,7 +142,7 @@ static const char charLookupTable[256] = /*10*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC , /*18*/ IJC , IJC , IJC , IJC , IJC , IJC , IJC , IJC , -/*20*/ 0 , 0 , VEC|IJC, 0 , 0 , 0 , 0 , 0 , +/*20*/ 0 , 0 , NFP|VEC|IJC, 0 , 0 , 0 , 0 , 0 , /*28*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , VEC , /*30*/ VHC , VHC , VHC , VHC , VHC , VHC , VHC , VHC , /*38*/ VHC , VHC , 0 , 0 , 0 , 0 , 0 , 0 , @@ -146,33 +150,32 @@ static const char charLookupTable[256] = /*40*/ 0 , VHC , VHC , VHC , VHC , VHC , VHC , 0 , /*48*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , /*50*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , -/*58*/ 0 , 0 , 0 , 0 , VEC|IJC, 0 , 0 , 0 , +/*58*/ 0 , 0 , 0 , 0 , NFP|VEC|IJC, 0 , 0 , 0 , /*60*/ 0 , VHC , VEC|VHC, VHC , VHC , VHC , VEC|VHC, 0 , /*68*/ 0 , 0 , 0 , 0 , 0 , 0 , VEC , 0 , /*70*/ 0 , 0 , VEC , 0 , VEC , 0 , 0 , 0 , /*78*/ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , -/* include these so we don't have to always check the range of the char */ - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC , + NUC , NUC , NUC , NUC , NUC , NUC , NUC , NUC }; /** process a variable length utf8 encoded codepoint. @@ -244,6 +247,23 @@ if (*offset >= jsonTextLen) { \ goto finish_string_lex; \ } +/** scan a string for interesting characters that might need further + * review. return the number of chars that are uninteresting and can + * be skipped. + * (lth) hi world, any thoughts on how to make this routine faster? */ +static size_t +yajl_string_scan(const unsigned char * buf, size_t len, int utf8check) +{ + unsigned char mask = IJC|NFP|(utf8check ? NUC : 0); + size_t skip = 0; + while (skip < len && !(charLookupTable[*buf] & mask)) + { + skip++; + buf++; + } + return skip; +} + static yajl_tok yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText, size_t jsonTextLen, size_t * offset) @@ -254,6 +274,28 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText, for (;;) { unsigned char curChar; + /* now jump into a faster scanning routine to skip as much + * of the buffers as possible */ + { + const unsigned char * p; + size_t len; + + if ((lexer->bufInUse && yajl_buf_len(lexer->buf) && + lexer->bufOff < yajl_buf_len(lexer->buf))) + { + p = ((const unsigned char *) yajl_buf_data(lexer->buf) + + (lexer->bufOff)); + len = yajl_buf_len(lexer->buf) - lexer->bufOff; + lexer->bufOff += yajl_string_scan(p, len, lexer->validateUTF8); + } + else if (*offset < jsonTextLen) + { + p = jsonText + *offset; + len = jsonTextLen - *offset; + *offset += yajl_string_scan(p, len, lexer->validateUTF8); + } + } + STR_CHECK_EOF; curChar = readChar(lexer, jsonText, offset); |