diff options
author | Eric Haszlakiewicz <erh+git@nimenees.com> | 2020-06-21 03:10:55 +0000 |
---|---|---|
committer | Eric Haszlakiewicz <erh+git@nimenees.com> | 2020-06-21 03:10:55 +0000 |
commit | 36118b681ea3b8e99735beee73cbd25a63e942cd (patch) | |
tree | 53219dabd2000c615aa8dc9215a084b4762928ca /json_tokener.c | |
parent | 50179fb09f5d317e7192c781ca4a6b039d7818ae (diff) | |
download | json-c-36118b681ea3b8e99735beee73cbd25a63e942cd.tar.gz |
Rearrange the json_tokener_state_escape_unicode case in json_tokener to simplify the code slightly and make it a bit easier to understand.
While here, drop the utf8_replacement_char that is unnecessarily added if we run out of input in the middle of a unicode escape. No other functional changes (yet).
Diffstat (limited to 'json_tokener.c')
-rw-r--r-- | json_tokener.c | 311 |
1 files changed, 159 insertions, 152 deletions
diff --git a/json_tokener.c b/json_tokener.c index 2a73ce2..15ddd17 100644 --- a/json_tokener.c +++ b/json_tokener.c @@ -223,7 +223,7 @@ struct json_object *json_tokener_parse_verbose(const char *str, enum json_tokene /* PEEK_CHAR(dest, tok) macro: * Peeks at the current char and stores it in dest. * Returns 1 on success, sets tok->err and returns 0 if no more chars. - * Implicit inputs: str, len vars + * Implicit inputs: str, len, nBytesp vars */ #define PEEK_CHAR(dest, tok) \ (((tok)->char_offset == len) \ @@ -633,175 +633,182 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char * /* Handle a 4-byte sequence, or two sequences if a surrogate pair */ while (1) { - if (c && strchr(json_hex_chars, c)) + if (!c || !strchr(json_hex_chars, c)) { - tok->ucs_char += ((unsigned int)jt_hexdigit(c) - << ((3 - tok->st_pos++) * 4)); - if (tok->st_pos == 4) + tok->err = json_tokener_error_parse_string; + goto out; + } + tok->ucs_char |= ((unsigned int)jt_hexdigit(c) + << ((3 - tok->st_pos) * 4)); + tok->st_pos++; + if (tok->st_pos < 4) + { + ADVANCE_CHAR(str, tok); + if (!PEEK_CHAR(c, tok)) { - unsigned char unescaped_utf[4]; + /* + * We're out of characters in the current call to + * json_tokener_parse(), but a subsequent call might + * provide us with more, so leave our current state + * as-is (including tok->high_surrogate) and return. + */ + goto out; + } + continue; + } - if (tok->got_hi_surrogate) - { - if (IS_LOW_SURROGATE(tok->ucs_char)) - { - /* remove the utf8_replacement_char */ - /* which may generate during */ - /* parsing the high surrogate pair. 
*/ - if (!strcmp( - tok->pb->buf, - (char *) - utf8_replacement_char)) - { - printbuf_reset(tok->pb); - } - /* Recalculate the ucs_char, then fall thru to process normally */ - tok->ucs_char = - DECODE_SURROGATE_PAIR( - tok->got_hi_surrogate, - tok->ucs_char); - } - else - { - /* Hi surrogate was not followed by a low surrogate */ - /* Replace the hi and process the rest normally */ - printbuf_memappend_fast( - tok->pb, - (char *)utf8_replacement_char, - 3); - } - tok->got_hi_surrogate = 0; - } + /* Now, we have a full \uNNNN sequence in tok->ucs_char */ - if (tok->ucs_char < 0x80) - { - unescaped_utf[0] = tok->ucs_char; - printbuf_memappend_fast( - tok->pb, (char *)unescaped_utf, 1); - } - else if (tok->ucs_char < 0x800) - { - unescaped_utf[0] = - 0xc0 | (tok->ucs_char >> 6); - unescaped_utf[1] = - 0x80 | (tok->ucs_char & 0x3f); - printbuf_memappend_fast( - tok->pb, (char *)unescaped_utf, 2); - } - else if (IS_HIGH_SURROGATE(tok->ucs_char)) - { - /* Got a high surrogate. Remember it and look for - * the beginning of another sequence, which - * should be the low surrogate. - */ - tok->got_hi_surrogate = tok->ucs_char; - /* Not at end, and the next two chars should be "\u" */ - if ((len == -1 || - len > (tok->char_offset + 2)) && - // str[0] != '0' && // implied by json_hex_chars, above. - (str[1] == '\\') && (str[2] == 'u')) - { - /* Advance through the 16 bit surrogate, and move - * on to the next sequence. The next step is to - * process the following characters. - */ - if (!ADVANCE_CHAR(str, tok) || - !ADVANCE_CHAR(str, tok)) - { - printbuf_memappend_fast( - tok->pb, - (char *) - utf8_replacement_char, - 3); - } - /* Advance to the first char of the next sequence and - * continue processing with the next sequence. 
- */ - if (!ADVANCE_CHAR(str, tok) || - !PEEK_CHAR(c, tok)) - { - printbuf_memappend_fast( - tok->pb, - (char *) - utf8_replacement_char, - 3); - tok->ucs_char = 0; - tok->st_pos = 0; - goto out; - } - tok->ucs_char = 0; - tok->st_pos = 0; - /* other json_tokener_state_escape_unicode */ - continue; - } - else - { - /* Got a high surrogate without another sequence following - * it. Put a replacement char in for the hi surrogate - * and pretend we finished. - */ - printbuf_memappend_fast( - tok->pb, - (char *)utf8_replacement_char, - 3); - } - } - else if (IS_LOW_SURROGATE(tok->ucs_char)) - { - /* Got a low surrogate not preceded by a high */ - printbuf_memappend_fast( - tok->pb, (char *)utf8_replacement_char, - 3); - } - else if (tok->ucs_char < 0x10000) + if (tok->high_surrogate) + { + if (IS_LOW_SURROGATE(tok->ucs_char)) + { + /* remove the utf8_replacement_char */ + /* which may generate during */ + /* parsing the high surrogate pair. */ + if (!strcmp( + tok->pb->buf, + (char *) + utf8_replacement_char)) { - unescaped_utf[0] = - 0xe0 | (tok->ucs_char >> 12); - unescaped_utf[1] = - 0x80 | ((tok->ucs_char >> 6) & 0x3f); - unescaped_utf[2] = - 0x80 | (tok->ucs_char & 0x3f); - printbuf_memappend_fast( - tok->pb, (char *)unescaped_utf, 3); + printbuf_reset(tok->pb); } - else if (tok->ucs_char < 0x110000) + /* Recalculate the ucs_char, then fall thru to process normally */ + tok->ucs_char = + DECODE_SURROGATE_PAIR( + tok->high_surrogate, + tok->ucs_char); + } + else + { + /* High surrogate was not followed by a low surrogate + * Replace the high and process the rest normally + */ + printbuf_memappend_fast( + tok->pb, + (char *)utf8_replacement_char, + 3); + } + tok->high_surrogate = 0; + } + + if (tok->ucs_char < 0x80) + { + unsigned char unescaped_utf[1]; + unescaped_utf[0] = tok->ucs_char; + printbuf_memappend_fast( + tok->pb, (char *)unescaped_utf, 1); + } + else if (tok->ucs_char < 0x800) + { + unsigned char unescaped_utf[2]; + unescaped_utf[0] = + 0xc0 | 
(tok->ucs_char >> 6); + unescaped_utf[1] = + 0x80 | (tok->ucs_char & 0x3f); + printbuf_memappend_fast( + tok->pb, (char *)unescaped_utf, 2); + } + else if (IS_HIGH_SURROGATE(tok->ucs_char)) + { + /* Got a high surrogate. Remember it and look for + * the beginning of another \uNNNN sequence, which + * should be the low surrogate. + */ + tok->high_surrogate = tok->ucs_char; + /* Not at end, and the next two chars should be "\u" */ + if ((len == -1 || + len > (tok->char_offset + 2)) && + // str[0] != '0' && // implied by json_hex_chars, above. + (str[1] == '\\') && (str[2] == 'u')) + { + /* Advance through the 16 bit surrogate, and move + * on to the next sequence. The next step is to + * process the following characters. + */ + if (!ADVANCE_CHAR(str, tok) || + !ADVANCE_CHAR(str, tok)) { - unescaped_utf[0] = - 0xf0 | ((tok->ucs_char >> 18) & 0x07); - unescaped_utf[1] = - 0x80 | ((tok->ucs_char >> 12) & 0x3f); - unescaped_utf[2] = - 0x80 | ((tok->ucs_char >> 6) & 0x3f); - unescaped_utf[3] = - 0x80 | (tok->ucs_char & 0x3f); printbuf_memappend_fast( - tok->pb, (char *)unescaped_utf, 4); + tok->pb, + (char *) + utf8_replacement_char, + 3); } - else + /* Advance to the first char of the next sequence and + * continue processing with the next sequence. + */ + if (!ADVANCE_CHAR(str, tok) || + !PEEK_CHAR(c, tok)) { - /* Don't know what we got--insert the replacement char */ printbuf_memappend_fast( - tok->pb, (char *)utf8_replacement_char, - 3); + tok->pb, + (char *) + utf8_replacement_char, + 3); + tok->ucs_char = 0; + tok->st_pos = 0; + goto out; } - state = saved_state; - break; + tok->ucs_char = 0; + tok->st_pos = 0; + /* other json_tokener_state_escape_unicode */ + continue; + } + else + { + /* Got a high surrogate without another sequence following + * it. Put a replacement char in for the high surrogate + * and pretend we finished. 
+ */ + printbuf_memappend_fast( + tok->pb, + (char *)utf8_replacement_char, + 3); } } - else + else if (IS_LOW_SURROGATE(tok->ucs_char)) { - tok->err = json_tokener_error_parse_string; - goto out; + /* Got a low surrogate not preceded by a high */ + printbuf_memappend_fast( + tok->pb, (char *)utf8_replacement_char, + 3); } - if (!ADVANCE_CHAR(str, tok) || !PEEK_CHAR(c, tok)) + else if (tok->ucs_char < 0x10000) { - /* Clean up any pending chars */ - if (tok->got_hi_surrogate && - strcmp(tok->pb->buf, (char *)utf8_replacement_char)) - printbuf_memappend_fast( - tok->pb, (char *)utf8_replacement_char, 3); - goto out; + unsigned char unescaped_utf[3]; + unescaped_utf[0] = + 0xe0 | (tok->ucs_char >> 12); + unescaped_utf[1] = + 0x80 | ((tok->ucs_char >> 6) & 0x3f); + unescaped_utf[2] = + 0x80 | (tok->ucs_char & 0x3f); + printbuf_memappend_fast( + tok->pb, (char *)unescaped_utf, 3); + } + else if (tok->ucs_char < 0x110000) + { + unsigned char unescaped_utf[4]; + unescaped_utf[0] = + 0xf0 | ((tok->ucs_char >> 18) & 0x07); + unescaped_utf[1] = + 0x80 | ((tok->ucs_char >> 12) & 0x3f); + unescaped_utf[2] = + 0x80 | ((tok->ucs_char >> 6) & 0x3f); + unescaped_utf[3] = + 0x80 | (tok->ucs_char & 0x3f); + printbuf_memappend_fast( + tok->pb, (char *)unescaped_utf, 4); } + else + { + /* Don't know what we got--insert the replacement char */ + printbuf_memappend_fast( + tok->pb, (char *)utf8_replacement_char, + 3); + } + state = saved_state; // i.e. _state_string or _object_field + break; } } break; |