summary | refs | log | tree | commit | diff
path: root/json_tokener.c
diff options
context:
space:
mode:
author: Eric Haszlakiewicz <erh+git@nimenees.com> 2020-06-21 03:10:55 +0000
committer: Eric Haszlakiewicz <erh+git@nimenees.com> 2020-06-21 03:10:55 +0000
commit: 36118b681ea3b8e99735beee73cbd25a63e942cd (patch)
tree: 53219dabd2000c615aa8dc9215a084b4762928ca /json_tokener.c
parent: 50179fb09f5d317e7192c781ca4a6b039d7818ae (diff)
download: json-c-36118b681ea3b8e99735beee73cbd25a63e942cd.tar.gz
Rearrange the json_tokener_state_escape_unicode case in json_tokener to simplify the code slightly and make it a bit easier to understand.
While here, drop the utf8_replacement_char that is unnecessarily added if we run out of input in the middle of a unicode escape. No other functional changes (yet).
Diffstat (limited to 'json_tokener.c')
-rw-r--r-- json_tokener.c 311
1 files changed, 159 insertions, 152 deletions
diff --git a/json_tokener.c b/json_tokener.c
index 2a73ce2..15ddd17 100644
--- a/json_tokener.c
+++ b/json_tokener.c
@@ -223,7 +223,7 @@ struct json_object *json_tokener_parse_verbose(const char *str, enum json_tokene
/* PEEK_CHAR(dest, tok) macro:
* Peeks at the current char and stores it in dest.
* Returns 1 on success, sets tok->err and returns 0 if no more chars.
- * Implicit inputs: str, len vars
+ * Implicit inputs: str, len, nBytesp vars
*/
#define PEEK_CHAR(dest, tok) \
(((tok)->char_offset == len) \
@@ -633,175 +633,182 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
/* Handle a 4-byte sequence, or two sequences if a surrogate pair */
while (1)
{
- if (c && strchr(json_hex_chars, c))
+ if (!c || !strchr(json_hex_chars, c))
{
- tok->ucs_char += ((unsigned int)jt_hexdigit(c)
- << ((3 - tok->st_pos++) * 4));
- if (tok->st_pos == 4)
+ tok->err = json_tokener_error_parse_string;
+ goto out;
+ }
+ tok->ucs_char |= ((unsigned int)jt_hexdigit(c)
+ << ((3 - tok->st_pos) * 4));
+ tok->st_pos++;
+ if (tok->st_pos < 4)
+ {
+ ADVANCE_CHAR(str, tok);
+ if (!PEEK_CHAR(c, tok))
{
- unsigned char unescaped_utf[4];
+ /*
+ * We're out of characters in the current call to
+ * json_tokener_parse(), but a subsequent call might
+ * provide us with more, so leave our current state
+ * as-is (including tok->high_surrogate) and return.
+ */
+ goto out;
+ }
+ continue;
+ }
- if (tok->got_hi_surrogate)
- {
- if (IS_LOW_SURROGATE(tok->ucs_char))
- {
- /* remove the utf8_replacement_char */
- /* which may generate during */
- /* parsing the high surrogate pair. */
- if (!strcmp(
- tok->pb->buf,
- (char *)
- utf8_replacement_char))
- {
- printbuf_reset(tok->pb);
- }
- /* Recalculate the ucs_char, then fall thru to process normally */
- tok->ucs_char =
- DECODE_SURROGATE_PAIR(
- tok->got_hi_surrogate,
- tok->ucs_char);
- }
- else
- {
- /* Hi surrogate was not followed by a low surrogate */
- /* Replace the hi and process the rest normally */
- printbuf_memappend_fast(
- tok->pb,
- (char *)utf8_replacement_char,
- 3);
- }
- tok->got_hi_surrogate = 0;
- }
+ /* Now, we have a full \uNNNN sequence in tok->ucs_char */
- if (tok->ucs_char < 0x80)
- {
- unescaped_utf[0] = tok->ucs_char;
- printbuf_memappend_fast(
- tok->pb, (char *)unescaped_utf, 1);
- }
- else if (tok->ucs_char < 0x800)
- {
- unescaped_utf[0] =
- 0xc0 | (tok->ucs_char >> 6);
- unescaped_utf[1] =
- 0x80 | (tok->ucs_char & 0x3f);
- printbuf_memappend_fast(
- tok->pb, (char *)unescaped_utf, 2);
- }
- else if (IS_HIGH_SURROGATE(tok->ucs_char))
- {
- /* Got a high surrogate. Remember it and look for
- * the beginning of another sequence, which
- * should be the low surrogate.
- */
- tok->got_hi_surrogate = tok->ucs_char;
- /* Not at end, and the next two chars should be "\u" */
- if ((len == -1 ||
- len > (tok->char_offset + 2)) &&
- // str[0] != '0' && // implied by json_hex_chars, above.
- (str[1] == '\\') && (str[2] == 'u'))
- {
- /* Advance through the 16 bit surrogate, and move
- * on to the next sequence. The next step is to
- * process the following characters.
- */
- if (!ADVANCE_CHAR(str, tok) ||
- !ADVANCE_CHAR(str, tok))
- {
- printbuf_memappend_fast(
- tok->pb,
- (char *)
- utf8_replacement_char,
- 3);
- }
- /* Advance to the first char of the next sequence and
- * continue processing with the next sequence.
- */
- if (!ADVANCE_CHAR(str, tok) ||
- !PEEK_CHAR(c, tok))
- {
- printbuf_memappend_fast(
- tok->pb,
- (char *)
- utf8_replacement_char,
- 3);
- tok->ucs_char = 0;
- tok->st_pos = 0;
- goto out;
- }
- tok->ucs_char = 0;
- tok->st_pos = 0;
- /* other json_tokener_state_escape_unicode */
- continue;
- }
- else
- {
- /* Got a high surrogate without another sequence following
- * it. Put a replacement char in for the hi surrogate
- * and pretend we finished.
- */
- printbuf_memappend_fast(
- tok->pb,
- (char *)utf8_replacement_char,
- 3);
- }
- }
- else if (IS_LOW_SURROGATE(tok->ucs_char))
- {
- /* Got a low surrogate not preceded by a high */
- printbuf_memappend_fast(
- tok->pb, (char *)utf8_replacement_char,
- 3);
- }
- else if (tok->ucs_char < 0x10000)
+ if (tok->high_surrogate)
+ {
+ if (IS_LOW_SURROGATE(tok->ucs_char))
+ {
+ /* Remove the utf8_replacement_char */
+ /* that may have been appended while */
+ /* parsing the high surrogate. */
+ if (!strcmp(
+ tok->pb->buf,
+ (char *)
+ utf8_replacement_char))
{
- unescaped_utf[0] =
- 0xe0 | (tok->ucs_char >> 12);
- unescaped_utf[1] =
- 0x80 | ((tok->ucs_char >> 6) & 0x3f);
- unescaped_utf[2] =
- 0x80 | (tok->ucs_char & 0x3f);
- printbuf_memappend_fast(
- tok->pb, (char *)unescaped_utf, 3);
+ printbuf_reset(tok->pb);
}
- else if (tok->ucs_char < 0x110000)
+ /* Recalculate the ucs_char, then fall thru to process normally */
+ tok->ucs_char =
+ DECODE_SURROGATE_PAIR(
+ tok->high_surrogate,
+ tok->ucs_char);
+ }
+ else
+ {
+ /* High surrogate was not followed by a low surrogate.
+ * Replace the high surrogate and process the rest normally.
+ */
+ printbuf_memappend_fast(
+ tok->pb,
+ (char *)utf8_replacement_char,
+ 3);
+ }
+ tok->high_surrogate = 0;
+ }
+
+ if (tok->ucs_char < 0x80)
+ {
+ unsigned char unescaped_utf[1];
+ unescaped_utf[0] = tok->ucs_char;
+ printbuf_memappend_fast(
+ tok->pb, (char *)unescaped_utf, 1);
+ }
+ else if (tok->ucs_char < 0x800)
+ {
+ unsigned char unescaped_utf[2];
+ unescaped_utf[0] =
+ 0xc0 | (tok->ucs_char >> 6);
+ unescaped_utf[1] =
+ 0x80 | (tok->ucs_char & 0x3f);
+ printbuf_memappend_fast(
+ tok->pb, (char *)unescaped_utf, 2);
+ }
+ else if (IS_HIGH_SURROGATE(tok->ucs_char))
+ {
+ /* Got a high surrogate. Remember it and look for
+ * the beginning of another \uNNNN sequence, which
+ * should be the low surrogate.
+ */
+ tok->high_surrogate = tok->ucs_char;
+ /* Not at end, and the next two chars should be "\u" */
+ if ((len == -1 ||
+ len > (tok->char_offset + 2)) &&
+ // str[0] != '0' && // implied by json_hex_chars, above.
+ (str[1] == '\\') && (str[2] == 'u'))
+ {
+ /* Advance through the 16 bit surrogate, and move
+ * on to the next sequence. The next step is to
+ * process the following characters.
+ */
+ if (!ADVANCE_CHAR(str, tok) ||
+ !ADVANCE_CHAR(str, tok))
{
- unescaped_utf[0] =
- 0xf0 | ((tok->ucs_char >> 18) & 0x07);
- unescaped_utf[1] =
- 0x80 | ((tok->ucs_char >> 12) & 0x3f);
- unescaped_utf[2] =
- 0x80 | ((tok->ucs_char >> 6) & 0x3f);
- unescaped_utf[3] =
- 0x80 | (tok->ucs_char & 0x3f);
printbuf_memappend_fast(
- tok->pb, (char *)unescaped_utf, 4);
+ tok->pb,
+ (char *)
+ utf8_replacement_char,
+ 3);
}
- else
+ /* Advance to the first char of the next sequence and
+ * continue processing with the next sequence.
+ */
+ if (!ADVANCE_CHAR(str, tok) ||
+ !PEEK_CHAR(c, tok))
{
- /* Don't know what we got--insert the replacement char */
printbuf_memappend_fast(
- tok->pb, (char *)utf8_replacement_char,
- 3);
+ tok->pb,
+ (char *)
+ utf8_replacement_char,
+ 3);
+ tok->ucs_char = 0;
+ tok->st_pos = 0;
+ goto out;
}
- state = saved_state;
- break;
+ tok->ucs_char = 0;
+ tok->st_pos = 0;
+ /* other json_tokener_state_escape_unicode */
+ continue;
+ }
+ else
+ {
+ /* Got a high surrogate without another sequence following
+ * it. Put a replacement char in for the high surrogate
+ * and pretend we finished.
+ */
+ printbuf_memappend_fast(
+ tok->pb,
+ (char *)utf8_replacement_char,
+ 3);
}
}
- else
+ else if (IS_LOW_SURROGATE(tok->ucs_char))
{
- tok->err = json_tokener_error_parse_string;
- goto out;
+ /* Got a low surrogate not preceded by a high */
+ printbuf_memappend_fast(
+ tok->pb, (char *)utf8_replacement_char,
+ 3);
}
- if (!ADVANCE_CHAR(str, tok) || !PEEK_CHAR(c, tok))
+ else if (tok->ucs_char < 0x10000)
{
- /* Clean up any pending chars */
- if (tok->got_hi_surrogate &&
- strcmp(tok->pb->buf, (char *)utf8_replacement_char))
- printbuf_memappend_fast(
- tok->pb, (char *)utf8_replacement_char, 3);
- goto out;
+ unsigned char unescaped_utf[3];
+ unescaped_utf[0] =
+ 0xe0 | (tok->ucs_char >> 12);
+ unescaped_utf[1] =
+ 0x80 | ((tok->ucs_char >> 6) & 0x3f);
+ unescaped_utf[2] =
+ 0x80 | (tok->ucs_char & 0x3f);
+ printbuf_memappend_fast(
+ tok->pb, (char *)unescaped_utf, 3);
+ }
+ else if (tok->ucs_char < 0x110000)
+ {
+ unsigned char unescaped_utf[4];
+ unescaped_utf[0] =
+ 0xf0 | ((tok->ucs_char >> 18) & 0x07);
+ unescaped_utf[1] =
+ 0x80 | ((tok->ucs_char >> 12) & 0x3f);
+ unescaped_utf[2] =
+ 0x80 | ((tok->ucs_char >> 6) & 0x3f);
+ unescaped_utf[3] =
+ 0x80 | (tok->ucs_char & 0x3f);
+ printbuf_memappend_fast(
+ tok->pb, (char *)unescaped_utf, 4);
}
+ else
+ {
+ /* Don't know what we got--insert the replacement char */
+ printbuf_memappend_fast(
+ tok->pb, (char *)utf8_replacement_char,
+ 3);
+ }
+ state = saved_state; // i.e. _state_string or _object_field
+ break;
}
}
break;