Fix issue 616: support surrogate pairs that are split across separate parse calls (e.g. a file parsed in chunks).

author: dota17 <chenguopingdota@163.com> 2020-06-02 19:17:42 +0800
committer: dota17 <chenguopingdota@163.com> 2020-06-08 17:19:32 +0800
commit: c1b872d8174c9946f8d9383fe600e8ff6650647e (patch)
tree: c1f7dc271cfcc66a26a350bf3ad358aff6bdc621 /json_tokener.c
parent: 1c6086a86aa26c643a0c9e8bc4a1f7d652b33662 (diff)
download: json-c-c1b872d8174c9946f8d9383fe600e8ff6650647e.tar.gz
1 files changed, 18 insertions, 7 deletions
diff --git a/json_tokener.c b/json_tokener.c
index 0373d6f..69d7af2 100644
--- a/json_tokener.c
+++ b/json_tokener.c
@@ -630,8 +630,6 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
 
 		case json_tokener_state_escape_unicode:
 		{
-			unsigned int got_hi_surrogate = 0;
-
 			/* Handle a 4-byte sequence, or two sequences if a surrogate pair */
 			while (1)
 			{
@@ -643,14 +641,24 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
 					{
 						unsigned char unescaped_utf[4];
 
-						if (got_hi_surrogate)
+						if (tok->got_hi_surrogate)
 						{
 							if (IS_LOW_SURROGATE(tok->ucs_char))
 							{
+								/* Remove the utf8_replacement_char */
+								/* that may have been appended while */
+								/* parsing the high surrogate. */
+								if (!strcmp(
+								        tok->pb->buf,
+								        (char *)
+								            utf8_replacement_char))
+								{
+									printbuf_reset(tok->pb);
+								}
 								/* Recalculate the ucs_char, then fall thru to process normally */
 								tok->ucs_char =
 								    DECODE_SURROGATE_PAIR(
-								        got_hi_surrogate,
+								        tok->got_hi_surrogate,
 								        tok->ucs_char);
 							}
 							else
@@ -662,7 +670,7 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
 								    (char *)utf8_replacement_char,
 								    3);
 							}
-							got_hi_surrogate = 0;
+							tok->got_hi_surrogate = 0;
 						}
 
 						if (tok->ucs_char < 0x80)
@@ -686,7 +694,7 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
 							 * the beginning of another sequence, which
 							 * should be the low surrogate.
 							 */
-							got_hi_surrogate = tok->ucs_char;
+							tok->got_hi_surrogate = tok->ucs_char;
 							/* Not at end, and the next two chars should be "\u" */
 							if ((len == -1 ||
 							     len > (tok->char_offset + 2)) &&
@@ -717,6 +725,8 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
 									    (char *)
 									        utf8_replacement_char,
 									    3);
+									tok->ucs_char = 0;
+									tok->st_pos = 0;
 									goto out;
 								}
 								tok->ucs_char = 0;
@@ -786,7 +796,8 @@ struct json_object *json_tokener_parse_ex(struct json_tokener *tok, const char *
 				if (!ADVANCE_CHAR(str, tok) || !PEEK_CHAR(c, tok))
 				{
 					/* Clean up any pending chars */
-					if (got_hi_surrogate)
+					if (tok->got_hi_surrogate &&
+					    strcmp(tok->pb->buf, (char *)utf8_replacement_char))
 						printbuf_memappend_fast(
 						    tok->pb, (char *)utf8_replacement_char, 3);
 					goto out;
author	dota17 <chenguopingdota@163.com>	2020-06-02 19:17:42 +0800
committer	dota17 <chenguopingdota@163.com>	2020-06-08 17:19:32 +0800
commit	c1b872d8174c9946f8d9383fe600e8ff6650647e (patch)
tree	c1f7dc271cfcc66a26a350bf3ad358aff6bdc621 /json_tokener.c
parent	1c6086a86aa26c643a0c9e8bc4a1f7d652b33662 (diff)
download	json-c-c1b872d8174c9946f8d9383fe600e8ff6650647e.tar.gz