summaryrefslogtreecommitdiff
path: root/json-glib/json-scanner.c
diff options
context:
space:
mode:
authorEmmanuele Bassi <ebassi@gnome.org>2014-03-02 14:26:44 +0000
committerEmmanuele Bassi <ebassi@gnome.org>2014-03-18 16:27:59 +0000
commit36872776b354d2b4b39f9da8e12dd51e949b6c90 (patch)
tree937fbb9cbe4fe023750f7688da5f8c9daa2e0420 /json-glib/json-scanner.c
parent16396ab205a5f9bfe7a68aec436fe916bbbee342 (diff)
downloadjson-glib-36872776b354d2b4b39f9da8e12dd51e949b6c90.tar.gz
scanner: Clean up the UTF-16 surrogate pairs decoding
We over-assert() our pre-conditions, and the conversion is a bit obfuscated. We should use a proper function, and de-obfuscate the code so that the intent is clear.
Diffstat (limited to 'json-glib/json-scanner.c')
-rw-r--r--json-glib/json-scanner.c42
1 files changed, 36 insertions, 6 deletions
diff --git a/json-glib/json-scanner.c b/json-glib/json-scanner.c
index 21339e1..5d7cb89 100644
--- a/json-glib/json-scanner.c
+++ b/json-glib/json-scanner.c
@@ -577,6 +577,30 @@ json_scanner_get_unichar (JsonScanner *scanner,
return uchar;
}
+/*
+ * decode_utf16_surrogate_pair:
+ * @units: (array length=2): a pair of UTF-16 code points
+ *
+ * Decodes a surrogate pair of UTF-16 code points into the equivalent
+ * Unicode code point.
+ *
+ * Returns: the Unicode code point equivalent to the surrogate pair
+ */
+static inline gunichar
+decode_utf16_surrogate_pair (const gunichar units[2])
+{
+ gunichar ucs;
+
+ g_assert (0xd800 <= units[0] && units[0] <= 0xdbff);
+ g_assert (0xdc00 <= units[1] && units[1] <= 0xdfff);
+
+ ucs = 0x10000;
+ ucs += (units[0] & 0x3ff) << 10;
+ ucs += (units[1] & 0x3ff);
+
+ return ucs;
+}
+
void
json_scanner_unexp_token (JsonScanner *scanner,
GTokenType expected_token,
@@ -1113,19 +1137,25 @@ json_scanner_get_token_ll (JsonScanner *scanner,
ucs = json_scanner_get_unichar (scanner, line_p, position_p);
+ /* resolve UTF-16 surrogates for Unicode characters not in the BMP,
+ * as per ECMA 404, ยง 9, "String"
+ */
if (g_unichar_type (ucs) == G_UNICODE_SURROGATE)
{
/* read next surrogate */
- if ('\\' == json_scanner_get_char (scanner, line_p, position_p)
- && 'u' == json_scanner_get_char (scanner, line_p, position_p))
+ if ('\\' == json_scanner_get_char (scanner, line_p, position_p) &&
+ 'u' == json_scanner_get_char (scanner, line_p, position_p))
{
- gunichar ucs_lo = json_scanner_get_unichar (scanner, line_p, position_p);
- g_assert (g_unichar_type (ucs_lo) == G_UNICODE_SURROGATE);
- ucs = (((ucs & 0x3ff) << 10) | (ucs_lo & 0x3ff)) + 0x10000;
+ gunichar units[2];
+
+ units[0] = ucs;
+ units[1] = json_scanner_get_unichar (scanner, line_p, position_p);
+
+ ucs = decode_utf16_surrogate_pair (units);
+ g_assert (g_unichar_validate (ucs));
}
}
- g_assert (g_unichar_validate (ucs));
gstring = g_string_append_unichar (gstring, ucs);
}
break;