scanner: Clean up the UTF-16 surrogate pairs decoding

We over-assert() our pre-conditions, and the conversion is a bit obfuscated. We should use a proper function, and de-obfuscate the code so that the intent is clear.
author: Emmanuele Bassi <ebassi@gnome.org> 2014-03-02 14:26:44 +0000
committer: Emmanuele Bassi <ebassi@gnome.org> 2014-03-18 16:27:59 +0000
commit: 36872776b354d2b4b39f9da8e12dd51e949b6c90 (patch)
tree: 937fbb9cbe4fe023750f7688da5f8c9daa2e0420 /json-glib/json-scanner.c
parent: 16396ab205a5f9bfe7a68aec436fe916bbbee342 (diff)
download: json-glib-36872776b354d2b4b39f9da8e12dd51e949b6c90.tar.gz
1 files changed, 36 insertions, 6 deletions
diff --git a/json-glib/json-scanner.c b/json-glib/json-scanner.c
index 21339e1..5d7cb89 100644
--- a/json-glib/json-scanner.c
+++ b/json-glib/json-scanner.c
@@ -577,6 +577,30 @@ json_scanner_get_unichar (JsonScanner *scanner,
   return uchar;
 }
 
+/*
+ * decode_utf16_surrogate_pair:
+ * @units: (array length=2): a pair of UTF-16 code points
+ *
+ * Decodes a surrogate pair of UTF-16 code points into the equivalent
+ * Unicode code point.
+ *
+ * Returns: the Unicode code point equivalent to the surrogate pair
+ */
+static inline gunichar
+decode_utf16_surrogate_pair (const gunichar units[2])
+{
+  gunichar ucs;
+
+  g_assert (0xd800 <= units[0] && units[0] <= 0xdbff);
+  g_assert (0xdc00 <= units[1] && units[1] <= 0xdfff);
+
+  ucs = 0x10000;
+  ucs += (units[0] & 0x3ff) << 10;
+  ucs += (units[1] & 0x3ff);
+
+  return ucs;
+}
+
 void
 json_scanner_unexp_token (JsonScanner *scanner,
                           GTokenType   expected_token,
@@ -1113,19 +1137,25 @@ json_scanner_get_token_ll (JsonScanner *scanner,
 
                               ucs = json_scanner_get_unichar (scanner, line_p, position_p);
 
+                              /* resolve UTF-16 surrogates for Unicode characters not in the BMP,
+                                * as per ECMA 404, § 9, "String"
+                                */
                               if (g_unichar_type (ucs) == G_UNICODE_SURROGATE)
                                 {
                                   /* read next surrogate */
-                                  if ('\\' == json_scanner_get_char (scanner, line_p, position_p)
-                                      && 'u' == json_scanner_get_char (scanner, line_p, position_p))
+                                  if ('\\' == json_scanner_get_char (scanner, line_p, position_p) &&
+                                      'u' == json_scanner_get_char (scanner, line_p, position_p))
                                     {
-                                      gunichar ucs_lo = json_scanner_get_unichar (scanner, line_p, position_p);
-                                      g_assert (g_unichar_type (ucs_lo) == G_UNICODE_SURROGATE);
-                                      ucs = (((ucs & 0x3ff) << 10) | (ucs_lo & 0x3ff)) + 0x10000;
+                                      gunichar units[2];
+
+                                      units[0] = ucs;
+                                      units[1] = json_scanner_get_unichar (scanner, line_p, position_p);
+
+                                      ucs = decode_utf16_surrogate_pair (units);
+                                      g_assert (g_unichar_validate (ucs));
                                     }
                                 }
 
-                              g_assert (g_unichar_validate (ucs));
                               gstring = g_string_append_unichar (gstring, ucs);
                             }
                           break;
author	Emmanuele Bassi <ebassi@gnome.org>	2014-03-02 14:26:44 +0000
committer	Emmanuele Bassi <ebassi@gnome.org>	2014-03-18 16:27:59 +0000
commit	36872776b354d2b4b39f9da8e12dd51e949b6c90 (patch)
tree	937fbb9cbe4fe023750f7688da5f8c9daa2e0420 /json-glib/json-scanner.c
parent	16396ab205a5f9bfe7a68aec436fe916bbbee342 (diff)
download	json-glib-36872776b354d2b4b39f9da8e12dd51e949b6c90.tar.gz