utf8.c: Speed up \X processing of Korean

\X matches according to a complicated pattern that is hard-coded in regexec.c. Part of that pattern involves checking if a code point is a component of a Hangul Syllable or not. For Korean code points, this involves checking against multiple tables. It turns out that two of those tables are arranged so that the checks for them can be done via an arithmetic expression; Unicode publishes algorithms for determining various characteristics based on their very structured ordering. This patch converts the routines that check these two tables to instead use the arithmetic expression.
author: Karl Williamson <public@khwilliamson.com> 2012-08-21 09:30:08 -0600
committer: Karl Williamson <public@khwilliamson.com> 2012-08-25 23:21:28 -0600
commit: 41de4811adc75d5bdcab9665a1cc19816e43c703 (patch)
tree: 279637f242c8c18cfedd96ea95d3c15dccc05235 /utf8.c
parent: 9e7f4f43739b2597e193f775558f39082b8a213f (diff)
download: perl-41de4811adc75d5bdcab9665a1cc19816e43c703.tar.gz
1 files changed, 48 insertions, 2 deletions
diff --git a/utf8.c b/utf8.c
index 39a6350a76..8f1b976931 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2254,24 +2254,70 @@ Perl_is_utf8_X_L(pTHX_ const U8 *p)
     return is_utf8_common(p, &PL_utf8_X_L, "_X_GCB_L");
 }
 
+/* These constants are for finding GCB=LV and GCB=LVT.  These are for the
+ * pre-composed Hangul syllables, which are all in a contiguous block and
+ * arranged there in such a way so as to facilitate algorithmic determination of
+ * their characteristics.  As such, they don't need a swash, but can be
+ * determined by simple arithmetic.  Almost all are GCB=LVT, but every 28th one
+ * is a GCB=LV */
+#define SBASE 0xAC00    /* Start of block */
+#define SCount 11172    /* Length of block */
+#define TCount 28
+
 bool
 Perl_is_utf8_X_LV(pTHX_ const U8 *p)
 {
+    /* Unlike most other similarly named routines here, this never looks 'p' up
+     * in a swash; PL_utf8_X_LV may be &PL_sv_undef, so don't swash_fetch() it */
+
     dVAR;
 
+    UV cp = valid_utf8_to_uvchr(p, NULL);
+
     PERL_ARGS_ASSERT_IS_UTF8_X_LV;
 
-    return is_utf8_common(p, &PL_utf8_X_LV, "_X_GCB_LV");
+    /* The earliest Unicode releases did not have these precomposed Hangul
+     * syllables.  Set to point to undef in that case, so will return false on
+     * every call */
+    if (! PL_utf8_X_LV) {   /* Set up if this is the first time called */
+        PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
+        if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
+            SvREFCNT_dec(PL_utf8_X_LV);
+            PL_utf8_X_LV = &PL_sv_undef;
+        }
+    }
+
+    return (PL_utf8_X_LV != &PL_sv_undef
+            && cp >= SBASE && cp < SBASE + SCount
+            && (cp - SBASE) % TCount == 0); /* Only every TCount-th one is LV */
 }
 
 bool
 Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
 {
+    /* Unlike most other similarly named routines here, this never looks 'p' up
+     * in a swash; PL_utf8_X_LVT may be &PL_sv_undef, so don't swash_fetch() it */
+
     dVAR;
 
+    UV cp = valid_utf8_to_uvchr(p, NULL);
+
     PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
 
-    return is_utf8_common(p, &PL_utf8_X_LVT, "_X_GCB_LVT");
+    /* The earliest Unicode releases did not have these precomposed Hangul
+     * syllables.  Set to point to undef in that case, so will return false on
+     * every call */
+    if (! PL_utf8_X_LVT) {   /* Set up if this is the first time called */
+        PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
+        if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
+            SvREFCNT_dec(PL_utf8_X_LVT);
+            PL_utf8_X_LVT = &PL_sv_undef;
+        }
+    }
+
+    return (PL_utf8_X_LVT != &PL_sv_undef
+            && cp >= SBASE && cp < SBASE + SCount
+            && (cp - SBASE) % TCount != 0); /* Every TCount-th one is LV; the rest are LVT */
 }
 
 bool
author	Karl Williamson <public@khwilliamson.com>	2012-08-21 09:30:08 -0600
committer	Karl Williamson <public@khwilliamson.com>	2012-08-25 23:21:28 -0600
commit	41de4811adc75d5bdcab9665a1cc19816e43c703 (patch)
tree	279637f242c8c18cfedd96ea95d3c15dccc05235 /utf8.c
parent	9e7f4f43739b2597e193f775558f39082b8a213f (diff)
download	perl-41de4811adc75d5bdcab9665a1cc19816e43c703.tar.gz