summaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2012-08-21 09:30:08 -0600
committerKarl Williamson <public@khwilliamson.com>2012-08-25 23:21:28 -0600
commit41de4811adc75d5bdcab9665a1cc19816e43c703 (patch)
tree279637f242c8c18cfedd96ea95d3c15dccc05235 /utf8.c
parent9e7f4f43739b2597e193f775558f39082b8a213f (diff)
downloadperl-41de4811adc75d5bdcab9665a1cc19816e43c703.tar.gz
utf8.c: Speed up \X processing of Korean
\X matches according to a complicated pattern that is hard-coded in regexec.c. Part of that pattern involves checking if a code point is a component of a Hangul Syllable or not. For Korean code points, this involves checking against multiple tables. It turns out that two of those tables are arranged so that the checks for them can be done via an arithmetic expression; Unicode publishes algorithms for determining various characteristics based on their very structured ordering. This patch converts the routines that check these two tables to instead use the arithmetic expression.
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c50
1 files changed, 48 insertions, 2 deletions
diff --git a/utf8.c b/utf8.c
index 39a6350a76..8f1b976931 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2254,24 +2254,70 @@ Perl_is_utf8_X_L(pTHX_ const U8 *p)
return is_utf8_common(p, &PL_utf8_X_L, "_X_GCB_L");
}
+/* These constants are for finding GCB=LV and GCB=LVT. These are for the
+ * pre-composed Hangul syllables, which are all in a contiguous block and
+ * arranged there in such a way so as to facilitate algorithmic determination of
+ * their characteristics. As such, they don't need a swash, but can be
+ * determined by simple arithmetic. Almost all are GCB=LVT, but every 28th one
+ * is a GCB=LV */
+#define SBASE 0xAC00 /* Start of block */
+#define SCount 11172 /* Length of block */
+#define TCount 28
+
bool
Perl_is_utf8_X_LV(pTHX_ const U8 *p)
{
+ /* Unlike most other similarly named routines here, this does not look the
+ * code point up in a swash; PL_utf8_X_LV may end up as &PL_sv_undef. */
+
dVAR;
+ UV cp = valid_utf8_to_uvchr(p, NULL);
+
PERL_ARGS_ASSERT_IS_UTF8_X_LV;
- return is_utf8_common(p, &PL_utf8_X_LV, "_X_GCB_LV");
+ /* The earliest Unicode releases did not have these precomposed Hangul
+ * syllables. Set to point to undef in that case, so will return false on
+ * every call */
+ if (! PL_utf8_X_LV) { /* Set up if this is the first time called */
+ PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
+ if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
+ SvREFCNT_dec(PL_utf8_X_LV);
+ PL_utf8_X_LV = &PL_sv_undef;
+ }
+ }
+
+ return (PL_utf8_X_LV != &PL_sv_undef
+ && cp >= SBASE && cp < SBASE + SCount
+ && (cp - SBASE) % TCount == 0); /* Only every TCount-th one is LV */
}
bool
Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
{
+ /* Unlike most other similarly named routines here, this does not look the
+ * code point up in a swash; PL_utf8_X_LVT may end up as &PL_sv_undef. */
+
dVAR;
+ UV cp = valid_utf8_to_uvchr(p, NULL);
+
PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
- return is_utf8_common(p, &PL_utf8_X_LVT, "_X_GCB_LVT");
+ /* The earliest Unicode releases did not have these precomposed Hangul
+ * syllables. Set to point to undef in that case, so will return false on
+ * every call */
+ if (! PL_utf8_X_LVT) { /* Set up if this is the first time called */
+ PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
+ if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
+ SvREFCNT_dec(PL_utf8_X_LVT);
+ PL_utf8_X_LVT = &PL_sv_undef;
+ }
+ }
+
+ return (PL_utf8_X_LVT != &PL_sv_undef
+ && cp >= SBASE && cp < SBASE + SCount
+ && (cp - SBASE) % TCount != 0); /* Every TCount-th one is LV; rest are LVT */
}
bool