diff options
author | Karl Williamson <public@khwilliamson.com> | 2012-08-21 09:30:08 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2012-08-25 23:21:28 -0600 |
commit | 41de4811adc75d5bdcab9665a1cc19816e43c703 (patch) | |
tree | 279637f242c8c18cfedd96ea95d3c15dccc05235 /utf8.c | |
parent | 9e7f4f43739b2597e193f775558f39082b8a213f (diff) | |
download | perl-41de4811adc75d5bdcab9665a1cc19816e43c703.tar.gz |
utf8.c: Speed up \X processing of Korean
\X matches according to a complicated pattern that is hard-coded in
regexec.c. Part of that pattern involves checking if a code point is a
component of a Hangul Syllable or not. For Korean code points, this
involves checking against multiple tables. It turns out that two of
those tables are arranged so that the checks for them can be done via an
arithmetic expression; Unicode publishes algorithms for determining
various characteristics of these code points based on their very structured ordering.
This patch converts the routines that check these two tables to instead
use the arithmetic expression.
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 50 |
1 files changed, 48 insertions, 2 deletions
@@ -2254,24 +2254,70 @@ Perl_is_utf8_X_L(pTHX_ const U8 *p) return is_utf8_common(p, &PL_utf8_X_L, "_X_GCB_L"); } +/* These constants are for finding GCB=LV and GCB=LVT. These are for the + * pre-composed Hangul syllables, which are all in a contiguous block and + * arranged there in such a way so as to facilitate algorithmic determination of + * their characteristics. As such, they don't need a swash, but can be + * determined by simple arithmetic. Almost all are GCB=LVT, but every 28th one + * is a GCB=LV */ +#define SBASE 0xAC00 /* Start of block */ +#define SCount 11172 /* Length of block */ +#define TCount 28 + bool Perl_is_utf8_X_LV(pTHX_ const U8 *p) { + /* Unlike most other similarly named routines here, this does not create a + * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */ + dVAR; + UV cp = valid_utf8_to_uvchr(p, NULL); + PERL_ARGS_ASSERT_IS_UTF8_X_LV; - return is_utf8_common(p, &PL_utf8_X_LV, "_X_GCB_LV"); + /* The earliest Unicode releases did not have these precomposed Hangul + * syllables. Set to point to undef in that case, so will return false on + * every call */ + if (! PL_utf8_X_LV) { /* Set up if this is the first time called */ + PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0); + if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) { + SvREFCNT_dec(PL_utf8_X_LV); + PL_utf8_X_LV = &PL_sv_undef; + } + } + + return (PL_utf8_X_LV != &PL_sv_undef + && cp >= SBASE && cp < SBASE + SCount + && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */ } bool Perl_is_utf8_X_LVT(pTHX_ const U8 *p) { + /* Unlike most other similarly named routines here, this does not create a + * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */ + dVAR; + UV cp = valid_utf8_to_uvchr(p, NULL); + PERL_ARGS_ASSERT_IS_UTF8_X_LVT; - return is_utf8_common(p, &PL_utf8_X_LVT, "_X_GCB_LVT"); + /* The earliest Unicode releases did not have these precomposed Hangul + * syllables. 
Set to point to undef in that case, so will return false on + * every call */ + if (! PL_utf8_X_LVT) { /* Set up if this is the first time called */ + PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0); + if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) { + SvREFCNT_dec(PL_utf8_X_LVT); + PL_utf8_X_LVT = &PL_sv_undef; + } + } + + return (PL_utf8_X_LVT != &PL_sv_undef + && cp >= SBASE && cp < SBASE + SCount + && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */ } bool |