summaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2012-08-20 22:03:22 -0600
committerKarl Williamson <public@khwilliamson.com>2012-08-25 23:21:30 -0600
commita1853d78a51dcd1a14b408d87ce27e98b6fc8a23 (patch)
treefcd83e81ff3c9a485c5644111acdfb48dd61acd9 /utf8.c
parent971d486f115af743c94b65d0d90c23a6541bb7d2 (diff)
downloadperl-a1853d78a51dcd1a14b408d87ce27e98b6fc8a23.tar.gz
regex: Speed up \X processing
For most Unicode releases, GCB=Prepend matches absolutely nothing. That appears likely to remain the case going forward: code points were added to it in the past, and then removed later based on field experience. An earlier commit significantly improved the performance of this check by using a binary search of an empty array instead of a swash hash. However, that search still requires several layers of function calls to discover that the array is empty, which this commit avoids. This patch will use whatever swash_init() returns unless it is empty, preserving backwards compatibility with older Unicode releases. But if it is empty, the routine sets things up so that future calls will always fail without further testing.
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c32
1 files changed, 31 insertions, 1 deletions
diff --git a/utf8.c b/utf8.c
index 0da2c0197d..8cc05c305b 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2232,11 +2232,41 @@ Perl_is_utf8_X_extend(pTHX_ const U8 *p)
bool
Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
{
+ /* Returns whether the character starting at p matches GCB=Prepend. If no
+ * code point in the Unicode version being worked on matches it, the first
+ * call sets PL_utf8_X_prepend to &PL_sv_undef; otherwise it is set to a
+ * swash created for the property. Hence swash_fetch() can't be used on it
+ * without first checking that it is valid to do so. */
+
dVAR;
+ bool initialized = cBOOL(PL_utf8_X_prepend);
+ bool ret;
PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
- return is_utf8_common(p, &PL_utf8_X_prepend, "_X_GCB_Prepend");
+ if (PL_utf8_X_prepend == &PL_sv_undef) {
+ return FALSE;
+ }
+
+ if ((ret = is_utf8_common(p, &PL_utf8_X_prepend, "_X_GCB_Prepend"))
+ || initialized)
+ {
+ return ret;
+ }
+
+ /* Here the input did not match and PL_utf8_X_prepend was unset on entry,
+ * so either just this code point failed to match or the table is empty.
+ * The is_utf8_common() call is relied on to have created the swash; if its
+ * inversion list has no elements, nothing will ever match, so release the
+ * swash and store &PL_sv_undef to make future calls return immediately.
+ * NOTE(review): assumes is_utf8_common() never returns FALSE leaving the
+ * swash NULL; else NULL reaches _get_swash_invlist() -- TODO confirm. */
+ if (_invlist_len(_get_swash_invlist(PL_utf8_X_prepend)) == 0) {
+ SvREFCNT_dec(PL_utf8_X_prepend);
+ PL_utf8_X_prepend = &PL_sv_undef;
+ }
+
+ return FALSE;
}
bool