summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--embed.fnc3
-rw-r--r--embed.h1
-rw-r--r--embedvar.h3
-rw-r--r--intrpvar.h4
-rw-r--r--perl.c3
-rw-r--r--perlapi.h2
-rw-r--r--perlvars.h4
-rw-r--r--proto.h5
-rw-r--r--regcomp.c96
-rw-r--r--regexec.c55
-rw-r--r--sv.c1
-rw-r--r--utf8.c60
12 files changed, 136 insertions, 101 deletions
diff --git a/embed.fnc b/embed.fnc
index 4a9992fbd6..095b528637 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1892,6 +1892,9 @@ ApM |U8* |uvoffuni_to_utf8_flags_msgs|NN U8 *d|UV uv|const UV flags|NULLOK HV**
Ap |U8* |uvuni_to_utf8_flags |NN U8 *d|UV uv|UV flags
Apd |char* |pv_uni_display |NN SV *dsv|NN const U8 *spv|STRLEN len|STRLEN pvlim|UV flags
ApdR |char* |sv_uni_display |NN SV *dsv|NN SV *ssv|STRLEN pvlim|UV flags
+EXpR |Size_t |_inverse_folds |const UV cp \
+ |NN int * first_folds_to \
+ |NN const int ** remaining_folds_to
: Used by Data::Alias
EXp |void |vivify_defelem |NN SV* sv
: Used in pp.c
diff --git a/embed.h b/embed.h
index 97832ba9be..a6189a1571 100644
--- a/embed.h
+++ b/embed.h
@@ -918,6 +918,7 @@
#endif
#if defined(PERL_CORE) || defined(PERL_EXT)
#define _byte_dump_string(a,b,c) Perl__byte_dump_string(aTHX_ a,b,c)
+#define _inverse_folds(a,b,c) Perl__inverse_folds(aTHX_ a,b,c)
#define append_utf8_from_native_byte S_append_utf8_from_native_byte
#define av_reify(a) Perl_av_reify(aTHX_ a)
#define current_re_engine() Perl_current_re_engine(aTHX)
diff --git a/embedvar.h b/embedvar.h
index 70d640615b..e038ae74b4 100644
--- a/embedvar.h
+++ b/embedvar.h
@@ -336,7 +336,6 @@
#define PL_unitcheckav_save (vTHX->Iunitcheckav_save)
#define PL_unlockhook (vTHX->Iunlockhook)
#define PL_unsafe (vTHX->Iunsafe)
-#define PL_utf8_foldclosures (vTHX->Iutf8_foldclosures)
#define PL_utf8_mark (vTHX->Iutf8_mark)
#define PL_utf8cache (vTHX->Iutf8cache)
#define PL_utf8locale (vTHX->Iutf8locale)
@@ -466,6 +465,8 @@
#define PL_Gutf8_charname_continue (my_vars->Gutf8_charname_continue)
#define PL_utf8_foldable (my_vars->Gutf8_foldable)
#define PL_Gutf8_foldable (my_vars->Gutf8_foldable)
+#define PL_utf8_foldclosures (my_vars->Gutf8_foldclosures)
+#define PL_Gutf8_foldclosures (my_vars->Gutf8_foldclosures)
#define PL_utf8_idcont (my_vars->Gutf8_idcont)
#define PL_Gutf8_idcont (my_vars->Gutf8_idcont)
#define PL_utf8_idstart (my_vars->Gutf8_idstart)
diff --git a/intrpvar.h b/intrpvar.h
index 1dfc6034ae..f7b6ee326e 100644
--- a/intrpvar.h
+++ b/intrpvar.h
@@ -753,10 +753,6 @@ PERLVAR(I, registered_mros, HV *)
/* Compile-time block start/end hooks */
PERLVAR(I, blockhooks, AV *)
-/* Everything that folds to a given character, for case insensitivity regex
- * matching */
-PERLVARI(I, utf8_foldclosures, HV *, NULL)
-
PERLVAR(I, custom_ops, HV *) /* custom op registrations */
PERLVAR(I, Xpv, XPV *) /* (unused) held temporary value */
diff --git a/perl.c b/perl.c
index 2a3df3dc85..6fb2cc6acd 100644
--- a/perl.c
+++ b/perl.c
@@ -336,6 +336,7 @@ perl_construct(pTHXx)
NonL1_Perl_Non_Final_Folds_invlist);
PL_utf8_charname_begin = _new_invlist_C_array(_Perl_Charname_Begin_invlist);
PL_utf8_charname_continue = _new_invlist_C_array(_Perl_Charname_Continue_invlist);
+ PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
#if defined(LOCAL_PATCH_COUNT)
@@ -1206,13 +1207,11 @@ perl_destruct(pTHXx)
/* clear character classes */
SvREFCNT_dec(PL_utf8_mark);
- SvREFCNT_dec(PL_utf8_foldclosures);
SvREFCNT_dec(PL_InBitmap);
#ifdef USE_LOCALE_CTYPE
SvREFCNT_dec(PL_warn_locale);
#endif
PL_utf8_mark = NULL;
- PL_utf8_foldclosures = NULL;
PL_InBitmap = NULL;
#ifdef USE_LOCALE_CTYPE
PL_warn_locale = NULL;
diff --git a/perlapi.h b/perlapi.h
index 6b90055773..e41d61f0fb 100644
--- a/perlapi.h
+++ b/perlapi.h
@@ -211,6 +211,8 @@ END_EXTERN_C
#define PL_utf8_charname_continue (*Perl_Gutf8_charname_continue_ptr(NULL))
#undef PL_utf8_foldable
#define PL_utf8_foldable (*Perl_Gutf8_foldable_ptr(NULL))
+#undef PL_utf8_foldclosures
+#define PL_utf8_foldclosures (*Perl_Gutf8_foldclosures_ptr(NULL))
#undef PL_utf8_idcont
#define PL_utf8_idcont (*Perl_Gutf8_idcont_ptr(NULL))
#undef PL_utf8_idstart
diff --git a/perlvars.h b/perlvars.h
index af48fa8266..b6cc9ca162 100644
--- a/perlvars.h
+++ b/perlvars.h
@@ -302,3 +302,7 @@ PERLVAR(G, utf8_tofold, SV *)
PERLVAR(G, utf8_tosimplefold, SV *)
PERLVAR(G, utf8_charname_begin, SV *)
PERLVAR(G, utf8_charname_continue, SV *)
+
+/* Everything that folds to a given character, for case insensitivity regex
+ * matching */
+PERLVAR(G, utf8_foldclosures, SV *)
diff --git a/proto.h b/proto.h
index 82ab3ebe91..ebead79cc2 100644
--- a/proto.h
+++ b/proto.h
@@ -62,6 +62,11 @@ PERL_CALLCONV char * Perl__byte_dump_string(pTHX_ const U8 * const start, const
PERL_CALLCONV void Perl__force_out_malformed_utf8_message(pTHX_ const U8 *const p, const U8 * const e, const U32 flags, const bool die_here);
#define PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE \
assert(p); assert(e)
+PERL_CALLCONV Size_t Perl__inverse_folds(pTHX_ const UV cp, int * first_folds_to, const int ** remaining_folds_to)
+ __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT__INVERSE_FOLDS \
+ assert(first_folds_to); assert(remaining_folds_to)
+
PERL_CALLCONV bool Perl__is_in_locale_category(pTHX_ const bool compiling, const int category);
PERL_CALLCONV bool Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
__attribute__warn_unused_result__;
diff --git a/regcomp.c b/regcomp.c
index 42167e474d..9319e6084b 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -10178,7 +10178,6 @@ Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
void
Perl__load_PL_utf8_foldclosures (pTHX)
{
- PL_utf8_foldclosures = _swash_inversion_hash();
}
#endif
@@ -10291,9 +10290,9 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
STRLEN foldlen = UTF8SKIP(s);
const U8* e = s + bytelen;
- SV** listp;
+ IV fc;
- uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
+ fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
/* The only code points that aren't folded in a UTF EXACTFish
* node are are the problematic ones in EXACTFL nodes */
@@ -10305,14 +10304,21 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
U8 *d = folded;
int i;
+ fc = -1;
for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
if (isASCII(*s)) {
*(d++) = (U8) toFOLD(*s);
+ if (fc < 0) { /* Save the first fold */
+ fc = *(d-1);
+ }
s++;
}
else {
STRLEN len;
- toFOLD_utf8_safe(s, e, d, &len);
+ UV fold = toFOLD_utf8_safe(s, e, d, &len);
+ if (fc < 0) { /* Save the first fold */
+ fc = fold;
+ }
d += len;
s += UTF8SKIP(s);
}
@@ -10328,8 +10334,7 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
/* When we reach here 's' points to the fold of the first
* character(s) of the node; and 'e' points to far enough along
* the folded string to be just past any possible multi-char
- * fold. 'foldlen' is the length in bytes of the first
- * character in 's'
+ * fold.
*
* Unlike the non-UTF-8 case, the macro for determining if a
* string is a multi-char fold requires all the characters to
@@ -10342,33 +10347,29 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
}
else { /* Single char fold */
+ unsigned int k;
+ int first_folds_to;
+ const int * remaining_folds_to_list;
+ Size_t folds_to_count;
- /* It matches all the things that fold to it, which are
- * found in PL_utf8_foldclosures (including itself) */
- invlist = add_cp_to_invlist(invlist, uc);
- if (! PL_utf8_foldclosures)
- _load_PL_utf8_foldclosures();
- if ((listp = hv_fetch(PL_utf8_foldclosures,
- (char *) s, foldlen, FALSE)))
- {
- AV* list = (AV*) *listp;
- IV k;
- for (k = 0; k <= av_tindex_skip_len_mg(list); k++) {
- SV** c_p = av_fetch(list, k, FALSE);
- UV c;
- assert(c_p);
+ /* It matches itself */
+ invlist = add_cp_to_invlist(invlist, fc);
- c = SvUV(*c_p);
+ /* ... plus all the things that fold to it, which are found in
+ * PL_utf8_foldclosures */
+ folds_to_count = _inverse_folds(fc, &first_folds_to,
+ &remaining_folds_to_list);
+ for (k = 0; k < folds_to_count; k++) {
+ UV c = (k == 0) ? first_folds_to : remaining_folds_to_list[k-1];
/* /aa doesn't allow folds between ASCII and non- */
if ((OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
- && isASCII(c) != isASCII(uc))
+ && isASCII(c) != isASCII(fc))
{
continue;
}
invlist = add_cp_to_invlist(invlist, c);
- }
}
}
}
@@ -17859,27 +17860,20 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
_invlist_intersection(PL_utf8_foldable, cp_foldable_list,
&fold_intersection);
- /* The folds for all the Latin1 characters are hard-coded into this
- * program, but we have to go out to disk to get the others. */
- if (invlist_highest(cp_foldable_list) >= 256) {
-
- /* This is a hash that for a particular fold gives all
- * characters that are involved in it */
- if (! PL_utf8_foldclosures) {
- _load_PL_utf8_foldclosures();
- }
- }
-
/* Now look at the foldable characters in this class individually */
invlist_iterinit(fold_intersection);
while (invlist_iternext(fold_intersection, &start, &end)) {
UV j;
+ UV folded;
/* Look at every character in the range */
for (j = start; j <= end; j++) {
U8 foldbuf[UTF8_MAXBYTES_CASE+1];
STRLEN foldlen;
- SV** listp;
+ unsigned int k;
+ Size_t folds_to_count;
+ int first_folds_to;
+ const int * remaining_folds_to_list;
if (j < 256) {
@@ -17914,29 +17908,24 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
* rules hard-coded for it. First, get its fold. This is
* the simple fold, as the multi-character folds have been
* handled earlier and separated out */
- _to_uni_fold_flags(j, foldbuf, &foldlen,
+ folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
(ASCII_FOLD_RESTRICTED)
? FOLD_FLAGS_NOMIX_ASCII
: 0);
- /* Single character fold of above Latin1. Add everything in
- * its fold closure to the list that this node should match.
- * The fold closures data structure is a hash with the keys
- * being the UTF-8 of every character that is folded to, like
- * 'k', and the values each an array of all code points that
- * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
- * Multi-character folds are not included */
- if ((listp = hv_fetch(PL_utf8_foldclosures,
- (char *) foldbuf, foldlen, FALSE)))
- {
- AV* list = (AV*) *listp;
- IV k;
- for (k = 0; k <= av_tindex_skip_len_mg(list); k++) {
- SV** c_p = av_fetch(list, k, FALSE);
- UV c;
- assert(c_p);
+ /* Single character fold of above Latin1. Add everything
+ * in its fold closure to the list that this node should
+ * match. */
+ folds_to_count = _inverse_folds(folded, &first_folds_to,
+ &remaining_folds_to_list);
+ for (k = 0; k <= folds_to_count; k++) {
+ UV c = (k == 0) /* First time through use itself */
+ ? folded
+ : (k == 1) /* 2nd time use, the first fold */
+ ? first_folds_to
- c = SvUV(*c_p);
+ /* Then the remaining ones */
+ : remaining_folds_to_list[k-2];
/* /aa doesn't allow folds between ASCII and non- */
if ((ASCII_FOLD_RESTRICTED
@@ -17965,7 +17954,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
has_upper_latin1_only_utf8_matches,
c);
}
- }
}
}
}
diff --git a/regexec.c b/regexec.c
index 9f0abd749d..c1b89e57ad 100644
--- a/regexec.c
+++ b/regexec.c
@@ -4510,47 +4510,25 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
else { /* an EXACTFish node which doesn't begin with a multi-char fold */
c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
if (c1 > 255) {
- /* Load the folds hash, if not already done */
- SV** listp;
- if (! PL_utf8_foldclosures) {
- _load_PL_utf8_foldclosures();
+ const int * remaining_folds_to_list;
+ int first_folds_to;
+
+ /* Look up what code points (besides c1) fold to c1; e.g.,
+ * [ 'K', KELVIN_SIGN ] both fold to 'k'. */
+ Size_t folds_to_count = _inverse_folds(c1,
+ &first_folds_to,
+ &remaining_folds_to_list);
+ if (folds_to_count == 0) {
+ c2 = c1; /* there is only a single character that could
+ match */
}
-
- /* The fold closures data structure is a hash with the keys
- * being the UTF-8 of every character that is folded to, like
- * 'k', and the values each an array of all code points that
- * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
- * Multi-character folds are not included */
- if ((! (listp = hv_fetch(PL_utf8_foldclosures,
- (char *) pat,
- UTF8SKIP(pat),
- FALSE))))
- {
- /* Not found in the hash, therefore there are no folds
- * containing it, so there is only a single character that
- * could match */
- c2 = c1;
- }
- else { /* Does participate in folds */
- AV* list = (AV*) *listp;
- if (av_tindex_skip_len_mg(list) != 1) {
-
- /* If there aren't exactly two folds to this, it is
- * outside the scope of this function */
+ else if (folds_to_count != 1) {
+ /* If there aren't exactly two folds to this (itself and
+ * another), it is outside the scope of this function */
use_chrtest_void = TRUE;
}
- else { /* There are two. Get them */
- SV** c_p = av_fetch(list, 0, FALSE);
- if (c_p == NULL) {
- Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
- }
- c1 = SvUV(*c_p);
-
- c_p = av_fetch(list, 1, FALSE);
- if (c_p == NULL) {
- Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
- }
- c2 = SvUV(*c_p);
+ else { /* There are two. We already have one, get the other */
+ c2 = first_folds_to;
/* Folds that cross the 255/256 boundary are forbidden
* if EXACTFL (and isnt a UTF8 locale), or EXACTFAA and
@@ -4575,7 +4553,6 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
}
}
}
- }
}
else /* Here, c1 is <= 255 */
if (utf8_target
diff --git a/sv.c b/sv.c
index d38ccdcb91..07865bb2c1 100644
--- a/sv.c
+++ b/sv.c
@@ -15688,7 +15688,6 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
PL_registered_mros = hv_dup_inc(proto_perl->Iregistered_mros, param);
PL_blockhooks = av_dup_inc(proto_perl->Iblockhooks, param);
- PL_utf8_foldclosures = hv_dup_inc(proto_perl->Iutf8_foldclosures, param);
/* Call the ->CLONE method, if it exists, for each of the stashes
identified by sv_dup() above.
diff --git a/utf8.c b/utf8.c
index 77115f3c26..7c5262e476 100644
--- a/utf8.c
+++ b/utf8.c
@@ -3637,6 +3637,66 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p,
}
+Size_t
+Perl__inverse_folds(pTHX_ const UV cp, int * first_folds_to, const int ** remaining_folds_to)
+{
+ /* Returns the count of the number of code points that fold to the input
+ * 'cp' (besides itself).
+ *
+ * If the return is 0, there is nothing else that folds to it, and
+ * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL.
+ *
+ * If the return is 1, '*first_folds_to' is set to the single code point,
+ * and '*remaining_folds_to' is set to NULL.
+ *
+ * Otherwise, '*first_folds_to' is set to a code point, and
+ * '*remaining_fold_to' is set to an array that contains the others. The
+ * length of this array is the returned count minus 1.
+ *
+ * The reason for this convolution is to avoid having to deal with
+ * allocating and freeing memory. The lists are already constructed, so
+ * the return can point to them, but single code points aren't, so would
+ * need to be constructed if we didn't employ something like this API */
+
+ SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
+ int base = _Perl_IVCF_invmap[index];
+
+ PERL_ARGS_ASSERT__INVERSE_FOLDS;
+
+ if (base == 0) { /* No fold */
+ *first_folds_to = 0;
+ *remaining_folds_to = NULL;
+ return 0;
+ }
+
+#ifndef HAS_IVCF_AUX_TABLES /* This Unicode version only has 1-1 folds */
+
+ assert(base > 0);
+
+#else
+
+ if (UNLIKELY(base < 0)) { /* Folds to more than one character */
+
+ /* The data structure is set up so that the absolute value of 'base' is
+ * an index into a table of pointers to arrays, with the array
+ * corresponding to the index being the list of code points that fold
+ * to 'cp', and the parallel array containing the length of the list
+ * array */
+ *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
+ *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; /* +1 excludes
+ *first_folds_to
+ */
+ return IVCF_AUX_TABLE_lengths[-base];
+ }
+
+#endif
+
+ /* Only the single code point. This works like 'fc(G) = G - A + a' */
+ *first_folds_to = base + cp - invlist_array(PL_utf8_foldclosures)[index];
+ *remaining_folds_to = NULL;
+ return 1;
+}
+
STATIC UV
S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
U8* const ustrp, STRLEN *lenp)