From b74fe592196bcf9f41b2e13d774c2c6513a650df Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Thu, 29 Mar 2018 16:32:49 -0600 Subject: Use compiled-in C structure for inverted case folds This commit changes to use the C data structures generated by the previous commit to compute what characters fold to a given one. This is used to find out what things should match under /i. This now avoids the expensive start up cost of switching to perl utf8_heavy.pl, loading a file from disk, and constructing a hash from it. --- embed.fnc | 3 ++ embed.h | 1 + embedvar.h | 3 +- intrpvar.h | 4 --- perl.c | 3 +- perlapi.h | 2 ++ perlvars.h | 4 +++ proto.h | 5 ++++ regcomp.c | 96 +++++++++++++++++++++++++++----------------------------------- regexec.c | 55 +++++++++++------------------------ sv.c | 1 - utf8.c | 60 +++++++++++++++++++++++++++++++++++++++ 12 files changed, 136 insertions(+), 101 deletions(-) diff --git a/embed.fnc b/embed.fnc index 4a9992fbd6..095b528637 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1892,6 +1892,9 @@ ApM |U8* |uvoffuni_to_utf8_flags_msgs|NN U8 *d|UV uv|const UV flags|NULLOK HV** Ap |U8* |uvuni_to_utf8_flags |NN U8 *d|UV uv|UV flags Apd |char* |pv_uni_display |NN SV *dsv|NN const U8 *spv|STRLEN len|STRLEN pvlim|UV flags ApdR |char* |sv_uni_display |NN SV *dsv|NN SV *ssv|STRLEN pvlim|UV flags +EXpR |Size_t |_inverse_folds |const UV cp \ + |NN int * first_folds_to \ + |NN const int ** remaining_folds_to : Used by Data::Alias EXp |void |vivify_defelem |NN SV* sv : Used in pp.c diff --git a/embed.h b/embed.h index 97832ba9be..a6189a1571 100644 --- a/embed.h +++ b/embed.h @@ -918,6 +918,7 @@ #endif #if defined(PERL_CORE) || defined(PERL_EXT) #define _byte_dump_string(a,b,c) Perl__byte_dump_string(aTHX_ a,b,c) +#define _inverse_folds(a,b,c) Perl__inverse_folds(aTHX_ a,b,c) #define append_utf8_from_native_byte S_append_utf8_from_native_byte #define av_reify(a) Perl_av_reify(aTHX_ a) #define current_re_engine() Perl_current_re_engine(aTHX) diff --git a/embedvar.h b/embedvar.h index 70d640615b..e038ae74b4 100644 --- a/embedvar.h +++ b/embedvar.h @@ -336,7 +336,6 @@ #define PL_unitcheckav_save (vTHX->Iunitcheckav_save) #define PL_unlockhook (vTHX->Iunlockhook) #define PL_unsafe (vTHX->Iunsafe) -#define PL_utf8_foldclosures (vTHX->Iutf8_foldclosures) #define PL_utf8_mark (vTHX->Iutf8_mark) #define PL_utf8cache (vTHX->Iutf8cache) #define PL_utf8locale (vTHX->Iutf8locale) @@ -466,6 +465,8 @@ #define PL_Gutf8_charname_continue (my_vars->Gutf8_charname_continue) #define PL_utf8_foldable (my_vars->Gutf8_foldable) #define PL_Gutf8_foldable (my_vars->Gutf8_foldable) +#define PL_utf8_foldclosures (my_vars->Gutf8_foldclosures) +#define PL_Gutf8_foldclosures (my_vars->Gutf8_foldclosures) #define PL_utf8_idcont (my_vars->Gutf8_idcont) #define PL_Gutf8_idcont (my_vars->Gutf8_idcont) #define PL_utf8_idstart (my_vars->Gutf8_idstart) diff --git a/intrpvar.h b/intrpvar.h index 1dfc6034ae..f7b6ee326e 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -753,10 +753,6 @@ PERLVAR(I, registered_mros, HV *) /* Compile-time block start/end hooks */ PERLVAR(I, blockhooks, AV *) -/* Everything that folds to a given character, for case insensitivity regex - * matching */ -PERLVARI(I, utf8_foldclosures, HV *, NULL) - PERLVAR(I, custom_ops, HV *) /* custom op registrations */ PERLVAR(I, Xpv, XPV *) /* (unused) held temporary value */ diff --git a/perl.c b/perl.c index 2a3df3dc85..6fb2cc6acd 100644 --- a/perl.c +++ b/perl.c @@ -336,6 +336,7 @@ perl_construct(pTHXx) NonL1_Perl_Non_Final_Folds_invlist); PL_utf8_charname_begin = _new_invlist_C_array(_Perl_Charname_Begin_invlist); PL_utf8_charname_continue = _new_invlist_C_array(_Perl_Charname_Continue_invlist); + PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist); #if defined(LOCAL_PATCH_COUNT) @@ -1206,13 +1207,11 @@ perl_destruct(pTHXx) /* clear character classes */ SvREFCNT_dec(PL_utf8_mark); - SvREFCNT_dec(PL_utf8_foldclosures); SvREFCNT_dec(PL_InBitmap); #ifdef USE_LOCALE_CTYPE SvREFCNT_dec(PL_warn_locale); #endif PL_utf8_mark = NULL; - PL_utf8_foldclosures = NULL; PL_InBitmap = NULL; #ifdef USE_LOCALE_CTYPE PL_warn_locale = NULL; diff --git a/perlapi.h b/perlapi.h index 6b90055773..e41d61f0fb 100644 --- a/perlapi.h +++ b/perlapi.h @@ -211,6 +211,8 @@ END_EXTERN_C #define PL_utf8_charname_continue (*Perl_Gutf8_charname_continue_ptr(NULL)) #undef PL_utf8_foldable #define PL_utf8_foldable (*Perl_Gutf8_foldable_ptr(NULL)) +#undef PL_utf8_foldclosures +#define PL_utf8_foldclosures (*Perl_Gutf8_foldclosures_ptr(NULL)) #undef PL_utf8_idcont #define PL_utf8_idcont (*Perl_Gutf8_idcont_ptr(NULL)) #undef PL_utf8_idstart diff --git a/perlvars.h b/perlvars.h index af48fa8266..b6cc9ca162 100644 --- a/perlvars.h +++ b/perlvars.h @@ -302,3 +302,7 @@ PERLVAR(G, utf8_tofold, SV *) PERLVAR(G, utf8_tosimplefold, SV *) PERLVAR(G, utf8_charname_begin, SV *) PERLVAR(G, utf8_charname_continue, SV *) + +/* Everything that folds to a given character, for case insensitivity regex + * matching */ +PERLVAR(G, utf8_foldclosures, SV *) diff --git a/proto.h b/proto.h index 82ab3ebe91..ebead79cc2 100644 --- a/proto.h +++ b/proto.h @@ -62,6 +62,11 @@ PERL_CALLCONV char * Perl__byte_dump_string(pTHX_ const U8 * const start, const PERL_CALLCONV void Perl__force_out_malformed_utf8_message(pTHX_ const U8 *const p, const U8 * const e, const U32 flags, const bool die_here); #define PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE \ assert(p); assert(e) +PERL_CALLCONV Size_t Perl__inverse_folds(pTHX_ const UV cp, int * first_folds_to, const int ** remaining_folds_to) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT__INVERSE_FOLDS \ + assert(first_folds_to); assert(remaining_folds_to) + PERL_CALLCONV bool Perl__is_in_locale_category(pTHX_ const bool compiling, const int category); PERL_CALLCONV bool Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c) __attribute__warn_unused_result__; diff --git a/regcomp.c b/regcomp.c index 42167e474d..9319e6084b 100644 --- a/regcomp.c +++ b/regcomp.c @@ -10178,7 +10178,6 @@ Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, void Perl__load_PL_utf8_foldclosures (pTHX) { - PL_utf8_foldclosures = _swash_inversion_hash(); } #endif @@ -10291,9 +10290,9 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; STRLEN foldlen = UTF8SKIP(s); const U8* e = s + bytelen; - SV** listp; + IV fc; - uc = utf8_to_uvchr_buf(s, s + bytelen, NULL); + fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL); /* The only code points that aren't folded in a UTF EXACTFish * node are are the problematic ones in EXACTFL nodes */ @@ -10305,14 +10304,21 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) U8 *d = folded; int i; + fc = -1; for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) { if (isASCII(*s)) { *(d++) = (U8) toFOLD(*s); + if (fc < 0) { /* Save the first fold */ + fc = *(d-1); + } s++; } else { STRLEN len; - toFOLD_utf8_safe(s, e, d, &len); + UV fold = toFOLD_utf8_safe(s, e, d, &len); + if (fc < 0) { /* Save the first fold */ + fc = fold; + } d += len; s += UTF8SKIP(s); } @@ -10328,8 +10334,7 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) /* When we reach here 's' points to the fold of the first * character(s) of the node; and 'e' points to far enough along * the folded string to be just past any possible multi-char - * fold. 'foldlen' is the length in bytes of the first - * character in 's' + * fold. * * Unlike the non-UTF-8 case, the macro for determining if a * string is a multi-char fold requires all the characters to @@ -10342,33 +10347,29 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) invlist = _add_range_to_invlist(invlist, 0, UV_MAX); } else { /* Single char fold */ + unsigned int k; + int first_folds_to; + const int * remaining_folds_to_list; + Size_t folds_to_count; - /* It matches all the things that fold to it, which are - * found in PL_utf8_foldclosures (including itself) */ - invlist = add_cp_to_invlist(invlist, uc); - if (! PL_utf8_foldclosures) - _load_PL_utf8_foldclosures(); - if ((listp = hv_fetch(PL_utf8_foldclosures, - (char *) s, foldlen, FALSE))) - { - AV* list = (AV*) *listp; - IV k; - for (k = 0; k <= av_tindex_skip_len_mg(list); k++) { - SV** c_p = av_fetch(list, k, FALSE); - UV c; - assert(c_p); + /* It matches itself */ + invlist = add_cp_to_invlist(invlist, fc); - c = SvUV(*c_p); + /* ... plus all the things that fold to it, which are found in + * PL_utf8_foldclosures */ + folds_to_count = _inverse_folds(fc, &first_folds_to, + &remaining_folds_to_list); + for (k = 0; k < folds_to_count; k++) { + UV c = (k == 0) ? first_folds_to : remaining_folds_to_list[k-1]; /* /aa doesn't allow folds between ASCII and non- */ if ((OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE) - && isASCII(c) != isASCII(uc)) + && isASCII(c) != isASCII(fc)) { continue; } invlist = add_cp_to_invlist(invlist, c); - } } } } @@ -17859,27 +17860,20 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, _invlist_intersection(PL_utf8_foldable, cp_foldable_list, &fold_intersection); - /* The folds for all the Latin1 characters are hard-coded into this - * program, but we have to go out to disk to get the others. */ - if (invlist_highest(cp_foldable_list) >= 256) { - - /* This is a hash that for a particular fold gives all - * characters that are involved in it */ - if (! PL_utf8_foldclosures) { - _load_PL_utf8_foldclosures(); - } - } - /* Now look at the foldable characters in this class individually */ invlist_iterinit(fold_intersection); while (invlist_iternext(fold_intersection, &start, &end)) { UV j; + UV folded; /* Look at every character in the range */ for (j = start; j <= end; j++) { U8 foldbuf[UTF8_MAXBYTES_CASE+1]; STRLEN foldlen; - SV** listp; + unsigned int k; + Size_t folds_to_count; + int first_folds_to; + const int * remaining_folds_to_list; if (j < 256) { @@ -17914,29 +17908,24 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * rules hard-coded for it. First, get its fold. This is * the simple fold, as the multi-character folds have been * handled earlier and separated out */ - _to_uni_fold_flags(j, foldbuf, &foldlen, + folded = _to_uni_fold_flags(j, foldbuf, &foldlen, (ASCII_FOLD_RESTRICTED) ? FOLD_FLAGS_NOMIX_ASCII : 0); - /* Single character fold of above Latin1. Add everything in - * its fold closure to the list that this node should match. - * The fold closures data structure is a hash with the keys - * being the UTF-8 of every character that is folded to, like - * 'k', and the values each an array of all code points that - * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. - * Multi-character folds are not included */ - if ((listp = hv_fetch(PL_utf8_foldclosures, - (char *) foldbuf, foldlen, FALSE))) - { - AV* list = (AV*) *listp; - IV k; - for (k = 0; k <= av_tindex_skip_len_mg(list); k++) { - SV** c_p = av_fetch(list, k, FALSE); - UV c; - assert(c_p); + /* Single character fold of above Latin1. Add everything + * in its fold closure to the list that this node should + * match. */ + folds_to_count = _inverse_folds(folded, &first_folds_to, + &remaining_folds_to_list); + for (k = 0; k <= folds_to_count; k++) { + UV c = (k == 0) /* First time through use itself */ + ? folded + : (k == 1) /* 2nd time use, the first fold */ + ? first_folds_to - c = SvUV(*c_p); + /* Then the remaining ones */ + : remaining_folds_to_list[k-2]; /* /aa doesn't allow folds between ASCII and non- */ if ((ASCII_FOLD_RESTRICTED @@ -17965,7 +17954,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, has_upper_latin1_only_utf8_matches, c); } - } } } } diff --git a/regexec.c b/regexec.c index 9f0abd749d..c1b89e57ad 100644 --- a/regexec.c +++ b/regexec.c @@ -4510,47 +4510,25 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, else { /* an EXACTFish node which doesn't begin with a multi-char fold */ c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; if (c1 > 255) { - /* Load the folds hash, if not already done */ - SV** listp; - if (! PL_utf8_foldclosures) { - _load_PL_utf8_foldclosures(); + const int * remaining_folds_to_list; + int first_folds_to; + + /* Look up what code points (besides c1) fold to c1; e.g., + * [ 'K', KELVIN_SIGN ] both fold to 'k'. */ + Size_t folds_to_count = _inverse_folds(c1, + &first_folds_to, + &remaining_folds_to_list); + if (folds_to_count == 0) { + c2 = c1; /* there is only a single character that could + match */ } - - /* The fold closures data structure is a hash with the keys - * being the UTF-8 of every character that is folded to, like - * 'k', and the values each an array of all code points that - * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. - * Multi-character folds are not included */ - if ((! (listp = hv_fetch(PL_utf8_foldclosures, - (char *) pat, - UTF8SKIP(pat), - FALSE)))) - { - /* Not found in the hash, therefore there are no folds - * containing it, so there is only a single character that - * could match */ - c2 = c1; - } - else { /* Does participate in folds */ - AV* list = (AV*) *listp; - if (av_tindex_skip_len_mg(list) != 1) { - - /* If there aren't exactly two folds to this, it is - * outside the scope of this function */ + else if (folds_to_count != 1) { + /* If there aren't exactly two folds to this (itself and + * another), it is outside the scope of this function */ use_chrtest_void = TRUE; } - else { /* There are two. Get them */ - SV** c_p = av_fetch(list, 0, FALSE); - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); - } - c1 = SvUV(*c_p); - - c_p = av_fetch(list, 1, FALSE); - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); - } - c2 = SvUV(*c_p); + else { /* There are two. We already have one, get the other */ + c2 = first_folds_to; /* Folds that cross the 255/256 boundary are forbidden * if EXACTFL (and isnt a UTF8 locale), or EXACTFAA and @@ -4575,7 +4553,6 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, } } } - } } else /* Here, c1 is <= 255 */ if (utf8_target diff --git a/sv.c b/sv.c index d38ccdcb91..07865bb2c1 100644 --- a/sv.c +++ b/sv.c @@ -15688,7 +15688,6 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, PL_registered_mros = hv_dup_inc(proto_perl->Iregistered_mros, param); PL_blockhooks = av_dup_inc(proto_perl->Iblockhooks, param); - PL_utf8_foldclosures = hv_dup_inc(proto_perl->Iutf8_foldclosures, param); /* Call the ->CLONE method, if it exists, for each of the stashes identified by sv_dup() above. diff --git a/utf8.c b/utf8.c index 77115f3c26..7c5262e476 100644 --- a/utf8.c +++ b/utf8.c @@ -3637,6 +3637,66 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, } +Size_t +Perl__inverse_folds(pTHX_ const UV cp, int * first_folds_to, const int ** remaining_folds_to) +{ + /* Returns the count of the number of code points that fold to the input + * 'cp' (besides itself). + * + * If the return is 0, there is nothing else that folds to it, and + * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL. + * + * If the return is 1, '*first_folds_to' is set to the single code point, + * and '*remaining_folds_to' is set to NULL. + * + * Otherwise, '*first_folds_to' is set to a code point, and + * '*remaining_fold_to' is set to an array that contains the others. The + * length of this array is the returned count minus 1. + * + * The reason for this convolution is to avoid having to deal with + * allocating and freeing memory. The lists are already constructed, so + * the return can point to them, but single code points aren't, so would + * need to be constructed if we didn't employ something like this API */ + + SSize_t index = _invlist_search(PL_utf8_foldclosures, cp); + int base = _Perl_IVCF_invmap[index]; + + PERL_ARGS_ASSERT__INVERSE_FOLDS; + + if (base == 0) { /* No fold */ + *first_folds_to = 0; + *remaining_folds_to = NULL; + return 0; + } + +#ifndef HAS_IVCF_AUX_TABLES /* This Unicode version only has 1-1 folds */ + + assert(base > 0); + +#else + + if (UNLIKELY(base < 0)) { /* Folds to more than one character */ + + /* The data structure is set up so that the absolute value of 'base' is + * an index into a table of pointers to arrays, with the array + * corresponding to the index being the list of code points that fold + * to 'cp', and the parallel array containing the length of the list + * array */ + *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0]; + *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; /* +1 excludes + *first_folds_to + */ + return IVCF_AUX_TABLE_lengths[-base]; + } + +#endif + + /* Only the single code point. This works like 'fc(G) = G - A + a' */ + *first_folds_to = base + cp - invlist_array(PL_utf8_foldclosures)[index]; + *remaining_folds_to = NULL; + return 1; +} + STATIC UV S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp) -- cgit v1.2.1