diff options
author | Karl Williamson <khw@cpan.org> | 2014-04-28 20:55:50 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2014-05-30 10:33:02 -0600 |
commit | 8954b91a52258523bade4624c06e76d720ed31ae (patch) | |
tree | 9a5b6b54a3d4a01eb6da9e9b8d421f62edafe6b1 | |
parent | 4f3da64afef7a66791ba9b5f80dd8a1c044604a6 (diff) | |
download | perl-8954b91a52258523bade4624c06e76d720ed31ae.tar.gz |
regcomp.c: Move code into a function
This is in preparation for it to be called from another place
-rw-r--r-- | embed.fnc | 2 | ||||
-rw-r--r-- | embed.h | 1 | ||||
-rw-r--r-- | lib/diagnostics.t | 2 | ||||
-rw-r--r-- | pod/perldiag.pod | 2 | ||||
-rw-r--r-- | proto.h | 6 | ||||
-rw-r--r-- | regcomp.c | 143 |
6 files changed, 82 insertions, 74 deletions
@@ -2070,6 +2070,8 @@ Es |regnode*|regclass |NN RExC_state_t *pRExC_state \ |bool allow_multi_fold \ |const bool silence_non_portable \ |NULLOK SV** ret_invlist +Es |void|add_above_Latin1_folds|NN RExC_state_t *pRExC_state|const U8 cp \ + |NN SV** invlist Es |bool|could_it_be_a_POSIX_class|NN RExC_state_t *pRExC_state Es |regnode*|handle_regex_sets|NN RExC_state_t *pRExC_state \ |NULLOK SV ** return_invlist \ @@ -907,6 +907,7 @@ # if defined(PERL_IN_REGCOMP_C) #define _append_range_to_invlist(a,b,c) S__append_range_to_invlist(aTHX_ a,b,c) #define _invlist_array_init(a,b) S__invlist_array_init(aTHX_ a,b) +#define add_above_Latin1_folds(a,b,c) S_add_above_Latin1_folds(aTHX_ a,b,c) #define add_cp_to_invlist(a,b) S_add_cp_to_invlist(aTHX_ a,b) #define add_data S_add_data #define alloc_maybe_populate_EXACT(a,b,c,d,e,f) S_alloc_maybe_populate_EXACT(aTHX_ a,b,c,d,e,f) diff --git a/lib/diagnostics.t b/lib/diagnostics.t index 8868edabed..7c04342735 100644 --- a/lib/diagnostics.t +++ b/lib/diagnostics.t @@ -112,7 +112,7 @@ like $warning, # ; at end of entry in perldiag.pod seek STDERR, 0,0; $warning = ''; -warn "Perl folding rules are not up-to-date for 0xA; please use the perlbug utility to report; in regex; marked by <-- HERE in m/\ <-- HERE q/"; +warn "Perl folding rules are not up-to-date for 0x0A; please use the perlbug utility to report; in regex; marked by <-- HERE in m/\ <-- HERE q/"; like $warning, qr/You used a regular expression with case-insensitive matching/s, '; works at the end of entries in perldiag.pod'; diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 6c02d65e37..0ccf5fb20d 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -4147,7 +4147,7 @@ redirected it with select().) "Can't locate object method \"%s\" via package \"%s\"". It often means that a method requires a package that has not been loaded. -=item Perl folding rules are not up-to-date for 0x%X; please use the perlbug +=item Perl folding rules are not up-to-date for 0x%X; please use the perlbug utility to report; in regex; marked by S<<-- HERE> in m/%s/ (S regexp) You used a regular expression with case-insensitive matching, @@ -6642,6 +6642,12 @@ PERL_STATIC_INLINE UV* S__invlist_array_init(pTHX_ SV* const invlist, const bool #define PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT \ assert(invlist) +STATIC void S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) + __attribute__nonnull__(pTHX_1) + __attribute__nonnull__(pTHX_3); +#define PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS \ + assert(pRExC_state); assert(invlist) + PERL_STATIC_INLINE SV* S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) __attribute__warn_unused_result__; @@ -13119,6 +13119,74 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, } #undef IS_OPERAND +STATIC void +S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) +{ + /* This hard-codes the Latin1/above-Latin1 folding rules, so that an + * innocent-looking character class, like /[ks]/i won't have to go out to + * disk to find the possible matches. + * + * This should be called only for a Latin1-range code points, cp, which is + * known to be involved in a fold with other code points above Latin1. It + * would give false results if /aa has been specified. Multi-char folds + * are outside the scope of this, and must be handled specially. + * + * XXX It would be better to generate these via regen, in case a new + * version of the Unicode standard adds new mappings, though that is not + * really likely, and may be caught by the default: case of the switch + * below. */ + + PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS; + + switch (cp) { + case 'k': + case 'K': + *invlist = + add_cp_to_invlist(*invlist, KELVIN_SIGN); + break; + case 's': + case 'S': + *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S); + break; + case MICRO_SIGN: + *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU); + *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU); + break; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN); + break; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + *invlist = add_cp_to_invlist(*invlist, + LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); + break; + case LATIN_SMALL_LETTER_SHARP_S: + *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S); + break; + case 'F': case 'f': + case 'I': case 'i': + case 'L': case 'l': + case 'T': case 't': + case 'A': case 'a': + case 'H': case 'h': + case 'J': case 'j': + case 'N': case 'n': + case 'W': case 'w': + case 'Y': case 'y': + /* These all are targets of multi-character folds from code points + * that require UTF8 to express, so they can't match unless the + * target string is in UTF-8, so no action here is necessary, as + * regexec.c properly handles the general case for UTF-8 matching + * and multi-char folds */ + break; + default: + /* Use deprecated warning to increase the chances of this being + * output */ + ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp); + break; + } +} + /* The names of properties whose definitions are not known at compile time are * stored in this SV, after a constant heading. So if the length has been * changed since initialization, then there is a run-time definition. */ @@ -14338,15 +14406,6 @@ parseit: if (j < 256) { - /* We have the latin1 folding rules hard-coded here so - * that an innocent-looking character class, like - * /[ks]/i won't have to go out to disk to find the - * possible matches. XXX It would be better to - * generate these via regen, in case a new version of - * the Unicode standard adds new mappings, though that - * is not really likely, and may be caught by the - * default: case of the switch below. */ - if (IS_IN_SOME_FOLD_L1(j)) { /* ASCII is always matched; non-ASCII is matched @@ -14366,69 +14425,9 @@ parseit: if (HAS_NONLATIN1_FOLD_CLOSURE(j) && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) { - /* Certain Latin1 characters have matches outside - * Latin1. To get here, <j> is one of those - * characters. None of these matches is valid for - * ASCII characters under /aa, which is why the 'if' - * just above excludes those. These matches only - * happen when the target string is utf8. The code - * below adds the single fold closures for <j> to the - * inversion list. */ - - switch (j) { - case 'k': - case 'K': - *use_list = - add_cp_to_invlist(*use_list, KELVIN_SIGN); - break; - case 's': - case 'S': - *use_list = add_cp_to_invlist(*use_list, - LATIN_SMALL_LETTER_LONG_S); - break; - case MICRO_SIGN: - *use_list = add_cp_to_invlist(*use_list, - GREEK_CAPITAL_LETTER_MU); - *use_list = add_cp_to_invlist(*use_list, - GREEK_SMALL_LETTER_MU); - break; - case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: - case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: - *use_list = - add_cp_to_invlist(*use_list, ANGSTROM_SIGN); - break; - case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: - *use_list = add_cp_to_invlist(*use_list, - LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); - break; - case LATIN_SMALL_LETTER_SHARP_S: - *use_list = add_cp_to_invlist(*use_list, - LATIN_CAPITAL_LETTER_SHARP_S); - break; - case 'F': case 'f': - case 'I': case 'i': - case 'L': case 'l': - case 'T': case 't': - case 'A': case 'a': - case 'H': case 'h': - case 'J': case 'j': - case 'N': case 'n': - case 'W': case 'w': - case 'Y': case 'y': - /* These all are targets of multi-character - * folds from code points that require UTF8 - * to express, so they can't match unless - * the target string is in UTF-8, so no - * action here is necessary, as regexec.c - * properly handles the general case for - * UTF-8 matching and multi-char folds */ - break; - default: - /* Use deprecated warning to increase the - * chances of this being output */ - ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); - break; - } + add_above_Latin1_folds(pRExC_state, + (U8) j, + use_list); } continue; } |